mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-09 03:23:55 +02:00
* fix(sync): fail-closed staging-dir ownership guard — prevent rm -rf of repo (#1802) Adopts community fix #1827 by @diazMelgarejo (cyre). New lib/staging-guard.ts exports checkOwnedStagingDir(), the single fail-closed predicate for 'safe to recurse-delete or resume into', wired at cleanupStagingDir() (the deletion chokepoint), decideResume(), the ingest entry point, and makeStagingDir() (mints the .gstack-staging marker). Fixes #1802. Co-Authored-By: cyre <diazMelgarejo@users.noreply.github.com> Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * fix(sync): don't route the remote-http persistent transcript dir through cleanup (#1802) The ingest finally ran cleanupStagingDir() unconditionally, but in remote-http mode stagingDir is the PERSISTENT transcript dir (~/.gstack/transcripts/) that gstack-brain-sync push must consume. The remote-http branch documents the intent to skip cleanup, but a finally runs on its return. Gate the call on !remoteHttpMode so the ownership guard only ever sees .staging-ingest-* dirs. Pre-gate this dir was deleted outright (broken artifacts handoff); post-#1827 it produced a false 'prevent data loss' warning every sync. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * fix(sync): preserve staging dir on internal import timeout (#1802 C3) The import-timeout branch printed 'checkpoint preserved' but the finally then deleted the staging dir: the SIGTERM forwarder's preserve branch only runs when the PARENT is signalled, and an internal runGbrainImport timeout kills just the child and returns normally. So #1611 resume-after-timeout never actually worked. Mirror the forwarder in the timeout branch: set preserveStaging only when gbrain checkpointed against this dir (finally then skips cleanup); otherwise clean up and tell the user it restages instead of falsely promising a resume. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * fix(sync): resume must not mark failed files as ingested (#1802 C4) On resume, stagedPathToSource was rebuilt as an empty Map, so readNewFailures() could not map gbrain's per-file failures back to source paths. Every failure fell through to state recording — failed files were silently marked ingested and never retried. Reconstruct the map from the prepared pages via a shared stagedRelPath() helper (single source of truth with writeStaged, so the keys can never drift). Exports stagedRelPath + readNewFailures for a behavioral test proving the reconstructed map recovers the failure the empty map dropped. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * harden(sync): close staging-guard TOCTOU + fail hard on marker write (#1802 C5) checkOwnedStagingDir() now returns the realpath-resolved canonicalPath on a pass, and cleanupStagingDir() rmSync's that instead of the raw input — closing the gap where the input is a symlink swapped between the ownership check and the delete. makeStagingDir() tears down the partial dir and rethrows if the marker write fails, so a marker-less dir (which the guard would refuse forever) can never leak. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * chore: v1.56.1.0 — staging-dir ownership guard + resume-correctness fixes (#1802) Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * ci: grant the eval report job issues:write so PR comment upsert stops 401ing Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --------- Co-authored-by: cyre <diazMelgarejo@users.noreply.github.com> Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+87
-12
@@ -65,6 +65,7 @@ import {
|
||||
withErrorContext,
|
||||
} from "../lib/gstack-memory-helpers";
|
||||
import { execGbrainText, spawnGbrainAsync } from "../lib/gbrain-exec";
|
||||
import { checkOwnedStagingDir, STAGING_MARKER } from "../lib/staging-guard";
|
||||
|
||||
// ── Types ──────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -907,13 +908,23 @@ interface StagingResult {
|
||||
* Filename = `${slug}.md`. mkdir is recursive. Existing files overwrite.
|
||||
* Errors per-file are collected; the whole batch is best-effort.
|
||||
*/
|
||||
/**
|
||||
* Staging-relative path for a prepared page's slug. Single source of truth so
|
||||
* writeStaged() (which mints the map) and the resume-path reconstruction (#1802
|
||||
* C4) compute identical keys — if they diverge, readNewFailures() silently stops
|
||||
* mapping gbrain's failures back to sources and failed files get marked ingested.
|
||||
*/
|
||||
export function stagedRelPath(slug: string): string {
|
||||
return `${slug}.md`;
|
||||
}
|
||||
|
||||
function writeStaged(prepared: PreparedPage[], stagingDir: string): StagingResult {
|
||||
mkdirSync(stagingDir, { recursive: true });
|
||||
const stagedPathToSource = new Map<string, string>();
|
||||
const errors: Array<{ slug: string; error: string }> = [];
|
||||
let written = 0;
|
||||
for (const p of prepared) {
|
||||
const relPath = `${p.slug}.md`;
|
||||
const relPath = stagedRelPath(p.slug);
|
||||
const absPath = join(stagingDir, relPath);
|
||||
try {
|
||||
mkdirSync(dirname(absPath), { recursive: true });
|
||||
@@ -978,7 +989,7 @@ function parseImportJson(stdout: string): ImportJsonResult | null {
|
||||
* staging-dir-relative filename gbrain saw (e.g. "transcripts/foo.md").
|
||||
* stagedPathToSource maps that back to the original source file.
|
||||
*/
|
||||
function readNewFailures(
|
||||
export function readNewFailures(
|
||||
syncFailuresPath: string,
|
||||
preImportOffset: number,
|
||||
stagedPathToSource: Map<string, string>,
|
||||
@@ -1198,6 +1209,17 @@ function preparePages(
|
||||
function makeStagingDir(): string {
|
||||
const dir = join(GSTACK_HOME, `.staging-ingest-${process.pid}-${Date.now()}`);
|
||||
mkdirSync(dir, { recursive: true });
|
||||
// Mint the ownership marker (#1802) so cleanupStagingDir() and decideResume()
|
||||
// can prove this dir was created by us before any recursive delete or resume.
|
||||
// #1802 C5: fail hard if the marker can't be written — a marker-less dir would
|
||||
// be refused by the guard forever (leaked, never cleaned). Tear down the
|
||||
// partial dir and rethrow so the caller fails loudly instead of leaking.
|
||||
try {
|
||||
writeFileSync(join(dir, STAGING_MARKER), `${process.pid}\n${Date.now()}\n`, "utf-8");
|
||||
} catch (err) {
|
||||
try { rmSync(dir, { recursive: true, force: true }); } catch { /* best-effort */ }
|
||||
throw err;
|
||||
}
|
||||
return dir;
|
||||
}
|
||||
|
||||
@@ -1259,8 +1281,21 @@ function isRemoteHttpMcpMode(): boolean {
|
||||
* cleanup failure.
|
||||
*/
|
||||
function cleanupStagingDir(dir: string): void {
|
||||
// #1802 deletion chokepoint: never recurse-delete a path we cannot PROVE we
|
||||
// own. A poisoned resume could otherwise route the repo root here.
|
||||
const verdict = checkOwnedStagingDir(dir, GSTACK_HOME);
|
||||
if (!verdict.ok) {
|
||||
console.error(
|
||||
`[gbrain] staging cleanup REFUSED: "${dir}" is not an owned staging dir ` +
|
||||
`(${verdict.reason}). Skipping rm -rf to prevent data loss (#1802).`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
try {
|
||||
rmSync(dir, { recursive: true, force: true });
|
||||
// #1802 C5: delete the realpath-resolved dir the guard validated, not the
|
||||
// raw input — closes the TOCTOU gap where `dir` is a symlink swapped between
|
||||
// the check above and this rmSync. canonicalPath is always set when ok.
|
||||
rmSync(verdict.canonicalPath ?? dir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// best-effort
|
||||
}
|
||||
@@ -1515,10 +1550,20 @@ async function ingestPass(args: CliArgs): Promise<BulkResult> {
|
||||
// tells it where to resume.
|
||||
const remoteHttpMode = isRemoteHttpMcpMode();
|
||||
const resumeDir = process.env.GSTACK_INGEST_RESUME_DIR;
|
||||
// #1802 second entry point: this binary is runnable directly, so it must not
|
||||
// trust GSTACK_INGEST_RESUME_DIR just because it exists — a stale/poisoned env
|
||||
// could make us `gbrain import` (and later clean up) an arbitrary directory.
|
||||
// Prove ownership here too, independently of the orchestrator's decideResume.
|
||||
const resuming = !remoteHttpMode
|
||||
&& typeof resumeDir === "string"
|
||||
&& resumeDir.length > 0
|
||||
&& existsSync(resumeDir);
|
||||
&& existsSync(resumeDir)
|
||||
&& checkOwnedStagingDir(resumeDir, GSTACK_HOME).ok;
|
||||
if (!remoteHttpMode && resumeDir && resumeDir.length > 0 && !resuming) {
|
||||
console.error(
|
||||
`[memory-ingest] ignoring GSTACK_INGEST_RESUME_DIR="${resumeDir}" — not a proven staging dir (#1802); staging fresh.`,
|
||||
);
|
||||
}
|
||||
const stagingDir = resuming
|
||||
? resumeDir!
|
||||
: remoteHttpMode
|
||||
@@ -1531,6 +1576,11 @@ async function ingestPass(args: CliArgs): Promise<BulkResult> {
|
||||
if (!remoteHttpMode) {
|
||||
_activeStagingDir = stagingDir;
|
||||
}
|
||||
// #1802 C3: set when the import-timeout branch leaves a resumable checkpoint
|
||||
// pointing at this staging dir, so the finally preserves it for the next run
|
||||
// instead of deleting it (the SIGTERM forwarder's preserve branch only runs
|
||||
// when the PARENT is signalled, which an internal timeout never does).
|
||||
let preserveStaging = false;
|
||||
try {
|
||||
let staging: StagingResult;
|
||||
if (resuming) {
|
||||
@@ -1543,7 +1593,15 @@ async function ingestPass(args: CliArgs): Promise<BulkResult> {
|
||||
`[memory-ingest] resuming previous staging dir ${stagingDir} (skipping prepare phase)`,
|
||||
);
|
||||
}
|
||||
staging = { staging_dir: stagingDir, written: prep.prepared.length, errors: [], stagedPathToSource: new Map() };
|
||||
// #1802 C4: reconstruct stagedPathToSource from the prepared pages so
|
||||
// readNewFailures() can still map gbrain's per-file failures back to
|
||||
// sources on resume. An empty map made every failed file fall through to
|
||||
// state-recording — i.e. silently marked ingested despite failing.
|
||||
const stagedPathToSource = new Map<string, string>();
|
||||
for (const p of prep.prepared) {
|
||||
stagedPathToSource.set(stagedRelPath(p.slug), p.source_path);
|
||||
}
|
||||
staging = { staging_dir: stagingDir, written: prep.prepared.length, errors: [], stagedPathToSource };
|
||||
} else {
|
||||
staging = writeStaged(prep.prepared, stagingDir);
|
||||
}
|
||||
@@ -1639,14 +1697,23 @@ async function ingestPass(args: CliArgs): Promise<BulkResult> {
|
||||
const importJson = parseImportJson(stdout);
|
||||
|
||||
if (importResult.status !== 0) {
|
||||
// #1611: on timeout, gbrain's import-checkpoint.json is preserved (the
|
||||
// SIGTERM forwarder keeps the staging dir), so the next /sync-gbrain
|
||||
// resumes rather than restarting. Tell the user instead of looking failed.
|
||||
// #1611/#1802 C3: on timeout, gbrain may have written
|
||||
// import-checkpoint.json so the next /sync-gbrain can resume. But an
|
||||
// INTERNAL timeout (runGbrainImport kills the child and returns here)
|
||||
// never signals the parent, so the SIGTERM forwarder's preserve branch
|
||||
// doesn't run — and the finally would otherwise delete the staging dir
|
||||
// despite a "checkpoint preserved" message. Mirror the forwarder: preserve
|
||||
// only when gbrain actually checkpointed against this dir; otherwise let
|
||||
// the finally clean up (nothing to resume) and say so honestly.
|
||||
if (importResult.timedOut) {
|
||||
const mins = Math.round(resolveImportTimeoutMs() / 60000);
|
||||
const msg =
|
||||
`gbrain import timed out after ${mins}min; checkpoint preserved — re-run ` +
|
||||
`/sync-gbrain to resume (raise GSTACK_INGEST_TIMEOUT_MS for big brains)`;
|
||||
const checkpointed = stagingDirIsCheckpointed(stagingDir);
|
||||
const msg = checkpointed
|
||||
? `gbrain import timed out after ${mins}min; checkpoint preserved — re-run ` +
|
||||
`/sync-gbrain to resume (raise GSTACK_INGEST_TIMEOUT_MS for big brains)`
|
||||
: `gbrain import timed out after ${mins}min before writing a checkpoint; ` +
|
||||
`re-run /sync-gbrain to restage (raise GSTACK_INGEST_TIMEOUT_MS for big brains)`;
|
||||
if (checkpointed) preserveStaging = true;
|
||||
console.error(`[memory-ingest] ${msg}`);
|
||||
return {
|
||||
written: 0,
|
||||
@@ -1754,7 +1821,15 @@ async function ingestPass(args: CliArgs): Promise<BulkResult> {
|
||||
);
|
||||
}
|
||||
} finally {
|
||||
cleanupStagingDir(stagingDir);
|
||||
// #1802 D1: in remote-http mode `stagingDir` is the PERSISTENT transcript
|
||||
// dir (makePersistentTranscriptDir, under ~/.gstack/transcripts/) that
|
||||
// gstack-brain-sync push must pick up — it is NOT a `.staging-ingest-*` dir
|
||||
// and must never be deleted here. The remote-http branch above already
|
||||
// documents this intent ("Skip the ... cleanupStagingDir paths"), but a
|
||||
// `finally` runs on its `return`, so the gate has to live here. Gating on
|
||||
// mode (rather than widening the ownership guard) keeps checkOwnedStagingDir
|
||||
// strict: it only ever sees `.staging-ingest-*` dirs.
|
||||
if (!remoteHttpMode && !preserveStaging) cleanupStagingDir(stagingDir);
|
||||
_activeStagingDir = null;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user