fix(sync): preserve staging dir on internal import timeout (#1802 C3)

The import-timeout branch printed 'checkpoint preserved' but the finally then
deleted the staging dir: the SIGTERM forwarder's preserve branch only runs when
the PARENT is signalled, and an internal runGbrainImport timeout kills just the
child and returns normally. So #1611 resume-after-timeout never actually worked.
Mirror the forwarder in the timeout branch: set preserveStaging only when gbrain
checkpointed against this dir (finally then skips cleanup); otherwise clean up
and tell the user it restages instead of falsely promising a resume.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-06-03 07:20:03 -07:00
parent db825fc7af
commit 68e452c0ed
2 changed files with 52 additions and 8 deletions
+21 -7
View File
@@ -1555,6 +1555,11 @@ async function ingestPass(args: CliArgs): Promise<BulkResult> {
if (!remoteHttpMode) {
_activeStagingDir = stagingDir;
}
// #1802 C3: set when the import-timeout branch leaves a resumable checkpoint
// pointing at this staging dir, so the finally preserves it for the next run
// instead of deleting it (the SIGTERM forwarder's preserve branch only runs
// when the PARENT is signalled, which an internal timeout never does).
let preserveStaging = false;
try {
let staging: StagingResult;
if (resuming) {
@@ -1663,14 +1668,23 @@ async function ingestPass(args: CliArgs): Promise<BulkResult> {
const importJson = parseImportJson(stdout);
if (importResult.status !== 0) {
// #1611: on timeout, gbrain's import-checkpoint.json is preserved (the
// SIGTERM forwarder keeps the staging dir), so the next /sync-gbrain
// resumes rather than restarting. Tell the user instead of looking failed.
// #1611/#1802 C3: on timeout, gbrain may have written
// import-checkpoint.json so the next /sync-gbrain can resume. But an
// INTERNAL timeout (runGbrainImport kills the child and returns here)
// never signals the parent, so the SIGTERM forwarder's preserve branch
// doesn't run — and the finally would otherwise delete the staging dir
// despite a "checkpoint preserved" message. Mirror the forwarder: preserve
// only when gbrain actually checkpointed against this dir; otherwise let
// the finally clean up (nothing to resume) and say so honestly.
if (importResult.timedOut) {
const mins = Math.round(resolveImportTimeoutMs() / 60000);
const msg =
`gbrain import timed out after ${mins}min; checkpoint preserved — re-run ` +
`/sync-gbrain to resume (raise GSTACK_INGEST_TIMEOUT_MS for big brains)`;
const checkpointed = stagingDirIsCheckpointed(stagingDir);
const msg = checkpointed
? `gbrain import timed out after ${mins}min; checkpoint preserved — re-run ` +
`/sync-gbrain to resume (raise GSTACK_INGEST_TIMEOUT_MS for big brains)`
: `gbrain import timed out after ${mins}min before writing a checkpoint; ` +
`re-run /sync-gbrain to restage (raise GSTACK_INGEST_TIMEOUT_MS for big brains)`;
if (checkpointed) preserveStaging = true;
console.error(`[memory-ingest] ${msg}`);
return {
written: 0,
@@ -1786,7 +1800,7 @@ async function ingestPass(args: CliArgs): Promise<BulkResult> {
// `finally` runs on its `return`, so the gate has to live here. Gating on
// mode (rather than widening the ownership guard) keeps checkOwnedStagingDir
// strict: it only ever sees `.staging-ingest-*` dirs.
if (!remoteHttpMode) cleanupStagingDir(stagingDir);
if (!remoteHttpMode && !preserveStaging) cleanupStagingDir(stagingDir);
_activeStagingDir = null;
}
@@ -365,7 +365,9 @@ describe("#1802 D1 — remote-http finally gate (static invariant)", () => {
);
test("finally gates cleanupStagingDir on !remoteHttpMode", () => {
expect(ingest).toMatch(/if \(!remoteHttpMode\) cleanupStagingDir\(stagingDir\)/);
// Tolerates additional guards (e.g. C3's !preserveStaging) in the same
// condition — the load-bearing invariant is that remote-http never deletes.
expect(ingest).toMatch(/if \(!remoteHttpMode[^)]*\) cleanupStagingDir\(stagingDir\)/);
});
test("the only finally-scoped cleanup call is the gated one", () => {
@@ -377,3 +379,31 @@ describe("#1802 D1 — remote-http finally gate (static invariant)", () => {
expect(finallySlice).not.toMatch(/^\s*cleanupStagingDir\(stagingDir\);/m);
});
});
// ── #1802 C3: internal import-timeout must preserve a checkpointed staging dir ─
// runGbrainImport kills only the child on an internal timeout; the parent
// returns normally, so the SIGTERM forwarder's preserve branch never runs. The
// timeout branch must mirror it (preserve when checkpointed) and the finally
// must honor that — otherwise "checkpoint preserved" is a lie and resume breaks.
describe("#1802 C3 — import-timeout preserve (static invariant)", () => {
const ingest = fs.readFileSync(
path.join(ROOT, "bin", "gstack-memory-ingest.ts"),
"utf-8",
);
test("timeout branch checks stagingDirIsCheckpointed and sets preserveStaging", () => {
const timeoutAt = ingest.indexOf("if (importResult.timedOut)");
expect(timeoutAt).toBeGreaterThan(-1);
const slice = ingest.slice(timeoutAt, timeoutAt + 1200);
expect(slice).toMatch(/stagingDirIsCheckpointed\(stagingDir\)/);
expect(slice).toMatch(/preserveStaging = true/);
// The not-checkpointed path must say so honestly rather than promising resume.
expect(slice).toMatch(/before writing a checkpoint/);
});
test("finally honors preserveStaging", () => {
expect(ingest).toMatch(
/if \(!remoteHttpMode && !preserveStaging\) cleanupStagingDir\(stagingDir\)/,
);
});
});