From 68e452c0edf93988285dfd25bd03c26893c0b234 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 3 Jun 2026 07:20:03 -0700 Subject: [PATCH] fix(sync): preserve staging dir on internal import timeout (#1802 C3) The import-timeout branch printed 'checkpoint preserved' but the finally then deleted the staging dir: the SIGTERM forwarder's preserve branch only runs when the PARENT is signalled, and an internal runGbrainImport timeout kills just the child and returns normally. So #1611 resume-after-timeout never actually worked. Mirror the forwarder in the timeout branch: set preserveStaging only when gbrain checkpointed against this dir (finally then skips cleanup); otherwise clean up and tell the user it restages instead of falsely promising a resume. Co-Authored-By: Claude Opus 4.8 (1M context) --- bin/gstack-memory-ingest.ts | 28 ++++++++++++---- ...regression-1611-gbrain-sync-resume.test.ts | 32 ++++++++++++++++++- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/bin/gstack-memory-ingest.ts b/bin/gstack-memory-ingest.ts index 67e885ef2..eb3c2f2a0 100644 --- a/bin/gstack-memory-ingest.ts +++ b/bin/gstack-memory-ingest.ts @@ -1555,6 +1555,11 @@ async function ingestPass(args: CliArgs): Promise { if (!remoteHttpMode) { _activeStagingDir = stagingDir; } + // #1802 C3: set when the import-timeout branch leaves a resumable checkpoint + // pointing at this staging dir, so the finally preserves it for the next run + // instead of deleting it (the SIGTERM forwarder's preserve branch only runs + // when the PARENT is signalled, which an internal timeout never does). + let preserveStaging = false; try { let staging: StagingResult; if (resuming) { @@ -1663,14 +1668,23 @@ async function ingestPass(args: CliArgs): Promise { const importJson = parseImportJson(stdout); if (importResult.status !== 0) { - // #1611: on timeout, gbrain's import-checkpoint.json is preserved (the - // SIGTERM forwarder keeps the staging dir), so the next /sync-gbrain - // resumes rather than restarting. Tell the user instead of looking failed. + // #1611/#1802 C3: on timeout, gbrain may have written + // import-checkpoint.json so the next /sync-gbrain can resume. But an + // INTERNAL timeout (runGbrainImport kills the child and returns here) + // never signals the parent, so the SIGTERM forwarder's preserve branch + // doesn't run — and the finally would otherwise delete the staging dir + // despite a "checkpoint preserved" message. Mirror the forwarder: preserve + // only when gbrain actually checkpointed against this dir; otherwise let + // the finally clean up (nothing to resume) and say so honestly. if (importResult.timedOut) { const mins = Math.round(resolveImportTimeoutMs() / 60000); - const msg = - `gbrain import timed out after ${mins}min; checkpoint preserved — re-run ` + - `/sync-gbrain to resume (raise GSTACK_INGEST_TIMEOUT_MS for big brains)`; + const checkpointed = stagingDirIsCheckpointed(stagingDir); + const msg = checkpointed + ? `gbrain import timed out after ${mins}min; checkpoint preserved — re-run ` + + `/sync-gbrain to resume (raise GSTACK_INGEST_TIMEOUT_MS for big brains)` + : `gbrain import timed out after ${mins}min before writing a checkpoint; ` + + `re-run /sync-gbrain to restage (raise GSTACK_INGEST_TIMEOUT_MS for big brains)`; + if (checkpointed) preserveStaging = true; console.error(`[memory-ingest] ${msg}`); return { written: 0, @@ -1786,7 +1800,7 @@ async function ingestPass(args: CliArgs): Promise { // `finally` runs on its `return`, so the gate has to live here. Gating on // mode (rather than widening the ownership guard) keeps checkOwnedStagingDir // strict: it only ever sees `.staging-ingest-*` dirs. - if (!remoteHttpMode) cleanupStagingDir(stagingDir); + if (!remoteHttpMode && !preserveStaging) cleanupStagingDir(stagingDir); _activeStagingDir = null; } diff --git a/test/regression-1611-gbrain-sync-resume.test.ts b/test/regression-1611-gbrain-sync-resume.test.ts index 1f262a596..c53825f70 100644 --- a/test/regression-1611-gbrain-sync-resume.test.ts +++ b/test/regression-1611-gbrain-sync-resume.test.ts @@ -365,7 +365,9 @@ describe("#1802 D1 — remote-http finally gate (static invariant)", () => { ); test("finally gates cleanupStagingDir on !remoteHttpMode", () => { - expect(ingest).toMatch(/if \(!remoteHttpMode\) cleanupStagingDir\(stagingDir\)/); + // Tolerates additional guards (e.g. C3's !preserveStaging) in the same + // condition — the load-bearing invariant is that remote-http never deletes. + expect(ingest).toMatch(/if \(!remoteHttpMode[^)]*\) cleanupStagingDir\(stagingDir\)/); }); test("the only finally-scoped cleanup call is the gated one", () => { @@ -377,3 +379,31 @@ describe("#1802 D1 — remote-http finally gate (static invariant)", () => { expect(finallySlice).not.toMatch(/^\s*cleanupStagingDir\(stagingDir\);/m); }); }); + +// ── #1802 C3: internal import-timeout must preserve a checkpointed staging dir ─ +// runGbrainImport kills only the child on an internal timeout; the parent +// returns normally, so the SIGTERM forwarder's preserve branch never runs. The +// timeout branch must mirror it (preserve when checkpointed) and the finally +// must honor that — otherwise "checkpoint preserved" is a lie and resume breaks. +describe("#1802 C3 — import-timeout preserve (static invariant)", () => { + const ingest = fs.readFileSync( + path.join(ROOT, "bin", "gstack-memory-ingest.ts"), + "utf-8", + ); + + test("timeout branch checks stagingDirIsCheckpointed and sets preserveStaging", () => { + const timeoutAt = ingest.indexOf("if (importResult.timedOut)"); + expect(timeoutAt).toBeGreaterThan(-1); + const slice = ingest.slice(timeoutAt, timeoutAt + 1200); + expect(slice).toMatch(/stagingDirIsCheckpointed\(stagingDir\)/); + expect(slice).toMatch(/preserveStaging = true/); + // The not-checkpointed path must say so honestly rather than promising resume. + expect(slice).toMatch(/before writing a checkpoint/); + }); + + test("finally honors preserveStaging", () => { + expect(ingest).toMatch( + /if \(!remoteHttpMode && !preserveStaging\) cleanupStagingDir\(stagingDir\)/, + ); + }); +});