mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 07:10:12 +02:00
fix(sync): preserve staging dir on internal import timeout (#1802 C3)
The import-timeout branch printed 'checkpoint preserved' but the finally then deleted the staging dir: the SIGTERM forwarder's preserve branch only runs when the PARENT is signalled, and an internal runGbrainImport timeout kills just the child and returns normally. So #1611 resume-after-timeout never actually worked. Mirror the forwarder in the timeout branch: set preserveStaging only when gbrain checkpointed against this dir (finally then skips cleanup); otherwise clean up and tell the user it restages instead of falsely promising a resume. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1555,6 +1555,11 @@ async function ingestPass(args: CliArgs): Promise<BulkResult> {
|
||||
if (!remoteHttpMode) {
|
||||
_activeStagingDir = stagingDir;
|
||||
}
|
||||
// #1802 C3: set when the import-timeout branch leaves a resumable checkpoint
|
||||
// pointing at this staging dir, so the finally preserves it for the next run
|
||||
// instead of deleting it (the SIGTERM forwarder's preserve branch only runs
|
||||
// when the PARENT is signalled, which an internal timeout never does).
|
||||
let preserveStaging = false;
|
||||
try {
|
||||
let staging: StagingResult;
|
||||
if (resuming) {
|
||||
@@ -1663,14 +1668,23 @@ async function ingestPass(args: CliArgs): Promise<BulkResult> {
|
||||
const importJson = parseImportJson(stdout);
|
||||
|
||||
if (importResult.status !== 0) {
|
||||
// #1611: on timeout, gbrain's import-checkpoint.json is preserved (the
|
||||
// SIGTERM forwarder keeps the staging dir), so the next /sync-gbrain
|
||||
// resumes rather than restarting. Tell the user instead of looking failed.
|
||||
// #1611/#1802 C3: on timeout, gbrain may have written
|
||||
// import-checkpoint.json so the next /sync-gbrain can resume. But an
|
||||
// INTERNAL timeout (runGbrainImport kills the child and returns here)
|
||||
// never signals the parent, so the SIGTERM forwarder's preserve branch
|
||||
// doesn't run — and the finally would otherwise delete the staging dir
|
||||
// despite a "checkpoint preserved" message. Mirror the forwarder: preserve
|
||||
// only when gbrain actually checkpointed against this dir; otherwise let
|
||||
// the finally clean up (nothing to resume) and say so honestly.
|
||||
if (importResult.timedOut) {
|
||||
const mins = Math.round(resolveImportTimeoutMs() / 60000);
|
||||
const msg =
|
||||
`gbrain import timed out after ${mins}min; checkpoint preserved — re-run ` +
|
||||
`/sync-gbrain to resume (raise GSTACK_INGEST_TIMEOUT_MS for big brains)`;
|
||||
const checkpointed = stagingDirIsCheckpointed(stagingDir);
|
||||
const msg = checkpointed
|
||||
? `gbrain import timed out after ${mins}min; checkpoint preserved — re-run ` +
|
||||
`/sync-gbrain to resume (raise GSTACK_INGEST_TIMEOUT_MS for big brains)`
|
||||
: `gbrain import timed out after ${mins}min before writing a checkpoint; ` +
|
||||
`re-run /sync-gbrain to restage (raise GSTACK_INGEST_TIMEOUT_MS for big brains)`;
|
||||
if (checkpointed) preserveStaging = true;
|
||||
console.error(`[memory-ingest] ${msg}`);
|
||||
return {
|
||||
written: 0,
|
||||
@@ -1786,7 +1800,7 @@ async function ingestPass(args: CliArgs): Promise<BulkResult> {
|
||||
// `finally` runs on its `return`, so the gate has to live here. Gating on
|
||||
// mode (rather than widening the ownership guard) keeps checkOwnedStagingDir
|
||||
// strict: it only ever sees `.staging-ingest-*` dirs.
|
||||
if (!remoteHttpMode) cleanupStagingDir(stagingDir);
|
||||
if (!remoteHttpMode && !preserveStaging) cleanupStagingDir(stagingDir);
|
||||
_activeStagingDir = null;
|
||||
}
|
||||
|
||||
|
||||
@@ -365,7 +365,9 @@ describe("#1802 D1 — remote-http finally gate (static invariant)", () => {
|
||||
);
|
||||
|
||||
test("finally gates cleanupStagingDir on !remoteHttpMode", () => {
|
||||
expect(ingest).toMatch(/if \(!remoteHttpMode\) cleanupStagingDir\(stagingDir\)/);
|
||||
// Tolerates additional guards (e.g. C3's !preserveStaging) in the same
|
||||
// condition — the load-bearing invariant is that remote-http never deletes.
|
||||
expect(ingest).toMatch(/if \(!remoteHttpMode[^)]*\) cleanupStagingDir\(stagingDir\)/);
|
||||
});
|
||||
|
||||
test("the only finally-scoped cleanup call is the gated one", () => {
|
||||
@@ -377,3 +379,31 @@ describe("#1802 D1 — remote-http finally gate (static invariant)", () => {
|
||||
expect(finallySlice).not.toMatch(/^\s*cleanupStagingDir\(stagingDir\);/m);
|
||||
});
|
||||
});
|
||||
|
||||
// ── #1802 C3: internal import-timeout must preserve a checkpointed staging dir ─
|
||||
// runGbrainImport kills only the child on an internal timeout; the parent
|
||||
// returns normally, so the SIGTERM forwarder's preserve branch never runs. The
|
||||
// timeout branch must mirror it (preserve when checkpointed) and the finally
|
||||
// must honor that — otherwise "checkpoint preserved" is a lie and resume breaks.
|
||||
describe("#1802 C3 — import-timeout preserve (static invariant)", () => {
|
||||
const ingest = fs.readFileSync(
|
||||
path.join(ROOT, "bin", "gstack-memory-ingest.ts"),
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
test("timeout branch checks stagingDirIsCheckpointed and sets preserveStaging", () => {
|
||||
const timeoutAt = ingest.indexOf("if (importResult.timedOut)");
|
||||
expect(timeoutAt).toBeGreaterThan(-1);
|
||||
const slice = ingest.slice(timeoutAt, timeoutAt + 1200);
|
||||
expect(slice).toMatch(/stagingDirIsCheckpointed\(stagingDir\)/);
|
||||
expect(slice).toMatch(/preserveStaging = true/);
|
||||
// The not-checkpointed path must say so honestly rather than promising resume.
|
||||
expect(slice).toMatch(/before writing a checkpoint/);
|
||||
});
|
||||
|
||||
test("finally honors preserveStaging", () => {
|
||||
expect(ingest).toMatch(
|
||||
/if \(!remoteHttpMode && !preserveStaging\) cleanupStagingDir\(stagingDir\)/,
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user