feat(gbrain-sync): self-heal stale autopilot lock (dead-pid)

detectAutopilot treated a lock FILE as proof of life, so a crashed gbrain daemon
left a stale lock that wedged every sync forever (observed: a dead pid refused
--full indefinitely). Now read the holder pid (bare or JSON body) and check
liveness via signal-0: ESRCH=dead → ignore the stale lock and keep checking the other signals;
EPERM=alive (other user) → active. A stale lock never masks a live autopilot
process. Pure decision function — does not delete the file; the caller may clean it.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-06-07 18:12:52 -07:00
parent fa250db27b
commit b43fbc3884
2 changed files with 82 additions and 2 deletions
+39
View File
@@ -38,6 +38,45 @@ describe("detectAutopilot", () => {
expect(r.active).toBe(false);
expect(r.signal).toBeNull();
});
// Stale-lock self-heal: a crashed daemon's lock (dead holder pid) must NOT
// wedge syncs forever (observed: dead pid refused --full indefinitely).
//
// DEAD_PID stands in for the pid recorded by a crashed lock holder in the
// stale-lock tests that follow.
// NOTE(review): these tests assume no live process owns this pid. Linux allows
// kernel.pid_max to be raised as high as 4194304, so on such hosts a collision
// is improbable rather than impossible — confirm this is acceptable for CI.
const DEAD_PID = 2999999; // above macOS pid_max; vanishingly unlikely elsewhere
// A lock file whose recorded holder pid is dead must be treated as stale:
// detection reports inactive with a null signal rather than taking the file's
// mere existence as proof of life.
test("ignores a STALE lock whose holder pid is dead", () => {
  const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-"));
  try {
    const lock = join(tmp, "autopilot.lock");
    // Bare-pid lock body, written with a trailing newline.
    fs.writeFileSync(lock, `${DEAD_PID}\n`);
    const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => false });
    expect(r.active).toBe(false);
    expect(r.signal).toBeNull();
  } finally {
    // Remove the scratch directory even when an assertion throws, so repeated
    // runs do not accumulate "ap-*" directories under os.tmpdir().
    fs.rmSync(tmp, { recursive: true, force: true });
  }
});
// A lock file whose recorded holder pid is alive must still count as an
// active autopilot, and the reported signal must name that pid.
test("treats a FRESH lock (live holder pid) as active", () => {
  const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-"));
  try {
    const lock = join(tmp, "autopilot.lock");
    // The test runner's own pid is guaranteed to be alive for the duration of the test.
    fs.writeFileSync(lock, String(process.pid));
    const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => false });
    expect(r.active).toBe(true);
    expect(r.signal).toContain(`pid ${process.pid}`);
  } finally {
    // Remove the scratch directory even when an assertion throws, so repeated
    // runs do not accumulate "ap-*" directories under os.tmpdir().
    fs.rmSync(tmp, { recursive: true, force: true });
  }
});
// The lock body may be a JSON object carrying the pid instead of a bare
// number; a dead pid in that form must be ignored as well.
test("parses a JSON lock body and ignores it when the pid is dead", () => {
  const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-"));
  try {
    const lock = join(tmp, "autopilot.lock");
    fs.writeFileSync(lock, JSON.stringify({ pid: DEAD_PID, started_at: "x" }));
    const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => false });
    expect(r.active).toBe(false);
  } finally {
    // Remove the scratch directory even when an assertion throws, so repeated
    // runs do not accumulate "ap-*" directories under os.tmpdir().
    fs.rmSync(tmp, { recursive: true, force: true });
  }
});
// Skipping a stale lock must not end detection early: when the process probe
// reports a running autopilot, the result is active with the process signal.
test("a stale lock does not mask a live autopilot process", () => {
  const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-"));
  try {
    const lock = join(tmp, "autopilot.lock");
    fs.writeFileSync(lock, `${DEAD_PID}`);
    // processRunning is stubbed to true to simulate a live autopilot process.
    const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => true });
    expect(r.active).toBe(true);
    expect(r.signal).toBe("process:gbrain autopilot");
  } finally {
    // Remove the scratch directory even when an assertion throws, so repeated
    // runs do not accumulate "ap-*" directories under os.tmpdir().
    fs.rmSync(tmp, { recursive: true, force: true });
  }
});
});
// ── #1734 remove safety (E7: fail closed on user-managed without keep-storage) ─