From b43fbc38845a956f2eab7b7e61135c1335115d55 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 7 Jun 2026 18:12:52 -0700 Subject: [PATCH] feat(gbrain-sync): self-heal stale autopilot lock (dead-pid) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit detectAutopilot treated a lock FILE as proof of life, so a crashed gbrain daemon left a stale lock that wedged every sync forever (observed: a dead pid refused --full indefinitely). Now read the holder pid (bare or JSON body) and check liveness via signal-0: ESRCH=dead → ignore the stale signal and keep checking; EPERM=alive (other user) → active. A stale lock never masks a live autopilot process. Pure decision function — does not delete the file; the caller may clean it. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/gbrain-guards.ts | 45 ++++++++++++++++++++++++++++++++++++-- test/gbrain-guards.test.ts | 39 +++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 2 deletions(-) diff --git a/lib/gbrain-guards.ts b/lib/gbrain-guards.ts index 3a4edacba..e983de260 100644 --- a/lib/gbrain-guards.ts +++ b/lib/gbrain-guards.ts @@ -29,7 +29,7 @@ */ import { spawnSync } from "child_process"; -import { existsSync, realpathSync } from "fs"; +import { existsSync, realpathSync, readFileSync } from "fs"; import { homedir } from "os"; import { join, resolve, sep } from "path"; import { execGbrainJson, execGbrainText, NEEDS_SHELL_ON_WINDOWS } from "./gbrain-exec"; @@ -92,7 +92,20 @@ export function detectAutopilot( join(homedir(), ".gbrain", "autopilot.pid"), ]; for (const lp of lockPaths) { - if (existsSync(lp)) return { active: true, signal: `lock:${lp}` }; + if (!existsSync(lp)) continue; + // A lock FILE alone is not proof of life — a crashed daemon leaves a stale + // lock that would otherwise wedge every sync forever (observed: a dead pid + // refused --full indefinitely). Read the holder pid and check liveness. + const pid = readLockPid(lp); + if (pid === null) { + // Can't introspect (no parseable pid) → stay conservative: treat as active. + return { active: true, signal: `lock:${lp}` }; + } + if (isPidAlive(pid)) { + return { active: true, signal: `lock:${lp} (pid ${pid})` }; + } + // Stale lock (holder pid is dead): ignore this signal, keep checking. Pure + // decision function — we do NOT delete the file here; the caller may clean it. } // Primary signal: a live `gbrain autopilot` process. const running = (probe.processRunning ?? defaultProcessRunning)(); @@ -100,6 +113,34 @@ export function detectAutopilot( return { active: false, signal: null }; } +/** Read the holder pid from a lock/pid file. Returns null if no integer pid is present. */ +function readLockPid(lockPath: string): number | null { + try { + const raw = readFileSync(lockPath, "utf-8").trim(); + // Files seen: a bare pid ("65495"), or JSON like {"pid":65495,...}. + const m = raw.match(/"pid"\s*:\s*(\d+)/) ?? raw.match(/^(\d+)$/); + if (!m) return null; + const pid = Number.parseInt(m[1], 10); + return Number.isFinite(pid) && pid > 0 ? pid : null; + } catch { + return null; + } +} + +/** + * Liveness via signal 0: no signal sent, just an existence/permission check. + * ESRCH → dead; EPERM → alive but owned by another user. Cross-host pids are + * meaningless, but the autopilot lock is same-host by construction. + */ +function isPidAlive(pid: number): boolean { + try { + process.kill(pid, 0); + return true; + } catch (err) { + return (err as NodeJS.ErrnoException).code === "EPERM"; + } +} + function defaultProcessRunning(): boolean { // No reliable pgrep on Windows; rely on the lock-file signal there. if (process.platform === "win32") return false; diff --git a/test/gbrain-guards.test.ts b/test/gbrain-guards.test.ts index 0740148f9..c3c5883f7 100644 --- a/test/gbrain-guards.test.ts +++ b/test/gbrain-guards.test.ts @@ -38,6 +38,45 @@ describe("detectAutopilot", () => { expect(r.active).toBe(false); expect(r.signal).toBeNull(); }); + + // Stale-lock self-heal: a crashed daemon's lock (dead holder pid) must NOT + // wedge syncs forever (observed: dead pid refused --full indefinitely). + const DEAD_PID = 2999999; // above macOS pid_max; vanishingly unlikely elsewhere + + test("ignores a STALE lock whose holder pid is dead", () => { + const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-")); + const lock = join(tmp, "autopilot.lock"); + fs.writeFileSync(lock, `${DEAD_PID}\n`); + const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => false }); + expect(r.active).toBe(false); + expect(r.signal).toBeNull(); + }); + + test("treats a FRESH lock (live holder pid) as active", () => { + const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-")); + const lock = join(tmp, "autopilot.lock"); + fs.writeFileSync(lock, String(process.pid)); // the test runner itself is alive + const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => false }); + expect(r.active).toBe(true); + expect(r.signal).toContain(`pid ${process.pid}`); + }); + + test("parses a JSON lock body and ignores it when the pid is dead", () => { + const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-")); + const lock = join(tmp, "autopilot.lock"); + fs.writeFileSync(lock, JSON.stringify({ pid: DEAD_PID, started_at: "x" })); + const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => false }); + expect(r.active).toBe(false); + }); + + test("a stale lock does not mask a live autopilot process", () => { + const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-")); + const lock = join(tmp, "autopilot.lock"); + fs.writeFileSync(lock, `${DEAD_PID}`); + const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => true }); + expect(r.active).toBe(true); + expect(r.signal).toBe("process:gbrain autopilot"); + }); }); // ── #1734 remove safety (E7: fail closed on user-managed without keep-storage) ─