diff --git a/lib/gbrain-guards.ts b/lib/gbrain-guards.ts index 3a4edacba..e983de260 100644 --- a/lib/gbrain-guards.ts +++ b/lib/gbrain-guards.ts @@ -29,7 +29,7 @@ */ import { spawnSync } from "child_process"; -import { existsSync, realpathSync } from "fs"; +import { existsSync, realpathSync, readFileSync } from "fs"; import { homedir } from "os"; import { join, resolve, sep } from "path"; import { execGbrainJson, execGbrainText, NEEDS_SHELL_ON_WINDOWS } from "./gbrain-exec"; @@ -92,7 +92,20 @@ export function detectAutopilot( join(homedir(), ".gbrain", "autopilot.pid"), ]; for (const lp of lockPaths) { - if (existsSync(lp)) return { active: true, signal: `lock:${lp}` }; + if (!existsSync(lp)) continue; + // A lock FILE alone is not proof of life — a crashed daemon leaves a stale + // lock that would otherwise wedge every sync forever (observed: a dead pid + // refused --full indefinitely). Read the holder pid and check liveness. + const pid = readLockPid(lp); + if (pid === null) { + // Can't introspect (no parseable pid) → stay conservative: treat as active. + return { active: true, signal: `lock:${lp}` }; + } + if (isPidAlive(pid)) { + return { active: true, signal: `lock:${lp} (pid ${pid})` }; + } + // Stale lock (holder pid is dead): ignore this signal, keep checking. Pure + // decision function — we do NOT delete the file here; the caller may clean it. } // Primary signal: a live `gbrain autopilot` process. const running = (probe.processRunning ?? defaultProcessRunning)(); @@ -100,6 +113,34 @@ export function detectAutopilot( return { active: false, signal: null }; } +/** Read the holder pid from a lock/pid file. Returns null if no integer pid is present. */ +function readLockPid(lockPath: string): number | null { + try { + const raw = readFileSync(lockPath, "utf-8").trim(); + // Files seen: a bare pid ("65495"), or JSON like {"pid":65495,...}. + const m = raw.match(/"pid"\s*:\s*(\d+)/) ?? raw.match(/^(\d+)$/); + if (!m) return null; + const pid = Number.parseInt(m[1], 10); + return Number.isFinite(pid) && pid > 0 ? pid : null; + } catch { + return null; + } +} + +/** + * Liveness via signal 0: no signal sent, just an existence/permission check. + * ESRCH → dead; EPERM → alive but owned by another user. Cross-host pids are + * meaningless, but the autopilot lock is same-host by construction. + */ +function isPidAlive(pid: number): boolean { + try { + process.kill(pid, 0); + return true; + } catch (err) { + return (err as NodeJS.ErrnoException).code === "EPERM"; + } +} + function defaultProcessRunning(): boolean { // No reliable pgrep on Windows; rely on the lock-file signal there. if (process.platform === "win32") return false; diff --git a/test/gbrain-guards.test.ts b/test/gbrain-guards.test.ts index 0740148f9..c3c5883f7 100644 --- a/test/gbrain-guards.test.ts +++ b/test/gbrain-guards.test.ts @@ -38,6 +38,45 @@ describe("detectAutopilot", () => { expect(r.active).toBe(false); expect(r.signal).toBeNull(); }); + + // Stale-lock self-heal: a crashed daemon's lock (dead holder pid) must NOT + // wedge syncs forever (observed: dead pid refused --full indefinitely). + const DEAD_PID = 2999999; // above macOS pid_max; vanishingly unlikely elsewhere + + test("ignores a STALE lock whose holder pid is dead", () => { + const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-")); + const lock = join(tmp, "autopilot.lock"); + fs.writeFileSync(lock, `${DEAD_PID}\n`); + const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => false }); + expect(r.active).toBe(false); + expect(r.signal).toBeNull(); + }); + + test("treats a FRESH lock (live holder pid) as active", () => { + const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-")); + const lock = join(tmp, "autopilot.lock"); + fs.writeFileSync(lock, String(process.pid)); // the test runner itself is alive + const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => false }); + expect(r.active).toBe(true); + expect(r.signal).toContain(`pid ${process.pid}`); + }); + + test("parses a JSON lock body and ignores it when the pid is dead", () => { + const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-")); + const lock = join(tmp, "autopilot.lock"); + fs.writeFileSync(lock, JSON.stringify({ pid: DEAD_PID, started_at: "x" })); + const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => false }); + expect(r.active).toBe(false); + }); + + test("a stale lock does not mask a live autopilot process", () => { + const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-")); + const lock = join(tmp, "autopilot.lock"); + fs.writeFileSync(lock, `${DEAD_PID}`); + const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => true }); + expect(r.active).toBe(true); + expect(r.signal).toBe("process:gbrain autopilot"); + }); }); // ── #1734 remove safety (E7: fail closed on user-managed without keep-storage) ─