feat(gbrain-sync): self-heal stale autopilot lock (dead-pid)

detectAutopilot treated a lock FILE as proof of life, so a crashed gbrain daemon
left a stale lock that wedged every sync forever (observed: a dead pid refused
--full indefinitely). Now read the holder pid (bare or JSON body) and check
liveness via signal-0: ESRCH=dead → ignore the stale signal and keep checking;
EPERM=alive (other user) → active. A stale lock never masks a live autopilot
process. Pure decision function — does not delete the file; the caller may clean it.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-06-07 18:12:52 -07:00
parent fa250db27b
commit b43fbc3884
2 changed files with 82 additions and 2 deletions
+43 -2
View File
@@ -29,7 +29,7 @@
*/
import { spawnSync } from "child_process";
import { existsSync, realpathSync } from "fs";
import { existsSync, realpathSync, readFileSync } from "fs";
import { homedir } from "os";
import { join, resolve, sep } from "path";
import { execGbrainJson, execGbrainText, NEEDS_SHELL_ON_WINDOWS } from "./gbrain-exec";
@@ -92,7 +92,20 @@ export function detectAutopilot(
join(homedir(), ".gbrain", "autopilot.pid"),
];
for (const lp of lockPaths) {
if (existsSync(lp)) return { active: true, signal: `lock:${lp}` };
if (!existsSync(lp)) continue;
// A lock FILE alone is not proof of life — a crashed daemon leaves a stale
// lock that would otherwise wedge every sync forever (observed: a dead pid
// refused --full indefinitely). Read the holder pid and check liveness.
const pid = readLockPid(lp);
if (pid === null) {
// Can't introspect (no parseable pid) → stay conservative: treat as active.
return { active: true, signal: `lock:${lp}` };
}
if (isPidAlive(pid)) {
return { active: true, signal: `lock:${lp} (pid ${pid})` };
}
// Stale lock (holder pid is dead): ignore this signal, keep checking. Pure
// decision function — we do NOT delete the file here; the caller may clean it.
}
// Primary signal: a live `gbrain autopilot` process.
const running = (probe.processRunning ?? defaultProcessRunning)();
@@ -100,6 +113,34 @@ export function detectAutopilot(
return { active: false, signal: null };
}
/** Read the holder pid from a lock/pid file. Returns null if no integer pid is present. */
function readLockPid(lockPath: string): number | null {
try {
const raw = readFileSync(lockPath, "utf-8").trim();
// Files seen: a bare pid ("65495"), or JSON like {"pid":65495,...}.
const m = raw.match(/"pid"\s*:\s*(\d+)/) ?? raw.match(/^(\d+)$/);
if (!m) return null;
const pid = Number.parseInt(m[1], 10);
return Number.isFinite(pid) && pid > 0 ? pid : null;
} catch {
return null;
}
}
/**
* Liveness via signal 0: no signal sent, just an existence/permission check.
* ESRCH → dead; EPERM → alive but owned by another user. Cross-host pids are
* meaningless, but the autopilot lock is same-host by construction.
*/
function isPidAlive(pid: number): boolean {
try {
process.kill(pid, 0);
return true;
} catch (err) {
return (err as NodeJS.ErrnoException).code === "EPERM";
}
}
function defaultProcessRunning(): boolean {
// No reliable pgrep on Windows; rely on the lock-file signal there.
if (process.platform === "win32") return false;
+39
View File
@@ -38,6 +38,45 @@ describe("detectAutopilot", () => {
expect(r.active).toBe(false);
expect(r.signal).toBeNull();
});
// Stale-lock self-heal: a crashed daemon's lock (dead holder pid) must NOT
// wedge syncs forever (observed: dead pid refused --full indefinitely).
const DEAD_PID = 2999999; // above macOS pid_max; vanishingly unlikely elsewhere
test("ignores a STALE lock whose holder pid is dead", () => {
const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-"));
const lock = join(tmp, "autopilot.lock");
fs.writeFileSync(lock, `${DEAD_PID}\n`);
const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => false });
expect(r.active).toBe(false);
expect(r.signal).toBeNull();
});
test("treats a FRESH lock (live holder pid) as active", () => {
const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-"));
const lock = join(tmp, "autopilot.lock");
fs.writeFileSync(lock, String(process.pid)); // the test runner itself is alive
const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => false });
expect(r.active).toBe(true);
expect(r.signal).toContain(`pid ${process.pid}`);
});
test("parses a JSON lock body and ignores it when the pid is dead", () => {
const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-"));
const lock = join(tmp, "autopilot.lock");
fs.writeFileSync(lock, JSON.stringify({ pid: DEAD_PID, started_at: "x" }));
const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => false });
expect(r.active).toBe(false);
});
test("a stale lock does not mask a live autopilot process", () => {
const tmp = fs.mkdtempSync(join(os.tmpdir(), "ap-"));
const lock = join(tmp, "autopilot.lock");
fs.writeFileSync(lock, `${DEAD_PID}`);
const r = detectAutopilot(process.env, { lockPaths: [lock], processRunning: () => true });
expect(r.active).toBe(true);
expect(r.signal).toBe("process:gbrain autopilot");
});
});
// ── #1734 remove safety (E7: fail closed on user-managed without keep-storage) ─