mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-19 00:00:13 +02:00
feat(gbrain-sync): self-heal stale autopilot lock (dead-pid)
detectAutopilot treated a lock FILE as proof of life, so a crashed gbrain daemon left a stale lock that wedged every sync forever (observed: a dead pid refused --full indefinitely). Now read the holder pid (bare or JSON body) and check liveness via signal-0: ESRCH=dead → ignore the stale signal and keep checking; EPERM=alive (other user) → active. A stale lock never masks a live autopilot process. Pure decision function — does not delete the file; the caller may clean it. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+43
-2
@@ -29,7 +29,7 @@
|
||||
*/
|
||||
|
||||
import { spawnSync } from "child_process";
|
||||
import { existsSync, realpathSync } from "fs";
|
||||
import { existsSync, realpathSync, readFileSync } from "fs";
|
||||
import { homedir } from "os";
|
||||
import { join, resolve, sep } from "path";
|
||||
import { execGbrainJson, execGbrainText, NEEDS_SHELL_ON_WINDOWS } from "./gbrain-exec";
|
||||
@@ -92,7 +92,20 @@ export function detectAutopilot(
|
||||
join(homedir(), ".gbrain", "autopilot.pid"),
|
||||
];
|
||||
for (const lp of lockPaths) {
|
||||
if (existsSync(lp)) return { active: true, signal: `lock:${lp}` };
|
||||
if (!existsSync(lp)) continue;
|
||||
// A lock FILE alone is not proof of life — a crashed daemon leaves a stale
|
||||
// lock that would otherwise wedge every sync forever (observed: a dead pid
|
||||
// refused --full indefinitely). Read the holder pid and check liveness.
|
||||
const pid = readLockPid(lp);
|
||||
if (pid === null) {
|
||||
// Can't introspect (no parseable pid) → stay conservative: treat as active.
|
||||
return { active: true, signal: `lock:${lp}` };
|
||||
}
|
||||
if (isPidAlive(pid)) {
|
||||
return { active: true, signal: `lock:${lp} (pid ${pid})` };
|
||||
}
|
||||
// Stale lock (holder pid is dead): ignore this signal, keep checking. Pure
|
||||
// decision function — we do NOT delete the file here; the caller may clean it.
|
||||
}
|
||||
// Primary signal: a live `gbrain autopilot` process.
|
||||
const running = (probe.processRunning ?? defaultProcessRunning)();
|
||||
@@ -100,6 +113,34 @@ export function detectAutopilot(
|
||||
return { active: false, signal: null };
|
||||
}
|
||||
|
||||
/**
 * Read the holder pid from a lock/pid file.
 *
 * Two layouts are recognised: a bare pid ("65495") or a JSON-style body that
 * carries a numeric "pid" field ({"pid":65495,...}).
 *
 * @param lockPath - Path of the lock/pid file to inspect.
 * @returns The pid, or null when the file cannot be read, contains no integer
 *   pid, or contains a value that cannot be a real pid (zero, or larger than
 *   a signed 32-bit integer).
 */
function readLockPid(lockPath: string): number | null {
  // Node's process.kill() rejects pids that do not fit a signed 32-bit integer
  // with an argument error rather than ESRCH/EPERM. Reporting such a value as
  // "no parseable pid" keeps a corrupt lock file from being mistaken for a
  // lock whose holder has died.
  const maxPid = 0x7fffffff;

  let raw: string;
  try {
    raw = readFileSync(lockPath, "utf-8").trim();
  } catch {
    // Unreadable (vanished, permission denied, not a regular file): no pid.
    return null;
  }

  // Prefer the JSON "pid" field; otherwise the whole file must be digits only.
  const match = raw.match(/"pid"\s*:\s*(\d+)/) ?? raw.match(/^(\d+)$/);
  const digits = match?.[1];
  if (digits === undefined) return null;

  const pid = Number.parseInt(digits, 10);
  return pid > 0 && pid <= maxPid ? pid : null;
}
|
||||
|
||||
/**
 * Liveness probe via signal 0: no signal is delivered, the call only performs
 * an existence/permission check on the target pid.
 *
 * ESRCH → dead; EPERM → alive but owned by another user. Cross-host pids are
 * meaningless, but the autopilot lock is same-host by construction.
 *
 * @param pid - Process id to probe; must be a positive integer to be
 *   considered at all.
 * @returns true when the process exists (including when it exists but may not
 *   be signalled by this user), false otherwise.
 */
function isPidAlive(pid: number): boolean {
  // pid 0 and negative pids address process groups / "every process" in
  // kill(2), so the probe would succeed without saying anything about a
  // single holder process. Non-integers are not pids at all.
  if (!Number.isInteger(pid) || pid <= 0) return false;

  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // Only EPERM proves the process exists; any other failure counts as dead.
    return (err as NodeJS.ErrnoException).code === "EPERM";
  }
}
|
||||
|
||||
function defaultProcessRunning(): boolean {
|
||||
// No reliable pgrep on Windows; rely on the lock-file signal there.
|
||||
if (process.platform === "win32") return false;
|
||||
|
||||
Reference in New Issue
Block a user