Files
gstack/design/src/daemon-state.ts
T
Garry Tan e3c235ae5c feat(design): daemon-client with lock + identity-verified spawn
design/src/daemon-client.ts implements the CLI side of the daemon lifecycle:
ensureDaemon() (the spawn-or-attach decision), publishBoard(), and the
$D daemon stop|status helpers.

Modeled on browse/src/cli.ts:317-415 — same health-check-first attach,
same fs.openSync('wx') lock, same re-read-state-INSIDE-the-lock guard
against two CLIs both deciding "no daemon, spawn." Two design-specific
safety properties added beyond browse:

1. verifyIdentity before any SIGTERM/SIGKILL. Reads the running process's
   cmdline (/proc/PID/cmdline on Linux, `ps -p PID -o command=` on macOS)
   and only signals if it contains CMDLINE_MARKER ("gstack-design-daemon",
   passed as argv at spawn time). Prevents a stale state file from
   causing us to kill an unrelated process that inherited the PID.

2. Refuse-kill-with-active-boards on version mismatch. Browse silently
   restarts; here in-memory board history would vanish, so the client
   prints a user-actionable WARNING and exit 1 instead. Users explicitly
   `$D daemon stop` to override.

Spawn uses Node child_process.spawn (NOT Bun.spawn().unref) because of
the macOS session-detach quirks browse already discovered. Stdio is
redirected to ~/.gstack/design-daemon-startup.log, which the client
tails into stderr if waitForHealthOrError times out — no more silent
"daemon failed for some unknowable reason."

daemon-state.ts gains DESIGN_DAEMON_STATE_FILE env override so tests
can point both client and spawned daemon at a per-test path without a
shared cwd.

design/test/daemon-discovery.test.ts: 17 tests, all green in ~8s. Covers:
spawn-fresh, attach-existing, stale-state-file (pid dead), PID-reuse
safety (uses the test runner's own PID as the bait — verifyIdentity
catches the cmdline mismatch, daemon not signaled), version-mismatch
with/without active boards (the active-boards case runs a subprocess
and asserts exit 1 + WARNING in stderr), publishBoard 200 + 409,
shutdownDaemon refuse/force/unresponsive paths, daemonStatus.

The daemon-discovery suite is split out of daemon.test.ts because each
real spawn costs ~200ms; the in-process daemon.test.ts (30 tests, 70ms)
covers the same handler logic without the spawn overhead.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 15:26:07 -07:00

190 lines
5.7 KiB
TypeScript

/**
* Pure utilities for design-daemon discovery.
*
* Shared between daemon.ts (writes/removes the state file) and
* daemon-client.ts (reads state, decides spawn-vs-attach). Mirrors
* browse/src/cli.ts:109-315 — same atomic-write + fs.openSync 'wx' lock
* pattern, with an added cmdline-based identity check to guard against
* SIGTERM hitting a reused PID (Codex finding on the daemon plan).
*/
import { execFileSync } from "child_process";
import fs from "fs";
import os from "os";
import path from "path";
export interface DaemonState {
pid: number;
port: number;
startedAt: string; // ISO 8601
version: string;
serverPath: string;
cmdlineMarker: string;
}
// String we grep for in the spawned daemon's cmdline to confirm a pid is
// ours before sending any signal. Must appear in argv at spawn time.
export const CMDLINE_MARKER = "gstack-design-daemon";
export function resolveStateFilePath(): string {
// Env override has highest precedence so tests can point both client and
// spawned daemon at a per-test path without a shared cwd.
const envOverride = process.env.DESIGN_DAEMON_STATE_FILE;
if (envOverride) return envOverride;
try {
const root = execFileSync("git", ["rev-parse", "--show-toplevel"], {
encoding: "utf8",
stdio: ["ignore", "pipe", "ignore"],
}).trim();
if (root) return path.join(root, ".gstack", "design.json");
} catch {
// not in a git repo — fall through
}
return path.join(process.cwd(), ".gstack", "design.json");
}
export function resolveLockFilePath(stateFile: string = resolveStateFilePath()): string {
return `${stateFile}.lock`;
}
export function resolveDaemonLogPath(): string {
return path.join(os.homedir(), ".gstack", "design-daemon.log");
}
export function resolveStartupLogPath(): string {
return path.join(os.homedir(), ".gstack", "design-daemon-startup.log");
}
export function readStateFile(stateFile: string = resolveStateFilePath()): DaemonState | null {
try {
return JSON.parse(fs.readFileSync(stateFile, "utf-8")) as DaemonState;
} catch {
return null;
}
}
export function writeStateFile(
state: DaemonState,
stateFile: string = resolveStateFilePath(),
): void {
fs.mkdirSync(path.dirname(stateFile), { recursive: true });
const tmp = `${stateFile}.tmp.${process.pid}.${Math.random().toString(36).slice(2)}`;
fs.writeFileSync(tmp, JSON.stringify(state, null, 2), { mode: 0o600 });
fs.renameSync(tmp, stateFile);
}
export function removeStateFile(stateFile: string = resolveStateFilePath()): void {
try {
fs.unlinkSync(stateFile);
} catch {
// already gone
}
}
export interface HealthOk {
ok: true;
version: string;
uptime: number;
boards: number;
activeBoards: number;
}
export async function healthCheck(
port: number,
timeoutMs: number = 2000,
): Promise<HealthOk | null> {
try {
const resp = await fetch(`http://127.0.0.1:${port}/health`, {
signal: AbortSignal.timeout(timeoutMs),
});
if (!resp.ok) return null;
const body = (await resp.json()) as Partial<HealthOk> | null;
if (body && body.ok === true && typeof body.version === "string") {
return body as HealthOk;
}
return null;
} catch {
return null;
}
}
export function isProcessAlive(pid: number): boolean {
if (!pid || pid <= 0) return false;
try {
process.kill(pid, 0);
return true;
} catch (e: unknown) {
// EPERM means it exists, we just can't signal it. ESRCH means it's gone.
const code = (e as NodeJS.ErrnoException | undefined)?.code;
return code === "EPERM";
}
}
/**
* Read the cmdline of a running process. Returns "" on any error.
* Linux: /proc/<pid>/cmdline (NUL-separated argv). macOS: `ps -p PID -o command=`.
*/
export function readCmdline(pid: number): string {
if (!isProcessAlive(pid)) return "";
try {
if (process.platform === "linux") {
const raw = fs.readFileSync(`/proc/${pid}/cmdline`, "utf-8");
return raw.replace(/\0/g, " ").trim();
}
if (process.platform === "darwin") {
return execFileSync("ps", ["-p", String(pid), "-o", "command="], {
encoding: "utf8",
stdio: ["ignore", "pipe", "ignore"],
}).trim();
}
return "";
} catch {
return "";
}
}
/**
* True only when the process at `pid` has `marker` in its cmdline. Used to
* avoid SIGTERMing an unrelated process that happens to have inherited a
* PID from a stale state file (the Codex PID-reuse concern). On systems
* where readCmdline is unsupported (or fails), this returns false — safer
* to skip the signal than to risk killing the wrong process.
*/
export function verifyIdentity(pid: number, marker: string): boolean {
if (!marker) return false;
return readCmdline(pid).includes(marker);
}
/**
* Acquire an exclusive lock on `lockPath`. Returns a release function, or
* null if held by another live process. Stale locks (PID dead) are reclaimed
* once; if reclaim also fails the caller waits and retries via state re-read.
*/
export function acquireLock(lockPath: string): (() => void) | null {
try {
fs.mkdirSync(path.dirname(lockPath), { recursive: true });
// 'wx' = create exclusive, fail if exists. Atomic check-and-create.
const fd = fs.openSync(lockPath, "wx");
fs.writeSync(fd, `${process.pid}\n`);
fs.closeSync(fd);
return () => {
try {
fs.unlinkSync(lockPath);
} catch {
// already gone
}
};
} catch {
// Held — check if holder is alive
try {
const holderPid = parseInt(fs.readFileSync(lockPath, "utf-8").trim(), 10);
if (holderPid && isProcessAlive(holderPid)) return null;
// Stale, reclaim
fs.unlinkSync(lockPath);
return acquireLock(lockPath);
} catch {
return null;
}
}
}