fix(browse): identity-based terminal-agent kill replaces pkill regex

Commit 0 of the v1.44 long-lived-sidebar PR — lays the foundation for the watchdog and removes a latent cross-session footgun.

`pkill -f terminal-agent\.ts` (cli.ts spawn site + server.ts shutdown) matched by argv regex and would kill ANY process whose argv contained the string — sibling gstack sessions on the same host, an editor with the file open, a second `$B connect` run. Identity-based PID kill via a new helper module removes that whole class of bug.

* New `browse/src/terminal-agent-control.ts`: `readAgentRecord`, `writeAgentRecord`, `clearAgentRecord`, `killAgentByRecord`. Validates PID liveness via `isProcessAlive` before signaling (best-effort PID-reuse defense).
* `terminal-agent.ts` writes `<stateDir>/terminal-agent-pid` (JSON `{pid, gen, startedAt}`) at boot; clears it on SIGTERM/SIGINT.
* New per-boot `CURRENT_GEN` (16-byte random); `/internal/*` callers can include `X-Browse-Gen` to defend against split-brain in the upcoming watchdog. Absent header is accepted (backward compat); mismatch returns 409. New `checkInternalAuth` helper centralizes bearer + gen checks.
* New `/internal/healthz` route — agent liveness probe used by the upcoming watchdog (returns pid/gen/sessions, no claude-binary lookup).
* `cli.ts` and `server.ts` both call `killAgentByRecord` instead of pkill.
* `ServerConfig.ownsTerminalAgent` JSDoc updated; the gated teardown now runs 4 side effects (was 3) — adds the new agent-record unlink.

Test changes:

* New `browse/test/terminal-agent-pid-identity.test.ts` — static-grep tripwire that fails CI if any source file re-introduces `pkill ... terminal-agent` or `spawnSync('pkill', ...)`; round-trips write/read/clear; verifies killAgentByRecord no-ops on dead PIDs.
* `browse/test/server-embedder-terminal-port.test.ts` rewritten to intercept `process.kill` (not `child_process.spawnSync`); writes a sentinel agent-record with a guaranteed-dead PID; asserts probe-only (signal 0) calls, no termination signals; verifies all 3 discovery files including the new terminal-agent-pid.

Closes TODOS.md P3 ("Identity-based terminal-agent kill").

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-02 14:35:40 +02:00 · 2026-05-23 18:29:33 -07:00
parent 1d9b9c4cfc
commit 3af07a0c23
8 changed files with 447 additions and 108 deletions
@@ -16,6 +16,7 @@ import { writeSecureFile, mkdirSecure } from './file-permissions';
 import { resolveConfig, ensureStateDir, readVersionHash } from './config';
 import { parseProxyConfig, computeConfigHash, ProxyConfigError } from './proxy-config';
 import { redactProxyUrl } from './proxy-redact';
+import { readAgentRecord, killAgentByRecord, clearAgentRecord } from './terminal-agent-control';

 const config = resolveConfig();
 const IS_WINDOWS = process.platform === 'win32';
@@ -1040,13 +1041,19 @@ Refs:           After 'snapshot', use @e1, @e2... as selectors:
      }
      try {
        if (fs.existsSync(termAgentScript)) {
-          // Kill old terminal-agents so a stale port file can't trick the
-          // server into routing /pty-session at a dead listener.
-          try {
-            const { spawnSync } = require('child_process');
-            spawnSync('pkill', ['-f', 'terminal-agent\\.ts'], { stdio: 'ignore', timeout: 3000 });
-          } catch (err: any) {
-            if (err?.code !== 'ENOENT') throw err;
+          // Kill any stale terminal-agent from a prior run so its port file
+          // can't trick the server into routing /pty-session at a dead
+          // listener. Identity-based (v1.44+) — only kills the PID recorded
+          // in `<stateDir>/terminal-agent-pid`. Pre-v1.44 used
+          // `pkill -f terminal-agent\.ts` which matched sibling gstack
+          // sessions; see terminal-agent-control.ts header for rationale.
+          {
+            const stateDir = path.dirname(config.stateFile);
+            const prior = readAgentRecord(stateDir);
+            if (prior) {
+              killAgentByRecord(prior, 'SIGTERM');
+              clearAgentRecord(stateDir);
+            }
          }
          const termProc = Bun.spawn(['bun', 'run', termAgentScript], {
            cwd: config.projectDir,
@@ -43,6 +43,7 @@ import { inspectElement, modifyStyle, resetModifications, getModificationHistory
 // Bun.spawn used instead of child_process.spawn (compiled bun binaries
 // fail posix_spawn on all executables including /bin/bash)
 import { safeUnlink, safeUnlinkQuiet, safeKill } from './error-handling';
+import { readAgentRecord, killAgentByRecord, clearAgentRecord, agentRecordPath } from './terminal-agent-control';
 import { sanitizeBody, stripLoneSurrogateEscapes } from './sanitize';
 import { startSocksBridge, testUpstream, type BridgeHandle } from './socks-bridge';
 import { parseProxyConfig, toUpstreamConfig, ProxyConfigError } from './proxy-config';
@@ -207,31 +208,34 @@ export interface ServerConfig {
  beforeRoute?: (req: Request, surface: Surface, auth: TokenInfo | null) => Promise<Response | null>;
  /**
   * Whether gstack owns the lifecycle of the terminal-agent process and its
-   * discovery files (`<stateDir>/terminal-port`, `<stateDir>/terminal-internal-token`).
+   * discovery files (`<stateDir>/terminal-port`, `<stateDir>/terminal-internal-token`,
+   * `<stateDir>/terminal-agent-pid`).
   *
-   * When true (default), shutdown() runs three side effects:
-   *   1. `pkill -f terminal-agent\.ts`  — regex-broad, matches ANY process whose
-   *      command line contains `terminal-agent.ts` on this host (including
-   *      sibling gstack sessions). Pre-existing CLI behavior, not introduced by
-   *      this flag. Identity-based PID kill is a separate followup (see TODOS).
+   * When true (default), shutdown() runs four side effects:
+   *   1. Identity-based kill via `killAgentByRecord(readAgentRecord(stateDir))`
+   *      (v1.44+). Only signals the PID recorded by THIS daemon's agent.
+   *      Replaced the historical `pkill -f terminal-agent\.ts` regex that
+   *      matched sibling gstack sessions on the same host — see
+   *      terminal-agent-control.ts for rationale.
   *   2. `safeUnlinkQuiet(<stateDir>/terminal-port)`
   *   3. `safeUnlinkQuiet(<stateDir>/terminal-internal-token)`
+   *   4. `safeUnlinkQuiet(<stateDir>/terminal-agent-pid)` (the v1.44 record)
   *
   * This is correct for gstack's CLI path, which spawns `terminal-agent.ts` as
   * the producer of those files (see cli.ts:1037-1063).
   *
   * Embedders (gbrowser phoenix overlay, future hosts) that run their own PTY
   * server and write those files themselves should pass `false`. When `false`,
-   * the embedder owns BOTH the agent process AND both discovery files —
-   * terminal-agent.ts's own SIGTERM cleanup only removes `terminal-port`
-   * (see terminal-agent.ts:558), so the internal-token file is the embedder's
-   * full responsibility.
+   * the embedder owns BOTH the agent process AND all three discovery files.
+   * Note that terminal-agent.ts's own SIGTERM cleanup removes `terminal-port`
+   * and `terminal-agent-pid` (the agent writes both at boot), so embedders
+   * that pre-launch their own agent must ensure their cleanup matches.
   *
   * Polarity note: this differs from `xvfb?` and `proxyBridge?`, which gate by
   * the *presence* of a caller-owned handle (presence ⇒ don't close). This
   * field gates by an explicit boolean because there is no handle object —
   * the terminal-agent is started elsewhere (cli.ts), and shutdown's only
-   * reference is the regex-based pkill + the file paths.
+   * reference is the PID record + the file paths.
   */
  ownsTerminalAgent?: boolean;
 }
@@ -1319,14 +1323,20 @@ export function buildFetchHandler(cfg: ServerConfig): ServerHandle {

    console.log('[browse] Shutting down...');
    if (ownsTerminalAgent) {
+      // Identity-based kill (v1.44+). Replaces the v1.43- `pkill -f
+      // terminal-agent\.ts` regex teardown which matched sibling gstack
+      // sessions on the same host. Only the PID recorded in
+      // `<stateDir>/terminal-agent-pid` by THIS daemon's agent is signaled.
      try {
-        const { spawnSync } = require('child_process');
-        spawnSync('pkill', ['-f', 'terminal-agent\\.ts'], { stdio: 'ignore', timeout: 3000 });
+        const stateDir = path.dirname(config.stateFile);
+        const record = readAgentRecord(stateDir);
+        if (record) killAgentByRecord(record, 'SIGTERM');
      } catch (err: any) {
        console.warn('[browse] Failed to kill terminal-agent:', err.message);
      }
      safeUnlinkQuiet(path.join(path.dirname(config.stateFile), 'terminal-port'));
      safeUnlinkQuiet(path.join(path.dirname(config.stateFile), 'terminal-internal-token'));
+      safeUnlinkQuiet(agentRecordPath(path.dirname(config.stateFile)));
    }
    try { detachSession(); } catch (err: any) {
      console.warn('[browse] Failed to detach CDP session:', err.message);
@@ -0,0 +1,80 @@
+/**
+ * terminal-agent process-control primitives shared by cli.ts spawn site,
+ * server.ts shutdown teardown, and the v1.44 watchdog/respawn loop.
+ *
+ * Why this exists: pre-v1.44 used `pkill -f terminal-agent\.ts`, which
+ * matches any process whose argv contains the string and would kill
+ * sibling gstack sessions on the same host. The agent now writes a
+ * structured `terminal-agent-pid` record (`{pid, gen, startedAt}`) and
+ * every kill site routes through `killAgentByRecord` here — identity-based,
+ * no regex.
+ *
+ * The `gen` field is a random per-boot generation identifier (not a
+ * counter). Loopback /internal/* calls from the parent server may include
+ * `X-Browse-Gen` so a slow agent that the watchdog respawned around can't
+ * accidentally service a stale grant from the old generation.
+ */
+import * as fs from 'fs';
+import * as path from 'path';
+import { safeUnlink, safeKill, isProcessAlive } from './error-handling';
+import { writeSecureFile, mkdirSecure } from './file-permissions';
+
+export interface AgentRecord {
+  pid: number;
+  /** Random per-boot identifier; /internal/* callers may echo it back as `X-Browse-Gen: <gen>`. */
+  gen: string;
+  /** Record creation time, ms since epoch. Reserved for future PID-reuse guards. */
+  startedAt: number;
+}
+
+export function agentRecordPath(stateDir: string): string {
+  return path.join(stateDir, 'terminal-agent-pid'); // single place that names the record file inside stateDir
+}
+
+/** Read the current record. Returns null when the file is missing, unparseable,
+ *  or fails validation (pid must be a positive safe integer). */
+export function readAgentRecord(stateDir: string): AgentRecord | null {
+  try {
+    const j = JSON.parse(fs.readFileSync(agentRecordPath(stateDir), 'utf-8'));
+    // kill(2) treats pid 0 / negative pids as group or broadcast targets, so reject them here.
+    const pidOk = Number.isSafeInteger(j?.pid) && j.pid > 0;
+    const shapeOk = typeof j?.gen === 'string' && typeof j?.startedAt === 'number';
+    return pidOk && shapeOk ? (j as AgentRecord) : null;
+  } catch {
+    return null;
+  }
+}
+
+/** Write-then-rename so readers never see a partial record. Tries to create stateDir first; mkdirSecure errors are ignored, later errors propagate. */
+export function writeAgentRecord(stateDir: string, record: AgentRecord): void {
+  try { mkdirSecure(stateDir); } catch {}
+  const target = agentRecordPath(stateDir);
+  const tmp = `${target}.tmp-${process.pid}`; // pid suffix gives each writer process its own temp file
+  writeSecureFile(tmp, JSON.stringify(record));
+  fs.renameSync(tmp, target); // publish the fully written temp file under the final name
+}
+
+export function clearAgentRecord(stateDir: string): void {
+  safeUnlink(agentRecordPath(stateDir)); // drop the PID record; used after a kill and by the agent's own signal cleanup
+}
+
+/**
+ * Kill the agent identified by `record`. Signal defaults to SIGTERM (give
+ * the agent a chance to run its own SIGTERM cleanup). Returns false, without
+ * signaling, when the PID is not a positive integer or is not alive;
+ * returns true once the signal has been passed to safeKill.
+ *
+ * The liveness probe only skips PIDs that have already exited; it cannot
+ * detect PID reuse (an unrelated process now holding the recorded PID is
+ * still signaled). `startedAt` is reserved for a future start-time guard.
+ */
+export function killAgentByRecord(
+  record: AgentRecord,
+  signal: NodeJS.Signals = 'SIGTERM',
+): boolean {
+  // kill(2) treats pid 0 and negative pids as group/broadcast targets; never forward them.
+  if (!Number.isSafeInteger(record.pid) || record.pid <= 0) return false;
+  if (!isProcessAlive(record.pid)) return false;
+  safeKill(record.pid, signal);
+  return true;
+}
@@ -25,12 +25,21 @@ import * as path from 'path';
 import * as crypto from 'crypto';
 import { writeSecureFile, mkdirSecure } from './file-permissions';
 import { safeUnlink } from './error-handling';
+import { writeAgentRecord, clearAgentRecord } from './terminal-agent-control';

 const STATE_FILE = process.env.BROWSE_STATE_FILE || path.join(process.env.HOME || '/tmp', '.gstack', 'browse.json');
 const PORT_FILE = path.join(path.dirname(STATE_FILE), 'terminal-port');
 const BROWSE_SERVER_PORT = parseInt(process.env.BROWSE_SERVER_PORT || '0', 10);
 const EXTENSION_ID = process.env.BROWSE_EXTENSION_ID || ''; // optional: tighten Origin check
 const INTERNAL_TOKEN = crypto.randomBytes(32).toString('base64url'); // shared with parent server via env at spawn
+/**
+ * Per-boot generation identifier. Loopback /internal/* callers include
+ * `X-Browse-Gen: <CURRENT_GEN>` so a slow agent the watchdog respawned
+ * around can't service a stale grant from the prior generation. Absent
+ * header means "legacy caller" and is accepted (backward compat); a
+ * present-but-mismatched header returns 409 stale generation.
+ */
+const CURRENT_GEN = crypto.randomBytes(16).toString('base64url');

 // In-memory cookie token registry. Parent posts /internal/grant after
 // /pty-session; we validate WS cookies against this set.
@@ -201,6 +210,27 @@ function disposeSession(session: PtySession): void {
 *
 * Everything else returns 404. The listener binds 127.0.0.1 only.
 */
+/**
+ * Validate a loopback /internal/* request. Returns null when the request
+ * is allowed; otherwise the Response to send back: 403 for a wrong bearer
+ * token, 409 for an X-Browse-Gen that does not match this boot. A missing
+ * X-Browse-Gen marks a legacy caller and is accepted; a present header,
+ * even an empty one, must equal CURRENT_GEN.
+ */
+function checkInternalAuth(req: Request): Response | null {
+  // Constant-time compare so response timing does not leak token bytes.
+  const presented = Buffer.from(req.headers.get('authorization') ?? '');
+  const expected = Buffer.from(`Bearer ${INTERNAL_TOKEN}`);
+  if (presented.length !== expected.length || !crypto.timingSafeEqual(presented, expected)) {
+    return new Response('forbidden', { status: 403 });
+  }
+  const headerGen = req.headers.get('x-browse-gen');
+  if (headerGen !== null && headerGen !== CURRENT_GEN) {
+    return new Response('stale generation', { status: 409 });
+  }
+  return null;
+}
+
 function buildServer() {
  return Bun.serve({
    hostname: '127.0.0.1',
@@ -212,10 +242,8 @@ function buildServer() {

      // /internal/grant — loopback-only handshake from parent server.
      if (url.pathname === '/internal/grant' && req.method === 'POST') {
-        const auth = req.headers.get('authorization');
-        if (auth !== `Bearer ${INTERNAL_TOKEN}`) {
-          return new Response('forbidden', { status: 403 });
-        }
+        const denied = checkInternalAuth(req);
+        if (denied) return denied;
        return req.json().then((body: any) => {
          if (typeof body?.token === 'string' && body.token.length > 16) {
            validTokens.add(body.token);
@@ -226,16 +254,28 @@ function buildServer() {

      // /internal/revoke — drop a token (called on WS close or bootstrap reload)
      if (url.pathname === '/internal/revoke' && req.method === 'POST') {
-        const auth = req.headers.get('authorization');
-        if (auth !== `Bearer ${INTERNAL_TOKEN}`) {
-          return new Response('forbidden', { status: 403 });
-        }
+        const denied = checkInternalAuth(req);
+        if (denied) return denied;
        return req.json().then((body: any) => {
          if (typeof body?.token === 'string') validTokens.delete(body.token);
          return new Response('ok');
        }).catch(() => new Response('bad', { status: 400 }));
      }

+      // /internal/healthz — liveness probe used by the v1.44 watchdog.
+      // Returns this agent's pid + gen + active session count without
+      // touching claude binary lookup (which can fail for non-process
+      // reasons and isn't a useful liveness signal).
+      if (url.pathname === '/internal/healthz' && req.method === 'GET') {
+        const denied = checkInternalAuth(req);
+        if (denied) return denied;
+        return new Response(JSON.stringify({
+          pid: process.pid,
+          gen: CURRENT_GEN,
+          sessions: validTokens.size,
+        }), { status: 200, headers: { 'Content-Type': 'application/json' } });
+      }
+
      // /claude-available — bootstrap card hits this when user clicks "I installed it".
      if (url.pathname === '/claude-available' && req.method === 'GET') {
        writeClaudeAvailable();
@@ -548,14 +588,25 @@ function main() {
  writeSecureFile(tmp, String(port));
  fs.renameSync(tmp, PORT_FILE);

+  // Write identity-based agent record (pid + per-boot gen). Replaces the
+  // v1.43- `pkill -f terminal-agent\.ts` regex teardown that could kill
+  // sibling gstack sessions. Callers (cli.ts spawn site, server.ts
+  // shutdown, the v1.44 watchdog) now route through killAgentByRecord in
+  // terminal-agent-control.ts.
+  writeAgentRecord(dir, { pid: process.pid, gen: CURRENT_GEN, startedAt: Date.now() });
+
  // Hand the parent the internal token so it can call /internal/grant.
  // Parent learns INTERNAL_TOKEN via env (TERMINAL_AGENT_INTERNAL_TOKEN below).
  // We just print it on stdout for the supervising process to pick up if it's
  // not already in env. Defense against env races at spawn time.
-  console.log(`[terminal-agent] listening on 127.0.0.1:${port} pid=${process.pid}`);
+  console.log(`[terminal-agent] listening on 127.0.0.1:${port} pid=${process.pid} gen=${CURRENT_GEN}`);

-  // Cleanup port file on exit.
-  const cleanup = () => { safeUnlink(PORT_FILE); process.exit(0); };
+  // Cleanup port file + agent record on exit.
+  const cleanup = () => {
+    safeUnlink(PORT_FILE);
+    clearAgentRecord(dir);
+    process.exit(0);
+  };
  process.on('SIGTERM', cleanup);
  process.on('SIGINT', cleanup);
 }