fix(browse): stop the headed daemon crash-loop + silent headless downgrade (#1781)

A headed session against a beacon-heavy page (analytics/extension load) could tip
the single-threaded daemon into a self-inflicted crash-loop: a brief HTTP stall
was read as a crash, the restart didn't clear the dead Chromium's SingletonLock,
the relaunch failed, and the session silently came back headless. Four fixes:

1. Busy-vs-dead (sendCommand): on a connection error, if the process is alive give
   /health a bounded probe (3x/250ms) and just retry the command — never kill+restart
   a live-but-busy server. A 30s timeout now reports 'busy, not restarting' when the
   process is alive instead of exiting into a kill cycle.
2. Profile-lock cleanup on (re)start: startServer reaps the orphaned Chromium holding
   the SingletonLock and clears Singleton{Lock,Socket,Cookie} before relaunch, so the
   auto-restart path gets the same clean profile the manual connect preamble did.
3. Headed persistence: the restart env reapplies BROWSE_HEADED from this invocation OR
   the persisted server state (mode==='headed'), so a restart from a plain command
   never downgrades a headed window to invisible headless. Extracted to buildRestartEnv.
4. Force-clean disconnect reaps the Chromium child tree (via the SingletonLock PID) so
   the next connect starts clean instead of fighting an orphan.

Plus macOS window surfacing: connect + focus raise 'Google Chrome for Testing' to the
active Space (best-effort osascript) with a Mission Control hint — the first thing
users read as 'I can't see the browser'.

Shared lock helpers (chromiumProfileDir / cleanChromiumProfileLocks / killOrphanChromium)
dedupe the connect, disconnect, and restart paths. browse/test/restart-env.test.ts pins
the headed-persistence decision; the full crash-loop repro is an E2E (periodic).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-05-30 10:53:28 -07:00
parent c54ece27a4
commit 30458df38f
2 changed files with 174 additions and 41 deletions
+135 -41
View File
@@ -211,6 +211,86 @@ function cleanupLegacyState(): void {
}
}
// ─── Chromium profile lock helpers (#1781) ─────────────────────
/** Profile dir used by headed/connect Chromium sessions. */
function chromiumProfileDir(): string {
return path.join(process.env.HOME || '/tmp', '.gstack', 'chromium-profile');
}
/** Remove Chromium SingletonLock/Socket/Cookie so a relaunch can acquire the
* profile. Safe to call when absent. */
function cleanChromiumProfileLocks(profileDir: string = chromiumProfileDir()): void {
for (const lockFile of ['SingletonLock', 'SingletonSocket', 'SingletonCookie']) {
safeUnlinkQuiet(path.join(profileDir, lockFile));
}
}
/** Kill an orphaned Chromium that still holds the profile's SingletonLock. The
* lock symlink target is "hostname-PID"; killing that PID tears down its
* renderer tree so the next launch starts clean. No-op when absent/stale. */
async function killOrphanChromium(profileDir: string = chromiumProfileDir()): Promise<void> {
try {
const lockTarget = fs.readlinkSync(path.join(profileDir, 'SingletonLock')); // "hostname-12345"
const orphanPid = parseInt(lockTarget.split('-').pop() || '', 10);
if (orphanPid && isProcessAlive(orphanPid)) {
safeKill(orphanPid, 'SIGTERM');
await new Promise(r => setTimeout(r, 1000));
if (isProcessAlive(orphanPid)) {
safeKill(orphanPid, 'SIGKILL');
await new Promise(r => setTimeout(r, 500));
}
}
} catch (err: any) {
if (err?.code !== 'ENOENT' && err?.code !== 'EINVAL') throw err;
}
}
/** Bounded /health probe. Returns true if the server answers within `attempts`
* tries spaced `backoffMs` apart — distinguishes a busy-but-alive daemon from a
* dead one (#1781) so a slow server isn't killed and restarted into a crash-loop. */
async function probeHealthWithBackoff(port: number, attempts = 3, backoffMs = 250): Promise<boolean> {
for (let i = 0; i < attempts; i++) {
if (await isServerHealthy(port)) return true;
if (i < attempts - 1) await Bun.sleep(backoffMs);
}
return false;
}
/**
* Build the env for an auto-restart after a crash. headed/proxy/configHash are
* reapplied from THIS invocation OR the persisted server state, so a restart
* triggered by a plain command (goto/status, no --headed flag) never silently
* downgrades a headed session to headless (#1781). Pure + exported for tests.
*/
export function buildRestartEnv(
globalFlags: GlobalFlags | null | undefined,
oldState: ServerState | null,
): Record<string, string> {
const env: Record<string, string> = {};
if (globalFlags?.proxyUrl) env.BROWSE_PROXY_URL = globalFlags.proxyUrl;
if (globalFlags?.headed || oldState?.mode === 'headed') env.BROWSE_HEADED = '1';
const configHash = globalFlags?.configHash || oldState?.configHash;
if (configHash) env.BROWSE_CONFIG_HASH = configHash;
return env;
}
/** macOS only: pull the headed Chromium window to the user's current Space.
* "Google Chrome for Testing" frequently opens behind the active window or on
* another Space — the first thing users read as "I can't see the browser"
* (#1781). Best-effort, fire-and-forget, never throws. The app name is a fixed
* literal (no interpolation). */
function raiseHeadedWindowMacOS(): void {
if (process.platform !== 'darwin') return;
try {
nodeSpawn('osascript', ['-e', 'tell application "Google Chrome for Testing" to activate'], {
stdio: 'ignore',
detached: true,
}).unref();
} catch {
// osascript missing or app not present — non-fatal
}
}
// ─── Server Lifecycle ──────────────────────────────────────────
async function startServer(extraEnv?: Record<string, string>): Promise<ServerState> {
ensureStateDir(config);
@@ -219,6 +299,13 @@ async function startServer(extraEnv?: Record<string, string>): Promise<ServerSta
safeUnlink(config.stateFile);
safeUnlink(path.join(config.stateDir, 'browse-startup-error.log'));
// #1781: clear a stale Chromium profile lock (and kill the orphan still
// holding it) before launch, so an auto-restart after an abrupt kill isn't
// blocked by the previous Chromium's SingletonLock — the self-inflicted
// crash-loop. Previously only the manual connect preamble did this.
await killOrphanChromium();
cleanChromiumProfileLocks();
// Allow the caller to opt out of the parent-process watchdog by setting
// BROWSE_PARENT_PID=0 in the environment. Useful for CI, non-interactive
// shells, and short-lived Bash invocations that need the server to outlive
@@ -486,26 +573,42 @@ async function sendCommand(state: ServerState, command: string, args: string[],
}
} catch (err: any) {
if (err.name === 'AbortError') {
console.error('[browse] Command timed out after 30s');
// #1781: a 30s timeout on a heavy page usually means busy, not dead.
// Don't kill a live server (that's what triggered the crash-loop) — report
// and exit so the user can retry rather than losing their (headed) window.
const ts = readState();
const alive = ts?.pid ? isProcessAlive(ts.pid) : false;
console.error(alive
? '[browse] Command timed out after 30s (server still alive — busy, not restarting). Retry, or raise load.'
: '[browse] Command timed out after 30s');
process.exit(1);
}
// Connection error — server may have crashed
// Connection error — server may have crashed, OR may just be busy.
if (err.code === 'ECONNREFUSED' || err.code === 'ECONNRESET' || err.message?.includes('fetch failed')) {
const oldState = readState();
// #1781 busy-vs-dead: a single-threaded daemon under beacon/extension load
// can briefly stop answering HTTP while still alive. Before declaring a
// crash, if the process is alive give /health a bounded chance to recover
// and just retry the command — never kill+restart a live-but-busy server.
if (oldState?.pid && isProcessAlive(oldState.pid) && await probeHealthWithBackoff(oldState.port)) {
if (retries >= 1) throw new Error('[browse] Server unresponsive after retry — aborting');
console.error('[browse] Server was briefly unresponsive (busy); retrying command...');
return sendCommand(oldState, command, args, retries + 1);
}
// Truly dead (or health never recovered) → restart.
if (retries >= 1) throw new Error('[browse] Server crashed twice in a row — aborting');
console.error('[browse] Server connection lost. Restarting...');
// Kill the old server to avoid orphaned chromium processes
const oldState = readState();
if (oldState && oldState.pid) {
await killServer(oldState.pid);
}
// Reapply --proxy / --headed flags from this invocation when restarting
// after a crash. Without this, a proxied daemon that dies mid-command
// would silently restart in default direct/headless mode and bypass
// the SOCKS bridge.
const restartEnv: Record<string, string> = {};
if (_globalFlags?.proxyUrl) restartEnv.BROWSE_PROXY_URL = _globalFlags.proxyUrl;
if (_globalFlags?.headed) restartEnv.BROWSE_HEADED = '1';
if (_globalFlags?.configHash) restartEnv.BROWSE_CONFIG_HASH = _globalFlags.configHash;
// startServer() now clears the Chromium SingletonLock + reaps the orphan,
// so the relaunch isn't blocked by the dead Chromium's profile lock (#1781).
//
// Reapply --proxy / --headed when restarting. headed comes from THIS
// invocation OR the persisted server mode, so a restart triggered by a
// plain command (goto/status, no --headed) never silently downgrades a
// headed session to headless (#1781). Same for proxy/configHash.
const restartEnv = buildRestartEnv(_globalFlags, oldState);
const newState = await startServer(Object.keys(restartEnv).length ? restartEnv : undefined);
return sendCommand(newState, command, args, retries + 1);
}
@@ -966,30 +1069,11 @@ Refs: After 'snapshot', use @e1, @e2... as selectors:
}
}
// Kill orphaned Chromium processes that may still hold the profile lock.
// The server PID is the Bun process; Chromium is a child that can outlive it
// if the server is killed abruptly (SIGKILL, crash, manual rm of state file).
const profileDir = path.join(process.env.HOME || '/tmp', '.gstack', 'chromium-profile');
try {
const singletonLock = path.join(profileDir, 'SingletonLock');
const lockTarget = fs.readlinkSync(singletonLock); // e.g. "hostname-12345"
const orphanPid = parseInt(lockTarget.split('-').pop() || '', 10);
if (orphanPid && isProcessAlive(orphanPid)) {
safeKill(orphanPid, 'SIGTERM');
await new Promise(resolve => setTimeout(resolve, 1000));
if (isProcessAlive(orphanPid)) {
safeKill(orphanPid, 'SIGKILL');
await new Promise(resolve => setTimeout(resolve, 500));
}
}
} catch (err: any) {
if (err?.code !== 'ENOENT' && err?.code !== 'EINVAL') throw err;
}
// Clean up Chromium profile locks (can persist after crashes)
for (const lockFile of ['SingletonLock', 'SingletonSocket', 'SingletonCookie']) {
safeUnlinkQuiet(path.join(profileDir, lockFile));
}
// Kill an orphaned Chromium still holding the profile lock (the Bun server
// PID's Chromium child can outlive an abrupt kill/crash), then clear the
// lock files so the launch is clean. Shared with the auto-restart path (#1781).
await killOrphanChromium();
cleanChromiumProfileLocks();
// Delete stale state file
safeUnlinkQuiet(config.stateFile);
@@ -1027,6 +1111,11 @@ Refs: After 'snapshot', use @e1, @e2... as selectors:
});
const status = await resp.text();
console.log(`Connected to real Chrome\n${status}`);
// #1781: surface the window — it often opens behind/on another Space.
raiseHeadedWindowMacOS();
if (process.platform === 'darwin') {
console.log('(If you still don\'t see it, check Mission Control / other Spaces.)');
}
// sidebar-agent.ts spawn was here. Ripped alongside the chat queue —
// the Terminal pane runs an interactive PTY now, no more one-shot
@@ -1194,11 +1283,11 @@ Refs: After 'snapshot', use @e1, @e2... as selectors:
safeKill(existingState.pid, 'SIGKILL');
}
}
// Clean profile locks and state file
const profileDir = path.join(process.env.HOME || '/tmp', '.gstack', 'chromium-profile');
for (const lockFile of ['SingletonLock', 'SingletonSocket', 'SingletonCookie']) {
safeUnlinkQuiet(path.join(profileDir, lockFile));
}
// #1781: killing the daemon can orphan its Chromium child tree, which keeps
// holding the SingletonLock and makes the next `connect` fail to launch.
// Reap the orphan via the lock, then clear the lock files + state.
await killOrphanChromium();
cleanChromiumProfileLocks();
// Xvfb orphan cleanup: if the recorded PID still matches our Xvfb (by
// cmdline AND start-time), kill it. PID-only would risk killing a
// recycled PID belonging to an unrelated process.
@@ -1258,6 +1347,11 @@ Refs: After 'snapshot', use @e1, @e2... as selectors:
}
await sendCommand(state, command, commandArgs);
// #1781: `focus` means "show me the window". The server-side focus activates
// the page via CDP, but on macOS the app can still sit on another Space — pull
// it to the user's current Space too.
if (command === 'focus') raiseHeadedWindowMacOS();
}
if (import.meta.main) {
+39
View File
@@ -0,0 +1,39 @@
import { describe, test, expect } from "bun:test";
import { buildRestartEnv } from "../src/cli";
// #1781: an auto-restart triggered by a plain command (no --headed flag) must
// NOT silently downgrade a headed session to headless. buildRestartEnv reapplies
// headed/proxy/configHash from this invocation OR the persisted server state.
describe("buildRestartEnv (#1781 headed persistence)", () => {
const headedState = { pid: 1, port: 9, token: "t", startedAt: "", serverPath: "", mode: "headed" as const };
const launchedState = { pid: 1, port: 9, token: "t", startedAt: "", serverPath: "", mode: "launched" as const };
test("headed flag on this invocation → BROWSE_HEADED=1", () => {
expect(buildRestartEnv({ headed: true } as any, null).BROWSE_HEADED).toBe("1");
});
test("plain command + persisted headed state → still BROWSE_HEADED=1 (the regression)", () => {
const env = buildRestartEnv({} as any, headedState as any);
expect(env.BROWSE_HEADED).toBe("1");
});
test("plain command + headless state → no BROWSE_HEADED (no spurious headed)", () => {
const env = buildRestartEnv({} as any, launchedState as any);
expect(env.BROWSE_HEADED).toBeUndefined();
});
test("nothing set → empty env", () => {
expect(buildRestartEnv(null, null)).toEqual({});
});
test("proxy + configHash reapplied from flags", () => {
const env = buildRestartEnv({ proxyUrl: "socks5://x", configHash: "abc" } as any, null);
expect(env.BROWSE_PROXY_URL).toBe("socks5://x");
expect(env.BROWSE_CONFIG_HASH).toBe("abc");
});
test("configHash falls back to persisted state", () => {
const env = buildRestartEnv({} as any, { ...launchedState, configHash: "fromstate" } as any);
expect(env.BROWSE_CONFIG_HASH).toBe("fromstate");
});
});