mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 07:10:12 +02:00
fix(browse): stop the headed daemon crash-loop + silent headless downgrade (#1781)
A headed session against a beacon-heavy page (analytics/extension load) could tip
the single-threaded daemon into a self-inflicted crash-loop: a brief HTTP stall
was read as a crash, the restart didn't clear the dead Chromium's SingletonLock,
the relaunch failed, and the session silently came back headless. Four fixes:
1. Busy-vs-dead (sendCommand): on a connection error, if the process is alive give
/health a bounded probe (3x/250ms) and just retry the command — never kill+restart
a live-but-busy server. A 30s timeout now reports 'busy, not restarting' when the
process is alive instead of exiting into a kill cycle.
2. Profile-lock cleanup on (re)start: startServer reaps the orphaned Chromium holding
the SingletonLock and clears Singleton{Lock,Socket,Cookie} before relaunch, so the
auto-restart path gets the same clean profile the manual connect preamble did.
3. Headed persistence: the restart env reapplies BROWSE_HEADED from this invocation OR
the persisted server state (mode==='headed'), so a restart from a plain command
never downgrades a headed window to invisible headless. Extracted to buildRestartEnv.
4. Force-clean disconnect reaps the Chromium child tree (via the SingletonLock PID) so
the next connect starts clean instead of fighting an orphan.
Plus macOS window surfacing: connect + focus raise 'Google Chrome for Testing' to the
active Space (best-effort osascript) with a Mission Control hint — the first thing
users read as 'I can't see the browser'.
Shared lock helpers (chromiumProfileDir / cleanChromiumProfileLocks / killOrphanChromium)
dedupe the connect, disconnect, and restart paths. browse/test/restart-env.test.ts pins
the headed-persistence decision; the full crash-loop repro is an E2E (periodic).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+135
-41
@@ -211,6 +211,86 @@ function cleanupLegacyState(): void {
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Chromium profile lock helpers (#1781) ─────────────────────
|
||||
/** Profile dir used by headed/connect Chromium sessions. */
|
||||
function chromiumProfileDir(): string {
|
||||
return path.join(process.env.HOME || '/tmp', '.gstack', 'chromium-profile');
|
||||
}
|
||||
|
||||
/** Remove Chromium SingletonLock/Socket/Cookie so a relaunch can acquire the
|
||||
* profile. Safe to call when absent. */
|
||||
function cleanChromiumProfileLocks(profileDir: string = chromiumProfileDir()): void {
|
||||
for (const lockFile of ['SingletonLock', 'SingletonSocket', 'SingletonCookie']) {
|
||||
safeUnlinkQuiet(path.join(profileDir, lockFile));
|
||||
}
|
||||
}
|
||||
|
||||
/** Kill an orphaned Chromium that still holds the profile's SingletonLock. The
|
||||
* lock symlink target is "hostname-PID"; killing that PID tears down its
|
||||
* renderer tree so the next launch starts clean. No-op when absent/stale. */
|
||||
async function killOrphanChromium(profileDir: string = chromiumProfileDir()): Promise<void> {
|
||||
try {
|
||||
const lockTarget = fs.readlinkSync(path.join(profileDir, 'SingletonLock')); // "hostname-12345"
|
||||
const orphanPid = parseInt(lockTarget.split('-').pop() || '', 10);
|
||||
if (orphanPid && isProcessAlive(orphanPid)) {
|
||||
safeKill(orphanPid, 'SIGTERM');
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
if (isProcessAlive(orphanPid)) {
|
||||
safeKill(orphanPid, 'SIGKILL');
|
||||
await new Promise(r => setTimeout(r, 500));
|
||||
}
|
||||
}
|
||||
} catch (err: any) {
|
||||
if (err?.code !== 'ENOENT' && err?.code !== 'EINVAL') throw err;
|
||||
}
|
||||
}
|
||||
|
||||
/** Bounded /health probe. Returns true if the server answers within `attempts`
|
||||
* tries spaced `backoffMs` apart — distinguishes a busy-but-alive daemon from a
|
||||
* dead one (#1781) so a slow server isn't killed and restarted into a crash-loop. */
|
||||
async function probeHealthWithBackoff(port: number, attempts = 3, backoffMs = 250): Promise<boolean> {
|
||||
for (let i = 0; i < attempts; i++) {
|
||||
if (await isServerHealthy(port)) return true;
|
||||
if (i < attempts - 1) await Bun.sleep(backoffMs);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the env for an auto-restart after a crash. headed/proxy/configHash are
|
||||
* reapplied from THIS invocation OR the persisted server state, so a restart
|
||||
* triggered by a plain command (goto/status, no --headed flag) never silently
|
||||
* downgrades a headed session to headless (#1781). Pure + exported for tests.
|
||||
*/
|
||||
export function buildRestartEnv(
|
||||
globalFlags: GlobalFlags | null | undefined,
|
||||
oldState: ServerState | null,
|
||||
): Record<string, string> {
|
||||
const env: Record<string, string> = {};
|
||||
if (globalFlags?.proxyUrl) env.BROWSE_PROXY_URL = globalFlags.proxyUrl;
|
||||
if (globalFlags?.headed || oldState?.mode === 'headed') env.BROWSE_HEADED = '1';
|
||||
const configHash = globalFlags?.configHash || oldState?.configHash;
|
||||
if (configHash) env.BROWSE_CONFIG_HASH = configHash;
|
||||
return env;
|
||||
}
|
||||
|
||||
/** macOS only: pull the headed Chromium window to the user's current Space.
|
||||
* "Google Chrome for Testing" frequently opens behind the active window or on
|
||||
* another Space — the first thing users read as "I can't see the browser"
|
||||
* (#1781). Best-effort, fire-and-forget, never throws. The app name is a fixed
|
||||
* literal (no interpolation). */
|
||||
function raiseHeadedWindowMacOS(): void {
|
||||
if (process.platform !== 'darwin') return;
|
||||
try {
|
||||
nodeSpawn('osascript', ['-e', 'tell application "Google Chrome for Testing" to activate'], {
|
||||
stdio: 'ignore',
|
||||
detached: true,
|
||||
}).unref();
|
||||
} catch {
|
||||
// osascript missing or app not present — non-fatal
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Server Lifecycle ──────────────────────────────────────────
|
||||
async function startServer(extraEnv?: Record<string, string>): Promise<ServerState> {
|
||||
ensureStateDir(config);
|
||||
@@ -219,6 +299,13 @@ async function startServer(extraEnv?: Record<string, string>): Promise<ServerSta
|
||||
safeUnlink(config.stateFile);
|
||||
safeUnlink(path.join(config.stateDir, 'browse-startup-error.log'));
|
||||
|
||||
// #1781: clear a stale Chromium profile lock (and kill the orphan still
|
||||
// holding it) before launch, so an auto-restart after an abrupt kill isn't
|
||||
// blocked by the previous Chromium's SingletonLock — the self-inflicted
|
||||
// crash-loop. Previously only the manual connect preamble did this.
|
||||
await killOrphanChromium();
|
||||
cleanChromiumProfileLocks();
|
||||
|
||||
// Allow the caller to opt out of the parent-process watchdog by setting
|
||||
// BROWSE_PARENT_PID=0 in the environment. Useful for CI, non-interactive
|
||||
// shells, and short-lived Bash invocations that need the server to outlive
|
||||
@@ -486,26 +573,42 @@ async function sendCommand(state: ServerState, command: string, args: string[],
|
||||
}
|
||||
} catch (err: any) {
|
||||
if (err.name === 'AbortError') {
|
||||
console.error('[browse] Command timed out after 30s');
|
||||
// #1781: a 30s timeout on a heavy page usually means busy, not dead.
|
||||
// Don't kill a live server (that's what triggered the crash-loop) — report
|
||||
// and exit so the user can retry rather than losing their (headed) window.
|
||||
const ts = readState();
|
||||
const alive = ts?.pid ? isProcessAlive(ts.pid) : false;
|
||||
console.error(alive
|
||||
? '[browse] Command timed out after 30s (server still alive — busy, not restarting). Retry, or raise load.'
|
||||
: '[browse] Command timed out after 30s');
|
||||
process.exit(1);
|
||||
}
|
||||
// Connection error — server may have crashed
|
||||
// Connection error — server may have crashed, OR may just be busy.
|
||||
if (err.code === 'ECONNREFUSED' || err.code === 'ECONNRESET' || err.message?.includes('fetch failed')) {
|
||||
const oldState = readState();
|
||||
// #1781 busy-vs-dead: a single-threaded daemon under beacon/extension load
|
||||
// can briefly stop answering HTTP while still alive. Before declaring a
|
||||
// crash, if the process is alive give /health a bounded chance to recover
|
||||
// and just retry the command — never kill+restart a live-but-busy server.
|
||||
if (oldState?.pid && isProcessAlive(oldState.pid) && await probeHealthWithBackoff(oldState.port)) {
|
||||
if (retries >= 1) throw new Error('[browse] Server unresponsive after retry — aborting');
|
||||
console.error('[browse] Server was briefly unresponsive (busy); retrying command...');
|
||||
return sendCommand(oldState, command, args, retries + 1);
|
||||
}
|
||||
// Truly dead (or health never recovered) → restart.
|
||||
if (retries >= 1) throw new Error('[browse] Server crashed twice in a row — aborting');
|
||||
console.error('[browse] Server connection lost. Restarting...');
|
||||
// Kill the old server to avoid orphaned chromium processes
|
||||
const oldState = readState();
|
||||
if (oldState && oldState.pid) {
|
||||
await killServer(oldState.pid);
|
||||
}
|
||||
// Reapply --proxy / --headed flags from this invocation when restarting
|
||||
// after a crash. Without this, a proxied daemon that dies mid-command
|
||||
// would silently restart in default direct/headless mode and bypass
|
||||
// the SOCKS bridge.
|
||||
const restartEnv: Record<string, string> = {};
|
||||
if (_globalFlags?.proxyUrl) restartEnv.BROWSE_PROXY_URL = _globalFlags.proxyUrl;
|
||||
if (_globalFlags?.headed) restartEnv.BROWSE_HEADED = '1';
|
||||
if (_globalFlags?.configHash) restartEnv.BROWSE_CONFIG_HASH = _globalFlags.configHash;
|
||||
// startServer() now clears the Chromium SingletonLock + reaps the orphan,
|
||||
// so the relaunch isn't blocked by the dead Chromium's profile lock (#1781).
|
||||
//
|
||||
// Reapply --proxy / --headed when restarting. headed comes from THIS
|
||||
// invocation OR the persisted server mode, so a restart triggered by a
|
||||
// plain command (goto/status, no --headed) never silently downgrades a
|
||||
// headed session to headless (#1781). Same for proxy/configHash.
|
||||
const restartEnv = buildRestartEnv(_globalFlags, oldState);
|
||||
const newState = await startServer(Object.keys(restartEnv).length ? restartEnv : undefined);
|
||||
return sendCommand(newState, command, args, retries + 1);
|
||||
}
|
||||
@@ -966,30 +1069,11 @@ Refs: After 'snapshot', use @e1, @e2... as selectors:
|
||||
}
|
||||
}
|
||||
|
||||
// Kill orphaned Chromium processes that may still hold the profile lock.
|
||||
// The server PID is the Bun process; Chromium is a child that can outlive it
|
||||
// if the server is killed abruptly (SIGKILL, crash, manual rm of state file).
|
||||
const profileDir = path.join(process.env.HOME || '/tmp', '.gstack', 'chromium-profile');
|
||||
try {
|
||||
const singletonLock = path.join(profileDir, 'SingletonLock');
|
||||
const lockTarget = fs.readlinkSync(singletonLock); // e.g. "hostname-12345"
|
||||
const orphanPid = parseInt(lockTarget.split('-').pop() || '', 10);
|
||||
if (orphanPid && isProcessAlive(orphanPid)) {
|
||||
safeKill(orphanPid, 'SIGTERM');
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
if (isProcessAlive(orphanPid)) {
|
||||
safeKill(orphanPid, 'SIGKILL');
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
}
|
||||
}
|
||||
} catch (err: any) {
|
||||
if (err?.code !== 'ENOENT' && err?.code !== 'EINVAL') throw err;
|
||||
}
|
||||
|
||||
// Clean up Chromium profile locks (can persist after crashes)
|
||||
for (const lockFile of ['SingletonLock', 'SingletonSocket', 'SingletonCookie']) {
|
||||
safeUnlinkQuiet(path.join(profileDir, lockFile));
|
||||
}
|
||||
// Kill an orphaned Chromium still holding the profile lock (the Bun server
|
||||
// PID's Chromium child can outlive an abrupt kill/crash), then clear the
|
||||
// lock files so the launch is clean. Shared with the auto-restart path (#1781).
|
||||
await killOrphanChromium();
|
||||
cleanChromiumProfileLocks();
|
||||
|
||||
// Delete stale state file
|
||||
safeUnlinkQuiet(config.stateFile);
|
||||
@@ -1027,6 +1111,11 @@ Refs: After 'snapshot', use @e1, @e2... as selectors:
|
||||
});
|
||||
const status = await resp.text();
|
||||
console.log(`Connected to real Chrome\n${status}`);
|
||||
// #1781: surface the window — it often opens behind/on another Space.
|
||||
raiseHeadedWindowMacOS();
|
||||
if (process.platform === 'darwin') {
|
||||
console.log('(If you still don\'t see it, check Mission Control / other Spaces.)');
|
||||
}
|
||||
|
||||
// sidebar-agent.ts spawn was here. Ripped alongside the chat queue —
|
||||
// the Terminal pane runs an interactive PTY now, no more one-shot
|
||||
@@ -1194,11 +1283,11 @@ Refs: After 'snapshot', use @e1, @e2... as selectors:
|
||||
safeKill(existingState.pid, 'SIGKILL');
|
||||
}
|
||||
}
|
||||
// Clean profile locks and state file
|
||||
const profileDir = path.join(process.env.HOME || '/tmp', '.gstack', 'chromium-profile');
|
||||
for (const lockFile of ['SingletonLock', 'SingletonSocket', 'SingletonCookie']) {
|
||||
safeUnlinkQuiet(path.join(profileDir, lockFile));
|
||||
}
|
||||
// #1781: killing the daemon can orphan its Chromium child tree, which keeps
|
||||
// holding the SingletonLock and makes the next `connect` fail to launch.
|
||||
// Reap the orphan via the lock, then clear the lock files + state.
|
||||
await killOrphanChromium();
|
||||
cleanChromiumProfileLocks();
|
||||
// Xvfb orphan cleanup: if the recorded PID still matches our Xvfb (by
|
||||
// cmdline AND start-time), kill it. PID-only would risk killing a
|
||||
// recycled PID belonging to an unrelated process.
|
||||
@@ -1258,6 +1347,11 @@ Refs: After 'snapshot', use @e1, @e2... as selectors:
|
||||
}
|
||||
|
||||
await sendCommand(state, command, commandArgs);
|
||||
|
||||
// #1781: `focus` means "show me the window". The server-side focus activates
|
||||
// the page via CDP, but on macOS the app can still sit on another Space — pull
|
||||
// it to the user's current Space too.
|
||||
if (command === 'focus') raiseHeadedWindowMacOS();
|
||||
}
|
||||
|
||||
if (import.meta.main) {
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
import { describe, test, expect } from "bun:test";
|
||||
import { buildRestartEnv } from "../src/cli";
|
||||
|
||||
// #1781: an auto-restart triggered by a plain command (no --headed flag) must
|
||||
// NOT silently downgrade a headed session to headless. buildRestartEnv reapplies
|
||||
// headed/proxy/configHash from this invocation OR the persisted server state.
|
||||
describe("buildRestartEnv (#1781 headed persistence)", () => {
|
||||
const headedState = { pid: 1, port: 9, token: "t", startedAt: "", serverPath: "", mode: "headed" as const };
|
||||
const launchedState = { pid: 1, port: 9, token: "t", startedAt: "", serverPath: "", mode: "launched" as const };
|
||||
|
||||
test("headed flag on this invocation → BROWSE_HEADED=1", () => {
|
||||
expect(buildRestartEnv({ headed: true } as any, null).BROWSE_HEADED).toBe("1");
|
||||
});
|
||||
|
||||
test("plain command + persisted headed state → still BROWSE_HEADED=1 (the regression)", () => {
|
||||
const env = buildRestartEnv({} as any, headedState as any);
|
||||
expect(env.BROWSE_HEADED).toBe("1");
|
||||
});
|
||||
|
||||
test("plain command + headless state → no BROWSE_HEADED (no spurious headed)", () => {
|
||||
const env = buildRestartEnv({} as any, launchedState as any);
|
||||
expect(env.BROWSE_HEADED).toBeUndefined();
|
||||
});
|
||||
|
||||
test("nothing set → empty env", () => {
|
||||
expect(buildRestartEnv(null, null)).toEqual({});
|
||||
});
|
||||
|
||||
test("proxy + configHash reapplied from flags", () => {
|
||||
const env = buildRestartEnv({ proxyUrl: "socks5://x", configHash: "abc" } as any, null);
|
||||
expect(env.BROWSE_PROXY_URL).toBe("socks5://x");
|
||||
expect(env.BROWSE_CONFIG_HASH).toBe("abc");
|
||||
});
|
||||
|
||||
test("configHash falls back to persisted state", () => {
|
||||
const env = buildRestartEnv({} as any, { ...launchedState, configHash: "fromstate" } as any);
|
||||
expect(env.BROWSE_CONFIG_HASH).toBe("fromstate");
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user