mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-22 01:30:03 +02:00
feat: harden gstack-detach against all four eval-infra killers
The basic bash detach fixed SIGTERM, but a real run on a shared dev box hit three more killers: cross-worktree API saturation (15-way concurrency x a sibling worktree mass-timed-out the suite), a silent hang (periodic bun died with no exit marker), and shared-/tmp log contamination (a concurrent worktree's agent output bled into the log).

Rewrite as a portable python3 tool that bakes in all four fixes:
- fork + setsid: SIGTERM-proof (own session, survives harness polite-quit)
- caffeinate -i on macOS: no idle-sleep death
- --lock NAME (fcntl, machine-wide): concurrent worktrees SERIALIZE instead of saturating the shared model API
- run-scoped default log (~/.gstack-dev/eval-runs/<label>-<slug>-<branch>-<ts>-<pid>): no cross-worktree collision/contamination
- --timeout watchdog + a guaranteed '### gstack-detach EXIT=<code> ###' sentinel on every terminal path: no silent hang, finished-vs-died always detectable

Guard test pins all four: detached pgid differs + outlives launcher, run-scoped log path, watchdog EXIT=timeout, and lock serialization (second run WAITS).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
+71
-45
@@ -1,13 +1,12 @@
|
||||
/**
|
||||
* gstack-detach — the SIGTERM-survival guard.
|
||||
*
|
||||
* Proves the wrapper runs its command in a DIFFERENT process group than the
|
||||
* caller (so a group SIGTERM from the harness can't reach it) and that the
|
||||
* command outlives the launching shell (returns immediately, completes later).
|
||||
* This is the regression guard that keeps the eval-killer dead.
|
||||
* gstack-detach — the eval-infra robustness guard. Pins the four killer fixes:
|
||||
* 1. SIGTERM-proof detachment (runs in a different process group, outlives the launcher)
|
||||
* 2. run-scoped default log path (no shared-/tmp collision between worktrees)
|
||||
* 3. watchdog --timeout (no silent hang) + guaranteed EXIT sentinel
|
||||
* 4. machine-wide --lock serialization (no cross-worktree API saturation)
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { spawnSync } from 'child_process';
|
||||
import { spawnSync, spawn } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
@@ -16,55 +15,82 @@ const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const DETACH = path.join(ROOT, 'bin', 'gstack-detach');
|
||||
|
||||
/**
 * Process-group id of the current test process, as printed by
 * `ps -o pgid= -p <pid>`.
 *
 * @returns The trimmed `ps` output, or an empty string when `ps` produced no
 *   stdout (for example when it could not be spawned).
 */
function ownPgid(): string {
  const { stdout } = spawnSync('ps', ['-o', 'pgid=', '-p', `${process.pid}`], { encoding: 'utf-8' });
  return stdout ? stdout.trim() : '';
}
|
||||
/**
 * Synchronously polls `pred` until it returns true or `ms` milliseconds pass.
 *
 * The pause between polls is an in-process `Atomics.wait` on a private buffer
 * rather than a spawned `sleep` child: no fork per poll, and no dependency on
 * a `sleep` binary that accepts fractional seconds.
 *
 * @param pred       Condition to poll; called at least once, even when `ms <= 0`.
 * @param ms         Overall time budget in milliseconds.
 * @param intervalMs Pause between polls in milliseconds (default 200). A
 *   non-finite or non-positive value falls back to 200.
 * @returns `true` as soon as `pred()` is true; otherwise the result of one
 *   final `pred()` call made after the budget is exhausted.
 */
function waitFor(pred: () => boolean, ms: number, intervalMs = 200): boolean {
  // Atomics.wait treats a NaN timeout as "wait forever", so sanitize the step.
  const step = Number.isFinite(intervalMs) && intervalMs > 0 ? intervalMs : 200;
  // Nothing ever notifies this cell, so each wait simply times out.
  const gate = new Int32Array(new SharedArrayBuffer(4));
  const end = Date.now() + ms;
  while (Date.now() < end) {
    if (pred()) return true;
    // Never sleep past the deadline; the clamp also keeps the timeout >= 0.
    const remaining = Math.max(0, end - Date.now());
    Atomics.wait(gate, 0, 0, Math.min(step, remaining));
  }
  return pred();
}
|
||||
/**
 * Reports whether the file at `p` currently contains `needle`.
 *
 * Any failure while reading (for example the file not existing yet) counts as
 * "not found", which lets callers poll a log that has not been created.
 *
 * @param p      Path of the file to read as UTF-8.
 * @param needle Substring to look for.
 * @returns `true` if the file was read and includes `needle`, else `false`.
 */
function logHas(p: string, needle: string): boolean {
  try {
    const contents = fs.readFileSync(p, 'utf-8');
    return contents.includes(needle);
  } catch {
    return false;
  }
}
|
||||
|
||||
describe('gstack-detach', () => {
  // Detachment: the launcher must return quickly while the command keeps
  // running in a different process group and still writes the EXIT sentinel.
  test('detaches (different pgid), returns immediately, completes, writes EXIT sentinel', () => {
    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'gd-'));
    const log = path.join(dir, 'run.log');
    try {
      const t0 = Date.now();
      const r = spawnSync(DETACH, ['--log', log, '--', 'bash', '-c', 'sleep 2; echo body-ran'], { encoding: 'utf-8', timeout: 10000 });
      const elapsed = Date.now() - t0;
      expect(r.status).toBe(0);
      expect(r.stdout).toContain(`gstack-detach LOG ${log}`);
      expect(elapsed).toBeLessThan(1500); // non-blocking: well under the body's 2s sleep
      expect(waitFor(() => logHas(log, '### gstack-detach EXIT=0 ###'), 8000)).toBe(true);
      expect(logHas(log, 'body-ran')).toBe(true); // ran to completion after launcher returned

      // ownPgid() yields '' when `ps` fails; without this check the inequality
      // below would pass vacuously and the detachment guard would prove nothing.
      const ourPgid = ownPgid();
      expect(ourPgid).toMatch(/^\d+$/);

      // The run's process group is read from a `pgid=<n>` entry in the log.
      const m = fs.readFileSync(log, 'utf-8').match(/pgid=(\d+)/);
      expect(m).not.toBeNull();
      expect(m?.[1]).not.toBe(ourPgid); // detached into its own group
    } finally {
      fs.rmSync(dir, { recursive: true, force: true });
    }
  }, 15000);

  // Default log path: without --log, the launcher reports a log path under
  // ~/.gstack-dev/eval-runs whose basename carries the label and a numeric suffix.
  test('default log is run-scoped under ~/.gstack-dev/eval-runs (no shared /tmp)', () => {
    const r = spawnSync(DETACH, ['--label', 'unittest', '--', 'true'], { encoding: 'utf-8', timeout: 10000 });
    // stdout is null at runtime when the spawn itself fails, hence the fallback.
    const log = /gstack-detach LOG (\S+)/.exec(r.stdout ?? '')?.[1];
    try {
      expect(r.status).toBe(0);
      if (log === undefined) {
        throw new Error(`launcher did not report a log path; stdout: ${String(r.stdout)}`);
      }
      expect(log).toContain('/.gstack-dev/eval-runs/');
      expect(path.basename(log)).toContain('unittest-');
      expect(path.basename(log)).toMatch(/-\d+\.log$/); // pid-unique
      // Not asserted: only gives the detached run time to finish before the
      // log is deleted in the finally block.
      waitFor(() => logHas(log, '### gstack-detach EXIT=0 ###'), 6000);
    } finally {
      if (log) fs.rmSync(log, { force: true });
    }
  }, 12000);

  // Watchdog: a 60s command under --timeout 1 must end with the timeout
  // sentinel and a WATCHDOG line in the log.
  test('watchdog kills a stalled run and records EXIT=timeout (no silent hang)', () => {
    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'gd-'));
    const log = path.join(dir, 'run.log');
    try {
      spawnSync(DETACH, ['--log', log, '--timeout', '1', '--', 'sleep', '60'], { encoding: 'utf-8', timeout: 10000 });
      expect(waitFor(() => logHas(log, '### gstack-detach EXIT=timeout ###'), 12000)).toBe(true);
      expect(logHas(log, 'WATCHDOG fired')).toBe(true);
    } finally {
      fs.rmSync(dir, { recursive: true, force: true });
    }
  }, 16000);

  // Lock serialization: two runs sharing a --lock name must not overlap.
  test('machine --lock serializes concurrent runs (second WAITS for the first)', () => {
    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'gd-'));
    const lock = `gstack-detach-test-${process.pid}`; // per-process name so parallel suites don't contend
    const logA = path.join(dir, 'a.log');
    const logB = path.join(dir, 'b.log');
    try {
      // First holds the lock for ~3s; second must wait then acquire.
      spawnSync(DETACH, ['--log', logA, '--lock', lock, '--', 'sleep', '3'], { encoding: 'utf-8', timeout: 10000 });
      // Precondition: the first run must actually hold the lock before the
      // second starts, otherwise a missing WAITING line below is meaningless.
      expect(waitFor(() => logHas(logA, 'ACQUIRED'), 4000)).toBe(true);
      spawnSync(DETACH, ['--log', logB, '--lock', lock, '--', 'echo', 'second-ran'], { encoding: 'utf-8', timeout: 10000 });
      // Second should report WAITING (first still holds it) then ACQUIRE after release.
      expect(waitFor(() => logHas(logB, 'WAITING for lock'), 4000)).toBe(true);
      expect(waitFor(() => logHas(logB, '### gstack-detach EXIT=0 ###'), 12000)).toBe(true);
      expect(logHas(logB, 'second-ran')).toBe(true);
    } finally {
      fs.rmSync(dir, { recursive: true, force: true });
      // NOTE(review): presumably where the tool keeps its lock files — confirm
      // against bin/gstack-detach. `force` makes this a no-op if it is absent.
      fs.rmSync(path.join(os.homedir(), '.gstack', 'locks', `${lock}.lock`), { force: true });
    }
  }, 20000);

  // Usage error: options but no command after them must exit with status 2.
  test('rejects missing command (exit 2)', () => {
    const r = spawnSync(DETACH, ['--label', 'x'], { encoding: 'utf-8' });
    expect(r.status).toBe(2);
  });
});
|
||||
|
||||
Reference in New Issue
Block a user