feat: gstack-detach — run agent eval/bench jobs in their own session

Long agent-run jobs (30-60 min evals, benchmarks) die when the harness sends
SIGTERM to a background task's process group on turn boundaries / monitor
stops / interruptions (observed: 'script test:gate terminated by signal
SIGTERM'). gstack-detach runs the command in a fresh session (python3
os.setsid, or setsid on Linux, nohup fallback) so a group SIGTERM can't reach
it, and wraps it in caffeinate -i on macOS so idle-sleep can't kill it either.
Returns immediately; caller polls the logfile. Secrets stay in env, never argv.

The guard test pins the contract: the command runs in a different process
group than the caller and outlives the launching shell.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-06-11 23:05:59 -07:00
parent 0f165cbed0
commit d1fc21cbca
2 changed files with 122 additions and 0 deletions
+52
View File
@@ -0,0 +1,52 @@
#!/usr/bin/env bash
# gstack-detach — run a long-running command in its OWN session (a fresh process
# group with no controlling terminal) so a SIGTERM aimed at the launching shell's
# process group can't reach it.
#
# Why this exists: when an AGENT/harness launches a 30-60 min eval as a background
# task, the harness sends SIGTERM ("polite quit") to that task's process group on
# turn boundaries, monitor stops, or interruptions — killing the run mid-flight
# (observed: `script "test:gate" was terminated by signal SIGTERM`). Detaching into
# a new session escapes that group signal. Humans running evals foreground in their
# own terminal don't need this (Ctrl-C is intended); this is for agent-run jobs.
#
# Usage:   gstack-detach <logfile> -- <command> [args...]
#          (the `--` is optional but recommended for clarity)
# Output:  prints `PID <n>  LOG <path>` (two spaces before LOG — the format the
#          guard test matches) and returns immediately. Poll the logfile; the
#          command keeps running independently of this shell.
# Exit:    0 once the command has been launched; 2 when no command is given;
#          1 when the logfile cannot be opened for appending; 126/127 (python3
#          path only) when the command itself cannot be started.
# Secrets: inherited from the environment ONLY. NEVER pass an API key in argv
#          (it would show in `ps`). Export it before calling gstack-detach.
set -euo pipefail

LOG="${1:?usage: gstack-detach <logfile> -- <command...>}"; shift
if [ "${1:-}" = "--" ]; then shift; fi
[ "$#" -ge 1 ] || { echo "gstack-detach: no command given" >&2; exit 2; }

mkdir -p "$(dirname "$LOG")" 2>/dev/null || true
# Fail loudly BEFORE detaching if the log can't be appended to. On the setsid and
# nohup paths the redirect happens inside the backgrounded child, so without this
# check the launcher would print a PID and exit 0 while the run silently never
# started. The subshell keeps a failed redirect from aborting this script.
( : >>"$LOG" ) 2>/dev/null || {
  echo "gstack-detach: cannot write log file: $LOG" >&2
  exit 1
}

# Preferred path: python3 creates the new session (portable; macOS has no setsid)
# and, on macOS, wraps the command in `caffeinate -i` so idle-sleep can't kill a
# long run — a second silent killer for 30-60 min jobs.
if command -v python3 >/dev/null 2>&1; then
  GSTACK_DETACH_LOG="$LOG" exec python3 - "$@" <<'PY'
import os, shutil, subprocess, sys

log = os.environ["GSTACK_DETACH_LOG"]
cmd = sys.argv[1:]
if shutil.which("caffeinate"):  # macOS: block idle-sleep for the run
    cmd = ["caffeinate", "-i", *cmd]

# The new session is created in the CHILD (start_new_session=True => setsid()
# after fork), not in this process. This process replaced the launcher script
# via `exec`, so it may itself be a process-group leader (e.g. started as its
# own job by an interactive shell or a harness), and setsid() fails with EPERM
# for a group leader. A freshly forked child is never a group leader.
with open(log, "ab", buffering=0) as f:
    try:
        p = subprocess.Popen(
            cmd,
            stdout=f,
            stderr=subprocess.STDOUT,
            stdin=subprocess.DEVNULL,
            start_new_session=True,
        )
    except OSError as e:
        # Clean one-line error instead of a traceback; shell-style exit codes.
        print(f"gstack-detach: cannot launch {cmd[0]!r}: {e}", file=sys.stderr)
        sys.exit(127 if isinstance(e, FileNotFoundError) else 126)
print(f"PID {p.pid}  LOG {log}")
PY
fi

# Linux without python3: real setsid.
if command -v setsid >/dev/null 2>&1; then
  setsid sh -c 'exec "$@" >>"$0" 2>&1' "$LOG" "$@" &
  echo "PID $!  LOG $LOG"
  disown 2>/dev/null || true
  exit 0
fi

# Last resort: nohup detaches from SIGHUP (not a group SIGTERM, but better than
# nothing on a minimal box).
nohup sh -c 'exec "$@" >>"$0" 2>&1' "$LOG" "$@" >/dev/null 2>&1 &
echo "PID $!  LOG $LOG"
disown 2>/dev/null || true
+70
View File
@@ -0,0 +1,70 @@
/**
* gstack-detach — the SIGTERM-survival guard.
*
* Proves the wrapper runs its command in a DIFFERENT process group than the
* caller (so a group SIGTERM from the harness can't reach it) and that the
* command outlives the launching shell (returns immediately, completes later).
* This is the regression guard that keeps the eval-killer dead.
*/
import { describe, test, expect } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
const ROOT = path.resolve(import.meta.dir, '..');
const DETACH = path.join(ROOT, 'bin', 'gstack-detach');
/**
 * Returns the process-group id of the current test process as a decimal string,
 * obtained from `ps -o pgid= -p <pid>`.
 *
 * Throws instead of returning an empty string when `ps` cannot be run, exits
 * non-zero, or prints something that is not a number. The detach test asserts
 * that the child's pgid is NOT equal to this value; an empty result would make
 * that assertion pass vacuously and hide a broken guard.
 *
 * @returns the caller's pgid, digits only
 * @throws Error when the pgid cannot be determined
 */
function ownPgid(): string {
  const r = spawnSync('ps', ['-o', 'pgid=', '-p', String(process.pid)], { encoding: 'utf-8' });
  if (r.error) {
    throw new Error(`ownPgid: failed to run ps: ${r.error.message}`);
  }
  const pgid = (r.stdout || '').trim();
  if (r.status !== 0 || !/^\d+$/.test(pgid)) {
    throw new Error(`ownPgid: ps exited with status ${r.status} and unusable output ${JSON.stringify(pgid)}`);
  }
  return pgid;
}
describe('gstack-detach', () => {
  /** Creates a scratch directory, hands it to `body`, and always removes it afterwards. */
  const withScratchDir = (body: (scratch: string) => void): void => {
    const scratch = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-detach-'));
    try {
      body(scratch);
    } finally {
      fs.rmSync(scratch, { recursive: true, force: true });
    }
  };

  /** Checks for `file` every 0.2s until it exists or `budgetMs` has elapsed. */
  const appearsWithin = (file: string, budgetMs: number): boolean => {
    const giveUpAt = Date.now() + budgetMs;
    while (Date.now() < giveUpAt) {
      if (fs.existsSync(file)) {
        return true;
      }
      spawnSync('sleep', ['0.2']);
    }
    return false;
  };

  test('returns immediately and the command keeps running detached', () => {
    withScratchDir((scratch) => {
      const logPath = path.join(scratch, 'run.log');
      const markerPath = path.join(scratch, 'marker');
      const pgidPath = path.join(scratch, 'child.pgid');

      const launchedAt = Date.now();
      // The detached command first records its own process group, then sleeps
      // longer than the launcher takes to return, then drops a marker file. The
      // marker can only appear if the command kept running after the launcher left.
      const script = `ps -o pgid= -p $$ | tr -d ' ' > '${pgidPath}'; sleep 2; echo ok > '${markerPath}'`;
      const launch = spawnSync(DETACH, [logPath, '--', 'bash', '-c', script], {
        encoding: 'utf-8',
        timeout: 10000,
      });
      const launchMs = Date.now() - launchedAt;

      expect(launch.status).toBe(0);
      expect(launch.stdout).toMatch(/PID \d+ {2}LOG /);
      // The launcher must not block: it is back long before the 2s sleep is over.
      expect(launchMs).toBeLessThan(1500);

      // The marker is written only after the launcher has already exited.
      expect(appearsWithin(markerPath, 6000)).toBe(true);

      // A different process group than ours means a SIGTERM sent to our group
      // never reaches the detached command.
      const detachedPgid = fs.readFileSync(pgidPath, 'utf-8').trim();
      expect(detachedPgid).not.toBe('');
      expect(detachedPgid).not.toBe(ownPgid());
    });
  }, 15000);

  test('rejects missing command (exit 2)', () => {
    withScratchDir((scratch) => {
      const launch = spawnSync(DETACH, [path.join(scratch, 'x.log')], { encoding: 'utf-8' });
      expect(launch.status).toBe(2);
    });
  });
});