feat: session-runner spawns hermetic children + isolation canaries

claude -p children now get the allowlist-scrubbed env, plus
--strict-mcp-config gated on the same hermetic check (EVALS_HERMETIC=0
restores the operator env AND drops the flag, restoring operator MCP).
Two gate-tier canaries make the clean room falsifiable: hermetic-canary
asserts env redirect + scrub + zero MCP servers + nonzero API-key cost
from the Bash tool_result (never model prose); hermetic-sentinel plants a
poisoned operator config (user CLAUDE.md + MCP server) and proves the
child cannot see it. Empirically verified on claude 2.1.175: print mode
needs no seed config (the seed serves the PTY path); the child CLI sets
CLAUDECODE for its own tools, so that scrub is pinned in unit tests, not
E2E. hermetic-env.ts joins GLOBAL_TOUCHFILES.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-06-12 11:13:50 -07:00
parent 2d56961636
commit c3e65b1634
4 changed files with 219 additions and 6 deletions
+9 -5
View File
@@ -131,14 +131,18 @@ export interface SeedConfigOpts {
}
/**
* Minimal $CLAUDE_CONFIG_DIR/.claude.json that gets a fresh-config child past
* first-run prompts non-interactively. Every key here was empirically
* verified against a real ~/.claude.json (2026-06-12, claude 2.1.175):
* Minimal $CLAUDE_CONFIG_DIR/.claude.json for fresh-config children.
*
* Empirically verified 2026-06-12 on claude 2.1.175: PRINT MODE (`claude -p`)
* with ANTHROPIC_API_KEY needs NO seed at all — a fresh empty config dir ran
* non-interactively (exit 0, real cost billed to the key). The seed exists
* for the PTY path, where first-run TUI prompts DO appear:
* - hasCompletedOnboarding: suppresses the onboarding flow
* - customApiKeyResponses.approved: suppresses the "use this API key?"
* prompt; entries are the key's LAST 20 CHARS
* prompt; entries are the key's LAST 20 CHARS (shape verified against a
* real ~/.claude.json)
* - projects[dir].hasTrustDialogAccepted: pre-trusts repo-cwd PTY sessions
* (print mode skips the dialog; PTY plan-mode tests don't)
* (the pty-runner's 15s trust-watcher remains as fallback for temp cwds)
* bypassPermissionsModeAccepted was considered and dropped: absent from a
* real config even though --dangerously-skip-permissions is in daily use.
*/
+9 -1
View File
@@ -10,6 +10,7 @@ import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { getProjectEvalDir } from './eval-store';
import { hermeticChildEnv, isHermeticEnabled } from './hermetic-env';
const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev');
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json'); // heartbeat stays global
@@ -167,6 +168,10 @@ export async function runSkillTest(options: {
'--max-turns', String(maxTurns),
'--allowed-tools', ...allowedTools,
];
// Hermetic children get zero MCP servers (no --mcp-config is passed).
// Gated on the same call-time check as the env scrub so EVALS_HERMETIC=0
// restores operator MCP along with the operator env.
if (isHermeticEnabled()) args.push('--strict-mcp-config');
// Write prompt to a temp file OUTSIDE workingDirectory to avoid race conditions
// where afterAll cleanup deletes the dir before cat reads the file (especially
@@ -176,11 +181,14 @@ export async function runSkillTest(options: {
const proc = Bun.spawn(['sh', '-c', `cat "${promptFile}" | claude ${args.map(a => `"${a}"`).join(' ')}`], {
cwd: workingDirectory,
// Hermetic by default (see test/helpers/hermetic-env.ts): operator
// session context (CONDUCTOR_*, CLAUDECODE, ~/.claude config, ~/.gstack)
// never reaches the child; EVALS_HERMETIC=0 restores the legacy env.
// Default GSTACK_HEADLESS=1 so eval/E2E runs classify as headless (BLOCK on an
// AskUserQuestion failure rather than emit a prose question no human reads). A
// suite exercising the INTERACTIVE prose-fallback path opts out by passing
// `env: { GSTACK_HEADLESS: '' }` — extraEnv wins because it spreads last.
env: { ...process.env, GSTACK_HEADLESS: '1', ...extraEnv },
env: hermeticChildEnv({ GSTACK_HEADLESS: '1', ...extraEnv }),
stdout: 'pipe',
stderr: 'pipe',
});
+11
View File
@@ -36,6 +36,11 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'browse-basic': ['browse/src/**', 'browse/test/test-server.ts'],
'browse-snapshot': ['browse/src/**', 'browse/test/test-server.ts'],
// Hermetic isolation canaries (hermetic-env.ts is also a GLOBAL touchfile;
// these entries exist so the canaries themselves stay tier-classified)
'hermetic-canary': ['test/helpers/hermetic-env.ts', 'test/helpers/session-runner.ts', 'test/skill-e2e-hermetic-canary.test.ts', 'lib/conductor-env-shim.ts'],
'hermetic-sentinel': ['test/helpers/hermetic-env.ts', 'test/helpers/session-runner.ts', 'test/skill-e2e-hermetic-canary.test.ts', 'lib/conductor-env-shim.ts'],
// SKILL.md setup + preamble (depend on ROOT SKILL.md + gen-skill-docs)
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
@@ -437,6 +442,11 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'browse-basic': 'gate',
'browse-snapshot': 'gate',
// Hermetic isolation — gate (deterministic env/config assertions; if the
// clean room breaks, every other eval's signal is contaminated)
'hermetic-canary': 'gate',
'hermetic-sentinel': 'gate',
// SKILL.md setup — gate (if setup breaks, no skill works)
'skillmd-setup-discovery': 'gate',
'skillmd-no-local-binary': 'gate',
@@ -782,6 +792,7 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
*/
export const GLOBAL_TOUCHFILES: string[] = [
  // The runner that all E2E tests go through.
  'test/helpers/session-runner.ts',
  // Alters the environment of every E2E child.
  'test/helpers/hermetic-env.ts',
  // The store where all E2E tests record their results.
  'test/helpers/eval-store.ts',
  // Self-referential entry: a wrong reclassification here is dangerous.
  'test/helpers/touchfiles.ts',
];