diff --git a/test/helpers/hermetic-env.ts b/test/helpers/hermetic-env.ts index dbaa49b51..c883efc22 100644 --- a/test/helpers/hermetic-env.ts +++ b/test/helpers/hermetic-env.ts @@ -131,14 +131,18 @@ export interface SeedConfigOpts { } /** - * Minimal $CLAUDE_CONFIG_DIR/.claude.json that gets a fresh-config child past - * first-run prompts non-interactively. Every key here was empirically - * verified against a real ~/.claude.json (2026-06-12, claude 2.1.175): + * Minimal $CLAUDE_CONFIG_DIR/.claude.json for fresh-config children. + * + * Empirically verified 2026-06-12 on claude 2.1.175: PRINT MODE (`claude -p`) + * with ANTHROPIC_API_KEY needs NO seed at all — a fresh empty config dir ran + * non-interactively (exit 0, real cost billed to the key). The seed exists + * for the PTY path, where first-run TUI prompts DO appear: * - hasCompletedOnboarding: suppresses the onboarding flow * - customApiKeyResponses.approved: suppresses the "use this API key?" - * prompt; entries are the key's LAST 20 CHARS + * prompt; entries are the key's LAST 20 CHARS (shape verified against a + * real ~/.claude.json) * - projects[dir].hasTrustDialogAccepted: pre-trusts repo-cwd PTY sessions - * (print mode skips the dialog; PTY plan-mode tests don't) + * (the pty-runner's 15s trust-watcher remains as fallback for temp cwds) * bypassPermissionsModeAccepted was considered and dropped: absent from a * real config even though --dangerously-skip-permissions is in daily use. 
*/ diff --git a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts index 675255cf2..c29c59022 100644 --- a/test/helpers/session-runner.ts +++ b/test/helpers/session-runner.ts @@ -10,6 +10,7 @@ import * as fs from 'fs'; import * as path from 'path'; import * as os from 'os'; import { getProjectEvalDir } from './eval-store'; +import { hermeticChildEnv, isHermeticEnabled } from './hermetic-env'; const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev'); const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json'); // heartbeat stays global @@ -167,6 +168,10 @@ export async function runSkillTest(options: { '--max-turns', String(maxTurns), '--allowed-tools', ...allowedTools, ]; + // Hermetic children get zero MCP servers (no --mcp-config is passed). + // Gated on the same call-time check as the env scrub so EVALS_HERMETIC=0 + // restores operator MCP along with the operator env. + if (isHermeticEnabled()) args.push('--strict-mcp-config'); // Write prompt to a temp file OUTSIDE workingDirectory to avoid race conditions // where afterAll cleanup deletes the dir before cat reads the file (especially @@ -176,11 +181,14 @@ export async function runSkillTest(options: { const proc = Bun.spawn(['sh', '-c', `cat "${promptFile}" | claude ${args.map(a => `"${a}"`).join(' ')}`], { cwd: workingDirectory, + // Hermetic by default (see test/helpers/hermetic-env.ts): operator + // session context (CONDUCTOR_*, CLAUDECODE, ~/.claude config, ~/.gstack) + // never reaches the child; EVALS_HERMETIC=0 restores the legacy env. // Default GSTACK_HEADLESS=1 so eval/E2E runs classify as headless (BLOCK on an // AskUserQuestion failure rather than emit a prose question no human reads). A // suite exercising the INTERACTIVE prose-fallback path opts out by passing // `env: { GSTACK_HEADLESS: '' }` — extraEnv wins because it spreads last. 
- env: { ...process.env, GSTACK_HEADLESS: '1', ...extraEnv }, + env: hermeticChildEnv({ GSTACK_HEADLESS: '1', ...extraEnv }), stdout: 'pipe', stderr: 'pipe', }); diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 3902b968e..faf1dea63 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -36,6 +36,11 @@ export const E2E_TOUCHFILES: Record = { 'browse-basic': ['browse/src/**', 'browse/test/test-server.ts'], 'browse-snapshot': ['browse/src/**', 'browse/test/test-server.ts'], + // Hermetic isolation canaries (hermetic-env.ts is also a GLOBAL touchfile; + // these entries exist so the canaries themselves stay tier-classified) + 'hermetic-canary': ['test/helpers/hermetic-env.ts', 'test/helpers/session-runner.ts', 'test/skill-e2e-hermetic-canary.test.ts', 'lib/conductor-env-shim.ts'], + 'hermetic-sentinel': ['test/helpers/hermetic-env.ts', 'test/helpers/session-runner.ts', 'test/skill-e2e-hermetic-canary.test.ts', 'lib/conductor-env-shim.ts'], + // SKILL.md setup + preamble (depend on ROOT SKILL.md + gen-skill-docs) 'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], 'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], @@ -437,6 +442,11 @@ export const E2E_TIERS: Record = { 'browse-basic': 'gate', 'browse-snapshot': 'gate', + // Hermetic isolation — gate (deterministic env/config assertions; if the + // clean room breaks, every other eval's signal is contaminated) + 'hermetic-canary': 'gate', + 'hermetic-sentinel': 'gate', + // SKILL.md setup — gate (if setup breaks, no skill works) 'skillmd-setup-discovery': 'gate', 'skillmd-no-local-binary': 'gate', @@ -782,6 +792,7 @@ export const LLM_JUDGE_TOUCHFILES: Record = { */ export const GLOBAL_TOUCHFILES = [ 'test/helpers/session-runner.ts', // All E2E tests use this runner + 'test/helpers/hermetic-env.ts', // Changes every E2E child's environment 'test/helpers/eval-store.ts', // All E2E tests store results here 
'test/helpers/touchfiles.ts', // Self-referential — reclassifying wrong is dangerous ]; diff --git a/test/skill-e2e-hermetic-canary.test.ts b/test/skill-e2e-hermetic-canary.test.ts new file mode 100644 index 000000000..1356b7dce --- /dev/null +++ b/test/skill-e2e-hermetic-canary.test.ts @@ -0,0 +1,190 @@ +/** + * Hermetic-isolation canaries (gate tier, ~$0.02 each, deterministic). + * + * Two tests that make the hermeticity claim FALSIFIABLE instead of asserted: + * + * 1. `hermetic-canary` — env + auth isolation. Plants contamination vars in + * the TEST process env, spawns a child through the real runner, and + * asserts from the Bash tool_result in the stream-json transcript (never + * the model's prose — prose can hallucinate) that the child saw a temp + * `gstack-hermetic-*/.claude` config dir, a temp GSTACK_HOME, and none of the planted + * contamination. Auth hermeticity: hard-fails when ANTHROPIC_API_KEY is + * absent (a skip here would be a silent hole), and asserts + * total_cost_usd > 0 — subscription/keychain OAuth reports cost 0, so + * nonzero cost is the discriminator that the API key actually paid + * (verified empirically 2026-06-12; the result record exposes no + * auth-source field, so cost is the best available signal — residual + * gap documented in the plan). + * + * 2. `hermetic-sentinel` — config isolation, the poisoned-operator probe. + * Builds a FAKE operator config tree (user CLAUDE.md + an mcpServers + * entry) and points the test process's CLAUDE_CONFIG_DIR at it. If the + * hermetic redirect ever breaks, the child loads that poisoned tree and + * the probes fire: init.mcp_servers would list the planted server + * (semantic proof that --strict-mcp-config + the redirect yield ZERO MCP + * servers, not an assumption), and the child's config dir would contain + * the poisoned CLAUDE.md. 
+ *
+ * Both canaries double as the seed-schema / CLI version-skew tripwire: a
+ * claude release that changes first-run behavior or config discovery fails
+ * here first, loudly, in the gate tier.
+ */
+
+import { expect, afterAll } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { runSkillTest } from './helpers/session-runner';
+import {
+  describeIfSelected, testIfSelected, createEvalCollector, finalizeEvalCollector,
+  recordE2E, runId, logCost,
+} from './helpers/e2e-helpers';
+
+const evalCollector = createEvalCollector('e2e-hermetic');
+
+// Cheap + deterministic: the canaries assert environment facts, not model
+// quality, so the smallest model is the right tool.
+const CANARY_MODEL = 'claude-haiku-4-5-20251001';
+
+/** Newline-join the text of every tool_result item carried by `user` events in the stream-json transcript (string content, or the `text` parts of array content). */
+function toolResultText(transcript: any[]): string {
+  const chunks: string[] = [];
+  for (const event of transcript) {
+    if (event.type !== 'user') continue;
+    for (const item of event.message?.content ?? []) {
+      if (item.type !== 'tool_result') continue;
+      if (typeof item.content === 'string') chunks.push(item.content);
+      else for (const c of item.content ?? []) if (c.type === 'text') chunks.push(c.text);
+    }
+  }
+  return chunks.join('\n');
+}
+
+function initEvent(transcript: any[]): any { // first event with type 'system' + subtype 'init'; undefined when absent
+  return transcript.find((e) => e.type === 'system' && e.subtype === 'init');
+}
+
+describeIfSelected('hermetic isolation canaries', ['hermetic-canary', 'hermetic-sentinel'], () => {
+  testIfSelected('hermetic-canary', async () => {
+    // Auth hermeticity is part of the contract: a missing key must FAIL the
+    // gate, not skip it — a skipped canary is a silent hole.
+    if (!process.env.ANTHROPIC_API_KEY) {
+      throw new Error('hermetic-canary requires ANTHROPIC_API_KEY (source ~/.zshrc); refusing to skip');
+    }
+
+    const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-canary-'));
+    // Plant contamination deterministically — the operator env may or may not
+    // carry these, so set them ourselves and restore after.
+    const planted: Record<string, string> = {
+      CONDUCTOR_WORKSPACE_PATH: '/tmp/poison-conductor-ws',
+      GBRAIN_POISON_PROBE: 'leaked',
+    };
+    const prev: Record<string, string | undefined> = {}; // undefined = the var was unset before planting
+    for (const [k, v] of Object.entries(planted)) { prev[k] = process.env[k]; process.env[k] = v; }
+
+    try {
+      const result = await runSkillTest({
+        prompt: 'Run exactly this bash command and then stop: ' +
+          'echo "CFG=$CLAUDE_CONFIG_DIR"; echo "GH=$GSTACK_HOME"; ' +
+          'echo "CW=$CONDUCTOR_WORKSPACE_PATH"; echo "GP=$GBRAIN_POISON_PROBE"',
+        workingDirectory: workDir,
+        maxTurns: 3,
+        allowedTools: ['Bash'],
+        timeout: 120_000,
+        testName: 'hermetic-canary',
+        runId,
+        model: CANARY_MODEL,
+      });
+      logCost('hermetic-canary', result);
+      recordE2E(evalCollector, 'hermetic-canary', 'e2e-hermetic', result);
+
+      expect(result.exitReason).toBe('success');
+
+      // Deterministic: assert the Bash tool OUTPUT, not the model's prose.
+      const bashOut = toolResultText(result.transcript);
+      const cfg = bashOut.match(/CFG=(\S*)/)?.[1] ?? '';
+      expect(cfg).toMatch(/gstack-hermetic-.*\/\.claude$/);
+      expect(bashOut).toMatch(/GH=\S*gstack-home/);
+      // Planted contamination must not reach the child. CLAUDECODE is NOT
+      // probed here: the child claude CLI sets CLAUDECODE=1 for its own tool
+      // subprocesses (verified empirically — CI behaves identically), so the
+      // Bash tool can't observe our scrub of it; the unit test pins that.
+      expect(bashOut).toMatch(/(^|\n)CW=\s*($|\n)/); // planted Conductor var scrubbed
+      expect(bashOut).toMatch(/(^|\n)GP=\s*($|\n)/); // GBRAIN_* scrubbed
+
+      // Zero MCP servers — semantic, from the init event, not a flag grep.
+      const init = initEvent(result.transcript);
+      expect(init).toBeTruthy();
+      expect(init.mcp_servers ?? []).toHaveLength(0);
+
+      // Auth: nonzero cost = the API key paid (OAuth/keychain reports 0).
+      expect(result.transcript.find((e) => e.type === 'result')?.total_cost_usd).toBeGreaterThan(0);
+    } finally {
+      for (const [k, v] of Object.entries(prev)) {
+        if (v === undefined) delete process.env[k]; else process.env[k] = v;
+      }
+      fs.rmSync(workDir, { recursive: true, force: true });
+    }
+  }, 180_000);
+
+  testIfSelected('hermetic-sentinel', async () => {
+    if (!process.env.ANTHROPIC_API_KEY) {
+      throw new Error('hermetic-sentinel requires ANTHROPIC_API_KEY (source ~/.zshrc); refusing to skip');
+    }
+
+    const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-sentinel-'));
+    // Poisoned operator config tree: if the hermetic redirect breaks, the
+    // child discovers this dir and both probes below fire.
+    const poisonRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-poison-'));
+    const poisonCfg = path.join(poisonRoot, '.claude');
+    fs.mkdirSync(poisonCfg, { recursive: true });
+    fs.writeFileSync(path.join(poisonCfg, 'CLAUDE.md'), 'POISONED OPERATOR MEMORY — must never load\n');
+    fs.writeFileSync(path.join(poisonCfg, '.claude.json'), JSON.stringify({
+      hasCompletedOnboarding: true,
+      mcpServers: { 'sentinel-mcp': { command: '/usr/bin/true', args: [] } },
+    }));
+    const prevCfgDir = process.env.CLAUDE_CONFIG_DIR;
+    process.env.CLAUDE_CONFIG_DIR = poisonCfg;
+
+    try {
+      const result = await runSkillTest({
+        prompt: 'Run exactly this bash command and then stop: ' +
+          'echo "CFG=$CLAUDE_CONFIG_DIR"; ' +
+          'if [ -f "$CLAUDE_CONFIG_DIR/CLAUDE.md" ]; then echo "USER_MD=present"; else echo "USER_MD=absent"; fi',
+        workingDirectory: workDir,
+        maxTurns: 3,
+        allowedTools: ['Bash'],
+        timeout: 120_000,
+        testName: 'hermetic-sentinel',
+        runId,
+        model: CANARY_MODEL,
+      });
+      logCost('hermetic-sentinel', result);
+      recordE2E(evalCollector, 'hermetic-sentinel', 'e2e-hermetic', result);
+
+      expect(result.exitReason).toBe('success');
+
+      const bashOut = toolResultText(result.transcript);
+      const cfg = bashOut.match(/CFG=(\S*)/)?.[1] ?? '';
+      // The redirect must beat the poisoned operator value...
+      expect(cfg).not.toBe(poisonCfg);
+      expect(cfg).toMatch(/gstack-hermetic-.*\/\.claude$/);
+      // ...and the active config dir must not carry the poisoned user memory.
+      expect(bashOut).toContain('USER_MD=absent');
+
+      // The planted MCP server must be invisible: zero servers in init.
+      const init = initEvent(result.transcript);
+      expect(init).toBeTruthy();
+      const servers = (init.mcp_servers ?? []).map((s: any) => s?.name ?? s);
+      expect(servers).toHaveLength(0);
+      expect(JSON.stringify(servers)).not.toContain('sentinel-mcp');
+    } finally {
+      if (prevCfgDir === undefined) delete process.env.CLAUDE_CONFIG_DIR;
+      else process.env.CLAUDE_CONFIG_DIR = prevCfgDir;
+      fs.rmSync(workDir, { recursive: true, force: true });
+      fs.rmSync(poisonRoot, { recursive: true, force: true });
+    }
+  }, 180_000);
+});
+
+afterAll(() => finalizeEvalCollector(evalCollector));