mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-24 02:29:59 +02:00
Merge remote-tracking branch 'origin/main' into garrytan/trunk-land-skill
# Conflicts: # CHANGELOG.md # VERSION # package.json
This commit is contained in:
@@ -36,6 +36,7 @@ import {
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { resolveClaudeBinary as resolveClaudeBinaryShared } from '../../browse/src/claude-bin';
|
||||
import { hermeticChildEnv } from './hermetic-env';
|
||||
import type { SkillTestResult } from './session-runner';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -300,12 +301,17 @@ export async function runAgentSdkTest(
|
||||
const queryImpl: QueryProvider = opts.queryProvider ?? query;
|
||||
const model = opts.model ?? 'claude-opus-4-7';
|
||||
|
||||
// NOTE on GSTACK_HEADLESS: the SDK child inherits process.env, so headless
|
||||
// classification for eval/E2E runs is set by the `test:gate` / `test:evals`
|
||||
// package.json scripts (scoped to that invocation), NOT mutated here. We must not
|
||||
// pass sdkOpts.env (it breaks the SDK auth pipeline — see CLAUDE.md) and must not
|
||||
// mutate process.env ambiently (it would leak headless into later interactive-path
|
||||
// tests in the same Bun process — Codex review finding).
|
||||
// NOTE on env: the SDK child gets the COMPLETE hermetic env (allowlist
|
||||
// scrub + ANTHROPIC_API_KEY + hermetic CLAUDE_CONFIG_DIR/GSTACK_HOME), with
|
||||
// per-test opts.env merging last. The historical "passing env: breaks SDK
|
||||
// auth" failure (old CLAUDE.md warning) was partial-env replacement —
|
||||
// Options.env REPLACES the child's entire environment, so an object without
|
||||
// the key killed auth. A complete env is safe (validated 2026-06-12 via
|
||||
// query() with hermeticChildEnv(): success, real cost, Bash tool working).
|
||||
// Do not mutate process.env ambiently here (it would leak into later
|
||||
// interactive-path tests in the same Bun process — Codex review finding);
|
||||
// ambient ANTHROPIC_API_KEY mutation by tests still works because the
|
||||
// builder reads process.env at call time.
|
||||
|
||||
let attempt = 0;
|
||||
let lastErr: unknown = null;
|
||||
@@ -356,7 +362,7 @@ export async function runAgentSdkTest(
|
||||
permissionMode: resolvedPermissionMode,
|
||||
allowDangerouslySkipPermissions: resolvedPermissionMode === 'bypassPermissions',
|
||||
settingSources: opts.settingSources ?? [],
|
||||
env: opts.env,
|
||||
env: hermeticChildEnv(opts.env),
|
||||
pathToClaudeCodeExecutable: opts.pathToClaudeCodeExecutable,
|
||||
...(hasCanUseTool ? { canUseTool: opts.canUseTool } : {}),
|
||||
};
|
||||
|
||||
@@ -145,6 +145,9 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
maxSkeletonBytes: 90_000,
|
||||
minUnionBytes: 80_000,
|
||||
mustContain: ['SCOPE EXPANSION', 'SELECTIVE EXPANSION', 'HOLD SCOPE', 'SCOPE REDUCTION'],
|
||||
// Default-on Codex outside-voice (codexPreflight block + CODEX_MODE branch
|
||||
// prose replacing the smaller opt-in question) lands this ~5.2% over baseline.
|
||||
maxSizeRatio: 1.08,
|
||||
},
|
||||
'plan-eng-review': {
|
||||
skill: 'plan-eng-review',
|
||||
@@ -162,9 +165,11 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
minUnionBytes: 70_000,
|
||||
mustContain: ['Architecture', 'Code Quality', 'Test', 'Performance'],
|
||||
// Cross-cutting preamble growth (v1.57.2.0 AUQ-failure prose fallback + the
|
||||
// decision-memory nudge + the v1.57.4.0 Boil-the-Ocean rename) lands this just
|
||||
// over the strict 1.05; small headroom for the shared preamble additions.
|
||||
maxSizeRatio: 1.06,
|
||||
// decision-memory nudge + the v1.57.4.0 Boil-the-Ocean rename) plus the
|
||||
// default-on Codex outside-voice (codexPreflight block + CODEX_MODE branch
|
||||
// prose, replacing the smaller opt-in question) land this at ~6.6% over the
|
||||
// v1.53.0.0 baseline. Headroom for those intentional additions.
|
||||
maxSizeRatio: 1.08,
|
||||
},
|
||||
'plan-design-review': {
|
||||
skill: 'plan-design-review',
|
||||
@@ -178,7 +183,9 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
gateAfterStop: 'EXIT PLAN MODE GATE',
|
||||
},
|
||||
behavioral: 'plan',
|
||||
maxSkeletonBytes: 82_000,
|
||||
// +Conductor AUQ-default-prose rule + one-way/continuation safety in the
|
||||
// always-loaded AskUserQuestion Format section.
|
||||
maxSkeletonBytes: 84_000,
|
||||
minUnionBytes: 70_000,
|
||||
mustContain: ['design', 'visual'],
|
||||
},
|
||||
@@ -194,9 +201,14 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
gateAfterStop: 'EXIT PLAN MODE GATE',
|
||||
},
|
||||
behavioral: 'plan',
|
||||
maxSkeletonBytes: 76_000,
|
||||
// +Conductor AUQ-default-prose rule + one-way/destructive prose safety +
|
||||
// continuation protocol in the always-loaded AskUserQuestion Format section.
|
||||
maxSkeletonBytes: 78_000,
|
||||
minUnionBytes: 70_000,
|
||||
mustContain: ['developer experience', 'Getting Started'],
|
||||
// Default-on Codex outside-voice (codexPreflight block + CODEX_MODE branch
|
||||
// prose replacing the smaller opt-in question) lands this ~5.7% over baseline.
|
||||
maxSizeRatio: 1.08,
|
||||
},
|
||||
'office-hours': {
|
||||
skill: 'office-hours',
|
||||
@@ -229,14 +241,20 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
gateAfterStop: undefined,
|
||||
},
|
||||
behavioral: 'prompt',
|
||||
maxSkeletonBytes: 50_000,
|
||||
// +Conductor AUQ-default-prose rule + one-way/continuation safety in the
|
||||
// always-loaded AskUserQuestion Format section.
|
||||
maxSkeletonBytes: 53_000,
|
||||
minUnionBytes: 55_000,
|
||||
mustContain: ['CHANGELOG', 'Diataxis', 'coverage'],
|
||||
// The AUQ-failure prose fallback (v1.57.2.0) adds ~2KB to every skill's
|
||||
// always-loaded preamble; on this small carved skeleton that lands at ~5.9%
|
||||
// over the pre-carve/pre-AUQ v1.53.0.0 baseline. Headroom for the
|
||||
// cross-cutting addition; all other skills keep the strict 1.05 ceiling.
|
||||
maxSizeRatio: 1.08,
|
||||
// Two intentional additions stack on this small skill: the AUQ-failure prose
|
||||
// fallback (v1.57.2.0, ~2KB to every preamble) AND the new default-on Codex
|
||||
// documentation-review section (codexPreflight + prompt + apply-gate, carved
|
||||
// into release-body so the SKELETON stays under maxSkeletonBytes). On a ~55KB
|
||||
// baseline that whole new capability is ~18.6% of union bytes. The doc review
|
||||
// is a deliberate new feature, not preamble creep; the union ceiling is raised
|
||||
// to match while the skeleton budget (53_000) still holds the always-loaded
|
||||
// cost flat.
|
||||
maxSizeRatio: 1.20,
|
||||
},
|
||||
'design-consultation': {
|
||||
skill: 'design-consultation',
|
||||
@@ -250,7 +268,9 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
gateAfterStop: undefined,
|
||||
},
|
||||
behavioral: 'prompt',
|
||||
maxSkeletonBytes: 64_000,
|
||||
// +Conductor AUQ-default-prose rule + one-way/continuation safety in the
|
||||
// always-loaded AskUserQuestion Format section.
|
||||
maxSkeletonBytes: 67_000,
|
||||
minUnionBytes: 72_000,
|
||||
mustContain: ['Typography', 'Color', 'Aesthetic Direction'],
|
||||
// Cross-cutting preamble growth (v1.57.2.0 AUQ-failure prose fallback ~2KB +
|
||||
@@ -286,7 +306,9 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
gateAfterStop: undefined,
|
||||
},
|
||||
behavioral: 'prompt',
|
||||
maxSkeletonBytes: 70_000,
|
||||
// +Conductor AUQ-default-prose rule + one-way/continuation safety in the
|
||||
// always-loaded AskUserQuestion Format section.
|
||||
maxSkeletonBytes: 73_000,
|
||||
minUnionBytes: 72_000,
|
||||
mustContain: ['OWASP', 'STRIDE', 'daily', 'comprehensive', 'verif'],
|
||||
// cso keeps its mode-dispatch + FP-filtering phases always-loaded, so the
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
import * as fs from 'fs';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
import { hermeticChildEnv, isHermeticEnabled } from './hermetic-env';
|
||||
|
||||
/** Strip ANSI escapes for pattern-matching against visible text. */
|
||||
export function stripAnsi(s: string): string {
|
||||
@@ -120,6 +121,13 @@ export interface ClaudePtySession {
|
||||
exited(): boolean;
|
||||
/** Exit code, if known. */
|
||||
exitCode(): number | null;
|
||||
/**
|
||||
* The hermetic CLAUDE_CONFIG_DIR this session's claude was pointed at, or
|
||||
* null when EVALS_HERMETIC=0. Forensics: hermetic plan files live under
|
||||
* `<hermeticConfigDir>/plans/` (extractPlanFilePath still matches them —
|
||||
* the dir name ends in `/.claude` by contract).
|
||||
*/
|
||||
hermeticConfigDir: string | null;
|
||||
/**
|
||||
* Send SIGINT, then SIGKILL after 1s. Always safe to call multiple times.
|
||||
* Awaits process exit before resolving.
|
||||
@@ -1143,8 +1151,17 @@ export async function launchClaudePty(
|
||||
if (permissionMode !== null) {
|
||||
args.push('--permission-mode', permissionMode);
|
||||
}
|
||||
// Hermetic children get zero MCP servers; gated on the same call-time
|
||||
// check as the env scrub so EVALS_HERMETIC=0 restores operator MCP too.
|
||||
// Before opts.extraArgs so a test could theoretically supply --mcp-config.
|
||||
const hermetic = isHermeticEnabled();
|
||||
if (hermetic) args.push('--strict-mcp-config');
|
||||
if (opts.extraArgs) args.push(...opts.extraArgs);
|
||||
|
||||
// Hermetic by default (test/helpers/hermetic-env.ts): operator session
|
||||
// context never reaches the child; per-test opts.env merges last.
|
||||
const childEnv = hermeticChildEnv(opts.env);
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const proc = (Bun as any).spawn([claudePath, ...args], {
|
||||
terminal: {
|
||||
@@ -1155,7 +1172,7 @@ export async function launchClaudePty(
|
||||
},
|
||||
},
|
||||
cwd,
|
||||
env: { ...process.env, ...(opts.env ?? {}) },
|
||||
env: childEnv,
|
||||
});
|
||||
|
||||
// Track exit so waitForAny can fail fast if claude crashes.
|
||||
@@ -1307,6 +1324,7 @@ export async function launchClaudePty(
|
||||
pid: () => proc.pid as number | undefined,
|
||||
exited: () => exited,
|
||||
exitCode: () => exitCodeCaptured,
|
||||
hermeticConfigDir: hermetic ? childEnv.CLAUDE_CONFIG_DIR ?? null : null,
|
||||
close,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import { hermeticChildEnv } from './hermetic-env';
|
||||
|
||||
// --- Interfaces ---
|
||||
|
||||
@@ -201,15 +202,18 @@ export async function runCodexSkill(opts: {
|
||||
// Build codex exec command
|
||||
const args = ['exec', prompt, '--json', '-s', sandbox];
|
||||
|
||||
// Spawn codex with temp HOME so it discovers our installed skill
|
||||
// Spawn codex with temp HOME so it discovers our installed skill.
|
||||
// Hermetic scrub (test/helpers/hermetic-env.ts) with codex's auth surface
|
||||
// re-admitted: codex auths from $HOME/.codex (copied into tempHome above)
|
||||
// plus OPENAI_API_KEY/CODEX_* when present. HOME override merges last.
|
||||
const proc = Bun.spawn(['codex', ...args], {
|
||||
cwd: cwd || skillDir,
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
env: {
|
||||
...process.env,
|
||||
HOME: tempHome,
|
||||
},
|
||||
env: hermeticChildEnv(
|
||||
{ HOME: tempHome },
|
||||
{ extraAllow: ['OPENAI_API_KEY', 'CODEX_*'] },
|
||||
),
|
||||
});
|
||||
|
||||
// Race against timeout
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
*/
|
||||
|
||||
import * as path from 'path';
|
||||
import { hermeticChildEnv } from './hermetic-env';
|
||||
|
||||
// --- Interfaces ---
|
||||
|
||||
@@ -122,11 +123,16 @@ export async function runGeminiSkill(opts: {
|
||||
// Build gemini command
|
||||
const args = ['-p', prompt, '--output-format', 'stream-json', '--yolo'];
|
||||
|
||||
// Spawn gemini — uses real HOME for auth, cwd for skill discovery
|
||||
// Spawn gemini — uses real HOME for auth (~/.gemini; HOME is allowlisted),
|
||||
// cwd for skill discovery. Hermetic scrub with gemini's auth surface
|
||||
// re-admitted (previously this spawn inherited the full operator env).
|
||||
const proc = Bun.spawn(['gemini', ...args], {
|
||||
cwd: cwd || process.cwd(),
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
env: hermeticChildEnv(undefined, {
|
||||
extraAllow: ['GEMINI_API_KEY', 'GOOGLE_API_KEY', 'GOOGLE_APPLICATION_CREDENTIALS', 'GOOGLE_CLOUD_*', 'GEMINI_*'],
|
||||
}),
|
||||
});
|
||||
|
||||
// Race against timeout
|
||||
|
||||
@@ -0,0 +1,269 @@
|
||||
/**
|
||||
* Unit tests for the hermetic child-env builder. Free tier — no API calls.
|
||||
*
|
||||
* Pins three contracts:
|
||||
* 1. Allowlist semantics: contamination vars dropped, basics/auth/network
|
||||
* kept, overrides merge last, EVALS_HERMETIC=0 is byte-identical legacy.
|
||||
* 2. Seed-config shape: 20-char key suffix, trusted dirs, undefined-key safe.
|
||||
* 3. Dir lifecycle: /.claude suffix (extractPlanFilePath contract —
|
||||
* claude-pty-runner.ts:191), sync singleton reuse, pid-aware GC.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, afterAll } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import {
|
||||
buildHermeticEnv,
|
||||
buildSeedConfig,
|
||||
isHermeticEnabled,
|
||||
getHermeticDirs,
|
||||
gcStaleHermeticDirs,
|
||||
hermeticChildEnv,
|
||||
} from './hermetic-env';
|
||||
|
||||
/**
 * Fixture: an operator shell environment mixing variables the hermetic scrub
 * keeps (basics, network, named auth, EVALS_/GITHUB_ knobs) with variables it
 * is expected to drop (session context and operator credentials).
 */
const CONTAMINATED: NodeJS.ProcessEnv = {
  // Process basics
  PATH: '/usr/bin',
  HOME: '/Users/op',
  TMPDIR: '/tmp',
  TERM: 'xterm',
  // Anthropic: key + base URL are named auth vars; the model is a behavior knob
  ANTHROPIC_API_KEY: 'sk-ant-0123456789abcdefghijklmn',
  ANTHROPIC_BASE_URL: 'https://proxy.example/api',
  ANTHROPIC_MODEL: 'sneaky-model-override',
  // Eval-harness knob, CI metadata, network reachability
  EVALS_MODEL: 'claude-sonnet-4-6',
  GITHUB_ACTIONS: 'true',
  HTTPS_PROXY: 'http://corp:3128',
  NODE_EXTRA_CA_CERTS: '/etc/corp.pem',
  // Session-context contamination
  CONDUCTOR_WORKSPACE_PATH: '/Users/op/conductor/ws',
  CONDUCTOR_SESSION: '1',
  CLAUDECODE: '1',
  CLAUDE_CODE_ENTRYPOINT: 'cli',
  CLAUDE_CONFIG_DIR: '/Users/op/.claude',
  GSTACK_HOME: '/Users/op/.gstack',
  GSTACK_HEADLESS_DEFAULT: 'x',
  MCP_TIMEOUT: '5000',
  GBRAIN_ENDPOINT: 'http://localhost:1234',
  // Operator credentials
  OPENAI_API_KEY: 'sk-openai-secret',
  VOYAGE_API_KEY: 'vg-secret',
  GH_TOKEN: 'gho_secret',
  SSH_AUTH_SOCK: '/tmp/ssh.sock',
  GIT_AUTHOR_NAME: 'Op',
};

/** Fixture: the redirected config/home locations passed as `hermeticVars`. */
const HERMETIC_VARS = {
  CLAUDE_CONFIG_DIR: '/x/.claude',
  GSTACK_HOME: '/x/gstack-home',
};
|
||||
|
||||
describe('buildHermeticEnv allowlist', () => {
  // One scrub of the contaminated fixture, shared by the read-only checks.
  const env = buildHermeticEnv(CONTAMINATED, HERMETIC_VARS);

  test('keeps process basics, network, CI, and eval knobs', () => {
    const kept: Record<string, string> = {
      PATH: '/usr/bin',
      HOME: '/Users/op',
      EVALS_MODEL: 'claude-sonnet-4-6',
      GITHUB_ACTIONS: 'true',
      HTTPS_PROXY: 'http://corp:3128',
      NODE_EXTRA_CA_CERTS: '/etc/corp.pem',
    };
    for (const [name, value] of Object.entries(kept)) {
      expect(env[name]).toBe(value);
    }
  });

  test('keeps named auth vars but not the broad ANTHROPIC_ prefix', () => {
    const { ANTHROPIC_API_KEY, ANTHROPIC_BASE_URL } = CONTAMINATED;
    expect(env.ANTHROPIC_API_KEY).toBe(ANTHROPIC_API_KEY);
    expect(env.ANTHROPIC_BASE_URL).toBe(ANTHROPIC_BASE_URL);
    // ANTHROPIC_MODEL is a behavior knob, not auth, so it must not survive.
    expect(env.ANTHROPIC_MODEL).toBeUndefined();
  });

  test('drops session-context and operator-credential vars', () => {
    const dropped = [
      'CONDUCTOR_WORKSPACE_PATH',
      'CONDUCTOR_SESSION',
      'CLAUDECODE',
      'CLAUDE_CODE_ENTRYPOINT',
      'GSTACK_HEADLESS_DEFAULT',
      'MCP_TIMEOUT',
      'GBRAIN_ENDPOINT',
      'OPENAI_API_KEY',
      'VOYAGE_API_KEY',
      'GH_TOKEN',
      'SSH_AUTH_SOCK',
      'GIT_AUTHOR_NAME',
    ];
    dropped.forEach((name) => {
      expect(env[name]).toBeUndefined();
    });
  });

  test('redirects CLAUDE_CONFIG_DIR and GSTACK_HOME to hermetic values', () => {
    expect(env.CLAUDE_CONFIG_DIR).toBe('/x/.claude');
    expect(env.GSTACK_HOME).toBe('/x/gstack-home');
  });

  test('overrides merge last — per-test re-contamination is deliberate', () => {
    const overrides = {
      CONDUCTOR_WORKSPACE_PATH: '/tmp/test-ws',
      GSTACK_HOME: '/tmp/test-home',
      GSTACK_HEADLESS: '',
    };
    const e = buildHermeticEnv(CONTAMINATED, HERMETIC_VARS, overrides);
    expect(e.CONDUCTOR_WORKSPACE_PATH).toBe('/tmp/test-ws');
    expect(e.GSTACK_HOME).toBe('/tmp/test-home');
    expect(e.GSTACK_HEADLESS).toBe('');
  });

  test('promotes GSTACK_ANTHROPIC_API_KEY when canonical absent (shared shim fn)', () => {
    const promotedKey = 'sk-ant-promoted-9876543210';
    const base = { ...CONTAMINATED } as NodeJS.ProcessEnv;
    delete base.ANTHROPIC_API_KEY;
    base.GSTACK_ANTHROPIC_API_KEY = promotedKey;

    const e = buildHermeticEnv(base, HERMETIC_VARS);

    expect(e.ANTHROPIC_API_KEY).toBe('sk-ant-promoted-9876543210');
    // The GSTACK_* source variable itself is still dropped by the scrub.
    expect(e.GSTACK_ANTHROPIC_API_KEY).toBeUndefined();
  });

  test('extraAllow re-admits exact names and prefixes per runner', () => {
    const runnerOpts = { extraAllow: ['OPENAI_API_KEY', 'GIT_*'] };
    const e = buildHermeticEnv(CONTAMINATED, HERMETIC_VARS, undefined, runnerOpts);
    expect(e.OPENAI_API_KEY).toBe('sk-openai-secret');
    expect(e.GIT_AUTHOR_NAME).toBe('Op');
    // GH_TOKEN was not listed in extraAllow, so it stays dropped.
    expect(e.GH_TOKEN).toBeUndefined();
  });

  test('TERM falls back when base omits it', () => {
    const base = { ...CONTAMINATED } as NodeJS.ProcessEnv;
    delete base.TERM;
    const scrubbed = buildHermeticEnv(base, HERMETIC_VARS);
    expect(scrubbed.TERM).toBe('xterm-256color');
  });
});
|
||||
|
||||
describe('EVALS_HERMETIC=0 escape hatch', () => {
  test('returns byte-identical legacy env, overrides still last', () => {
    const base = { ...CONTAMINATED, EVALS_HERMETIC: '0' } as NodeJS.ProcessEnv;
    const overrides = { GSTACK_HEADLESS: '1' };

    const e = buildHermeticEnv(base, HERMETIC_VARS, overrides);

    // Legacy spread: every base var survives and hermeticVars are NOT applied.
    expect(e.CONDUCTOR_WORKSPACE_PATH).toBe(CONTAMINATED.CONDUCTOR_WORKSPACE_PATH);
    expect(e.CLAUDE_CONFIG_DIR).toBe('/Users/op/.claude');
    expect(e.GSTACK_HOME).toBe('/Users/op/.gstack');
    expect(e.GSTACK_HEADLESS).toBe('1');
    expect(e).toEqual({ ...(base as Record<string, string>), ...overrides });
  });

  test('isHermeticEnabled reads at call time (ESM-hoist safety)', () => {
    // Remember the ambient value so the process env is restored afterwards.
    const saved = process.env.EVALS_HERMETIC;
    try {
      const pinned: Array<[string, boolean]> = [
        ['0', false],
        ['1', true],
      ];
      for (const [value, expected] of pinned) {
        process.env.EVALS_HERMETIC = value;
        expect(isHermeticEnabled()).toBe(expected);
      }
      // Unset counts as enabled.
      delete process.env.EVALS_HERMETIC;
      expect(isHermeticEnabled()).toBe(true);
    } finally {
      if (saved === undefined) {
        delete process.env.EVALS_HERMETIC;
      } else {
        process.env.EVALS_HERMETIC = saved;
      }
    }
  });
});
|
||||
|
||||
describe('buildSeedConfig', () => {
  test('stores only the 20-char key suffix and trusts the given dirs', () => {
    const seed = buildSeedConfig({
      apiKey: 'sk-ant-0123456789abcdefghijklmn',
      trustedDirs: ['/repo/root'],
    }) as any;

    expect(seed.hasCompletedOnboarding).toBe(true);

    // Exactly one approved entry, and it is a 20-character tail of the key.
    const approved = seed.customApiKeyResponses.approved;
    expect(approved).toHaveLength(1);
    const [suffix] = approved;
    expect(suffix).toHaveLength(20);
    expect('sk-ant-0123456789abcdefghijklmn'.endsWith(suffix)).toBe(true);

    const project = seed.projects['/repo/root'];
    expect(project.hasTrustDialogAccepted).toBe(true);
    expect(project.hasCompletedProjectOnboarding).toBe(true);
  });

  test('apiKey undefined → omits customApiKeyResponses, does not throw', () => {
    const seed = buildSeedConfig({ apiKey: undefined, trustedDirs: [] }) as any;
    expect(seed.customApiKeyResponses).toBeUndefined();
    expect(seed.hasCompletedOnboarding).toBe(true);
  });

  test('no full key material anywhere in the seed', () => {
    const key = 'sk-ant-0123456789abcdefghijklmn';
    const seed = buildSeedConfig({ apiKey: key, trustedDirs: [] });
    const json = JSON.stringify(seed);
    expect(json.includes(key)).toBe(false);
  });
});
|
||||
|
||||
describe('getHermeticDirs lifecycle', () => {
  test('configDir ends in /.claude — extractPlanFilePath contract', () => {
    // claude-pty-runner.ts:191 anchors plan paths on `.claude/plans/` under
    // /var|/tmp prefixes; the directory-name suffix is what lets PTY plan-mode
    // tests keep extracting hermetic plan files without extractor changes.
    const { configDir } = getHermeticDirs();
    const expectedSuffix = `${path.sep}.claude`;
    expect(configDir.endsWith(expectedSuffix)).toBe(true);
    expect(configDir.startsWith(os.tmpdir())).toBe(true);
  });

  test('sync singleton: repeat calls return the same dirs', () => {
    const first = getHermeticDirs();
    const second = getHermeticDirs();
    expect(first).toBe(second);
  });

  test('seeds .claude.json in the config dir', () => {
    const { configDir } = getHermeticDirs();
    const seedPath = path.join(configDir, '.claude.json');
    const seed = JSON.parse(fs.readFileSync(seedPath, 'utf-8'));
    expect(seed.hasCompletedOnboarding).toBe(true);

    // The repo root (two levels above this helpers dir) must be pre-trusted.
    const root = path.resolve(__dirname, '..', '..');
    expect(seed.projects[root].hasTrustDialogAccepted).toBe(true);
  });
});
|
||||
|
||||
describe('gcStaleHermeticDirs', () => {
  test('removes dead-pid dirs, keeps live-pid and foreign dirs', () => {
    const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-gc-test-'));
    // try/finally so the scratch dir is removed even when an assertion throws;
    // otherwise every failing run would leave a directory behind in os.tmpdir().
    try {
      // Find a pid that is definitely dead: spawn-and-reap is overkill; use a
      // huge pid beyond pid_max on macOS/Linux defaults.
      const deadPid = 99999999;
      const dead = path.join(tmp, `gstack-hermetic-${deadPid}-abc`);
      const live = path.join(tmp, `gstack-hermetic-${process.pid}-abc`);
      const foreign = path.join(tmp, 'unrelated-dir');
      const malformed = path.join(tmp, 'gstack-hermetic-notapid-abc');
      for (const d of [dead, live, foreign, malformed]) fs.mkdirSync(d);
      // GC only reclaims dirs older than its 1h age floor (PID-reuse guard);
      // backdate the dead-pid dir's mtime so it qualifies.
      const old = new Date(Date.now() - 2 * 60 * 60 * 1000);
      fs.utimesSync(dead, old, old);

      gcStaleHermeticDirs(tmp);

      expect(fs.existsSync(dead)).toBe(false);
      expect(fs.existsSync(live)).toBe(true);
      expect(fs.existsSync(foreign)).toBe(true);
      expect(fs.existsSync(malformed)).toBe(true); // never guess on malformed names
    } finally {
      fs.rmSync(tmp, { recursive: true, force: true });
    }
  });

  test('keeps a fresh dead-pid dir (PID-reuse grace window)', () => {
    const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-gc-fresh-'));
    // Same cleanup guarantee as above: remove the scratch dir on failure too.
    try {
      // Dead pid but just created — must survive GC, else PID reuse could delete
      // a dir whose original pid exited and got recycled to a live process.
      const freshDead = path.join(tmp, 'gstack-hermetic-99999999-xyz');
      fs.mkdirSync(freshDead);

      gcStaleHermeticDirs(tmp);

      expect(fs.existsSync(freshDead)).toBe(true);
    } finally {
      fs.rmSync(tmp, { recursive: true, force: true });
    }
  });
});
|
||||
|
||||
describe('hermeticChildEnv composition', () => {
  test('hermetic by default: redirects config dirs, drops contamination', () => {
    // process.env in a real test run may carry CONDUCTOR_*/CLAUDECODE; the
    // composed env must scrub those and point at the singleton directories.
    const e = hermeticChildEnv({ GSTACK_HEADLESS: '1' });
    const { configDir, gstackHome } = getHermeticDirs();

    expect(e.CLAUDE_CONFIG_DIR).toBe(configDir);
    expect(e.GSTACK_HOME).toBe(gstackHome);
    expect(e.GSTACK_HEADLESS).toBe('1');
    expect(e.CLAUDECODE).toBeUndefined();
    expect(e.CONDUCTOR_WORKSPACE_PATH).toBeUndefined();
  });

  test('EVALS_HERMETIC=0: legacy passthrough of live process.env', () => {
    // Remember the ambient value so the process env is restored afterwards.
    const saved = process.env.EVALS_HERMETIC;
    try {
      process.env.EVALS_HERMETIC = '0';
      const e = hermeticChildEnv({ EXTRA: 'x' });

      expect(e.PATH).toBe(process.env.PATH as string);
      expect(e.EXTRA).toBe('x');
      // Legacy mode performs no hermetic redirection of the config dir.
      expect(e.CLAUDE_CONFIG_DIR).toBe(process.env.CLAUDE_CONFIG_DIR as any);
    } finally {
      if (saved === undefined) {
        delete process.env.EVALS_HERMETIC;
      } else {
        process.env.EVALS_HERMETIC = saved;
      }
    }
  });
});

afterAll(() => {
  // Intentionally empty: the singleton's own exit hook handles runRoot, so
  // there is nothing else for this file to clean up.
});
|
||||
@@ -0,0 +1,276 @@
|
||||
/**
|
||||
* Hermetic child environment for E2E test runners.
|
||||
*
|
||||
* Local E2E runs spawn `claude` (and codex/gemini/SDK) children that, until
|
||||
* this module, inherited the operator's full session context: ~/.claude
|
||||
* (user CLAUDE.md, .claude.json MCP servers incl. gbrain + Conductor,
|
||||
* skills), ~/.gstack decision logs, and CONDUCTOR_-/CLAUDECODE-style env vars.
|
||||
* CI was hermetic only by accident (fresh Docker /home/runner). This module
|
||||
* makes local children see a CI-equivalent clean room by default.
|
||||
*
|
||||
* operator shell (contaminated) hermetic child env
|
||||
* ┌─────────────────────────────┐ buildHermeticEnv()
|
||||
* │ PATH, HOME, TMPDIR, ... │── allowlist ─────────► kept
|
||||
* │ HTTP(S)_PROXY, SSL_CERT_* │── allowlist ─────────► kept (network)
|
||||
* │ ANTHROPIC_API_KEY/BASE_URL/ │── named list ────────► kept (auth)
|
||||
* │ AUTH_TOKEN │
|
||||
* │ GSTACK_ANTHROPIC_API_KEY │── promotedEnv() ─────► ANTHROPIC_API_KEY
|
||||
* │ CONDUCTOR_*, CLAUDECODE, │
|
||||
* │ CLAUDE_*, GSTACK_*, MCP_*, │── dropped ───────────► ∅
|
||||
* │ GBRAIN_*, GH_TOKEN, ... │
|
||||
* └─────────────────────────────┘
|
||||
* + per-runner extraAllow (codex: OpenAI vars; gemini: Google vars)
|
||||
* + CLAUDE_CONFIG_DIR=<runRoot>/.claude GSTACK_HOME=<runRoot>/gstack-home
|
||||
* + per-test overrides spread LAST
|
||||
*
|
||||
* Escape hatch: EVALS_HERMETIC=0 restores the legacy contaminated env
|
||||
* byte-identically (runners must also gate --strict-mcp-config on
|
||||
* isHermeticEnabled() so the escape hatch restores args too).
|
||||
*
|
||||
* isHermeticEnabled() is evaluated at CALL time, never at module load —
|
||||
* ESM hoists imports above any in-file `process.env.EVALS_HERMETIC = '0'`
|
||||
* assignment, so a module-load-time read would silently ignore test pins.
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import { promotedEnv } from '../../lib/conductor-env-shim';
|
||||
import { isProcessAlive } from '../../browse/src/error-handling';
|
||||
|
||||
/**
 * Environment variable names a hermetic child keeps verbatim. Any name that
 * is neither listed here nor matched by a prefix rule is dropped.
 */
const ALLOW_EXACT = new Set<string>([
  // Process basics
  'PATH',
  'HOME',
  'TMPDIR',
  'TERM',
  'COLORTERM',
  'LANG',
  'LC_ALL',
  'SHELL',
  'USER',
  'LOGNAME',
  'TZ',
  'NODE_ENV',
  'CI',
  // Browser/runtime caches the child legitimately shares with the operator
  'PLAYWRIGHT_BROWSERS_PATH',
  // Network reachability: without these, children on proxied networks cannot
  // reach the Anthropic API at all
  'HTTP_PROXY',
  'HTTPS_PROXY',
  'NO_PROXY',
  'http_proxy',
  'https_proxy',
  'no_proxy',
  'SSL_CERT_FILE',
  'SSL_CERT_DIR',
  'NODE_EXTRA_CA_CERTS',
  // Auth: named individually, NOT the broad ANTHROPIC_* prefix, because a
  // prefix rule would smuggle model/beta/debug knobs that change eval behavior
  'ANTHROPIC_API_KEY', // the auth credential evals require
  'ANTHROPIC_BASE_URL', // API endpoint override (corp proxies)
  'ANTHROPIC_AUTH_TOKEN', // bearer-token auth variant
]);
|
||||
|
||||
/**
 * Name prefixes a hermetic child keeps: eval-harness knobs and CI metadata.
 *
 * Deliberately absent:
 *  - CONDUCTOR_* / CLAUDE_* (incl. CLAUDECODE, CLAUDE_CODE_ENTRYPOINT) /
 *    GSTACK_* / MCP_* / GBRAIN_*, which are session-context contamination.
 *  - Operator credentials (GH_TOKEN, SSH_AUTH_SOCK, GIT_*, OPENAI_API_KEY,
 *    VOYAGE_API_KEY). CI doesn't have them and eval children have no business
 *    using them.
 *
 * A test that legitimately needs one opts in via its own env override; a
 * provider runner (codex/gemini) re-admits its auth vars via opts.extraAllow.
 */
const ALLOW_PREFIXES: string[] = ['EVALS_', 'GITHUB_'];
|
||||
|
||||
/** Per-runner options for the hermetic environment builder. */
export interface HermeticEnvOpts {
  /**
   * Additional names to admit on top of the built-in allowlist. An entry
   * ending in '*' is a prefix rule; any other entry is an exact name.
   * Example: the codex runner passes ['OPENAI_API_KEY', 'CODEX_*'].
   */
  extraAllow?: Array<string>;
}
|
||||
|
||||
/**
 * Reports whether hermetic mode is on for the given environment.
 *
 * Hermetic mode is on unless EVALS_HERMETIC is exactly the string '0'; an
 * unset variable or any other value counts as enabled. The default argument
 * means process.env is read when the function is called, not at module load.
 *
 * @param env Environment to inspect; defaults to the live process.env.
 * @returns false only when env.EVALS_HERMETIC === '0'.
 */
export function isHermeticEnabled(env: NodeJS.ProcessEnv = process.env): boolean {
  const flag = env.EVALS_HERMETIC;
  return !(flag === '0');
}
|
||||
|
||||
/**
 * Pure allowlist scrub of an environment; performs no I/O.
 *
 * When hermetic mode is disabled for `base`, the result is simply every
 * defined entry of `base` followed by the overrides (the legacy spread), and
 * `hermeticVars` are not applied. Otherwise the result is built in order from:
 * allowlisted entries of the promoted base env, a TERM fallback, the
 * `hermeticVars`, and finally the overrides.
 *
 * Overrides are applied LAST so per-test env (GSTACK_HOME,
 * CONDUCTOR_WORKSPACE_PATH, GSTACK_HEADLESS opt-out) always wins over the
 * scrub; that is the documented re-contamination escape, and the wiring
 * tripwire forbids passing raw process.env through it.
 *
 * @param base Source environment to scrub.
 * @param hermeticVars Variables forced onto the scrubbed env (hermetic mode only).
 * @param overrides Per-test variables merged last; entries whose value is
 *   undefined are skipped (they do not unset a key).
 * @param opts Per-runner options; `extraAllow` entries ending in '*' are
 *   treated as prefixes, all others as exact names.
 * @returns A new object holding only string values.
 */
export function buildHermeticEnv(
  base: NodeJS.ProcessEnv,
  hermeticVars: Record<string, string>,
  overrides?: Record<string, string | undefined>,
  opts?: HermeticEnvOpts,
): Record<string, string> {
  // Shared final step of both modes: copy every defined override onto target.
  const applyOverrides = (target: Record<string, string>): Record<string, string> => {
    for (const [name, value] of Object.entries(overrides ?? {})) {
      if (value !== undefined) target[name] = value;
    }
    return target;
  };

  if (!isHermeticEnabled(base)) {
    // Escape hatch: byte-identical to the legacy spread.
    const legacy: Record<string, string> = {};
    for (const [name, value] of Object.entries(base)) {
      if (value !== undefined) legacy[name] = value;
    }
    return applyOverrides(legacy);
  }

  const promoted = promotedEnv(base);

  // Split the per-runner extras into exact names and prefix rules.
  const exactExtras = new Set<string>();
  const prefixExtras: string[] = [];
  for (const rule of opts?.extraAllow ?? []) {
    if (rule.endsWith('*')) {
      prefixExtras.push(rule.slice(0, -1));
    } else {
      exactExtras.add(rule);
    }
  }

  const prefixRules = [...ALLOW_PREFIXES, ...prefixExtras];
  const isAllowed = (name: string): boolean => {
    if (ALLOW_EXACT.has(name) || exactExtras.has(name)) return true;
    return prefixRules.some((prefix) => name.startsWith(prefix));
  };

  const scrubbed: Record<string, string> = {};
  for (const [name, value] of Object.entries(promoted)) {
    if (value !== undefined && isAllowed(name)) scrubbed[name] = value;
  }

  // A missing or empty TERM gets a fixed fallback value.
  if (!scrubbed.TERM) {
    scrubbed.TERM = 'xterm-256color';
  }

  Object.assign(scrubbed, hermeticVars);
  return applyOverrides(scrubbed);
}
|
||||
|
||||
/** Inputs for {@link buildSeedConfig}. */
export interface SeedConfigOpts {
  /**
   * API key whose suffix is pre-approved in the seed. When undefined (the
   * operator has no key exported) customApiKeyResponses is omitted; the child
   * then fails auth exactly as it would otherwise, and nothing throws here.
   */
  apiKey: string | undefined;
  /** Directories recorded as trusted, onboarded projects. */
  trustedDirs: string[];
}

/**
 * Builds a minimal $CLAUDE_CONFIG_DIR/.claude.json for fresh-config children.
 *
 * Empirically verified 2026-06-12 on claude 2.1.175: PRINT MODE (`claude -p`)
 * with ANTHROPIC_API_KEY needs NO seed at all; a fresh empty config dir ran
 * non-interactively (exit 0, real cost billed to the key). The seed exists
 * for the PTY path, where first-run TUI prompts DO appear:
 *   - hasCompletedOnboarding: suppresses the onboarding flow
 *   - customApiKeyResponses.approved: suppresses the "use this API key?"
 *     prompt; entries are the key's LAST 20 CHARS (shape verified against a
 *     real ~/.claude.json)
 *   - projects[dir].hasTrustDialogAccepted: pre-trusts repo-cwd PTY sessions
 *     (the pty-runner's 15s trust-watcher remains as fallback for temp cwds)
 * bypassPermissionsModeAccepted was considered and dropped: it is absent from
 * a real config even though --dangerously-skip-permissions is in daily use.
 *
 * @param opts API key (optional) and the directories to pre-trust.
 * @returns A plain object ready to be serialized as the seed config.
 */
export function buildSeedConfig(opts: SeedConfigOpts): Record<string, unknown> {
  // One [dir, flags] pair per trusted directory.
  const projectEntries = opts.trustedDirs.map((dir) => {
    const flags = { hasTrustDialogAccepted: true, hasCompletedProjectOnboarding: true };
    return [dir, flags] as const;
  });

  const base: Record<string, unknown> = {
    hasCompletedOnboarding: true,
    projects: Object.fromEntries(projectEntries),
  };

  // No key (undefined or empty): leave customApiKeyResponses out entirely.
  if (!opts.apiKey) {
    return base;
  }

  // Only the trailing 20 characters of the key are written to the seed.
  const approvedSuffix = opts.apiKey.slice(-20);
  return { ...base, customApiKeyResponses: { approved: [approvedSuffix] } };
}
|
||||
|
||||
/** Paths of the per-process hermetic directory tree returned by getHermeticDirs. */
export interface HermeticDirs {
  /**
   * Config dir handed to children as CLAUDE_CONFIG_DIR. Its last path segment
   * must remain `.claude` — load-bearing: extractPlanFilePath in
   * claude-pty-runner.ts:191 anchors plan-file paths on `.claude/plans/`
   * under a /var|/tmp prefix, so renaming the segment breaks PTY plan tests.
   */
  configDir: string;
  /** Dir handed to children as GSTACK_HOME; lives directly under `runRoot`. */
  gstackHome: string;
  /** The mkdtemp-created parent that contains both dirs above. */
  runRoot: string;
}
|
||||
|
||||
// Leading name segment of every hermetic run root in the temp dir. getHermeticDirs
// appends `<pid>-` before mkdtemp's random suffix; gcStaleHermeticDirs matches on it.
const DIR_PREFIX = 'gstack-hermetic-';
|
||||
|
||||
// Memo for getHermeticDirs(): null until a run root has been created and seeded.
let cachedDirs: HermeticDirs | null = null;
|
||||
|
||||
/**
 * Repository root used for the trusted-dir seed. This module sits in
 * `<root>/test/helpers`, so the root is two levels above its own directory.
 */
function repoRoot(): string {
  const testDir = path.resolve(__dirname, '..');
  return path.resolve(testDir, '..');
}
|
||||
|
||||
/**
 * Returns the per-process hermetic directory set, creating and seeding it on
 * the first call.
 *
 * Synchronous, memoized singleton — intentionally NO async gap between the
 * cache check and create+seed, so concurrent first calls under
 * `bun test --concurrent` cannot double-create or observe a half-seeded dir.
 * The dirs are shared by all tests in the process: that matches CI's
 * within-job shared /home/runner (operator isolation, not per-test isolation).
 *
 * Side effects on the first call: runs gcStaleHermeticDirs(), creates
 * `<tmp>/gstack-hermetic-<pid>-XXXXXX/{.claude,gstack-home}`, writes the seed
 * `.claude.json`, and registers a process `exit` handler that removes the run
 * root again.
 *
 * @returns The cached `{ configDir, gstackHome, runRoot }` triple.
 * @throws Whatever filesystem error interrupted create/seed; the partially
 *   built run root is removed before the error is rethrown.
 */
export function getHermeticDirs(): HermeticDirs {
  if (cachedDirs) return cachedDirs;

  gcStaleHermeticDirs();

  // Embed our pid so the GC of future processes can check liveness.
  const runRoot = fs.mkdtempSync(path.join(os.tmpdir(), `${DIR_PREFIX}${process.pid}-`));
  const configDir = path.join(runRoot, '.claude');
  const gstackHome = path.join(runRoot, 'gstack-home');

  // `||` rather than `??`: an exported-but-empty ANTHROPIC_API_KEY is not a
  // usable key, so it must fall through to GSTACK_ANTHROPIC_API_KEY. With `??`
  // the empty string won and the seed lost its key approval even though a
  // real key was available.
  const apiKey = process.env.ANTHROPIC_API_KEY || process.env.GSTACK_ANTHROPIC_API_KEY;

  // A half-seeded config dir means children hang on first-run prompts until
  // the test timeout — far worse than failing loudly here. So we throw on
  // failure, but tear down the partial dir first: an unseeded runRoot named
  // with our (alive) pid would be skipped by this process's GC and leak until
  // process exit, so remove it before rethrowing.
  try {
    fs.mkdirSync(configDir, { recursive: true });
    fs.mkdirSync(gstackHome, { recursive: true });
    const seed = buildSeedConfig({ apiKey, trustedDirs: [repoRoot()] });
    fs.writeFileSync(path.join(configDir, '.claude.json'), JSON.stringify(seed, null, 2));
  } catch (err) {
    try { fs.rmSync(runRoot, { recursive: true, force: true }); } catch { /* best-effort */ }
    throw err;
  }

  process.on('exit', () => {
    // Exit handlers cannot await: sync best-effort removal only. Anything
    // left behind is reclaimed by the next process's pid-aware GC.
    try { fs.rmSync(runRoot, { recursive: true, force: true }); } catch { /* GC reclaims */ }
  });

  cachedDirs = { configDir, gstackHome, runRoot };
  return cachedDirs;
}
|
||||
|
||||
/**
 * Minimum age a leftover run root must reach before gcStaleHermeticDirs will
 * delete it. Anything younger is kept even when its embedded pid looks dead —
 * a guard against PID reuse deleting a freshly-created dir whose original pid
 * exited and was recycled between create and GC.
 */
const GC_MIN_AGE_MS = 3_600_000; // 1h (60 min * 60 s * 1000 ms)
|
||||
|
||||
/**
 * Reclaims hermetic run roots left behind by crashed runs.
 *
 * An entry is removed only when BOTH signals agree: the pid embedded in its
 * name is dead AND the dir is older than GC_MIN_AGE_MS. Pid-alone would risk
 * PID-reuse false-deletes of live dirs; age-alone would delete the config of
 * a live run that has simply been going for longer than GC_MIN_AGE_MS.
 *
 * Everything here is best-effort: an unreadable temp dir, a vanished entry or
 * a failed removal is silently skipped. Exported for tests.
 *
 * @param tmpDir Directory to scan; defaults to the OS temp dir.
 */
export function gcStaleHermeticDirs(tmpDir: string = os.tmpdir()): void {
  let entries: string[];
  try { entries = fs.readdirSync(tmpDir); } catch { return; }

  const now = Date.now();
  for (const name of entries) {
    if (!name.startsWith(DIR_PREFIX)) continue;

    // Names look like `<prefix><pid>-<random>`. Accept plain decimal digits
    // only: bare Number() would also read '1e3', '0x10' or ' 7' as pids and
    // let us delete a dir this helper never created.
    const pidStr = name.slice(DIR_PREFIX.length).split('-')[0] ?? '';
    if (!/^[0-9]+$/.test(pidStr)) continue;
    const pid = Number(pidStr);
    if (!Number.isInteger(pid) || pid <= 0) continue;

    // Never touch our own dir or one whose owner is still running.
    if (pid === process.pid || isProcessAlive(pid)) continue;

    const full = path.join(tmpDir, name);
    try {
      if (now - fs.statSync(full).mtimeMs < GC_MIN_AGE_MS) continue; // too fresh
    } catch { continue; } // vanished or unreadable — leave it

    try { fs.rmSync(full, { recursive: true, force: true }); } catch { /* best-effort */ }
  }
}
|
||||
|
||||
/**
 * Environment composition used by the runners: process.env goes through
 * buildHermeticEnv, the child is pointed at the singleton hermetic dirs via
 * CLAUDE_CONFIG_DIR / GSTACK_HOME, and per-test overrides are forwarded so
 * they apply last.
 *
 * When hermetic mode is disabled (EVALS_HERMETIC=0) no hermetic vars are
 * injected and the hermetic dirs are never created; the legacy env is what
 * buildHermeticEnv returns for that mode.
 *
 * @param overrides Per-test variables, forwarded unchanged to buildHermeticEnv.
 * @param opts Extra options, forwarded unchanged to buildHermeticEnv.
 * @returns The env object to hand to the child process.
 */
export function hermeticChildEnv(
  overrides?: Record<string, string | undefined>,
  opts?: HermeticEnvOpts,
): Record<string, string> {
  if (isHermeticEnabled()) {
    const { configDir, gstackHome } = getHermeticDirs();
    return buildHermeticEnv(
      process.env,
      { CLAUDE_CONFIG_DIR: configDir, GSTACK_HOME: gstackHome },
      overrides,
      opts,
    );
  }

  // Hermetic mode off: skip dir creation and pass no hermetic variables.
  return buildHermeticEnv(process.env, {}, overrides, opts);
}
|
||||
@@ -210,7 +210,11 @@ const MONOLITH_INVARIANTS: ParityInvariant[] = [
|
||||
skill: 'review',
|
||||
mustContain: ['confidence', 'P1', 'P2'],
|
||||
mustHaveHeadings: ['## Preamble', '## When to invoke'],
|
||||
maxSizeRatio: 1.05,
|
||||
// The adversarial step swapped its bare `command -v codex` check for the shared
|
||||
// codexPreflight() block (install + auth tri-state + CODEX_MODE branch prose),
|
||||
// landing ~6.3% over the v1.53.0.0 baseline. Intentional: it adds proper
|
||||
// not-installed vs not-authed handling, not slop.
|
||||
maxSizeRatio: 1.08,
|
||||
minBytes: 70_000,
|
||||
},
|
||||
{
|
||||
|
||||
@@ -10,6 +10,7 @@ import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import { getProjectEvalDir } from './eval-store';
|
||||
import { hermeticChildEnv, isHermeticEnabled } from './hermetic-env';
|
||||
|
||||
const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev');
|
||||
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json'); // heartbeat stays global
|
||||
@@ -167,6 +168,10 @@ export async function runSkillTest(options: {
|
||||
'--max-turns', String(maxTurns),
|
||||
'--allowed-tools', ...allowedTools,
|
||||
];
|
||||
// Hermetic children get zero MCP servers (no --mcp-config is passed).
|
||||
// Gated on the same call-time check as the env scrub so EVALS_HERMETIC=0
|
||||
// restores operator MCP along with the operator env.
|
||||
if (isHermeticEnabled()) args.push('--strict-mcp-config');
|
||||
|
||||
// Write prompt to a temp file OUTSIDE workingDirectory to avoid race conditions
|
||||
// where afterAll cleanup deletes the dir before cat reads the file (especially
|
||||
@@ -176,11 +181,14 @@ export async function runSkillTest(options: {
|
||||
|
||||
const proc = Bun.spawn(['sh', '-c', `cat "${promptFile}" | claude ${args.map(a => `"${a}"`).join(' ')}`], {
|
||||
cwd: workingDirectory,
|
||||
// Hermetic by default (see test/helpers/hermetic-env.ts): operator
|
||||
// session context (CONDUCTOR_*, CLAUDECODE, ~/.claude config, ~/.gstack)
|
||||
// never reaches the child; EVALS_HERMETIC=0 restores the legacy env.
|
||||
// Default GSTACK_HEADLESS=1 so eval/E2E runs classify as headless (BLOCK on an
|
||||
// AskUserQuestion failure rather than emit a prose question no human reads). A
|
||||
// suite exercising the INTERACTIVE prose-fallback path opts out by passing
|
||||
// `env: { GSTACK_HEADLESS: '' }` — extraEnv wins because it spreads last.
|
||||
env: { ...process.env, GSTACK_HEADLESS: '1', ...extraEnv },
|
||||
env: hermeticChildEnv({ GSTACK_HEADLESS: '1', ...extraEnv }),
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
});
|
||||
|
||||
@@ -36,6 +36,11 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'browse-basic': ['browse/src/**', 'browse/test/test-server.ts'],
|
||||
'browse-snapshot': ['browse/src/**', 'browse/test/test-server.ts'],
|
||||
|
||||
// Hermetic isolation canaries (hermetic-env.ts is also a GLOBAL touchfile;
|
||||
// these entries exist so the canaries themselves stay tier-classified)
|
||||
'hermetic-canary': ['test/helpers/hermetic-env.ts', 'test/helpers/session-runner.ts', 'test/skill-e2e-hermetic-canary.test.ts', 'lib/conductor-env-shim.ts'],
|
||||
'hermetic-sentinel': ['test/helpers/hermetic-env.ts', 'test/helpers/session-runner.ts', 'test/skill-e2e-hermetic-canary.test.ts', 'lib/conductor-env-shim.ts'],
|
||||
|
||||
// SKILL.md setup + preamble (depend on ROOT SKILL.md + gen-skill-docs)
|
||||
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
@@ -111,7 +116,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// written a never-ask preference, AUQ should still auto-decide rather than
|
||||
// surfacing the question. Touches the question-tuning + preference
|
||||
// infrastructure plus the resolvers that own the AUTO_DECIDE preamble.
|
||||
'auto-decide-preserved': ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'test/helpers/claude-pty-runner.ts'],
|
||||
'auto-decide-preserved': ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-preamble-bash.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'hosts/claude/hooks/question-preference-hook.ts', 'lib/is-conductor.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
|
||||
// Conductor → prose decision brief (Conductor signal makes prose the default;
|
||||
// the PreToolUse hook denies the flaky tool). Touches the resolver that owns
|
||||
// the Conductor rule, the preamble signal, the hook, and the detection helper.
|
||||
'conductor-prose': ['scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-preamble-bash.ts', 'scripts/resolvers/preamble.ts', 'plan-eng-review/**', 'hosts/claude/hooks/question-preference-hook.ts', 'lib/is-conductor.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-conductor-prose.test.ts'],
|
||||
|
||||
// Real-PTY E2E batch (#6 new tests on the harness).
|
||||
// Each one tests behavior the SDK harness can't observe (rendered TTY,
|
||||
@@ -291,6 +301,11 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'design-shotgun-session': ['design-shotgun/**', 'scripts/resolvers/design.ts'],
|
||||
'design-shotgun-full': ['design-shotgun/**', 'design/src/**', 'browse/src/**'],
|
||||
|
||||
// /diagram (diagram-render bundle consumers). Triplet = deterministic
|
||||
// functional (gate); authoring quality = LLM-judged benchmark (periodic).
|
||||
'diagram-triplet': ['diagram/**', 'lib/diagram-render/**', 'browse/src/write-commands.ts', 'browse/src/read-commands.ts'],
|
||||
'diagram-authoring-quality': ['diagram/**', 'lib/diagram-render/**', 'test/helpers/llm-judge.ts'],
|
||||
|
||||
// gstack-upgrade
|
||||
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
|
||||
|
||||
@@ -435,6 +450,11 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'browse-basic': 'gate',
|
||||
'browse-snapshot': 'gate',
|
||||
|
||||
// Hermetic isolation — gate (deterministic env/config assertions; if the
|
||||
// clean room breaks, every other eval's signal is contaminated)
|
||||
'hermetic-canary': 'gate',
|
||||
'hermetic-sentinel': 'gate',
|
||||
|
||||
// SKILL.md setup — gate (if setup breaks, no skill works)
|
||||
'skillmd-setup-discovery': 'gate',
|
||||
'skillmd-no-local-binary': 'gate',
|
||||
@@ -508,6 +528,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
// v1.21+ auto-mode regression tests
|
||||
'office-hours-auto-mode': 'gate',
|
||||
'auto-decide-preserved': 'periodic',
|
||||
'conductor-prose': 'periodic',
|
||||
'e2e-harness-audit': 'gate',
|
||||
|
||||
// Real-PTY E2E batch — tier classification:
|
||||
@@ -659,6 +680,10 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'design-shotgun-session': 'gate',
|
||||
'design-shotgun-full': 'periodic',
|
||||
|
||||
// /diagram — triplet is deterministic functional, judge is a quality benchmark
|
||||
'diagram-triplet': 'gate',
|
||||
'diagram-authoring-quality': 'periodic',
|
||||
|
||||
// gstack-upgrade
|
||||
'gstack-upgrade-happy-path': 'gate',
|
||||
|
||||
@@ -779,6 +804,7 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
|
||||
*/
|
||||
export const GLOBAL_TOUCHFILES = [
  // All E2E tests use this runner.
  'test/helpers/session-runner.ts',
  // Changes every E2E child's environment.
  'test/helpers/hermetic-env.ts',
  // All E2E tests store results here.
  'test/helpers/eval-store.ts',
  // Self-referential — reclassifying wrong is dangerous.
  'test/helpers/touchfiles.ts',
];
|
||||
|
||||
Reference in New Issue
Block a user