From 09c82222eabebac6aa3689a47f68e00c00fc7260 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 22 Apr 2026 22:02:32 -0700 Subject: [PATCH] test: fix Codex eval sandbox + collector API Two test infrastructure bugs in the initial Codex eval landed in the prior commit: 1. sandbox: 'read-only' (the default) blocked Codex from writing $OUT_FILE. Test reported "STATUS: BLOCKED" and exited 0 without a capture file. Fixed: sandbox: 'workspace-write' for all 4 cases, allowing writes inside the tempdir. 2. recordCodexResult called a non-existent evalCollector.record() API (I invented it). The real surface is addTest() with a different field schema. Aligned with test/codex-e2e.test.ts pattern. With both fixed, the eval now actually measures Codex AskUserQuestion format compliance. All 4 cases pass on v1.6.2.0 with the gpt.md carve-out: RECOMMENDATION always, Completeness: N/10 only on coverage, "options differ in kind" note on kind, ELI10 explanation present. Co-Authored-By: Claude Opus 4.7 --- test/codex-e2e-plan-format.test.ts | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/test/codex-e2e-plan-format.test.ts b/test/codex-e2e-plan-format.test.ts index 2b1dcb2b..0481f69d 100644 --- a/test/codex-e2e-plan-format.test.ts +++ b/test/codex-e2e-plan-format.test.ts @@ -85,16 +85,17 @@ if (!SKIP) { } function recordCodexResult(testName: string, result: CodexResult, passed: boolean) { - if (!evalCollector) return; - const entry: EvalTestEntry = { - test: testName, + evalCollector?.addTest({ + name: testName, + suite: 'codex-e2e-plan-format', + tier: 'e2e', passed, - cost: 0, // Codex cost not tracked here; inferred from tokens - tokens: result.tokens, - duration: Math.round(result.durationMs / 1000), - exitReason: result.exitCode === 0 ? 'success' : `exit_${result.exitCode}`, - }; - evalCollector.record(entry); + duration_ms: result.durationMs, + cost_usd: 0, // Codex doesn't report cost in the same way; tokens tracked separately + output: result.output?.slice(0, 2000), + turns_used: result.toolCalls.length, + exit_reason: result.exitCode === 0 ? 'success' : `exit_code_${result.exitCode}`, + }); } afterAll(async () => { @@ -183,6 +184,7 @@ describeCodex('Codex Plan Format — CEO Mode Selection', () => { timeoutMs: 300_000, cwd: planDir, skillName: 'gstack-plan-ceo-review', + sandbox: 'workspace-write', }); recordCodexResult('codex-plan-ceo-format-mode', result, result.exitCode === 0); @@ -222,6 +224,7 @@ describeCodex('Codex Plan Format — CEO Approach Menu', () => { timeoutMs: 300_000, cwd: planDir, skillName: 'gstack-plan-ceo-review', + sandbox: 'workspace-write', }); recordCodexResult('codex-plan-ceo-format-approach', result, result.exitCode === 0); @@ -258,6 +261,7 @@ describeCodex('Codex Plan Format — Eng Coverage Issue', () => { timeoutMs: 300_000, cwd: planDir, skillName: 'gstack-plan-eng-review', + sandbox: 'workspace-write', }); recordCodexResult('codex-plan-eng-format-coverage', result, result.exitCode === 0); @@ -294,6 +298,7 @@ describeCodex('Codex Plan Format — Eng Kind Issue', () => { timeoutMs: 300_000, cwd: planDir, skillName: 'gstack-plan-eng-review', + sandbox: 'workspace-write', }); recordCodexResult('codex-plan-eng-format-kind', result, result.exitCode === 0);