test: fix Codex eval sandbox + collector API

The initial Codex eval, which landed in the prior commit, contained
two test-infrastructure bugs:

1. The default sandbox mode, 'read-only', blocked Codex from writing
   $OUT_FILE. The test reported "STATUS: BLOCKED" and exited 0 without
   producing a capture file. Fixed by passing sandbox: 'workspace-write'
   in all 4 cases, which allows writes inside the tempdir.

2. recordCodexResult called a non-existent evalCollector.record()
   API (I invented it). The real surface is addTest(), which takes a
   different field schema. The call is now aligned with the pattern
   used in test/codex-e2e.test.ts.

With both bugs fixed, the eval now actually measures Codex's
AskUserQuestion format compliance. All 4 cases pass on v1.6.2.0 with
the gpt.md carve-out: RECOMMENDATION is always present,
"Completeness: N/10" appears only on the coverage issue, the
"options differ in kind" note appears on the kind issue, and an ELI10
explanation is present.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-22 22:02:32 -07:00
parent 028627fbcd
commit 09c82222ea
+14 -9
View File
@@ -85,16 +85,17 @@ if (!SKIP) {
 }
 function recordCodexResult(testName: string, result: CodexResult, passed: boolean) {
-  if (!evalCollector) return;
-  const entry: EvalTestEntry = {
-    test: testName,
+  evalCollector?.addTest({
+    name: testName,
+    suite: 'codex-e2e-plan-format',
+    tier: 'e2e',
     passed,
-    cost: 0, // Codex cost not tracked here; inferred from tokens
-    tokens: result.tokens,
-    duration: Math.round(result.durationMs / 1000),
-    exitReason: result.exitCode === 0 ? 'success' : `exit_${result.exitCode}`,
-  };
-  evalCollector.record(entry);
+    duration_ms: result.durationMs,
+    cost_usd: 0, // Codex doesn't report cost in the same way; tokens tracked separately
+    output: result.output?.slice(0, 2000),
+    turns_used: result.toolCalls.length,
+    exit_reason: result.exitCode === 0 ? 'success' : `exit_code_${result.exitCode}`,
+  });
 }
 afterAll(async () => {
@@ -183,6 +184,7 @@ describeCodex('Codex Plan Format — CEO Mode Selection', () => {
   timeoutMs: 300_000,
   cwd: planDir,
   skillName: 'gstack-plan-ceo-review',
+  sandbox: 'workspace-write',
 });
 recordCodexResult('codex-plan-ceo-format-mode', result, result.exitCode === 0);
@@ -222,6 +224,7 @@ describeCodex('Codex Plan Format — CEO Approach Menu', () => {
   timeoutMs: 300_000,
   cwd: planDir,
   skillName: 'gstack-plan-ceo-review',
+  sandbox: 'workspace-write',
 });
 recordCodexResult('codex-plan-ceo-format-approach', result, result.exitCode === 0);
@@ -258,6 +261,7 @@ describeCodex('Codex Plan Format — Eng Coverage Issue', () => {
   timeoutMs: 300_000,
   cwd: planDir,
   skillName: 'gstack-plan-eng-review',
+  sandbox: 'workspace-write',
 });
 recordCodexResult('codex-plan-eng-format-coverage', result, result.exitCode === 0);
@@ -294,6 +298,7 @@ describeCodex('Codex Plan Format — Eng Kind Issue', () => {
   timeoutMs: 300_000,
   cwd: planDir,
   skillName: 'gstack-plan-eng-review',
+  sandbox: 'workspace-write',
 });
 recordCodexResult('codex-plan-eng-format-kind', result, result.exitCode === 0);