mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
test: fix Codex eval sandbox + collector API
Two test-infrastructure bugs landed with the initial Codex eval in the prior commit:

1. sandbox: 'read-only' (the default) blocked Codex from writing $OUT_FILE. The test reported "STATUS: BLOCKED" and exited 0 without a capture file. Fixed: sandbox: 'workspace-write' for all 4 cases, allowing writes inside the tempdir.
2. recordCodexResult called a non-existent evalCollector.record() API (I invented it). The real surface is addTest() with a different field schema. Aligned with the test/codex-e2e.test.ts pattern.

With both fixed, the eval now actually measures Codex AskUserQuestion format compliance. All 4 cases pass on v1.6.2.0 with the gpt.md carve-out: RECOMMENDATION always, Completeness: N/10 only on coverage, "options differ in kind" note on kind, ELI10 explanation present.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -85,16 +85,17 @@ if (!SKIP) {
|
||||
}
|
||||
|
||||
function recordCodexResult(testName: string, result: CodexResult, passed: boolean) {
|
||||
if (!evalCollector) return;
|
||||
const entry: EvalTestEntry = {
|
||||
test: testName,
|
||||
evalCollector?.addTest({
|
||||
name: testName,
|
||||
suite: 'codex-e2e-plan-format',
|
||||
tier: 'e2e',
|
||||
passed,
|
||||
cost: 0, // Codex cost not tracked here; inferred from tokens
|
||||
tokens: result.tokens,
|
||||
duration: Math.round(result.durationMs / 1000),
|
||||
exitReason: result.exitCode === 0 ? 'success' : `exit_${result.exitCode}`,
|
||||
};
|
||||
evalCollector.record(entry);
|
||||
duration_ms: result.durationMs,
|
||||
cost_usd: 0, // Codex doesn't report cost in the same way; tokens tracked separately
|
||||
output: result.output?.slice(0, 2000),
|
||||
turns_used: result.toolCalls.length,
|
||||
exit_reason: result.exitCode === 0 ? 'success' : `exit_code_${result.exitCode}`,
|
||||
});
|
||||
}
|
||||
|
||||
afterAll(async () => {
|
||||
@@ -183,6 +184,7 @@ describeCodex('Codex Plan Format — CEO Mode Selection', () => {
|
||||
timeoutMs: 300_000,
|
||||
cwd: planDir,
|
||||
skillName: 'gstack-plan-ceo-review',
|
||||
sandbox: 'workspace-write',
|
||||
});
|
||||
|
||||
recordCodexResult('codex-plan-ceo-format-mode', result, result.exitCode === 0);
|
||||
@@ -222,6 +224,7 @@ describeCodex('Codex Plan Format — CEO Approach Menu', () => {
|
||||
timeoutMs: 300_000,
|
||||
cwd: planDir,
|
||||
skillName: 'gstack-plan-ceo-review',
|
||||
sandbox: 'workspace-write',
|
||||
});
|
||||
|
||||
recordCodexResult('codex-plan-ceo-format-approach', result, result.exitCode === 0);
|
||||
@@ -258,6 +261,7 @@ describeCodex('Codex Plan Format — Eng Coverage Issue', () => {
|
||||
timeoutMs: 300_000,
|
||||
cwd: planDir,
|
||||
skillName: 'gstack-plan-eng-review',
|
||||
sandbox: 'workspace-write',
|
||||
});
|
||||
|
||||
recordCodexResult('codex-plan-eng-format-coverage', result, result.exitCode === 0);
|
||||
@@ -294,6 +298,7 @@ describeCodex('Codex Plan Format — Eng Kind Issue', () => {
|
||||
timeoutMs: 300_000,
|
||||
cwd: planDir,
|
||||
skillName: 'gstack-plan-eng-review',
|
||||
sandbox: 'workspace-write',
|
||||
});
|
||||
|
||||
recordCodexResult('codex-plan-eng-format-kind', result, result.exitCode === 0);
|
||||
|
||||
Reference in New Issue
Block a user