From b7f6246061f94d28cae4fff44da285e27d959260 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 22 Apr 2026 21:34:11 -0700 Subject: [PATCH] test: add Codex eval for AskUserQuestion format compliance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four-case periodic-tier eval mirrors test/skill-e2e-plan-format.test.ts but drives the plan review skills via codex exec instead of claude -p. Context: Codex under the gpt.md "No preamble / Prefer doing over listing" overlay tends to skip the Simplify/ELI10 paragraph and the RECOMMENDATION line on AskUserQuestion calls. Users have to manually re-prompt "ELI10 and don't forget to recommend" almost every time. This test pins the behavior so regressions surface. Cases: - plan-ceo-review mode selection (kind-differentiated) - plan-ceo-review approach menu (coverage-differentiated) - plan-eng-review per-issue coverage decision - plan-eng-review per-issue architectural choice (kind-differentiated) Assertions on captured AskUserQuestion text: - RECOMMENDATION: Choose present (all cases) - Completeness: N/10 present on coverage, absent on kind - "options differ in kind" note present on kind - ELI10 length floor (>400 chars) — catches bare options-only output Cost: ~\$2-4 per full run. Co-Authored-By: Claude Opus 4.7 --- test/codex-e2e-plan-format.test.ts | 315 +++++++++++++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 test/codex-e2e-plan-format.test.ts diff --git a/test/codex-e2e-plan-format.test.ts b/test/codex-e2e-plan-format.test.ts new file mode 100644 index 00000000..2b1dcb2b --- /dev/null +++ b/test/codex-e2e-plan-format.test.ts @@ -0,0 +1,315 @@ +/** + * AskUserQuestion format regression test for /plan-ceo-review and /plan-eng-review + * running under Codex CLI (GPT-5.4). + * + * Context: GPT-class models under the "No preamble / Prefer doing over listing" + * gpt.md overlay tend to skip the Simplify (ELI10) paragraph and the RECOMMENDATION + * line on AskUserQuestion calls. The user has to manually re-prompt "ELI10 and don't + * forget to recommend" almost every time. This test pins that behavior so future + * regressions surface automatically. + * + * Mirrors test/skill-e2e-plan-format.test.ts (the Claude version) but uses + * test/helpers/codex-session-runner.ts to drive `codex exec` instead of `claude -p`. + * + * Four cases: + * 1. plan-ceo-review mode selection (kind-differentiated) + * 2. plan-ceo-review approach menu (coverage-differentiated) + * 3. plan-eng-review per-issue coverage decision + * 4. plan-eng-review per-issue architectural choice (kind-differentiated) + * + * Assertions on captured AskUserQuestion text: + * - RECOMMENDATION: Choose present (all cases) + * - Completeness: N/10 present on coverage cases, absent on kind cases + * - "options differ in kind" note present on kind cases + * - ELI10-style plain-English explanation present (length floor + no raw jargon) + * + * Periodic tier (Codex non-determinism). Cost: ~$2-3 per full run. + */ +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import { runCodexSkill, installSkillToTempHome } from './helpers/codex-session-runner'; +import type { CodexResult } from './helpers/codex-session-runner'; +import { EvalCollector } from './helpers/eval-store'; +import type { EvalTestEntry } from './helpers/eval-store'; +import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { spawnSync } from 'child_process'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +// --- Prerequisites --- + +const CODEX_AVAILABLE = (() => { + try { + const result = Bun.spawnSync(['which', 'codex']); + return result.exitCode === 0; + } catch { return false; } +})(); +const evalsEnabled = !!process.env.EVALS; +const SKIP = !CODEX_AVAILABLE || !evalsEnabled; +const describeCodex = SKIP ? describe.skip : describe; + +// --- Touchfiles --- + +const CODEX_FORMAT_TOUCHFILES: Record = { + 'codex-plan-ceo-format-mode': ['.agents/skills/gstack-plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'model-overlays/gpt.md', 'model-overlays/gpt-5.4.md'], + 'codex-plan-ceo-format-approach': ['.agents/skills/gstack-plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'model-overlays/gpt.md', 'model-overlays/gpt-5.4.md'], + 'codex-plan-eng-format-coverage': ['.agents/skills/gstack-plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'model-overlays/gpt.md', 'model-overlays/gpt-5.4.md'], + 'codex-plan-eng-format-kind': ['.agents/skills/gstack-plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'model-overlays/gpt.md', 'model-overlays/gpt-5.4.md'], +}; + +let selectedTests: string[] | null = null; +if (evalsEnabled && !process.env.EVALS_ALL) { + const baseBranch = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main'; + const changedFiles = getChangedFiles(baseBranch, ROOT); + if (changedFiles.length > 0) { + const selection = selectTests(changedFiles, CODEX_FORMAT_TOUCHFILES, GLOBAL_TOUCHFILES); + selectedTests = selection.selected; + } +} + +function testIfSelected(name: string, fn: () => Promise, timeout?: number) { + if (selectedTests !== null && !selectedTests.includes(name)) { + test.skip(name, fn, timeout); + } else { + test(name, fn, timeout); + } +} + +// --- Eval collector --- + +let evalCollector: EvalCollector | null = null; +if (!SKIP) { + evalCollector = new EvalCollector('codex-e2e-plan-format'); +} + +function recordCodexResult(testName: string, result: CodexResult, passed: boolean) { + if (!evalCollector) return; + const entry: EvalTestEntry = { + test: testName, + passed, + cost: 0, // Codex cost not tracked here; inferred from tokens + tokens: result.tokens, + duration: Math.round(result.durationMs / 1000), + exitReason: result.exitCode === 0 ? 'success' : `exit_${result.exitCode}`, + }; + evalCollector.record(entry); +} + +afterAll(async () => { + if (evalCollector) { + await evalCollector.finalize(); + } +}); + +// --- Fixtures --- + +const SAMPLE_PLAN = `# Plan: Add User Dashboard + +## Context +We're building a new user dashboard that shows recent activity, notifications, and quick actions. + +## Changes +1. New React component \`UserDashboard\` in \`src/components/\` +2. REST API endpoint \`GET /api/dashboard\` returning user stats +3. PostgreSQL query for activity aggregation +4. Redis cache layer for dashboard data (5min TTL) + +## Architecture +- Frontend: React + TailwindCSS +- Backend: Express.js REST API +- Database: PostgreSQL with existing user/activity tables +- Cache: Redis for dashboard aggregates +`; + +function setupCodexSkillDir(tmpPrefix: string, skillName: 'plan-ceo-review' | 'plan-eng-review'): { skillDir: string; planDir: string; outFile: string } { + const planDir = fs.mkdtempSync(path.join(os.tmpdir(), tmpPrefix)); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + fs.writeFileSync(path.join(planDir, 'plan.md'), SAMPLE_PLAN); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'add plan']); + + // Codex skill lives in .agents/skills/gstack-{name}/ per the gstack host convention. + const codexSkillSource = path.join(ROOT, '.agents', 'skills', `gstack-${skillName}`); + const skillDir = path.join(planDir, '.agents', 'skills', `gstack-${skillName}`); + fs.mkdirSync(skillDir, { recursive: true }); + fs.cpSync(codexSkillSource, skillDir, { recursive: true }); + + const outFile = path.join(planDir, 'ask-capture.md'); + return { skillDir, planDir, outFile }; +} + +// Capture instruction — same shape as the Claude version. Codex may ignore tool calls, +// so we tell it to write prose to the file directly. +function captureInstruction(outFile: string): string { + return `Write the verbatim text of every AskUserQuestion you would have presented to the user to the file ${outFile} (one question per session, full text including the re-ground, ELI10 paragraph, RECOMMENDATION line, and options). Do NOT ask the user interactively. Do NOT paraphrase. This is a format-capture test, not an interactive session.`; +} + +// --- Regex predicates --- +// Match RECOMMENDATION lenient to markdown bolding around it. +const RECOMMENDATION_RE = /RECOMMENDATION:[*\s]*Choose/; +const COMPLETENESS_RE = /Completeness:\s*\d{1,2}\/10/; +const KIND_NOTE_RE = /options differ in kind/i; +// ELI10 signal: some plain-English explanation must exist. Weak proxy: >= 200 chars +// of narrative prose between the re-ground and the options, AND at least one of the +// plain-English hints ("plain English", "16-year-old", or "what this means"). +// We test for the length floor and absence of a bare options-list-only output. +const ELI10_LENGTH_FLOOR = 400; // full AskUserQuestion content should be at least this long + +// --- Tests --- + +describeCodex('Codex Plan Format — CEO Mode Selection', () => { + let skillDir: string, planDir: string, outFile: string; + + beforeAll(() => { + ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-ceo-mode-', 'plan-ceo-review')); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testIfSelected('codex-plan-ceo-format-mode', async () => { + const result = await runCodexSkill({ + skillDir, + prompt: `Read the plan-ceo-review skill. Read plan.md (the plan to review). Proceed to Step 0F (Mode Selection) where the skill presents 4 mode options (SCOPE EXPANSION, SELECTIVE EXPANSION, HOLD SCOPE, SCOPE REDUCTION) via AskUserQuestion. These options differ in kind (review posture), not coverage. ${captureInstruction(outFile)}`, + timeoutMs: 300_000, + cwd: planDir, + skillName: 'gstack-plan-ceo-review', + }); + + recordCodexResult('codex-plan-ceo-format-mode', result, result.exitCode === 0); + console.log(`codex-plan-ceo-format-mode: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`); + + // Codex may timeout — accept as non-fatal (same pattern as existing codex-e2e tests) + if (result.exitCode === 124 || result.exitCode === 137) { + console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`); + return; + } + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR); + expect(captured).toMatch(RECOMMENDATION_RE); + // kind-differentiated: no fabricated score, must have note + expect(captured).not.toMatch(COMPLETENESS_RE); + expect(captured).toMatch(KIND_NOTE_RE); + }, 360_000); +}); + +describeCodex('Codex Plan Format — CEO Approach Menu', () => { + let skillDir: string, planDir: string, outFile: string; + + beforeAll(() => { + ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-ceo-approach-', 'plan-ceo-review')); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testIfSelected('codex-plan-ceo-format-approach', async () => { + const result = await runCodexSkill({ + skillDir, + prompt: `Read the plan-ceo-review skill. Read plan.md. Proceed to Step 0C-bis (Implementation Alternatives / Approach Menu) where the skill generates 2-3 approaches (minimal viable vs ideal architecture) and presents them via AskUserQuestion. These options differ in coverage so Completeness: N/10 applies. ${captureInstruction(outFile)}`, + timeoutMs: 300_000, + cwd: planDir, + skillName: 'gstack-plan-ceo-review', + }); + + recordCodexResult('codex-plan-ceo-format-approach', result, result.exitCode === 0); + console.log(`codex-plan-ceo-format-approach: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`); + + if (result.exitCode === 124 || result.exitCode === 137) { + console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`); + return; + } + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR); + expect(captured).toMatch(RECOMMENDATION_RE); + expect(captured).toMatch(COMPLETENESS_RE); + }, 360_000); +}); + +describeCodex('Codex Plan Format — Eng Coverage Issue', () => { + let skillDir: string, planDir: string, outFile: string; + + beforeAll(() => { + ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-eng-cov-', 'plan-eng-review')); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testIfSelected('codex-plan-eng-format-coverage', async () => { + const result = await runCodexSkill({ + skillDir, + prompt: `Read the plan-eng-review skill. Read plan.md. In your Section 3 Test Review, generate ONE AskUserQuestion about test coverage depth where options are clearly coverage-differentiated: A) full coverage incl. edge + error paths (Completeness 10/10), B) happy path only (7/10), C) smoke test (3/10). ${captureInstruction(outFile)}`, + timeoutMs: 300_000, + cwd: planDir, + skillName: 'gstack-plan-eng-review', + }); + + recordCodexResult('codex-plan-eng-format-coverage', result, result.exitCode === 0); + console.log(`codex-plan-eng-format-coverage: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`); + + if (result.exitCode === 124 || result.exitCode === 137) { + console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`); + return; + } + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR); + expect(captured).toMatch(RECOMMENDATION_RE); + expect(captured).toMatch(COMPLETENESS_RE); + }, 360_000); +}); + +describeCodex('Codex Plan Format — Eng Kind Issue', () => { + let skillDir: string, planDir: string, outFile: string; + + beforeAll(() => { + ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-eng-kind-', 'plan-eng-review')); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testIfSelected('codex-plan-eng-format-kind', async () => { + const result = await runCodexSkill({ + skillDir, + prompt: `Read the plan-eng-review skill. Read plan.md. In your Section 1 Architecture review, generate ONE AskUserQuestion about an architectural choice where the options differ in kind (e.g. Redis vs Postgres materialized view vs in-process cache — different kinds of systems with different tradeoffs, NOT more-or-less-complete versions of the same thing). ${captureInstruction(outFile)}`, + timeoutMs: 300_000, + cwd: planDir, + skillName: 'gstack-plan-eng-review', + }); + + recordCodexResult('codex-plan-eng-format-kind', result, result.exitCode === 0); + console.log(`codex-plan-eng-format-kind: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`); + + if (result.exitCode === 124 || result.exitCode === 137) { + console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`); + return; + } + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR); + expect(captured).toMatch(RECOMMENDATION_RE); + // kind-differentiated: no fabricated score + expect(captured).not.toMatch(COMPLETENESS_RE); + expect(captured).toMatch(KIND_NOTE_RE); + }, 360_000); +});