test: add Codex eval for AskUserQuestion format compliance

Four-case periodic-tier eval mirrors test/skill-e2e-plan-format.test.ts but drives the plan review skills via codex exec instead of claude -p. Context: Codex under the gpt.md "No preamble / Prefer doing over listing" overlay tends to skip the Simplify/ELI10 paragraph and the RECOMMENDATION line on AskUserQuestion calls. Users have to manually re-prompt "ELI10 and don't forget to recommend" almost every time. This test pins the behavior so regressions surface. Cases: - plan-ceo-review mode selection (kind-differentiated) - plan-ceo-review approach menu (coverage-differentiated) - plan-eng-review per-issue coverage decision - plan-eng-review per-issue architectural choice (kind-differentiated) Assertions on captured AskUserQuestion text: - RECOMMENDATION: Choose present (all cases) - Completeness: N/10 present on coverage, absent on kind - "options differ in kind" note present on kind - ELI10 length floor (>400 chars) — catches bare options-only output Cost: ~\$2-4 per full run. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-02 03:35:09 +02:00 · 2026-04-22 21:34:11 -07:00
parent d591ad29b2
commit b7f6246061
1 changed files with 315 additions and 0 deletions
@@ -0,0 +1,315 @@
+/**
+ * AskUserQuestion format regression test for /plan-ceo-review and /plan-eng-review
+ * running under Codex CLI (GPT-5.4).
+ *
+ * Context: GPT-class models under the "No preamble / Prefer doing over listing"
+ * gpt.md overlay tend to skip the Simplify (ELI10) paragraph and the RECOMMENDATION
+ * line on AskUserQuestion calls. The user has to manually re-prompt "ELI10 and don't
+ * forget to recommend" almost every time. This test pins that behavior so future
+ * regressions surface automatically.
+ *
+ * Mirrors test/skill-e2e-plan-format.test.ts (the Claude version) but uses
+ * test/helpers/codex-session-runner.ts to drive `codex exec` instead of `claude -p`.
+ *
+ * Four cases:
+ *   1. plan-ceo-review mode selection (kind-differentiated)
+ *   2. plan-ceo-review approach menu (coverage-differentiated)
+ *   3. plan-eng-review per-issue coverage decision
+ *   4. plan-eng-review per-issue architectural choice (kind-differentiated)
+ *
+ * Assertions on captured AskUserQuestion text:
+ *   - RECOMMENDATION: Choose present (all cases)
+ *   - Completeness: N/10 present on coverage cases, absent on kind cases
+ *   - "options differ in kind" note present on kind cases
+ *   - ELI10-style plain-English explanation present (proxied only by a >400-char length floor on the captured text)
+ *
+ * Periodic tier (Codex non-determinism). Cost: ~$2-3 per full run.
+ */
+import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
+import { runCodexSkill, installSkillToTempHome } from './helpers/codex-session-runner';
+import type { CodexResult } from './helpers/codex-session-runner';
+import { EvalCollector } from './helpers/eval-store';
+import type { EvalTestEntry } from './helpers/eval-store';
+import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { spawnSync } from 'child_process';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+
+// --- Prerequisites ---
+
+/** Reports whether `which codex` exits 0; any throw while probing counts as "not available". */
+function codexBinaryOnPath(): boolean {
+  try {
+    return Bun.spawnSync(['which', 'codex']).exitCode === 0;
+  } catch {
+    return false;
+  }
+}
+
+const CODEX_AVAILABLE = codexBinaryOnPath();
+// Evals are opt-in: any non-empty EVALS value enables them.
+const evalsEnabled = Boolean(process.env.EVALS);
+// The suites only run when Codex is installed AND evals were requested.
+const SKIP = !(CODEX_AVAILABLE && evalsEnabled);
+const describeCodex = SKIP ? describe.skip : describe;
+
+// --- Touchfiles ---
+
+// Each case watches its own skill directory plus the same four shared files.
+// A fresh array is built per key so no two entries alias each other.
+const formatTouchfilesFor = (skill: 'plan-ceo-review' | 'plan-eng-review'): string[] => [
+  `.agents/skills/gstack-${skill}/**`,
+  'scripts/resolvers/preamble/generate-ask-user-format.ts',
+  'scripts/resolvers/preamble/generate-completeness-section.ts',
+  'model-overlays/gpt.md',
+  'model-overlays/gpt-5.4.md',
+];
+
+const CODEX_FORMAT_TOUCHFILES: Record<string, string[]> = {
+  'codex-plan-ceo-format-mode': formatTouchfilesFor('plan-ceo-review'),
+  'codex-plan-ceo-format-approach': formatTouchfilesFor('plan-ceo-review'),
+  'codex-plan-eng-format-coverage': formatTouchfilesFor('plan-eng-review'),
+  'codex-plan-eng-format-kind': formatTouchfilesFor('plan-eng-review'),
+};
+
+/**
+ * Diff-based test selection. Returns null unless evals are enabled, EVALS_ALL is
+ * unset, and the diff against the base branch lists at least one changed file;
+ * otherwise returns the names selectTests() picked for those changes.
+ */
+function pickSelectedTests(): string[] | null {
+  if (!evalsEnabled || process.env.EVALS_ALL) return null;
+  const base = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
+  const changed = getChangedFiles(base, ROOT);
+  if (changed.length === 0) return null;
+  return selectTests(changed, CODEX_FORMAT_TOUCHFILES, GLOBAL_TOUCHFILES).selected;
+}
+
+let selectedTests: string[] | null = pickSelectedTests();
+
+/**
+ * Registers `fn` as a test, or as a skipped test when a selection exists
+ * (selectedTests is non-null) and `name` is not part of it.
+ */
+function testIfSelected(name: string, fn: () => Promise<void>, timeout?: number) {
+  const deselected = selectedTests !== null && !selectedTests.includes(name);
+  if (deselected) {
+    test.skip(name, fn, timeout);
+    return;
+  }
+  test(name, fn, timeout);
+}
+
+// --- Eval collector ---
+
+// No collector is created when the suite is skipped.
+const evalCollector: EvalCollector | null = SKIP
+  ? null
+  : new EvalCollector('codex-e2e-plan-format');
+
+/**
+ * Records one test outcome in the eval collector. Does nothing when no
+ * collector exists.
+ *
+ * @param testName name stored in the entry's `test` field
+ * @param result   Codex run result; tokens, durationMs and exitCode are read
+ * @param passed   pass/fail flag stored verbatim
+ */
+function recordCodexResult(testName: string, result: CodexResult, passed: boolean) {
+  if (evalCollector === null) return;
+  const { tokens, durationMs, exitCode } = result;
+  const exitReason = exitCode === 0 ? 'success' : `exit_${exitCode}`;
+  const entry: EvalTestEntry = {
+    test: testName,
+    passed,
+    cost: 0, // Codex cost is not tracked here
+    tokens,
+    duration: Math.round(durationMs / 1000), // milliseconds -> whole seconds
+    exitReason,
+  };
+  evalCollector.record(entry);
+}
+
+// File-level teardown: finalize the collector if one was created.
+afterAll(async () => {
+  await evalCollector?.finalize();
+});
+
+// --- Fixtures ---
+
+// Fixture plan. setupCodexSkillDir() writes it to plan.md in each temp repo,
+// and every test prompt tells Codex to read plan.md.
+const SAMPLE_PLAN = `# Plan: Add User Dashboard
+
+## Context
+We're building a new user dashboard that shows recent activity, notifications, and quick actions.
+
+## Changes
+1. New React component \`UserDashboard\` in \`src/components/\`
+2. REST API endpoint \`GET /api/dashboard\` returning user stats
+3. PostgreSQL query for activity aggregation
+4. Redis cache layer for dashboard data (5min TTL)
+
+## Architecture
+- Frontend: React + TailwindCSS
+- Backend: Express.js REST API
+- Database: PostgreSQL with existing user/activity tables
+- Cache: Redis for dashboard aggregates
+`;
+
+/**
+ * Builds a throwaway git repo containing plan.md and a copy of one Codex skill.
+ *
+ * @param tmpPrefix prefix of the temp directory created under os.tmpdir()
+ * @param skillName skill copied from ROOT/.agents/skills/gstack-{skillName}
+ * @returns skillDir (the copied skill inside the repo), planDir (the repo root,
+ *          which the caller is expected to remove) and outFile (the path Codex is
+ *          asked to write its capture to; the file is not created here)
+ * @throws rethrows any error raised during setup, after removing the
+ *         partially-built temp directory
+ */
+function setupCodexSkillDir(tmpPrefix: string, skillName: 'plan-ceo-review' | 'plan-eng-review'): { skillDir: string; planDir: string; outFile: string } {
+  const planDir = fs.mkdtempSync(path.join(os.tmpdir(), tmpPrefix));
+  try {
+    // spawnSync results are not inspected, so a failing git command does not abort setup.
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+
+    fs.writeFileSync(path.join(planDir, 'plan.md'), SAMPLE_PLAN);
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'add plan']);
+
+    // Codex skill lives in .agents/skills/gstack-{name}/ per the gstack host convention.
+    const codexSkillSource = path.join(ROOT, '.agents', 'skills', `gstack-${skillName}`);
+    const skillDir = path.join(planDir, '.agents', 'skills', `gstack-${skillName}`);
+    fs.mkdirSync(skillDir, { recursive: true });
+    fs.cpSync(codexSkillSource, skillDir, { recursive: true });
+
+    const outFile = path.join(planDir, 'ask-capture.md');
+    return { skillDir, planDir, outFile };
+  } catch (err) {
+    // Callers only learn planDir from the return value, so without this cleanup
+    // a failed setup would leave the temp directory behind.
+    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
+    throw err;
+  }
+}
+
+// Prompt suffix shared by all four cases (same shape as the Claude version).
+// Codex may ignore tool calls, so it is told to write the question text to a file.
+function captureInstruction(outFile: string): string {
+  const sentences = [
+    `Write the verbatim text of every AskUserQuestion you would have presented to the user to the file ${outFile} (one question per session, full text including the re-ground, ELI10 paragraph, RECOMMENDATION line, and options).`,
+    'Do NOT ask the user interactively.',
+    'Do NOT paraphrase.',
+    'This is a format-capture test, not an interactive session.',
+  ];
+  return sentences.join(' ');
+}
+
+// --- Regex predicates ---
+// RECOMMENDATION line: tolerates `*` and whitespace between the colon and "Choose",
+// so a markdown-bolded label such as `**RECOMMENDATION:** Choose` still matches.
+const RECOMMENDATION_RE = /RECOMMENDATION:[*\s]*Choose/;
+// Coverage score such as "Completeness: 7/10" (one or two digits before "/10").
+const COMPLETENESS_RE = /Completeness:\s*\d{1,2}\/10/;
+// Case-insensitive match for the phrase "options differ in kind".
+const KIND_NOTE_RE = /options differ in kind/i;
+// ELI10 signal: there is no direct check for a plain-English paragraph. The only
+// proxy is a length floor on the whole captured file, which catches a bare
+// options-list-only output but does not prove an explanation is present.
+const ELI10_LENGTH_FLOOR = 400; // captured text must be strictly longer than this
+
+// --- Tests ---
+
+// Case: plan-ceo-review mode selection. The options differ in kind, so the capture
+// must contain the kind note and must NOT contain a Completeness score.
+describeCodex('Codex Plan Format — CEO Mode Selection', () => {
+  let skillDir: string, planDir: string, outFile: string;
+
+  beforeAll(() => {
+    ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-ceo-mode-', 'plan-ceo-review'));
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
+  });
+
+  testIfSelected('codex-plan-ceo-format-mode', async () => {
+    const result = await runCodexSkill({
+      skillDir,
+      prompt: `Read the plan-ceo-review skill. Read plan.md (the plan to review). Proceed to Step 0F (Mode Selection) where the skill presents 4 mode options (SCOPE EXPANSION, SELECTIVE EXPANSION, HOLD SCOPE, SCOPE REDUCTION) via AskUserQuestion. These options differ in kind (review posture), not coverage. ${captureInstruction(outFile)}`,
+      timeoutMs: 300_000,
+      cwd: planDir,
+      skillName: 'gstack-plan-ceo-review',
+    });
+
+    console.log(`codex-plan-ceo-format-mode: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`);
+
+    // The outcome is recorded in `finally`, after the format assertions, so the
+    // eval store never reports a pass for a capture that failed the checks.
+    let formatOk = false;
+    try {
+      // Codex may timeout — accept as non-fatal (same pattern as existing codex-e2e tests).
+      // A timeout is still recorded as not passed because formatOk stays false.
+      if (result.exitCode === 124 || result.exitCode === 137) {
+        console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`);
+        return;
+      }
+
+      expect(fs.existsSync(outFile)).toBe(true);
+      const captured = fs.readFileSync(outFile, 'utf-8');
+      expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR);
+      expect(captured).toMatch(RECOMMENDATION_RE);
+      // kind-differentiated: no fabricated score, must have note
+      expect(captured).not.toMatch(COMPLETENESS_RE);
+      expect(captured).toMatch(KIND_NOTE_RE);
+      formatOk = true;
+    } finally {
+      recordCodexResult('codex-plan-ceo-format-mode', result, formatOk && result.exitCode === 0);
+    }
+  }, 360_000);
+});
+
+// Case: plan-ceo-review approach menu. The options differ in coverage, so the
+// capture must contain a Completeness: N/10 score.
+describeCodex('Codex Plan Format — CEO Approach Menu', () => {
+  let skillDir: string, planDir: string, outFile: string;
+
+  beforeAll(() => {
+    ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-ceo-approach-', 'plan-ceo-review'));
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
+  });
+
+  testIfSelected('codex-plan-ceo-format-approach', async () => {
+    const result = await runCodexSkill({
+      skillDir,
+      prompt: `Read the plan-ceo-review skill. Read plan.md. Proceed to Step 0C-bis (Implementation Alternatives / Approach Menu) where the skill generates 2-3 approaches (minimal viable vs ideal architecture) and presents them via AskUserQuestion. These options differ in coverage so Completeness: N/10 applies. ${captureInstruction(outFile)}`,
+      timeoutMs: 300_000,
+      cwd: planDir,
+      skillName: 'gstack-plan-ceo-review',
+    });
+
+    console.log(`codex-plan-ceo-format-approach: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`);
+
+    // The outcome is recorded in `finally`, after the format assertions, so the
+    // eval store never reports a pass for a capture that failed the checks.
+    let formatOk = false;
+    try {
+      // Timeout exit codes are tolerated by the test but still recorded as not passed.
+      if (result.exitCode === 124 || result.exitCode === 137) {
+        console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`);
+        return;
+      }
+
+      expect(fs.existsSync(outFile)).toBe(true);
+      const captured = fs.readFileSync(outFile, 'utf-8');
+      expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR);
+      expect(captured).toMatch(RECOMMENDATION_RE);
+      expect(captured).toMatch(COMPLETENESS_RE);
+      formatOk = true;
+    } finally {
+      recordCodexResult('codex-plan-ceo-format-approach', result, formatOk && result.exitCode === 0);
+    }
+  }, 360_000);
+});
+
+// Case: plan-eng-review per-issue coverage decision. The options differ in
+// coverage, so the capture must contain a Completeness: N/10 score.
+describeCodex('Codex Plan Format — Eng Coverage Issue', () => {
+  let skillDir: string, planDir: string, outFile: string;
+
+  beforeAll(() => {
+    ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-eng-cov-', 'plan-eng-review'));
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
+  });
+
+  testIfSelected('codex-plan-eng-format-coverage', async () => {
+    const result = await runCodexSkill({
+      skillDir,
+      prompt: `Read the plan-eng-review skill. Read plan.md. In your Section 3 Test Review, generate ONE AskUserQuestion about test coverage depth where options are clearly coverage-differentiated: A) full coverage incl. edge + error paths (Completeness 10/10), B) happy path only (7/10), C) smoke test (3/10). ${captureInstruction(outFile)}`,
+      timeoutMs: 300_000,
+      cwd: planDir,
+      skillName: 'gstack-plan-eng-review',
+    });
+
+    console.log(`codex-plan-eng-format-coverage: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`);
+
+    // The outcome is recorded in `finally`, after the format assertions, so the
+    // eval store never reports a pass for a capture that failed the checks.
+    let formatOk = false;
+    try {
+      // Timeout exit codes are tolerated by the test but still recorded as not passed.
+      if (result.exitCode === 124 || result.exitCode === 137) {
+        console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`);
+        return;
+      }
+
+      expect(fs.existsSync(outFile)).toBe(true);
+      const captured = fs.readFileSync(outFile, 'utf-8');
+      expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR);
+      expect(captured).toMatch(RECOMMENDATION_RE);
+      expect(captured).toMatch(COMPLETENESS_RE);
+      formatOk = true;
+    } finally {
+      recordCodexResult('codex-plan-eng-format-coverage', result, formatOk && result.exitCode === 0);
+    }
+  }, 360_000);
+});
+
+// Case: plan-eng-review per-issue architectural choice. The options differ in kind,
+// so the capture must contain the kind note and must NOT contain a Completeness score.
+describeCodex('Codex Plan Format — Eng Kind Issue', () => {
+  let skillDir: string, planDir: string, outFile: string;
+
+  beforeAll(() => {
+    ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-eng-kind-', 'plan-eng-review'));
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
+  });
+
+  testIfSelected('codex-plan-eng-format-kind', async () => {
+    const result = await runCodexSkill({
+      skillDir,
+      prompt: `Read the plan-eng-review skill. Read plan.md. In your Section 1 Architecture review, generate ONE AskUserQuestion about an architectural choice where the options differ in kind (e.g. Redis vs Postgres materialized view vs in-process cache — different kinds of systems with different tradeoffs, NOT more-or-less-complete versions of the same thing). ${captureInstruction(outFile)}`,
+      timeoutMs: 300_000,
+      cwd: planDir,
+      skillName: 'gstack-plan-eng-review',
+    });
+
+    console.log(`codex-plan-eng-format-kind: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`);
+
+    // The outcome is recorded in `finally`, after the format assertions, so the
+    // eval store never reports a pass for a capture that failed the checks.
+    let formatOk = false;
+    try {
+      // Timeout exit codes are tolerated by the test but still recorded as not passed.
+      if (result.exitCode === 124 || result.exitCode === 137) {
+        console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`);
+        return;
+      }
+
+      expect(fs.existsSync(outFile)).toBe(true);
+      const captured = fs.readFileSync(outFile, 'utf-8');
+      expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR);
+      expect(captured).toMatch(RECOMMENDATION_RE);
+      // kind-differentiated: no fabricated score
+      expect(captured).not.toMatch(COMPLETENESS_RE);
+      expect(captured).toMatch(KIND_NOTE_RE);
+      formatOk = true;
+    } finally {
+      recordCodexResult('codex-plan-eng-format-kind', result, formatOk && result.exitCode === 0);
+    }
+  }, 360_000);
+});