mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
test: add Codex eval for AskUserQuestion format compliance
Four-case periodic-tier eval mirrors test/skill-e2e-plan-format.test.ts but drives the plan review skills via codex exec instead of claude -p. Context: Codex under the gpt.md "No preamble / Prefer doing over listing" overlay tends to skip the Simplify/ELI10 paragraph and the RECOMMENDATION line on AskUserQuestion calls. Users have to manually re-prompt "ELI10 and don't forget to recommend" almost every time. This test pins the behavior so regressions surface. Cases: - plan-ceo-review mode selection (kind-differentiated) - plan-ceo-review approach menu (coverage-differentiated) - plan-eng-review per-issue coverage decision - plan-eng-review per-issue architectural choice (kind-differentiated) Assertions on captured AskUserQuestion text: - RECOMMENDATION: Choose present (all cases) - Completeness: N/10 present on coverage, absent on kind - "options differ in kind" note present on kind - ELI10 length floor (>400 chars) — catches bare options-only output Cost: ~\$2-4 per full run. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,315 @@
|
||||
/**
|
||||
* AskUserQuestion format regression test for /plan-ceo-review and /plan-eng-review
|
||||
* running under Codex CLI (GPT-5.4).
|
||||
*
|
||||
* Context: GPT-class models under the "No preamble / Prefer doing over listing"
|
||||
* gpt.md overlay tend to skip the Simplify (ELI10) paragraph and the RECOMMENDATION
|
||||
* line on AskUserQuestion calls. The user has to manually re-prompt "ELI10 and don't
|
||||
* forget to recommend" almost every time. This test pins that behavior so future
|
||||
* regressions surface automatically.
|
||||
*
|
||||
* Mirrors test/skill-e2e-plan-format.test.ts (the Claude version) but uses
|
||||
* test/helpers/codex-session-runner.ts to drive `codex exec` instead of `claude -p`.
|
||||
*
|
||||
* Four cases:
|
||||
* 1. plan-ceo-review mode selection (kind-differentiated)
|
||||
* 2. plan-ceo-review approach menu (coverage-differentiated)
|
||||
* 3. plan-eng-review per-issue coverage decision
|
||||
* 4. plan-eng-review per-issue architectural choice (kind-differentiated)
|
||||
*
|
||||
* Assertions on captured AskUserQuestion text:
|
||||
* - RECOMMENDATION: Choose present (all cases)
|
||||
* - Completeness: N/10 present on coverage cases, absent on kind cases
|
||||
* - "options differ in kind" note present on kind cases
|
||||
* - ELI10-style plain-English explanation present (length floor + no raw jargon)
|
||||
*
|
||||
* Periodic tier (Codex non-determinism). Cost: ~$2-3 per full run.
|
||||
*/
|
||||
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||
import { runCodexSkill, installSkillToTempHome } from './helpers/codex-session-runner';
|
||||
import type { CodexResult } from './helpers/codex-session-runner';
|
||||
import { EvalCollector } from './helpers/eval-store';
|
||||
import type { EvalTestEntry } from './helpers/eval-store';
|
||||
import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import { spawnSync } from 'child_process';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
// --- Prerequisites ---
|
||||
|
||||
const CODEX_AVAILABLE = (() => {
|
||||
try {
|
||||
const result = Bun.spawnSync(['which', 'codex']);
|
||||
return result.exitCode === 0;
|
||||
} catch { return false; }
|
||||
})();
|
||||
const evalsEnabled = !!process.env.EVALS;
|
||||
const SKIP = !CODEX_AVAILABLE || !evalsEnabled;
|
||||
const describeCodex = SKIP ? describe.skip : describe;
|
||||
|
||||
// --- Touchfiles ---
|
||||
|
||||
const CODEX_FORMAT_TOUCHFILES: Record<string, string[]> = {
|
||||
'codex-plan-ceo-format-mode': ['.agents/skills/gstack-plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'model-overlays/gpt.md', 'model-overlays/gpt-5.4.md'],
|
||||
'codex-plan-ceo-format-approach': ['.agents/skills/gstack-plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'model-overlays/gpt.md', 'model-overlays/gpt-5.4.md'],
|
||||
'codex-plan-eng-format-coverage': ['.agents/skills/gstack-plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'model-overlays/gpt.md', 'model-overlays/gpt-5.4.md'],
|
||||
'codex-plan-eng-format-kind': ['.agents/skills/gstack-plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'model-overlays/gpt.md', 'model-overlays/gpt-5.4.md'],
|
||||
};
|
||||
|
||||
let selectedTests: string[] | null = null;
|
||||
if (evalsEnabled && !process.env.EVALS_ALL) {
|
||||
const baseBranch = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
|
||||
const changedFiles = getChangedFiles(baseBranch, ROOT);
|
||||
if (changedFiles.length > 0) {
|
||||
const selection = selectTests(changedFiles, CODEX_FORMAT_TOUCHFILES, GLOBAL_TOUCHFILES);
|
||||
selectedTests = selection.selected;
|
||||
}
|
||||
}
|
||||
|
||||
function testIfSelected(name: string, fn: () => Promise<void>, timeout?: number) {
|
||||
if (selectedTests !== null && !selectedTests.includes(name)) {
|
||||
test.skip(name, fn, timeout);
|
||||
} else {
|
||||
test(name, fn, timeout);
|
||||
}
|
||||
}
|
||||
|
||||
// --- Eval collector ---
|
||||
|
||||
let evalCollector: EvalCollector | null = null;
|
||||
if (!SKIP) {
|
||||
evalCollector = new EvalCollector('codex-e2e-plan-format');
|
||||
}
|
||||
|
||||
function recordCodexResult(testName: string, result: CodexResult, passed: boolean) {
|
||||
if (!evalCollector) return;
|
||||
const entry: EvalTestEntry = {
|
||||
test: testName,
|
||||
passed,
|
||||
cost: 0, // Codex cost not tracked here; inferred from tokens
|
||||
tokens: result.tokens,
|
||||
duration: Math.round(result.durationMs / 1000),
|
||||
exitReason: result.exitCode === 0 ? 'success' : `exit_${result.exitCode}`,
|
||||
};
|
||||
evalCollector.record(entry);
|
||||
}
|
||||
|
||||
afterAll(async () => {
|
||||
if (evalCollector) {
|
||||
await evalCollector.finalize();
|
||||
}
|
||||
});
|
||||
|
||||
// --- Fixtures ---
|
||||
|
||||
const SAMPLE_PLAN = `# Plan: Add User Dashboard
|
||||
|
||||
## Context
|
||||
We're building a new user dashboard that shows recent activity, notifications, and quick actions.
|
||||
|
||||
## Changes
|
||||
1. New React component \`UserDashboard\` in \`src/components/\`
|
||||
2. REST API endpoint \`GET /api/dashboard\` returning user stats
|
||||
3. PostgreSQL query for activity aggregation
|
||||
4. Redis cache layer for dashboard data (5min TTL)
|
||||
|
||||
## Architecture
|
||||
- Frontend: React + TailwindCSS
|
||||
- Backend: Express.js REST API
|
||||
- Database: PostgreSQL with existing user/activity tables
|
||||
- Cache: Redis for dashboard aggregates
|
||||
`;
|
||||
|
||||
function setupCodexSkillDir(tmpPrefix: string, skillName: 'plan-ceo-review' | 'plan-eng-review'): { skillDir: string; planDir: string; outFile: string } {
|
||||
const planDir = fs.mkdtempSync(path.join(os.tmpdir(), tmpPrefix));
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
run('git', ['init', '-b', 'main']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
|
||||
fs.writeFileSync(path.join(planDir, 'plan.md'), SAMPLE_PLAN);
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'add plan']);
|
||||
|
||||
// Codex skill lives in .agents/skills/gstack-{name}/ per the gstack host convention.
|
||||
const codexSkillSource = path.join(ROOT, '.agents', 'skills', `gstack-${skillName}`);
|
||||
const skillDir = path.join(planDir, '.agents', 'skills', `gstack-${skillName}`);
|
||||
fs.mkdirSync(skillDir, { recursive: true });
|
||||
fs.cpSync(codexSkillSource, skillDir, { recursive: true });
|
||||
|
||||
const outFile = path.join(planDir, 'ask-capture.md');
|
||||
return { skillDir, planDir, outFile };
|
||||
}
|
||||
|
||||
// Capture instruction — same shape as the Claude version. Codex may ignore tool calls,
|
||||
// so we tell it to write prose to the file directly.
|
||||
function captureInstruction(outFile: string): string {
|
||||
return `Write the verbatim text of every AskUserQuestion you would have presented to the user to the file ${outFile} (one question per session, full text including the re-ground, ELI10 paragraph, RECOMMENDATION line, and options). Do NOT ask the user interactively. Do NOT paraphrase. This is a format-capture test, not an interactive session.`;
|
||||
}
|
||||
|
||||
// --- Regex predicates ---
|
||||
// Match RECOMMENDATION lenient to markdown bolding around it.
|
||||
const RECOMMENDATION_RE = /RECOMMENDATION:[*\s]*Choose/;
|
||||
const COMPLETENESS_RE = /Completeness:\s*\d{1,2}\/10/;
|
||||
const KIND_NOTE_RE = /options differ in kind/i;
|
||||
// ELI10 signal: some plain-English explanation must exist. Weak proxy: >= 200 chars
|
||||
// of narrative prose between the re-ground and the options, AND at least one of the
|
||||
// plain-English hints ("plain English", "16-year-old", or "what this means").
|
||||
// We test for the length floor and absence of a bare options-list-only output.
|
||||
const ELI10_LENGTH_FLOOR = 400; // full AskUserQuestion content should be at least this long
|
||||
|
||||
// --- Tests ---
|
||||
|
||||
describeCodex('Codex Plan Format — CEO Mode Selection', () => {
|
||||
let skillDir: string, planDir: string, outFile: string;
|
||||
|
||||
beforeAll(() => {
|
||||
({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-ceo-mode-', 'plan-ceo-review'));
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testIfSelected('codex-plan-ceo-format-mode', async () => {
|
||||
const result = await runCodexSkill({
|
||||
skillDir,
|
||||
prompt: `Read the plan-ceo-review skill. Read plan.md (the plan to review). Proceed to Step 0F (Mode Selection) where the skill presents 4 mode options (SCOPE EXPANSION, SELECTIVE EXPANSION, HOLD SCOPE, SCOPE REDUCTION) via AskUserQuestion. These options differ in kind (review posture), not coverage. ${captureInstruction(outFile)}`,
|
||||
timeoutMs: 300_000,
|
||||
cwd: planDir,
|
||||
skillName: 'gstack-plan-ceo-review',
|
||||
});
|
||||
|
||||
recordCodexResult('codex-plan-ceo-format-mode', result, result.exitCode === 0);
|
||||
console.log(`codex-plan-ceo-format-mode: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`);
|
||||
|
||||
// Codex may timeout — accept as non-fatal (same pattern as existing codex-e2e tests)
|
||||
if (result.exitCode === 124 || result.exitCode === 137) {
|
||||
console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`);
|
||||
return;
|
||||
}
|
||||
|
||||
expect(fs.existsSync(outFile)).toBe(true);
|
||||
const captured = fs.readFileSync(outFile, 'utf-8');
|
||||
expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR);
|
||||
expect(captured).toMatch(RECOMMENDATION_RE);
|
||||
// kind-differentiated: no fabricated score, must have note
|
||||
expect(captured).not.toMatch(COMPLETENESS_RE);
|
||||
expect(captured).toMatch(KIND_NOTE_RE);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
describeCodex('Codex Plan Format — CEO Approach Menu', () => {
|
||||
let skillDir: string, planDir: string, outFile: string;
|
||||
|
||||
beforeAll(() => {
|
||||
({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-ceo-approach-', 'plan-ceo-review'));
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testIfSelected('codex-plan-ceo-format-approach', async () => {
|
||||
const result = await runCodexSkill({
|
||||
skillDir,
|
||||
prompt: `Read the plan-ceo-review skill. Read plan.md. Proceed to Step 0C-bis (Implementation Alternatives / Approach Menu) where the skill generates 2-3 approaches (minimal viable vs ideal architecture) and presents them via AskUserQuestion. These options differ in coverage so Completeness: N/10 applies. ${captureInstruction(outFile)}`,
|
||||
timeoutMs: 300_000,
|
||||
cwd: planDir,
|
||||
skillName: 'gstack-plan-ceo-review',
|
||||
});
|
||||
|
||||
recordCodexResult('codex-plan-ceo-format-approach', result, result.exitCode === 0);
|
||||
console.log(`codex-plan-ceo-format-approach: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`);
|
||||
|
||||
if (result.exitCode === 124 || result.exitCode === 137) {
|
||||
console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`);
|
||||
return;
|
||||
}
|
||||
|
||||
expect(fs.existsSync(outFile)).toBe(true);
|
||||
const captured = fs.readFileSync(outFile, 'utf-8');
|
||||
expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR);
|
||||
expect(captured).toMatch(RECOMMENDATION_RE);
|
||||
expect(captured).toMatch(COMPLETENESS_RE);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
describeCodex('Codex Plan Format — Eng Coverage Issue', () => {
|
||||
let skillDir: string, planDir: string, outFile: string;
|
||||
|
||||
beforeAll(() => {
|
||||
({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-eng-cov-', 'plan-eng-review'));
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testIfSelected('codex-plan-eng-format-coverage', async () => {
|
||||
const result = await runCodexSkill({
|
||||
skillDir,
|
||||
prompt: `Read the plan-eng-review skill. Read plan.md. In your Section 3 Test Review, generate ONE AskUserQuestion about test coverage depth where options are clearly coverage-differentiated: A) full coverage incl. edge + error paths (Completeness 10/10), B) happy path only (7/10), C) smoke test (3/10). ${captureInstruction(outFile)}`,
|
||||
timeoutMs: 300_000,
|
||||
cwd: planDir,
|
||||
skillName: 'gstack-plan-eng-review',
|
||||
});
|
||||
|
||||
recordCodexResult('codex-plan-eng-format-coverage', result, result.exitCode === 0);
|
||||
console.log(`codex-plan-eng-format-coverage: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`);
|
||||
|
||||
if (result.exitCode === 124 || result.exitCode === 137) {
|
||||
console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`);
|
||||
return;
|
||||
}
|
||||
|
||||
expect(fs.existsSync(outFile)).toBe(true);
|
||||
const captured = fs.readFileSync(outFile, 'utf-8');
|
||||
expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR);
|
||||
expect(captured).toMatch(RECOMMENDATION_RE);
|
||||
expect(captured).toMatch(COMPLETENESS_RE);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
describeCodex('Codex Plan Format — Eng Kind Issue', () => {
|
||||
let skillDir: string, planDir: string, outFile: string;
|
||||
|
||||
beforeAll(() => {
|
||||
({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-eng-kind-', 'plan-eng-review'));
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testIfSelected('codex-plan-eng-format-kind', async () => {
|
||||
const result = await runCodexSkill({
|
||||
skillDir,
|
||||
prompt: `Read the plan-eng-review skill. Read plan.md. In your Section 1 Architecture review, generate ONE AskUserQuestion about an architectural choice where the options differ in kind (e.g. Redis vs Postgres materialized view vs in-process cache — different kinds of systems with different tradeoffs, NOT more-or-less-complete versions of the same thing). ${captureInstruction(outFile)}`,
|
||||
timeoutMs: 300_000,
|
||||
cwd: planDir,
|
||||
skillName: 'gstack-plan-eng-review',
|
||||
});
|
||||
|
||||
recordCodexResult('codex-plan-eng-format-kind', result, result.exitCode === 0);
|
||||
console.log(`codex-plan-eng-format-kind: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`);
|
||||
|
||||
if (result.exitCode === 124 || result.exitCode === 137) {
|
||||
console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`);
|
||||
return;
|
||||
}
|
||||
|
||||
expect(fs.existsSync(outFile)).toBe(true);
|
||||
const captured = fs.readFileSync(outFile, 'utf-8');
|
||||
expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR);
|
||||
expect(captured).toMatch(RECOMMENDATION_RE);
|
||||
// kind-differentiated: no fabricated score
|
||||
expect(captured).not.toMatch(COMPLETENESS_RE);
|
||||
expect(captured).toMatch(KIND_NOTE_RE);
|
||||
}, 360_000);
|
||||
});
|
||||
Reference in New Issue
Block a user