diff --git a/test/codex-e2e-recommendation-substance.test.ts b/test/codex-e2e-recommendation-substance.test.ts new file mode 100644 index 000000000..8d33c0311 --- /dev/null +++ b/test/codex-e2e-recommendation-substance.test.ts @@ -0,0 +1,103 @@ +/** + * /codex recommendation substance — LIVE grade (periodic, paid, Codex CLI). + * + * The gap this closes: skill-cross-model-recommendation-emit.test.ts only checks + * the /codex TEMPLATE contains the "Recommendation: because " + * instruction (static grep). llm-judge-recommendation.test.ts grades the rubric + * against FIXTURES. Nothing runs /codex live and grades the recommendation it + * actually emits. The user reports codex recommendations were the least + * consistent surface on main — so this is the one that needs live coverage. + * + * Method: drive the real /codex skill via codex exec (isolated temp HOME) over a + * small, deliberately-flawed fixture diff. Capture codex's output, extract its + * synthesis "Recommendation: ... because ..." line, and grade it with the same + * judgeRecommendation() rubric used everywhere else: + * - present : a Recommendation line exists + * - commits : names exactly one action (no hedging) + * - has_because : a because-clause follows + * - substance>=4: the reason is option-specific / names a concrete tradeoff, + * not boilerplate ("because it's better") + * + * Periodic tier (Codex non-determinism, ~$2-3/run). + */ +import { describe, test, expect } from 'bun:test'; +import * as path from 'node:path'; +import { runCodexSkill } from './helpers/codex-session-runner'; +import { judgeRecommendation } from './helpers/llm-judge'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +const CODEX_AVAILABLE = (() => { + try { + return Bun.spawnSync(['which', 'codex']).exitCode === 0; + } catch { + return false; + } +})(); +const shouldRun = + CODEX_AVAILABLE && !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; +const describeCodex = shouldRun ? describe : describe.skip; + +// A small fixture with two real, comparable problems so a good recommendation +// must CHOOSE (and justify the choice against the alternative) — the exact +// shape judgeRecommendation scores >= 4. +const FIXTURE_DIFF = ` +Review this change. It has more than one issue; finish with a single synthesis +recommendation line in your skill's required format: "Recommendation: +because ." + +--- a/server/auth.ts ++++ b/server/auth.ts +@@ + export function login(req, res) { +- const user = db.query("SELECT * FROM users WHERE name = ?", [req.body.name]); ++ const user = db.query("SELECT * FROM users WHERE name = '" + req.body.name + "'"); + if (user && user.password === req.body.password) { + res.cookie('session', user.id); // no HttpOnly, no Secure, no expiry + return res.json({ ok: true }); + } + return res.status(401).json({ ok: false }); + } +`; + +describeCodex('/codex recommendation substance (live, periodic)', () => { + test( + 'codex emits a committed, substance>=4 synthesis recommendation', + async () => { + const result = await runCodexSkill({ + skillDir: path.join(ROOT, 'codex'), + skillName: 'codex', + prompt: FIXTURE_DIFF, + timeoutMs: 300_000, + }); + + if (result.output.startsWith('SKIP:')) { + // codex binary missing — describeCodex already guards, but double-safe. + return; + } + + const score = await judgeRecommendation(result.output); + // eslint-disable-next-line no-console + console.log( + `[codex-rec] present=${score.present} commits=${score.commits} ` + + `has_because=${score.has_because} substance=${score.reason_substance}\n` + + ` reason: ${score.reason_text}`, + ); + + expect(score.present).toBe(true); + expect(score.has_because).toBe(true); + expect(score.commits).toBe(true); + // The named weak spot: substance must clear the boilerplate bar. + if (score.reason_substance < 4) { + throw new Error( + `codex recommendation substance ${score.reason_substance} < 4 (boilerplate/weak):\n` + + ` reason: ${score.reason_text}\n` + + ` judge: ${score.reasoning}\n` + + `--- codex output (last 2KB) ---\n${result.output.slice(-2000)}`, + ); + } + }, + 360_000, + ); +});