mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 15:20:11 +02:00
test(codex): live recommendation-substance grade for /codex
Closes the gap where /codex's synthesis recommendation was only checked statically (template grep) and via fixtures. Drives the real /codex skill over a flawed diff and grades the emitted "Recommendation: ... because ..." line with judgeRecommendation (present/commits/has_because/substance>=4). The named weak spot holds up: substance 5. Periodic tier. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,103 @@
|
||||
/**
|
||||
* /codex recommendation substance — LIVE grade (periodic, paid, Codex CLI).
|
||||
*
|
||||
* The gap this closes: skill-cross-model-recommendation-emit.test.ts only checks
|
||||
* the /codex TEMPLATE contains the "Recommendation: <action> because <reason>"
|
||||
* instruction (static grep). llm-judge-recommendation.test.ts grades the rubric
|
||||
* against FIXTURES. Nothing runs /codex live and grades the recommendation it
|
||||
* actually emits. The user reports codex recommendations were the least
|
||||
* consistent surface on main — so this is the one that needs live coverage.
|
||||
*
|
||||
* Method: drive the real /codex skill via codex exec (isolated temp HOME) over a
|
||||
* small, deliberately-flawed fixture diff. Capture codex's output, extract its
|
||||
* synthesis "Recommendation: ... because ..." line, and grade it with the same
|
||||
* judgeRecommendation() rubric used everywhere else:
|
||||
* - present : a Recommendation line exists
|
||||
* - commits : names exactly one action (no hedging)
|
||||
* - has_because : a because-clause follows
|
||||
* - substance>=4: the reason is option-specific / names a concrete tradeoff,
|
||||
* not boilerplate ("because it's better")
|
||||
*
|
||||
* Periodic tier (Codex non-determinism, ~$2-3/run).
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as path from 'node:path';
|
||||
import { runCodexSkill } from './helpers/codex-session-runner';
|
||||
import { judgeRecommendation } from './helpers/llm-judge';
|
||||
|
||||
// Directory one level above this file's directory (as reported by
// import.meta.dir). The codex skill directory is resolved relative to it
// further down. Presumably this is the repository root — confirm.
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
/**
 * True when a `codex` executable can be located on PATH.
 *
 * Uses Bun's built-in PATH lookup rather than spawning the external `which`
 * binary, so the probe needs no subprocess and does not depend on `which`
 * itself being installed. Evaluated once, at module load.
 */
const CODEX_AVAILABLE = (() => {
  try {
    // Bun.which yields the resolved path, or a nullish value when not found.
    return Boolean(Bun.which('codex'));
  } catch {
    // Best-effort probe: any failure counts as "not available" so that merely
    // importing this file never throws; the suite is skipped instead.
    return false;
  }
})();
|
||||
// Opt-in gate for this paid, live suite. All three conditions must hold:
// the codex binary was found, EVALS is set to a non-empty value, and
// EVALS_TIER is exactly 'periodic'.
const shouldRun = [
  CODEX_AVAILABLE,
  Boolean(process.env.EVALS),
  process.env.EVALS_TIER === 'periodic',
].every(Boolean);

// Registers the suite normally when the gate is open, as skipped otherwise.
const describeCodex = shouldRun ? describe : describe.skip;
|
||||
|
||||
// A small fixture with two real, comparable problems so a good recommendation
// must CHOOSE (and justify the choice against the alternative) — the exact
// shape judgeRecommendation scores >= 4.
//
// Layout: a short review instruction that restates the required
// "Recommendation: <action> because <reason>" format, one blank line, then a
// unified-diff snippet for server/auth.ts. The changed line swaps a
// parameterized query for string concatenation of req.body.name; the
// surrounding context sets the session cookie with no HttpOnly/Secure/expiry.
//
// The literal must contain only the prompt text: anything placed between
// these lines is sent to codex verbatim as part of the prompt.
const FIXTURE_DIFF = `
Review this change. It has more than one issue; finish with a single synthesis
recommendation line in your skill's required format: "Recommendation: <action>
because <one-line reason that names the most important finding and why it beats
the alternative>."

--- a/server/auth.ts
+++ b/server/auth.ts
@@
export function login(req, res) {
- const user = db.query("SELECT * FROM users WHERE name = ?", [req.body.name]);
+ const user = db.query("SELECT * FROM users WHERE name = '" + req.body.name + "'");
if (user && user.password === req.body.password) {
res.cookie('session', user.id); // no HttpOnly, no Secure, no expiry
return res.json({ ok: true });
}
return res.status(401).json({ ok: false });
}
`;
|
||||
|
||||
describeCodex('/codex recommendation substance (live, periodic)', () => {
  // bun:test budget for the whole test; leaves headroom above the 300_000 ms
  // handed to the codex run below.
  const TEST_TIMEOUT_MS = 360_000;

  /**
   * Runs the /codex skill over FIXTURE_DIFF, grades the captured output with
   * judgeRecommendation, logs the score, and enforces the rubric:
   * present, has_because, commits, and reason_substance not below 4.
   */
  async function gradeLiveRecommendation() {
    const { output } = await runCodexSkill({
      skillDir: path.join(ROOT, 'codex'),
      skillName: 'codex',
      prompt: FIXTURE_DIFF,
      timeoutMs: 300_000,
    });

    // A "SKIP:" prefix means the codex binary was missing. The describe-level
    // gate should already rule that out; this is a second safety net.
    if (output.startsWith('SKIP:')) {
      return;
    }

    const score = await judgeRecommendation(output);

    const summary = [
      `[codex-rec] present=${score.present} commits=${score.commits} has_because=${score.has_because} substance=${score.reason_substance}`,
      ` reason: ${score.reason_text}`,
    ].join('\n');
    // eslint-disable-next-line no-console
    console.log(summary);

    expect(score.present).toBe(true);
    expect(score.has_because).toBe(true);
    expect(score.commits).toBe(true);

    // The named weak spot: substance must clear the boilerplate bar. Thrown by
    // hand (instead of expect) so the failure carries the judge's reasoning
    // and the tail of the codex output.
    const isWeak = score.reason_substance < 4;
    if (isWeak) {
      const details = [
        `codex recommendation substance ${score.reason_substance} < 4 (boilerplate/weak):`,
        ` reason: ${score.reason_text}`,
        ` judge: ${score.reasoning}`,
        `--- codex output (last 2KB) ---\n${output.slice(-2000)}`,
      ].join('\n');
      throw new Error(details);
    }
  }

  test(
    'codex emits a committed, substance>=4 synthesis recommendation',
    gradeLiveRecommendation,
    TEST_TIMEOUT_MS,
  );
});
|
||||
Reference in New Issue
Block a user