test(codex): live recommendation-substance grade for /codex

Closes the gap where /codex's synthesis recommendation was only checked statically (template grep) and via fixtures. Drives the real /codex skill over a flawed diff and grades the emitted "Recommendation: ... because ..." line with judgeRecommendation (present/commits/has_because/substance>=4). The named weak spot holds up: substance 5. Periodic tier. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-08-03 04:48:42 +02:00 · 2026-06-01 22:17:37 -07:00
parent 63ea6bb23c
commit 9e53ec85b0
1 changed files with 103 additions and 0 deletions
@@ -0,0 +1,103 @@
+/**
+ * /codex recommendation substance — LIVE grade (periodic, paid, Codex CLI).
+ *
+ * The gap this closes: skill-cross-model-recommendation-emit.test.ts only checks
+ * the /codex TEMPLATE contains the "Recommendation: <action> because <reason>"
+ * instruction (static grep). llm-judge-recommendation.test.ts grades the rubric
+ * against FIXTURES. Nothing runs /codex live and grades the recommendation it
+ * actually emits. The user reports codex recommendations were the least
+ * consistent surface on main — so this is the one that needs live coverage.
+ *
+ * Method: drive the real /codex skill via codex exec (isolated temp HOME) over a
+ * small, deliberately-flawed fixture diff. Capture codex's output, extract its
+ * synthesis "Recommendation: ... because ..." line, and grade it with the same
+ * judgeRecommendation() rubric used everywhere else:
+ *   - present     : a Recommendation line exists
+ *   - commits     : names exactly one action (no hedging)
+ *   - has_because : a because-clause follows
+ *   - substance>=4: the reason is option-specific / names a concrete tradeoff,
+ *                   not boilerplate ("because it's better")
+ *
+ * Periodic tier (Codex non-determinism, ~$2-3/run).
+ */
+import { describe, test, expect } from 'bun:test';
+import * as path from 'node:path';
+import { runCodexSkill } from './helpers/codex-session-runner';
+import { judgeRecommendation } from './helpers/llm-judge';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+
+const CODEX_AVAILABLE = (() => {
+  try {
+    return Bun.spawnSync(['which', 'codex']).exitCode === 0;
+  } catch {
+    return false;
+  }
+})();
+const shouldRun =
+  CODEX_AVAILABLE && !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
+const describeCodex = shouldRun ? describe : describe.skip;
+
+// A small fixture with two real, comparable problems so a good recommendation
+// must CHOOSE (and justify the choice against the alternative) — the exact
+// shape judgeRecommendation scores >= 4.
+const FIXTURE_DIFF = `
+Review this change. It has more than one issue; finish with a single synthesis
+recommendation line in your skill's required format: "Recommendation: <action>
+because <one-line reason that names the most important finding and why it beats
+the alternative>."
+
+--- a/server/auth.ts
+++ b/server/auth.ts
+@@
+ export function login(req, res) {
+-  const user = db.query("SELECT * FROM users WHERE name = ?", [req.body.name]);
+  const user = db.query("SELECT * FROM users WHERE name = '" + req.body.name + "'");
+   if (user && user.password === req.body.password) {
+     res.cookie('session', user.id);  // no HttpOnly, no Secure, no expiry
+     return res.json({ ok: true });
+   }
+   return res.status(401).json({ ok: false });
+ }
+`;
+
+describeCodex('/codex recommendation substance (live, periodic)', () => {
+  test(
+    'codex emits a committed, substance>=4 synthesis recommendation',
+    async () => {
+      const result = await runCodexSkill({
+        skillDir: path.join(ROOT, 'codex'),
+        skillName: 'codex',
+        prompt: FIXTURE_DIFF,
+        timeoutMs: 300_000,
+      });
+
+      if (result.output.startsWith('SKIP:')) {
+        // codex binary missing — describeCodex already guards, but double-safe.
+        return;
+      }
+
+      const score = await judgeRecommendation(result.output);
+      // eslint-disable-next-line no-console
+      console.log(
+        `[codex-rec] present=${score.present} commits=${score.commits} ` +
+          `has_because=${score.has_because} substance=${score.reason_substance}\n` +
+          `  reason: ${score.reason_text}`,
+      );
+
+      expect(score.present).toBe(true);
+      expect(score.has_because).toBe(true);
+      expect(score.commits).toBe(true);
+      // The named weak spot: substance must clear the boilerplate bar.
+      if (score.reason_substance < 4) {
+        throw new Error(
+          `codex recommendation substance ${score.reason_substance} < 4 (boilerplate/weak):\n` +
+            `  reason: ${score.reason_text}\n` +
+            `  judge: ${score.reasoning}\n` +
+            `--- codex output (last 2KB) ---\n${result.output.slice(-2000)}`,
+        );
+      }
+    },
+    360_000,
+  );
+});