mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-06 21:46:40 +02:00
feat(codex+review): require synthesis Recommendation in cross-model skills
Extends the v1.25.1.0 AskUserQuestion recommendation-quality coverage to the
cross-model synthesis surfaces that were previously emitting prose without a
structured recommendation:
- /codex review (Step 2A) — after presenting Codex output + GATE verdict,
must emit `Recommendation: <action> because <reason>` line. Reason must
compare against alternatives (other findings, fix-vs-ship, fix-order).
- /codex challenge (Step 2B) — same requirement after adversarial output.
- /codex consult (Step 2C) — same requirement after consult presentation,
with examples for plan-review consults that engage with specific Codex
insights.
- Claude adversarial subagent (scripts/resolvers/review.ts:446, used by
/ship Step 11 + standalone /review) — subagent prompt now ends with
"After listing findings, end your output with ONE line in the canonical
format Recommendation: <action> because <reason>". Codex adversarial
command (line 461) gets the same final-line requirement.
The same `judgeRecommendation` helper grades both AskUserQuestion and
cross-model synthesis — one rubric, two surfaces. Substance-5 cross-model
recommendations explicitly compare against alternatives (a different
finding, fix-vs-ship, fix-order). Generic synthesis ("because adversarial
review found things") fails at threshold ≥ 4.
Tests:
- test/llm-judge-recommendation.test.ts gains 5 cross-model fixtures (3
substance ≥ 4, 2 substance < 4). Existing rubric correctly grades them.
- test/skill-cross-model-recommendation-emit.test.ts (new, free-tier) —
static guard greps codex/SKILL.md.tmpl + scripts/resolvers/review.ts for
the canonical emit instruction. Trips before any paid eval if the
templates drift.
Touchfile: extended `llm-judge-recommendation` entry with codex/SKILL.md.tmpl
and scripts/resolvers/review.ts so synthesis-template edits invalidate the
fixture re-run.
Verified: free `bun test` exits 0 (5/5 static emit-guard tests pass), paid
fixture passes 45/45 expect calls in 24s with the cross-model substance-5
fixtures correctly judged at >= 4.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -106,7 +106,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'autoplan-auto-mode': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'office-hours-auto-mode': ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'office-hours-phase4-fork': ['office-hours/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/question-tuning.ts', 'test/helpers/llm-judge.ts', 'test/skill-e2e-office-hours-phase4.test.ts'],
|
||||
'llm-judge-recommendation': ['test/helpers/llm-judge.ts', 'test/llm-judge-recommendation.test.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts'],
|
||||
'llm-judge-recommendation': ['test/helpers/llm-judge.ts', 'test/llm-judge-recommendation.test.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'codex/SKILL.md.tmpl', 'scripts/resolvers/review.ts'],
|
||||
// v1.21+ AUTO_DECIDE preserve eval (periodic). Verifies the Tool resolution
|
||||
// fix doesn't trip the legitimate /plan-tune opt-in path: when the user has
|
||||
// written a never-ask preference, AUQ should still auto-decide rather than
|
||||
|
||||
@@ -104,6 +104,63 @@ Net: ...`);
|
||||
expect(noRec.has_because).toBe(false);
|
||||
expect(noRec.reason_substance).toBe(1);
|
||||
|
||||
// CROSS-MODEL synthesis recommendations: when /codex or the Claude
|
||||
// adversarial subagent emit a synthesis Recommendation line, it follows
|
||||
// the same canonical shape and is graded by the same rubric. These
|
||||
// fixtures pin the v1.25.1.0+ cross-model-skill emit format documented
|
||||
// in codex/SKILL.md.tmpl Steps 2A/2B/2C and scripts/resolvers/review.ts.
|
||||
// Substance-5 cross-model fixtures explicitly compare against an
|
||||
// alternative (a different finding, a different recommended action, or
|
||||
// no-fix vs fix). The same rubric the AskUserQuestion judge uses applies:
|
||||
// strong reasons name a tradeoff distinguishing the chosen action from
|
||||
// at least one alternative. Cross-model synthesis has implicit
|
||||
// alternatives — different findings, different fix orders, ship-vs-fix —
|
||||
// so the same shape applies.
|
||||
const crossModelCases = [
|
||||
[
|
||||
'codex-review good',
|
||||
'Recommendation: Fix the SQL injection at users_controller.rb:42 first because its auth-bypass blast radius is higher than the LFI Codex also flagged, and the parameterized-query fix is three lines vs the LFI session-handling rewrite.',
|
||||
true, // expect substance >= 4
|
||||
],
|
||||
[
|
||||
'adversarial good',
|
||||
'Recommendation: Fix the unbounded retry loop at queue.ts:78 because it DoSes the worker pool under sustained 429s, which is higher-blast-radius than the timing leak Codex also flagged that only touches a debug endpoint.',
|
||||
true,
|
||||
],
|
||||
[
|
||||
'consult good',
|
||||
'Recommendation: Adopt the sharding approach Codex suggested because it eliminates the head-of-line blocking the current writer-pool has, while the cache-layer alternative Codex also floated still has a single-writer hot path.',
|
||||
true,
|
||||
],
|
||||
// SUBSTANCE ~1-2: boilerplate cross-model synthesis.
|
||||
[
|
||||
'cross-model boilerplate',
|
||||
'Recommendation: Look at the findings because adversarial review found things.',
|
||||
false, // expect substance < 4
|
||||
],
|
||||
[
|
||||
'cross-model generic',
|
||||
'Recommendation: Ship as-is because the diff is fine.',
|
||||
false,
|
||||
],
|
||||
] as Array<[string, string, boolean]>;
|
||||
for (const [label, text, shouldPass] of crossModelCases) {
|
||||
const score = await judgeRecommendation(text);
|
||||
expect(score.present, `[cross-model:${label}] present should be true`).toBe(true);
|
||||
expect(score.has_because, `[cross-model:${label}] has_because should be true`).toBe(true);
|
||||
if (shouldPass) {
|
||||
expect(
|
||||
score.reason_substance,
|
||||
`[cross-model:${label}] expected substance >=4; got ${score.reason_substance}: ${score.reasoning}`,
|
||||
).toBeGreaterThanOrEqual(4);
|
||||
} else {
|
||||
expect(
|
||||
score.reason_substance,
|
||||
`[cross-model:${label}] expected substance <4; got ${score.reason_substance}: ${score.reasoning}`,
|
||||
).toBeLessThan(4);
|
||||
}
|
||||
}
|
||||
|
||||
// HEDGING: each alternate in the hedging regex is exercised separately.
|
||||
// Most are no-because forms that short-circuit the LLM call entirely (the
|
||||
// judge skips Haiku when has_because is false). The "either B or C
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
/**
|
||||
* Static guard for cross-model synthesis recommendation emit instructions.
|
||||
*
|
||||
* v1.25.1.0+ extended the AskUserQuestion recommendation-quality coverage
|
||||
* to cross-model skills (/codex review/challenge/consult, the Claude
|
||||
* adversarial subagent, and the Codex adversarial pass). Each surface MUST
|
||||
* tell the model to end its synthesis with a canonical
|
||||
* `Recommendation: <action> because <reason>`
|
||||
* line so judgeRecommendation can grade it (see test/llm-judge-recommendation
|
||||
* for the rubric exercise).
|
||||
*
|
||||
* Free, deterministic, single-purpose: if any contributor edits these
|
||||
* templates and removes the emit instruction, this test trips before the
|
||||
* change reaches a paid eval. The runtime grading still happens via
|
||||
* judgeRecommendation when the skills run for real; this test just pins the
|
||||
* source of truth.
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
// Repo root. This test file lives in test/, so one directory up from
// import.meta.dir (Bun's directory of the current module) is the project root.
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
describe('cross-model synthesis emit instructions', () => {
|
||||
test('codex/SKILL.md.tmpl Step 2A (review) requires a synthesis Recommendation', () => {
|
||||
const tmpl = fs.readFileSync(path.join(ROOT, 'codex', 'SKILL.md.tmpl'), 'utf-8');
|
||||
const step2a = sliceBetween(tmpl, '## Step 2A:', '## Step 2B:');
|
||||
expect(step2a, 'Step 2A section not found in codex template').not.toBe('');
|
||||
expect(step2a).toMatch(/Synthesis recommendation \(REQUIRED\)/);
|
||||
expect(step2a).toMatch(/Recommendation:\s*<action>\s*because/);
|
||||
});
|
||||
|
||||
test('codex/SKILL.md.tmpl Step 2B (challenge) requires a synthesis Recommendation', () => {
|
||||
const tmpl = fs.readFileSync(path.join(ROOT, 'codex', 'SKILL.md.tmpl'), 'utf-8');
|
||||
const step2b = sliceBetween(tmpl, '## Step 2B:', '## Step 2C:');
|
||||
expect(step2b, 'Step 2B section not found in codex template').not.toBe('');
|
||||
expect(step2b).toMatch(/Synthesis recommendation \(REQUIRED\)/);
|
||||
expect(step2b).toMatch(/Recommendation:\s*<action>\s*because/);
|
||||
});
|
||||
|
||||
test('codex/SKILL.md.tmpl Step 2C (consult) requires a synthesis Recommendation', () => {
|
||||
const tmpl = fs.readFileSync(path.join(ROOT, 'codex', 'SKILL.md.tmpl'), 'utf-8');
|
||||
const step2c = sliceBetween(tmpl, '## Step 2C:', '## Model & Reasoning');
|
||||
expect(step2c, 'Step 2C section not found in codex template').not.toBe('');
|
||||
expect(step2c).toMatch(/Synthesis recommendation \(REQUIRED\)/);
|
||||
expect(step2c).toMatch(/Recommendation:\s*<action>\s*because/);
|
||||
});
|
||||
|
||||
test('scripts/resolvers/review.ts Claude adversarial subagent prompt requires Recommendation', () => {
|
||||
const resolver = fs.readFileSync(path.join(ROOT, 'scripts', 'resolvers', 'review.ts'), 'utf-8');
|
||||
// The Claude subagent prompt must instruct the model to emit a final
|
||||
// canonical Recommendation line.
|
||||
expect(resolver).toMatch(/Claude adversarial subagent[\s\S]+?Recommendation:\s*<action>\s*because/);
|
||||
});
|
||||
|
||||
test('scripts/resolvers/review.ts Codex adversarial command requires Recommendation', () => {
|
||||
const resolver = fs.readFileSync(path.join(ROOT, 'scripts', 'resolvers', 'review.ts'), 'utf-8');
|
||||
// The codex exec command's prompt string must include the emit
|
||||
// instruction. Match within the codex adversarial section.
|
||||
expect(resolver).toMatch(/Codex adversarial challenge[\s\S]+?Recommendation:\s*<action>\s*because/);
|
||||
});
|
||||
});
|
||||
|
||||
function sliceBetween(text: string, startMarker: string, endMarker: string): string {
|
||||
const start = text.indexOf(startMarker);
|
||||
if (start < 0) return '';
|
||||
const end = text.indexOf(endMarker, start + startMarker.length);
|
||||
return end > start ? text.slice(start, end) : text.slice(start);
|
||||
}
|
||||
Reference in New Issue
Block a user