mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-06 21:46:40 +02:00
feat(codex+review): require synthesis Recommendation in cross-model skills
Extends the v1.25.1.0 AskUserQuestion recommendation-quality coverage to the
cross-model synthesis surfaces that were previously emitting prose without a
structured recommendation:
- /codex review (Step 2A) — after presenting Codex output + GATE verdict,
must emit `Recommendation: <action> because <reason>` line. Reason must
compare against alternatives (other findings, fix-vs-ship, fix-order).
- /codex challenge (Step 2B) — same requirement after adversarial output.
- /codex consult (Step 2C) — same requirement after consult presentation,
with examples for plan-review consults that engage with specific Codex
insights.
- Claude adversarial subagent (scripts/resolvers/review.ts:446, used by
/ship Step 11 + standalone /review) — subagent prompt now ends with
"After listing findings, end your output with ONE line in the canonical
format Recommendation: <action> because <reason>". Codex adversarial
command (line 461) gets the same final-line requirement.
The same `judgeRecommendation` helper grades both AskUserQuestion and
cross-model synthesis — one rubric, two surfaces. Substance-5 cross-model
recommendations explicitly compare against alternatives (a different
finding, fix-vs-ship, fix-order). Generic synthesis ("because adversarial
review found things") fails at threshold ≥ 4.
Tests:
- test/llm-judge-recommendation.test.ts gains 5 cross-model fixtures (3
substance ≥ 4, 2 substance < 4). Existing rubric correctly grades them.
- test/skill-cross-model-recommendation-emit.test.ts (new, free-tier) —
static guard greps codex/SKILL.md.tmpl + scripts/resolvers/review.ts for
the canonical emit instruction. Trips before any paid eval if the
templates drift.
Touchfile: extended `llm-judge-recommendation` entry with codex/SKILL.md.tmpl
and scripts/resolvers/review.ts so synthesis-template edits invalidate the
fixture re-run.
Verified: free `bun test` exits 0 (5/5 static emit-guard tests pass), paid
fixture passes 45/45 expect calls in 24s with the cross-model substance-5
fixtures correctly judged at >= 4.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -106,7 +106,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'autoplan-auto-mode': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'office-hours-auto-mode': ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'office-hours-phase4-fork': ['office-hours/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/question-tuning.ts', 'test/helpers/llm-judge.ts', 'test/skill-e2e-office-hours-phase4.test.ts'],
|
||||
'llm-judge-recommendation': ['test/helpers/llm-judge.ts', 'test/llm-judge-recommendation.test.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts'],
|
||||
'llm-judge-recommendation': ['test/helpers/llm-judge.ts', 'test/llm-judge-recommendation.test.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'codex/SKILL.md.tmpl', 'scripts/resolvers/review.ts'],
|
||||
// v1.21+ AUTO_DECIDE preserve eval (periodic). Verifies the Tool resolution
|
||||
// fix doesn't trip the legitimate /plan-tune opt-in path: when the user has
|
||||
// written a never-ask preference, AUQ should still auto-decide rather than
|
||||
|
||||
@@ -104,6 +104,63 @@ Net: ...`);
|
||||
expect(noRec.has_because).toBe(false);
|
||||
expect(noRec.reason_substance).toBe(1);
|
||||
|
||||
// CROSS-MODEL synthesis recommendations: when /codex or the Claude
|
||||
// adversarial subagent emit a synthesis Recommendation line, it follows
|
||||
// the same canonical shape and is graded by the same rubric. These
|
||||
// fixtures pin the v1.25.1.0+ cross-model-skill emit format documented
|
||||
// in codex/SKILL.md.tmpl Steps 2A/2B/2C and scripts/resolvers/review.ts.
|
||||
// Substance-5 cross-model fixtures explicitly compare against an
|
||||
// alternative (a different finding, a different recommended action, or
|
||||
// no-fix vs fix). The same rubric the AskUserQuestion judge uses applies:
|
||||
// strong reasons name a tradeoff distinguishing the chosen action from
|
||||
// at least one alternative. Cross-model synthesis has implicit
|
||||
// alternatives — different findings, different fix orders, ship-vs-fix —
|
||||
// so the same shape applies.
|
||||
const crossModelCases = [
|
||||
[
|
||||
'codex-review good',
|
||||
'Recommendation: Fix the SQL injection at users_controller.rb:42 first because its auth-bypass blast radius is higher than the LFI Codex also flagged, and the parameterized-query fix is three lines vs the LFI session-handling rewrite.',
|
||||
true, // expect substance >= 4
|
||||
],
|
||||
[
|
||||
'adversarial good',
|
||||
'Recommendation: Fix the unbounded retry loop at queue.ts:78 because it DoSes the worker pool under sustained 429s, which is higher-blast-radius than the timing leak Codex also flagged that only touches a debug endpoint.',
|
||||
true,
|
||||
],
|
||||
[
|
||||
'consult good',
|
||||
'Recommendation: Adopt the sharding approach Codex suggested because it eliminates the head-of-line blocking the current writer-pool has, while the cache-layer alternative Codex also floated still has a single-writer hot path.',
|
||||
true,
|
||||
],
|
||||
// SUBSTANCE ~1-2: boilerplate cross-model synthesis.
|
||||
[
|
||||
'cross-model boilerplate',
|
||||
'Recommendation: Look at the findings because adversarial review found things.',
|
||||
false, // expect substance < 4
|
||||
],
|
||||
[
|
||||
'cross-model generic',
|
||||
'Recommendation: Ship as-is because the diff is fine.',
|
||||
false,
|
||||
],
|
||||
] as Array<[string, string, boolean]>;
|
||||
for (const [label, text, shouldPass] of crossModelCases) {
|
||||
const score = await judgeRecommendation(text);
|
||||
expect(score.present, `[cross-model:${label}] present should be true`).toBe(true);
|
||||
expect(score.has_because, `[cross-model:${label}] has_because should be true`).toBe(true);
|
||||
if (shouldPass) {
|
||||
expect(
|
||||
score.reason_substance,
|
||||
`[cross-model:${label}] expected substance >=4; got ${score.reason_substance}: ${score.reasoning}`,
|
||||
).toBeGreaterThanOrEqual(4);
|
||||
} else {
|
||||
expect(
|
||||
score.reason_substance,
|
||||
`[cross-model:${label}] expected substance <4; got ${score.reason_substance}: ${score.reasoning}`,
|
||||
).toBeLessThan(4);
|
||||
}
|
||||
}
|
||||
|
||||
// HEDGING: each alternate in the hedging regex is exercised separately.
|
||||
// Most are no-because forms that short-circuit the LLM call entirely (the
|
||||
// judge skips Haiku when has_because is false). The "either B or C
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
/**
|
||||
* Static guard for cross-model synthesis recommendation emit instructions.
|
||||
*
|
||||
* v1.25.1.0+ extended the AskUserQuestion recommendation-quality coverage
|
||||
* to cross-model skills (/codex review/challenge/consult, the Claude
|
||||
* adversarial subagent, and the Codex adversarial pass). Each surface MUST
|
||||
* tell the model to end its synthesis with a canonical
|
||||
* `Recommendation: <action> because <reason>`
|
||||
* line so judgeRecommendation can grade it (see test/llm-judge-recommendation
|
||||
* for the rubric exercise).
|
||||
*
|
||||
* Free, deterministic, single-purpose: if any contributor edits these
|
||||
* templates and removes the emit instruction, this test trips before the
|
||||
* change reaches a paid eval. The runtime grading still happens via
|
||||
* judgeRecommendation when the skills run for real; this test just pins the
|
||||
* source of truth.
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
// Repo root. This test file lives in test/, so one directory up from
// import.meta.dir (Bun's directory of the current module) is the project root.
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
describe('cross-model synthesis emit instructions', () => {
|
||||
test('codex/SKILL.md.tmpl Step 2A (review) requires a synthesis Recommendation', () => {
|
||||
const tmpl = fs.readFileSync(path.join(ROOT, 'codex', 'SKILL.md.tmpl'), 'utf-8');
|
||||
const step2a = sliceBetween(tmpl, '## Step 2A:', '## Step 2B:');
|
||||
expect(step2a, 'Step 2A section not found in codex template').not.toBe('');
|
||||
expect(step2a).toMatch(/Synthesis recommendation \(REQUIRED\)/);
|
||||
expect(step2a).toMatch(/Recommendation:\s*<action>\s*because/);
|
||||
});
|
||||
|
||||
test('codex/SKILL.md.tmpl Step 2B (challenge) requires a synthesis Recommendation', () => {
|
||||
const tmpl = fs.readFileSync(path.join(ROOT, 'codex', 'SKILL.md.tmpl'), 'utf-8');
|
||||
const step2b = sliceBetween(tmpl, '## Step 2B:', '## Step 2C:');
|
||||
expect(step2b, 'Step 2B section not found in codex template').not.toBe('');
|
||||
expect(step2b).toMatch(/Synthesis recommendation \(REQUIRED\)/);
|
||||
expect(step2b).toMatch(/Recommendation:\s*<action>\s*because/);
|
||||
});
|
||||
|
||||
test('codex/SKILL.md.tmpl Step 2C (consult) requires a synthesis Recommendation', () => {
|
||||
const tmpl = fs.readFileSync(path.join(ROOT, 'codex', 'SKILL.md.tmpl'), 'utf-8');
|
||||
const step2c = sliceBetween(tmpl, '## Step 2C:', '## Model & Reasoning');
|
||||
expect(step2c, 'Step 2C section not found in codex template').not.toBe('');
|
||||
expect(step2c).toMatch(/Synthesis recommendation \(REQUIRED\)/);
|
||||
expect(step2c).toMatch(/Recommendation:\s*<action>\s*because/);
|
||||
});
|
||||
|
||||
test('scripts/resolvers/review.ts Claude adversarial subagent prompt requires Recommendation', () => {
|
||||
const resolver = fs.readFileSync(path.join(ROOT, 'scripts', 'resolvers', 'review.ts'), 'utf-8');
|
||||
// The Claude subagent prompt must instruct the model to emit a final
|
||||
// canonical Recommendation line.
|
||||
expect(resolver).toMatch(/Claude adversarial subagent[\s\S]+?Recommendation:\s*<action>\s*because/);
|
||||
});
|
||||
|
||||
test('scripts/resolvers/review.ts Codex adversarial command requires Recommendation', () => {
|
||||
const resolver = fs.readFileSync(path.join(ROOT, 'scripts', 'resolvers', 'review.ts'), 'utf-8');
|
||||
// The codex exec command's prompt string must include the emit
|
||||
// instruction. Match within the codex adversarial section.
|
||||
expect(resolver).toMatch(/Codex adversarial challenge[\s\S]+?Recommendation:\s*<action>\s*because/);
|
||||
});
|
||||
});
|
||||
|
||||
function sliceBetween(text: string, startMarker: string, endMarker: string): string {
|
||||
const start = text.indexOf(startMarker);
|
||||
if (start < 0) return '';
|
||||
const end = text.indexOf(endMarker, start + startMarker.length);
|
||||
return end > start ? text.slice(start, end) : text.slice(start);
|
||||
}
|
||||
Reference in New Issue
Block a user