test(opus-4.7): E2E eval for fanout rate + routing precision

Closes the measurement gap flagged by the ship-quality review: "zero
tests exercise Opus 4.7 behavior; every skill-e2e hardcodes 4.6."

Two cases, both pinned to claude-opus-4-7:

1. Fanout rate (A/B)
   - Arm A: regen SKILL.md with --model opus-4-7 (overlay ON, includes
     "Fan out explicitly" nudge).
   - Arm B: regen SKILL.md with --model claude (overlay OFF, only
     model-agnostic nudges).
   - Prompt: "Read alpha.txt, beta.txt, gamma.txt. These are independent."
   - Measure: parallel tool calls in first assistant turn.
   - Assert: arm A >= arm B.

2. Routing precision (6-case mini-benchmark)
   - 3 positive prompts that should route (wtf bug, send it, does it work)
   - 3 negative prompts that match keywords but should NOT route
     (syntax question, algorithm question, slack message)
   - Assert: TP rate >= 66%, FP rate <= 33%.

Cost estimate: ~$3-5 per full run. Classified as periodic tier per
CLAUDE.md convention (Opus model, non-deterministic). Runs only with
EVALS=1 env var, touchfile-gated so unrelated diffs don't trigger it.

Test plan artifact at
~/.gstack/projects/garrytan-gstack/garrytan-feat-opus-4.7-migration-eng-review-test-plan-20260421-230611.md
tracks the full specification.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-22 00:11:38 -07:00
parent d3742c884a
commit 7e90b0f092
2 changed files with 297 additions and 0 deletions
+10
View File
@@ -206,6 +206,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'journey-retro': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-design-system': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-visual-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
// Opus 4.7 behavior evals — depend on overlay + routing + resolver
'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task':
['model-overlays/claude.md', 'model-overlays/opus-4-7.md', 'scripts/models.ts', 'scripts/resolvers/model-overlay.ts'],
'routing precision: positives route, negatives do not':
['SKILL.md.tmpl', 'scripts/resolvers/preamble/generate-routing-injection.ts', 'model-overlays/opus-4-7.md'],
};
/**
@@ -372,6 +378,10 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'journey-retro': 'periodic',
'journey-design-system': 'periodic',
'journey-visual-qa': 'periodic',
// Opus 4.7 overlay evals — periodic (non-deterministic LLM behavior + Opus cost)
'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task': 'periodic',
'routing precision: positives route, negatives do not': 'periodic',
};
/**
+287
View File
@@ -0,0 +1,287 @@
/**
* Opus 4.7 behavior evals.
*
* Two cases, both pinned to claude-opus-4-7:
*
* 1. Fanout rate — the "Fan out explicitly" overlay nudge should make 4.7
* spawn parallel tool calls when the prompt has independent sub-problems.
* A/B: SKILL.md regenerated with `--model opus-4-7` (overlay ON) vs
* default `--model claude` (overlay OFF). Assert A ≥ B on parallel-call
* count in the first assistant turn.
*
* 2. Routing precision — the new "when in doubt, invoke the skill" policy
* should route ambiguous dev prompts to the right skill WITHOUT routing
* casual/non-dev prompts. A handful of positive and negative controls.
*
* Both cases require a valid Anthropic API key. Gated behind EVALS=1.
* Classified as `periodic` in touchfiles — behavior measurement, not gate.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import { EvalCollector } from './helpers/eval-store';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
/** Parent of the directory that contains this test file. */
const ROOT = path.join(import.meta.dir, '..');

/** Model id that every session in this file is pinned to. */
const OPUS_47 = 'claude-opus-4-7';

/** True when the EVALS environment variable is set to any non-empty value. */
const evalsEnabled = Boolean(process.env.EVALS);

/** `describe` when evals are enabled, otherwise `describe.skip`. */
const describeE2E = evalsEnabled ? describe : describe.skip;

/** Result collector; null when evals are disabled so nothing is recorded. */
const evalCollector = evalsEnabled ? new EvalCollector('e2e-opus-47') : null;

/**
 * Run identifier derived from the current UTC time: the ISO timestamp with
 * ':' and '.' removed and the first 'T' replaced by '-', cut to 15 characters.
 */
const runId = new Date()
  .toISOString()
  .split(/[:.]/)
  .join('')
  .replace('T', '-')
  .substring(0, 15);
// --- Helpers ---
/**
 * Regenerate SKILL.md for `model` and stage it in a throwaway project directory.
 *
 * Runs `scripts/gen-skill-docs.ts --model <model>` from ROOT, copies the
 * resulting top-level SKILL.md to `<scratch>/.claude/skills/gstack/SKILL.md`,
 * writes a minimal CLAUDE.md and package.json next to it, and commits the lot
 * into a fresh git repository.
 *
 * The generator runs inside ROOT, so it rewrites files in the main working
 * tree. When `model` is not the default, the generator is run once more with
 * the default model afterwards so the tree is not left carrying the overlay.
 *
 * @param model  Value passed to gen-skill-docs via `--model`.
 * @param suffix Label embedded in the scratch directory name.
 * @returns Path of the scratch directory. The caller is responsible for deleting it.
 * @throws Error when gen-skill-docs cannot be spawned, times out, or exits non-zero.
 */
function regenSkillsAt(model: string, suffix: string): string {
  // The file header describes `--model claude` as the default (overlay OFF).
  // NOTE(review): assumes the checked-in SKILL.md files are generated with it.
  const DEFAULT_MODEL = 'claude';

  const runGenerator = (m: string) =>
    spawnSync('bun', ['run', 'scripts/gen-skill-docs.ts', '--model', m], {
      cwd: ROOT,
      stdio: 'pipe',
      encoding: 'utf-8',
      timeout: 60_000,
    });

  // `error` is set when the process could not be spawned or hit the timeout;
  // in those cases stderr alone may be empty.
  const failureDetail = (r: { error?: Error; stderr: unknown }): string =>
    [r.error?.message, String(r.stderr ?? '')].filter(Boolean).join('\n');

  let scratch: string | undefined;
  try {
    const generated = runGenerator(model);
    if (generated.error || generated.status !== 0) {
      throw new Error(`gen-skill-docs failed for --model ${model}: ${failureDetail(generated)}`);
    }

    const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `opus47-${suffix}-`));
    scratch = tmp;

    // Copy the top-level generated SKILL.md into the scratch dir (under
    // .claude/skills/gstack/ which is where Claude looks for project skills).
    const skillDir = path.join(tmp, '.claude', 'skills', 'gstack');
    fs.mkdirSync(skillDir, { recursive: true });
    fs.copyFileSync(path.join(ROOT, 'SKILL.md'), path.join(skillDir, 'SKILL.md'));

    // Minimal project context
    fs.writeFileSync(
      path.join(tmp, 'CLAUDE.md'),
      `# Project\n\nSee .claude/skills/gstack/SKILL.md for skill definitions.\n`,
    );
    fs.writeFileSync(path.join(tmp, 'package.json'), '{"name":"opus47-eval"}');

    // git init so any downstream git-aware logic doesn't blow up. Failures are
    // reported but not fatal: the repository is a convenience, not the subject
    // of the eval.
    const git = (args: string[]): void => {
      const r = spawnSync('git', args, { cwd: tmp, stdio: 'pipe', encoding: 'utf-8', timeout: 5_000 });
      if (r.error || r.status !== 0) {
        console.warn(`[opus-4-7 eval] git ${args.join(' ')} failed in ${tmp}: ${failureDetail(r)}`);
      }
    };
    git(['init']);
    git(['config', 'user.email', 't@t.com']);
    git(['config', 'user.name', 'T']);
    git(['add', '.']);
    git(['commit', '-m', 'init']);

    return tmp;
  } catch (err) {
    // Do not leave a half-built scratch directory behind.
    if (scratch) fs.rmSync(scratch, { recursive: true, force: true });
    throw err;
  } finally {
    // Best-effort restore of the working tree to the default-model output.
    if (model !== DEFAULT_MODEL) {
      const restored = runGenerator(DEFAULT_MODEL);
      if (restored.error || restored.status !== 0) {
        console.warn(
          `[opus-4-7 eval] could not restore SKILL.md with --model ${DEFAULT_MODEL}: ${failureDetail(restored)}`,
        );
      }
    }
  }
}
/**
 * Count the `tool_use` content blocks in the first transcript entry whose
 * `type` is `'assistant'`.
 *
 * Returns 0 when there is no assistant entry, or when that entry's
 * `message.content` is missing or is not an array (for example a plain string).
 * Null/undefined transcript entries and content items are skipped rather than
 * causing a throw.
 *
 * NOTE(review): only a single transcript entry is inspected. If the session
 * runner emits one entry per content block, this undercounts — confirm against
 * the transcript shape produced by session-runner.
 *
 * @param transcript Ordered transcript entries from a session run.
 * @returns Number of `tool_use` blocks in the first assistant entry.
 */
function firstTurnParallelism(transcript: any[]): number {
  const firstAssistant = transcript.find((entry) => entry?.type === 'assistant');
  const content = firstAssistant?.message?.content;
  if (!Array.isArray(content)) return 0;
  return content.filter((block) => block?.type === 'tool_use').length;
}
/** One prompt in the routing mini-benchmark together with its expected outcome. */
interface RoutingCase {
// Short identifier; used to build the per-case test name (`routing-<name>`).
name: string;
// User prompt sent to the model verbatim.
prompt: string;
// True when the run is expected to contain a Skill tool call, false when it must not.
shouldRoute: boolean;
// Skill the first Skill call is compared against; when omitted, any skill is accepted.
expectedSkill?: string;
}
/**
 * Hand-picked routing cases.
 *
 * Positives are ambiguous phrasings a user would really type (not template
 * text) and are expected to reach a specific skill. Negatives are casual or
 * off-topic prompts that share keywords with the positives but should not
 * trigger any skill.
 */
const ROUTING_CASES: RoutingCase[] = [
  // Positive — should route
  {
    name: 'pos-wtf-bug',
    prompt: "wtf is this error coming from auth.ts:47 when the cookie expires?",
    shouldRoute: true,
    expectedSkill: 'investigate',
  },
  {
    name: 'pos-send-it',
    prompt: "ok this is good enough, let's send it.",
    shouldRoute: true,
    expectedSkill: 'ship',
  },
  {
    name: 'pos-does-it-work',
    prompt: "does this feature work on mobile? can you check the deploy?",
    shouldRoute: true,
    expectedSkill: 'qa',
  },
  // Negative — should NOT route
  {
    name: 'neg-syntax-q',
    prompt: "wtf does this Python list comprehension syntax even mean, [x for x in y if z]?",
    shouldRoute: false,
  },
  {
    name: 'neg-algo-q',
    prompt: "does this bubble sort algorithm actually work in O(n log n)?",
    shouldRoute: false,
  },
  {
    name: 'neg-slack-send',
    prompt: "can you help me write the slack message? I want to send it to the team.",
    shouldRoute: false,
  },
];
// --- Tests ---
describeE2E('Opus 4.7 overlay behavior evals', () => {
// Flush collected eval results once every test in the suite has finished.
// The hook is async and awaits finalize() so that, if finalize() returns a
// promise, the runner waits for it and a rejection surfaces as a hook failure.
// NOTE(review): finalize()'s signature is not visible in this file; awaiting
// is harmless if it turns out to be synchronous.
afterAll(async () => {
  await evalCollector?.finalize();
});
// A/B eval: same prompt and model in two scratch projects that differ only in
// the SKILL.md they carry (generated with `--model opus-4-7` vs `--model claude`).
// Compares the number of tool calls in the first assistant entry of each run.
test(
  'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task',
  async () => {
    // Scratch roots are recorded as they are created so the `finally` below
    // removes whichever exist, even when the second regeneration throws.
    const scratchRoots: string[] = [];
    try {
      const armA = regenSkillsAt('opus-4-7', 'on');
      scratchRoots.push(armA);
      const armB = regenSkillsAt('claude', 'off');
      scratchRoots.push(armB);

      // Populate three tiny independent files in each arm. The prompt asks
      // the agent to read all three and report. Hypothesis under test: Opus 4.7
      // without the nudge tends to serialize; with the nudge it parallelizes.
      for (const dir of scratchRoots) {
        fs.writeFileSync(path.join(dir, 'alpha.txt'), 'alpha content: 1\n');
        fs.writeFileSync(path.join(dir, 'beta.txt'), 'beta content: 2\n');
        fs.writeFileSync(path.join(dir, 'gamma.txt'), 'gamma content: 3\n');
      }

      const prompt =
        "Read alpha.txt, beta.txt, and gamma.txt in this directory and report what's inside each. These three reads are independent.";

      // Both arms share every option except the working directory and the name.
      const runArm = (workingDirectory: string, testName: string) =>
        runSkillTest({
          prompt,
          workingDirectory,
          maxTurns: 5,
          allowedTools: ['Read', 'Bash', 'Glob', 'Grep'],
          timeout: 90_000,
          testName,
          runId,
          model: OPUS_47,
        });

      const [resA, resB] = await Promise.all([
        runArm(armA, 'fanout-arm-overlay-on'),
        runArm(armB, 'fanout-arm-overlay-off'),
      ]);

      const parA = firstTurnParallelism(resA.transcript);
      const parB = firstTurnParallelism(resB.transcript);

      console.log(
        `[opus-4-7 fanout] arm A (overlay ON): ${parA} parallel tool calls in first turn; ` +
          `arm B (overlay OFF): ${parB}`,
      );
      console.log(`  cost A=$${resA.costEstimate.estimatedCost.toFixed(2)} B=$${resB.costEstimate.estimatedCost.toFixed(2)}`);

      evalCollector?.addTest({
        name: 'fanout-arm-overlay-on',
        suite: 'Opus 4.7 overlay',
        tier: 'e2e',
        passed: parA >= parB,
        duration_ms: resA.duration,
        cost_usd: resA.costEstimate.estimatedCost,
        transcript: resA.transcript,
        output: `parallel=${parA}`,
        turns_used: resA.costEstimate.turnsUsed,
        exit_reason: resA.exitReason,
      });
      evalCollector?.addTest({
        name: 'fanout-arm-overlay-off',
        suite: 'Opus 4.7 overlay',
        tier: 'e2e',
        passed: true, // baseline arm, recorded for comparison
        duration_ms: resB.duration,
        cost_usd: resB.costEstimate.estimatedCost,
        transcript: resB.transcript,
        output: `parallel=${parB}`,
        turns_used: resB.costEstimate.turnsUsed,
        exit_reason: resB.exitReason,
      });

      // Main assertion: overlay arm is at least as parallel as baseline.
      // NOTE(review): this also holds when both arms report 0, so a pass does
      // not by itself show that the overlay produced any fan-out.
      expect(parA, `overlay arm emitted ${parA} parallel calls, baseline ${parB}`).toBeGreaterThanOrEqual(parB);
    } finally {
      for (const dir of scratchRoots) {
        fs.rmSync(dir, { recursive: true, force: true });
      }
    }
  },
  240_000,
);
// Routing eval: every ROUTING_CASES prompt is run against one shared scratch
// project; the suite asserts aggregate true-positive and false-positive rates.
test(
  'routing precision: positives route, negatives do not',
  async () => {
    // Single SKILL.md tree shared by all cases. We run claude-opus-4-7 with
    // tool access to Skill; a case counts as "routed" when the run contains
    // any Skill tool call, and the skill named by the first such call is the
    // one compared against the expectation.
    const root = regenSkillsAt('opus-4-7', 'routing');

    try {
      const results = await Promise.all(
        ROUTING_CASES.map(async (c) => {
          const r = await runSkillTest({
            prompt: c.prompt,
            workingDirectory: root,
            maxTurns: 3,
            allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'],
            timeout: 90_000,
            testName: `routing-${c.name}`,
            runId,
            model: OPUS_47,
          });
          return { c, r };
        }),
      );

      let tp = 0, fn = 0, fp = 0, tn = 0;
      const rows: string[] = [];
      let totalCost = 0;

      for (const { c, r } of results) {
        const skillCalls = r.toolCalls.filter((tc) => tc.tool === 'Skill');
        const routed = skillCalls.length > 0;
        const actualSkill = routed ? skillCalls[0]?.input?.skill : undefined;
        const correct = c.shouldRoute
          ? routed && (!c.expectedSkill || actualSkill === c.expectedSkill)
          : !routed;

        // A positive only counts as a true positive when it reached the
        // expected skill; routing to a different skill is tallied as a miss
        // (FN), so the asserted rate agrees with the per-case `passed` flag.
        if (c.shouldRoute) {
          if (correct) tp++;
          else fn++;
        } else if (routed) {
          fp++;
        } else {
          tn++;
        }

        totalCost += r.costEstimate.estimatedCost;
        rows.push(
          `  ${c.name.padEnd(18)} routed=${String(routed).padEnd(5)} skill=${String(actualSkill).padEnd(16)} ` +
            `expected=${c.shouldRoute ? (c.expectedSkill ?? 'any') : '(none)'} ${correct ? 'OK' : 'MISS'}`,
        );

        evalCollector?.addTest({
          name: `routing-${c.name}`,
          suite: 'Opus 4.7 routing',
          tier: 'e2e',
          passed: correct,
          duration_ms: r.duration,
          cost_usd: r.costEstimate.estimatedCost,
          transcript: r.transcript,
          output: `routed=${routed} actual=${actualSkill ?? '(none)'} expected=${c.shouldRoute ? c.expectedSkill ?? 'any' : '(none)'}`,
          turns_used: r.costEstimate.turnsUsed,
          exit_reason: r.exitReason,
        });
      }

      const posCount = ROUTING_CASES.filter((c) => c.shouldRoute).length;
      const negCount = ROUTING_CASES.length - posCount;
      const tpRate = posCount > 0 ? tp / posCount : 0;
      const fpRate = negCount > 0 ? fp / negCount : 0;

      console.log(`[opus-4-7 routing] total cost $${totalCost.toFixed(2)}`);
      console.log(rows.join('\n'));
      console.log(
        `  TP=${tp}/${posCount} (${(tpRate * 100).toFixed(0)}%)  FN=${fn}  ` +
          `FP=${fp}/${negCount} (${(fpRate * 100).toFixed(0)}%)  TN=${tn}`,
      );

      // Thresholds from the test plan artifact: TP >= 80%, FP <= 30%.
      // With a small N we loosen slightly: TP >= 66% (2 of 3 positive),
      // FP <= 33% (no more than 1 of 3 negatives).
      expect(tpRate, `true-positive rate ${(tpRate * 100).toFixed(0)}% (need >= 66%)`).toBeGreaterThanOrEqual(2 / 3);
      expect(fpRate, `false-positive rate ${(fpRate * 100).toFixed(0)}% (need <= 33%)`).toBeLessThanOrEqual(1 / 3);
    } finally {
      fs.rmSync(root, { recursive: true, force: true });
    }
  },
  360_000,
);
});