diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts
index 692d00d8..915d5e90 100644
--- a/test/helpers/touchfiles.ts
+++ b/test/helpers/touchfiles.ts
@@ -206,6 +206,12 @@ export const E2E_TOUCHFILES: Record = {
   'journey-retro': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
   'journey-design-system': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
   'journey-visual-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
+
+  // Opus 4.7 behavior evals — depend on overlay + routing + resolver, plus the generator both tests shell out to
+  'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task':
+    ['SKILL.md.tmpl', 'model-overlays/claude.md', 'model-overlays/opus-4-7.md', 'scripts/gen-skill-docs.ts', 'scripts/models.ts', 'scripts/resolvers/model-overlay.ts'],
+  'routing precision: positives route, negatives do not':
+    ['SKILL.md.tmpl', 'scripts/gen-skill-docs.ts', 'scripts/resolvers/preamble/generate-routing-injection.ts', 'model-overlays/opus-4-7.md'],
 };
 
 /**
@@ -372,6 +378,10 @@ export const E2E_TIERS: Record = {
   'journey-retro': 'periodic',
   'journey-design-system': 'periodic',
   'journey-visual-qa': 'periodic',
+
+  // Opus 4.7 overlay evals — periodic (non-deterministic LLM behavior + Opus cost)
+  'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task': 'periodic',
+  'routing precision: positives route, negatives do not': 'periodic',
 };
 
 /**
diff --git a/test/skill-e2e-opus-47.test.ts b/test/skill-e2e-opus-47.test.ts
new file mode 100644
index 00000000..a8fa4c4b
--- /dev/null
+++ b/test/skill-e2e-opus-47.test.ts
@@ -0,0 +1,324 @@
+/**
+ * Opus 4.7 behavior evals.
+ *
+ * Two cases, both pinned to claude-opus-4-7:
+ *
+ * 1. Fanout rate — the "Fan out explicitly" overlay nudge should make 4.7
+ *    spawn parallel tool calls when the prompt has independent sub-problems.
+ *    A/B: SKILL.md regenerated with `--model opus-4-7` (overlay ON) vs
+ *    default `--model claude` (overlay OFF). Assert A ≥ B on parallel-call
+ *    count in the first assistant turn.
+ *
+ * 2. Routing precision — the new "when in doubt, invoke the skill" policy
+ *    should route ambiguous dev prompts to the right skill WITHOUT routing
+ *    casual/non-dev prompts. A handful of positive and negative controls.
+ *
+ * Both cases require a running Anthropic API key. Gated behind EVALS=1.
+ * Classify as `periodic` in touchfiles — behavior measurement, not gate.
+ */
+
+import { describe, test, expect, afterAll } from 'bun:test';
+import { runSkillTest } from './helpers/session-runner';
+import { EvalCollector } from './helpers/eval-store';
+import { spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+const OPUS_47 = 'claude-opus-4-7';
+
+const evalsEnabled = !!process.env.EVALS;
+const describeE2E = evalsEnabled ? describe : describe.skip;
+const evalCollector = evalsEnabled ? new EvalCollector('e2e-opus-47') : null;
+const runId = new Date().toISOString().replace(/[:.]/g, '').replace('T', '-').slice(0, 15);
+
+// --- Helpers ---
+
+/**
+ * Regenerate SKILL.md at the given model and stage it in a fresh scratch
+ * project, returning the scratch root.
+ *
+ * NOTE: gen-skill-docs runs with cwd=ROOT, so its output lands in the main
+ * working tree first; only the top-level SKILL.md is then copied into the
+ * scratch dir. The working tree is left at whichever model ran last.
+ *
+ * @param model value passed to `gen-skill-docs --model`
+ * @param suffix label embedded in the scratch dir name
+ * @returns path of the scratch project; the caller must remove it
+ * @throws if gen-skill-docs fails or staging fails (scratch dir removed first)
+ */
+function regenSkillsAt(model: string, suffix: string): string {
+  const result = spawnSync(
+    'bun',
+    ['run', 'scripts/gen-skill-docs.ts', '--model', model],
+    { cwd: ROOT, stdio: 'pipe', encoding: 'utf-8', timeout: 60_000 },
+  );
+  if (result.status !== 0) {
+    // On spawn failure or timeout stderr can be empty; prefer result.error then.
+    const detail = result.error?.message ?? result.stderr;
+    throw new Error(`gen-skill-docs failed for --model ${model}: ${detail}`);
+  }
+
+  // Create the scratch dir only after generation succeeded so a failed regen
+  // leaves nothing behind in os.tmpdir().
+  const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `opus47-${suffix}-`));
+  try {
+    // Copy the top-level generated SKILL.md into the scratch dir (under
+    // .claude/skills/gstack/ which is where Claude looks for project skills).
+    const skillDir = path.join(tmp, '.claude', 'skills', 'gstack');
+    fs.mkdirSync(skillDir, { recursive: true });
+    fs.copyFileSync(path.join(ROOT, 'SKILL.md'), path.join(skillDir, 'SKILL.md'));
+
+    // Minimal project context
+    fs.writeFileSync(
+      path.join(tmp, 'CLAUDE.md'),
+      `# Project\n\nSee .claude/skills/gstack/SKILL.md for skill definitions.\n`,
+    );
+    fs.writeFileSync(path.join(tmp, 'package.json'), '{"name":"opus47-eval"}');
+
+    // git init so any downstream git-aware logic doesn't blow up. Best-effort:
+    // exit codes are deliberately ignored.
+    const git = (args: string[]) =>
+      spawnSync('git', args, { cwd: tmp, stdio: 'pipe', timeout: 5_000 });
+    git(['init']);
+    git(['config', 'user.email', 't@t.com']);
+    git(['config', 'user.name', 'T']);
+    git(['add', '.']);
+    git(['commit', '-m', 'init']);
+
+    return tmp;
+  } catch (err) {
+    fs.rmSync(tmp, { recursive: true, force: true });
+    throw err;
+  }
+}
+
+/**
+ * Count tool_use blocks in the first assistant entry of the transcript.
+ *
+ * NOTE(review): assumes one assistant entry carries every tool call of that
+ * turn; confirm against the session-runner transcript format.
+ */
+function firstTurnParallelism(transcript: any[]): number {
+  const firstAssistant = transcript.find((e) => e?.type === 'assistant');
+  const content = firstAssistant?.message?.content;
+  // Guard against absent or non-array content: nothing to count in that case.
+  if (!Array.isArray(content)) return 0;
+  return content.filter((c: any) => c?.type === 'tool_use').length;
+}
+
+interface RoutingCase {
+  name: string;
+  prompt: string;
+  shouldRoute: boolean;
+  expectedSkill?: string;
+}
+
+/** Small, intentionally chosen routing cases. Positive cases are ambiguous
+ *  phrasings the user actually says, not template text. Negative cases are
+ *  casual or off-topic prompts that match routing keywords but shouldn't
+ *  trigger a skill. */
+const ROUTING_CASES: RoutingCase[] = [
+  // Positive — should route
+  { name: 'pos-wtf-bug', prompt: "wtf is this error coming from auth.ts:47 when the cookie expires?", shouldRoute: true, expectedSkill: 'investigate' },
+  { name: 'pos-send-it', prompt: "ok this is good enough, let's send it.", shouldRoute: true, expectedSkill: 'ship' },
+  { name: 'pos-does-it-work', prompt: "does this feature work on mobile? can you check the deploy?", shouldRoute: true, expectedSkill: 'qa' },
+  // Negative — should NOT route
+  { name: 'neg-syntax-q', prompt: "wtf does this Python list comprehension syntax even mean, [x for x in y if z]?", shouldRoute: false },
+  { name: 'neg-algo-q', prompt: "does this bubble sort algorithm actually work in O(n log n)?", shouldRoute: false },
+  { name: 'neg-slack-send', prompt: "can you help me write the slack message? I want to send it to the team.", shouldRoute: false },
+];
+
+// --- Tests ---
+
+describeE2E('Opus 4.7 overlay behavior evals', () => {
+  afterAll(() => {
+    evalCollector?.finalize();
+  });
+
+  test(
+    'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task',
+    async () => {
+      // Track scratch dirs as they are created so the finally block removes
+      // whatever exists, even if the second regen or the fixture writes throw.
+      const scratch: string[] = [];
+
+      try {
+        const armA = regenSkillsAt('opus-4-7', 'on');
+        scratch.push(armA);
+        const armB = regenSkillsAt('claude', 'off');
+        scratch.push(armB);
+
+        // Populate three tiny independent files in each arm. The prompt asks
+        // the agent to read all three and report. Opus 4.7 (without nudge)
+        // tends to serialize; with the nudge it should parallelize.
+        for (const dir of scratch) {
+          fs.writeFileSync(path.join(dir, 'alpha.txt'), 'alpha content: 1\n');
+          fs.writeFileSync(path.join(dir, 'beta.txt'), 'beta content: 2\n');
+          fs.writeFileSync(path.join(dir, 'gamma.txt'), 'gamma content: 3\n');
+        }
+
+        const prompt =
+          "Read alpha.txt, beta.txt, and gamma.txt in this directory and report what's inside each. These three reads are independent.";
+
+        // Both arms share everything except the working directory (and thus
+        // the staged SKILL.md).
+        const runArm = (workingDirectory: string, testName: string) =>
+          runSkillTest({
+            prompt,
+            workingDirectory,
+            maxTurns: 5,
+            allowedTools: ['Read', 'Bash', 'Glob', 'Grep'],
+            timeout: 90_000,
+            testName,
+            runId,
+            model: OPUS_47,
+          });
+
+        const [resA, resB] = await Promise.all([
+          runArm(armA, 'fanout-arm-overlay-on'),
+          runArm(armB, 'fanout-arm-overlay-off'),
+        ]);
+
+        const parA = firstTurnParallelism(resA.transcript);
+        const parB = firstTurnParallelism(resB.transcript);
+
+        console.log(
+          `[opus-4-7 fanout] arm A (overlay ON): ${parA} parallel tool calls in first turn; ` +
+          `arm B (overlay OFF): ${parB}`,
+        );
+        console.log(` cost A=$${resA.costEstimate.estimatedCost.toFixed(2)} B=$${resB.costEstimate.estimatedCost.toFixed(2)}`);
+
+        evalCollector?.addTest({
+          name: 'fanout-arm-overlay-on',
+          suite: 'Opus 4.7 overlay',
+          tier: 'e2e',
+          passed: parA >= parB,
+          duration_ms: resA.duration,
+          cost_usd: resA.costEstimate.estimatedCost,
+          transcript: resA.transcript,
+          output: `parallel=${parA}`,
+          turns_used: resA.costEstimate.turnsUsed,
+          exit_reason: resA.exitReason,
+        });
+        evalCollector?.addTest({
+          name: 'fanout-arm-overlay-off',
+          suite: 'Opus 4.7 overlay',
+          tier: 'e2e',
+          passed: true, // baseline arm, recorded for comparison
+          duration_ms: resB.duration,
+          cost_usd: resB.costEstimate.estimatedCost,
+          transcript: resB.transcript,
+          output: `parallel=${parB}`,
+          turns_used: resB.costEstimate.turnsUsed,
+          exit_reason: resB.exitReason,
+        });
+
+        // Main assertion: overlay arm is at least as parallel as baseline.
+        expect(parA, `overlay arm emitted ${parA} parallel calls, baseline ${parB}`).toBeGreaterThanOrEqual(parB);
+      } finally {
+        for (const dir of scratch) {
+          fs.rmSync(dir, { recursive: true, force: true });
+        }
+      }
+    },
+    240_000,
+  );
+
+  test(
+    'routing precision: positives route, negatives do not',
+    async () => {
+      // Single SKILL.md tree shared by all cases. We run claude-opus-4-7 with
+      // tool access to Skill; a case counts as routed if any tool call in the
+      // run is Skill(..), and the first such call decides which skill.
+      const root = regenSkillsAt('opus-4-7', 'routing');
+
+      try {
+        const results = await Promise.all(
+          ROUTING_CASES.map(async (c) => ({
+            c,
+            r: await runSkillTest({
+              prompt: c.prompt,
+              workingDirectory: root,
+              maxTurns: 3,
+              allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'],
+              timeout: 90_000,
+              testName: `routing-${c.name}`,
+              runId,
+              model: OPUS_47,
+            }),
+          })),
+        );
+
+        let tp = 0, fn = 0, fp = 0, tn = 0;
+        const rows: string[] = [];
+        let totalCost = 0;
+
+        for (const { c, r } of results) {
+          const skillCalls = r.toolCalls.filter((tc) => tc.tool === 'Skill');
+          const routed = skillCalls.length > 0;
+          const actualSkill = routed ? skillCalls[0]?.input?.skill : undefined;
+
+          const correct = c.shouldRoute
+            ? routed && (!c.expectedSkill || actualSkill === c.expectedSkill)
+            : !routed;
+
+          // A positive only counts as a true positive when it reached the
+          // expected skill; routing to the wrong skill is a miss (FN), so the
+          // TP-rate assertion below agrees with the per-case OK/MISS verdict.
+          if (c.shouldRoute) {
+            if (correct) tp++;
+            else fn++;
+          } else if (routed) {
+            fp++;
+          } else {
+            tn++;
+          }
+
+          const expectedLabel = c.shouldRoute ? (c.expectedSkill ?? 'any') : '(none)';
+
+          totalCost += r.costEstimate.estimatedCost;
+          rows.push(
+            ` ${c.name.padEnd(18)} routed=${String(routed).padEnd(5)} skill=${String(actualSkill).padEnd(16)} ` +
+            `expected=${expectedLabel} ${correct ? 'OK' : 'MISS'}`,
+          );
+
+          evalCollector?.addTest({
+            name: `routing-${c.name}`,
+            suite: 'Opus 4.7 routing',
+            tier: 'e2e',
+            passed: correct,
+            duration_ms: r.duration,
+            cost_usd: r.costEstimate.estimatedCost,
+            transcript: r.transcript,
+            output: `routed=${routed} actual=${actualSkill ?? '(none)'} expected=${expectedLabel}`,
+            turns_used: r.costEstimate.turnsUsed,
+            exit_reason: r.exitReason,
+          });
+        }
+
+        const posCount = ROUTING_CASES.filter((c) => c.shouldRoute).length;
+        const negCount = ROUTING_CASES.length - posCount;
+        const tpRate = posCount > 0 ? tp / posCount : 0;
+        const fpRate = negCount > 0 ? fp / negCount : 0;
+
+        console.log(`[opus-4-7 routing] total cost $${totalCost.toFixed(2)}`);
+        console.log(rows.join('\n'));
+        console.log(
+          ` TP=${tp}/${posCount} (${(tpRate * 100).toFixed(0)}%) FN=${fn} ` +
+          `FP=${fp}/${negCount} (${(fpRate * 100).toFixed(0)}%) TN=${tn}`,
+        );
+
+        // Thresholds from the test plan artifact: TP >= 80%, FP <= 30%.
+        // With a small N we loosen slightly: TP >= 66% (2 of 3 positive),
+        // FP <= 33% (no more than 1 of 3 negatives).
+        expect(tpRate, `true-positive rate ${(tpRate * 100).toFixed(0)}% (need >= 66%)`).toBeGreaterThanOrEqual(2 / 3);
+        expect(fpRate, `false-positive rate ${(fpRate * 100).toFixed(0)}% (need <= 33%)`).toBeLessThanOrEqual(1 / 3);
+      } finally {
+        fs.rmSync(root, { recursive: true, force: true });
+      }
+    },
+    360_000,
+  );
+});