mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-07 22:16:52 +02:00
test(opus-4.7): E2E eval for fanout rate + routing precision
Closes the measurement gap flagged by the ship-quality review: "zero
tests exercise Opus 4.7 behavior; every skill-e2e hardcodes 4.6."
Two cases, both pinned to claude-opus-4-7:
1. Fanout rate (A/B)
   - Arm A: regenerate SKILL.md with --model opus-4-7 (overlay ON, includes
     the "Fan out explicitly" nudge).
   - Arm B: regenerate SKILL.md with --model claude (overlay OFF, only
     model-agnostic nudges).
   - Prompt: "Read alpha.txt, beta.txt, gamma.txt. These are independent."
   - Measure: number of parallel tool calls in the first assistant turn.
   - Assert: arm A count >= arm B count.
2. Routing precision (6-case mini-benchmark)
   - 3 positive prompts that should route (wtf bug, send it, does it work)
   - 3 negative prompts that match keywords but should NOT route
     (syntax question, algorithm question, Slack message)
   - Assert: TP rate >= 66%, FP rate <= 33%.
Cost estimate: ~$3-5 per full run. Classified as periodic tier per
CLAUDE.md convention (Opus model, non-deterministic). It runs only when the
EVALS=1 env var is set and is touchfile-gated, so unrelated diffs don't trigger it.
Test plan artifact at
~/.gstack/projects/garrytan-gstack/garrytan-feat-opus-4.7-migration-eng-review-test-plan-20260421-230611.md
tracks the full specification.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -206,6 +206,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'journey-retro': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'journey-design-system': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'journey-visual-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Opus 4.7 behavior evals — depend on overlay + routing + resolver
|
||||
'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task':
|
||||
['model-overlays/claude.md', 'model-overlays/opus-4-7.md', 'scripts/models.ts', 'scripts/resolvers/model-overlay.ts'],
|
||||
'routing precision: positives route, negatives do not':
|
||||
['SKILL.md.tmpl', 'scripts/resolvers/preamble/generate-routing-injection.ts', 'model-overlays/opus-4-7.md'],
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -372,6 +378,10 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'journey-retro': 'periodic',
|
||||
'journey-design-system': 'periodic',
|
||||
'journey-visual-qa': 'periodic',
|
||||
|
||||
// Opus 4.7 overlay evals — periodic (non-deterministic LLM behavior + Opus cost)
|
||||
'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task': 'periodic',
|
||||
'routing precision: positives route, negatives do not': 'periodic',
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user