test(diagram): paid E2E pair — gate triplet contract + periodic authoring judge

diagram-triplet (gate, deterministic functional): a fresh claude -p agent
following the skill extract must emit a parseable triplet — graph LR/TD in
.mmd, excalidraw scene with >3 elements, SVG markup, PNG magic bytes.
Verified live: pass, $0.17, 58s. diagram-authoring-quality (periodic,
LLM-judged): scored on a faithfulness/labels/size rubric with a
diagnostic-path cap; the passing floor is 6/10. Verified live: pass at
exactly 6 with substantive critique.
Touchfiles select both tests when diagram/** or lib/diagram-render/**
changes; the gate/periodic tier split follows the E2E_TIERS rules
(eng-review D5).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-06-12 00:32:37 -07:00
parent 462c06224a
commit 29b948bd90
2 changed files with 161 additions and 0 deletions
+9
View File
@@ -291,6 +291,11 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'design-shotgun-session': ['design-shotgun/**', 'scripts/resolvers/design.ts'],
'design-shotgun-full': ['design-shotgun/**', 'design/src/**', 'browse/src/**'],
// /diagram (diagram-render bundle consumers). Triplet = deterministic
// functional (gate); authoring quality = LLM-judged benchmark (periodic).
'diagram-triplet': ['diagram/**', 'lib/diagram-render/**', 'browse/src/write-commands.ts', 'browse/src/read-commands.ts'],
'diagram-authoring-quality': ['diagram/**', 'lib/diagram-render/**', 'test/helpers/llm-judge.ts'],
// gstack-upgrade
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
@@ -656,6 +661,10 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'design-shotgun-session': 'gate',
'design-shotgun-full': 'periodic',
// /diagram — triplet is deterministic functional, judge is a quality benchmark
'diagram-triplet': 'gate',
'diagram-authoring-quality': 'periodic',
// gstack-upgrade
'gstack-upgrade-happy-path': 'gate',