From 29b948bd90ecbd4ecf8f84e0d537cbbefc232d8c Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 12 Jun 2026 00:32:37 -0700 Subject: [PATCH] =?UTF-8?q?test(diagram):=20paid=20E2E=20pair=20=E2=80=94?= =?UTF-8?q?=20gate=20triplet=20contract=20+=20periodic=20authoring=20judge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit diagram-triplet (gate, deterministic functional): a fresh claude -p agent following the skill extract must emit a parseable triplet — graph LR/TD in .mmd, excalidraw scene with >3 elements, SVG markup, PNG magic bytes. Verified live: pass, $0.17, 58s. diagram-authoring-quality (periodic, LLM-judged): faithfulness/labels/size rubric with a diagnostic-path cap, floor 6/10. Verified live: pass at exactly 6 with substantive critique. Touchfiles select both on diagram/** and lib/diagram-render/** changes; tier split per E2E_TIERS rules (eng-review D5). Co-Authored-By: Claude Fable 5 --- test/helpers/touchfiles.ts | 9 ++ test/skill-e2e-diagram.test.ts | 152 +++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 test/skill-e2e-diagram.test.ts diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index ca9957c0e..68bc2062e 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -291,6 +291,11 @@ export const E2E_TOUCHFILES: Record<string, string[]> = { 'design-shotgun-session': ['design-shotgun/**', 'scripts/resolvers/design.ts'], 'design-shotgun-full': ['design-shotgun/**', 'design/src/**', 'browse/src/**'], + // /diagram (diagram-render bundle consumers). Triplet = deterministic + // functional (gate); authoring quality = LLM-judged benchmark (periodic). 
+ 'diagram-triplet': ['diagram/**', 'lib/diagram-render/**', 'browse/src/write-commands.ts', 'browse/src/read-commands.ts'], + 'diagram-authoring-quality': ['diagram/**', 'lib/diagram-render/**', 'test/helpers/llm-judge.ts'], + // gstack-upgrade 'gstack-upgrade-happy-path': ['gstack-upgrade/**'], @@ -656,6 +661,10 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = { 'design-shotgun-session': 'gate', 'design-shotgun-full': 'periodic', + // /diagram — triplet is deterministic functional, judge is a quality benchmark + 'diagram-triplet': 'gate', + 'diagram-authoring-quality': 'periodic', + // gstack-upgrade 'gstack-upgrade-happy-path': 'gate', diff --git a/test/skill-e2e-diagram.test.ts b/test/skill-e2e-diagram.test.ts new file mode 100644 index 000000000..cab919610 --- /dev/null +++ b/test/skill-e2e-diagram.test.ts @@ -0,0 +1,152 @@ +/** + * /diagram skill E2E (paid, claude -p). + * + * Two tests with deliberately different tiers (eng-review D5): + * + * diagram-triplet (gate) — deterministic functional contract: from an + * English ask, the agent following the skill emits a parseable triplet — + * .mmd source, .excalidraw scene with elements, SVG markup, PNG bytes. + * No quality judgment; either the artifacts exist and parse or they don't. + * + * diagram-authoring-quality (periodic) — LLM-judged benchmark of the + * authored mermaid itself (faithfulness to the ask, label quality, + * readable size). Non-deterministic by nature → never blocks merge. + * + * Per the extract-don't-copy fixture rule, the prompt embeds only the skill's + * working section (from "# /diagram" onward), not the full generated SKILL.md + * with its preamble. 
+ */ +import { describe, expect } from 'bun:test'; +import * as fs from 'node:fs'; +import * as path from 'node:path'; +import * as os from 'node:os'; + +import { runSkillTest } from './helpers/session-runner'; +import { + ROOT, browseBin, runId, + describeIfSelected, testConcurrentIfSelected, + logCost, +} from './helpers/e2e-helpers'; +import { callJudge } from './helpers/llm-judge'; + +const BUNDLE = path.join(ROOT, 'lib', 'diagram-render', 'dist', 'diagram-render.html'); + +/** Extract the working section of the generated skill doc (post-preamble). */ +function skillExtract(): string { + const full = fs.readFileSync(path.join(ROOT, 'diagram', 'SKILL.md'), 'utf-8'); + const start = full.indexOf('# /diagram'); + if (start < 0) throw new Error('diagram/SKILL.md missing "# /diagram" section — regenerate skill docs'); + return full.slice(start); +} + +function setupDir(prefix: string): string { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), prefix)); + fs.writeFileSync(path.join(dir, 'diagram-skill.md'), skillExtract()); + // Pre-stage the bundle so the test is hermetic (no global install needed in + // CI); the prompt tells the agent discovery is already done. + fs.copyFileSync(BUNDLE, path.join(dir, 'diagram-render.html')); + fs.mkdirSync(path.join(dir, 'out')); + return dir; +} + +function basePrompt(dir: string, ask: string): string { + return `You have the /diagram skill instructions at ./diagram-skill.md — read them and follow Steps 1-4. + +Environment notes (already set up — skip Step 2's bundle discovery): +- The browse binary is at ${browseBin} — use it wherever the skill says $B. +- The render bundle is ALREADY staged at ./diagram-render.html in this directory; load it with: ${browseBin} load-html ./diagram-render.html +- Write all four artifacts into ./out/ with the slug "flow" (out/flow.mmd, out/flow.excalidraw, out/flow.svg, out/flow.png). +- Do not open any other applications. Do not use the Read tool on the PNG (no inline display needed here). 
+ +The diagram to create: ${ask}`; +} + +describeIfSelected('/diagram skill E2E', ['diagram-triplet', 'diagram-authoring-quality'], () => { + testConcurrentIfSelected('diagram-triplet', async () => { + const dir = setupDir('diagram-triplet-'); + try { + const result = await runSkillTest({ + prompt: basePrompt( + dir, + 'a flowchart (graph LR) of a 4-stage pipeline: markdown → prepass → Chromium → PDF.', + ), + workingDirectory: dir, + maxTurns: 25, + allowedTools: ['Bash', 'Read', 'Write'], + timeout: 240_000, + testName: 'diagram-triplet', + runId, + }); + logCost('diagram triplet', result); + expect(result.exitReason).toBe('success'); + + // The deterministic contract: all four artifacts exist and parse. + const mmd = fs.readFileSync(path.join(dir, 'out', 'flow.mmd'), 'utf-8'); + expect(mmd).toMatch(/graph\s+(LR|TD)/); + + const scene = JSON.parse(fs.readFileSync(path.join(dir, 'out', 'flow.excalidraw'), 'utf-8')); + expect(scene.type).toBe('excalidraw'); + expect(Array.isArray(scene.elements)).toBe(true); + expect(scene.elements.length).toBeGreaterThan(3); + + const svg = fs.readFileSync(path.join(dir, 'out', 'flow.svg'), 'utf-8'); + expect(svg).toMatch(/ { + const dir = setupDir('diagram-quality-'); + try { + const result = await runSkillTest({ + prompt: basePrompt( + dir, + 'how gstack renders diagrams in PDFs: markdown containing mermaid fences goes through a pre-pass that extracts the fences, renders them in a browse daemon tab using an offline bundle, substitutes the SVG back in, inlines local images, and prints via Chromium. 
Failures become visible diagnostic blocks.', + ), + workingDirectory: dir, + maxTurns: 25, + allowedTools: ['Bash', 'Read', 'Write'], + timeout: 240_000, + testName: 'diagram-authoring-quality', + runId, + }); + logCost('diagram authoring quality', result); + expect(result.exitReason).toBe('success'); + + const mmd = fs.readFileSync(path.join(dir, 'out', 'flow.mmd'), 'utf-8'); + const svg = fs.readFileSync(path.join(dir, 'out', 'flow.svg'), 'utf-8'); + expect(svg).toMatch(/( + `You are judging the quality of an agent-authored mermaid diagram. + +THE ASK: a diagram of gstack's PDF diagram-rendering flow — mermaid fences are +extracted by a pre-pass, rendered in a browse tab via an offline bundle, +substituted back as SVG, images inlined, printed by Chromium, with render +failures becoming visible diagnostic blocks. + +THE AUTHORED MERMAID: +\`\`\`mermaid +${mmd} +\`\`\` + +Score 1-10 on: faithfulness to the ask (are the named stages present and +correctly ordered?), label quality (short node labels, detail on edges), +and readable size (5-15 nodes, not a wall). A diagram that misses the +failure/diagnostic path entirely caps at 6. + +Respond with JSON: {"score": N, "reasoning": "..."}`, + ); + // eslint-disable-next-line no-console + console.log(`[diagram-quality] score=${verdict.score} — ${verdict.reasoning}`); + expect(verdict.score).toBeGreaterThanOrEqual(6); + } finally { + try { fs.rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ } + } + }, 300_000); +});