mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 15:20:11 +02:00
Merge remote-tracking branch 'origin/main' into garrytan/conductor-skip-askuserquestion
# Conflicts: # CHANGELOG.md # VERSION # test/skill-e2e-bws.test.ts
This commit is contained in:
@@ -0,0 +1,96 @@
|
||||
/**
|
||||
* Drift guards for the committed diagram-render bundle (eng-review D2).
|
||||
*
|
||||
* Tier 1 (always, free, <50ms): dist/diagram-render.html must hash to exactly
|
||||
* what dist/BUILD_INFO.json records, and the BUILD_INFO dependency pins must
|
||||
* match package.json. Catches hand-edited dist files and "bumped the pin,
|
||||
* forgot to rebuild" commits.
|
||||
*
|
||||
* Tier 2 (deep, CI / post-install only): rebuild from source and compare
|
||||
* hashes. Skipped when lib/diagram-render/node_modules is absent (fresh
|
||||
* clone without `bun install` in that dir) or when the local bun version
|
||||
* differs from the one recorded at build time (minifier output is only
|
||||
* guaranteed deterministic within a bun version).
|
||||
*/
|
||||
import { describe, expect, test } from "bun:test";
|
||||
import { createHash } from "node:crypto";
|
||||
import { existsSync } from "node:fs";
|
||||
import path from "node:path";
|
||||
|
||||
// Nested package that owns the diagram-render bundle, resolved relative to
// this test file, plus the two committed build artifacts under its dist/.
const ROOT = path.resolve(import.meta.dir, "..", "lib", "diagram-render");
const DIST_HTML = path.join(ROOT, "dist", "diagram-render.html");
const BUILD_INFO = path.join(ROOT, "dist", "BUILD_INFO.json");

describe("diagram-render bundle drift", () => {
  // Tier 1: the committed HTML must hash to what BUILD_INFO recorded and have
  // the recorded byte length.
  test("dist hash matches BUILD_INFO (tamper check)", async () => {
    const html = await Bun.file(DIST_HTML).text();
    const info = await Bun.file(BUILD_INFO).json();
    const sha = createHash("sha256").update(html).digest("hex");
    expect(sha).toBe(info.sha256);
    expect(Buffer.byteLength(html)).toBe(info.bytes);
  });

  // Tier 1: dependency pins recorded at build time must equal package.json's.
  test("BUILD_INFO dependency pins match package.json", async () => {
    const info = await Bun.file(BUILD_INFO).json();
    const pkg = await Bun.file(path.join(ROOT, "package.json")).json();
    expect(info.deps).toEqual(pkg.dependencies);
  });

  test("BUILD_INFO srcSha256 matches src on disk (edited-src-forgot-rebuild guard)", async () => {
    // The deep rebuild check below needs node_modules, which CI doesn't
    // install for this nested package — this tier-1.5 fingerprint catches a
    // src edit committed without a rebuild using nothing but file hashes.
    const info = await Bun.file(BUILD_INFO).json();
    // One digest over entry.ts followed by build.ts, in that order.
    const srcSha = createHash("sha256")
      .update(await Bun.file(path.join(ROOT, "src", "entry.ts")).text())
      .update(await Bun.file(path.join(ROOT, "scripts", "build.ts")).text())
      .digest("hex");
    expect(srcSha).toBe(info.srcSha256);
  });

  test("bundle font stack matches print-css (text-measurement drift guard)", async () => {
    const entrySrc = await Bun.file(path.join(ROOT, "src", "entry.ts")).text();
    // Every family print-css composes into the body stack must appear in the
    // bundle's PRINT_SANS literal — mermaid measures text with these fonts and
    // the print document lays it out with print-css's; drift = overflowing
    // labels (eng-review D3).
    for (const family of [
      "Helvetica", "Liberation Sans", "Arial",
      "Hiragino Kaku Gothic ProN", "Noto Sans CJK JP", "Microsoft YaHei",
      "Apple Color Emoji", "Segoe UI Emoji", "Noto Color Emoji",
    ]) {
      expect(entrySrc).toContain(family);
    }
  });

  test("page invariants: module script, base href, escaped terminators, error trap", async () => {
    const html = await Bun.file(DIST_HTML).text();
    expect(html).toContain('<script type="module">');
    expect(html).toContain('<base href="https://gstack-render.localhost/">');
    expect(html).toContain("window.__errors = []");
    // The inline module must contain no live </script> other than the page's
    // own closers: head error-trap closer + module closer.
    const closers = html.match(/<\/script>/g) ?? [];
    expect(closers.length).toBe(2);
  });

  // Tier 2 gate: only rebuild when the nested package has node_modules and
  // the running Bun is the version recorded at build time.
  const nodeModules = path.join(ROOT, "node_modules");
  let builtWithSameBun = false;
  try {
    const info = require(BUILD_INFO);
    builtWithSameBun = info.bunVersion === Bun.version;
  } catch {
    // Deliberate best-effort: an unreadable BUILD_INFO just leaves the deep
    // check skipped; the tests above read the same file and fail loudly.
  }
  const canDeepCheck = existsSync(nodeModules) && builtWithSameBun;

  test.skipIf(!canDeepCheck)(
    "deep: fresh build reproduces committed dist",
    async () => {
      const before = await Bun.file(BUILD_INFO).json();
      // Spawn the Bun executable running this test, not whatever `bun` is
      // first on PATH: the gate above compared Bun.version of THIS runtime,
      // so only this binary is known to match the recorded build version.
      const proc = Bun.spawnSync([process.execPath, "run", "scripts/build.ts"], { cwd: ROOT });
      expect(proc.exitCode).toBe(0);
      const after = await Bun.file(BUILD_INFO).json();
      expect(after.sha256).toBe(before.sha256);
      // Also hash the rebuilt HTML itself so the check does not rest solely
      // on the metadata the build script wrote.
      const rebuiltSha = createHash("sha256")
        .update(await Bun.file(DIST_HTML).text())
        .digest("hex");
      expect(rebuiltSha).toBe(before.sha256);
    },
    60000,
  );
});
|
||||
@@ -301,6 +301,11 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'design-shotgun-session': ['design-shotgun/**', 'scripts/resolvers/design.ts'],
|
||||
'design-shotgun-full': ['design-shotgun/**', 'design/src/**', 'browse/src/**'],
|
||||
|
||||
// /diagram (diagram-render bundle consumers). Triplet = deterministic
|
||||
// functional (gate); authoring quality = LLM-judged benchmark (periodic).
|
||||
'diagram-triplet': ['diagram/**', 'lib/diagram-render/**', 'browse/src/write-commands.ts', 'browse/src/read-commands.ts'],
|
||||
'diagram-authoring-quality': ['diagram/**', 'lib/diagram-render/**', 'test/helpers/llm-judge.ts'],
|
||||
|
||||
// gstack-upgrade
|
||||
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
|
||||
|
||||
@@ -672,6 +677,10 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'design-shotgun-session': 'gate',
|
||||
'design-shotgun-full': 'periodic',
|
||||
|
||||
// /diagram — triplet is deterministic functional, judge is a quality benchmark
|
||||
'diagram-triplet': 'gate',
|
||||
'diagram-authoring-quality': 'periodic',
|
||||
|
||||
// gstack-upgrade
|
||||
'gstack-upgrade-happy-path': 'gate',
|
||||
|
||||
|
||||
@@ -131,6 +131,11 @@ export const SKILL_COVERAGE: Record<string, SkillCoverage> = {
|
||||
'design-consultation': { gate: ['test/skill-coverage-floor.test.ts'], periodic: [] },
|
||||
'design-shotgun': { gate: ['test/skill-coverage-floor.test.ts'], periodic: [] },
|
||||
'design-html': { gate: ['test/skill-coverage-floor.test.ts'], periodic: [] },
|
||||
diagram: {
|
||||
gate: ['test/skill-e2e-diagram.test.ts', 'test/skill-coverage-floor.test.ts'],
|
||||
periodic: ['test/skill-e2e-diagram.test.ts'],
|
||||
rationale: 'Triplet contract is gate-tier deterministic; authoring-quality judge is periodic (E2E_TIERS: diagram-triplet/diagram-authoring-quality).',
|
||||
},
|
||||
cso: {
|
||||
gate: ['test/skill-e2e-cso.test.ts', 'test/cso-preserved.test.ts', 'test/skill-coverage-floor.test.ts'],
|
||||
periodic: [],
|
||||
|
||||
@@ -192,17 +192,21 @@ Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial']);
|
||||
|
||||
// Copy bin scripts + the lib module they import, preserving the bin/../lib
// layout. gstack-learnings-log does
// `import ... from '$SCRIPT_DIR/../lib/jsonl-store.ts'` (hasInjection, the
// v1.57.5.0 injection sanitization) — without lib/ alongside bin/, the script
// exits 1 before writing anything, failing this test for a fixture reason, not
// a model-behavior reason (root-caused during the v1.58.0.0 ship; fails
// identically on main).
const binDir = path.join(opDir, 'bin');
fs.mkdirSync(binDir, { recursive: true });
for (const script of ['gstack-learnings-log', 'gstack-slug']) {
  fs.copyFileSync(path.join(ROOT, 'bin', script), path.join(binDir, script));
  fs.chmodSync(path.join(binDir, script), 0o755);
}
// Stage lib/ exactly once (the merge left this step duplicated).
const libDir = path.join(opDir, 'lib');
fs.mkdirSync(libDir, { recursive: true });
fs.copyFileSync(path.join(ROOT, 'lib', 'jsonl-store.ts'), path.join(libDir, 'jsonl-store.ts'));
|
||||
|
||||
// gstack-learnings-log will create the project dir automatically via gstack-slug
|
||||
|
||||
|
||||
@@ -0,0 +1,153 @@
|
||||
/**
|
||||
* /diagram skill E2E (paid, claude -p).
|
||||
*
|
||||
* Two tests with deliberately different tiers (eng-review D5):
|
||||
*
|
||||
* diagram-triplet (gate) — deterministic functional contract: from an
|
||||
* English ask, the agent following the skill emits a parseable triplet —
|
||||
* .mmd source, .excalidraw scene with elements, SVG markup, PNG bytes.
|
||||
* No quality judgment; either the artifacts exist and parse or they don't.
|
||||
*
|
||||
* diagram-authoring-quality (periodic) — LLM-judged benchmark of the
|
||||
* authored mermaid itself (faithfulness to the ask, label quality,
|
||||
* readable size). Non-deterministic by nature → never blocks merge.
|
||||
*
|
||||
* Per the extract-don't-copy fixture rule, the prompt embeds only the skill's
|
||||
* working section (from "# /diagram" onward), not the full generated SKILL.md
|
||||
* with its preamble.
|
||||
*/
|
||||
import { describe, expect } from 'bun:test';
|
||||
import * as fs from 'node:fs';
|
||||
import * as path from 'node:path';
|
||||
import * as os from 'node:os';
|
||||
|
||||
import { runSkillTest } from './helpers/session-runner';
|
||||
import {
|
||||
ROOT, browseBin, runId,
|
||||
describeIfSelected, testConcurrentIfSelected,
|
||||
logCost,
|
||||
} from './helpers/e2e-helpers';
|
||||
import { callJudge } from './helpers/llm-judge';
|
||||
|
||||
// Committed diagram-render bundle; setupDir copies it into each test's temp dir.
const BUNDLE = path.join(ROOT, 'lib', 'diagram-render', 'dist', 'diagram-render.html');
|
||||
|
||||
/**
 * Extract the working section of the generated skill doc (post-preamble).
 *
 * Reads diagram/SKILL.md under ROOT and returns everything from the first
 * line that starts with "# /diagram" through the end of the file.
 *
 * @returns The skill text beginning at the "# /diagram" heading line.
 * @throws Error when no line of the doc starts with "# /diagram".
 */
function skillExtract(): string {
  const full = fs.readFileSync(path.join(ROOT, 'diagram', 'SKILL.md'), 'utf-8');
  // Anchor to the start of a line: a bare substring search would also match
  // "## /diagram ..." subsections or inline mentions and slice mid-line.
  const start = full.search(/^# \/diagram/m);
  if (start < 0) throw new Error('diagram/SKILL.md missing "# /diagram" section — regenerate skill docs');
  return full.slice(start);
}
|
||||
|
||||
/**
 * Create a fresh temp working directory for one E2E run and stage its inputs:
 * the extracted skill doc (diagram-skill.md), the render bundle
 * (diagram-render.html), and an empty out/ directory.
 *
 * @param prefix Prefix for the temp directory name under os.tmpdir().
 * @returns Absolute path of the prepared directory; the caller removes it.
 * @throws Rethrows any staging error after removing the partial directory.
 */
function setupDir(prefix: string): string {
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
  try {
    fs.writeFileSync(path.join(dir, 'diagram-skill.md'), skillExtract());
    // Pre-stage the bundle so the test is hermetic (no global install needed in
    // CI); the prompt tells the agent discovery is already done.
    fs.copyFileSync(BUNDLE, path.join(dir, 'diagram-render.html'));
    fs.mkdirSync(path.join(dir, 'out'));
  } catch (err) {
    // Callers invoke setupDir before entering their try/finally, so a staging
    // failure here would otherwise leak the temp directory.
    try { fs.rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
    throw err;
  }
  return dir;
}
|
||||
|
||||
/**
 * Build the agent prompt shared by both /diagram E2E tests.
 *
 * @param dir Staged working directory. Not read here: every path in the
 *   prompt is relative (callers pass the same directory as workingDirectory).
 * @param ask Plain-English description of the diagram to author.
 * @returns The full prompt: instructions pointer, environment notes, the ask.
 */
function basePrompt(dir: string, ask: string): string {
  const loadCommand = `${browseBin} load-html ./diagram-render.html`;
  const promptLines = [
    'You have the /diagram skill instructions at ./diagram-skill.md — read them and follow Steps 1-4.',
    '',
    "Environment notes (already set up — skip Step 2's bundle discovery):",
    `- The browse binary is at ${browseBin} — use it wherever the skill says $B.`,
    `- The render bundle is ALREADY staged at ./diagram-render.html in this directory; load it with: ${loadCommand}`,
    '- Write all four artifacts into ./out/ with the slug "flow" (out/flow.mmd, out/flow.excalidraw, out/flow.svg, out/flow.png).',
    '- Do not open any other applications. Do not use the Read tool on the PNG (no inline display needed here).',
    '',
    `The diagram to create: ${ask}`,
  ];
  return promptLines.join('\n');
}
|
||||
|
||||
describeIfSelected('/diagram skill E2E', ['diagram-triplet', 'diagram-authoring-quality'], () => {
  // Gate tier: run the skill on a small ask and check that all four artifacts
  // exist and parse. No quality judgment.
  testConcurrentIfSelected('diagram-triplet', async () => {
    const dir = setupDir('diagram-triplet-');
    try {
      const result = await runSkillTest({
        prompt: basePrompt(
          dir,
          'a flowchart (graph LR) of a 4-stage pipeline: markdown → prepass → Chromium → PDF.',
        ),
        workingDirectory: dir,
        maxTurns: 25,
        allowedTools: ['Bash', 'Read', 'Write'],
        timeout: 240_000,
        testName: 'diagram-triplet',
        runId,
      });
      logCost('diagram triplet', result);
      expect(result.exitReason).toBe('success');

      // The deterministic contract: all four artifacts exist and parse.
      const mmd = fs.readFileSync(path.join(dir, 'out', 'flow.mmd'), 'utf-8');
      // Mermaid declares a flowchart with either `graph` or `flowchart` and
      // any of five orientations; accept them all so a valid diagram is not
      // rejected for a syntax reason unrelated to the contract.
      expect(mmd).toMatch(/(?:graph|flowchart)\s+(?:LR|RL|TD|TB|BT)/);

      const scene = JSON.parse(fs.readFileSync(path.join(dir, 'out', 'flow.excalidraw'), 'utf-8'));
      expect(scene.type).toBe('excalidraw');
      expect(Array.isArray(scene.elements)).toBe(true);
      expect(scene.elements.length).toBeGreaterThan(3);

      const svg = fs.readFileSync(path.join(dir, 'out', 'flow.svg'), 'utf-8');
      expect(svg).toMatch(/<svg/i);

      const png = fs.readFileSync(path.join(dir, 'out', 'flow.png'));
      // First four bytes of the PNG signature, plus a minimum-size floor.
      expect(png.subarray(0, 4)).toEqual(Buffer.from([0x89, 0x50, 0x4e, 0x47]));
      expect(png.length).toBeGreaterThan(5_000);
    } finally {
      try { fs.rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
    }
  }, 300_000);

  // Periodic tier: run the skill on a richer ask, then have an LLM judge score
  // the authored mermaid; a score below 6 fails the test.
  testConcurrentIfSelected('diagram-authoring-quality', async () => {
    const dir = setupDir('diagram-quality-');
    try {
      const result = await runSkillTest({
        prompt: basePrompt(
          dir,
          'how gstack renders diagrams in PDFs: markdown containing mermaid fences goes through a pre-pass that extracts the fences, renders them in a browse daemon tab using an offline bundle, substitutes the SVG back in, inlines local images, and prints via Chromium. Failures become visible diagnostic blocks.',
        ),
        workingDirectory: dir,
        maxTurns: 25,
        allowedTools: ['Bash', 'Read', 'Write'],
        timeout: 240_000,
        testName: 'diagram-authoring-quality',
        runId,
      });
      logCost('diagram authoring quality', result);
      expect(result.exitReason).toBe('success');

      const mmd = fs.readFileSync(path.join(dir, 'out', 'flow.mmd'), 'utf-8');
      const svg = fs.readFileSync(path.join(dir, 'out', 'flow.svg'), 'utf-8');
      expect(svg).toMatch(/<svg/i);

      const verdict = await callJudge<{ score: number; reasoning: string }>(
        `You are judging the quality of an agent-authored mermaid diagram.

THE ASK: a diagram of gstack's PDF diagram-rendering flow — mermaid fences are
extracted by a pre-pass, rendered in a browse tab via an offline bundle,
substituted back as SVG, images inlined, printed by Chromium, with render
failures becoming visible diagnostic blocks.

THE AUTHORED MERMAID:
\`\`\`mermaid
${mmd}
\`\`\`

Score 1-10 on: faithfulness to the ask (are the named stages present and
correctly ordered?), label quality (short node labels, detail on edges),
and readable size (5-15 nodes, not a wall). A diagram that misses the
failure/diagnostic path entirely caps at 5 — that path is an explicitly
named requirement, so omitting it must fail the run.

Respond with JSON: {"score": N, "reasoning": "..."}`,
      );
      // eslint-disable-next-line no-console
      console.log(`[diagram-quality] score=${verdict.score} — ${verdict.reasoning}`);
      // Threshold sits above the rubric's cap of 5 for a missing failure path.
      expect(verdict.score).toBeGreaterThanOrEqual(6);
    } finally {
      try { fs.rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
    }
  }, 300_000);
});
|
||||
Reference in New Issue
Block a user