From 0d1d2e970bbc7ab42b551461a7a841409e565417 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Fri, 20 Mar 2026 07:16:45 -0700
Subject: [PATCH] test: E2E + LLM-judge evals for deploy skills
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 4 E2E tests: land-and-deploy (Fly.io detection + deploy report),
  canary (monitoring report structure), benchmark (perf report schema),
  setup-deploy (platform detection → CLAUDE.md config)
- 4 LLM-judge evals: workflow quality for all 4 new skills
- Touchfile entries for diff-based test selection (E2E + LLM-judge)
- 460 free tests pass, 0 fail

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 test/helpers/touchfiles.ts  |  12 ++
 test/skill-e2e.test.ts      | 277 ++++++++++++++++++++++++++++++++++++
 test/skill-llm-eval.test.ts |  56 +++++++-
 3 files changed, 344 insertions(+), 1 deletion(-)

diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts
index 53cc709c..02fefd40 100644
--- a/test/helpers/touchfiles.ts
+++ b/test/helpers/touchfiles.ts
@@ -98,6 +98,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
   // gstack-upgrade
   'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
 
+  // Deploy skills
+  'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
+  'canary-workflow': ['canary/**', 'browse/src/**'],
+  'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
+  'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
+
   // Skill routing — journey-stage tests (depend on ALL skill descriptions)
   'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
   'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
@@ -140,6 +146,12 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
   'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
   'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
 
+  // Deploy skills
+  'land-and-deploy/SKILL.md workflow': ['land-and-deploy/SKILL.md', 'land-and-deploy/SKILL.md.tmpl'],
+  'canary/SKILL.md monitoring loop': ['canary/SKILL.md', 'canary/SKILL.md.tmpl'],
+  'benchmark/SKILL.md perf collection': ['benchmark/SKILL.md', 'benchmark/SKILL.md.tmpl'],
+  'setup-deploy/SKILL.md platform setup': ['setup-deploy/SKILL.md', 'setup-deploy/SKILL.md.tmpl'],
+
   // Other skills
   'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
   'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts
index 96019f70..e0ba3f6a 100644
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@@ -2911,6 +2911,283 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
   }, 360_000);
 });
 
+// --- Land-and-Deploy / Canary / Benchmark / Setup-Deploy E2E ---
+
+describeIfSelected('Land-and-Deploy skill E2E', ['land-and-deploy-workflow'], () => {
+  let landDir: string;
+
+  beforeAll(() => {
+    landDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-deploy-'));
+
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: landDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+
+    // Create initial app on main, with a fly.toml for platform detection
+    fs.writeFileSync(path.join(landDir, 'app.ts'), 'export function hello() { return "world"; }\n');
+    fs.writeFileSync(path.join(landDir, 'fly.toml'), 'app = "test-app"\n\n[http_service]\n internal_port = 3000\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'initial']);
+
+    // Create feature branch with changes
+    run('git', ['checkout', '-b', 'feat/add-deploy']);
+    fs.writeFileSync(path.join(landDir, 'app.ts'), 'export function hello() { return "deployed"; }\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'feat: update hello']);
+
+    // Copy skill into the fixture repo so the prompt can reference land-and-deploy/SKILL.md
+    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(landDir, 'land-and-deploy'));
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(landDir, { recursive: true, force: true }); } catch {}
+  });
+
+  test('/land-and-deploy detects Fly.io platform and produces deploy report structure', async () => {
+    const result = await runSkillTest({
+      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
+
+You are on branch feat/add-deploy with changes against main. This repo has a fly.toml
+with app = "test-app", indicating a Fly.io deployment.
+
+IMPORTANT: There is NO remote and NO GitHub PR — you cannot run gh commands.
+Instead, simulate the workflow:
+1. Detect the deploy platform from fly.toml (should find Fly.io, app = test-app)
+2. Infer the production URL (https://test-app.fly.dev)
+3. Note the merge method would be squash
+4. Write the deploy configuration to CLAUDE.md
+5. Write a deploy report skeleton to .gstack/deploy-reports/report.md showing the
+   expected report structure (PR number: simulated, timing: simulated, verdict: simulated)
+
+Do NOT use AskUserQuestion. Do NOT run gh or fly commands.`,
+      workingDirectory: landDir,
+      maxTurns: 20,
+      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
+      timeout: 120_000,
+      testName: 'land-and-deploy-workflow',
+      runId,
+    });
+
+    logCost('/land-and-deploy', result);
+    recordE2E('/land-and-deploy workflow', 'Land-and-Deploy skill E2E', result);
+    expect(result.exitReason).toBe('success');
+
+    // Verify deploy config was written to CLAUDE.md. Step 4 of the prompt requires
+    // the file, so its absence is a failure rather than a silently skipped check.
+    const claudeMd = path.join(landDir, 'CLAUDE.md');
+    expect(fs.existsSync(claudeMd)).toBe(true);
+    const content = fs.readFileSync(claudeMd, 'utf-8').toLowerCase();
+    const hasFly = content.includes('fly') || content.includes('test-app');
+    expect(hasFly).toBe(true);
+
+    // Verify deploy report directory was created
+    const reportDir = path.join(landDir, '.gstack', 'deploy-reports');
+    expect(fs.existsSync(reportDir)).toBe(true);
+  }, 180_000);
+});
+
+describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
+  let canaryDir: string;
+
+  beforeAll(() => {
+    canaryDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-canary-'));
+
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: canaryDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+
+    fs.writeFileSync(path.join(canaryDir, 'index.html'), '<html><body><h1>Hello</h1></body></html>\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'initial']);
+
+    // Copy skill into the fixture repo so the prompt can reference canary/SKILL.md
+    copyDirSync(path.join(ROOT, 'canary'), path.join(canaryDir, 'canary'));
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(canaryDir, { recursive: true, force: true }); } catch {}
+  });
+
+  test('/canary skill produces monitoring report structure', async () => {
+    const result = await runSkillTest({
+      prompt: `Read canary/SKILL.md for the /canary skill instructions.
+
+You are simulating a canary check. There is NO browse daemon available and NO production URL.
+
+Instead, demonstrate you understand the workflow:
+1. Create the .gstack/canary-reports/ directory structure
+2. Write a simulated baseline.json to .gstack/canary-reports/baseline.json with the
+   schema described in Phase 2 of the skill (url, timestamp, branch, pages with
+   screenshot path, console_errors count, and load_time_ms)
+3. Write a simulated canary report to .gstack/canary-reports/canary-report.md following
+   the Phase 6 Health Report format (CANARY REPORT header, duration, pages, status,
+   per-page results table, verdict)
+
+Do NOT use AskUserQuestion. Do NOT run browse ($B) commands.
+Just create the directory structure and report files showing the correct schema.`,
+      workingDirectory: canaryDir,
+      maxTurns: 15,
+      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob'],
+      timeout: 120_000,
+      testName: 'canary-workflow',
+      runId,
+    });
+
+    logCost('/canary', result);
+    recordE2E('/canary workflow', 'Canary skill E2E', result);
+    expect(result.exitReason).toBe('success');
+
+    // Verify directory structure
+    expect(fs.existsSync(path.join(canaryDir, '.gstack', 'canary-reports'))).toBe(true);
+
+    // Verify baseline or report was created
+    const reportDir = path.join(canaryDir, '.gstack', 'canary-reports');
+    const files = fs.readdirSync(reportDir, { recursive: true }) as string[];
+    expect(files.length).toBeGreaterThan(0);
+  }, 180_000);
+});
+
+describeIfSelected('Benchmark skill E2E', ['benchmark-workflow'], () => {
+  let benchDir: string;
+
+  beforeAll(() => {
+    benchDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-benchmark-'));
+
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: benchDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+
+    fs.writeFileSync(path.join(benchDir, 'index.html'), '<html><body><h1>Hello</h1></body></html>\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'initial']);
+
+    // Copy skill into the fixture repo so the prompt can reference benchmark/SKILL.md
+    copyDirSync(path.join(ROOT, 'benchmark'), path.join(benchDir, 'benchmark'));
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(benchDir, { recursive: true, force: true }); } catch {}
+  });
+
+  test('/benchmark skill produces performance report structure', async () => {
+    const result = await runSkillTest({
+      prompt: `Read benchmark/SKILL.md for the /benchmark skill instructions.
+
+You are simulating a benchmark run. There is NO browse daemon available and NO production URL.
+
+Instead, demonstrate you understand the workflow:
+1. Create the .gstack/benchmark-reports/ directory structure including baselines/
+2. Write a simulated baseline.json to .gstack/benchmark-reports/baselines/baseline.json
+   with the schema from Phase 4 (url, timestamp, branch, pages with ttfb_ms, fcp_ms,
+   lcp_ms, dom_interactive_ms, dom_complete_ms, full_load_ms, total_requests,
+   total_transfer_bytes, js_bundle_bytes, css_bundle_bytes, largest_resources)
+3. Write a simulated benchmark report to .gstack/benchmark-reports/benchmark-report.md
+   following the Phase 5 comparison format (PERFORMANCE REPORT header, page comparison
+   table with Baseline/Current/Delta/Status columns, regression thresholds applied)
+4. Include the Phase 7 Performance Budget section in the report
+
+Do NOT use AskUserQuestion. Do NOT run browse ($B) commands.
+Just create the files showing the correct schema and report format.`,
+      workingDirectory: benchDir,
+      maxTurns: 15,
+      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob'],
+      timeout: 120_000,
+      testName: 'benchmark-workflow',
+      runId,
+    });
+
+    logCost('/benchmark', result);
+    recordE2E('/benchmark workflow', 'Benchmark skill E2E', result);
+    expect(result.exitReason).toBe('success');
+
+    // Verify directory structure
+    expect(fs.existsSync(path.join(benchDir, '.gstack', 'benchmark-reports'))).toBe(true);
+
+    // Verify baseline was created. The prompt requires baselines/baseline.json, so a
+    // missing baselines/ directory must fail the test instead of skipping the check.
+    const baselineDir = path.join(benchDir, '.gstack', 'benchmark-reports', 'baselines');
+    expect(fs.existsSync(baselineDir)).toBe(true);
+    const files = fs.readdirSync(baselineDir);
+    expect(files.length).toBeGreaterThan(0);
+  }, 180_000);
+});
+
+describeIfSelected('Setup-Deploy skill E2E', ['setup-deploy-workflow'], () => {
+  let setupDir: string;
+
+  beforeAll(() => {
+    setupDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-setup-deploy-'));
+
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: setupDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+
+    // Create a project with fly.toml
+    fs.writeFileSync(path.join(setupDir, 'app.ts'), 'export default { port: 3000 };\n');
+    fs.writeFileSync(path.join(setupDir, 'fly.toml'), 'app = "my-cool-app"\n\n[http_service]\n internal_port = 3000\n force_https = true\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'initial']);
+
+    // Copy skill into the fixture repo so the prompt can reference setup-deploy/SKILL.md
+    copyDirSync(path.join(ROOT, 'setup-deploy'), path.join(setupDir, 'setup-deploy'));
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(setupDir, { recursive: true, force: true }); } catch {}
+  });
+
+  test('/setup-deploy detects Fly.io and writes config to CLAUDE.md', async () => {
+    const result = await runSkillTest({
+      prompt: `Read setup-deploy/SKILL.md for the /setup-deploy skill instructions.
+
+This repo has a fly.toml with app = "my-cool-app". Run the /setup-deploy workflow:
+1. Detect the platform from fly.toml (should be Fly.io)
+2. Extract the app name: my-cool-app
+3. Infer production URL: https://my-cool-app.fly.dev
+4. Set deploy status command: fly status --app my-cool-app
+5. Write the Deploy Configuration section to CLAUDE.md
+
+Do NOT use AskUserQuestion. Do NOT run fly or gh commands.
+Do NOT try to verify the health check URL (there is no network).
+Just detect the platform and write the config.`,
+      workingDirectory: setupDir,
+      maxTurns: 15,
+      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
+      timeout: 120_000,
+      testName: 'setup-deploy-workflow',
+      runId,
+    });
+
+    logCost('/setup-deploy', result);
+    recordE2E('/setup-deploy workflow', 'Setup-Deploy skill E2E', result);
+    expect(result.exitReason).toBe('success');
+
+    // Verify CLAUDE.md was created with deploy config
+    const claudeMd = path.join(setupDir, 'CLAUDE.md');
+    expect(fs.existsSync(claudeMd)).toBe(true);
+
+    const content = fs.readFileSync(claudeMd, 'utf-8');
+    // Should mention Fly.io or fly
+    expect(content.toLowerCase()).toContain('fly');
+    // Should mention the app name
+    expect(content).toContain('my-cool-app');
+    // Should have the deploy configuration header
+    expect(content).toContain('Deploy Configuration');
+  }, 180_000);
+});
+
 // Module-level afterAll — finalize eval collector after all tests complete
 afterAll(async () => {
   if (evalCollector) {
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index 45ac4452..5208836a 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -680,7 +680,61 @@ describeIfSelected('Design skill evals', ['design-review/SKILL.md fix loop', 'de
   }, 30_000);
 });
 
-// Block 4: Other skills
+// Block 4: Deploy skills
+describeIfSelected('Deploy skill evals', [
+  'land-and-deploy/SKILL.md workflow', 'canary/SKILL.md monitoring loop',
+  'benchmark/SKILL.md perf collection', 'setup-deploy/SKILL.md platform setup',
+], () => {
+  testIfSelected('land-and-deploy/SKILL.md workflow', async () => {
+    await runWorkflowJudge({
+      testName: 'land-and-deploy/SKILL.md workflow',
+      suite: 'Deploy skill evals',
+      skillPath: 'land-and-deploy/SKILL.md',
+      startMarker: '## Step 1: Pre-flight',
+      endMarker: '## Important Rules',
+      judgeContext: 'a merge-deploy-verify workflow for landing PRs to production',
+      judgeGoal: 'how to merge a PR via GitHub CLI, wait for CI and deploy workflows (with platform-specific strategies for Fly.io/Render/Vercel/Netlify), run canary health checks on production, and offer revert if something breaks — with timing data logged for retrospectives',
+    });
+  }, 30_000);
+
+  testIfSelected('canary/SKILL.md monitoring loop', async () => {
+    await runWorkflowJudge({
+      testName: 'canary/SKILL.md monitoring loop',
+      suite: 'Deploy skill evals',
+      skillPath: 'canary/SKILL.md',
+      startMarker: '### Phase 2: Baseline Capture',
+      endMarker: '## Important Rules',
+      judgeContext: 'a post-deploy canary monitoring workflow using a headless browser daemon',
+      judgeGoal: 'how to capture baseline screenshots and metrics before deploy, run a continuous monitoring loop checking each page every 60 seconds for console errors and performance regressions, fire alerts with evidence (screenshots), and produce a health report with per-page status and verdict',
+    });
+  }, 30_000);
+
+  testIfSelected('benchmark/SKILL.md perf collection', async () => {
+    await runWorkflowJudge({
+      testName: 'benchmark/SKILL.md perf collection',
+      suite: 'Deploy skill evals',
+      skillPath: 'benchmark/SKILL.md',
+      startMarker: '### Phase 3: Performance Data Collection',
+      endMarker: '## Important Rules',
+      judgeContext: 'a performance regression detection workflow using browser-based Web Vitals measurement',
+      judgeGoal: 'how to collect real performance metrics (TTFB, FCP, LCP, bundle sizes, request counts) via performance.getEntries(), compare against baselines with regression thresholds, produce a performance report with delta analysis, and track trends over time',
+    });
+  }, 30_000);
+
+  testIfSelected('setup-deploy/SKILL.md platform setup', async () => {
+    await runWorkflowJudge({
+      testName: 'setup-deploy/SKILL.md platform setup',
+      suite: 'Deploy skill evals',
+      skillPath: 'setup-deploy/SKILL.md',
+      startMarker: '### Step 2: Detect platform',
+      endMarker: '## Important Rules',
+      judgeContext: 'a deployment configuration setup workflow that detects deploy platforms and writes config to CLAUDE.md',
+      judgeGoal: 'how to detect deploy platforms (Fly.io, Render, Vercel, Netlify, Heroku, GitHub Actions, custom), gather platform-specific configuration (URLs, status commands, health checks, custom hooks), and persist everything to CLAUDE.md for future automated use',
+    });
+  }, 30_000);
+});
+
+// Block 5: Other skills
 describeIfSelected('Other skill evals', [
   'retro/SKILL.md instructions', 'qa-only/SKILL.md workflow', 'gstack-upgrade/SKILL.md upgrade flow',
 ], () => {