From 0d1d2e970bbc7ab42b551461a7a841409e565417 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Fri, 20 Mar 2026 07:16:45 -0700
Subject: [PATCH] test: E2E + LLM-judge evals for deploy skills
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 4 E2E tests: land-and-deploy (Fly.io detection + deploy report),
  canary (monitoring report structure), benchmark (perf report schema),
  setup-deploy (platform detection → CLAUDE.md config)
- 4 LLM-judge evals: workflow quality for all 4 new skills
- Touchfile entries for diff-based test selection (E2E + LLM-judge)
- 460 free tests pass, 0 fail

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 test/helpers/touchfiles.ts  |  12 ++
 test/skill-e2e.test.ts      | 277 ++++++++++++++++++++++++++++++++++++
 test/skill-llm-eval.test.ts |  56 +++++++-
 3 files changed, 344 insertions(+), 1 deletion(-)

diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts
index 53cc709c..02fefd40 100644
--- a/test/helpers/touchfiles.ts
+++ b/test/helpers/touchfiles.ts
@@ -98,6 +98,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
   // gstack-upgrade
   'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
 
+  // Deploy skills
+  'land-and-deploy-workflow':   ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
+  'canary-workflow':            ['canary/**', 'browse/src/**'],
+  'benchmark-workflow':         ['benchmark/**', 'browse/src/**'],
+  'setup-deploy-workflow':      ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
+
   // Skill routing — journey-stage tests (depend on ALL skill descriptions)
   'journey-ideation':       ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
   'journey-plan-eng':       ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
@@ -140,6 +146,12 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
   'design-review/SKILL.md fix loop':      ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
   'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
 
+  // Deploy skills
+  'land-and-deploy/SKILL.md workflow':    ['land-and-deploy/SKILL.md', 'land-and-deploy/SKILL.md.tmpl'],
+  'canary/SKILL.md monitoring loop':      ['canary/SKILL.md', 'canary/SKILL.md.tmpl'],
+  'benchmark/SKILL.md perf collection':   ['benchmark/SKILL.md', 'benchmark/SKILL.md.tmpl'],
+  'setup-deploy/SKILL.md platform setup': ['setup-deploy/SKILL.md', 'setup-deploy/SKILL.md.tmpl'],
+
   // Other skills
   'retro/SKILL.md instructions':          ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
   'qa-only/SKILL.md workflow':            ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts
index 96019f70..e0ba3f6a 100644
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@@ -2911,6 +2911,283 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
   }, 360_000);
 });
 
+// --- Land-and-Deploy / Canary / Benchmark / Setup-Deploy E2E ---
+
+describeIfSelected('Land-and-Deploy skill E2E', ['land-and-deploy-workflow'], () => {
+  let landDir: string; // throwaway git repo: created in beforeAll, removed in afterAll
+
+  beforeAll(() => {
+    landDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-deploy-'));
+
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: landDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+
+    // Initial commit on main: a tiny app plus a fly.toml naming the app "test-app"
+    fs.writeFileSync(path.join(landDir, 'app.ts'), 'export function hello() { return "world"; }\n');
+    fs.writeFileSync(path.join(landDir, 'fly.toml'), 'app = "test-app"\n\n[http_service]\n  internal_port = 3000\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'initial']);
+
+    // Feature branch feat/add-deploy carrying one change against main
+    run('git', ['checkout', '-b', 'feat/add-deploy']);
+    fs.writeFileSync(path.join(landDir, 'app.ts'), 'export function hello() { return "deployed"; }\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'feat: update hello']);
+
+    // Copy the skill directory into the temp repo so the prompt's land-and-deploy/SKILL.md path resolves there
+    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(landDir, 'land-and-deploy'));
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(landDir, { recursive: true, force: true }); } catch {} // best-effort cleanup; removal errors are ignored
+  });
+
+  test('/land-and-deploy detects Fly.io platform and produces deploy report structure', async () => {
+    const result = await runSkillTest({
+      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
+
+You are on branch feat/add-deploy with changes against main. This repo has a fly.toml
+with app = "test-app", indicating a Fly.io deployment.
+
+IMPORTANT: There is NO remote and NO GitHub PR — you cannot run gh commands.
+Instead, simulate the workflow:
+1. Detect the deploy platform from fly.toml (should find Fly.io, app = test-app)
+2. Infer the production URL (https://test-app.fly.dev)
+3. Note the merge method would be squash
+4. Write the deploy configuration to CLAUDE.md
+5. Write a deploy report skeleton to .gstack/deploy-reports/report.md showing the
+   expected report structure (PR number: simulated, timing: simulated, verdict: simulated)
+
+Do NOT use AskUserQuestion. Do NOT run gh or fly commands.`,
+      workingDirectory: landDir,
+      maxTurns: 20,
+      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
+      timeout: 120_000,
+      testName: 'land-and-deploy-workflow',
+      runId,
+    });
+
+    logCost('/land-and-deploy', result);
+    recordE2E('/land-and-deploy workflow', 'Land-and-Deploy skill E2E', result);
+    expect(result.exitReason).toBe('success');
+
+    // Verify deploy config was written to CLAUDE.md. Prompt step 4 requires the
+    // file, so its absence must fail the test instead of silently skipping the check.
+    const claudeMd = path.join(landDir, 'CLAUDE.md');
+    expect(fs.existsSync(claudeMd)).toBe(true);
+    const content = fs.readFileSync(claudeMd, 'utf-8').toLowerCase();
+    const hasFly = content.includes('fly') || content.includes('test-app');
+    expect(hasFly).toBe(true);
+
+    // Verify deploy report directory was created
+    const reportDir = path.join(landDir, '.gstack', 'deploy-reports');
+    expect(fs.existsSync(reportDir)).toBe(true);
+  }, 180_000);
+});
+
+describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
+  let canaryDir: string; // throwaway git repo: created in beforeAll, removed in afterAll
+
+  beforeAll(() => {
+    canaryDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-canary-'));
+
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: canaryDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+
+    fs.writeFileSync(path.join(canaryDir, 'index.html'), '<h1>Hello</h1>\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'initial']);
+
+    // Copy the skill directory into the temp repo so the prompt's canary/SKILL.md path resolves there
+    copyDirSync(path.join(ROOT, 'canary'), path.join(canaryDir, 'canary'));
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(canaryDir, { recursive: true, force: true }); } catch {} // best-effort cleanup; removal errors are ignored
+  });
+
+  test('/canary skill produces monitoring report structure', async () => {
+    const result = await runSkillTest({
+      prompt: `Read canary/SKILL.md for the /canary skill instructions.
+
+You are simulating a canary check. There is NO browse daemon available and NO production URL.
+
+Instead, demonstrate you understand the workflow:
+1. Create the .gstack/canary-reports/ directory structure
+2. Write a simulated baseline.json to .gstack/canary-reports/baseline.json with the
+   schema described in Phase 2 of the skill (url, timestamp, branch, pages with
+   screenshot path, console_errors count, and load_time_ms)
+3. Write a simulated canary report to .gstack/canary-reports/canary-report.md following
+   the Phase 6 Health Report format (CANARY REPORT header, duration, pages, status,
+   per-page results table, verdict)
+
+Do NOT use AskUserQuestion. Do NOT run browse ($B) commands.
+Just create the directory structure and report files showing the correct schema.`,
+      workingDirectory: canaryDir,
+      maxTurns: 15,
+      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob'],
+      timeout: 120_000,
+      testName: 'canary-workflow',
+      runId,
+    });
+
+    logCost('/canary', result);
+    recordE2E('/canary workflow', 'Canary skill E2E', result);
+    expect(result.exitReason).toBe('success');
+
+    // Verify directory structure
+    const reportDir = path.join(canaryDir, '.gstack', 'canary-reports');
+    expect(fs.existsSync(reportDir)).toBe(true);
+
+    // Verify baseline or report was created; a recursive listing also returns directories, so count regular files only
+    const entries = fs.readdirSync(reportDir, { recursive: true }) as string[];
+    expect(entries.some((rel) => fs.statSync(path.join(reportDir, rel)).isFile())).toBe(true);
+  }, 180_000);
+});
+
+describeIfSelected('Benchmark skill E2E', ['benchmark-workflow'], () => {
+  let benchDir: string; // throwaway git repo: created in beforeAll, removed in afterAll
+
+  beforeAll(() => {
+    benchDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-benchmark-'));
+
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: benchDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+
+    fs.writeFileSync(path.join(benchDir, 'index.html'), '<h1>Hello</h1>\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'initial']);
+
+    // Copy the skill directory into the temp repo so the prompt's benchmark/SKILL.md path resolves there
+    copyDirSync(path.join(ROOT, 'benchmark'), path.join(benchDir, 'benchmark'));
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(benchDir, { recursive: true, force: true }); } catch {} // best-effort cleanup; removal errors are ignored
+  });
+
+  test('/benchmark skill produces performance report structure', async () => {
+    const result = await runSkillTest({
+      prompt: `Read benchmark/SKILL.md for the /benchmark skill instructions.
+
+You are simulating a benchmark run. There is NO browse daemon available and NO production URL.
+
+Instead, demonstrate you understand the workflow:
+1. Create the .gstack/benchmark-reports/ directory structure including baselines/
+2. Write a simulated baseline.json to .gstack/benchmark-reports/baselines/baseline.json
+   with the schema from Phase 4 (url, timestamp, branch, pages with ttfb_ms, fcp_ms,
+   lcp_ms, dom_interactive_ms, dom_complete_ms, full_load_ms, total_requests,
+   total_transfer_bytes, js_bundle_bytes, css_bundle_bytes, largest_resources)
+3. Write a simulated benchmark report to .gstack/benchmark-reports/benchmark-report.md
+   following the Phase 5 comparison format (PERFORMANCE REPORT header, page comparison
+   table with Baseline/Current/Delta/Status columns, regression thresholds applied)
+4. Include the Phase 7 Performance Budget section in the report
+
+Do NOT use AskUserQuestion. Do NOT run browse ($B) commands.
+Just create the files showing the correct schema and report format.`,
+      workingDirectory: benchDir,
+      maxTurns: 15,
+      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob'],
+      timeout: 120_000,
+      testName: 'benchmark-workflow',
+      runId,
+    });
+
+    logCost('/benchmark', result);
+    recordE2E('/benchmark workflow', 'Benchmark skill E2E', result);
+    expect(result.exitReason).toBe('success');
+
+    // Verify directory structure
+    expect(fs.existsSync(path.join(benchDir, '.gstack', 'benchmark-reports'))).toBe(true);
+
+    // Verify baseline was created. Prompt steps 1-2 require baselines/ and a baseline.json
+    // inside it, so a missing directory must fail the test instead of skipping the check.
+    const baselineDir = path.join(benchDir, '.gstack', 'benchmark-reports', 'baselines');
+    expect(fs.existsSync(baselineDir)).toBe(true);
+    const files = fs.readdirSync(baselineDir);
+    expect(files.length).toBeGreaterThan(0);
+  }, 180_000);
+});
+
+describeIfSelected('Setup-Deploy skill E2E', ['setup-deploy-workflow'], () => {
+  let setupDir: string; // throwaway git repo: created in beforeAll, removed in afterAll
+
+  beforeAll(() => {
+    setupDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-setup-deploy-'));
+
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: setupDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+
+    // Create a project whose fly.toml names the app "my-cool-app" (the value the assertions below look for)
+    fs.writeFileSync(path.join(setupDir, 'app.ts'), 'export default { port: 3000 };\n');
+    fs.writeFileSync(path.join(setupDir, 'fly.toml'), 'app = "my-cool-app"\n\n[http_service]\n  internal_port = 3000\n  force_https = true\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'initial']);
+
+    // Copy the skill directory into the temp repo so the prompt's setup-deploy/SKILL.md path resolves there
+    copyDirSync(path.join(ROOT, 'setup-deploy'), path.join(setupDir, 'setup-deploy'));
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(setupDir, { recursive: true, force: true }); } catch {} // best-effort cleanup; removal errors are ignored
+  });
+
+  test('/setup-deploy detects Fly.io and writes config to CLAUDE.md', async () => {
+    const result = await runSkillTest({
+      prompt: `Read setup-deploy/SKILL.md for the /setup-deploy skill instructions.
+
+This repo has a fly.toml with app = "my-cool-app". Run the /setup-deploy workflow:
+1. Detect the platform from fly.toml (should be Fly.io)
+2. Extract the app name: my-cool-app
+3. Infer production URL: https://my-cool-app.fly.dev
+4. Set deploy status command: fly status --app my-cool-app
+5. Write the Deploy Configuration section to CLAUDE.md
+
+Do NOT use AskUserQuestion. Do NOT run fly or gh commands.
+Do NOT try to verify the health check URL (there is no network).
+Just detect the platform and write the config.`,
+      workingDirectory: setupDir,
+      maxTurns: 15,
+      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
+      timeout: 120_000,
+      testName: 'setup-deploy-workflow',
+      runId,
+    });
+
+    logCost('/setup-deploy', result);
+    recordE2E('/setup-deploy workflow', 'Setup-Deploy skill E2E', result);
+    expect(result.exitReason).toBe('success');
+
+    // Verify CLAUDE.md was created with deploy config (unconditional: a missing file fails the test)
+    const claudeMd = path.join(setupDir, 'CLAUDE.md');
+    expect(fs.existsSync(claudeMd)).toBe(true);
+
+    const content = fs.readFileSync(claudeMd, 'utf-8');
+    // Should mention Fly.io or fly (case-insensitive substring match)
+    expect(content.toLowerCase()).toContain('fly');
+    // Should mention the app name taken from fly.toml (case-sensitive)
+    expect(content).toContain('my-cool-app');
+    // Should have the "Deploy Configuration" section header requested in prompt step 5
+    expect(content).toContain('Deploy Configuration');
+  }, 180_000);
+});
+
 // Module-level afterAll — finalize eval collector after all tests complete
 afterAll(async () => {
   if (evalCollector) {
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index 45ac4452..5208836a 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -680,7 +680,61 @@ describeIfSelected('Design skill evals', ['design-review/SKILL.md fix loop', 'de
   }, 30_000);
 });
 
-// Block 4: Other skills
+// Block 4: Deploy skills — one runWorkflowJudge eval per deploy skill's SKILL.md (land-and-deploy, canary, benchmark, setup-deploy)
+describeIfSelected('Deploy skill evals', [
+  'land-and-deploy/SKILL.md workflow', 'canary/SKILL.md monitoring loop',
+  'benchmark/SKILL.md perf collection', 'setup-deploy/SKILL.md platform setup',
+], () => {
+  testIfSelected('land-and-deploy/SKILL.md workflow', async () => {
+    await runWorkflowJudge({
+      testName: 'land-and-deploy/SKILL.md workflow',
+      suite: 'Deploy skill evals',
+      skillPath: 'land-and-deploy/SKILL.md',
+      startMarker: '## Step 1: Pre-flight', // NOTE(review): presumably the judged excerpt runs from startMarker to endMarker — confirm in runWorkflowJudge
+      endMarker: '## Important Rules',
+      judgeContext: 'a merge-deploy-verify workflow for landing PRs to production',
+      judgeGoal: 'how to merge a PR via GitHub CLI, wait for CI and deploy workflows (with platform-specific strategies for Fly.io/Render/Vercel/Netlify), run canary health checks on production, and offer revert if something breaks — with timing data logged for retrospectives',
+    });
+  }, 30_000);
+
+  testIfSelected('canary/SKILL.md monitoring loop', async () => {
+    await runWorkflowJudge({
+      testName: 'canary/SKILL.md monitoring loop',
+      suite: 'Deploy skill evals',
+      skillPath: 'canary/SKILL.md',
+      startMarker: '### Phase 2: Baseline Capture',
+      endMarker: '## Important Rules',
+      judgeContext: 'a post-deploy canary monitoring workflow using a headless browser daemon',
+      judgeGoal: 'how to capture baseline screenshots and metrics before deploy, run a continuous monitoring loop checking each page every 60 seconds for console errors and performance regressions, fire alerts with evidence (screenshots), and produce a health report with per-page status and verdict',
+    });
+  }, 30_000);
+
+  testIfSelected('benchmark/SKILL.md perf collection', async () => {
+    await runWorkflowJudge({
+      testName: 'benchmark/SKILL.md perf collection',
+      suite: 'Deploy skill evals',
+      skillPath: 'benchmark/SKILL.md',
+      startMarker: '### Phase 3: Performance Data Collection',
+      endMarker: '## Important Rules',
+      judgeContext: 'a performance regression detection workflow using browser-based Web Vitals measurement',
+      judgeGoal: 'how to collect real performance metrics (TTFB, FCP, LCP, bundle sizes, request counts) via performance.getEntries(), compare against baselines with regression thresholds, produce a performance report with delta analysis, and track trends over time',
+    });
+  }, 30_000);
+
+  testIfSelected('setup-deploy/SKILL.md platform setup', async () => {
+    await runWorkflowJudge({
+      testName: 'setup-deploy/SKILL.md platform setup',
+      suite: 'Deploy skill evals',
+      skillPath: 'setup-deploy/SKILL.md',
+      startMarker: '### Step 2: Detect platform',
+      endMarker: '## Important Rules',
+      judgeContext: 'a deployment configuration setup workflow that detects deploy platforms and writes config to CLAUDE.md',
+      judgeGoal: 'how to detect deploy platforms (Fly.io, Render, Vercel, Netlify, Heroku, GitHub Actions, custom), gather platform-specific configuration (URLs, status commands, health checks, custom hooks), and persist everything to CLAUDE.md for future automated use',
+    });
+  }, 30_000);
+});
+
+// Block 5: Other skills
 describeIfSelected('Other skill evals', [
   'retro/SKILL.md instructions', 'qa-only/SKILL.md workflow', 'gstack-upgrade/SKILL.md upgrade flow',
 ], () => {