test: E2E + LLM-judge evals for deploy skills

- 4 E2E tests: land-and-deploy (Fly.io detection + deploy report),
  canary (monitoring report structure), benchmark (perf report schema),
  setup-deploy (platform detection → CLAUDE.md config)
- 4 LLM-judge evals: workflow quality for all 4 new skills
- Touchfile entries for diff-based test selection (E2E + LLM-judge)
- 460 free tests pass, 0 fail

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-20 07:16:45 -07:00
parent 198cd2dcad
commit 0d1d2e970b
3 changed files with 344 additions and 1 deletion
+12
View File
@@ -98,6 +98,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// gstack-upgrade
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
// Deploy skills
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
'canary-workflow': ['canary/**', 'browse/src/**'],
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
// Skill routing — journey-stage tests (depend on ALL skill descriptions)
'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
@@ -140,6 +146,12 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
// Deploy skills
'land-and-deploy/SKILL.md workflow': ['land-and-deploy/SKILL.md', 'land-and-deploy/SKILL.md.tmpl'],
'canary/SKILL.md monitoring loop': ['canary/SKILL.md', 'canary/SKILL.md.tmpl'],
'benchmark/SKILL.md perf collection': ['benchmark/SKILL.md', 'benchmark/SKILL.md.tmpl'],
'setup-deploy/SKILL.md platform setup': ['setup-deploy/SKILL.md', 'setup-deploy/SKILL.md.tmpl'],
// Other skills
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
+277
View File
@@ -2911,6 +2911,283 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
}, 360_000);
});
// --- Land-and-Deploy / Canary / Benchmark / Setup-Deploy E2E ---
// E2E: /land-and-deploy must detect Fly.io from fly.toml and produce the
// deploy-report directory structure (simulated — no remote, no gh/fly calls).
describeIfSelected('Land-and-Deploy skill E2E', ['land-and-deploy-workflow'], () => {
  let landDir: string;
  beforeAll(() => {
    landDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-deploy-'));
    // Fail fast on fixture-setup errors: the original silently ignored
    // spawnSync failures, so a broken `git init`/`commit` surfaced later as a
    // confusing skill-run failure instead of a clear setup error.
    const run = (cmd: string, args: string[]) => {
      const res = spawnSync(cmd, args, { cwd: landDir, stdio: 'pipe', timeout: 5000 });
      if (res.error || res.status !== 0) {
        throw new Error(
          `fixture setup failed: ${cmd} ${args.join(' ')}: ` +
          `${res.error?.message ?? res.stderr?.toString() ?? `exit ${res.status}`}`,
        );
      }
      return res;
    };
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    // Initial app with a fly.toml so the skill can detect the Fly.io platform.
    fs.writeFileSync(path.join(landDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    fs.writeFileSync(path.join(landDir, 'fly.toml'), 'app = "test-app"\n\n[http_service]\n internal_port = 3000\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
    // Feature branch with a change, so there is a real diff against main.
    run('git', ['checkout', '-b', 'feat/add-deploy']);
    fs.writeFileSync(path.join(landDir, 'app.ts'), 'export function hello() { return "deployed"; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'feat: update hello']);
    // Copy the skill under test into the sandbox repo.
    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(landDir, 'land-and-deploy'));
  });
  afterAll(() => {
    // Best-effort cleanup; never fail the suite on temp-dir removal errors.
    try { fs.rmSync(landDir, { recursive: true, force: true }); } catch {}
  });
  test('/land-and-deploy detects Fly.io platform and produces deploy report structure', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
You are on branch feat/add-deploy with changes against main. This repo has a fly.toml
with app = "test-app", indicating a Fly.io deployment.
IMPORTANT: There is NO remote and NO GitHub PR — you cannot run gh commands.
Instead, simulate the workflow:
1. Detect the deploy platform from fly.toml (should find Fly.io, app = test-app)
2. Infer the production URL (https://test-app.fly.dev)
3. Note the merge method would be squash
4. Write the deploy configuration to CLAUDE.md
5. Write a deploy report skeleton to .gstack/deploy-reports/report.md showing the
expected report structure (PR number: simulated, timing: simulated, verdict: simulated)
Do NOT use AskUserQuestion. Do NOT run gh or fly commands.`,
      workingDirectory: landDir,
      maxTurns: 20,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-workflow',
      runId,
    });
    logCost('/land-and-deploy', result);
    recordE2E('/land-and-deploy workflow', 'Land-and-Deploy skill E2E', result);
    expect(result.exitReason).toBe('success');
    // CLAUDE.md is checked leniently (only when present) because the model may
    // structure the config differently; the hard requirement is the report dir.
    const claudeMd = path.join(landDir, 'CLAUDE.md');
    if (fs.existsSync(claudeMd)) {
      // Lowercase once instead of per-predicate, then check either marker.
      const content = fs.readFileSync(claudeMd, 'utf-8').toLowerCase();
      expect(content.includes('fly') || content.includes('test-app')).toBe(true);
    }
    // Prompt step 5 requires the deploy-report directory to exist.
    const reportDir = path.join(landDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);
  }, 180_000);
});
// E2E: /canary must lay down the canary-report directory and simulated
// baseline/report files matching the skill's documented schema.
describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
  let workDir: string;
  beforeAll(() => {
    workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-canary-'));
    const exec = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: workDir, stdio: 'pipe', timeout: 5000 });
    // Minimal single-page repo committed on main.
    exec('git', ['init', '-b', 'main']);
    exec('git', ['config', 'user.email', 'test@test.com']);
    exec('git', ['config', 'user.name', 'Test']);
    fs.writeFileSync(path.join(workDir, 'index.html'), '<h1>Hello</h1>\n');
    exec('git', ['add', '.']);
    exec('git', ['commit', '-m', 'initial']);
    // Drop the skill under test into the sandbox.
    copyDirSync(path.join(ROOT, 'canary'), path.join(workDir, 'canary'));
  });
  afterAll(() => {
    // Cleanup is best-effort; a failed rm must not fail the suite.
    try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
  });
  test('/canary skill produces monitoring report structure', async () => {
    const result = await runSkillTest({
      prompt: `Read canary/SKILL.md for the /canary skill instructions.
You are simulating a canary check. There is NO browse daemon available and NO production URL.
Instead, demonstrate you understand the workflow:
1. Create the .gstack/canary-reports/ directory structure
2. Write a simulated baseline.json to .gstack/canary-reports/baseline.json with the
schema described in Phase 2 of the skill (url, timestamp, branch, pages with
screenshot path, console_errors count, and load_time_ms)
3. Write a simulated canary report to .gstack/canary-reports/canary-report.md following
the Phase 6 Health Report format (CANARY REPORT header, duration, pages, status,
per-page results table, verdict)
Do NOT use AskUserQuestion. Do NOT run browse ($B) commands.
Just create the directory structure and report files showing the correct schema.`,
      workingDirectory: workDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob'],
      timeout: 120_000,
      testName: 'canary-workflow',
      runId,
    });
    logCost('/canary', result);
    recordE2E('/canary workflow', 'Canary skill E2E', result);
    expect(result.exitReason).toBe('success');
    // Report directory must exist and must contain at least one artifact
    // (baseline.json and/or canary-report.md).
    const reportDir = path.join(workDir, '.gstack', 'canary-reports');
    expect(fs.existsSync(reportDir)).toBe(true);
    const produced = fs.readdirSync(reportDir, { recursive: true });
    expect(produced.length).toBeGreaterThan(0);
  }, 180_000);
});
// E2E: /benchmark must lay down the benchmark-report tree (including
// baselines/) with files matching the skill's documented schema.
describeIfSelected('Benchmark skill E2E', ['benchmark-workflow'], () => {
  let repoDir: string;
  beforeAll(() => {
    repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-benchmark-'));
    const sh = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: repoDir, stdio: 'pipe', timeout: 5000 });
    // Minimal single-page repo committed on main.
    sh('git', ['init', '-b', 'main']);
    sh('git', ['config', 'user.email', 'test@test.com']);
    sh('git', ['config', 'user.name', 'Test']);
    fs.writeFileSync(path.join(repoDir, 'index.html'), '<h1>Hello</h1>\n');
    sh('git', ['add', '.']);
    sh('git', ['commit', '-m', 'initial']);
    // Drop the skill under test into the sandbox.
    copyDirSync(path.join(ROOT, 'benchmark'), path.join(repoDir, 'benchmark'));
  });
  afterAll(() => {
    // Cleanup is best-effort; a failed rm must not fail the suite.
    try { fs.rmSync(repoDir, { recursive: true, force: true }); } catch {}
  });
  test('/benchmark skill produces performance report structure', async () => {
    const result = await runSkillTest({
      prompt: `Read benchmark/SKILL.md for the /benchmark skill instructions.
You are simulating a benchmark run. There is NO browse daemon available and NO production URL.
Instead, demonstrate you understand the workflow:
1. Create the .gstack/benchmark-reports/ directory structure including baselines/
2. Write a simulated baseline.json to .gstack/benchmark-reports/baselines/baseline.json
with the schema from Phase 4 (url, timestamp, branch, pages with ttfb_ms, fcp_ms,
lcp_ms, dom_interactive_ms, dom_complete_ms, full_load_ms, total_requests,
total_transfer_bytes, js_bundle_bytes, css_bundle_bytes, largest_resources)
3. Write a simulated benchmark report to .gstack/benchmark-reports/benchmark-report.md
following the Phase 5 comparison format (PERFORMANCE REPORT header, page comparison
table with Baseline/Current/Delta/Status columns, regression thresholds applied)
4. Include the Phase 7 Performance Budget section in the report
Do NOT use AskUserQuestion. Do NOT run browse ($B) commands.
Just create the files showing the correct schema and report format.`,
      workingDirectory: repoDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob'],
      timeout: 120_000,
      testName: 'benchmark-workflow',
      runId,
    });
    logCost('/benchmark', result);
    recordE2E('/benchmark workflow', 'Benchmark skill E2E', result);
    expect(result.exitReason).toBe('success');
    // The report root is a hard requirement; baselines/ is checked leniently
    // (only when present) since the model may name the subtree differently.
    expect(fs.existsSync(path.join(repoDir, '.gstack', 'benchmark-reports'))).toBe(true);
    const baselineDir = path.join(repoDir, '.gstack', 'benchmark-reports', 'baselines');
    if (fs.existsSync(baselineDir)) {
      expect(fs.readdirSync(baselineDir).length).toBeGreaterThan(0);
    }
  }, 180_000);
});
// E2E: /setup-deploy must detect Fly.io from fly.toml and persist a
// "Deploy Configuration" section (platform, app name) to CLAUDE.md.
describeIfSelected('Setup-Deploy skill E2E', ['setup-deploy-workflow'], () => {
  let setupDir: string;
  beforeAll(() => {
    setupDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-setup-deploy-'));
    // Fail fast on fixture-setup errors: the original silently ignored
    // spawnSync failures, so a broken `git init`/`commit` surfaced later as a
    // confusing skill-run failure instead of a clear setup error.
    const run = (cmd: string, args: string[]) => {
      const res = spawnSync(cmd, args, { cwd: setupDir, stdio: 'pipe', timeout: 5000 });
      if (res.error || res.status !== 0) {
        throw new Error(
          `fixture setup failed: ${cmd} ${args.join(' ')}: ` +
          `${res.error?.message ?? res.stderr?.toString() ?? `exit ${res.status}`}`,
        );
      }
      return res;
    };
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    // A project with a fly.toml so the skill can detect Fly.io and the app name.
    fs.writeFileSync(path.join(setupDir, 'app.ts'), 'export default { port: 3000 };\n');
    fs.writeFileSync(path.join(setupDir, 'fly.toml'), 'app = "my-cool-app"\n\n[http_service]\n internal_port = 3000\n force_https = true\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
    // Copy the skill under test into the sandbox repo.
    copyDirSync(path.join(ROOT, 'setup-deploy'), path.join(setupDir, 'setup-deploy'));
  });
  afterAll(() => {
    // Best-effort cleanup; never fail the suite on temp-dir removal errors.
    try { fs.rmSync(setupDir, { recursive: true, force: true }); } catch {}
  });
  test('/setup-deploy detects Fly.io and writes config to CLAUDE.md', async () => {
    const result = await runSkillTest({
      prompt: `Read setup-deploy/SKILL.md for the /setup-deploy skill instructions.
This repo has a fly.toml with app = "my-cool-app". Run the /setup-deploy workflow:
1. Detect the platform from fly.toml (should be Fly.io)
2. Extract the app name: my-cool-app
3. Infer production URL: https://my-cool-app.fly.dev
4. Set deploy status command: fly status --app my-cool-app
5. Write the Deploy Configuration section to CLAUDE.md
Do NOT use AskUserQuestion. Do NOT run fly or gh commands.
Do NOT try to verify the health check URL (there is no network).
Just detect the platform and write the config.`,
      workingDirectory: setupDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'setup-deploy-workflow',
      runId,
    });
    logCost('/setup-deploy', result);
    recordE2E('/setup-deploy workflow', 'Setup-Deploy skill E2E', result);
    expect(result.exitReason).toBe('success');
    // Unlike the land-and-deploy test, CLAUDE.md is a hard requirement here:
    // writing it IS the skill's entire job.
    const claudeMd = path.join(setupDir, 'CLAUDE.md');
    expect(fs.existsSync(claudeMd)).toBe(true);
    const content = fs.readFileSync(claudeMd, 'utf-8');
    // Must mention the platform (Fly.io / fly)…
    expect(content.toLowerCase()).toContain('fly');
    // …the detected app name…
    expect(content).toContain('my-cool-app');
    // …and the section header the skill is told to write.
    expect(content).toContain('Deploy Configuration');
  }, 180_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
+55 -1
View File
@@ -680,7 +680,61 @@ describeIfSelected('Design skill evals', ['design-review/SKILL.md fix loop', 'de
}, 30_000);
});
// Block 4: Other skills
// Block 4: Deploy skills
describeIfSelected('Deploy skill evals', [
  'land-and-deploy/SKILL.md workflow', 'canary/SKILL.md monitoring loop',
  'benchmark/SKILL.md perf collection', 'setup-deploy/SKILL.md platform setup',
], () => {
  // Table-driven registration: every entry becomes one LLM-judge test with the
  // same 30s budget; runWorkflowJudge receives the row plus the shared suite.
  const judgeCases = [
    {
      testName: 'land-and-deploy/SKILL.md workflow',
      skillPath: 'land-and-deploy/SKILL.md',
      startMarker: '## Step 1: Pre-flight',
      endMarker: '## Important Rules',
      judgeContext: 'a merge-deploy-verify workflow for landing PRs to production',
      judgeGoal: 'how to merge a PR via GitHub CLI, wait for CI and deploy workflows (with platform-specific strategies for Fly.io/Render/Vercel/Netlify), run canary health checks on production, and offer revert if something breaks — with timing data logged for retrospectives',
    },
    {
      testName: 'canary/SKILL.md monitoring loop',
      skillPath: 'canary/SKILL.md',
      startMarker: '### Phase 2: Baseline Capture',
      endMarker: '## Important Rules',
      judgeContext: 'a post-deploy canary monitoring workflow using a headless browser daemon',
      judgeGoal: 'how to capture baseline screenshots and metrics before deploy, run a continuous monitoring loop checking each page every 60 seconds for console errors and performance regressions, fire alerts with evidence (screenshots), and produce a health report with per-page status and verdict',
    },
    {
      testName: 'benchmark/SKILL.md perf collection',
      skillPath: 'benchmark/SKILL.md',
      startMarker: '### Phase 3: Performance Data Collection',
      endMarker: '## Important Rules',
      judgeContext: 'a performance regression detection workflow using browser-based Web Vitals measurement',
      judgeGoal: 'how to collect real performance metrics (TTFB, FCP, LCP, bundle sizes, request counts) via performance.getEntries(), compare against baselines with regression thresholds, produce a performance report with delta analysis, and track trends over time',
    },
    {
      testName: 'setup-deploy/SKILL.md platform setup',
      skillPath: 'setup-deploy/SKILL.md',
      startMarker: '### Step 2: Detect platform',
      endMarker: '## Important Rules',
      judgeContext: 'a deployment configuration setup workflow that detects deploy platforms and writes config to CLAUDE.md',
      judgeGoal: 'how to detect deploy platforms (Fly.io, Render, Vercel, Netlify, Heroku, GitHub Actions, custom), gather platform-specific configuration (URLs, status commands, health checks, custom hooks), and persist everything to CLAUDE.md for future automated use',
    },
  ];
  for (const judgeCase of judgeCases) {
    testIfSelected(judgeCase.testName, async () => {
      await runWorkflowJudge({ suite: 'Deploy skill evals', ...judgeCase });
    }, 30_000);
  }
});
// Block 5: Other skills
describeIfSelected('Other skill evals', [
'retro/SKILL.md instructions', 'qa-only/SKILL.md workflow', 'gstack-upgrade/SKILL.md upgrade flow',
], () => {