mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 21:25:27 +02:00
test: E2E + LLM-judge evals for deploy skills
- 4 E2E tests: land-and-deploy (Fly.io detection + deploy report), canary (monitoring report structure), benchmark (perf report schema), setup-deploy (platform detection → CLAUDE.md config)
- 4 LLM-judge evals: workflow quality for all 4 new skills
- Touchfile entries for diff-based test selection (E2E + LLM-judge)
- 460 free tests pass, 0 fail

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -98,6 +98,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// gstack-upgrade
|
||||
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
|
||||
|
||||
// Deploy skills
|
||||
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
|
||||
'canary-workflow': ['canary/**', 'browse/src/**'],
|
||||
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
|
||||
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Skill routing — journey-stage tests (depend on ALL skill descriptions)
|
||||
'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
@@ -140,6 +146,12 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
|
||||
'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
|
||||
'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
|
||||
|
||||
// Deploy skills
|
||||
'land-and-deploy/SKILL.md workflow': ['land-and-deploy/SKILL.md', 'land-and-deploy/SKILL.md.tmpl'],
|
||||
'canary/SKILL.md monitoring loop': ['canary/SKILL.md', 'canary/SKILL.md.tmpl'],
|
||||
'benchmark/SKILL.md perf collection': ['benchmark/SKILL.md', 'benchmark/SKILL.md.tmpl'],
|
||||
'setup-deploy/SKILL.md platform setup': ['setup-deploy/SKILL.md', 'setup-deploy/SKILL.md.tmpl'],
|
||||
|
||||
// Other skills
|
||||
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
|
||||
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
|
||||
|
||||
@@ -2911,6 +2911,283 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
// --- Land-and-Deploy / Canary / Benchmark / Setup-Deploy E2E ---
|
||||
|
||||
/**
 * E2E coverage for the /land-and-deploy skill.
 *
 * The fixture is a throwaway git repo in the OS temp directory with a `main`
 * branch, a `feat/add-deploy` branch carrying one change, a `fly.toml`, and a
 * copy of the `land-and-deploy` skill directory. The agent is told to simulate
 * the workflow (no `gh` / `fly` commands) and to write its output to disk, so
 * the assertions below inspect the files it leaves behind.
 */
describeIfSelected('Land-and-Deploy skill E2E', ['land-and-deploy-workflow'], () => {
  let landDir: string;

  beforeAll(() => {
    landDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-deploy-'));

    // Every fixture command runs inside the temp repo with a 5000 timeout.
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: landDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    // Create initial app
    fs.writeFileSync(path.join(landDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    fs.writeFileSync(path.join(landDir, 'fly.toml'), 'app = "test-app"\n\n[http_service]\n internal_port = 3000\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

    // Create feature branch with changes
    run('git', ['checkout', '-b', 'feat/add-deploy']);
    fs.writeFileSync(path.join(landDir, 'app.ts'), 'export function hello() { return "deployed"; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'feat: update hello']);

    // Copy skill
    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(landDir, 'land-and-deploy'));
  });

  afterAll(() => {
    // Best-effort cleanup: a failure to delete the temp dir must not fail the suite.
    try { fs.rmSync(landDir, { recursive: true, force: true }); } catch {}
  });

  test('/land-and-deploy detects Fly.io platform and produces deploy report structure', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.

You are on branch feat/add-deploy with changes against main. This repo has a fly.toml
with app = "test-app", indicating a Fly.io deployment.

IMPORTANT: There is NO remote and NO GitHub PR — you cannot run gh commands.
Instead, simulate the workflow:
1. Detect the deploy platform from fly.toml (should find Fly.io, app = test-app)
2. Infer the production URL (https://test-app.fly.dev)
3. Note the merge method would be squash
4. Write the deploy configuration to CLAUDE.md
5. Write a deploy report skeleton to .gstack/deploy-reports/report.md showing the
expected report structure (PR number: simulated, timing: simulated, verdict: simulated)

Do NOT use AskUserQuestion. Do NOT run gh or fly commands.`,
      workingDirectory: landDir,
      maxTurns: 20,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-workflow',
      runId,
    });

    logCost('/land-and-deploy', result);
    recordE2E('/land-and-deploy workflow', 'Land-and-Deploy skill E2E', result);
    expect(result.exitReason).toBe('success');

    // Verify deploy config was written to CLAUDE.md.
    // Step 4 of the prompt asks for this file explicitly, so its absence is a
    // failure; guarding the content check behind existsSync() would let a run
    // that never wrote the file pass without checking anything.
    const claudeMd = path.join(landDir, 'CLAUDE.md');
    expect(fs.existsSync(claudeMd)).toBe(true);

    const content = fs.readFileSync(claudeMd, 'utf-8');
    const hasFly = content.toLowerCase().includes('fly') || content.toLowerCase().includes('test-app');
    expect(hasFly).toBe(true);

    // Verify deploy report directory was created
    const reportDir = path.join(landDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);
  }, 180_000);
});
|
||||
|
||||
/**
 * E2E coverage for the /canary skill.
 *
 * Sets up a one-commit git repo in the OS temp directory containing an
 * index.html plus a copy of the `canary` skill directory, then asks the agent
 * to write simulated baseline/report files without using the browse daemon.
 */
describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
  let canaryDir: string;

  beforeAll(() => {
    canaryDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-canary-'));

    // Runs one git subcommand inside the fixture repo.
    const git = (...gitArgs: string[]) =>
      spawnSync('git', gitArgs, { cwd: canaryDir, stdio: 'pipe', timeout: 5000 });

    git('init', '-b', 'main');
    git('config', 'user.email', 'test@test.com');
    git('config', 'user.name', 'Test');

    fs.writeFileSync(path.join(canaryDir, 'index.html'), '<h1>Hello</h1>\n');
    git('add', '.');
    git('commit', '-m', 'initial');

    // Make the skill readable from inside the fixture repo.
    const skillSource = path.join(ROOT, 'canary');
    const skillTarget = path.join(canaryDir, 'canary');
    copyDirSync(skillSource, skillTarget);
  });

  afterAll(() => {
    // Cleanup is best-effort; errors while removing the temp dir are ignored.
    try {
      fs.rmSync(canaryDir, { recursive: true, force: true });
    } catch {}
  });

  test('/canary skill produces monitoring report structure', async () => {
    const canaryPrompt = `Read canary/SKILL.md for the /canary skill instructions.

You are simulating a canary check. There is NO browse daemon available and NO production URL.

Instead, demonstrate you understand the workflow:
1. Create the .gstack/canary-reports/ directory structure
2. Write a simulated baseline.json to .gstack/canary-reports/baseline.json with the
schema described in Phase 2 of the skill (url, timestamp, branch, pages with
screenshot path, console_errors count, and load_time_ms)
3. Write a simulated canary report to .gstack/canary-reports/canary-report.md following
the Phase 6 Health Report format (CANARY REPORT header, duration, pages, status,
per-page results table, verdict)

Do NOT use AskUserQuestion. Do NOT run browse ($B) commands.
Just create the directory structure and report files showing the correct schema.`;

    const result = await runSkillTest({
      prompt: canaryPrompt,
      workingDirectory: canaryDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob'],
      timeout: 120_000,
      testName: 'canary-workflow',
      runId,
    });

    logCost('/canary', result);
    recordE2E('/canary workflow', 'Canary skill E2E', result);
    expect(result.exitReason).toBe('success');

    // The reports directory must exist...
    const reportsDir = path.join(canaryDir, '.gstack', 'canary-reports');
    const reportsDirExists = fs.existsSync(reportsDir);
    expect(reportsDirExists).toBe(true);

    // ...and must hold at least one entry (baseline or report), at any depth.
    const entries = fs.readdirSync(reportsDir, { recursive: true }) as string[];
    expect(entries.length).toBeGreaterThan(0);
  }, 180_000);
});
|
||||
|
||||
/**
 * E2E coverage for the /benchmark skill.
 *
 * The fixture is a one-commit git repo in the OS temp directory containing an
 * index.html and a copy of the `benchmark` skill directory. The agent is asked
 * to write a simulated baseline and report (no browse daemon), and the
 * assertions check the directory layout it produced.
 */
describeIfSelected('Benchmark skill E2E', ['benchmark-workflow'], () => {
  let benchDir: string;

  beforeAll(() => {
    benchDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-benchmark-'));

    // Every fixture command runs inside the temp repo with a 5000 timeout.
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: benchDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(benchDir, 'index.html'), '<h1>Hello</h1>\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

    // Copy skill
    copyDirSync(path.join(ROOT, 'benchmark'), path.join(benchDir, 'benchmark'));
  });

  afterAll(() => {
    // Best-effort cleanup: a failure to delete the temp dir must not fail the suite.
    try { fs.rmSync(benchDir, { recursive: true, force: true }); } catch {}
  });

  test('/benchmark skill produces performance report structure', async () => {
    const result = await runSkillTest({
      prompt: `Read benchmark/SKILL.md for the /benchmark skill instructions.

You are simulating a benchmark run. There is NO browse daemon available and NO production URL.

Instead, demonstrate you understand the workflow:
1. Create the .gstack/benchmark-reports/ directory structure including baselines/
2. Write a simulated baseline.json to .gstack/benchmark-reports/baselines/baseline.json
with the schema from Phase 4 (url, timestamp, branch, pages with ttfb_ms, fcp_ms,
lcp_ms, dom_interactive_ms, dom_complete_ms, full_load_ms, total_requests,
total_transfer_bytes, js_bundle_bytes, css_bundle_bytes, largest_resources)
3. Write a simulated benchmark report to .gstack/benchmark-reports/benchmark-report.md
following the Phase 5 comparison format (PERFORMANCE REPORT header, page comparison
table with Baseline/Current/Delta/Status columns, regression thresholds applied)
4. Include the Phase 7 Performance Budget section in the report

Do NOT use AskUserQuestion. Do NOT run browse ($B) commands.
Just create the files showing the correct schema and report format.`,
      workingDirectory: benchDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob'],
      timeout: 120_000,
      testName: 'benchmark-workflow',
      runId,
    });

    logCost('/benchmark', result);
    recordE2E('/benchmark workflow', 'Benchmark skill E2E', result);
    expect(result.exitReason).toBe('success');

    // Verify directory structure
    expect(fs.existsSync(path.join(benchDir, '.gstack', 'benchmark-reports'))).toBe(true);

    // Verify baseline was created.
    // Steps 1-2 of the prompt ask for baselines/ and a baseline file inside it,
    // so a missing directory is a failure; guarding this behind existsSync()
    // would let a run that never created a baseline pass unchecked.
    const baselineDir = path.join(benchDir, '.gstack', 'benchmark-reports', 'baselines');
    expect(fs.existsSync(baselineDir)).toBe(true);

    const files = fs.readdirSync(baselineDir);
    expect(files.length).toBeGreaterThan(0);
  }, 180_000);
});
|
||||
|
||||
/**
 * E2E coverage for the /setup-deploy skill.
 *
 * Sets up a one-commit git repo in the OS temp directory with an app.ts, a
 * fly.toml naming the app "my-cool-app", and a copy of the `setup-deploy`
 * skill directory. The agent is asked to detect the platform and write the
 * deploy configuration to CLAUDE.md, which the assertions then read back.
 */
describeIfSelected('Setup-Deploy skill E2E', ['setup-deploy-workflow'], () => {
  let setupDir: string;

  beforeAll(() => {
    setupDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-setup-deploy-'));

    // Runs one git subcommand inside the fixture repo.
    const git = (...gitArgs: string[]) =>
      spawnSync('git', gitArgs, { cwd: setupDir, stdio: 'pipe', timeout: 5000 });

    git('init', '-b', 'main');
    git('config', 'user.email', 'test@test.com');
    git('config', 'user.name', 'Test');

    // Project files: a trivial app plus the fly.toml the skill should detect.
    const appFile = path.join(setupDir, 'app.ts');
    const flyToml = path.join(setupDir, 'fly.toml');
    fs.writeFileSync(appFile, 'export default { port: 3000 };\n');
    fs.writeFileSync(flyToml, 'app = "my-cool-app"\n\n[http_service]\n internal_port = 3000\n force_https = true\n');
    git('add', '.');
    git('commit', '-m', 'initial');

    // Make the skill readable from inside the fixture repo.
    const skillSource = path.join(ROOT, 'setup-deploy');
    const skillTarget = path.join(setupDir, 'setup-deploy');
    copyDirSync(skillSource, skillTarget);
  });

  afterAll(() => {
    // Cleanup is best-effort; errors while removing the temp dir are ignored.
    try {
      fs.rmSync(setupDir, { recursive: true, force: true });
    } catch {}
  });

  test('/setup-deploy detects Fly.io and writes config to CLAUDE.md', async () => {
    const setupPrompt = `Read setup-deploy/SKILL.md for the /setup-deploy skill instructions.

This repo has a fly.toml with app = "my-cool-app". Run the /setup-deploy workflow:
1. Detect the platform from fly.toml (should be Fly.io)
2. Extract the app name: my-cool-app
3. Infer production URL: https://my-cool-app.fly.dev
4. Set deploy status command: fly status --app my-cool-app
5. Write the Deploy Configuration section to CLAUDE.md

Do NOT use AskUserQuestion. Do NOT run fly or gh commands.
Do NOT try to verify the health check URL (there is no network).
Just detect the platform and write the config.`;

    const result = await runSkillTest({
      prompt: setupPrompt,
      workingDirectory: setupDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'setup-deploy-workflow',
      runId,
    });

    logCost('/setup-deploy', result);
    recordE2E('/setup-deploy workflow', 'Setup-Deploy skill E2E', result);
    expect(result.exitReason).toBe('success');

    // CLAUDE.md must exist before its contents are inspected.
    const claudeMd = path.join(setupDir, 'CLAUDE.md');
    const claudeMdExists = fs.existsSync(claudeMd);
    expect(claudeMdExists).toBe(true);

    const written = fs.readFileSync(claudeMd, 'utf-8');
    const writtenLower = written.toLowerCase();
    // Platform mention (Fly.io or fly), case-insensitive.
    expect(writtenLower).toContain('fly');
    // App name taken from fly.toml.
    expect(written).toContain('my-cool-app');
    // Section header requested in step 5 of the prompt.
    expect(written).toContain('Deploy Configuration');
  }, 180_000);
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
if (evalCollector) {
|
||||
|
||||
@@ -680,7 +680,61 @@ describeIfSelected('Design skill evals', ['design-review/SKILL.md fix loop', 'de
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Block 4: Other skills
|
||||
// Block 4: Deploy skills
|
||||
/**
 * LLM-judge evals for the four deploy skills.
 *
 * Each test hands runWorkflowJudge() one SKILL.md path, a start marker, the
 * shared '## Important Rules' end marker, and a description of what the judged
 * section is expected to explain.
 */
describeIfSelected('Deploy skill evals', [
  'land-and-deploy/SKILL.md workflow',
  'canary/SKILL.md monitoring loop',
  'benchmark/SKILL.md perf collection',
  'setup-deploy/SKILL.md platform setup',
], () => {
  // Values shared by every eval in this suite.
  const suite = 'Deploy skill evals';
  const endMarker = '## Important Rules';
  const judgeTimeout = 30_000;

  testIfSelected('land-and-deploy/SKILL.md workflow', async () => {
    const judgeContext = 'a merge-deploy-verify workflow for landing PRs to production';
    const judgeGoal = 'how to merge a PR via GitHub CLI, wait for CI and deploy workflows (with platform-specific strategies for Fly.io/Render/Vercel/Netlify), run canary health checks on production, and offer revert if something breaks — with timing data logged for retrospectives';
    await runWorkflowJudge({
      testName: 'land-and-deploy/SKILL.md workflow',
      suite,
      skillPath: 'land-and-deploy/SKILL.md',
      startMarker: '## Step 1: Pre-flight',
      endMarker,
      judgeContext,
      judgeGoal,
    });
  }, judgeTimeout);

  testIfSelected('canary/SKILL.md monitoring loop', async () => {
    const judgeContext = 'a post-deploy canary monitoring workflow using a headless browser daemon';
    const judgeGoal = 'how to capture baseline screenshots and metrics before deploy, run a continuous monitoring loop checking each page every 60 seconds for console errors and performance regressions, fire alerts with evidence (screenshots), and produce a health report with per-page status and verdict';
    await runWorkflowJudge({
      testName: 'canary/SKILL.md monitoring loop',
      suite,
      skillPath: 'canary/SKILL.md',
      startMarker: '### Phase 2: Baseline Capture',
      endMarker,
      judgeContext,
      judgeGoal,
    });
  }, judgeTimeout);

  testIfSelected('benchmark/SKILL.md perf collection', async () => {
    const judgeContext = 'a performance regression detection workflow using browser-based Web Vitals measurement';
    const judgeGoal = 'how to collect real performance metrics (TTFB, FCP, LCP, bundle sizes, request counts) via performance.getEntries(), compare against baselines with regression thresholds, produce a performance report with delta analysis, and track trends over time';
    await runWorkflowJudge({
      testName: 'benchmark/SKILL.md perf collection',
      suite,
      skillPath: 'benchmark/SKILL.md',
      startMarker: '### Phase 3: Performance Data Collection',
      endMarker,
      judgeContext,
      judgeGoal,
    });
  }, judgeTimeout);

  testIfSelected('setup-deploy/SKILL.md platform setup', async () => {
    const judgeContext = 'a deployment configuration setup workflow that detects deploy platforms and writes config to CLAUDE.md';
    const judgeGoal = 'how to detect deploy platforms (Fly.io, Render, Vercel, Netlify, Heroku, GitHub Actions, custom), gather platform-specific configuration (URLs, status commands, health checks, custom hooks), and persist everything to CLAUDE.md for future automated use';
    await runWorkflowJudge({
      testName: 'setup-deploy/SKILL.md platform setup',
      suite,
      skillPath: 'setup-deploy/SKILL.md',
      startMarker: '### Step 2: Detect platform',
      endMarker,
      judgeContext,
      judgeGoal,
    });
  }, judgeTimeout);
});
|
||||
|
||||
// Block 5: Other skills
|
||||
describeIfSelected('Other skill evals', [
|
||||
'retro/SKILL.md instructions', 'qa-only/SKILL.md workflow', 'gstack-upgrade/SKILL.md upgrade flow',
|
||||
], () => {
|
||||
|
||||
Reference in New Issue
Block a user