mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
feat: diff-based test selection for E2E and LLM-judge evals (v0.6.1.0) (#139)
* feat: diff-based test selection for E2E and LLM-judge evals Each test declares file dependencies in a TOUCHFILES map. The test runner checks git diff against the base branch and only runs tests whose dependencies were modified. Global touchfiles (session-runner, eval-store, gen-skill-docs) trigger all tests. New scripts: test:e2e:all, test:evals:all, eval:select Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * chore: bump version and changelog (v0.6.1.0) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: plan-design-review-audit eval — bump turns to 30, add efficiency hints The test was flaky at 20 turns because the agent reads a 300-line SKILL.md, navigates, extracts design data, and writes a report. Added hints to skip preamble/batch commands/write early while still testing the real SKILL.md. Now completes in ~13 turns consistently. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+86
-38
@@ -1,10 +1,11 @@
|
||||
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||
import { runSkillTest } from './helpers/session-runner';
|
||||
import type { SkillTestResult } from './helpers/session-runner';
|
||||
import { outcomeJudge } from './helpers/llm-judge';
|
||||
import { outcomeJudge, callJudge } from './helpers/llm-judge';
|
||||
import { EvalCollector, judgePassed } from './helpers/eval-store';
|
||||
import type { EvalTestEntry } from './helpers/eval-store';
|
||||
import { startTestServer } from '../browse/test/test-server';
|
||||
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
@@ -21,6 +22,41 @@ const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const evalsEnabled = !!process.env.EVALS;
|
||||
const describeE2E = evalsEnabled ? describe : describe.skip;
|
||||
|
||||
// --- Diff-based test selection ---
// Unless EVALS_ALL is set, only tests whose touchfiles were modified are run.
// EVALS_ALL=1 forces every test; EVALS_BASE overrides the base branch.
let selectedTests: string[] | null = null; // null = run all

if (evalsEnabled && !process.env.EVALS_ALL) {
  // `||` rather than `??`: an empty EVALS_BASE or a falsy detection result
  // falls through to the next candidate, ending at 'main'.
  const baseBranch = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
  const changedFiles = getChangedFiles(baseBranch, ROOT);

  // An empty change list (e.g., on main branch) leaves selectedTests as null → run all.
  if (changedFiles.length > 0) {
    const { selected, skipped, reason } = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
    selectedTests = selected;

    // Report the selection on stderr: how many tests run, and which are skipped.
    const totalTests = Object.keys(E2E_TOUCHFILES).length;
    process.stderr.write(`\nE2E selection (${reason}): ${selected.length}/${totalTests} tests\n`);
    if (skipped.length > 0) {
      process.stderr.write(` Skipped: ${skipped.join(', ')}\n`);
    }
    process.stderr.write('\n');
  }
}
|
||||
|
||||
/**
 * Register a describe block, skipping it entirely when none of its tests are selected.
 *
 * @param name      Title passed to the underlying describe call.
 * @param testNames Selection keys of the tests contained in the block.
 * @param fn        Body of the describe block.
 */
function describeIfSelected(name: string, testNames: string[], fn: () => void) {
  // Alias the module-level selection so the null check narrows inside the callback.
  const selection = selectedTests;
  // A null selection means "run all"; otherwise at least one listed test must be selected.
  const anySelected = selection === null || testNames.some(t => selection.includes(t));
  const register = anySelected ? describeE2E : describe.skip;
  register(name, fn);
}
|
||||
|
||||
/**
 * Register an individual test, or mark it skipped when it is not part of the
 * current selection (for describe blocks that hold several tests).
 *
 * @param testName Selection key, also used as the test title.
 * @param fn       Async test body.
 * @param timeout  Timeout forwarded to the test runner.
 */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  // A null selection means "run all".
  const shouldRun = selectedTests === null || selectedTests.includes(testName);
  const register = shouldRun ? test : test.skip;
  register(testName, fn, timeout);
}
|
||||
|
||||
// Eval result collector — accumulates test results, writes to ~/.gstack-dev/evals/ on finalize
|
||||
const evalCollector = evalsEnabled ? new EvalCollector('e2e') : null;
|
||||
|
||||
@@ -133,7 +169,10 @@ if (evalsEnabled) {
|
||||
}
|
||||
}
|
||||
|
||||
describeE2E('Skill E2E tests', () => {
|
||||
describeIfSelected('Skill E2E tests', [
|
||||
'browse-basic', 'browse-snapshot', 'skillmd-setup-discovery',
|
||||
'skillmd-no-local-binary', 'skillmd-outside-git', 'contributor-mode', 'session-awareness',
|
||||
], () => {
|
||||
beforeAll(() => {
|
||||
testServer = startTestServer();
|
||||
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
|
||||
@@ -145,7 +184,7 @@ describeE2E('Skill E2E tests', () => {
|
||||
try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('browse basic commands work without errors', async () => {
|
||||
testIfSelected('browse-basic', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run these commands in sequence:
|
||||
1. $B goto ${testServer.url}
|
||||
@@ -166,7 +205,7 @@ Report the results of each command.`,
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
test('browse snapshot flags all work', async () => {
|
||||
testIfSelected('browse-snapshot', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run:
|
||||
1. $B goto ${testServer.url}
|
||||
@@ -191,7 +230,7 @@ Report what each command returned.`,
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
test('agent discovers browse binary via SKILL.md setup block', async () => {
|
||||
testIfSelected('skillmd-setup-discovery', async () => {
|
||||
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const setupStart = skillMd.indexOf('## SETUP');
|
||||
const setupEnd = skillMd.indexOf('## IMPORTANT');
|
||||
@@ -220,7 +259,7 @@ Report whether it worked.`,
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
test('SKILL.md setup block handles missing local binary gracefully', async () => {
|
||||
testIfSelected('skillmd-no-local-binary', async () => {
|
||||
// Create a tmpdir with no browse binary — no local .claude/skills/gstack/browse/dist/browse
|
||||
const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
|
||||
|
||||
@@ -255,7 +294,7 @@ Report the exact output. Do NOT try to fix or install anything — just report w
|
||||
try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
|
||||
}, 60_000);
|
||||
|
||||
test('SKILL.md setup block works outside git repo', async () => {
|
||||
testIfSelected('skillmd-outside-git', async () => {
|
||||
// Create a tmpdir outside any git repo
|
||||
const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
|
||||
|
||||
@@ -286,7 +325,7 @@ Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
|
||||
try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
|
||||
}, 60_000);
|
||||
|
||||
test('contributor mode files a report on gstack error', async () => {
|
||||
testIfSelected('contributor-mode', async () => {
|
||||
const contribDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-contrib-'));
|
||||
const logsDir = path.join(contribDir, 'contributor-logs');
|
||||
fs.mkdirSync(logsDir, { recursive: true });
|
||||
@@ -342,7 +381,7 @@ File a contributor report about this issue. Then tell me what you filed.`,
|
||||
try { fs.rmSync(contribDir, { recursive: true, force: true }); } catch {}
|
||||
}, 90_000);
|
||||
|
||||
test('session awareness adds ELI16 context when _SESSIONS >= 3', async () => {
|
||||
testIfSelected('session-awareness', async () => {
|
||||
const sessionDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-session-'));
|
||||
|
||||
// Set up a git repo so there's project/branch context to reference
|
||||
@@ -413,7 +452,7 @@ Remember: _SESSIONS=4, so ELI16 mode is active. The user is juggling multiple wi
|
||||
|
||||
// --- B4: QA skill E2E ---
|
||||
|
||||
describeE2E('QA skill E2E', () => {
|
||||
describeIfSelected('QA skill E2E', ['qa-quick'], () => {
|
||||
let qaDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -468,7 +507,7 @@ Write your report to ${qaDir}/qa-reports/qa-report.md`,
|
||||
|
||||
// --- B5: Review skill E2E ---
|
||||
|
||||
describeE2E('Review skill E2E', () => {
|
||||
describeIfSelected('Review skill E2E', ['review-sql-injection'], () => {
|
||||
let reviewDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -527,7 +566,7 @@ Write your review findings to ${reviewDir}/review-output.md`,
|
||||
|
||||
// --- Review: Enum completeness E2E ---
|
||||
|
||||
describeE2E('Review enum completeness E2E', () => {
|
||||
describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'], () => {
|
||||
let enumDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -603,7 +642,10 @@ The diff adds a new "returned" status to the Order model. Your job is to check i
|
||||
const hasApiKey = !!process.env.ANTHROPIC_API_KEY;
|
||||
const describeOutcome = (evalsEnabled && hasApiKey) ? describe : describe.skip;
|
||||
|
||||
describeOutcome('Planted-bug outcome evals', () => {
|
||||
// Wrap describeOutcome with selection — skip if no planted-bug tests are selected
|
||||
const outcomeTestNames = ['qa-b6-static', 'qa-b7-spa', 'qa-b8-checkout'];
|
||||
const anyOutcomeSelected = selectedTests === null || outcomeTestNames.some(t => selectedTests!.includes(t));
|
||||
(anyOutcomeSelected ? describeOutcome : describe.skip)('Planted-bug outcome evals', () => {
|
||||
let outcomeDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -767,7 +809,7 @@ CRITICAL RULES:
|
||||
|
||||
// --- Plan CEO Review E2E ---
|
||||
|
||||
describeE2E('Plan CEO Review E2E', () => {
|
||||
describeIfSelected('Plan CEO Review E2E', ['plan-ceo-review'], () => {
|
||||
let planDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -854,7 +896,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
|
||||
|
||||
// --- Plan CEO Review (SELECTIVE EXPANSION) E2E ---
|
||||
|
||||
describeE2E('Plan CEO Review SELECTIVE EXPANSION E2E', () => {
|
||||
describeIfSelected('Plan CEO Review SELECTIVE EXPANSION E2E', ['plan-ceo-review-selective'], () => {
|
||||
let planDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -937,7 +979,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
|
||||
|
||||
// --- Plan Eng Review E2E ---
|
||||
|
||||
describeE2E('Plan Eng Review E2E', () => {
|
||||
describeIfSelected('Plan Eng Review E2E', ['plan-eng-review'], () => {
|
||||
let planDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -1031,7 +1073,7 @@ Focus on architecture, code quality, tests, and performance sections.`,
|
||||
|
||||
// --- Retro E2E ---
|
||||
|
||||
describeE2E('Retro E2E', () => {
|
||||
describeIfSelected('Retro E2E', ['retro'], () => {
|
||||
let retroDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -1117,7 +1159,7 @@ Analyze the git history and produce the narrative report as described in the SKI
|
||||
|
||||
// --- QA-Only E2E (report-only, no fixes) ---
|
||||
|
||||
describeE2E('QA-Only skill E2E', () => {
|
||||
describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => {
|
||||
let qaOnlyDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -1203,7 +1245,7 @@ Write your report to ${qaOnlyDir}/qa-reports/qa-only-report.md`,
|
||||
|
||||
// --- QA Fix Loop E2E ---
|
||||
|
||||
describeE2E('QA Fix Loop E2E', () => {
|
||||
describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => {
|
||||
let qaFixDir: string;
|
||||
let qaFixServer: ReturnType<typeof Bun.serve> | null = null;
|
||||
|
||||
@@ -1317,7 +1359,7 @@ This is a test+fix loop: find bugs, fix them in the source code, commit each fix
|
||||
|
||||
// --- Plan-Eng-Review Test-Plan Artifact E2E ---
|
||||
|
||||
describeE2E('Plan-Eng-Review Test-Plan Artifact E2E', () => {
|
||||
describeIfSelected('Plan-Eng-Review Test-Plan Artifact E2E', ['plan-eng-review-artifact'], () => {
|
||||
let planDir: string;
|
||||
let projectDir: string;
|
||||
|
||||
@@ -1444,7 +1486,7 @@ Write your review to ${planDir}/review-output.md`,
|
||||
|
||||
// --- Base branch detection smoke tests ---
|
||||
|
||||
describeE2E('Base branch detection', () => {
|
||||
describeIfSelected('Base branch detection', ['review-base-branch', 'ship-base-branch', 'retro-base-branch'], () => {
|
||||
let baseBranchDir: string;
|
||||
const run = (cmd: string, args: string[], cwd: string) =>
|
||||
spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });
|
||||
@@ -1457,7 +1499,7 @@ describeE2E('Base branch detection', () => {
|
||||
try { fs.rmSync(baseBranchDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/review detects base branch and diffs against it', async () => {
|
||||
testIfSelected('review-base-branch', async () => {
|
||||
const dir = path.join(baseBranchDir, 'review-base');
|
||||
fs.mkdirSync(dir, { recursive: true });
|
||||
|
||||
@@ -1510,7 +1552,7 @@ Write your findings to ${dir}/review-output.md`,
|
||||
expect(usedGitDiff).toBe(true);
|
||||
}, 120_000);
|
||||
|
||||
test('/ship Step 0-1 detects base branch without destructive actions', async () => {
|
||||
testIfSelected('ship-base-branch', async () => {
|
||||
const dir = path.join(baseBranchDir, 'ship-base');
|
||||
fs.mkdirSync(dir, { recursive: true });
|
||||
|
||||
@@ -1572,7 +1614,7 @@ Write a summary of what you detected to ${dir}/ship-preflight.md including:
|
||||
expect(destructiveTools).toHaveLength(0);
|
||||
}, 90_000);
|
||||
|
||||
test('/retro detects default branch for git queries', async () => {
|
||||
testIfSelected('retro-base-branch', async () => {
|
||||
const dir = path.join(baseBranchDir, 'retro-base');
|
||||
fs.mkdirSync(dir, { recursive: true });
|
||||
|
||||
@@ -1631,7 +1673,7 @@ Write your retrospective to ${dir}/retro-output.md`,
|
||||
|
||||
// --- Document-Release skill E2E ---
|
||||
|
||||
describeE2E('Document-Release skill E2E', () => {
|
||||
describeIfSelected('Document-Release skill E2E', ['document-release'], () => {
|
||||
let docReleaseDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -1735,6 +1777,7 @@ IMPORTANT:
|
||||
|
||||
// --- Deferred skill E2E tests (destructive or require interactive UI) ---
|
||||
|
||||
// Deferred tests — only test.todo entries, no selection needed
|
||||
describeE2E('Deferred skill E2E', () => {
|
||||
// Ship is destructive: pushes to remote, creates PRs, modifies VERSION/CHANGELOG
|
||||
test.todo('/ship completes full workflow');
|
||||
@@ -1772,7 +1815,10 @@ ${designMd}
|
||||
Return JSON: { "passed": true/false, "reasoning": "one paragraph explaining your evaluation" }`);
|
||||
}
|
||||
|
||||
describeE2E('Design Consultation E2E', () => {
|
||||
describeIfSelected('Design Consultation E2E', [
|
||||
'design-consultation-core', 'design-consultation-research',
|
||||
'design-consultation-existing', 'design-consultation-preview',
|
||||
], () => {
|
||||
let designDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -1816,7 +1862,7 @@ A civic tech data platform for government employees to access, visualize, and sh
|
||||
try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('Test 1: core flow produces valid DESIGN.md + CLAUDE.md', async () => {
|
||||
testIfSelected('design-consultation-core', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read design-consultation/SKILL.md for the design consultation workflow.
|
||||
|
||||
@@ -1876,7 +1922,7 @@ Write DESIGN.md and CLAUDE.md (or update it) in the working directory.`,
|
||||
}
|
||||
}, 420_000);
|
||||
|
||||
test('Test 2: research integration uses WebSearch', async () => {
|
||||
testIfSelected('design-consultation-research', async () => {
|
||||
// Clean up from previous test
|
||||
try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {}
|
||||
try { fs.unlinkSync(path.join(designDir, 'CLAUDE.md')); } catch {}
|
||||
@@ -1933,7 +1979,7 @@ Write DESIGN.md to the working directory.`,
|
||||
expect(designExists).toBe(true);
|
||||
}, 420_000);
|
||||
|
||||
test('Test 3: handles existing DESIGN.md', async () => {
|
||||
testIfSelected('design-consultation-existing', async () => {
|
||||
// Pre-create a minimal DESIGN.md
|
||||
fs.writeFileSync(path.join(designDir, 'DESIGN.md'), `# Design System — CivicPulse
|
||||
|
||||
@@ -1979,7 +2025,7 @@ Skip research. Skip font preview. Skip any AskUserQuestion calls — this is non
|
||||
}
|
||||
}, 420_000);
|
||||
|
||||
test('Test 4: generates font + color preview HTML', async () => {
|
||||
testIfSelected('design-consultation-preview', async () => {
|
||||
// Clean up
|
||||
try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {}
|
||||
|
||||
@@ -2043,7 +2089,7 @@ Skip research. Skip any AskUserQuestion calls — this is non-interactive. Gener
|
||||
|
||||
// --- Plan Design Review E2E ---
|
||||
|
||||
describeE2E('Plan Design Review E2E', () => {
|
||||
describeIfSelected('Plan Design Review E2E', ['plan-design-review-audit', 'plan-design-review-export'], () => {
|
||||
let reviewDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -2074,7 +2120,7 @@ describeE2E('Plan Design Review E2E', () => {
|
||||
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('Test 5: /plan-design-review produces audit report', async () => {
|
||||
testIfSelected('plan-design-review-audit', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
|
||||
|
||||
@@ -2082,9 +2128,11 @@ B="${browseBin}"
|
||||
|
||||
Read plan-design-review/SKILL.md for the design review workflow.
|
||||
|
||||
Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Skip any AskUserQuestion calls — this is non-interactive. Write your audit report to ./design-audit.md. Do not offer to create DESIGN.md.`,
|
||||
Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Skip any AskUserQuestion calls — this is non-interactive. Write your audit report to ./design-audit.md. Do not offer to create DESIGN.md.
|
||||
|
||||
EFFICIENCY: Skip the preamble bash block. Combine multiple browse commands into single bash blocks (e.g. run all Phase 2 JS extractions in one block). Write the report as soon as you have enough data — do not over-explore.`,
|
||||
workingDirectory: reviewDir,
|
||||
maxTurns: 20,
|
||||
maxTurns: 30,
|
||||
timeout: 360_000,
|
||||
testName: 'plan-design-review-audit',
|
||||
runId,
|
||||
@@ -2113,7 +2161,7 @@ Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Ski
|
||||
}
|
||||
}, 420_000);
|
||||
|
||||
test('Test 6: /plan-design-review exports DESIGN.md', async () => {
|
||||
testIfSelected('plan-design-review-export', async () => {
|
||||
// Clean up previous test artifacts
|
||||
try { fs.unlinkSync(path.join(reviewDir, 'design-audit.md')); } catch {}
|
||||
|
||||
@@ -2161,7 +2209,7 @@ Review ${testServer.url} with --quick mode. Skip any AskUserQuestion calls — t
|
||||
|
||||
// --- QA Design Review E2E ---
|
||||
|
||||
describeE2E('QA Design Review E2E', () => {
|
||||
describeIfSelected('QA Design Review E2E', ['qa-design-review-fix'], () => {
|
||||
let qaDesignDir: string;
|
||||
let qaDesignServer: ReturnType<typeof Bun.serve> | null = null;
|
||||
|
||||
@@ -2300,7 +2348,7 @@ Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion call
|
||||
|
||||
// --- Test Bootstrap E2E ---
|
||||
|
||||
describeE2E('Test Bootstrap E2E', () => {
|
||||
describeIfSelected('Test Bootstrap E2E', ['qa-bootstrap'], () => {
|
||||
let bootstrapDir: string;
|
||||
let bootstrapServer: ReturnType<typeof Bun.serve>;
|
||||
|
||||
@@ -2437,7 +2485,7 @@ This is a test+fix loop: find bugs, fix them, write regression tests, commit eac
|
||||
|
||||
// --- Test Coverage Audit E2E ---
|
||||
|
||||
describeE2E('Test Coverage Audit E2E', () => {
|
||||
describeIfSelected('Test Coverage Audit E2E', ['ship-coverage-audit'], () => {
|
||||
let coverageDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
|
||||
Reference in New Issue
Block a user