mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
feat: diff-based test selection for E2E and LLM-judge evals (v0.6.1.0) (#139)
* feat: diff-based test selection for E2E and LLM-judge evals Each test declares file dependencies in a TOUCHFILES map. The test runner checks git diff against the base branch and only runs tests whose dependencies were modified. Global touchfiles (session-runner, eval-store, gen-skill-docs) trigger all tests. New scripts: test:e2e:all, test:evals:all, eval:select Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * chore: bump version and changelog (v0.6.1.0) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: plan-design-review-audit eval — bump turns to 30, add efficiency hints The test was flaky at 20 turns because the agent reads a 300-line SKILL.md, navigates, extracts design data, and writes a report. Added hints to skip preamble/batch commands/write early while still testing the real SKILL.md. Now completes in ~13 turns consistently. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,178 @@
|
||||
/**
 * Diff-based test selection for E2E and LLM-judge evals.
 *
 * Each test declares which source files it depends on ("touchfiles").
 * The test runner checks `git diff` and only runs tests whose
 * dependencies were modified. Override with EVALS_ALL=1 to run everything.
 */
|
||||
|
||||
import { spawnSync } from 'child_process';

// --- Glob matching ---

/**
 * Match a file path against a glob pattern.
 *
 * Supported wildcards:
 *   **  — match any run of characters, including `/` (any number of path segments)
 *   *   — match within a single segment (never crosses a `/`)
 *
 * Every other character in the pattern is matched literally. Regex
 * metacharacters (`.`, `+`, `?`, `(`, `[`, `$`, ...) are escaped, so a pattern
 * such as `app/(group)/**` or `a+b.ts` matches exactly what it spells out
 * instead of being interpreted as regex syntax.
 *
 * @param file - Path to test, using `/` separators.
 * @param pattern - Glob pattern; must match the entire path.
 * @returns true when the whole of `file` matches `pattern`.
 */
export function matchGlob(file: string, pattern: string): boolean {
  // Single pass over the pattern: `**` is listed before `*` in the alternation
  // so a globstar is consumed as one token and never split into two stars.
  const regexStr = pattern.replace(/\*\*|\*|[.+?^${}()|[\]\\]/g, (token) => {
    if (token === '**') return '.*';
    if (token === '*') return '[^/]*';
    // Anything else captured here is a regex metacharacter: match it literally.
    return `\\${token}`;
  });
  return new RegExp(`^${regexStr}$`).test(file);
}

// --- Touchfile maps ---

/**
 * E2E test touchfiles — keyed by testName (the string passed to runSkillTest).
 * Each test lists the file patterns that, if changed, require the test to run.
 *
 * Patterns are globs: `**` spans any number of path segments, `*` stays within
 * a single segment, and everything else is a literal repo-relative path.
 */
export const E2E_TOUCHFILES: Record<string, string[]> = {
  // Browse core
  'browse-basic': ['browse/src/**'],
  'browse-snapshot': ['browse/src/**'],

  // SKILL.md setup + preamble (depend on ROOT SKILL.md only)
  'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'],
  'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'],
  'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'],
  'contributor-mode': ['SKILL.md', 'SKILL.md.tmpl'],
  'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'],

  // QA
  'qa-quick': ['qa/**', 'browse/src/**'],
  'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
  'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
  'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
  'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
  'qa-fix-loop': ['qa/**', 'browse/src/**'],

  // Review
  'review-sql-injection': ['review/**', 'test/fixtures/review-eval-vuln.rb'],
  'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'],
  'review-base-branch': ['review/**'],

  // Plan reviews
  'plan-ceo-review': ['plan-ceo-review/**'],
  'plan-ceo-review-selective': ['plan-ceo-review/**'],
  'plan-eng-review': ['plan-eng-review/**'],
  'plan-eng-review-artifact': ['plan-eng-review/**'],

  // Ship
  'ship-base-branch': ['ship/**'],

  // Retro
  'retro': ['retro/**'],
  'retro-base-branch': ['retro/**'],

  // Document-release
  'document-release': ['document-release/**'],

  // QA bootstrap
  'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'],

  // Ship coverage audit
  'ship-coverage-audit': ['ship/**'],

  // Design
  'design-consultation-core': ['design-consultation/**'],
  'design-consultation-research': ['design-consultation/**'],
  'design-consultation-existing': ['design-consultation/**'],
  'design-consultation-preview': ['design-consultation/**'],
  'plan-design-review-audit': ['plan-design-review/**'],
  'plan-design-review-export': ['plan-design-review/**'],
  'qa-design-review-fix': ['qa-design-review/**', 'browse/src/**'],
};
|
||||
|
||||
/**
 * LLM-judge test touchfiles — keyed by test description string.
 * Each value lists the file patterns that, if changed, require that
 * judge test to run.
 */
export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
  'command reference table': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts'],
  'snapshot flags reference': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/snapshot.ts'],
  'browse/SKILL.md reference': ['browse/SKILL.md', 'browse/SKILL.md.tmpl', 'browse/src/**'],
  'setup block': ['SKILL.md', 'SKILL.md.tmpl'],
  'regression vs baseline': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts', 'test/fixtures/eval-baselines.json'],
  'qa/SKILL.md workflow': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
  'qa/SKILL.md health rubric': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
  'cross-skill greptile consistency': [
    'review/SKILL.md',
    'review/SKILL.md.tmpl',
    'ship/SKILL.md',
    'ship/SKILL.md.tmpl',
    'review/greptile-triage.md',
    'retro/SKILL.md',
    'retro/SKILL.md.tmpl',
  ],
  'baseline score pinning': ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'],
};
|
||||
|
||||
/**
 * Changes to any of these files trigger ALL tests (both E2E and LLM-judge).
 * Used as the default global list by selectTests.
 */
export const GLOBAL_TOUCHFILES = [
  'test/helpers/session-runner.ts',
  'test/helpers/eval-store.ts',
  'test/helpers/llm-judge.ts',
  'scripts/gen-skill-docs.ts',
  'test/helpers/touchfiles.ts',
  'browse/test/test-server.ts',
];

// --- Base branch detection ---

/**
 * Detect the base branch by trying refs in order.
 *
 * Candidates are probed with `git rev-parse --verify` in this order:
 * `origin/main`, `origin/master`, `main`, `master`. Remote-tracking refs are
 * tried before local branches of the same name.
 *
 * @param cwd - Directory in which to run git.
 * @returns The first ref git can resolve, or null if none resolves. A probe
 *   that does not exit with status 0 (unknown ref, git could not be started,
 *   or the 3s timeout elapsed) is treated as "ref not found".
 */
export function detectBaseBranch(cwd: string): string | null {
  for (const ref of ['origin/main', 'origin/master', 'main', 'master']) {
    const result = spawnSync('git', ['rev-parse', '--verify', ref], {
      cwd, stdio: 'pipe', timeout: 3000,
    });
    if (result.status === 0) return ref;
  }
  return null;
}
|
||||
|
||||
/**
 * Get list of files changed between base branch and HEAD.
 *
 * Runs `git diff --name-only -z <baseBranch>...HEAD`. The three-dot form
 * diffs against the merge base, so only changes made on the current branch
 * are reported.
 *
 * `-z` makes git emit NUL-terminated, unquoted paths. Without it, git
 * C-quotes paths containing non-ASCII or special characters (e.g.
 * `"caf\303\251.ts"`), and such names would never match a touchfile pattern.
 *
 * @param baseBranch - Ref to diff against (e.g. the value from detectBaseBranch).
 * @param cwd - Directory in which to run git.
 * @returns Repo-relative paths of changed files. Returns an empty array when
 *   git does not exit with status 0 (bad ref, not a repository, git missing,
 *   or the 5s timeout elapsed) — callers cannot distinguish that from
 *   "no changes" by the return value alone.
 */
export function getChangedFiles(baseBranch: string, cwd: string): string[] {
  const result = spawnSync('git', ['diff', '--name-only', '-z', `${baseBranch}...HEAD`], {
    cwd, stdio: 'pipe', timeout: 5000,
  });
  if (result.status !== 0) return [];
  // Output is NUL-terminated, so the final split element is empty; filter drops it.
  return result.stdout.toString().split('\0').filter(Boolean);
}

// --- Test selection ---

/**
 * Select tests to run based on changed files.
 *
 * Algorithm:
 *   1. If any changed file matches a global touchfile → run ALL tests
 *   2. Otherwise, for each test, check if any changed file matches its patterns
 *   3. Return selected + skipped lists with reason
 *
 * @param changedFiles - Paths of files that changed.
 * @param touchfiles - Map of test name → glob patterns the test depends on.
 * @param globalTouchfiles - Patterns that force every test to run;
 *   defaults to GLOBAL_TOUCHFILES.
 * @returns `selected` and `skipped` test names plus a `reason`:
 *   `global: <file>` naming the first changed file that hit a global
 *   touchfile, or `diff` when selection was made per test. With no changed
 *   files, every test ends up in `skipped`.
 */
export function selectTests(
  changedFiles: string[],
  touchfiles: Record<string, string[]>,
  globalTouchfiles: string[] = GLOBAL_TOUCHFILES,
): { selected: string[]; skipped: string[]; reason: string } {
  const allTestNames = Object.keys(touchfiles);

  // Global touchfile hit → run all
  for (const file of changedFiles) {
    if (globalTouchfiles.some(g => matchGlob(file, g))) {
      return { selected: allTestNames, skipped: [], reason: `global: ${file}` };
    }
  }

  // Per-test matching
  const selected: string[] = [];
  const skipped: string[] = [];
  for (const [testName, patterns] of Object.entries(touchfiles)) {
    const hit = changedFiles.some(f => patterns.some(p => matchGlob(f, p)));
    (hit ? selected : skipped).push(testName);
  }

  return { selected, skipped, reason: 'diff' };
}
|
||||
Reference in New Issue
Block a user