Files
gstack/scripts/eval-select.ts
T
Garry Tan 17c1c06cd9 feat: diff-based test selection for E2E and LLM-judge evals (v0.6.1.0) (#139)
* feat: diff-based test selection for E2E and LLM-judge evals

Each test declares file dependencies in a TOUCHFILES map. The test runner
checks git diff against the base branch and only runs tests whose
dependencies were modified. Global touchfiles (session-runner, eval-store,
gen-skill-docs) trigger all tests.

New scripts: test:e2e:all, test:evals:all, eval:select

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* chore: bump version and changelog (v0.6.1.0)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: plan-design-review-audit eval — bump turns to 30, add efficiency hints

The test was flaky at 20 turns because the agent reads a 300-line SKILL.md,
navigates, extracts design data, and writes a report. Added hints to skip
preamble/batch commands/write early while still testing the real SKILL.md.
Now completes in ~13 turns consistently.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 18:45:41 -05:00

87 lines
3.1 KiB
TypeScript

#!/usr/bin/env bun
/**
* Show which E2E and LLM-judge tests would run based on the current git diff.
*
* Usage:
* bun run eval:select # human-readable output
* bun run eval:select --json # machine-readable JSON
* bun run eval:select --base main # override base branch
*/
import * as path from 'path';
import {
selectTests,
detectBaseBranch,
getChangedFiles,
E2E_TOUCHFILES,
LLM_JUDGE_TOUCHFILES,
GLOBAL_TOUCHFILES,
} from '../test/helpers/touchfiles';
const ROOT = path.resolve(import.meta.dir, '..');
const args = process.argv.slice(2);
const jsonMode = args.includes('--json');
const baseIdx = args.indexOf('--base');
const baseOverride = baseIdx >= 0 ? args[baseIdx + 1] : undefined;
// Detect base branch
const baseBranch = baseOverride || detectBaseBranch(ROOT) || 'main';
const changedFiles = getChangedFiles(baseBranch, ROOT);
if (changedFiles.length === 0) {
if (jsonMode) {
console.log(JSON.stringify({ base: baseBranch, changed_files: 0, e2e: 'all', llm_judge: 'all', reason: 'no diff — would run all tests' }));
} else {
console.log(`Base: ${baseBranch}`);
console.log('No changed files detected — all tests would run.');
}
process.exit(0);
}
const e2eSelection = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
const llmSelection = selectTests(changedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES);
if (jsonMode) {
console.log(JSON.stringify({
base: baseBranch,
changed_files: changedFiles,
e2e: {
selected: e2eSelection.selected,
skipped: e2eSelection.skipped,
reason: e2eSelection.reason,
count: `${e2eSelection.selected.length}/${Object.keys(E2E_TOUCHFILES).length}`,
},
llm_judge: {
selected: llmSelection.selected,
skipped: llmSelection.skipped,
reason: llmSelection.reason,
count: `${llmSelection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length}`,
},
}, null, 2));
} else {
console.log(`Base: ${baseBranch}`);
console.log(`Changed files: ${changedFiles.length}`);
console.log();
console.log(`E2E (${e2eSelection.reason}): ${e2eSelection.selected.length}/${Object.keys(E2E_TOUCHFILES).length} tests`);
if (e2eSelection.selected.length > 0 && e2eSelection.selected.length < Object.keys(E2E_TOUCHFILES).length) {
console.log(` Selected: ${e2eSelection.selected.join(', ')}`);
console.log(` Skipped: ${e2eSelection.skipped.join(', ')}`);
} else if (e2eSelection.selected.length === 0) {
console.log(' No E2E tests affected.');
} else {
console.log(' All E2E tests selected.');
}
console.log();
console.log(`LLM-judge (${llmSelection.reason}): ${llmSelection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length} tests`);
if (llmSelection.selected.length > 0 && llmSelection.selected.length < Object.keys(LLM_JUDGE_TOUCHFILES).length) {
console.log(` Selected: ${llmSelection.selected.join(', ')}`);
console.log(` Skipped: ${llmSelection.skipped.join(', ')}`);
} else if (llmSelection.selected.length === 0) {
console.log(' No LLM-judge tests affected.');
} else {
console.log(' All LLM-judge tests selected.');
}
}