mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-01 19:25:10 +02:00
feat: diff-based test selection for E2E and LLM-judge evals (v0.6.1.0) (#139)
* feat: diff-based test selection for E2E and LLM-judge evals Each test declares file dependencies in a TOUCHFILES map. The test runner checks git diff against the base branch and only runs tests whose dependencies were modified. Global touchfiles (session-runner, eval-store, gen-skill-docs) trigger all tests. New scripts: test:e2e:all, test:evals:all, eval:select Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * chore: bump version and changelog (v0.6.1.0) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: plan-design-review-audit eval — bump turns to 30, add efficiency hints The test was flaky at 20 turns because the agent reads a 300-line SKILL.md, navigates, extracts design data, and writes a report. Added hints to skip preamble/batch commands/write early while still testing the real SKILL.md. Now completes in ~13 turns consistently. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env bun
|
||||
/**
|
||||
* Show which E2E and LLM-judge tests would run based on the current git diff.
|
||||
*
|
||||
* Usage:
|
||||
* bun run eval:select # human-readable output
|
||||
* bun run eval:select --json # machine-readable JSON
|
||||
* bun run eval:select --base main # override base branch
|
||||
*/
|
||||
|
||||
import * as path from 'path';
|
||||
import {
|
||||
selectTests,
|
||||
detectBaseBranch,
|
||||
getChangedFiles,
|
||||
E2E_TOUCHFILES,
|
||||
LLM_JUDGE_TOUCHFILES,
|
||||
GLOBAL_TOUCHFILES,
|
||||
} from '../test/helpers/touchfiles';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const args = process.argv.slice(2);
|
||||
const jsonMode = args.includes('--json');
|
||||
const baseIdx = args.indexOf('--base');
|
||||
const baseOverride = baseIdx >= 0 ? args[baseIdx + 1] : undefined;
|
||||
|
||||
// Detect base branch
|
||||
const baseBranch = baseOverride || detectBaseBranch(ROOT) || 'main';
|
||||
const changedFiles = getChangedFiles(baseBranch, ROOT);
|
||||
|
||||
if (changedFiles.length === 0) {
|
||||
if (jsonMode) {
|
||||
console.log(JSON.stringify({ base: baseBranch, changed_files: 0, e2e: 'all', llm_judge: 'all', reason: 'no diff — would run all tests' }));
|
||||
} else {
|
||||
console.log(`Base: ${baseBranch}`);
|
||||
console.log('No changed files detected — all tests would run.');
|
||||
}
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const e2eSelection = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
|
||||
const llmSelection = selectTests(changedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES);
|
||||
|
||||
if (jsonMode) {
|
||||
console.log(JSON.stringify({
|
||||
base: baseBranch,
|
||||
changed_files: changedFiles,
|
||||
e2e: {
|
||||
selected: e2eSelection.selected,
|
||||
skipped: e2eSelection.skipped,
|
||||
reason: e2eSelection.reason,
|
||||
count: `${e2eSelection.selected.length}/${Object.keys(E2E_TOUCHFILES).length}`,
|
||||
},
|
||||
llm_judge: {
|
||||
selected: llmSelection.selected,
|
||||
skipped: llmSelection.skipped,
|
||||
reason: llmSelection.reason,
|
||||
count: `${llmSelection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length}`,
|
||||
},
|
||||
}, null, 2));
|
||||
} else {
|
||||
console.log(`Base: ${baseBranch}`);
|
||||
console.log(`Changed files: ${changedFiles.length}`);
|
||||
console.log();
|
||||
|
||||
console.log(`E2E (${e2eSelection.reason}): ${e2eSelection.selected.length}/${Object.keys(E2E_TOUCHFILES).length} tests`);
|
||||
if (e2eSelection.selected.length > 0 && e2eSelection.selected.length < Object.keys(E2E_TOUCHFILES).length) {
|
||||
console.log(` Selected: ${e2eSelection.selected.join(', ')}`);
|
||||
console.log(` Skipped: ${e2eSelection.skipped.join(', ')}`);
|
||||
} else if (e2eSelection.selected.length === 0) {
|
||||
console.log(' No E2E tests affected.');
|
||||
} else {
|
||||
console.log(' All E2E tests selected.');
|
||||
}
|
||||
console.log();
|
||||
|
||||
console.log(`LLM-judge (${llmSelection.reason}): ${llmSelection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length} tests`);
|
||||
if (llmSelection.selected.length > 0 && llmSelection.selected.length < Object.keys(LLM_JUDGE_TOUCHFILES).length) {
|
||||
console.log(` Selected: ${llmSelection.selected.join(', ')}`);
|
||||
console.log(` Skipped: ${llmSelection.skipped.join(', ')}`);
|
||||
} else if (llmSelection.selected.length === 0) {
|
||||
console.log(' No LLM-judge tests affected.');
|
||||
} else {
|
||||
console.log(' All LLM-judge tests selected.');
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user