mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
feat: diff-based test selection for E2E and LLM-judge evals (v0.6.1.0) (#139)
* feat: diff-based test selection for E2E and LLM-judge evals Each test declares file dependencies in a TOUCHFILES map. The test runner checks git diff against the base branch and only runs tests whose dependencies were modified. Global touchfiles (session-runner, eval-store, gen-skill-docs) trigger all tests. New scripts: test:e2e:all, test:evals:all, eval:select Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * chore: bump version and changelog (v0.6.1.0) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: plan-design-review-audit eval — bump turns to 30, add efficiency hints The test was flaky at 20 turns because the agent reads a 300-line SKILL.md, navigates, extracts design data, and writes a report. Added hints to skip preamble/batch commands/write early while still testing the real SKILL.md. Now completes in ~13 turns consistently. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+4
-1
@@ -14,14 +14,17 @@
|
||||
"server": "bun run browse/src/server.ts",
|
||||
"test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts",
|
||||
"test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts",
|
||||
"test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts",
|
||||
"test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts",
|
||||
"test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts",
|
||||
"skill:check": "bun run scripts/skill-check.ts",
|
||||
"dev:skill": "bun run scripts/dev-skill.ts",
|
||||
"start": "bun run browse/src/server.ts",
|
||||
"eval:list": "bun run scripts/eval-list.ts",
|
||||
"eval:compare": "bun run scripts/eval-compare.ts",
|
||||
"eval:summary": "bun run scripts/eval-summary.ts",
|
||||
"eval:watch": "bun run scripts/eval-watch.ts"
|
||||
"eval:watch": "bun run scripts/eval-watch.ts",
|
||||
"eval:select": "bun run scripts/eval-select.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"playwright": "^1.58.2",
|
||||
|
||||
Reference in New Issue
Block a user