feat: eval CLI tools + docs cleanup

Add eval:list, eval:compare, eval:summary CLI scripts for exploring eval history from ~/.gstack-dev/evals/. eval:compare reuses the shared comparison functions from eval-store.ts.

- eval:list: sorted table with branch/tier/cost filters
- eval:compare: thin wrapper around compareEvalResults + formatComparison
- eval:summary: aggregate stats, flaky test detection, branch rankings
- Remove unused @anthropic-ai/claude-agent-sdk from devDependencies
- Update CLAUDE.md: streaming docs, eval CLI commands, remove Agent SDK refs
- Add GH Actions eval upload (P2) and web dashboard (P3) to TODOS.md

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-23 10:10:03 +02:00 · 2026-03-14 03:49:57 -05:00
parent 84f52f3bad
commit ed802d0c7f
6 changed files with 373 additions and 11 deletions
@@ -17,7 +17,10 @@
    "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts",
    "skill:check": "bun run scripts/skill-check.ts",
    "dev:skill": "bun run scripts/dev-skill.ts",
-    "start": "bun run browse/src/server.ts"
+    "start": "bun run browse/src/server.ts",
+    "eval:list": "bun run scripts/eval-list.ts",
+    "eval:compare": "bun run scripts/eval-compare.ts",
+    "eval:summary": "bun run scripts/eval-summary.ts"
  },
  "dependencies": {
    "playwright": "^1.58.2",
@@ -37,7 +40,6 @@
    "devtools"
  ],
  "devDependencies": {
-    "@anthropic-ai/claude-agent-sdk": "^0.2.75",
    "@anthropic-ai/sdk": "^0.78.0"
  }
 }