feat: LLM-as-judge evals for SKILL.md documentation quality

4 eval tests using Anthropic API (claude-haiku, ~$0.01-0.03/run):
- Command reference table: clarity/completeness/actionability >= 4/5
- Snapshot flags section: same thresholds
- browse/SKILL.md overall quality
- Regression: generated version must score >= hand-maintained baseline

Requires ANTHROPIC_API_KEY; the eval tests auto-skip when it is not set.
Run: bun run test:eval (or ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts)
This commit is contained in:
Garry Tan
2026-03-13 15:59:11 -07:00
parent a0f28de22f
commit 9dffb1ed16
2 changed files with 199 additions and 3 deletions
+5 -3
View File
@@ -12,9 +12,10 @@
 "gen:skill-docs": "bun run scripts/gen-skill-docs.ts",
 "dev": "bun run browse/src/cli.ts",
 "server": "bun run browse/src/server.ts",
-"test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts",
+"test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts",
 "test:e2e": "SKILL_E2E=1 bun test test/skill-e2e.test.ts",
-"test:all": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts && SKILL_E2E=1 bun test test/skill-e2e.test.ts",
+"test:eval": "bun test test/skill-llm-eval.test.ts",
+"test:all": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts && SKILL_E2E=1 bun test test/skill-e2e.test.ts",
 "skill:check": "bun run scripts/skill-check.ts",
 "dev:skill": "bun run scripts/dev-skill.ts",
 "start": "bun run browse/src/server.ts"
@@ -37,6 +38,7 @@
 "devtools"
 ],
 "devDependencies": {
-"@anthropic-ai/claude-agent-sdk": "^0.2.75"
+"@anthropic-ai/claude-agent-sdk": "^0.2.75",
+"@anthropic-ai/sdk": "^0.78.0"
 }
 }