mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-07 05:56:41 +02:00
feat: LLM-as-judge evals for SKILL.md documentation quality
4 eval tests using the Anthropic API (claude-haiku, ~$0.01-0.03/run):
- Command reference table: clarity/completeness/actionability >= 4/5
- Snapshot flags section: same thresholds
- browse/SKILL.md overall quality
- Regression: generated version must score >= hand-maintained baseline

Requires ANTHROPIC_API_KEY. Auto-skips without it.
Run: bun run test:eval (or ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts)
This commit is contained in:
+5
-3
@@ -12,9 +12,10 @@
|
||||
"gen:skill-docs": "bun run scripts/gen-skill-docs.ts",
|
||||
"dev": "bun run browse/src/cli.ts",
|
||||
"server": "bun run browse/src/server.ts",
|
||||
"test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts",
|
||||
"test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts",
|
||||
"test:e2e": "SKILL_E2E=1 bun test test/skill-e2e.test.ts",
|
||||
"test:all": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts && SKILL_E2E=1 bun test test/skill-e2e.test.ts",
|
||||
"test:eval": "bun test test/skill-llm-eval.test.ts",
|
||||
"test:all": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts && SKILL_E2E=1 bun test test/skill-e2e.test.ts",
|
||||
"skill:check": "bun run scripts/skill-check.ts",
|
||||
"dev:skill": "bun run scripts/dev-skill.ts",
|
||||
"start": "bun run browse/src/server.ts"
|
||||
@@ -37,6 +38,7 @@
|
||||
"devtools"
|
||||
],
|
||||
"devDependencies": {
|
||||
"@anthropic-ai/claude-agent-sdk": "^0.2.75"
|
||||
"@anthropic-ai/claude-agent-sdk": "^0.2.75",
|
||||
"@anthropic-ai/sdk": "^0.78.0"
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user