feat: LLM-as-judge evals for SKILL.md documentation quality
4 eval tests using Anthropic API (claude-haiku, ~$0.01-0.03/run):

- Command reference table: clarity/completeness/actionability >= 4/5
- Snapshot flags section: same thresholds
- browse/SKILL.md overall quality
- Regression: generated version must score >= hand-maintained baseline

Requires ANTHROPIC_API_KEY. Auto-skips without it.
Run: bun run test:eval
(or ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts)
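For reference, each eval prompts the judge for a single JSON object and then asserts per-dimension thresholds. A minimal sketch of that contract (the `JudgeScore` shape comes from the test file below; the `passes` helper is illustrative, not part of the commit):

```ts
// Shape the judge is instructed to return (see the prompt in test/skill-llm-eval.test.ts).
interface JudgeScore {
  clarity: number;       // 1-5: understandable from the description alone?
  completeness: number;  // 1-5: args, flags, and valid values all documented?
  actionability: number; // 1-5: enough to construct correct invocations?
  reasoning: string;     // brief explanation from the judge
}

// Illustrative gate mirroring the test assertions: every dimension must clear the bar.
const passes = (s: JudgeScore, threshold = 4): boolean =>
  s.clarity >= threshold && s.completeness >= threshold && s.actionability >= threshold;
```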
package.json
@@ -12,9 +12,10 @@
     "gen:skill-docs": "bun run scripts/gen-skill-docs.ts",
     "dev": "bun run browse/src/cli.ts",
     "server": "bun run browse/src/server.ts",
-    "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts",
+    "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts",
     "test:e2e": "SKILL_E2E=1 bun test test/skill-e2e.test.ts",
-    "test:all": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts && SKILL_E2E=1 bun test test/skill-e2e.test.ts",
+    "test:eval": "bun test test/skill-llm-eval.test.ts",
+    "test:all": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts && SKILL_E2E=1 bun test test/skill-e2e.test.ts",
     "skill:check": "bun run scripts/skill-check.ts",
     "dev:skill": "bun run scripts/dev-skill.ts",
     "start": "bun run browse/src/server.ts"
@@ -37,6 +38,7 @@
     "devtools"
   ],
   "devDependencies": {
-    "@anthropic-ai/claude-agent-sdk": "^0.2.75"
+    "@anthropic-ai/claude-agent-sdk": "^0.2.75",
+    "@anthropic-ai/sdk": "^0.78.0"
   }
 }
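Note that `test:eval` relies on the test file's auto-skip when ANTHROPIC_API_KEY is unset, so a keyless run passes silently. A wrapper that fails loudly instead might look like this (hypothetical `scripts/run-evals.ts`, not part of this commit):

```ts
// scripts/run-evals.ts (hypothetical): refuse to run evals without an API key,
// rather than letting the suite auto-skip and report green.
import { spawnSync } from 'child_process';

if (!process.env.ANTHROPIC_API_KEY) {
  console.error('ANTHROPIC_API_KEY is not set; LLM-as-judge evals would be skipped.');
  process.exit(1);
}

const { status } = spawnSync('bun', ['test', 'test/skill-llm-eval.test.ts'], { stdio: 'inherit' });
process.exit(status ?? 1);
```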
test/skill-llm-eval.test.ts
@@ -0,0 +1,194 @@
/**
 * LLM-as-a-Judge evals for generated SKILL.md quality.
 *
 * Uses the Anthropic API directly (not Agent SDK) to evaluate whether
 * generated command docs are clear, complete, and actionable for an AI agent.
 *
 * Requires: ANTHROPIC_API_KEY env var
 * Run: ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts
 *
 * Cost: ~$0.01-0.03 per run (haiku)
 */

import { describe, test, expect } from 'bun:test';
import Anthropic from '@anthropic-ai/sdk';
import * as fs from 'fs';
import * as path from 'path';

const ROOT = path.resolve(import.meta.dir, '..');
const hasApiKey = !!process.env.ANTHROPIC_API_KEY;
const describeEval = hasApiKey ? describe : describe.skip;

interface JudgeScore {
  clarity: number;       // 1-5: can an agent understand what each command does?
  completeness: number;  // 1-5: are all args, flags, valid values documented?
  actionability: number; // 1-5: can an agent use this to construct correct commands?
  reasoning: string;     // why the scores were given
}

async function judge(section: string, prompt: string): Promise<JudgeScore> {
  const client = new Anthropic();

  const response = await client.messages.create({
    model: 'claude-haiku-4-5-20251001',
    max_tokens: 1024,
    messages: [{
      role: 'user',
      content: `You are evaluating documentation quality for an AI coding agent's CLI tool reference.

The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
1. Understand what each command does
2. Know what arguments to pass
3. Know valid values for enum-like parameters
4. Construct correct command invocations without guessing

Rate the following ${section} on three dimensions (1-5 scale):

- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?

Scoring guide:
- 5: Excellent — no ambiguity, all info present
- 4: Good — minor gaps an experienced agent could infer
- 3: Adequate — some guessing required
- 2: Poor — significant info missing
- 1: Unusable — agent would fail without external help

Respond with ONLY valid JSON in this exact format:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the ${section} to evaluate:

${prompt}`,
    }],
  });

  const text = response.content[0].type === 'text' ? response.content[0].text : '';
  // Extract JSON from response (handle markdown code blocks)
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
  return JSON.parse(jsonMatch[0]) as JudgeScore;
}

describeEval('LLM-as-judge quality evals', () => {
  test('command reference table scores >= 4 on all dimensions', async () => {
    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    // Extract just the command reference section
    const start = content.indexOf('## Command Reference');
    const end = content.indexOf('## Tips');
    const section = content.slice(start, end);

    const scores = await judge('command reference table', section);
    console.log('Command reference scores:', JSON.stringify(scores, null, 2));

    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('snapshot flags section scores >= 4 on all dimensions', async () => {
    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const start = content.indexOf('## Snapshot System');
    const end = content.indexOf('## Command Reference');
    const section = content.slice(start, end);

    const scores = await judge('snapshot flags reference', section);
    console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2));

    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('browse/SKILL.md overall scores >= 4', async () => {
    const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8');
    // Just the reference sections (skip examples/patterns)
    const start = content.indexOf('## Snapshot Flags');
    const section = content.slice(start);

    const scores = await judge('browse skill reference (flags + commands)', section);
    console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2));

    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('regression check: compare branch vs baseline quality', async () => {
    // This test compares the generated output against the hand-maintained
    // baseline from main. The generated version should score equal or higher.
    const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const genStart = generated.indexOf('## Command Reference');
    const genEnd = generated.indexOf('## Tips');
    const genSection = generated.slice(genStart, genEnd);

    const baseline = `## Command Reference

### Navigation
| Command | Description |
|---------|-------------|
| \`goto <url>\` | Navigate to URL |
| \`back\` / \`forward\` | History navigation |
| \`reload\` | Reload page |
| \`url\` | Print current URL |

### Interaction
| Command | Description |
|---------|-------------|
| \`click <sel>\` | Click element |
| \`fill <sel> <val>\` | Fill input |
| \`select <sel> <val>\` | Select dropdown |
| \`hover <sel>\` | Hover element |
| \`type <text>\` | Type into focused element |
| \`press <key>\` | Press key (Enter, Tab, Escape) |
| \`scroll [sel]\` | Scroll element into view |
| \`wait <sel>\` | Wait for element (max 10s) |
| \`wait --networkidle\` | Wait for network to be idle |
| \`wait --load\` | Wait for page load event |

### Inspection
| Command | Description |
|---------|-------------|
| \`js <expr>\` | Run JavaScript |
| \`css <sel> <prop>\` | Computed CSS |
| \`attrs <sel>\` | Element attributes |
| \`is <prop> <sel>\` | State check (visible/hidden/enabled/disabled/checked/editable/focused) |
| \`console [--clear\\|--errors]\` | Console messages (--errors filters to error/warning) |`;

    const client = new Anthropic();
    const response = await client.messages.create({
      model: 'claude-haiku-4-5-20251001',
      max_tokens: 1024,
      messages: [{
        role: 'user',
        content: `You are comparing two versions of CLI documentation for an AI coding agent.

VERSION A (baseline — hand-maintained):
${baseline}

VERSION B (auto-generated from source):
${genSection}

Which version is better for an AI agent trying to use these commands? Consider:
- Completeness (more commands documented? all args shown?)
- Clarity (descriptions helpful?)
- Coverage (missing commands in either version?)

Respond with ONLY valid JSON:
{"winner": "A" or "B" or "tie", "reasoning": "brief explanation", "a_score": N, "b_score": N}

Scores are 1-5 overall quality.`,
      }],
    });

    const text = response.content[0].type === 'text' ? response.content[0].text : '';
    const jsonMatch = text.match(/\{[\s\S]*\}/);
    if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
    const result = JSON.parse(jsonMatch[0]);
    console.log('Regression comparison:', JSON.stringify(result, null, 2));

    // Generated version should be at least as good as hand-maintained
    expect(result.b_score).toBeGreaterThanOrEqual(result.a_score);
  }, 30_000);
});
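The same judge call is also useful interactively while editing docs, outside the pass/fail harness. A standalone sketch under the same assumptions (hypothetical script name and abbreviated prompt wording; the API call mirrors the one in the test file):

```ts
// scripts/judge-section.ts (hypothetical): print raw judge scores for one section.
// Usage: ANTHROPIC_API_KEY=sk-... bun run scripts/judge-section.ts SKILL.md "## Command Reference"
import Anthropic from '@anthropic-ai/sdk';
import * as fs from 'fs';

const [, , file, heading] = process.argv;
const content = fs.readFileSync(file, 'utf-8');
const start = heading ? content.indexOf(heading) : 0;
const section = start >= 0 ? content.slice(start) : content;

const client = new Anthropic(); // reads ANTHROPIC_API_KEY from the environment
const response = await client.messages.create({
  model: 'claude-haiku-4-5-20251001',
  max_tokens: 1024,
  messages: [{
    role: 'user',
    content: `Rate this CLI reference for an AI agent on clarity, completeness, and actionability (1-5 each). Respond with ONLY JSON: {"clarity": N, "completeness": N, "actionability": N, "reasoning": "..."}\n\n${section}`,
  }],
});

const text = response.content[0].type === 'text' ? response.content[0].text : '';
console.log(text);
```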