From 9dffb1ed16ddb3548d3d4da23019babd82728c92 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Fri, 13 Mar 2026 15:59:11 -0700
Subject: [PATCH] feat: LLM-as-judge evals for SKILL.md documentation quality

4 eval tests using the Anthropic API (claude-haiku, ~$0.01-0.03/run):

- Command reference table: clarity/completeness/actionability >= 4/5
- Snapshot flags section: same thresholds
- browse/SKILL.md overall quality
- Regression: generated version must score >= hand-maintained baseline

Requires ANTHROPIC_API_KEY. Auto-skips without it.

Run: bun run test:eval
(or ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts)
---
 package.json                |   8 +-
 test/skill-llm-eval.test.ts | 194 ++++++++++++++++++++++++++++++++++++
 2 files changed, 199 insertions(+), 3 deletions(-)
 create mode 100644 test/skill-llm-eval.test.ts

diff --git a/package.json b/package.json
index 27fe0a8c..15c9a9d2 100644
--- a/package.json
+++ b/package.json
@@ -12,9 +12,10 @@
     "gen:skill-docs": "bun run scripts/gen-skill-docs.ts",
     "dev": "bun run browse/src/cli.ts",
     "server": "bun run browse/src/server.ts",
-    "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts",
+    "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts",
     "test:e2e": "SKILL_E2E=1 bun test test/skill-e2e.test.ts",
-    "test:all": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts && SKILL_E2E=1 bun test test/skill-e2e.test.ts",
+    "test:eval": "bun test test/skill-llm-eval.test.ts",
+    "test:all": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts && SKILL_E2E=1 bun test test/skill-e2e.test.ts",
     "skill:check": "bun run scripts/skill-check.ts",
     "dev:skill": "bun run scripts/dev-skill.ts",
     "start": "bun run browse/src/server.ts"
@@ -37,6 +38,7 @@
     "devtools"
   ],
   "devDependencies": {
-    "@anthropic-ai/claude-agent-sdk": "^0.2.75"
+    "@anthropic-ai/claude-agent-sdk": "^0.2.75",
+    "@anthropic-ai/sdk": "^0.78.0"
   }
 }
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
new file mode 100644
index 00000000..308de814
--- /dev/null
+++ b/test/skill-llm-eval.test.ts
@@ -0,0 +1,194 @@
+/**
+ * LLM-as-a-Judge evals for generated SKILL.md quality.
+ *
+ * Uses the Anthropic API directly (not the Agent SDK) to evaluate whether
+ * generated command docs are clear, complete, and actionable for an AI agent.
+ *
+ * Requires: ANTHROPIC_API_KEY env var
+ * Run: ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts
+ *
+ * Cost: ~$0.01-0.03 per run (haiku)
+ */
+
+import { describe, test, expect } from 'bun:test';
+import Anthropic from '@anthropic-ai/sdk';
+import * as fs from 'fs';
+import * as path from 'path';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+const hasApiKey = !!process.env.ANTHROPIC_API_KEY;
+const describeEval = hasApiKey ? describe : describe.skip;
+
+interface JudgeScore {
+  clarity: number;       // 1-5: can an agent understand what each command does?
+  completeness: number;  // 1-5: are all args, flags, valid values documented?
+  actionability: number; // 1-5: can an agent use this to construct correct commands?
+  reasoning: string;     // why the scores were given
+}
+
+async function judge(section: string, prompt: string): Promise<JudgeScore> {
+  const client = new Anthropic();
+
+  const response = await client.messages.create({
+    model: 'claude-haiku-4-5-20251001',
+    max_tokens: 1024,
+    messages: [{
+      role: 'user',
+      content: `You are evaluating documentation quality for an AI coding agent's CLI tool reference.
+
+The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
+1. Understand what each command does
+2. Know what arguments to pass
+3. Know valid values for enum-like parameters
+4. Construct correct command invocations without guessing
+
+Rate the following ${section} on three dimensions (1-5 scale):
+
+- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
+- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
+- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?
+
+Scoring guide:
+- 5: Excellent — no ambiguity, all info present
+- 4: Good — minor gaps an experienced agent could infer
+- 3: Adequate — some guessing required
+- 2: Poor — significant info missing
+- 1: Unusable — agent would fail without external help
+
+Respond with ONLY valid JSON in this exact format:
+{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}
+
+Here is the ${section} to evaluate:
+
+${prompt}`,
+    }],
+  });
+
+  const text = response.content[0].type === 'text' ? response.content[0].text : '';
+  // Extract JSON from response (handle markdown code blocks)
+  const jsonMatch = text.match(/\{[\s\S]*\}/);
+  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
+  return JSON.parse(jsonMatch[0]) as JudgeScore;
+}
+
+describeEval('LLM-as-judge quality evals', () => {
+  test('command reference table scores >= 4 on all dimensions', async () => {
+    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    // Extract just the command reference section
+    const start = content.indexOf('## Command Reference');
+    const end = content.indexOf('## Tips');
+    const section = content.slice(start, end);
+
+    const scores = await judge('command reference table', section);
+    console.log('Command reference scores:', JSON.stringify(scores, null, 2));
+
+    expect(scores.clarity).toBeGreaterThanOrEqual(4);
+    expect(scores.completeness).toBeGreaterThanOrEqual(4);
+    expect(scores.actionability).toBeGreaterThanOrEqual(4);
+  }, 30_000);
+
+  test('snapshot flags section scores >= 4 on all dimensions', async () => {
+    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const start = content.indexOf('## Snapshot System');
+    const end = content.indexOf('## Command Reference');
+    const section = content.slice(start, end);
+
+    const scores = await judge('snapshot flags reference', section);
+    console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2));
+
+    expect(scores.clarity).toBeGreaterThanOrEqual(4);
+    expect(scores.completeness).toBeGreaterThanOrEqual(4);
+    expect(scores.actionability).toBeGreaterThanOrEqual(4);
+  }, 30_000);
+
+  test('browse/SKILL.md overall scores >= 4', async () => {
+    const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8');
+    // Just the reference sections (skip examples/patterns)
+    const start = content.indexOf('## Snapshot Flags');
+    const section = content.slice(start);
+
+    const scores = await judge('browse skill reference (flags + commands)', section);
+    console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2));
+
+    expect(scores.clarity).toBeGreaterThanOrEqual(4);
+    expect(scores.completeness).toBeGreaterThanOrEqual(4);
+    expect(scores.actionability).toBeGreaterThanOrEqual(4);
+  }, 30_000);
+
+  test('regression check: compare branch vs baseline quality', async () => {
+    // This test compares the generated output against the hand-maintained
+    // baseline from main. The generated version should score equal or higher.
+    const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const genStart = generated.indexOf('## Command Reference');
+    const genEnd = generated.indexOf('## Tips');
+    const genSection = generated.slice(genStart, genEnd);
+
+    const baseline = `## Command Reference
+
+### Navigation
+| Command | Description |
+|---------|-------------|
+| \`goto <url>\` | Navigate to URL |
+| \`back\` / \`forward\` | History navigation |
+| \`reload\` | Reload page |
+| \`url\` | Print current URL |
+
+### Interaction
+| Command | Description |
+|---------|-------------|
+| \`click <sel>\` | Click element |
+| \`fill <sel> <value>\` | Fill input |
+| \`select <sel> <value>\` | Select dropdown |
+| \`hover <sel>\` | Hover element |
+| \`type <text>\` | Type into focused element |
+| \`press <key>\` | Press key (Enter, Tab, Escape) |
+| \`scroll [sel]\` | Scroll element into view |
+| \`wait <sel>\` | Wait for element (max 10s) |
+| \`wait --networkidle\` | Wait for network to be idle |
+| \`wait --load\` | Wait for page load event |
+
+### Inspection
+| Command | Description |
+|---------|-------------|
+| \`js <code>\` | Run JavaScript |
+| \`css <sel> <prop>\` | Computed CSS |
+| \`attrs <sel>\` | Element attributes |
+| \`is <sel> <state>\` | State check (visible/hidden/enabled/disabled/checked/editable/focused) |
+| \`console [--clear\\|--errors]\` | Console messages (--errors filters to error/warning) |`;
+
+    const client = new Anthropic();
+    const response = await client.messages.create({
+      model: 'claude-haiku-4-5-20251001',
+      max_tokens: 1024,
+      messages: [{
+        role: 'user',
+        content: `You are comparing two versions of CLI documentation for an AI coding agent.
+
+VERSION A (baseline — hand-maintained):
+${baseline}
+
+VERSION B (auto-generated from source):
+${genSection}
+
+Which version is better for an AI agent trying to use these commands? Consider:
+- Completeness (more commands documented? all args shown?)
+- Clarity (descriptions helpful?)
+- Coverage (missing commands in either version?)
+
+Respond with ONLY valid JSON:
+{"winner": "A" or "B" or "tie", "reasoning": "brief explanation", "a_score": N, "b_score": N}
+
+Scores are 1-5 overall quality.`,
+      }],
+    });
+
+    const text = response.content[0].type === 'text' ? response.content[0].text : '';
+    const jsonMatch = text.match(/\{[\s\S]*\}/);
+    if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
+    const result = JSON.parse(jsonMatch[0]);
+    console.log('Regression comparison:', JSON.stringify(result, null, 2));
+
+    // Generated version should be at least as good as hand-maintained
+    expect(result.b_score).toBeGreaterThanOrEqual(result.a_score);
+  }, 30_000);
+});
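
For quick iteration on a single section without running the full suite, the same judge call works as a one-off script. A minimal sketch, assuming this patch is applied and ANTHROPIC_API_KEY is exported; the scripts/judge-section.ts path and its heading argument are hypothetical (not part of the patch), and the model is the same haiku judge the tests use:

// scripts/judge-section.ts (hypothetical helper, not part of this patch)
// Usage: ANTHROPIC_API_KEY=sk-... bun run scripts/judge-section.ts "## Command Reference"
import Anthropic from '@anthropic-ai/sdk';
import * as fs from 'fs';

const heading = process.argv[2] ?? '## Command Reference';
const doc = fs.readFileSync('SKILL.md', 'utf-8');

// Slice from the requested heading to the next same-level heading (or end of file).
const start = doc.indexOf(heading);
if (start === -1) throw new Error(`Heading not found: ${heading}`);
const next = doc.indexOf('\n## ', start + heading.length);
const section = doc.slice(start, next === -1 ? undefined : next);

// Same judge model and JSON-only contract as the tests.
const client = new Anthropic();
const response = await client.messages.create({
  model: 'claude-haiku-4-5-20251001',
  max_tokens: 1024,
  messages: [{
    role: 'user',
    content: `Rate this CLI reference for an AI agent on clarity, completeness, and actionability (1-5 each). Respond with ONLY valid JSON: {"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}\n\n${section}`,
  }],
});

const text = response.content[0].type === 'text' ? response.content[0].text : '';
const match = text.match(/\{[\s\S]*\}/);
if (!match) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
console.log(JSON.parse(match[0]));

Because the script skips the >= 4 assertions, it only reports scores; the tests remain the source of truth for thresholds.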