fix: enrich SKILL.md docs to pass LLM evals, upgrade judge to Sonnet 4.6 (#43)

* fix: enrich command descriptions and snapshot flags for LLM eval quality

14 command descriptions enriched with specific arg formats, valid values,
error behavior, and return types. Fixed header usage from <name> <value>
to <name>:<value>. Added cookie usage syntax. Snapshot flags now show
long names, ref numbering, and output format examples.

* refactor: auto-generate server.ts help text from COMMAND_DESCRIPTIONS

Replace hand-maintained help block with generateHelpText() that reads
from COMMAND_DESCRIPTIONS and SNAPSHOT_FLAGS. Eliminates help text
drift from source of truth.

* test: add usage consistency and pipe guard tests

Usage consistency test cross-checks Usage: patterns in implementation
against COMMAND_DESCRIPTIONS using structural skeleton comparison.
Pipe guard test ensures descriptions don't contain | which would break
markdown table rendering.

* chore: upgrade eval judge to Sonnet 4.6, update changelog

Switch LLM-as-judge evals from Haiku to Sonnet 4.6 for more stable,
nuanced scoring. Add changelog entry for all eval improvements.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-13 22:14:14 -07:00
committed by GitHub
parent 5205070299
commit a468374272
10 changed files with 233 additions and 112 deletions
+8
View File
@@ -139,6 +139,14 @@ describe('description quality evals', () => {
}
});
// Guard: descriptions must not contain pipe (breaks markdown table cells).
// Usage strings are backtick-wrapped in the table so pipes there are safe.
test('no command description contains pipe character', () => {
  // Collect the offending command names rather than asserting entry by entry,
  // so a failure reports exactly which descriptions need fixing.
  const offenders = Object.entries(COMMAND_DESCRIPTIONS)
    .filter(([, meta]) => meta.description.includes('|'))
    .map(([cmd]) => cmd);
  expect(offenders).toEqual([]);
});
// Guard: generated output uses → not ->
test('generated SKILL.md uses unicode arrows', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+3 -3
View File
@@ -7,7 +7,7 @@
* Requires: ANTHROPIC_API_KEY env var
* Run: ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts
*
* Cost: ~$0.01-0.03 per run (haiku)
* Cost: ~$0.05-0.15 per run (sonnet)
*/
import { describe, test, expect } from 'bun:test';
@@ -30,7 +30,7 @@ async function judge(section: string, prompt: string): Promise<JudgeScore> {
const client = new Anthropic();
const response = await client.messages.create({
model: 'claude-haiku-4-5-20251001',
model: 'claude-sonnet-4-6',
max_tokens: 1024,
messages: [{
role: 'user',
@@ -158,7 +158,7 @@ describeEval('LLM-as-judge quality evals', () => {
const client = new Anthropic();
const response = await client.messages.create({
model: 'claude-haiku-4-5-20251001',
model: 'claude-sonnet-4-6',
max_tokens: 1024,
messages: [{
role: 'user',
+53
View File
@@ -80,6 +80,59 @@ describe('Command registry consistency', () => {
});
});
describe('Usage string consistency', () => {
// Reduce a usage string to its structural skeleton so two usages can be compared by shape.
// Parenthetical hints are dropped, every <param> becomes <>, every [optional] becomes [],
// and runs of whitespace collapse to a single space.
// Format differences survive (e.g., <name>:<value> vs <name> <value>);
// naming differences do not (e.g., <sel> vs <selector>).
function skeleton(usage: string): string {
  // Drop parenthetical hints like (e.g., Enter, Tab).
  const withoutHints = usage.replace(/\(.*?\)/g, '');
  // <param-name> → <>
  const paramsNormalized = withoutHints.replace(/<[^>]*>/g, '<>');
  // [optional] → [] (runs after the <> pass, so a bracket group containing params still collapses)
  const optionalsNormalized = paramsNormalized.replace(/\[[^\]]*\]/g, '[]');
  // Collapse whitespace left behind by the removals, then trim the ends.
  return optionalsNormalized.replace(/\s+/g, ' ').trim();
}
// Cross-check: each "Usage: browse …" error thrown by the command implementations
// must have the same structural skeleton as the usage documented in COMMAND_DESCRIPTIONS.
test('implementation Usage: structural format matches COMMAND_DESCRIPTIONS', () => {
  const sourceFiles = ['write-commands.ts', 'read-commands.ts', 'meta-commands.ts'].map(
    (fileName) => path.join(ROOT, 'browse', 'src', fileName),
  );

  // Captures the text following "Usage: browse" inside throw new Error(...) calls.
  const usagePattern = /throw new Error\(['"`]Usage:\s*browse\s+(.+?)['"`]\)/g;

  // Command name → usage text. A later occurrence of the same command replaces an earlier one.
  const usageByCommand = new Map<string, string>();
  for (const file of sourceFiles) {
    const source = fs.readFileSync(file, 'utf-8');
    for (const hit of source.matchAll(usagePattern)) {
      // Keep only the text before the first escaped newline (a literal backslash-n in the source).
      const firstLine = hit[1].split('\\n')[0].trim();
      const commandName = firstLine.split(/\s/)[0];
      usageByCommand.set(commandName, firstLine);
    }
  }

  // Compare skeletons; commands that are undocumented or have no documented usage are skipped.
  const mismatches: string[] = [];
  for (const [cmd, implUsage] of usageByCommand) {
    const documentedUsage = COMMAND_DESCRIPTIONS[cmd]?.usage;
    if (!documentedUsage) continue;

    const descSkel = skeleton(documentedUsage);
    const implSkel = skeleton(implUsage);
    if (descSkel === implSkel) continue;

    mismatches.push(`${cmd}: docs "${documentedUsage}" (${descSkel}) vs impl "${implUsage}" (${implSkel})`);
  }

  expect(mismatches).toEqual([]);
});
});
describe('Generated SKILL.md freshness', () => {
test('no unresolved {{placeholders}} in generated SKILL.md', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');