fix: enrich SKILL.md docs to pass LLM evals, upgrade judge to Sonnet 4.6 (#43)

* fix: enrich command descriptions and snapshot flags for LLM eval quality

  14 command descriptions enriched with specific arg formats, valid values, error behavior, and return types. Fixed header usage from <name> <value> to <name>:<value>. Added cookie usage syntax. Snapshot flags now show long names, ref numbering, and output format examples.

* refactor: auto-generate server.ts help text from COMMAND_DESCRIPTIONS

  Replace hand-maintained help block with generateHelpText() that reads from COMMAND_DESCRIPTIONS and SNAPSHOT_FLAGS. Eliminates help text drift from source of truth.

* test: add usage consistency and pipe guard tests

  Usage consistency test cross-checks Usage: patterns in implementation against COMMAND_DESCRIPTIONS using structural skeleton comparison. Pipe guard test ensures descriptions don't contain | which would break markdown table rendering.

* chore: upgrade eval judge to Sonnet 4.6, update changelog

  Switch LLM-as-judge evals from Haiku to Sonnet 4.6 for more stable, nuanced scoring. Add changelog entry for all eval improvements.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-07-14 03:27:21 +02:00 · 2026-03-13 22:14:14 -07:00
parent 5205070299
commit a468374272
10 changed files with 233 additions and 112 deletions
@@ -139,6 +139,14 @@ describe('description quality evals', () => {
    }
  });

+  // Guard: descriptions must not contain pipe (breaks markdown table cells)
+  // Usage strings are backtick-wrapped in the table so pipes there are safe.
+  test('no command description contains pipe character', () => {
+    const offenders = Object.entries(COMMAND_DESCRIPTIONS)
+      .filter(([, meta]) => meta.description.includes('|')).map(([cmd]) => cmd);
+    expect(offenders).toEqual([]); // on failure, lists every offending command by name
+  });
+
  // Guard: generated output uses → not ->
  test('generated SKILL.md uses unicode arrows', () => {
    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
@@ -7,7 +7,7 @@
 * Requires: ANTHROPIC_API_KEY env var
 * Run: ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts
 *
- * Cost: ~$0.01-0.03 per run (haiku)
+ * Cost: ~$0.05-0.15 per run (sonnet)
 */

 import { describe, test, expect } from 'bun:test';
@@ -30,7 +30,7 @@ async function judge(section: string, prompt: string): Promise<JudgeScore> {
  const client = new Anthropic();

  const response = await client.messages.create({
-    model: 'claude-haiku-4-5-20251001',
+    model: 'claude-sonnet-4-6',
    max_tokens: 1024,
    messages: [{
      role: 'user',
@@ -158,7 +158,7 @@ describeEval('LLM-as-judge quality evals', () => {

    const client = new Anthropic();
    const response = await client.messages.create({
-      model: 'claude-haiku-4-5-20251001',
+      model: 'claude-sonnet-4-6',
      max_tokens: 1024,
      messages: [{
        role: 'user',
@@ -80,6 +80,59 @@ describe('Command registry consistency', () => {
  });
 });

+describe('Usage string consistency', () => {
+  // Normalize a usage string to its structural skeleton for comparison.
+  // Replaces <param-names> with <>, [optional] with [], strips parenthetical hints.
+  // This catches format mismatches (e.g., <name>:<value> vs <name> <value>)
+  // without tripping on abbreviation differences (e.g., <sel> vs <selector>).
+  function skeleton(usage: string): string {
+    return usage
+      .replace(/\(.*?\)/g, '')        // strip parenthetical hints like (e.g., Enter, Tab)
+      .replace(/<[^>]*>/g, '<>')      // normalize <param-name> → <>
+      .replace(/\[[^\]]*\]/g, '[]')   // normalize [optional] → []
+      .replace(/\s+/g, ' ')           // collapse whitespace
+      .trim();
+  }
+
+  // Cross-check Usage: patterns in implementation against COMMAND_DESCRIPTIONS
+  test('implementation Usage: structural format matches COMMAND_DESCRIPTIONS', () => {
+    const implFiles = [
+      path.join(ROOT, 'browse', 'src', 'write-commands.ts'),
+      path.join(ROOT, 'browse', 'src', 'read-commands.ts'),
+      path.join(ROOT, 'browse', 'src', 'meta-commands.ts'),
+    ];
+
+    // Extract "Usage: browse <pattern>" from throw new Error(...) calls (last match per command wins)
+    const usagePattern = /throw new Error\(['"`]Usage:\s*browse\s+(.+?)['"`]\)/g;
+    const implUsages = new Map<string, string>();
+
+    for (const file of implFiles) {
+      const content = fs.readFileSync(file, 'utf-8');
+      for (const match of content.matchAll(usagePattern)) {
+        const usage = match[1].split('\\n')[0].trim(); // drop anything after a literal \n escape
+        const cmd = usage.split(/\s/)[0];
+        implUsages.set(cmd, usage);
+      }
+    }
+    // Guard against a vacuous pass if the extraction regex ever stops matching.
+    expect(implUsages.size).toBeGreaterThan(0);
+
+    // Compare structural skeletons
+    const mismatches: string[] = [];
+    for (const [cmd, implUsage] of implUsages) {
+      const desc = COMMAND_DESCRIPTIONS[cmd];
+      if (!desc?.usage) continue; // nothing documented to compare against
+      const descSkel = skeleton(desc.usage);
+      const implSkel = skeleton(implUsage);
+      if (descSkel !== implSkel) {
+        mismatches.push(`${cmd}: docs "${desc.usage}" (${descSkel}) vs impl "${implUsage}" (${implSkel})`);
+      }
+    }
+
+    expect(mismatches).toEqual([]);
+  });
+});
+
 describe('Generated SKILL.md freshness', () => {
  test('no unresolved {{placeholders}} in generated SKILL.md', () => {
    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');