feat: SKILL.md template system, 3-tier testing, DX tools (v0.3.3) (#41)

* refactor: extract command registry to commands.ts, add SNAPSHOT_FLAGS metadata - NEW: browse/src/commands.ts — command sets + COMMAND_DESCRIPTIONS + load-time validation (zero side effects) - server.ts imports from commands.ts instead of declaring sets inline - snapshot.ts: SNAPSHOT_FLAGS array drives parseSnapshotArgs (metadata-driven, no duplication) - All 186 existing tests pass * feat: SKILL.md template system with auto-generated command references - SKILL.md.tmpl + browse/SKILL.md.tmpl with {{COMMAND_REFERENCE}} and {{SNAPSHOT_FLAGS}} placeholders - scripts/gen-skill-docs.ts generates SKILL.md from templates (supports --dry-run) - Build pipeline runs gen:skill-docs before binary compilation - Generated files have AUTO-GENERATED header, committed to git * test: Tier 1 static validation — 34 tests for SKILL.md command correctness - test/helpers/skill-parser.ts: extracts $B commands from code blocks, validates against registry - test/skill-parser.test.ts: 13 parser/validator unit tests - test/skill-validation.test.ts: 13 tests validating all SKILL.md files + registry consistency - test/gen-skill-docs.test.ts: 8 generator tests (categories, sorting, freshness) * feat: DX tools (skill:check, dev:skill) + Tier 2 E2E test scaffolding - scripts/skill-check.ts: health summary for all SKILL.md files (commands, templates, freshness) - scripts/dev-skill.ts: watch mode for template development - test/helpers/session-runner.ts: Agent SDK wrapper for E2E skill tests - test/skill-e2e.test.ts: 2 E2E tests + 3 stubs (auto-skip inside Claude Code sessions) - E2E tests must run from plain terminal: SKILL_E2E=1 bun test test/skill-e2e.test.ts * ci: SKILL.md freshness check on push/PR + TODO updates - .github/workflows/skill-docs.yml: fails if generated SKILL.md files are stale - TODO.md: add E2E cost tracking and model pinning to future ideas * fix: restore rich descriptions lost in auto-generation - Snapshot flags: add back value hints (-d <N>, -s <sel>, -o <path>) - Snapshot 
flags: restore parenthetical context (@e refs, @c refs, etc.) - Commands: is → includes valid states enum - Commands: console → notes --errors filter behavior - Commands: press → lists common keys (Enter, Tab, Escape) - Commands: cookie-import-browser → describes picker UI - Commands: dialog-accept → specifies alert/confirm/prompt - Tips: restore → arrow (was downgraded to ->) * test: quality evals for generated SKILL.md descriptions Catches the exact regressions we shipped and caught in review: - Snapshot flags must include value hints (-d <N>, -s <sel>, -o <path>) - is command must list all valid states (visible/hidden/enabled/...) - press command must list example keys (Enter, Tab, Escape) - console command must describe --errors behavior - Snapshot -i must mention @e refs, -C must mention @c refs - All descriptions must be >= 8 chars (no empty stubs) - Tips section must use → not -> * feat: LLM-as-judge evals for SKILL.md documentation quality 4 eval tests using Anthropic API (claude-haiku, ~$0.01-0.03/run): - Command reference table: clarity/completeness/actionability >= 4/5 - Snapshot flags section: same thresholds - browse/SKILL.md overall quality - Regression: generated version must score >= hand-maintained baseline Requires ANTHROPIC_API_KEY. Auto-skips without it. Run: bun run test:eval (or ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts) * chore: bump version to 0.3.3, update changelog Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * docs: add ARCHITECTURE.md, update CLAUDE.md and CONTRIBUTING.md Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * feat: conductor.json lifecycle hooks + .env propagation across worktrees bin/dev-setup now copies .env from main worktree so API keys carry over to Conductor workspaces automatically. conductor.json wires up setup and archive hooks. 
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * docs: complete CHANGELOG for v0.3.3 (architecture, conductor, .env) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-01 19:25:10 +02:00 · 2026-03-13 21:08:12 -07:00
parent ea0c0dad5e
commit 5205070299
29 changed files with 2479 additions and 135 deletions
@@ -0,0 +1,82 @@
+#!/usr/bin/env bun
+/**
+ * dev:skill — Watch mode for SKILL.md template development.
+ *
+ * Watches .tmpl files, regenerates SKILL.md files on change,
+ * validates all $B commands immediately.
+ */
+
+import { validateSkill } from '../test/helpers/skill-parser';
+import { execSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+
+// Template sources paired with the file each one generates. `tmpl` is an
+// absolute path; `output` stays relative to ROOT (it is printed in status
+// lines and joined onto ROOT when the generated file is validated).
+const TEMPLATES = ['SKILL.md', 'browse/SKILL.md'].map((output) => ({
+  tmpl: path.join(ROOT, `${output}.tmpl`),
+  output,
+}));
+
+/**
+ * Best-effort description of a failed `execSync` call: the child's stderr
+ * when it produced any, otherwise the error's own message.
+ */
+function describeExecFailure(err: unknown): string {
+  if (typeof err === 'object' && err !== null && 'stderr' in err) {
+    const stderr = (err as { stderr?: unknown }).stderr;
+    const text = stderr == null ? '' : String(stderr).trim();
+    if (text) return text;
+  }
+  return err instanceof Error ? err.message : String(err);
+}
+
+/**
+ * Runs scripts/gen-skill-docs.ts, then validates every generated file listed
+ * in TEMPLATES and prints one status line per file (plus one detail line per
+ * unknown command or snapshot-flag error).
+ *
+ * If generation fails, the error is printed and validation is skipped.
+ */
+function regenerateAndValidate() {
+  // Regenerate
+  try {
+    execSync('bun run scripts/gen-skill-docs.ts', { cwd: ROOT, stdio: 'pipe' });
+  } catch (err: unknown) {
+    console.log(`  [gen]   ERROR: ${describeExecFailure(err)}`);
+    return;
+  }
+
+  // Validate each generated file
+  for (const { output } of TEMPLATES) {
+    const fullPath = path.join(ROOT, output);
+    // A template that does not exist produces no output; nothing to validate.
+    if (!fs.existsSync(fullPath)) continue;
+
+    const result = validateSkill(fullPath);
+    const totalValid = result.valid.length;
+    const totalInvalid = result.invalid.length;
+    const totalSnapErrors = result.snapshotFlagErrors.length;
+
+    if (totalInvalid > 0 || totalSnapErrors > 0) {
+      console.log(`  [check] \u274c ${output} (${totalValid} valid)`);
+      for (const inv of result.invalid) {
+        console.log(`          Unknown command: '${inv.command}' at line ${inv.line}`);
+      }
+      for (const se of result.snapshotFlagErrors) {
+        console.log(`          ${se.error} at line ${se.command.line}`);
+      }
+    } else {
+      console.log(`  [check] \u2705 ${output} — ${totalValid} commands, all valid`);
+    }
+  }
+}
+
+// Initial run
+console.log('  [watch] Watching *.md.tmpl files...');
+regenerateAndValidate();
+
+// Also watch commands.ts and snapshot.ts (source of truth changes)
+const SOURCE_FILES = [
+  path.join(ROOT, 'browse', 'src', 'commands.ts'),
+  path.join(ROOT, 'browse', 'src', 'snapshot.ts'),
+];
+
+// fs.watch may report a single save as several events, so changes are
+// collected for a short window and handled with one regeneration.
+const DEBOUNCE_MS = 100;
+const changedFiles = new Set<string>();
+let flushTimer: ReturnType<typeof setTimeout> | undefined;
+
+/** Reports every file changed during the debounce window, then regenerates once. */
+function flushChanges(): void {
+  flushTimer = undefined;
+  for (const file of changedFiles) {
+    console.log(`\n  [watch] ${path.relative(ROOT, file)} changed`);
+  }
+  changedFiles.clear();
+  regenerateAndValidate();
+}
+
+/** Starts watching `file` if it exists; missing files are skipped silently. */
+function watchForChanges(file: string): void {
+  if (!fs.existsSync(file)) return;
+  fs.watch(file, () => {
+    changedFiles.add(file);
+    // Restart the window so a burst of events collapses into one run.
+    if (flushTimer !== undefined) clearTimeout(flushTimer);
+    flushTimer = setTimeout(flushChanges, DEBOUNCE_MS);
+  });
+}
+
+// Watch for changes
+for (const { tmpl } of TEMPLATES) watchForChanges(tmpl);
+for (const src of SOURCE_FILES) watchForChanges(src);
+
+// Keep alive (the open watchers keep the process running)
+console.log('  [watch] Press Ctrl+C to stop\n');
@@ -0,0 +1,163 @@
+#!/usr/bin/env bun
+/**
+ * Generate SKILL.md files from .tmpl templates.
+ *
+ * Pipeline:
+ *   read .tmpl → find {{PLACEHOLDERS}} → resolve from source → format → write .md
+ *
+ * Supports --dry-run: generate to memory, exit 1 if different from committed file.
+ * Used by skill:check and CI freshness checks.
+ */
+
+import { COMMAND_DESCRIPTIONS } from '../browse/src/commands';
+import { SNAPSHOT_FLAGS } from '../browse/src/snapshot';
+import * as fs from 'fs';
+import * as path from 'path';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+const DRY_RUN = process.argv.includes('--dry-run');
+
+// ─── Placeholder Resolvers ──────────────────────────────────
+
+/**
+ * Builds the markdown that replaces {{COMMAND_REFERENCE}}: one `###` section
+ * per command category, each holding a two-column table (command or usage
+ * string, description) sorted by command name.
+ *
+ * Categories appear in a fixed preferred order. Any category not in that
+ * list is appended afterwards (sorted by name) instead of being dropped, so
+ * a newly introduced category cannot silently vanish from the docs.
+ *
+ * @returns The markdown text, without a trailing newline.
+ */
+function generateCommandReference(): string {
+  type Row = { command: string; description: string; usage?: string };
+
+  // Group commands by category
+  const groups = new Map<string, Row[]>();
+  for (const [cmd, meta] of Object.entries(COMMAND_DESCRIPTIONS)) {
+    const rows = groups.get(meta.category) ?? [];
+    rows.push({ command: cmd, description: meta.description, usage: meta.usage });
+    groups.set(meta.category, rows);
+  }
+
+  // Category display order
+  const categoryOrder = [
+    'Navigation', 'Reading', 'Interaction', 'Inspection',
+    'Visual', 'Snapshot', 'Meta', 'Tabs', 'Server',
+  ];
+
+  // Categories present in the registry but missing from the list above.
+  const unlisted = Array.from(groups.keys())
+    .filter((category) => !categoryOrder.includes(category))
+    .sort();
+
+  const sections: string[] = [];
+  for (const category of [...categoryOrder, ...unlisted]) {
+    const commands = groups.get(category);
+    if (!commands || commands.length === 0) continue;
+
+    // Sort alphabetically within category
+    commands.sort((a, b) => a.command.localeCompare(b.command));
+
+    sections.push(`### ${category}`);
+    sections.push('| Command | Description |');
+    sections.push('|---------|-------------|');
+    for (const cmd of commands) {
+      // Prefer the usage string (command plus arguments) when one is defined.
+      const display = cmd.usage ? `\`${cmd.usage}\`` : `\`${cmd.command}\``;
+      sections.push(`| ${display} | ${cmd.description} |`);
+    }
+    sections.push('');
+  }
+
+  // Each section ends with a blank entry; drop the final one.
+  return sections.join('\n').trimEnd();
+}
+
+/**
+ * Builds the markdown that replaces {{SNAPSHOT_FLAGS}}: an intro sentence, a
+ * fenced list of every snapshot flag with its description, and fixed usage
+ * examples for @refs.
+ *
+ * Flag labels are padded to a common column. The column is at least 10
+ * characters wide and grows when a label would otherwise touch its
+ * description, so at least one space always separates the two.
+ *
+ * @returns The markdown text, without a trailing newline.
+ */
+function generateSnapshotFlags(): string {
+  const MIN_LABEL_COLUMN = 10;
+
+  const rows = SNAPSHOT_FLAGS.map((flag) => ({
+    label: flag.valueHint ? `${flag.short} ${flag.valueHint}` : flag.short,
+    description: flag.description,
+  }));
+
+  // Longest label plus one separating space, never narrower than the minimum.
+  const labelColumn = rows.reduce(
+    (width, row) => Math.max(width, row.label.length + 1),
+    MIN_LABEL_COLUMN,
+  );
+
+  const lines: string[] = [
+    'The snapshot is your primary tool for understanding and interacting with pages.',
+    '',
+    '```',
+  ];
+
+  for (const row of rows) {
+    lines.push(`${row.label.padEnd(labelColumn)}${row.description}`);
+  }
+
+  lines.push('```');
+  lines.push('');
+  lines.push('Combine flags: `$B snapshot -i -a -C -o /tmp/annotated.png`');
+  lines.push('');
+  lines.push('After snapshot, use @refs everywhere:');
+  lines.push('```bash');
+  lines.push('$B click @e3       $B fill @e4 "value"     $B hover @e1');
+  lines.push('$B html @e2        $B css @e5 "color"      $B attrs @e6');
+  lines.push('$B click @c1       # cursor-interactive ref (from -C)');
+  lines.push('```');
+  lines.push('');
+  lines.push('Refs are invalidated on navigation — run `snapshot` again after `goto`.');
+
+  return lines.join('\n');
+}
+
+// Maps each {{PLACEHOLDER}} name to the function producing its replacement text.
+const RESOLVERS: Record<string, () => string> = {
+  'COMMAND_REFERENCE': generateCommandReference,
+  'SNAPSHOT_FLAGS': generateSnapshotFlags,
+};
+
+// ─── Template Processing ────────────────────────────────────
+
+// Banner written into every generated file; {{SOURCE}} is replaced with the
+// template's file name when the banner is inserted.
+const GENERATED_HEADER = [
+  '<!-- AUTO-GENERATED from {{SOURCE}} — do not edit directly -->',
+  '<!-- Regenerate: bun run gen:skill-docs -->',
+  '',
+].join('\n');
+
+/**
+ * Renders one `.tmpl` file into the content of its generated counterpart.
+ *
+ * Steps: read the template, replace every `{{NAME}}` placeholder with the
+ * output of the matching resolver, then insert the AUTO-GENERATED banner
+ * directly after the frontmatter block, or at the very top when the file has
+ * no frontmatter.
+ *
+ * Frontmatter is recognised only when the file starts with a `---` line and
+ * has a later line consisting of `---`. Other `---` runs in the body (such
+ * as the separator row of a generated table) are not treated as fences.
+ *
+ * @param tmplPath Path to the template; the output path is the same path
+ *   with the trailing `.tmpl` removed.
+ * @returns The output path and the fully rendered content (nothing is written).
+ * @throws Error if the template uses a placeholder with no resolver, or if
+ *   placeholders remain after substitution.
+ */
+function processTemplate(tmplPath: string): { outputPath: string; content: string } {
+  const tmplContent = fs.readFileSync(tmplPath, 'utf-8');
+  const relTmplPath = path.relative(ROOT, tmplPath);
+  const outputPath = tmplPath.replace(/\.tmpl$/, '');
+
+  // Replace placeholders. Only own keys of RESOLVERS count, so names such as
+  // {{toString}} are rejected instead of resolving to an inherited method.
+  let content = tmplContent.replace(/\{\{(\w+)\}\}/g, (_match: string, name: string) => {
+    const resolver = Object.prototype.hasOwnProperty.call(RESOLVERS, name)
+      ? RESOLVERS[name]
+      : undefined;
+    if (!resolver) throw new Error(`Unknown placeholder {{${name}}} in ${relTmplPath}`);
+    return resolver();
+  });
+
+  // Check for any remaining unresolved placeholders (e.g. emitted by a resolver)
+  const remaining = content.match(/\{\{(\w+)\}\}/g);
+  if (remaining) {
+    throw new Error(`Unresolved placeholders in ${relTmplPath}: ${remaining.join(', ')}`);
+  }
+
+  // Function replacer: the file name is inserted literally, `$` included.
+  const header = GENERATED_HEADER.replace('{{SOURCE}}', () => path.basename(tmplPath));
+
+  // Prepend generated header (after frontmatter)
+  const frontmatter = /^---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/.exec(content);
+  if (frontmatter) {
+    const block = frontmatter[0];
+    // A closing fence at end-of-file has no newline; add one so the banner
+    // starts on its own line.
+    const terminated = block.endsWith('\n') ? block : `${block}\n`;
+    content = terminated + header + content.slice(block.length);
+  } else {
+    content = header + content;
+  }
+
+  return { outputPath, content };
+}
+
+// ─── Main ───────────────────────────────────────────────────
+
+/**
+ * Lists the known template locations that exist on disk, as absolute paths
+ * in a fixed order (root template first, then browse/).
+ */
+function findTemplates(): string[] {
+  const knownLocations = [
+    path.join(ROOT, 'SKILL.md.tmpl'),
+    path.join(ROOT, 'browse', 'SKILL.md.tmpl'),
+  ];
+  return knownLocations.filter((location) => fs.existsSync(location));
+}
+
+// Set when a dry run finds a generated file that differs from what is on disk.
+let hasChanges = false;
+
+for (const tmplPath of findTemplates()) {
+  const rendered = processTemplate(tmplPath);
+  const relOutput = path.relative(ROOT, rendered.outputPath);
+
+  // Normal mode: write the file and move on.
+  if (!DRY_RUN) {
+    fs.writeFileSync(rendered.outputPath, rendered.content);
+    console.log(`GENERATED: ${relOutput}`);
+    continue;
+  }
+
+  // Dry run: compare with the file on disk; a missing file counts as empty.
+  const onDisk = fs.existsSync(rendered.outputPath)
+    ? fs.readFileSync(rendered.outputPath, 'utf-8')
+    : '';
+  if (onDisk === rendered.content) {
+    console.log(`FRESH: ${relOutput}`);
+  } else {
+    console.log(`STALE: ${relOutput}`);
+    hasChanges = true;
+  }
+}
+
+// A stale file fails the dry run with exit code 1.
+if (hasChanges && DRY_RUN) {
+  console.error('\nGenerated SKILL.md files are stale. Run: bun run gen:skill-docs');
+  process.exit(1);
+}
@@ -0,0 +1,111 @@
+#!/usr/bin/env bun
+/**
+ * skill:check — Health summary for all SKILL.md files.
+ *
+ * Reports:
+ *   - Command validation (valid/invalid/snapshot errors)
+ *   - Template coverage (which SKILL.md files have .tmpl sources)
+ *   - Freshness check (generated files match committed files)
+ */
+
+import { validateSkill } from '../test/helpers/skill-parser';
+import * as fs from 'fs';
+import * as path from 'path';
+import { execSync } from 'child_process';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+
+// Known SKILL.md locations (relative to ROOT, '/'-separated), limited to the
+// ones that exist on disk. '.' is the repository root.
+const SKILL_FILES = [
+  '.',
+  'browse',
+  'qa',
+  'ship',
+  'review',
+  'retro',
+  'plan-ceo-review',
+  'plan-eng-review',
+  'setup-browser-cookies',
+]
+  .map((dir) => path.posix.join(dir, 'SKILL.md'))
+  .filter((rel) => fs.existsSync(path.join(ROOT, rel)));
+
+// Flipped to true by any failing check; decides the exit code at the end.
+let hasErrors = false;
+
+// ─── Skills ─────────────────────────────────────────────────
+
+console.log('  Skills:');
+for (const file of SKILL_FILES) {
+  const report = validateSkill(path.join(ROOT, file));
+  const label = file.padEnd(30);
+
+  // A file with warnings gets only its warning line; its commands are not reported.
+  if (report.warnings.length > 0) {
+    console.log(`  \u26a0\ufe0f  ${label} — ${report.warnings.join(', ')}`);
+    continue;
+  }
+
+  const validCount = report.valid.length;
+  const invalidCount = report.invalid.length;
+  const snapErrorCount = report.snapshotFlagErrors.length;
+
+  if (invalidCount === 0 && snapErrorCount === 0) {
+    console.log(`  \u2705 ${label} — ${validCount} commands, all valid`);
+    continue;
+  }
+
+  // At least one unknown command or bad snapshot flag: summary line, then details.
+  hasErrors = true;
+  console.log(`  \u274c ${label} — ${validCount} valid, ${invalidCount} invalid, ${snapErrorCount} snapshot errors`);
+  report.invalid.forEach((inv) => {
+    console.log(`      line ${inv.line}: unknown command '${inv.command}'`);
+  });
+  report.snapshotFlagErrors.forEach((se) => {
+    console.log(`      line ${se.command.line}: ${se.error}`);
+  });
+}
+
+// ─── Templates ──────────────────────────────────────────────
+
+console.log('\n  Templates:');
+// Template/output pairs, both relative to ROOT.
+const TEMPLATES = [
+  { tmpl: 'SKILL.md.tmpl', output: 'SKILL.md' },
+  { tmpl: 'browse/SKILL.md.tmpl', output: 'browse/SKILL.md' },
+];
+
+for (const { tmpl, output } of TEMPLATES) {
+  const hasTemplate = fs.existsSync(path.join(ROOT, tmpl));
+  if (!hasTemplate) {
+    console.log(`  \u26a0\ufe0f  ${output.padEnd(30)} — no template`);
+  } else if (!fs.existsSync(path.join(ROOT, output))) {
+    // A template without its generated file is an error.
+    hasErrors = true;
+    console.log(`  \u274c ${output.padEnd(30)} — generated file missing! Run: bun run gen:skill-docs`);
+  } else {
+    console.log(`  \u2705 ${tmpl.padEnd(30)} \u2192 ${output}`);
+  }
+}
+
+// Skills without templates (outputs already reported above are skipped)
+const templatedOutputs = new Set(TEMPLATES.map((t) => t.output));
+for (const file of SKILL_FILES) {
+  if (fs.existsSync(path.join(ROOT, file + '.tmpl')) || templatedOutputs.has(file)) continue;
+  console.log(`  \u26a0\ufe0f  ${file.padEnd(30)} — no template (OK if no $B commands)`);
+}
+
+// ─── Freshness ──────────────────────────────────────────────
+
+console.log('\n  Freshness:');
+try {
+  // The generator's --dry-run exits non-zero when a generated file is stale,
+  // which makes execSync throw.
+  execSync('bun run scripts/gen-skill-docs.ts --dry-run', { cwd: ROOT, stdio: 'pipe' });
+  console.log('  \u2705 All generated files are fresh');
+} catch (err: unknown) {
+  hasErrors = true;
+  const failure: { stdout?: unknown; stderr?: unknown } =
+    typeof err === 'object' && err !== null ? err : {};
+  const staleLines = String(failure.stdout ?? '')
+    .split('\n')
+    .filter((l: string) => l.startsWith('STALE'));
+
+  if (staleLines.length > 0) {
+    console.log('  \u274c Generated files are stale:');
+    for (const line of staleLines) {
+      console.log(`      ${line}`);
+    }
+    console.log('      Run: bun run gen:skill-docs');
+  } else {
+    // No STALE lines: the generator failed for another reason (for example a
+    // template error), so show its error output instead of blaming staleness.
+    const detail =
+      String(failure.stderr ?? '').trim() ||
+      (err instanceof Error ? err.message : String(err));
+    console.log('  \u274c Freshness check failed to run:');
+    for (const line of detail.split('\n')) {
+      console.log(`      ${line}`);
+    }
+  }
+}
+
+console.log('');
+// Non-zero exit when any check above failed.
+process.exit(hasErrors ? 1 : 0);