diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bdd600c..dd179e2f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,21 @@ # Changelog +## Unreleased — 2026-03-14 + +### Changed +- Enriched 14 command descriptions with specific arg formats, valid values, error behavior, and return types +- Fixed `header` usage from `<name> <value>` to `<name>:<value>` (matching actual implementation) +- Added `cookie` usage syntax: `cookie <name>=<value>` +- Enriched 4 snapshot flag descriptions with defaults, output paths, and behavior details +- Snapshot flags section now shows long flag names (`-i / --interactive`) alongside short +- Added ref numbering explanation and output format example to snapshot docs +- Replaced hand-maintained server.ts help text with auto-generated `generateHelpText()` from COMMAND_DESCRIPTIONS +- Upgraded LLM eval judge from Haiku to Sonnet 4.6 for more stable scoring + +### Added +- Usage string consistency test: cross-checks `Usage:` patterns in implementation against COMMAND_DESCRIPTIONS +- Pipe guard test: ensures no command description contains `|` (would break markdown tables) + ## 0.3.3 — 2026-03-13 ### Added diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts index 308de814..f978f035 100644 --- a/test/skill-llm-eval.test.ts +++ b/test/skill-llm-eval.test.ts @@ -7,7 +7,7 @@ * Requires: ANTHROPIC_API_KEY env var * Run: ANTHROPIC_API_KEY=sk-... 
bun test test/skill-llm-eval.test.ts * - * Cost: ~$0.01-0.03 per run (haiku) + * Cost: ~$0.05-0.15 per run (sonnet) */ import { describe, test, expect } from 'bun:test'; @@ -30,7 +30,7 @@ async function judge(section: string, prompt: string): Promise { const client = new Anthropic(); const response = await client.messages.create({ - model: 'claude-haiku-4-5-20251001', + model: 'claude-sonnet-4-6', max_tokens: 1024, messages: [{ role: 'user', @@ -158,7 +158,7 @@ describeEval('LLM-as-judge quality evals', () => { const client = new Anthropic(); const response = await client.messages.create({ - model: 'claude-haiku-4-5-20251001', + model: 'claude-sonnet-4-6', max_tokens: 1024, messages: [{ role: 'user',