mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-29 14:31:28 +02:00
fix: enrich SKILL.md docs to pass LLM evals, upgrade judge to Sonnet 4.6 (#43)
* fix: enrich command descriptions and snapshot flags for LLM eval quality

  14 command descriptions enriched with specific arg formats, valid values, error behavior, and return types. Fixed header usage from <name> <value> to <name>:<value>. Added cookie usage syntax. Snapshot flags now show long names, ref numbering, and output format examples.

* refactor: auto-generate server.ts help text from COMMAND_DESCRIPTIONS

  Replace hand-maintained help block with generateHelpText() that reads from COMMAND_DESCRIPTIONS and SNAPSHOT_FLAGS. Eliminates help text drift from source of truth.

* test: add usage consistency and pipe guard tests

  Usage consistency test cross-checks Usage: patterns in implementation against COMMAND_DESCRIPTIONS using structural skeleton comparison. Pipe guard test ensures descriptions don't contain | which would break markdown table rendering.

* chore: upgrade eval judge to Sonnet 4.6, update changelog

  Switch LLM-as-judge evals from Haiku to Sonnet 4.6 for more stable, nuanced scoring. Add changelog entry for all eval improvements.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -139,6 +139,14 @@ describe('description quality evals', () => {
|
||||
}
|
||||
});
|
||||
|
||||
// Guard: descriptions must not contain pipe (breaks markdown table cells)
|
||||
// Usage strings are backtick-wrapped in the table so pipes there are safe.
|
||||
test('no command description contains pipe character', () => {
  // Gather the names of every command whose description contains '|', so a
  // failure lists all offenders instead of stopping at the first
  // anonymous description.
  const offenders = Object.entries(COMMAND_DESCRIPTIONS)
    .filter(([, meta]) => meta.description.includes('|'))
    .map(([cmd]) => cmd);

  expect(offenders).toEqual([]);
});
|
||||
|
||||
// Guard: generated output uses → not ->
|
||||
test('generated SKILL.md uses unicode arrows', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
* Requires: ANTHROPIC_API_KEY env var
|
||||
* Run: ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts
|
||||
*
|
||||
* Cost: ~$0.01-0.03 per run (haiku)
|
||||
* Cost: ~$0.05-0.15 per run (sonnet)
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
@@ -30,7 +30,7 @@ async function judge(section: string, prompt: string): Promise<JudgeScore> {
|
||||
const client = new Anthropic();
|
||||
|
||||
const response = await client.messages.create({
|
||||
model: 'claude-haiku-4-5-20251001',
|
||||
model: 'claude-sonnet-4-6',
|
||||
max_tokens: 1024,
|
||||
messages: [{
|
||||
role: 'user',
|
||||
@@ -158,7 +158,7 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||
|
||||
const client = new Anthropic();
|
||||
const response = await client.messages.create({
|
||||
model: 'claude-haiku-4-5-20251001',
|
||||
model: 'claude-sonnet-4-6',
|
||||
max_tokens: 1024,
|
||||
messages: [{
|
||||
role: 'user',
|
||||
|
||||
@@ -80,6 +80,59 @@ describe('Command registry consistency', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('Usage string consistency', () => {
|
||||
// Normalize a usage string to its structural skeleton for comparison.
|
||||
// Replaces <param-names> with <>, [optional] with [], strips parenthetical hints.
|
||||
// This catches format mismatches (e.g., <name>:<value> vs <name> <value>)
|
||||
// without tripping on abbreviation differences (e.g., <sel> vs <selector>).
|
||||
// Reduce a usage string to its structural skeleton.
//
// Steps, applied in order:
//   1. drop parenthetical hints, e.g. "(e.g., Enter, Tab)"
//   2. replace every <param-name> with <>
//   3. replace every [optional] group with []
//   4. collapse runs of whitespace to a single space and trim the ends
//
// Angle-bracket params are normalized before square-bracket groups, so a
// param nested in an optional group (e.g. "[--flag <v>]") collapses to "[]".
function skeleton(usage: string): string {
  const withoutHints = usage.replace(/\(.*?\)/g, '');
  const withoutParamNames = withoutHints.replace(/<[^>]*>/g, '<>');
  const withoutOptionalBodies = withoutParamNames.replace(/\[[^\]]*\]/g, '[]');
  return withoutOptionalBodies.replace(/\s+/g, ' ').trim();
}
|
||||
|
||||
// Cross-check Usage: patterns in implementation against COMMAND_DESCRIPTIONS
|
||||
test('implementation Usage: structural format matches COMMAND_DESCRIPTIONS', () => {
  // Implementation files scanned for "Usage: browse ..." error messages.
  const sourceFiles = [
    path.join(ROOT, 'browse', 'src', 'write-commands.ts'),
    path.join(ROOT, 'browse', 'src', 'read-commands.ts'),
    path.join(ROOT, 'browse', 'src', 'meta-commands.ts'),
  ];

  // Captures the text after "Usage: browse " inside throw new Error(...) calls.
  const usagePattern = /throw new Error\(['"`]Usage:\s*browse\s+(.+?)['"`]\)/g;

  // command name -> usage string found in the implementation.
  // A later occurrence of the same command overwrites an earlier one.
  const implUsages = new Map<string, string>();
  for (const sourceFile of sourceFiles) {
    const text = fs.readFileSync(sourceFile, 'utf-8');
    for (const hit of text.matchAll(usagePattern)) {
      // Keep only the part before a literal "\n" escape in the source text.
      const [firstLine] = hit[1].split('\\n');
      const usage = firstLine.trim();
      const [cmd] = usage.split(/\s/);
      implUsages.set(cmd, usage);
    }
  }

  // Compare the structural skeleton of each implementation usage against the
  // documented one; commands without a documented usage are skipped.
  const mismatches: string[] = [];
  implUsages.forEach((implUsage, cmd) => {
    const desc = COMMAND_DESCRIPTIONS[cmd];
    if (!desc || !desc.usage) return;

    const descSkel = skeleton(desc.usage);
    const implSkel = skeleton(implUsage);
    if (descSkel === implSkel) return;

    mismatches.push(`${cmd}: docs "${desc.usage}" (${descSkel}) vs impl "${implUsage}" (${implSkel})`);
  });

  expect(mismatches).toEqual([]);
});
|
||||
});
|
||||
|
||||
describe('Generated SKILL.md freshness', () => {
|
||||
test('no unresolved {{placeholders}} in generated SKILL.md', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
|
||||
Reference in New Issue
Block a user