mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
a468374272
* fix: enrich command descriptions and snapshot flags for LLM eval quality

  14 command descriptions enriched with specific arg formats, valid values, error behavior, and return types. Fixed header usage from <name> <value> to <name>:<value>. Added cookie usage syntax. Snapshot flags now show long names, ref numbering, and output format examples.

* refactor: auto-generate server.ts help text from COMMAND_DESCRIPTIONS

  Replace hand-maintained help block with generateHelpText() that reads from COMMAND_DESCRIPTIONS and SNAPSHOT_FLAGS. Eliminates help text drift from source of truth.

* test: add usage consistency and pipe guard tests

  Usage consistency test cross-checks Usage: patterns in implementation against COMMAND_DESCRIPTIONS using structural skeleton comparison. Pipe guard test ensures descriptions don't contain | which would break markdown table rendering.

* chore: upgrade eval judge to Sonnet 4.6, update changelog

  Switch LLM-as-judge evals from Haiku to Sonnet 4.6 for more stable, nuanced scoring. Add changelog entry for all eval improvements.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
154 lines
5.9 KiB
TypeScript
154 lines
5.9 KiB
TypeScript
import { describe, test, expect } from 'bun:test';
|
|
import { validateSkill } from './helpers/skill-parser';
|
|
import { ALL_COMMANDS, COMMAND_DESCRIPTIONS, READ_COMMANDS, WRITE_COMMANDS, META_COMMANDS } from '../browse/src/commands';
|
|
import { SNAPSHOT_FLAGS } from '../browse/src/snapshot';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
|
|
// Directory one level above the directory containing this test file
// (`import.meta.dir` is Bun's absolute path to the current file's directory).
// Every SKILL.md and browse/src path used by the tests is resolved against it.
const ROOT = path.resolve(import.meta.dir, '..');
|
|
|
|
// Runs validateSkill() over each SKILL.md and checks the fields it reports:
//   - `invalid` must be empty (no unknown `$B` commands),
//   - `snapshotFlagErrors` must be empty (no unknown snapshot flags),
//   - for the root and browse files, `valid` must be non-empty so that the
//     check cannot pass simply because nothing was parsed.
// The qa/SKILL.md checks are skipped (the test passes) when that file is absent.
describe('SKILL.md command validation', () => {
  const rootSkill = path.join(ROOT, 'SKILL.md');
  const browseSkill = path.join(ROOT, 'browse', 'SKILL.md');
  const qaSkill = path.join(ROOT, 'qa', 'SKILL.md');

  test('all $B commands in SKILL.md are valid browse commands', () => {
    const { invalid, valid } = validateSkill(rootSkill);
    expect(invalid).toHaveLength(0);
    expect(valid.length).toBeGreaterThan(0);
  });

  test('all snapshot flags in SKILL.md are valid', () => {
    const { snapshotFlagErrors } = validateSkill(rootSkill);
    expect(snapshotFlagErrors).toHaveLength(0);
  });

  test('all $B commands in browse/SKILL.md are valid browse commands', () => {
    const { invalid, valid } = validateSkill(browseSkill);
    expect(invalid).toHaveLength(0);
    expect(valid.length).toBeGreaterThan(0);
  });

  test('all snapshot flags in browse/SKILL.md are valid', () => {
    const { snapshotFlagErrors } = validateSkill(browseSkill);
    expect(snapshotFlagErrors).toHaveLength(0);
  });

  test('all $B commands in qa/SKILL.md are valid browse commands', () => {
    // Only validate when the file exists; a missing qa/SKILL.md is not a failure.
    if (fs.existsSync(qaSkill)) {
      const { invalid } = validateSkill(qaSkill);
      expect(invalid).toHaveLength(0);
    }
  });

  test('all snapshot flags in qa/SKILL.md are valid', () => {
    // Only validate when the file exists; a missing qa/SKILL.md is not a failure.
    if (fs.existsSync(qaSkill)) {
      const { snapshotFlagErrors } = validateSkill(qaSkill);
      expect(snapshotFlagErrors).toHaveLength(0);
    }
  });
});
|
|
|
|
// Cross-checks the exported command registries against each other:
//   - every command in READ/WRITE/META_COMMANDS has a COMMAND_DESCRIPTIONS key,
//   - every COMMAND_DESCRIPTIONS key belongs to one of those sets,
//   - ALL_COMMANDS contains exactly the union of the three sets,
//   - every SNAPSHOT_FLAGS entry's `optionKey` is one of the known option names.
//
// Each check gathers the offending names into an array and asserts that the
// array equals []. On failure the runner therefore prints every offending name
// instead of an uninformative "expected true, received false".
describe('Command registry consistency', () => {
  test('COMMAND_DESCRIPTIONS covers all commands in sets', () => {
    const allCmds = new Set([...READ_COMMANDS, ...WRITE_COMMANDS, ...META_COMMANDS]);
    const descKeys = new Set(Object.keys(COMMAND_DESCRIPTIONS));

    const undocumented = [...allCmds].filter((cmd) => !descKeys.has(cmd));
    expect(undocumented).toEqual([]);
  });

  test('COMMAND_DESCRIPTIONS has no extra commands not in sets', () => {
    const allCmds = new Set([...READ_COMMANDS, ...WRITE_COMMANDS, ...META_COMMANDS]);

    const unregistered = Object.keys(COMMAND_DESCRIPTIONS).filter((key) => !allCmds.has(key));
    expect(unregistered).toEqual([]);
  });

  test('ALL_COMMANDS matches union of all sets', () => {
    const union = new Set([...READ_COMMANDS, ...WRITE_COMMANDS, ...META_COMMANDS]);

    // Report missing names first: it is the more informative failure.
    const missing = [...union].filter((cmd) => !ALL_COMMANDS.has(cmd));
    expect(missing).toEqual([]);

    // With nothing missing, equal sizes rule out extra entries in ALL_COMMANDS.
    expect(ALL_COMMANDS.size).toBe(union.size);
  });

  test('SNAPSHOT_FLAGS option keys are valid SnapshotOptions fields', () => {
    const validKeys = new Set([
      'interactive', 'compact', 'depth', 'selector',
      'diff', 'annotate', 'outputPath', 'cursorInteractive',
    ]);

    const unknownKeys = Array.from(SNAPSHOT_FLAGS, (flag) => flag.optionKey)
      .filter((key) => !validKeys.has(key));
    expect(unknownKeys).toEqual([]);
  });
});
|
|
|
|
describe('Usage string consistency', () => {
|
|
// Reduce a usage string to its structural skeleton so two usages can be
// compared by shape rather than by wording:
//   1. parenthetical hints such as "(e.g., Enter, Tab)" are removed,
//   2. every <param-name> becomes <>,
//   3. every [optional] group becomes [],
//   4. whitespace runs collapse to one space and the ends are trimmed.
// A format difference (<name>:<value> vs <name> <value>) survives the
// reduction; a naming difference (<sel> vs <selector>) does not.
function skeleton(usage: string): string {
  // Applied in order: hints must be gone before params/optionals are normalized.
  const rewrites: Array<[RegExp, string]> = [
    [/\(.*?\)/g, ''],
    [/<[^>]*>/g, '<>'],
    [/\[[^\]]*\]/g, '[]'],
    [/\s+/g, ' '],
  ];

  let shape = usage;
  for (const [pattern, replacement] of rewrites) {
    shape = shape.replace(pattern, replacement);
  }
  return shape.trim();
}
|
|
|
|
// Scan the command implementation files for `throw new Error('Usage: browse …')`
// and require each usage found there to have the same structural skeleton as
// the `usage` recorded for that command in COMMAND_DESCRIPTIONS.
test('implementation Usage: structural format matches COMMAND_DESCRIPTIONS', () => {
  const sourceFiles = ['write-commands.ts', 'read-commands.ts', 'meta-commands.ts']
    .map((fileName) => path.join(ROOT, 'browse', 'src', fileName));

  // Captures the text after "Usage: browse " up to the closing quote + ")".
  const usageThrow = /throw new Error\(['"`]Usage:\s*browse\s+(.+?)['"`]\)/g;

  // command name -> usage text; a later occurrence of a command replaces an earlier one.
  const usageByCommand = new Map<string, string>();
  for (const file of sourceFiles) {
    const source = fs.readFileSync(file, 'utf-8');
    for (const hit of source.matchAll(usageThrow)) {
      // Keep only the part before an escaped "\n" written in the source literal.
      const usage = hit[1].split('\\n')[0].trim();
      const command = usage.split(/\s/)[0];
      usageByCommand.set(command, usage);
    }
  }

  // One message per command whose documented and implemented skeletons differ.
  // Commands without a description, or without a documented usage, are skipped.
  const mismatches: string[] = [...usageByCommand].flatMap(([cmd, implUsage]) => {
    const documented = COMMAND_DESCRIPTIONS[cmd]?.usage;
    if (!documented) return [];

    const descSkel = skeleton(documented);
    const implSkel = skeleton(implUsage);
    return descSkel === implSkel
      ? []
      : [`${cmd}: docs "${documented}" (${descSkel}) vs impl "${implUsage}" (${implSkel})`];
  });

  expect(mismatches).toEqual([]);
});
|
|
});
|
|
|
|
// Sanity checks on the generated SKILL.md files: no `{{word}}` template
// placeholder may remain in the root or browse copy, and the root copy must
// contain the text "AUTO-GENERATED" somewhere in its content.
describe('Generated SKILL.md freshness', () => {
  // Reads a file relative to ROOT as UTF-8 text.
  const readFromRoot = (...segments: string[]) =>
    fs.readFileSync(path.join(ROOT, ...segments), 'utf-8');

  // All `{{word}}` occurrences in the text, or null when there are none.
  const findPlaceholders = (text: string) => text.match(/\{\{\w+\}\}/g);

  test('no unresolved {{placeholders}} in generated SKILL.md', () => {
    expect(findPlaceholders(readFromRoot('SKILL.md'))).toBeNull();
  });

  test('no unresolved {{placeholders}} in generated browse/SKILL.md', () => {
    expect(findPlaceholders(readFromRoot('browse', 'SKILL.md'))).toBeNull();
  });

  test('generated SKILL.md has AUTO-GENERATED header', () => {
    expect(readFromRoot('SKILL.md')).toContain('AUTO-GENERATED');
  });
});
|