From 7c1d31a25eda48b2d0c97ad1be3e47cee1d37579 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 17 Mar 2026 20:42:35 -0700 Subject: [PATCH] test: LLM-judge for 10 skills + gstack-upgrade E2E Add LLM-judge quality evals for all uncovered skills using a DRY runWorkflowJudge helper with section marker guards. Add real E2E test for gstack-upgrade using mock git remote (replaces test.todo). Add plan-edit assertion to plan-design-review E2E. 14/15 skills now at full coverage. setup-browser-cookies remains deferred (needs real browser). Co-Authored-By: Claude Opus 4.6 (1M context) --- test/skill-e2e.test.ts | 136 +++++++++++++++++++++++- test/skill-llm-eval.test.ts | 204 ++++++++++++++++++++++++++++++++++++ 2 files changed, 335 insertions(+), 5 deletions(-) diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts index 1d04835c..13539278 100644 --- a/test/skill-e2e.test.ts +++ b/test/skill-e2e.test.ts @@ -1876,8 +1876,120 @@ describeE2E('Deferred skill E2E', () => { // Setup-browser-cookies requires interactive browser picker UI test.todo('/setup-browser-cookies imports cookies'); - // Gstack-upgrade is destructive: modifies skill installation directory - test.todo('/gstack-upgrade completes upgrade flow'); +}); + +// --- gstack-upgrade E2E --- + +describeIfSelected('gstack-upgrade E2E', ['gstack-upgrade-happy-path'], () => { + let upgradeDir: string; + let remoteDir: string; + + beforeAll(() => { + upgradeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-upgrade-')); + remoteDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-remote-')); + + const run = (cmd: string, args: string[], cwd: string) => + spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 }); + + // Init the "project" repo + run('git', ['init'], upgradeDir); + run('git', ['config', 'user.email', 'test@test.com'], upgradeDir); + run('git', ['config', 'user.name', 'Test'], upgradeDir); + + // Create mock gstack install directory (local-git type) + const mockGstack = path.join(upgradeDir, '.claude', 'skills', 'gstack'); + fs.mkdirSync(mockGstack, { recursive: true }); + + // Init as a git repo + run('git', ['init'], mockGstack); + run('git', ['config', 'user.email', 'test@test.com'], mockGstack); + run('git', ['config', 'user.name', 'Test'], mockGstack); + + // Create bare remote + run('git', ['init', '--bare'], remoteDir); + run('git', ['remote', 'add', 'origin', remoteDir], mockGstack); + + // Write old version files + fs.writeFileSync(path.join(mockGstack, 'VERSION'), '0.5.0\n'); + fs.writeFileSync(path.join(mockGstack, 'CHANGELOG.md'), + '# Changelog\n\n## 0.5.0 — 2026-03-01\n\n- Initial release\n'); + fs.writeFileSync(path.join(mockGstack, 'setup'), + '#!/bin/bash\necho "Setup completed"\n', { mode: 0o755 }); + + // Initial commit + push + run('git', ['add', '.'], mockGstack); + run('git', ['commit', '-m', 'initial'], mockGstack); + run('git', ['push', '-u', 'origin', 'HEAD:main'], mockGstack); + + // Create new version (simulate upstream release) + fs.writeFileSync(path.join(mockGstack, 'VERSION'), '0.6.0\n'); + fs.writeFileSync(path.join(mockGstack, 'CHANGELOG.md'), + '# Changelog\n\n## 0.6.0 — 2026-03-15\n\n- New feature: interactive design review\n- Fix: snapshot flag validation\n\n## 0.5.0 — 2026-03-01\n\n- Initial release\n'); + run('git', ['add', '.'], mockGstack); + run('git', ['commit', '-m', 'release 0.6.0'], mockGstack); + run('git', ['push', 'origin', 'HEAD:main'], mockGstack); + + // Reset working copy back to old version + run('git', ['reset', '--hard', 'HEAD~1'], mockGstack); + + // Copy gstack-upgrade skill + fs.mkdirSync(path.join(upgradeDir, 'gstack-upgrade'), { recursive: true }); + fs.copyFileSync( + path.join(ROOT, 'gstack-upgrade', 'SKILL.md'), + path.join(upgradeDir, 'gstack-upgrade', 'SKILL.md'), + ); + + // Commit so git repo is clean + run('git', ['add', '.'], upgradeDir); + run('git', ['commit', '-m', 'initial project'], upgradeDir); + }); + + afterAll(() => { + try { fs.rmSync(upgradeDir, { recursive: true, force: true }); } catch {} + try { fs.rmSync(remoteDir, { recursive: true, force: true }); } catch {} + }); + + testIfSelected('gstack-upgrade-happy-path', async () => { + const mockGstack = path.join(upgradeDir, '.claude', 'skills', 'gstack'); + const result = await runSkillTest({ + prompt: `Read gstack-upgrade/SKILL.md for the upgrade workflow. + +You are running /gstack-upgrade standalone. The gstack installation is at ./.claude/skills/gstack (local-git type — it has a .git directory with an origin remote). + +Current version: 0.5.0. A new version 0.6.0 is available on origin/main. + +Follow the standalone upgrade flow: +1. Detect install type (local-git) +2. Run git fetch origin && git reset --hard origin/main in the install directory +3. Run the setup script +4. Show what's new from CHANGELOG + +Skip any AskUserQuestion calls — auto-approve the upgrade. Write a summary of what you did to stdout. + +IMPORTANT: The install directory is at ./.claude/skills/gstack — use that exact path.`, + workingDirectory: upgradeDir, + maxTurns: 20, + timeout: 180_000, + testName: 'gstack-upgrade-happy-path', + runId, + }); + + logCost('/gstack-upgrade happy path', result); + + // Check that the version was updated + const versionAfter = fs.readFileSync(path.join(mockGstack, 'VERSION'), 'utf-8').trim(); + const output = result.output || ''; + const mentionsUpgrade = output.toLowerCase().includes('0.6.0') || + output.toLowerCase().includes('upgrade') || + output.toLowerCase().includes('updated'); + + recordE2E('/gstack-upgrade happy path', 'gstack-upgrade E2E', result, { + passed: versionAfter === '0.6.0' && ['success', 'error_max_turns'].includes(result.exitReason), + }); + + expect(['success', 'error_max_turns']).toContain(result.exitReason); + expect(versionAfter).toBe('0.6.0'); + }, 240_000); }); // --- Design Consultation E2E --- @@ -2235,9 +2347,9 @@ Build a user dashboard that shows account stats, recent activity, and settings. Review the plan in ./plan.md. This plan has several design gaps — it uses vague language like "clean, modern UI" and "cards and icons", mentions a "hero section with gradient" (AI slop), and doesn't specify empty states, error states, loading states, responsive behavior, or accessibility. -Skip the preamble bash block. Skip any AskUserQuestion calls — this is non-interactive. Rate each design dimension 0-10 and explain what would make it a 10. Write your findings and ratings directly to stdout. +Skip the preamble bash block. Skip any AskUserQuestion calls — this is non-interactive. Rate each design dimension 0-10 and explain what would make it a 10. Then EDIT plan.md to add the missing design decisions (interaction state table, empty states, responsive behavior, etc.). -IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan review, not a live site audit. Just read the plan file and review it.`, +IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan review, not a live site audit. Just read the plan file, review it, and edit it to fix the gaps.`, workingDirectory: reviewDir, maxTurns: 15, timeout: 300_000, @@ -2255,13 +2367,27 @@ IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan output.toLowerCase().includes('ai slop') || output.toLowerCase().includes('hierarchy'); + // Check that the plan file was edited (the core new behavior) + const planAfter = fs.readFileSync(path.join(reviewDir, 'plan.md'), 'utf-8'); + const planOriginal = `# Plan: User Dashboard`; + const planWasEdited = planAfter.length > 300; // Original is ~450 chars, edited should be much longer + const planHasDesignAdditions = planAfter.toLowerCase().includes('empty') || + planAfter.toLowerCase().includes('loading') || + planAfter.toLowerCase().includes('error') || + planAfter.toLowerCase().includes('state') || + planAfter.toLowerCase().includes('responsive') || + planAfter.toLowerCase().includes('accessibility'); + recordE2E('/plan-design-review plan-mode', 'Plan Design Review E2E', result, { - passed: hasDesignContent && ['success', 'error_max_turns'].includes(result.exitReason), + passed: hasDesignContent && planWasEdited && ['success', 'error_max_turns'].includes(result.exitReason), }); expect(['success', 'error_max_turns']).toContain(result.exitReason); // Agent should produce design-relevant output about the plan expect(hasDesignContent).toBe(true); + // Agent should have edited the plan file to add missing design decisions + expect(planWasEdited).toBe(true); + expect(planHasDesignAdditions).toBe(true); }, 360_000); testIfSelected('plan-design-review-no-ui-scope', async () => { diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts index c3e1aef2..528d5115 100644 --- a/test/skill-llm-eval.test.ts +++ b/test/skill-llm-eval.test.ts @@ -464,6 +464,210 @@ describeIfSelected('Baseline score pinning', ['baseline score pinning'], () => { }, 60_000); }); +// --- Workflow SKILL.md quality evals (10 new tests for 100% coverage) --- + +/** + * DRY helper for workflow SKILL.md judge tests. + * Extracts a section from a SKILL.md file and judges its quality as an agent workflow. + */ +async function runWorkflowJudge(opts: { + testName: string; + suite: string; + skillPath: string; + startMarker: string; + endMarker: string | null; + judgeContext: string; + judgeGoal: string; + thresholds?: { clarity: number; completeness: number; actionability: number }; +}) { + const t0 = Date.now(); + const defaults = { clarity: 4, completeness: 3, actionability: 4 }; + const thresholds = { ...defaults, ...opts.thresholds }; + + const content = fs.readFileSync(path.join(ROOT, opts.skillPath), 'utf-8'); + const startIdx = content.indexOf(opts.startMarker); + if (startIdx === -1) throw new Error(`Start marker not found in ${opts.skillPath}: "${opts.startMarker}"`); + + let section: string; + if (opts.endMarker) { + const endIdx = content.indexOf(opts.endMarker, startIdx); + if (endIdx === -1) throw new Error(`End marker not found in ${opts.skillPath}: "${opts.endMarker}"`); + section = content.slice(startIdx, endIdx); + } else { + section = content.slice(startIdx); + } + + const scores = await callJudge(`You are evaluating the quality of ${opts.judgeContext} for an AI coding agent. + +The agent reads this document to learn ${opts.judgeGoal}. It references external tools and files +that are documented separately — do NOT penalize for missing external definitions. + +Rate on three dimensions (1-5 scale): +- **clarity** (1-5): Can an agent follow the instructions without ambiguity? +- **completeness** (1-5): Are all steps, decision points, and outputs well-defined? +- **actionability** (1-5): Can an agent execute this workflow and produce the expected deliverables? + +Respond with ONLY valid JSON: +{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"} + +Here is the document to evaluate: + +${section}`); + + console.log(`${opts.testName} scores:`, JSON.stringify(scores, null, 2)); + + evalCollector?.addTest({ + name: opts.testName, + suite: opts.suite, + tier: 'llm-judge', + passed: scores.clarity >= thresholds.clarity && scores.completeness >= thresholds.completeness && scores.actionability >= thresholds.actionability, + duration_ms: Date.now() - t0, + cost_usd: 0.02, + judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, + judge_reasoning: scores.reasoning, + }); + + expect(scores.clarity).toBeGreaterThanOrEqual(thresholds.clarity); + expect(scores.completeness).toBeGreaterThanOrEqual(thresholds.completeness); + expect(scores.actionability).toBeGreaterThanOrEqual(thresholds.actionability); +} + +// Block 1: Ship & Release skills +describeIfSelected('Ship & Release skill evals', ['ship/SKILL.md workflow', 'document-release/SKILL.md workflow'], () => { + testIfSelected('ship/SKILL.md workflow', async () => { + await runWorkflowJudge({ + testName: 'ship/SKILL.md workflow', + suite: 'Ship & Release skill evals', + skillPath: 'ship/SKILL.md', + startMarker: '# Ship:', + endMarker: '## Important Rules', + judgeContext: 'a ship/release workflow document', + judgeGoal: 'how to create a PR: merge base branch, run tests, review diff, bump version, update changelog, push, and open PR', + }); + }, 30_000); + + testIfSelected('document-release/SKILL.md workflow', async () => { + await runWorkflowJudge({ + testName: 'document-release/SKILL.md workflow', + suite: 'Ship & Release skill evals', + skillPath: 'document-release/SKILL.md', + startMarker: '# Document Release:', + endMarker: '## Important Rules', + judgeContext: 'a post-ship documentation update workflow', + judgeGoal: 'how to audit and update project documentation after code ships: README, ARCHITECTURE, CONTRIBUTING, CLAUDE.md, CHANGELOG, TODOS', + }); + }, 30_000); +}); + +// Block 2: Plan Review skills +describeIfSelected('Plan Review skill evals', [ + 'plan-ceo-review/SKILL.md modes', 'plan-eng-review/SKILL.md sections', 'plan-design-review/SKILL.md passes', +], () => { + testIfSelected('plan-ceo-review/SKILL.md modes', async () => { + await runWorkflowJudge({ + testName: 'plan-ceo-review/SKILL.md modes', + suite: 'Plan Review skill evals', + skillPath: 'plan-ceo-review/SKILL.md', + startMarker: '## Step 0: Nuclear Scope Challenge', + endMarker: '## Review Sections', + judgeContext: 'a CEO/founder plan review framework with 4 scope modes', + judgeGoal: 'how to conduct a CEO-perspective plan review: challenge scope, select a mode (Expansion, Selective Expansion, Hold Scope, Reduction), then review sections interactively', + }); + }, 30_000); + + testIfSelected('plan-eng-review/SKILL.md sections', async () => { + await runWorkflowJudge({ + testName: 'plan-eng-review/SKILL.md sections', + suite: 'Plan Review skill evals', + skillPath: 'plan-eng-review/SKILL.md', + startMarker: '## BEFORE YOU START:', + endMarker: '## CRITICAL RULE', + judgeContext: 'an engineering plan review framework with 4 review sections', + judgeGoal: 'how to review a plan for architecture quality, code quality, test coverage, and performance — walking through each section interactively with AskUserQuestion', + }); + }, 30_000); + + testIfSelected('plan-design-review/SKILL.md passes', async () => { + await runWorkflowJudge({ + testName: 'plan-design-review/SKILL.md passes', + suite: 'Plan Review skill evals', + skillPath: 'plan-design-review/SKILL.md', + startMarker: '## Review Sections', + endMarker: '## CRITICAL RULE', + judgeContext: 'a design plan review framework with 7 review passes', + judgeGoal: 'how to review a plan for design quality using a 0-10 rating method: rate each dimension, explain what a 10 looks like, edit the plan to fix gaps, then re-rate', + }); + }, 30_000); +}); + +// Block 3: Design skills +describeIfSelected('Design skill evals', ['design-review/SKILL.md fix loop', 'design-consultation/SKILL.md research'], () => { + testIfSelected('design-review/SKILL.md fix loop', async () => { + await runWorkflowJudge({ + testName: 'design-review/SKILL.md fix loop', + suite: 'Design skill evals', + skillPath: 'design-review/SKILL.md', + startMarker: '## Phase 7:', + endMarker: '## Additional Rules', + judgeContext: 'a design audit triage and fix loop workflow', + judgeGoal: 'how to triage design issues by severity, fix them atomically in source code, commit each fix, and re-verify with before/after screenshots', + }); + }, 30_000); + + testIfSelected('design-consultation/SKILL.md research', async () => { + await runWorkflowJudge({ + testName: 'design-consultation/SKILL.md research', + suite: 'Design skill evals', + skillPath: 'design-consultation/SKILL.md', + startMarker: '## Phase 1:', + endMarker: '## Phase 4:', + judgeContext: 'a design consultation research and proposal workflow', + judgeGoal: 'how to gather product context, research the competitive landscape, and produce a complete design system proposal with typography, color, spacing, and motion specifications', + }); + }, 30_000); +}); + +// Block 4: Other skills +describeIfSelected('Other skill evals', [ + 'retro/SKILL.md instructions', 'qa-only/SKILL.md workflow', 'gstack-upgrade/SKILL.md upgrade flow', +], () => { + testIfSelected('retro/SKILL.md instructions', async () => { + await runWorkflowJudge({ + testName: 'retro/SKILL.md instructions', + suite: 'Other skill evals', + skillPath: 'retro/SKILL.md', + startMarker: '## Instructions', + endMarker: '## Compare Mode', + judgeContext: 'an engineering retrospective data gathering and analysis workflow', + judgeGoal: 'how to gather git metrics (commit history, test counts, work patterns), analyze them, produce a structured retro report with praise, growth areas, and trend tracking', + }); + }, 30_000); + + testIfSelected('qa-only/SKILL.md workflow', async () => { + await runWorkflowJudge({ + testName: 'qa-only/SKILL.md workflow', + suite: 'Other skill evals', + skillPath: 'qa-only/SKILL.md', + startMarker: '## Workflow', + endMarker: '## Important Rules', + judgeContext: 'a report-only QA testing workflow', + judgeGoal: 'how to systematically QA test a web application and produce a structured report with health score, screenshots, and repro steps — without fixing anything', + }); + }, 30_000); + + testIfSelected('gstack-upgrade/SKILL.md upgrade flow', async () => { + await runWorkflowJudge({ + testName: 'gstack-upgrade/SKILL.md upgrade flow', + suite: 'Other skill evals', + skillPath: 'gstack-upgrade/SKILL.md', + startMarker: '## Inline upgrade flow', + endMarker: '## Standalone usage', + judgeContext: 'a version upgrade detection and execution workflow', + judgeGoal: 'how to detect install type, compare versions, back up current install, upgrade via git or fresh clone, run setup, and show what changed', + }); + }, 30_000); +}); + // Module-level afterAll — finalize eval collector after all tests complete afterAll(async () => { if (evalCollector) {