feat: interactive /plan-design-review + CEO invokes designer + 100% coverage (v0.6.4) (#149)

* refactor: rename qa-design-review → design-review

The "qa-" prefix was confusing — this is the live-site design audit with
fix loop, not a QA-only report. Rename directory and update all references
across docs, tests, scripts, and skill templates.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat: interactive /plan-design-review + CEO invokes designer

Rewrite /plan-design-review from report-only grading to an interactive
plan-fixer that rates each design dimension 0-10, explains what a 10
looks like, and edits the plan to get there. Parallel structure with
/plan-ceo-review and /plan-eng-review — one issue = one AskUserQuestion.

CEO review now detects UI scope and invokes the designer perspective
when the plan has frontend/UX work, so you get design review
automatically when it matters.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* test: validation + touchfile entries for 100% coverage

Add design-consultation to command/snapshot flag validation. Add 4
skills to contributor mode validation (plan-design-review,
design-review, design-consultation, document-release). Add 2 templates
to hardcoded branch check. Register touchfile entries for 10 new
LLM-judge tests and 1 new E2E test.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* test: LLM-judge for 10 skills + gstack-upgrade E2E

Add LLM-judge quality evals for all uncovered skills using a DRY
runWorkflowJudge helper with section marker guards. Add real E2E
test for gstack-upgrade using mock git remote (replaces test.todo).
Add plan-edit assertion to plan-design-review E2E.
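The mock-remote setup can be approximated like this (an illustrative
assumption, not the repo's actual harness): a bare repo on disk stands in
for upstream, so the upgrade flow can fetch and compare tags without
touching the network.

```shell
set -e
base=$(mktemp -d)
git init -q --bare "$base/remote.git"        # stands in for the real upstream
git clone -q "$base/remote.git" "$base/work"
cd "$base/work"
git -c user.email=dev@example.com -c user.name=dev \
  commit -q --allow-empty -m "release v0.6.3"
git tag v0.6.3
git push -q origin HEAD --tags               # publish branch + tag to the mock remote
git ls-remote --tags origin                  # upgrade code can read versions from here
```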

14/15 skills now at full coverage. setup-browser-cookies remains
deferred (needs real browser).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* docs: add bisect commit style to CLAUDE.md

All commits should be single logical changes, split before pushing.
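A bisect-friendly history can be sketched like this (illustrative only —
real splitting uses `git add -p` to stage one logical change at a time;
empty commits in a throwaway repo stand in for staged hunks here):

```shell
set -e
tmp=$(mktemp -d)
cd "$tmp"
git init -q
# Each commit is one logical change, so `git bisect` can isolate it later.
git -c user.email=dev@example.com -c user.name=dev \
  commit -q --allow-empty -m "refactor: rename qa-design-review to design-review"
git -c user.email=dev@example.com -c user.name=dev \
  commit -q --allow-empty -m "feat: interactive /plan-design-review"
git log --oneline | wc -l
```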

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* chore: bump version and changelog (v0.6.4.0)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Authored by Garry Tan on 2026-03-17 22:48:48 -05:00; committed by GitHub.
parent f91222f5bd
commit 78c207efb4
24 changed files with 1120 additions and 765 deletions
@@ -464,6 +464,210 @@ describeIfSelected('Baseline score pinning', ['baseline score pinning'], () => {
}, 60_000);
});
// --- Workflow SKILL.md quality evals (10 new tests for 100% coverage) ---
/**
* DRY helper for workflow SKILL.md judge tests.
* Extracts a section from a SKILL.md file and judges its quality as an agent workflow.
*/
async function runWorkflowJudge(opts: {
testName: string;
suite: string;
skillPath: string;
startMarker: string;
endMarker: string | null;
judgeContext: string;
judgeGoal: string;
  thresholds?: { clarity?: number; completeness?: number; actionability?: number };
}) {
const t0 = Date.now();
const defaults = { clarity: 4, completeness: 3, actionability: 4 };
const thresholds = { ...defaults, ...opts.thresholds };
const content = fs.readFileSync(path.join(ROOT, opts.skillPath), 'utf-8');
const startIdx = content.indexOf(opts.startMarker);
if (startIdx === -1) throw new Error(`Start marker not found in ${opts.skillPath}: "${opts.startMarker}"`);
let section: string;
if (opts.endMarker) {
const endIdx = content.indexOf(opts.endMarker, startIdx);
if (endIdx === -1) throw new Error(`End marker not found in ${opts.skillPath}: "${opts.endMarker}"`);
section = content.slice(startIdx, endIdx);
} else {
section = content.slice(startIdx);
}
const scores = await callJudge<JudgeScore>(`You are evaluating the quality of ${opts.judgeContext} for an AI coding agent.
The agent reads this document to learn ${opts.judgeGoal}. It references external tools and files
that are documented separately — do NOT penalize for missing external definitions.
Rate on three dimensions (1-5 scale):
- **clarity** (1-5): Can an agent follow the instructions without ambiguity?
- **completeness** (1-5): Are all steps, decision points, and outputs well-defined?
- **actionability** (1-5): Can an agent execute this workflow and produce the expected deliverables?
Respond with ONLY valid JSON:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}
Here is the document to evaluate:
${section}`);
console.log(`${opts.testName} scores:`, JSON.stringify(scores, null, 2));
evalCollector?.addTest({
name: opts.testName,
suite: opts.suite,
tier: 'llm-judge',
passed: scores.clarity >= thresholds.clarity && scores.completeness >= thresholds.completeness && scores.actionability >= thresholds.actionability,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
judge_reasoning: scores.reasoning,
});
expect(scores.clarity).toBeGreaterThanOrEqual(thresholds.clarity);
expect(scores.completeness).toBeGreaterThanOrEqual(thresholds.completeness);
expect(scores.actionability).toBeGreaterThanOrEqual(thresholds.actionability);
}
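/**
 * Shape note (a sketch inferred from the judge prompt and the field accesses
 * above; the actual JudgeScore declaration lives elsewhere in this file):
 *   { clarity: number; completeness: number; actionability: number; reasoning: string }
 * where each score uses the judge's 1-5 scale.
 */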
// Block 1: Ship & Release skills
describeIfSelected('Ship & Release skill evals', ['ship/SKILL.md workflow', 'document-release/SKILL.md workflow'], () => {
testIfSelected('ship/SKILL.md workflow', async () => {
await runWorkflowJudge({
testName: 'ship/SKILL.md workflow',
suite: 'Ship & Release skill evals',
skillPath: 'ship/SKILL.md',
startMarker: '# Ship:',
endMarker: '## Important Rules',
judgeContext: 'a ship/release workflow document',
judgeGoal: 'how to create a PR: merge base branch, run tests, review diff, bump version, update changelog, push, and open PR',
});
}, 30_000);
testIfSelected('document-release/SKILL.md workflow', async () => {
await runWorkflowJudge({
testName: 'document-release/SKILL.md workflow',
suite: 'Ship & Release skill evals',
skillPath: 'document-release/SKILL.md',
startMarker: '# Document Release:',
endMarker: '## Important Rules',
judgeContext: 'a post-ship documentation update workflow',
judgeGoal: 'how to audit and update project documentation after code ships: README, ARCHITECTURE, CONTRIBUTING, CLAUDE.md, CHANGELOG, TODOS',
});
}, 30_000);
});
// Block 2: Plan Review skills
describeIfSelected('Plan Review skill evals', [
'plan-ceo-review/SKILL.md modes', 'plan-eng-review/SKILL.md sections', 'plan-design-review/SKILL.md passes',
], () => {
testIfSelected('plan-ceo-review/SKILL.md modes', async () => {
await runWorkflowJudge({
testName: 'plan-ceo-review/SKILL.md modes',
suite: 'Plan Review skill evals',
skillPath: 'plan-ceo-review/SKILL.md',
startMarker: '## Step 0: Nuclear Scope Challenge',
endMarker: '## Review Sections',
judgeContext: 'a CEO/founder plan review framework with 4 scope modes',
judgeGoal: 'how to conduct a CEO-perspective plan review: challenge scope, select a mode (Expansion, Selective Expansion, Hold Scope, Reduction), then review sections interactively',
});
}, 30_000);
testIfSelected('plan-eng-review/SKILL.md sections', async () => {
await runWorkflowJudge({
testName: 'plan-eng-review/SKILL.md sections',
suite: 'Plan Review skill evals',
skillPath: 'plan-eng-review/SKILL.md',
startMarker: '## BEFORE YOU START:',
endMarker: '## CRITICAL RULE',
judgeContext: 'an engineering plan review framework with 4 review sections',
judgeGoal: 'how to review a plan for architecture quality, code quality, test coverage, and performance — walking through each section interactively with AskUserQuestion',
});
}, 30_000);
testIfSelected('plan-design-review/SKILL.md passes', async () => {
await runWorkflowJudge({
testName: 'plan-design-review/SKILL.md passes',
suite: 'Plan Review skill evals',
skillPath: 'plan-design-review/SKILL.md',
startMarker: '## Review Sections',
endMarker: '## CRITICAL RULE',
judgeContext: 'a design plan review framework with 7 review passes',
judgeGoal: 'how to review a plan for design quality using a 0-10 rating method: rate each dimension, explain what a 10 looks like, edit the plan to fix gaps, then re-rate',
});
}, 30_000);
});
// Block 3: Design skills
describeIfSelected('Design skill evals', ['design-review/SKILL.md fix loop', 'design-consultation/SKILL.md research'], () => {
testIfSelected('design-review/SKILL.md fix loop', async () => {
await runWorkflowJudge({
testName: 'design-review/SKILL.md fix loop',
suite: 'Design skill evals',
skillPath: 'design-review/SKILL.md',
startMarker: '## Phase 7:',
endMarker: '## Additional Rules',
judgeContext: 'a design audit triage and fix loop workflow',
judgeGoal: 'how to triage design issues by severity, fix them atomically in source code, commit each fix, and re-verify with before/after screenshots',
});
}, 30_000);
testIfSelected('design-consultation/SKILL.md research', async () => {
await runWorkflowJudge({
testName: 'design-consultation/SKILL.md research',
suite: 'Design skill evals',
skillPath: 'design-consultation/SKILL.md',
startMarker: '## Phase 1:',
endMarker: '## Phase 4:',
judgeContext: 'a design consultation research and proposal workflow',
judgeGoal: 'how to gather product context, research the competitive landscape, and produce a complete design system proposal with typography, color, spacing, and motion specifications',
});
}, 30_000);
});
// Block 4: Other skills
describeIfSelected('Other skill evals', [
'retro/SKILL.md instructions', 'qa-only/SKILL.md workflow', 'gstack-upgrade/SKILL.md upgrade flow',
], () => {
testIfSelected('retro/SKILL.md instructions', async () => {
await runWorkflowJudge({
testName: 'retro/SKILL.md instructions',
suite: 'Other skill evals',
skillPath: 'retro/SKILL.md',
startMarker: '## Instructions',
endMarker: '## Compare Mode',
judgeContext: 'an engineering retrospective data gathering and analysis workflow',
judgeGoal: 'how to gather git metrics (commit history, test counts, work patterns), analyze them, produce a structured retro report with praise, growth areas, and trend tracking',
});
}, 30_000);
testIfSelected('qa-only/SKILL.md workflow', async () => {
await runWorkflowJudge({
testName: 'qa-only/SKILL.md workflow',
suite: 'Other skill evals',
skillPath: 'qa-only/SKILL.md',
startMarker: '## Workflow',
endMarker: '## Important Rules',
judgeContext: 'a report-only QA testing workflow',
judgeGoal: 'how to systematically QA test a web application and produce a structured report with health score, screenshots, and repro steps — without fixing anything',
});
}, 30_000);
testIfSelected('gstack-upgrade/SKILL.md upgrade flow', async () => {
await runWorkflowJudge({
testName: 'gstack-upgrade/SKILL.md upgrade flow',
suite: 'Other skill evals',
skillPath: 'gstack-upgrade/SKILL.md',
startMarker: '## Inline upgrade flow',
endMarker: '## Standalone usage',
judgeContext: 'a version upgrade detection and execution workflow',
judgeGoal: 'how to detect install type, compare versions, back up current install, upgrade via git or fresh clone, run setup, and show what changed',
});
}, 30_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {