test: add validation + E2E tests for spec review, sketch, benefits-from

Unit tests (32 new assertions): - SPEC_REVIEW_LOOP: 5 dimensions, Agent dispatch, 3 iterations, quality score, metrics path, convergence guard, graceful failure - DESIGN_SKETCH: DESIGN.md awareness, wireframe, $B goto/screenshot, rough aesthetic, skip conditions - BENEFITS_FROM: prerequisite offer in CEO + eng review, graceful decline, skills without benefits-from don't get offer - office-hours structure: spec review loop, adversarial dimensions, visual sketch section E2E tests (2 new): - office-hours-spec-review: verifies agent understands the spec review loop from SKILL.md - plan-ceo-review-benefits: verifies agent understands the skill chaining offer Touchfiles updated for diff-based test selection. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-05 21:25:27 +02:00 · 2026-03-19 21:10:37 -07:00
parent ab82e21957
commit b8dcca0b5b
4 changed files with 291 additions and 0 deletions
@@ -422,3 +422,95 @@ describe('REVIEW_DASHBOARD resolver', () => {
    expect(content).not.toContain('Review Chaining');
  });
 });
+
+// --- {{SPEC_REVIEW_LOOP}} resolver tests ---
+
+describe('SPEC_REVIEW_LOOP resolver', () => {
+  const content = fs.readFileSync(path.join(ROOT, 'office-hours', 'SKILL.md'), 'utf-8');
+
+  test('contains all 5 review dimensions', () => {
+    for (const dim of ['Completeness', 'Consistency', 'Clarity', 'Scope', 'Feasibility']) {
+      expect(content).toContain(dim);
+    }
+  });
+
+  test('references Agent tool for subagent dispatch', () => {
+    expect(content).toMatch(/Agent.*tool/i);
+  });
+
+  test('specifies max 3 iterations', () => {
+    expect(content).toMatch(/3.*iteration|maximum.*3/i);
+  });
+
+  test('includes quality score', () => {
+    expect(content).toContain('quality score');
+  });
+
+  test('includes metrics path', () => {
+    expect(content).toContain('spec-review.jsonl');
+  });
+
+  test('includes convergence guard', () => {
+    expect(content).toMatch(/[Cc]onvergence/);
+  });
+
+  test('includes graceful failure handling', () => {
+    expect(content).toMatch(/skip.*review|unavailable/i);
+  });
+});
+
+// --- {{DESIGN_SKETCH}} resolver tests ---
+
+describe('DESIGN_SKETCH resolver', () => {
+  const content = fs.readFileSync(path.join(ROOT, 'office-hours', 'SKILL.md'), 'utf-8');
+
+  test('references DESIGN.md for design system constraints', () => {
+    expect(content).toContain('DESIGN.md');
+  });
+
+  test('contains wireframe or sketch terminology', () => {
+    expect(content).toMatch(/wireframe|sketch/i);
+  });
+
+  test('references browse binary for rendering', () => {
+    expect(content).toContain('$B goto');
+  });
+
+  test('references screenshot capture', () => {
+    expect(content).toContain('$B screenshot');
+  });
+
+  test('specifies rough aesthetic', () => {
+    expect(content).toMatch(/[Rr]ough|hand-drawn/);
+  });
+
+  test('includes skip conditions', () => {
+    expect(content).toMatch(/no UI component|skip/i);
+  });
+});
+
+// --- {{BENEFITS_FROM}} resolver tests ---
+
+describe('BENEFITS_FROM resolver', () => {
+  const ceoContent = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8');
+  const engContent = fs.readFileSync(path.join(ROOT, 'plan-eng-review', 'SKILL.md'), 'utf-8');
+
+  test('plan-ceo-review contains prerequisite skill offer', () => {
+    expect(ceoContent).toContain('Prerequisite Skill Offer');
+    expect(ceoContent).toContain('/office-hours');
+  });
+
+  test('plan-eng-review contains prerequisite skill offer', () => {
+    expect(engContent).toContain('Prerequisite Skill Offer');
+    expect(engContent).toContain('/office-hours');
+  });
+
+  test('offer includes graceful decline', () => {
+    expect(ceoContent).toContain('No worries');
+  });
+
+  test('skills without benefits-from do NOT have prerequisite offer', () => {
+    const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
+    expect(qaContent).not.toContain('Prerequisite Skill Offer');
+  });
+});
@@ -57,9 +57,13 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  'review-base-branch':       ['review/**'],
  'review-design-lite':       ['review/**', 'test/fixtures/review-eval-design-slop.*'],

+  // Office Hours
+  'office-hours-spec-review':  ['office-hours/**', 'scripts/gen-skill-docs.ts'],
+
  // Plan reviews
  'plan-ceo-review':           ['plan-ceo-review/**'],
  'plan-ceo-review-selective': ['plan-ceo-review/**'],
+  'plan-ceo-review-benefits':  ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
  'plan-eng-review':           ['plan-eng-review/**'],
  'plan-eng-review-artifact':  ['plan-eng-review/**'],

@@ -136,6 +140,10 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
  'design-review/SKILL.md fix loop':      ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
  'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],

+  // Office Hours
+  'office-hours/SKILL.md spec review':    ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
+  'office-hours/SKILL.md design sketch':  ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
+
  // Other skills
  'retro/SKILL.md instructions':          ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
  'qa-only/SKILL.md workflow':            ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
@@ -2911,6 +2911,128 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
  }, 360_000);
 });

+// --- Office Hours Spec Review E2E ---
+
+describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'], () => {
+  let ohDir: string;
+
+  beforeAll(() => {
+    ohDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-oh-spec-'));
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: ohDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+    fs.writeFileSync(path.join(ohDir, 'README.md'), '# Test Project\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'init']);
+
+    // Copy office-hours skill
+    fs.mkdirSync(path.join(ohDir, 'office-hours'), { recursive: true });
+    fs.copyFileSync(
+      path.join(ROOT, 'office-hours', 'SKILL.md'),
+      path.join(ohDir, 'office-hours', 'SKILL.md'),
+    );
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {}
+  });
+
+  test('/office-hours SKILL.md contains spec review loop', async () => {
+    const result = await runSkillTest({
+      prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.
+
+Summarize what the "Spec Review Loop" section does — specifically:
+1. How many dimensions does the reviewer check?
+2. What tool is used to dispatch the reviewer?
+3. What's the maximum number of iterations?
+4. What metrics are tracked?
+
+Write your summary to ${ohDir}/spec-review-summary.md`,
+      workingDirectory: ohDir,
+      maxTurns: 8,
+      timeout: 120_000,
+      testName: 'office-hours-spec-review',
+      runId,
+    });
+
+    logCost('/office-hours spec review', result);
+    recordE2E('/office-hours-spec-review', 'Office Hours Spec Review E2E', result);
+    expect(result.exitReason).toBe('success');
+
+    const summaryPath = path.join(ohDir, 'spec-review-summary.md');
+    if (fs.existsSync(summaryPath)) {
+      const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
+      // Verify the agent understood the key concepts
+      expect(summary).toMatch(/5.*dimension|dimension.*5|completeness|consistency|clarity|scope|feasibility/);
+      expect(summary).toMatch(/agent|subagent/);
+      expect(summary).toMatch(/3.*iteration|iteration.*3|maximum.*3/);
+    }
+  }, 180_000);
+});
+
+// --- Plan CEO Review Benefits-From E2E ---
+
+describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefits'], () => {
+  let benefitsDir: string;
+
+  beforeAll(() => {
+    benefitsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-benefits-'));
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: benefitsDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+    fs.writeFileSync(path.join(benefitsDir, 'README.md'), '# Test Project\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'init']);
+
+    // Copy plan-ceo-review skill
+    fs.mkdirSync(path.join(benefitsDir, 'plan-ceo-review'), { recursive: true });
+    fs.copyFileSync(
+      path.join(ROOT, 'plan-ceo-review', 'SKILL.md'),
+      path.join(benefitsDir, 'plan-ceo-review', 'SKILL.md'),
+    );
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {}
+  });
+
+  test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
+    const result = await runSkillTest({
+      prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".
+
+Summarize what happens when no design doc is found — specifically:
+1. Is /office-hours offered as a prerequisite?
+2. What options does the user get?
+3. Is there a mid-session detection for when the user seems lost?
+
+Write your summary to ${benefitsDir}/benefits-summary.md`,
+      workingDirectory: benefitsDir,
+      maxTurns: 8,
+      timeout: 120_000,
+      testName: 'plan-ceo-review-benefits',
+      runId,
+    });
+
+    logCost('/plan-ceo-review benefits-from', result);
+    recordE2E('/plan-ceo-review-benefits', 'Plan CEO Review Benefits-From E2E', result);
+    expect(result.exitReason).toBe('success');
+
+    const summaryPath = path.join(benefitsDir, 'benefits-summary.md');
+    if (fs.existsSync(summaryPath)) {
+      const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
+      // Verify the agent understood the skill chaining
+      expect(summary).toMatch(/office.hours/);
+      expect(summary).toMatch(/design doc|no design/i);
+    }
+  }, 180_000);
+});
+
 // Module-level afterAll — finalize eval collector after all tests complete
 afterAll(async () => {
  if (evalCollector) {
@@ -644,6 +644,59 @@ describe('office-hours skill structure', () => {
  test('contains builder operating principles', () => {
    expect(content).toContain('Delight is the currency');
  });
+
+  // Spec Review Loop (Phase 5.5)
+  test('contains spec review loop', () => {
+    expect(content).toContain('Spec Review Loop');
+  });
+
+  test('contains adversarial review dimensions', () => {
+    for (const dim of ['Completeness', 'Consistency', 'Clarity', 'Scope', 'Feasibility']) {
+      expect(content).toContain(dim);
+    }
+  });
+
+  test('contains subagent dispatch instruction', () => {
+    expect(content).toMatch(/Agent.*tool|subagent/i);
+  });
+
+  test('contains max 3 iterations', () => {
+    expect(content).toMatch(/3.*iteration|maximum.*3/i);
+  });
+
+  test('contains quality score', () => {
+    expect(content).toContain('quality score');
+  });
+
+  test('contains spec review metrics path', () => {
+    expect(content).toContain('spec-review.jsonl');
+  });
+
+  test('contains convergence guard', () => {
+    expect(content).toMatch(/convergence/i);
+  });
+
+  // Visual Sketch (Phase 4.5)
+  test('contains visual sketch section', () => {
+    expect(content).toContain('Visual Sketch');
+  });
+
+  test('contains wireframe generation', () => {
+    expect(content).toMatch(/wireframe|sketch/i);
+  });
+
+  test('contains DESIGN.md awareness', () => {
+    expect(content).toContain('DESIGN.md');
+  });
+
+  test('contains browse rendering', () => {
+    expect(content).toContain('$B goto');
+    expect(content).toContain('$B screenshot');
+  });
+
+  test('contains rough aesthetic instruction', () => {
+    expect(content).toMatch(/rough|hand-drawn/i);
+  });
 });

 describe('investigate skill structure', () => {
@@ -856,6 +909,22 @@ describe('CEO review mode validation', () => {
    expect(content).toContain('HOLD SCOPE');
    expect(content).toContain('REDUCTION');
  });
+
+  // Skill chaining (benefits-from)
+  test('contains prerequisite skill offer for office-hours', () => {
+    expect(content).toContain('Prerequisite Skill Offer');
+    expect(content).toContain('/office-hours');
+  });
+
+  test('contains mid-session detection', () => {
+    expect(content).toContain('Mid-session detection');
+    expect(content).toMatch(/still figuring out|seems lost/i);
+  });
+
+  // Spec review on CEO plans
+  test('contains spec review loop for CEO plan documents', () => {
+    expect(content).toContain('Spec Review Loop');
+  });
 });

 // --- gstack-slug helper ---