From b8dcca0b5b6b10f382026318a16f98bf72a48cf4 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 19 Mar 2026 21:10:37 -0700 Subject: [PATCH] test: add validation + E2E tests for spec review, sketch, benefits-from Unit tests (32 new assertions): - SPEC_REVIEW_LOOP: 5 dimensions, Agent dispatch, 3 iterations, quality score, metrics path, convergence guard, graceful failure - DESIGN_SKETCH: DESIGN.md awareness, wireframe, $B goto/screenshot, rough aesthetic, skip conditions - BENEFITS_FROM: prerequisite offer in CEO + eng review, graceful decline, skills without benefits-from don't get offer - office-hours structure: spec review loop, adversarial dimensions, visual sketch section E2E tests (2 new): - office-hours-spec-review: verifies agent understands the spec review loop from SKILL.md - plan-ceo-review-benefits: verifies agent understands the skill chaining offer Touchfiles updated for diff-based test selection. Co-Authored-By: Claude Opus 4.6 (1M context) --- test/gen-skill-docs.test.ts | 92 +++++++++++++++++++++++++ test/helpers/touchfiles.ts | 8 +++ test/skill-e2e.test.ts | 122 ++++++++++++++++++++++++++++++++++ test/skill-validation.test.ts | 69 +++++++++++++++++++ 4 files changed, 291 insertions(+) diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index 32d1ad81..7adb2776 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -422,3 +422,95 @@ describe('REVIEW_DASHBOARD resolver', () => { expect(content).not.toContain('Review Chaining'); }); }); + +// --- {{SPEC_REVIEW_LOOP}} resolver tests --- + +describe('SPEC_REVIEW_LOOP resolver', () => { + const content = fs.readFileSync(path.join(ROOT, 'office-hours', 'SKILL.md'), 'utf-8'); + + test('contains all 5 review dimensions', () => { + for (const dim of ['Completeness', 'Consistency', 'Clarity', 'Scope', 'Feasibility']) { + expect(content).toContain(dim); + } + }); + + test('references Agent tool for subagent dispatch', () => { + expect(content).toMatch(/Agent.*tool/i); + }); + + test('specifies max 3 iterations', () => { + expect(content).toMatch(/3.*iteration|maximum.*3/i); + }); + + test('includes quality score', () => { + expect(content).toContain('quality score'); + }); + + test('includes metrics path', () => { + expect(content).toContain('spec-review.jsonl'); + }); + + test('includes convergence guard', () => { + expect(content).toMatch(/[Cc]onvergence/); + }); + + test('includes graceful failure handling', () => { + expect(content).toMatch(/skip.*review|unavailable/i); + }); +}); + +// --- {{DESIGN_SKETCH}} resolver tests --- + +describe('DESIGN_SKETCH resolver', () => { + const content = fs.readFileSync(path.join(ROOT, 'office-hours', 'SKILL.md'), 'utf-8'); + + test('references DESIGN.md for design system constraints', () => { + expect(content).toContain('DESIGN.md'); + }); + + test('contains wireframe or sketch terminology', () => { + expect(content).toMatch(/wireframe|sketch/i); + }); + + test('references browse binary for rendering', () => { + expect(content).toContain('$B goto'); + }); + + test('references screenshot capture', () => { + expect(content).toContain('$B screenshot'); + }); + + test('specifies rough aesthetic', () => { + expect(content).toMatch(/[Rr]ough|hand-drawn/); + }); + + test('includes skip conditions', () => { + expect(content).toMatch(/no UI component|skip/i); + }); +}); + +// --- {{BENEFITS_FROM}} resolver tests --- + +describe('BENEFITS_FROM resolver', () => { + const ceoContent = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8'); + const engContent = fs.readFileSync(path.join(ROOT, 'plan-eng-review', 'SKILL.md'), 'utf-8'); + + test('plan-ceo-review contains prerequisite skill offer', () => { + expect(ceoContent).toContain('Prerequisite Skill Offer'); + expect(ceoContent).toContain('/office-hours'); + }); + + test('plan-eng-review contains prerequisite skill offer', () => { + expect(engContent).toContain('Prerequisite Skill Offer'); + expect(engContent).toContain('/office-hours'); + }); + + test('offer includes graceful decline', () => { + expect(ceoContent).toContain('No worries'); + }); + + test('skills without benefits-from do NOT have prerequisite offer', () => { + const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8'); + expect(qaContent).not.toContain('Prerequisite Skill Offer'); + }); +}); diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 19eba66e..c3c48394 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -57,9 +57,13 @@ export const E2E_TOUCHFILES: Record = { 'review-base-branch': ['review/**'], 'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'], + // Office Hours + 'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'], + // Plan reviews 'plan-ceo-review': ['plan-ceo-review/**'], 'plan-ceo-review-selective': ['plan-ceo-review/**'], + 'plan-ceo-review-benefits': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'], 'plan-eng-review': ['plan-eng-review/**'], 'plan-eng-review-artifact': ['plan-eng-review/**'], @@ -136,6 +140,10 @@ export const LLM_JUDGE_TOUCHFILES: Record = { 'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'], 'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'], + // Office Hours + 'office-hours/SKILL.md spec review': ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'office-hours/SKILL.md design sketch': ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + // Other skills 'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'], 'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'], diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts index 96019f70..0b6331f3 100644 --- a/test/skill-e2e.test.ts +++ b/test/skill-e2e.test.ts @@ -2911,6 +2911,128 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m }, 360_000); }); +// --- Office Hours Spec Review E2E --- + +describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'], () => { + let ohDir: string; + + beforeAll(() => { + ohDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-oh-spec-')); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: ohDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + fs.writeFileSync(path.join(ohDir, 'README.md'), '# Test Project\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'init']); + + // Copy office-hours skill + fs.mkdirSync(path.join(ohDir, 'office-hours'), { recursive: true }); + fs.copyFileSync( + path.join(ROOT, 'office-hours', 'SKILL.md'), + path.join(ohDir, 'office-hours', 'SKILL.md'), + ); + }); + + afterAll(() => { + try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {} + }); + + test('/office-hours SKILL.md contains spec review loop', async () => { + const result = await runSkillTest({ + prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop. + +Summarize what the "Spec Review Loop" section does — specifically: +1. How many dimensions does the reviewer check? +2. What tool is used to dispatch the reviewer? +3. What's the maximum number of iterations? +4. What metrics are tracked? + +Write your summary to ${ohDir}/spec-review-summary.md`, + workingDirectory: ohDir, + maxTurns: 8, + timeout: 120_000, + testName: 'office-hours-spec-review', + runId, + }); + + logCost('/office-hours spec review', result); + recordE2E('/office-hours-spec-review', 'Office Hours Spec Review E2E', result); + expect(result.exitReason).toBe('success'); + + const summaryPath = path.join(ohDir, 'spec-review-summary.md'); + if (fs.existsSync(summaryPath)) { + const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase(); + // Verify the agent understood the key concepts + expect(summary).toMatch(/5.*dimension|dimension.*5|completeness|consistency|clarity|scope|feasibility/); + expect(summary).toMatch(/agent|subagent/); + expect(summary).toMatch(/3.*iteration|iteration.*3|maximum.*3/); + } + }, 180_000); +}); + +// --- Plan CEO Review Benefits-From E2E --- + +describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefits'], () => { + let benefitsDir: string; + + beforeAll(() => { + benefitsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-benefits-')); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: benefitsDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + fs.writeFileSync(path.join(benefitsDir, 'README.md'), '# Test Project\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'init']); + + // Copy plan-ceo-review skill + fs.mkdirSync(path.join(benefitsDir, 'plan-ceo-review'), { recursive: true }); + fs.copyFileSync( + path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), + path.join(benefitsDir, 'plan-ceo-review', 'SKILL.md'), + ); + }); + + afterAll(() => { + try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {} + }); + + test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => { + const result = await runSkillTest({ + prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found". + +Summarize what happens when no design doc is found — specifically: +1. Is /office-hours offered as a prerequisite? +2. What options does the user get? +3. Is there a mid-session detection for when the user seems lost? + +Write your summary to ${benefitsDir}/benefits-summary.md`, + workingDirectory: benefitsDir, + maxTurns: 8, + timeout: 120_000, + testName: 'plan-ceo-review-benefits', + runId, + }); + + logCost('/plan-ceo-review benefits-from', result); + recordE2E('/plan-ceo-review-benefits', 'Plan CEO Review Benefits-From E2E', result); + expect(result.exitReason).toBe('success'); + + const summaryPath = path.join(benefitsDir, 'benefits-summary.md'); + if (fs.existsSync(summaryPath)) { + const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase(); + // Verify the agent understood the skill chaining + expect(summary).toMatch(/office.hours/); + expect(summary).toMatch(/design doc|no design/i); + } + }, 180_000); +}); + // Module-level afterAll — finalize eval collector after all tests complete afterAll(async () => { if (evalCollector) { diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index 13c74841..89ee533e 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -644,6 +644,59 @@ describe('office-hours skill structure', () => { test('contains builder operating principles', () => { expect(content).toContain('Delight is the currency'); }); + + // Spec Review Loop (Phase 5.5) + test('contains spec review loop', () => { + expect(content).toContain('Spec Review Loop'); + }); + + test('contains adversarial review dimensions', () => { + for (const dim of ['Completeness', 'Consistency', 'Clarity', 'Scope', 'Feasibility']) { + expect(content).toContain(dim); + } + }); + + test('contains subagent dispatch instruction', () => { + expect(content).toMatch(/Agent.*tool|subagent/i); + }); + + test('contains max 3 iterations', () => { + expect(content).toMatch(/3.*iteration|maximum.*3/i); + }); + + test('contains quality score', () => { + expect(content).toContain('quality score'); + }); + + test('contains spec review metrics path', () => { + expect(content).toContain('spec-review.jsonl'); + }); + + test('contains convergence guard', () => { + expect(content).toMatch(/convergence/i); + }); + + // Visual Sketch (Phase 4.5) + test('contains visual sketch section', () => { + expect(content).toContain('Visual Sketch'); + }); + + test('contains wireframe generation', () => { + expect(content).toMatch(/wireframe|sketch/i); + }); + + test('contains DESIGN.md awareness', () => { + expect(content).toContain('DESIGN.md'); + }); + + test('contains browse rendering', () => { + expect(content).toContain('$B goto'); + expect(content).toContain('$B screenshot'); + }); + + test('contains rough aesthetic instruction', () => { + expect(content).toMatch(/rough|hand-drawn/i); + }); }); describe('investigate skill structure', () => { @@ -856,6 +909,22 @@ describe('CEO review mode validation', () => { expect(content).toContain('HOLD SCOPE'); expect(content).toContain('REDUCTION'); }); + + // Skill chaining (benefits-from) + test('contains prerequisite skill offer for office-hours', () => { + expect(content).toContain('Prerequisite Skill Offer'); + expect(content).toContain('/office-hours'); + }); + + test('contains mid-session detection', () => { + expect(content).toContain('Mid-session detection'); + expect(content).toMatch(/still figuring out|seems lost/i); + }); + + // Spec review on CEO plans + test('contains spec review loop for CEO plan documents', () => { + expect(content).toContain('Spec Review Loop'); + }); }); // --- gstack-slug helper ---