test: add validation + E2E tests for spec review, sketch, benefits-from

Unit tests (32 new assertions):
- SPEC_REVIEW_LOOP: 5 dimensions, Agent dispatch, 3 iterations, quality
  score, metrics path, convergence guard, graceful failure
- DESIGN_SKETCH: DESIGN.md awareness, wireframe, $B goto/screenshot,
  rough aesthetic, skip conditions
- BENEFITS_FROM: prerequisite offer in CEO + eng review, graceful
  decline, skills without benefits-from don't get offer
- office-hours structure: spec review loop, adversarial dimensions,
  visual sketch section

E2E tests (2 new):
- office-hours-spec-review: verifies agent understands the spec review
  loop from SKILL.md
- plan-ceo-review-benefits: verifies agent understands the skill
  chaining offer

Touchfiles updated for diff-based test selection.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-19 21:10:37 -07:00
parent ab82e21957
commit b8dcca0b5b
4 changed files with 291 additions and 0 deletions
+92
View File
@@ -422,3 +422,95 @@ describe('REVIEW_DASHBOARD resolver', () => {
expect(content).not.toContain('Review Chaining');
});
});
// --- {{SPEC_REVIEW_LOOP}} resolver tests ---
describe('SPEC_REVIEW_LOOP resolver', () => {
const content = fs.readFileSync(path.join(ROOT, 'office-hours', 'SKILL.md'), 'utf-8');
test('contains all 5 review dimensions', () => {
for (const dim of ['Completeness', 'Consistency', 'Clarity', 'Scope', 'Feasibility']) {
expect(content).toContain(dim);
}
});
test('references Agent tool for subagent dispatch', () => {
expect(content).toMatch(/Agent.*tool/i);
});
test('specifies max 3 iterations', () => {
expect(content).toMatch(/3.*iteration|maximum.*3/i);
});
test('includes quality score', () => {
expect(content).toContain('quality score');
});
test('includes metrics path', () => {
expect(content).toContain('spec-review.jsonl');
});
test('includes convergence guard', () => {
expect(content).toMatch(/[Cc]onvergence/);
});
test('includes graceful failure handling', () => {
expect(content).toMatch(/skip.*review|unavailable/i);
});
});
// --- {{DESIGN_SKETCH}} resolver tests ---
describe('DESIGN_SKETCH resolver', () => {
const content = fs.readFileSync(path.join(ROOT, 'office-hours', 'SKILL.md'), 'utf-8');
test('references DESIGN.md for design system constraints', () => {
expect(content).toContain('DESIGN.md');
});
test('contains wireframe or sketch terminology', () => {
expect(content).toMatch(/wireframe|sketch/i);
});
test('references browse binary for rendering', () => {
expect(content).toContain('$B goto');
});
test('references screenshot capture', () => {
expect(content).toContain('$B screenshot');
});
test('specifies rough aesthetic', () => {
expect(content).toMatch(/[Rr]ough|hand-drawn/);
});
test('includes skip conditions', () => {
expect(content).toMatch(/no UI component|skip/i);
});
});
// --- {{BENEFITS_FROM}} resolver tests ---
describe('BENEFITS_FROM resolver', () => {
const ceoContent = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8');
const engContent = fs.readFileSync(path.join(ROOT, 'plan-eng-review', 'SKILL.md'), 'utf-8');
test('plan-ceo-review contains prerequisite skill offer', () => {
expect(ceoContent).toContain('Prerequisite Skill Offer');
expect(ceoContent).toContain('/office-hours');
});
test('plan-eng-review contains prerequisite skill offer', () => {
expect(engContent).toContain('Prerequisite Skill Offer');
expect(engContent).toContain('/office-hours');
});
test('offer includes graceful decline', () => {
expect(ceoContent).toContain('No worries');
});
test('skills without benefits-from do NOT have prerequisite offer', () => {
const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(qaContent).not.toContain('Prerequisite Skill Offer');
});
});
+8
View File
@@ -57,9 +57,13 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'review-base-branch': ['review/**'],
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
// Office Hours
'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
// Plan reviews
'plan-ceo-review': ['plan-ceo-review/**'],
'plan-ceo-review-selective': ['plan-ceo-review/**'],
'plan-ceo-review-benefits': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
'plan-eng-review': ['plan-eng-review/**'],
'plan-eng-review-artifact': ['plan-eng-review/**'],
@@ -136,6 +140,10 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
// Office Hours
'office-hours/SKILL.md spec review': ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'office-hours/SKILL.md design sketch': ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
// Other skills
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
+122
View File
@@ -2911,6 +2911,128 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
}, 360_000);
});
// --- Office Hours Spec Review E2E ---
describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'], () => {
let ohDir: string;
beforeAll(() => {
ohDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-oh-spec-'));
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: ohDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(ohDir, 'README.md'), '# Test Project\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'init']);
// Copy office-hours skill
fs.mkdirSync(path.join(ohDir, 'office-hours'), { recursive: true });
fs.copyFileSync(
path.join(ROOT, 'office-hours', 'SKILL.md'),
path.join(ohDir, 'office-hours', 'SKILL.md'),
);
});
afterAll(() => {
try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {}
});
test('/office-hours SKILL.md contains spec review loop', async () => {
const result = await runSkillTest({
prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.
Summarize what the "Spec Review Loop" section does — specifically:
1. How many dimensions does the reviewer check?
2. What tool is used to dispatch the reviewer?
3. What's the maximum number of iterations?
4. What metrics are tracked?
Write your summary to ${ohDir}/spec-review-summary.md`,
workingDirectory: ohDir,
maxTurns: 8,
timeout: 120_000,
testName: 'office-hours-spec-review',
runId,
});
logCost('/office-hours spec review', result);
recordE2E('/office-hours-spec-review', 'Office Hours Spec Review E2E', result);
expect(result.exitReason).toBe('success');
const summaryPath = path.join(ohDir, 'spec-review-summary.md');
if (fs.existsSync(summaryPath)) {
const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
// Verify the agent understood the key concepts
expect(summary).toMatch(/5.*dimension|dimension.*5|completeness|consistency|clarity|scope|feasibility/);
expect(summary).toMatch(/agent|subagent/);
expect(summary).toMatch(/3.*iteration|iteration.*3|maximum.*3/);
}
}, 180_000);
});
// --- Plan CEO Review Benefits-From E2E ---
describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefits'], () => {
let benefitsDir: string;
beforeAll(() => {
benefitsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-benefits-'));
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: benefitsDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(benefitsDir, 'README.md'), '# Test Project\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'init']);
// Copy plan-ceo-review skill
fs.mkdirSync(path.join(benefitsDir, 'plan-ceo-review'), { recursive: true });
fs.copyFileSync(
path.join(ROOT, 'plan-ceo-review', 'SKILL.md'),
path.join(benefitsDir, 'plan-ceo-review', 'SKILL.md'),
);
});
afterAll(() => {
try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {}
});
test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".
Summarize what happens when no design doc is found — specifically:
1. Is /office-hours offered as a prerequisite?
2. What options does the user get?
3. Is there a mid-session detection for when the user seems lost?
Write your summary to ${benefitsDir}/benefits-summary.md`,
workingDirectory: benefitsDir,
maxTurns: 8,
timeout: 120_000,
testName: 'plan-ceo-review-benefits',
runId,
});
logCost('/plan-ceo-review benefits-from', result);
recordE2E('/plan-ceo-review-benefits', 'Plan CEO Review Benefits-From E2E', result);
expect(result.exitReason).toBe('success');
const summaryPath = path.join(benefitsDir, 'benefits-summary.md');
if (fs.existsSync(summaryPath)) {
const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
// Verify the agent understood the skill chaining
expect(summary).toMatch(/office.hours/);
expect(summary).toMatch(/design doc|no design/i);
}
}, 180_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
+69
View File
@@ -644,6 +644,59 @@ describe('office-hours skill structure', () => {
test('contains builder operating principles', () => {
expect(content).toContain('Delight is the currency');
});
// Spec Review Loop (Phase 5.5)
test('contains spec review loop', () => {
expect(content).toContain('Spec Review Loop');
});
test('contains adversarial review dimensions', () => {
for (const dim of ['Completeness', 'Consistency', 'Clarity', 'Scope', 'Feasibility']) {
expect(content).toContain(dim);
}
});
test('contains subagent dispatch instruction', () => {
expect(content).toMatch(/Agent.*tool|subagent/i);
});
test('contains max 3 iterations', () => {
expect(content).toMatch(/3.*iteration|maximum.*3/i);
});
test('contains quality score', () => {
expect(content).toContain('quality score');
});
test('contains spec review metrics path', () => {
expect(content).toContain('spec-review.jsonl');
});
test('contains convergence guard', () => {
expect(content).toMatch(/convergence/i);
});
// Visual Sketch (Phase 4.5)
test('contains visual sketch section', () => {
expect(content).toContain('Visual Sketch');
});
test('contains wireframe generation', () => {
expect(content).toMatch(/wireframe|sketch/i);
});
test('contains DESIGN.md awareness', () => {
expect(content).toContain('DESIGN.md');
});
test('contains browse rendering', () => {
expect(content).toContain('$B goto');
expect(content).toContain('$B screenshot');
});
test('contains rough aesthetic instruction', () => {
expect(content).toMatch(/rough|hand-drawn/i);
});
});
describe('investigate skill structure', () => {
@@ -856,6 +909,22 @@ describe('CEO review mode validation', () => {
expect(content).toContain('HOLD SCOPE');
expect(content).toContain('REDUCTION');
});
// Skill chaining (benefits-from)
test('contains prerequisite skill offer for office-hours', () => {
expect(content).toContain('Prerequisite Skill Offer');
expect(content).toContain('/office-hours');
});
test('contains mid-session detection', () => {
expect(content).toContain('Mid-session detection');
expect(content).toMatch(/still figuring out|seems lost/i);
});
// Spec review on CEO plans
test('contains spec review loop for CEO plan documents', () => {
expect(content).toContain('Spec Review Loop');
});
});
// --- gstack-slug helper ---