From 17d9c7fec02e518f0c38856dc6b5556e433dbf7d Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Mon, 16 Mar 2026 01:15:45 -0500 Subject: [PATCH] feat: add evals for RECOMMENDATION format, session awareness, and enum completeness Free tests (Tier 1): RECOMMENDATION format + session awareness in all preamble SKILL.md files, enum completeness checklist structure and CRITICAL classification. E2E eval: /review catches missed enum handlers when a new status value is added but not handled in case/switch and notify methods. Co-Authored-By: Claude Opus 4.6 (1M context) --- test/fixtures/review-eval-enum-diff.rb | 30 +++++++++++ test/fixtures/review-eval-enum.rb | 27 ++++++++++ test/skill-e2e.test.ts | 72 ++++++++++++++++++++++++++ test/skill-validation.test.ts | 60 +++++++++++++++++++++ 4 files changed, 189 insertions(+) create mode 100644 test/fixtures/review-eval-enum-diff.rb create mode 100644 test/fixtures/review-eval-enum.rb diff --git a/test/fixtures/review-eval-enum-diff.rb b/test/fixtures/review-eval-enum-diff.rb new file mode 100644 index 00000000..9b87c2a1 --- /dev/null +++ b/test/fixtures/review-eval-enum-diff.rb @@ -0,0 +1,30 @@ +# Feature branch version: adds "returned" status but misses consumers +class Order < ApplicationRecord + STATUSES = %w[pending processing shipped delivered returned].freeze + + validates :status, inclusion: { in: STATUSES } + + def display_status + case status + when 'pending' then 'Awaiting processing' + when 'processing' then 'Being prepared' + when 'shipped' then 'On the way' + when 'delivered' then 'Delivered' + # BUG: 'returned' not handled — falls through to nil + end + end + + def can_cancel? + # BUG: should 'returned' be cancellable? Not considered. + %w[pending processing].include?(status) + end + + def notify_customer + case status + when 'pending' then OrderMailer.confirmation(self).deliver_later + when 'shipped' then OrderMailer.shipped(self).deliver_later + when 'delivered' then OrderMailer.delivered(self).deliver_later + # BUG: 'returned' has no notification — customer won't know return was received + end + end +end diff --git a/test/fixtures/review-eval-enum.rb b/test/fixtures/review-eval-enum.rb new file mode 100644 index 00000000..79960a31 --- /dev/null +++ b/test/fixtures/review-eval-enum.rb @@ -0,0 +1,27 @@ +# Existing file on main: order model with status handling +class Order < ApplicationRecord + STATUSES = %w[pending processing shipped delivered].freeze + + validates :status, inclusion: { in: STATUSES } + + def display_status + case status + when 'pending' then 'Awaiting processing' + when 'processing' then 'Being prepared' + when 'shipped' then 'On the way' + when 'delivered' then 'Delivered' + end + end + + def can_cancel? + %w[pending processing].include?(status) + end + + def notify_customer + case status + when 'pending' then OrderMailer.confirmation(self).deliver_later + when 'shipped' then OrderMailer.shipped(self).deliver_later + when 'delivered' then OrderMailer.delivered(self).deliver_later + end + end +end diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts index df6f50be..d36e011e 100644 --- a/test/skill-e2e.test.ts +++ b/test/skill-e2e.test.ts @@ -438,6 +438,78 @@ Write your review findings to ${reviewDir}/review-output.md`, }, 120_000); }); +// --- Review: Enum completeness E2E --- + +describeE2E('Review enum completeness E2E', () => { + let enumDir: string; + + beforeAll(() => { + enumDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-enum-')); + + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: enumDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + // Commit baseline on main — order model with 4 statuses + const baseContent = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-enum.rb'), 'utf-8'); + fs.writeFileSync(path.join(enumDir, 'order.rb'), baseContent); + run('git', ['add', 'order.rb']); + run('git', ['commit', '-m', 'initial order model']); + + // Feature branch adds "returned" status but misses handlers + run('git', ['checkout', '-b', 'feature/add-returned-status']); + const diffContent = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-enum-diff.rb'), 'utf-8'); + fs.writeFileSync(path.join(enumDir, 'order.rb'), diffContent); + run('git', ['add', 'order.rb']); + run('git', ['commit', '-m', 'add returned status']); + + // Copy review skill files + fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(enumDir, 'review-SKILL.md')); + fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(enumDir, 'review-checklist.md')); + fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(enumDir, 'review-greptile-triage.md')); + }); + + afterAll(() => { + try { fs.rmSync(enumDir, { recursive: true, force: true }); } catch {} + }); + + test('/review catches missing enum handlers for new status value', async () => { + const result = await runSkillTest({ + prompt: `You are in a git repo on branch feature/add-returned-status with changes against main. +Read review-SKILL.md for the review workflow instructions. +Also read review-checklist.md and apply it — pay special attention to the Enum & Value Completeness section. +Run /review on the current diff (git diff main...HEAD). +Write your review findings to ${enumDir}/review-output.md + +The diff adds a new "returned" status to the Order model. Your job is to check if all consumers handle it.`, + workingDirectory: enumDir, + maxTurns: 15, + timeout: 90_000, + testName: 'review-enum-completeness', + runId, + }); + + logCost('/review enum', result); + recordE2E('/review enum completeness', 'Review enum completeness E2E', result); + expect(result.exitReason).toBe('success'); + + // Verify the review caught the missing enum handlers + const reviewPath = path.join(enumDir, 'review-output.md'); + if (fs.existsSync(reviewPath)) { + const review = fs.readFileSync(reviewPath, 'utf-8'); + // Should mention the missing "returned" handling in at least one of the methods + const mentionsReturned = review.toLowerCase().includes('returned'); + const mentionsEnum = review.toLowerCase().includes('enum') || review.toLowerCase().includes('status'); + const mentionsCritical = review.toLowerCase().includes('critical'); + expect(mentionsReturned).toBe(true); + expect(mentionsEnum || mentionsCritical).toBe(true); + } + }, 120_000); +}); + // --- B6/B7/B8: Planted-bug outcome evals --- // Outcome evals also need ANTHROPIC_API_KEY for the LLM judge diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index a2ce421c..88e98935 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -411,6 +411,66 @@ describe('TODOS-format.md reference consistency', () => { }); }); +// --- v0.4.1 feature coverage: RECOMMENDATION format, session awareness, enum completeness --- + +describe('v0.4.1 preamble features', () => { + const skillsWithPreamble = [ + 'SKILL.md', 'browse/SKILL.md', 'qa/SKILL.md', + 'qa-only/SKILL.md', + 'setup-browser-cookies/SKILL.md', + 'ship/SKILL.md', 'review/SKILL.md', + 'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md', + 'retro/SKILL.md', + ]; + + for (const skill of skillsWithPreamble) { + test(`${skill} contains RECOMMENDATION format`, () => { + const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8'); + expect(content).toContain('RECOMMENDATION: Choose'); + expect(content).toContain('AskUserQuestion'); + }); + + test(`${skill} contains session awareness`, () => { + const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8'); + expect(content).toContain('_SESSIONS'); + expect(content).toContain('ELI16'); + }); + } +}); + +describe('Enum & Value Completeness in review checklist', () => { + const checklist = fs.readFileSync(path.join(ROOT, 'review', 'checklist.md'), 'utf-8'); + + test('checklist has Enum & Value Completeness section', () => { + expect(checklist).toContain('Enum & Value Completeness'); + }); + + test('Enum & Value Completeness is classified as CRITICAL', () => { + // It should appear under Pass 1 — CRITICAL, not Pass 2 + const pass1Start = checklist.indexOf('### Pass 1'); + const pass2Start = checklist.indexOf('### Pass 2'); + const enumStart = checklist.indexOf('Enum & Value Completeness'); + expect(enumStart).toBeGreaterThan(pass1Start); + expect(enumStart).toBeLessThan(pass2Start); + }); + + test('Enum & Value Completeness mentions tracing through consumers', () => { + expect(checklist).toContain('Trace it through every consumer'); + expect(checklist).toContain('case'); + expect(checklist).toContain('allowlist'); + }); + + test('Enum & Value Completeness is in the gate classification as CRITICAL', () => { + const gateSection = checklist.slice(checklist.indexOf('## Gate Classification')); + // The ASCII art has CRITICAL on the left and INFORMATIONAL on the right + // Enum & Value Completeness should appear on a line with the CRITICAL tree (├─ or └─) + const enumLine = gateSection.split('\n').find(l => l.includes('Enum & Value Completeness')); + expect(enumLine).toBeDefined(); + // It's on the left (CRITICAL) side — starts with ├─ or └─ + expect(enumLine!.trimStart().startsWith('├─') || enumLine!.trimStart().startsWith('└─')).toBe(true); + }); +}); + // --- Part 7: Planted-bug fixture validation (A4) --- describe('Planted-bug fixture validation', () => {