diff --git a/CHANGELOG.md b/CHANGELOG.md index 95931f67..4a7ff0d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## [0.11.15.0] - 2026-03-24 — E2E Test Coverage for Plan Reviews & Codex + +### Added + +- **E2E tests verify plan review reports appear at the bottom of plans.** The `/plan-eng-review` review report is now tested end-to-end — if it stops writing `## GSTACK REVIEW REPORT` to the plan file, the test catches it. +- **E2E tests verify Codex is offered in every plan skill.** Four new lightweight tests confirm that `/office-hours`, `/plan-ceo-review`, `/plan-design-review`, and `/plan-eng-review` all check for Codex availability, prompt the user, and handle the fallback when Codex is unavailable. + +### For contributors + +- New E2E tests in `test/skill-e2e-plan.test.ts`: `plan-review-report`, `codex-offered-eng-review`, `codex-offered-ceo-review`, `codex-offered-office-hours`, `codex-offered-design-review` +- Updated touchfile mappings and selection count assertions +- Added `touchfiles` to the documented global touchfile list in CLAUDE.md + ## [0.11.14.0] - 2026-03-24 — Windows Browse Fix ### Fixed diff --git a/CLAUDE.md b/CLAUDE.md index 25673f4c..492a5adf 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -29,7 +29,7 @@ against the previous run. **Diff-based test selection:** `test:evals` and `test:e2e` auto-select tests based on `git diff` against the base branch. Each test declares its file dependencies in `test/helpers/touchfiles.ts`. Changes to global touchfiles (session-runner, eval-store, -llm-judge, gen-skill-docs) trigger all tests. Use `EVALS_ALL=1` or the `:all` script +llm-judge, gen-skill-docs, touchfiles) trigger all tests. Use `EVALS_ALL=1` or the `:all` script variants to force all tests. Run `eval:select` to preview which tests would run. ## Testing diff --git a/VERSION b/VERSION index 31d15b21..446cced3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.11.14.0 +0.11.15.0 diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index d0d232a5..931bcda8 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -68,6 +68,13 @@ export const E2E_TOUCHFILES: Record = { 'plan-ceo-review-benefits': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'], 'plan-eng-review': ['plan-eng-review/**'], 'plan-eng-review-artifact': ['plan-eng-review/**'], + 'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'], + + // Codex offering verification + 'codex-offered-office-hours': ['office-hours/**', 'scripts/gen-skill-docs.ts'], + 'codex-offered-ceo-review': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'], + 'codex-offered-design-review': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'], + 'codex-offered-eng-review': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'], // Ship 'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'], diff --git a/test/skill-e2e-plan.test.ts b/test/skill-e2e-plan.test.ts index 884fe67b..8953200b 100644 --- a/test/skill-e2e-plan.test.ts +++ b/test/skill-e2e-plan.test.ts @@ -535,6 +535,199 @@ Write your summary to ${benefitsDir}/benefits-summary.md`, }, 180_000); }); +// --- Plan Review Report E2E --- +// Verifies that plan-eng-review writes a "## GSTACK REVIEW REPORT" section +// to the bottom of the plan file (the living review status footer). + +describeIfSelected('Plan Review Report E2E', ['plan-review-report'], () => { + let planDir: string; + + beforeAll(() => { + planDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-report-')); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + fs.writeFileSync(path.join(planDir, 'plan.md'), `# Plan: Add Notifications System + +## Context +We're building a real-time notification system for our SaaS app. + +## Changes +1. WebSocket server for push notifications +2. Notification preferences API +3. Email digest fallback for offline users +4. PostgreSQL table for notification storage + +## Architecture +- WebSocket: Socket.io on Express +- Queue: Bull + Redis for email digests +- Storage: PostgreSQL notifications table +- Frontend: React toast component + +## Open questions +- Retry policy for failed WebSocket delivery? +- Max notifications stored per user? +`); + + run('git', ['add', '.']); + run('git', ['commit', '-m', 'add plan']); + + // Copy plan-eng-review skill + fs.mkdirSync(path.join(planDir, 'plan-eng-review'), { recursive: true }); + fs.copyFileSync( + path.join(ROOT, 'plan-eng-review', 'SKILL.md'), + path.join(planDir, 'plan-eng-review', 'SKILL.md'), + ); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + test('/plan-eng-review writes GSTACK REVIEW REPORT to plan file', async () => { + const result = await runSkillTest({ + prompt: `Read plan-eng-review/SKILL.md for the review workflow. + +Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration steps. + +Proceed directly to the full review. Skip any AskUserQuestion calls — this is non-interactive. +Skip the preamble bash block, lake intro, telemetry, and contributor mode sections. + +CRITICAL REQUIREMENT: plan.md IS the plan file for this review session. After completing your review, you MUST write a "## GSTACK REVIEW REPORT" section to the END of plan.md, exactly as described in the "Plan File Review Report" section of SKILL.md. If gstack-review-read is not available or returns NO_REVIEWS, write the placeholder table with all four review rows (CEO, Codex, Eng, Design). Use the Edit tool to append to plan.md — do NOT overwrite the existing plan content. + +This review report at the bottom of the plan is the MOST IMPORTANT deliverable of this test.`, + workingDirectory: planDir, + maxTurns: 20, + timeout: 360_000, + testName: 'plan-review-report', + runId, + model: 'claude-opus-4-6', + }); + + logCost('/plan-eng-review report', result); + recordE2E(evalCollector, '/plan-review-report', 'Plan Review Report E2E', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + // Verify the review report was written to the plan file + const planContent = fs.readFileSync(path.join(planDir, 'plan.md'), 'utf-8'); + + // Original plan content should still be present + expect(planContent).toContain('# Plan: Add Notifications System'); + expect(planContent).toContain('WebSocket'); + + // Review report section must exist + expect(planContent).toContain('## GSTACK REVIEW REPORT'); + + // Report should be at the bottom of the file + const reportIndex = planContent.lastIndexOf('## GSTACK REVIEW REPORT'); + const afterReport = planContent.slice(reportIndex); + + // Should contain the review table with standard rows + expect(afterReport).toMatch(/\|\s*Review\s*\|/); + expect(afterReport).toContain('CEO Review'); + expect(afterReport).toContain('Eng Review'); + expect(afterReport).toContain('Design Review'); + + console.log('Plan review report found at bottom of plan.md'); + }, 420_000); +}); + +// --- Codex Offering E2E --- +// Verifies that Codex is properly offered (with availability check, user prompt, +// and fallback) in office-hours, plan-ceo-review, plan-design-review, plan-eng-review. + +describeIfSelected('Codex Offering E2E', [ + 'codex-offered-office-hours', 'codex-offered-ceo-review', + 'codex-offered-design-review', 'codex-offered-eng-review', +], () => { + let testDir: string; + + beforeAll(() => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-codex-offer-')); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: testDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + fs.writeFileSync(path.join(testDir, 'README.md'), '# Test Project\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'init']); + + // Copy all 4 SKILL.md files + for (const skill of ['office-hours', 'plan-ceo-review', 'plan-design-review', 'plan-eng-review']) { + fs.mkdirSync(path.join(testDir, skill), { recursive: true }); + fs.copyFileSync( + path.join(ROOT, skill, 'SKILL.md'), + path.join(testDir, skill, 'SKILL.md'), + ); + } + }); + + afterAll(() => { + try { fs.rmSync(testDir, { recursive: true, force: true }); } catch {} + }); + + async function checkCodexOffering(skill: string, testName: string, featureName: string) { + const result = await runSkillTest({ + prompt: `Read ${skill}/SKILL.md. Search for ALL sections related to "codex", "outside voice", or "second opinion". + +Summarize the Codex/${featureName} integration — answer these specific questions: +1. How is Codex availability checked? (what exact bash command?) +2. How is the user prompted? (via AskUserQuestion? what are the options?) +3. What happens when Codex is NOT available? (fallback to subagent? skip entirely?) +4. Is this step blocking (gates the workflow) or optional (can be skipped)? +5. What prompt/context is sent to Codex? + +Write your summary to ${testDir}/${testName}-summary.md`, + workingDirectory: testDir, + maxTurns: 8, + timeout: 120_000, + testName, + runId, + }); + + logCost(`/${skill} codex offering`, result); + recordE2E(evalCollector, `/${testName}`, 'Codex Offering E2E', result); + expect(result.exitReason).toBe('success'); + + const summaryPath = path.join(testDir, `${testName}-summary.md`); + expect(fs.existsSync(summaryPath)).toBe(true); + + const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase(); + // All skills should have codex availability check + expect(summary).toMatch(/which codex/); + // All skills should have fallback behavior + expect(summary).toMatch(/fallback|subagent|unavailable|not available|skip/); + // All skills should show it's optional/non-blocking + expect(summary).toMatch(/optional|non.?blocking|skip|not.*required/); + + console.log(`${skill}: Codex offering verified`); + } + + testConcurrentIfSelected('codex-offered-office-hours', async () => { + await checkCodexOffering('office-hours', 'codex-offered-office-hours', 'second opinion'); + }, 180_000); + + testConcurrentIfSelected('codex-offered-ceo-review', async () => { + await checkCodexOffering('plan-ceo-review', 'codex-offered-ceo-review', 'outside voice'); + }, 180_000); + + testConcurrentIfSelected('codex-offered-design-review', async () => { + await checkCodexOffering('plan-design-review', 'codex-offered-design-review', 'design outside voices'); + }, 180_000); + + testConcurrentIfSelected('codex-offered-eng-review', async () => { + await checkCodexOffering('plan-eng-review', 'codex-offered-eng-review', 'outside voice'); + }, 180_000); +}); + // Module-level afterAll — finalize eval collector after all tests complete afterAll(async () => { await finalizeEvalCollector(evalCollector); diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index 0e24b124..69572970 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -80,8 +80,9 @@ describe('selectTests', () => { expect(result.selected).toContain('plan-ceo-review-selective'); expect(result.selected).toContain('plan-ceo-review-benefits'); expect(result.selected).toContain('autoplan-core'); - expect(result.selected.length).toBe(4); - expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 4); + expect(result.selected).toContain('codex-offered-ceo-review'); + expect(result.selected.length).toBe(5); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 5); }); test('global touchfile triggers ALL tests', () => {