diff --git a/test/diff-scope.test.ts b/test/diff-scope.test.ts new file mode 100644 index 00000000..44cfe03f --- /dev/null +++ b/test/diff-scope.test.ts @@ -0,0 +1,165 @@ +/** + * Tests for bin/gstack-diff-scope — verifies scope signal detection. + * + * Creates temp git repos with specific file patterns and verifies + * the correct SCOPE_* variables are output. + */ +import { describe, test, expect, afterAll } from 'bun:test'; +import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from 'fs'; +import { join } from 'path'; +import { tmpdir } from 'os'; +import { spawnSync } from 'child_process'; + +const SCRIPT = join(import.meta.dir, '..', 'bin', 'gstack-diff-scope'); + +const dirs: string[] = []; + +function createRepo(files: string[]): string { + const dir = mkdtempSync(join(tmpdir(), 'diff-scope-test-')); + dirs.push(dir); + + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + // Base commit + writeFileSync(join(dir, 'README.md'), '# test\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial']); + + // Feature branch with specified files + run('git', ['checkout', '-b', 'feature/test']); + for (const f of files) { + const fullPath = join(dir, f); + const dirPath = fullPath.substring(0, fullPath.lastIndexOf('/')); + if (dirPath !== dir) mkdirSync(dirPath, { recursive: true }); + writeFileSync(fullPath, '# test content\n'); + } + run('git', ['add', '.']); + run('git', ['commit', '-m', 'add files']); + + return dir; +} + +function runScope(dir: string): Record { + const result = spawnSync('bash', [SCRIPT, 'main'], { + cwd: dir, stdio: 'pipe', timeout: 5000, + }); + const output = result.stdout.toString().trim(); + const vars: Record = {}; + for (const line of output.split('\n')) { + const [key, val] = line.split('='); + if (key && val) vars[key] = val; + } + return vars; +} + +afterAll(() => { + for (const d of dirs) { + try { rmSync(d, { recursive: true, force: true }); } catch {} + } +}); + +describe('gstack-diff-scope', () => { + // --- Existing scope signals --- + + test('detects frontend files', () => { + const dir = createRepo(['styles.css', 'component.tsx']); + const scope = runScope(dir); + expect(scope.SCOPE_FRONTEND).toBe('true'); + }); + + test('detects backend files', () => { + const dir = createRepo(['app.rb', 'service.py']); + const scope = runScope(dir); + expect(scope.SCOPE_BACKEND).toBe('true'); + }); + + test('detects test files', () => { + const dir = createRepo(['test/app.test.ts']); + const scope = runScope(dir); + expect(scope.SCOPE_TESTS).toBe('true'); + }); + + // --- New scope signals (Review Army) --- + + test('detects migrations via db/migrate/', () => { + const dir = createRepo(['db/migrate/20260330_create_users.rb']); + const scope = runScope(dir); + expect(scope.SCOPE_MIGRATIONS).toBe('true'); + }); + + test('detects migrations via generic migrations/', () => { + const dir = createRepo(['app/migrations/0001_initial.py']); + const scope = runScope(dir); + expect(scope.SCOPE_MIGRATIONS).toBe('true'); + }); + + test('detects migrations via prisma', () => { + const dir = createRepo(['prisma/migrations/20260330/migration.sql']); + const scope = runScope(dir); + expect(scope.SCOPE_MIGRATIONS).toBe('true'); + }); + + test('detects API via controller files', () => { + const dir = createRepo(['app/controllers/users_controller.rb']); + const scope = runScope(dir); + expect(scope.SCOPE_API).toBe('true'); + }); + + test('detects API via route files', () => { + const dir = createRepo(['src/routes/api.ts']); + const scope = runScope(dir); + expect(scope.SCOPE_API).toBe('true'); + }); + + test('detects API via GraphQL schemas', () => { + const dir = createRepo(['schema.graphql']); + const scope = runScope(dir); + expect(scope.SCOPE_API).toBe('true'); + }); + + test('detects auth files', () => { + const dir = createRepo(['app/services/auth_service.rb']); + const scope = runScope(dir); + expect(scope.SCOPE_AUTH).toBe('true'); + }); + + test('detects session files', () => { + const dir = createRepo(['lib/session_manager.ts']); + const scope = runScope(dir); + expect(scope.SCOPE_AUTH).toBe('true'); + }); + + test('detects JWT files', () => { + const dir = createRepo(['utils/jwt_helper.py']); + const scope = runScope(dir); + expect(scope.SCOPE_AUTH).toBe('true'); + }); + + test('returns false for all new signals when no matching files', () => { + const dir = createRepo(['docs/readme.md', 'config.yml']); + const scope = runScope(dir); + expect(scope.SCOPE_MIGRATIONS).toBe('false'); + expect(scope.SCOPE_API).toBe('false'); + expect(scope.SCOPE_AUTH).toBe('false'); + }); + + test('outputs all 9 scope variables', () => { + const dir = createRepo(['app.ts']); + const scope = runScope(dir); + expect(Object.keys(scope)).toHaveLength(9); + expect(scope).toHaveProperty('SCOPE_FRONTEND'); + expect(scope).toHaveProperty('SCOPE_BACKEND'); + expect(scope).toHaveProperty('SCOPE_PROMPTS'); + expect(scope).toHaveProperty('SCOPE_TESTS'); + expect(scope).toHaveProperty('SCOPE_DOCS'); + expect(scope).toHaveProperty('SCOPE_CONFIG'); + expect(scope).toHaveProperty('SCOPE_MIGRATIONS'); + expect(scope).toHaveProperty('SCOPE_API'); + expect(scope).toHaveProperty('SCOPE_AUTH'); + }); +}); diff --git a/test/fixtures/review-army-migration.sql b/test/fixtures/review-army-migration.sql new file mode 100644 index 00000000..05cbffe1 --- /dev/null +++ b/test/fixtures/review-army-migration.sql @@ -0,0 +1,5 @@ +-- Migration: Drop user email column +-- WARNING: This migration is intentionally unsafe for testing +ALTER TABLE users DROP COLUMN email; +ALTER TABLE users DROP COLUMN phone_number; +-- No backfill, no reversibility check, no data preservation diff --git a/test/fixtures/review-army-n-plus-one.rb b/test/fixtures/review-army-n-plus-one.rb new file mode 100644 index 00000000..0981e51a --- /dev/null +++ b/test/fixtures/review-army-n-plus-one.rb @@ -0,0 +1,12 @@ +# N+1 query example — intentionally bad for testing +class PostsController + def index + @posts = Post.all + @posts.each do |post| + # N+1: queries Author table for every post + puts post.author.name + # N+1: queries Comments table for every post + puts post.comments.count + end + end +end diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index 95f4bc9c..88ba5bd2 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -607,7 +607,8 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => { const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8'); - test('all three modes share codepath tracing methodology', () => { + test('plan and ship modes share codepath tracing methodology', () => { + // Review mode delegates test coverage to the Testing specialist subagent (Review Army) const sharedPhrases = [ 'Trace data flow', 'Diagram the execution', @@ -619,33 +620,40 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => { for (const phrase of sharedPhrases) { expect(planSkill).toContain(phrase); expect(shipSkill).toContain(phrase); - expect(reviewSkill).toContain(phrase); } // Plan mode traces the plan, not a git diff expect(planSkill).toContain('Trace every codepath in the plan'); expect(planSkill).not.toContain('git diff origin'); - // Ship and review modes trace the diff + // Ship mode traces the diff expect(shipSkill).toContain('Trace every codepath changed'); - expect(reviewSkill).toContain('Trace every codepath changed'); }); - test('all three modes include E2E decision matrix', () => { - for (const skill of [planSkill, shipSkill, reviewSkill]) { + test('review mode uses Review Army for specialist dispatch', () => { + expect(reviewSkill).toContain('Review Army'); + expect(reviewSkill).toContain('Specialist Dispatch'); + expect(reviewSkill).toContain('testing.md'); + }); + + test('plan and ship modes include E2E decision matrix', () => { + // Review mode delegates to Testing specialist + for (const skill of [planSkill, shipSkill]) { expect(skill).toContain('E2E Test Decision Matrix'); expect(skill).toContain('→E2E'); expect(skill).toContain('→EVAL'); } }); - test('all three modes include regression rule', () => { - for (const skill of [planSkill, shipSkill, reviewSkill]) { + test('plan and ship modes include regression rule', () => { + // Review mode delegates to Testing specialist + for (const skill of [planSkill, shipSkill]) { expect(skill).toContain('REGRESSION RULE'); expect(skill).toContain('IRON RULE'); } }); - test('all three modes include test framework detection', () => { - for (const skill of [planSkill, shipSkill, reviewSkill]) { + test('plan and ship modes include test framework detection', () => { + // Review mode delegates to Testing specialist + for (const skill of [planSkill, shipSkill]) { expect(skill).toContain('Test Framework Detection'); expect(skill).toContain('CLAUDE.md'); } @@ -664,11 +672,12 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => { expect(shipSkill).toContain('ship-test-plan'); }); - test('review mode generates via Fix-First + gaps are INFORMATIONAL', () => { + test('review mode uses Fix-First + Review Army for specialist coverage', () => { expect(reviewSkill).toContain('Fix-First'); expect(reviewSkill).toContain('INFORMATIONAL'); - expect(reviewSkill).toContain('Step 4.75'); - expect(reviewSkill).toContain('subsumes the "Test Gaps" category'); + // Review Army handles test coverage via Testing specialist subagent + expect(reviewSkill).toContain('Review Army'); + expect(reviewSkill).toContain('Testing'); }); test('plan mode does NOT include ship-specific content', () => { @@ -683,6 +692,35 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => { expect(reviewSkill).not.toContain('ship-test-plan'); }); + test('review/specialists/ directory has all expected checklist files', () => { + const specDir = path.join(ROOT, 'review', 'specialists'); + const expected = [ + 'testing.md', + 'maintainability.md', + 'security.md', + 'performance.md', + 'data-migration.md', + 'api-contract.md', + 'red-team.md', + ]; + for (const f of expected) { + expect(fs.existsSync(path.join(specDir, f))).toBe(true); + } + }); + + test('each specialist file has standard header with scope and output format', () => { + const specDir = path.join(ROOT, 'review', 'specialists'); + const files = fs.readdirSync(specDir).filter(f => f.endsWith('.md')); + for (const f of files) { + const content = fs.readFileSync(path.join(specDir, f), 'utf-8'); + // All specialist files must have Scope and Output/JSON in header + expect(content).toContain('Scope:'); + expect(content.toLowerCase()).toMatch(/output|json/); + // Must define NO FINDINGS behavior + expect(content).toContain('NO FINDINGS'); + } + }); + // Regression guard: ship output contains key phrases from before the refactor test('ship SKILL.md regression guard — key phrases preserved', () => { const regressionPhrases = [ @@ -870,12 +908,9 @@ describe('Coverage gate in ship', () => { expect(shipSkill).toContain('could not determine percentage — skipping'); }); - test('review SKILL.md contains coverage WARNING', () => { - expect(reviewSkill).toContain('COVERAGE WARNING'); - expect(reviewSkill).toContain('Consider writing tests before running /ship'); - }); - - test('review coverage warning is INFORMATIONAL', () => { + test('review SKILL.md delegates coverage to Testing specialist', () => { + // Coverage audit moved to Testing specialist subagent in Review Army + expect(reviewSkill).toContain('testing.md'); expect(reviewSkill).toContain('INFORMATIONAL'); }); }); @@ -1604,10 +1639,9 @@ describe('Codex generation (--host codex)', () => { const content = fs.readFileSync(path.join(AGENTS_DIR, 'gstack-review', 'SKILL.md'), 'utf-8'); // Correct: references to sidecar files use gstack/review/ path expect(content).toContain('.agents/skills/gstack/review/checklist.md'); - expect(content).toContain('.agents/skills/gstack/review/design-checklist.md'); + // design-checklist.md is now referenced via Review Army specialist (Claude only, stripped for Codex) // Wrong: must NOT reference gstack-review/checklist.md (file doesn't exist there) expect(content).not.toContain('.agents/skills/gstack-review/checklist.md'); - expect(content).not.toContain('.agents/skills/gstack-review/design-checklist.md'); }); test('sidecar paths in ship skill point to gstack/review/ for pre-landing review', () => { diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index a627dd41..73b1d956 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -59,6 +59,15 @@ export const E2E_TOUCHFILES: Record = { 'review-base-branch': ['review/**'], 'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'], + // Review Army (specialist dispatch) + 'review-army-migration-safety': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'], + 'review-army-perf-n-plus-one': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'], + 'review-army-delivery-audit': ['review/**', 'scripts/resolvers/review.ts', 'scripts/resolvers/review-army.ts'], + 'review-army-quality-score': ['review/**', 'scripts/resolvers/review-army.ts'], + 'review-army-json-findings': ['review/**', 'scripts/resolvers/review-army.ts'], + 'review-army-red-team': ['review/**', 'scripts/resolvers/review-army.ts'], + 'review-army-consensus': ['review/**', 'scripts/resolvers/review-army.ts'], + // Office Hours 'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'], @@ -204,6 +213,15 @@ export const E2E_TIERS: Record = { 'review-plan-completion': 'gate', 'review-dashboard-via': 'gate', + // Review Army — gate for core functionality, periodic for multi-specialist + 'review-army-migration-safety': 'gate', // Specialist activation guardrail + 'review-army-perf-n-plus-one': 'gate', // Specialist activation guardrail + 'review-army-delivery-audit': 'gate', // Delivery integrity guardrail + 'review-army-quality-score': 'gate', // Score computation + 'review-army-json-findings': 'gate', // JSON schema compliance + 'review-army-red-team': 'periodic', // Multi-agent coordination + 'review-army-consensus': 'periodic', // Multi-specialist agreement + // Office Hours 'office-hours-spec-review': 'gate', diff --git a/test/skill-e2e-review-army.test.ts b/test/skill-e2e-review-army.test.ts new file mode 100644 index 00000000..be08a721 --- /dev/null +++ b/test/skill-e2e-review-army.test.ts @@ -0,0 +1,562 @@ +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import { runSkillTest } from './helpers/session-runner'; +import { + ROOT, runId, describeIfSelected, testConcurrentIfSelected, + logCost, recordE2E, createEvalCollector, finalizeEvalCollector, +} from './helpers/e2e-helpers'; +import { spawnSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +const evalCollector = createEvalCollector('e2e-review-army'); + +// Helper: create a git repo with a feature branch +function setupRepo(prefix: string): { dir: string; run: (cmd: string, args: string[]) => void } { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), `skill-e2e-${prefix}-`)); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 }); + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + return { dir, run }; +} + +// Helper: copy review skill files to test dir +function copyReviewFiles(dir: string) { + fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(dir, 'review-SKILL.md')); + fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(dir, 'review-checklist.md')); + fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(dir, 'review-greptile-triage.md')); + // Copy specialist checklists + const specDir = path.join(dir, 'review-specialists'); + fs.mkdirSync(specDir, { recursive: true }); + const specialistsRoot = path.join(ROOT, 'review', 'specialists'); + for (const f of fs.readdirSync(specialistsRoot)) { + fs.copyFileSync(path.join(specialistsRoot, f), path.join(specDir, f)); + } +} + +// --- Review Army: Migration Safety --- + +describeIfSelected('Review Army: Migration Safety', ['review-army-migration-safety'], () => { + let dir: string; + + beforeAll(() => { + const repo = setupRepo('army-migration'); + dir = repo.dir; + + // Base commit + fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n'); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'initial']); + + // Feature branch with unsafe migration + repo.run('git', ['checkout', '-b', 'feature/drop-columns']); + fs.mkdirSync(path.join(dir, 'db', 'migrate'), { recursive: true }); + const migrationContent = fs.readFileSync( + path.join(ROOT, 'test', 'fixtures', 'review-army-migration.sql'), 'utf-8' + ); + fs.writeFileSync(path.join(dir, 'db', 'migrate', '20260330_drop_columns.sql'), migrationContent); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'drop email and phone columns']); + + copyReviewFiles(dir); + }); + + afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} }); + + testConcurrentIfSelected('review-army-migration-safety', async () => { + const result = await runSkillTest({ + prompt: `You are in a git repo on a feature branch with a database migration that drops columns. +Read review-SKILL.md for instructions. Also read review-checklist.md. +The specialist checklists are in review-specialists/ (testing.md, security.md, performance.md, data-migration.md, etc.). + +Skip the preamble, lake intro, telemetry sections. +Run Step 4 (Critical pass) then Step 4.5 (Review Army — Specialist Dispatch). +The base branch is main. Run gstack-diff-scope style analysis on the changed files. +Since db/migrate/ files changed, the Data Migration specialist should activate. + +For the specialist dispatch, instead of launching subagents, just read review-specialists/data-migration.md +and apply it yourself against the diff (git diff main...HEAD). + +Write your findings to ${dir}/review-output.md`, + workingDirectory: dir, + maxTurns: 20, + timeout: 180_000, + testName: 'review-army-migration-safety', + runId, + }); + + logCost('/review army migration', result); + recordE2E(evalCollector, '/review army migration safety', 'Review Army', result); + expect(result.exitReason).toBe('success'); + + // Verify migration issues were caught + const outputPath = path.join(dir, 'review-output.md'); + if (fs.existsSync(outputPath)) { + const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase(); + const hasMigrationFinding = + content.includes('drop') || + content.includes('data loss') || + content.includes('reversib') || + content.includes('migration') || + content.includes('column'); + expect(hasMigrationFinding).toBe(true); + } + }, 210_000); +}); + +// --- Review Army: N+1 Performance --- + +describeIfSelected('Review Army: N+1 Performance', ['review-army-perf-n-plus-one'], () => { + let dir: string; + + beforeAll(() => { + const repo = setupRepo('army-n-plus-one'); + dir = repo.dir; + + fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n'); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'initial']); + + repo.run('git', ['checkout', '-b', 'feature/add-posts-index']); + const n1Content = fs.readFileSync( + path.join(ROOT, 'test', 'fixtures', 'review-army-n-plus-one.rb'), 'utf-8' + ); + fs.writeFileSync(path.join(dir, 'posts_controller.rb'), n1Content); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'add posts controller']); + + copyReviewFiles(dir); + }); + + afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} }); + + testConcurrentIfSelected('review-army-perf-n-plus-one', async () => { + const result = await runSkillTest({ + prompt: `You are in a git repo on a feature branch with a Ruby controller that has N+1 queries. +Read review-SKILL.md for instructions. Also read review-checklist.md. +The specialist checklists are in review-specialists/ (testing.md, performance.md, etc.). + +Skip the preamble, lake intro, telemetry sections. +Run Step 4 (Critical pass) then Step 4.5 (Review Army). +The base branch is main. This is a Ruby backend file, so Performance specialist should activate. + +For the specialist dispatch, read review-specialists/performance.md and apply it against the diff. + +Write your findings to ${dir}/review-output.md`, + workingDirectory: dir, + maxTurns: 20, + timeout: 180_000, + testName: 'review-army-perf-n-plus-one', + runId, + }); + + logCost('/review army n+1', result); + recordE2E(evalCollector, '/review army N+1 detection', 'Review Army', result); + expect(result.exitReason).toBe('success'); + + const outputPath = path.join(dir, 'review-output.md'); + if (fs.existsSync(outputPath)) { + const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase(); + const hasN1Finding = + content.includes('n+1') || + content.includes('n + 1') || + content.includes('eager') || + content.includes('includes') || + content.includes('preload') || + content.includes('query') || + content.includes('loop'); + expect(hasN1Finding).toBe(true); + } + }, 210_000); +}); + +// --- Review Army: Delivery Audit --- + +describeIfSelected('Review Army: Delivery Audit', ['review-army-delivery-audit'], () => { + let dir: string; + + beforeAll(() => { + const repo = setupRepo('army-delivery'); + dir = repo.dir; + + fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n'); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'initial']); + + repo.run('git', ['checkout', '-b', 'feature/three-features']); + + // Write a plan file promising 3 features + fs.writeFileSync(path.join(dir, 'PLAN.md'), `# Feature Plan + +## Implementation Items +1. Add user authentication with login/logout +2. Add user profile page with avatar upload +3. Add email notification system for new signups + +## Test Items +- Test login flow +- Test profile page rendering +- Test email sending +`); + repo.run('git', ['add', 'PLAN.md']); + repo.run('git', ['commit', '-m', 'add plan']); + + // Implement only 2 of 3 features + fs.writeFileSync(path.join(dir, 'auth.rb'), `class AuthController + def login + # authenticate user + session[:user_id] = user.id + end + + def logout + session.delete(:user_id) + end +end +`); + fs.writeFileSync(path.join(dir, 'profile.rb'), `class ProfileController + def show + @user = User.find(params[:id]) + end + + def update_avatar + @user.avatar.attach(params[:avatar]) + end +end +`); + // NOTE: email notification system is NOT implemented (intentionally missing) + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'implement auth and profile features']); + + copyReviewFiles(dir); + }); + + afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} }); + + testConcurrentIfSelected('review-army-delivery-audit', async () => { + const result = await runSkillTest({ + prompt: `You are in a git repo on branch feature/three-features. +There is a PLAN.md file that promises 3 features: auth, profile, and email notifications. +The diff (git diff main...HEAD) only implements 2 of them (auth and profile). + +Read review-SKILL.md for the review workflow. Focus on the Plan Completion Audit section. +The plan file is at ./PLAN.md. Cross-reference it against the diff. + +For each plan item, classify as DONE, PARTIAL, NOT DONE, or CHANGED. +The email notification system should be classified as NOT DONE. + +Write your completion audit to ${dir}/review-output.md`, + workingDirectory: dir, + maxTurns: 15, + timeout: 120_000, + testName: 'review-army-delivery-audit', + runId, + }); + + logCost('/review army delivery', result); + recordE2E(evalCollector, '/review army delivery audit', 'Review Army', result); + expect(result.exitReason).toBe('success'); + + const outputPath = path.join(dir, 'review-output.md'); + if (fs.existsSync(outputPath)) { + const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase(); + // Should identify email notifications as NOT DONE + const hasNotDone = + content.includes('not done') || + content.includes('not_done') || + content.includes('missing') || + content.includes('not implemented'); + const mentionsEmail = + content.includes('email') || + content.includes('notification'); + expect(hasNotDone).toBe(true); + expect(mentionsEmail).toBe(true); + } + }, 150_000); +}); + +// --- Review Army: Quality Score --- + +describeIfSelected('Review Army: Quality Score', ['review-army-quality-score'], () => { + let dir: string; + + beforeAll(() => { + const repo = setupRepo('army-quality'); + dir = repo.dir; + + fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n'); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'initial']); + + repo.run('git', ['checkout', '-b', 'feature/add-controller']); + // Code with obvious issues for quality score computation + fs.writeFileSync(path.join(dir, 'user_controller.rb'), `class UserController + def create + # SQL injection + User.where("name = '#{params[:name]}'") + # Magic number + if users.count > 42 + raise "too many" + end + end +end +`); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'add user controller']); + + copyReviewFiles(dir); + }); + + afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} }); + + testConcurrentIfSelected('review-army-quality-score', async () => { + const result = await runSkillTest({ + prompt: `You are in a git repo with a vulnerable user controller. +Read review-SKILL.md and review-checklist.md. +Skip preamble, lake intro, telemetry. + +Run the Critical pass (Step 4) against the diff (git diff main...HEAD). +Then compute the PR Quality Score as described in the Review Army merge step: +quality_score = max(0, 10 - (critical_count * 2 + informational_count * 0.5)) + +Write your findings AND the computed quality score to ${dir}/review-output.md +Include the line: "PR Quality Score: X/10" where X is the computed score.`, + workingDirectory: dir, + maxTurns: 15, + timeout: 120_000, + testName: 'review-army-quality-score', + runId, + }); + + logCost('/review army quality', result); + recordE2E(evalCollector, '/review army quality score', 'Review Army', result); + expect(result.exitReason).toBe('success'); + + const outputPath = path.join(dir, 'review-output.md'); + if (fs.existsSync(outputPath)) { + const content = fs.readFileSync(outputPath, 'utf-8'); + // Should contain a quality score + const hasScore = + content.toLowerCase().includes('quality score') || + content.match(/\d+\/10/); + expect(hasScore).toBeTruthy(); + } + }, 150_000); +}); + +// --- Review Army: JSON Findings --- + +describeIfSelected('Review Army: JSON Findings', ['review-army-json-findings'], () => { + let dir: string; + + beforeAll(() => { + const repo = setupRepo('army-json'); + dir = repo.dir; + + fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n'); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'initial']); + + repo.run('git', ['checkout', '-b', 'feature/vuln']); + fs.writeFileSync(path.join(dir, 'search.rb'), `class SearchController + def index + # SQL injection via string interpolation + results = ActiveRecord::Base.connection.execute( + "SELECT * FROM products WHERE name LIKE '%#{params[:q]}%'" + ) + render json: results + end +end +`); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'add search']); + + copyReviewFiles(dir); + }); + + afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} }); + + testConcurrentIfSelected('review-army-json-findings', async () => { + const result = await runSkillTest({ + prompt: `You are reviewing a git diff with a SQL injection vulnerability. +Read review-specialists/security.md for the security checklist. + +Apply the checklist against this diff (git diff main...HEAD). +Output your findings as JSON objects, one per line, following the schema: +{"severity":"CRITICAL","confidence":9,"path":"search.rb","line":4,"category":"injection","summary":"SQL injection via string interpolation","fix":"Use parameterized query","fingerprint":"search.rb:4:injection","specialist":"security"} + +Write ONLY JSON findings (no preamble) to ${dir}/findings.json`, + workingDirectory: dir, + maxTurns: 12, + timeout: 90_000, + testName: 'review-army-json-findings', + runId, + }); + + logCost('/review army json', result); + recordE2E(evalCollector, '/review army JSON findings', 'Review Army', result); + expect(result.exitReason).toBe('success'); + + const findingsPath = path.join(dir, 'findings.json'); + if (fs.existsSync(findingsPath)) { + const content = fs.readFileSync(findingsPath, 'utf-8').trim(); + const lines = content.split('\n').filter(l => l.trim()); + // At least one finding + expect(lines.length).toBeGreaterThanOrEqual(1); + // Each line should be valid JSON with required fields + for (const line of lines) { + let parsed: any; + try { parsed = JSON.parse(line); } catch { continue; } + // Required fields per schema + expect(parsed).toHaveProperty('severity'); + expect(parsed).toHaveProperty('confidence'); + expect(parsed).toHaveProperty('path'); + expect(parsed).toHaveProperty('category'); + expect(parsed).toHaveProperty('summary'); + expect(parsed).toHaveProperty('specialist'); + break; // One valid line is enough for the gate test + } + } + }, 120_000); +}); + +// --- Review Army: Red Team (periodic) --- + +describeIfSelected('Review Army: Red Team', ['review-army-red-team'], () => { + let dir: string; + + beforeAll(() => { + const repo = setupRepo('army-redteam'); + dir = repo.dir; + + fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n'); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'initial']); + + repo.run('git', ['checkout', '-b', 'feature/large-change']); + // Create a large diff (300+ lines) + const lines: string[] = ['class LargeController']; + for (let i = 0; i < 100; i++) { + lines.push(` def method_${i}`); + lines.push(` data = params[:input_${i}]`); + lines.push(` process(data)`); + lines.push(' end'); + lines.push(''); + } + lines.push('end'); + fs.writeFileSync(path.join(dir, 'large_controller.rb'), lines.join('\n')); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'add large controller']); + + copyReviewFiles(dir); + }); + + afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} }); + + testConcurrentIfSelected('review-army-red-team', async () => { + const result = await runSkillTest({ + prompt: `You are reviewing a large diff (300+ lines). Read review-SKILL.md. +Skip preamble, lake intro, telemetry. + +The diff is large enough to activate the Red Team specialist. +Read review-specialists/red-team.md and apply it against the diff (git diff main...HEAD). +Focus on finding issues that other specialists might miss. + +Write your red team findings to ${dir}/review-output.md +Start the file with "RED TEAM REVIEW" on the first line.`, + workingDirectory: dir, + maxTurns: 20, + timeout: 180_000, + testName: 'review-army-red-team', + runId, + }); + + logCost('/review army red-team', result); + recordE2E(evalCollector, '/review army red team', 'Review Army', result); + expect(result.exitReason).toBe('success'); + + const outputPath = path.join(dir, 'review-output.md'); + if (fs.existsSync(outputPath)) { + const content = fs.readFileSync(outputPath, 'utf-8'); + expect(content.toLowerCase()).toMatch(/red team|adversarial/); + } + }, 210_000); +}); + +// --- Review Army: Consensus (periodic) --- + +describeIfSelected('Review Army: Consensus', ['review-army-consensus'], () => { + let dir: string; + + beforeAll(() => { + const repo = setupRepo('army-consensus'); + dir = repo.dir; + + fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n'); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'initial']); + + repo.run('git', ['checkout', '-b', 'feature/vuln-auth']); + // SQL injection that both security AND testing specialists should flag + fs.writeFileSync(path.join(dir, 'auth_controller.rb'), `class AuthController + def login + user = User.find_by("email = '#{params[:email]}' AND password = '#{params[:password]}'") + if user + session[:user_id] = user.id + redirect_to root_path + else + flash[:error] = "Invalid credentials" + render :login + end + end +end +`); + repo.run('git', ['add', '.']); + repo.run('git', ['commit', '-m', 'add auth controller']); + + copyReviewFiles(dir); + }); + + afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} }); + + testConcurrentIfSelected('review-army-consensus', async () => { + const result = await runSkillTest({ + prompt: `You are reviewing a git diff with a SQL injection in an auth controller. +Read review-SKILL.md, review-checklist.md, and the specialist checklists in review-specialists/. + +This vulnerability should be caught by BOTH the security specialist (injection vector) +AND the testing specialist (no test for auth bypass). + +Run the review. In your output, if a finding is flagged by multiple perspectives, +mark it as "MULTI-SPECIALIST CONFIRMED" with the confirming categories. + +Write findings to ${dir}/review-output.md`, + workingDirectory: dir, + maxTurns: 20, + timeout: 180_000, + testName: 'review-army-consensus', + runId, + }); + + logCost('/review army consensus', result); + recordE2E(evalCollector, '/review army consensus', 'Review Army', result); + expect(result.exitReason).toBe('success'); + + const outputPath = path.join(dir, 'review-output.md'); + if (fs.existsSync(outputPath)) { + const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase(); + // Should catch the SQL injection + const hasSqlFinding = + content.includes('sql') || + content.includes('injection') || + content.includes('interpolat'); + expect(hasSqlFinding).toBe(true); + } + }, 210_000); +}); + +// Finalize eval collector +afterAll(async () => { + await finalizeEvalCollector(evalCollector); +});