feat: Review Army — parallel specialist reviewers for /review (v0.14.3.0) (#692)

* feat: extend gstack-diff-scope with SCOPE_MIGRATIONS, SCOPE_API, SCOPE_AUTH

Three new scope signals for Review Army specialist activation:
- SCOPE_MIGRATIONS: db/migrate/, prisma/migrations/, alembic/, *.sql
- SCOPE_API: *controller*, *route*, *endpoint*, *.graphql, openapi.*
- SCOPE_AUTH: *auth*, *session*, *jwt*, *oauth*, *permission*, *role*

* feat: add 7 specialist checklist files for Review Army

- testing.md (always-on): coverage gaps, flaky patterns, security enforcement
- maintainability.md (always-on): dead code, DRY, stale comments
- security.md (conditional): OWASP deep analysis, auth bypass, injection
- performance.md (conditional): N+1 queries, bundle impact, complexity
- data-migration.md (conditional): reversibility, lock duration, backfill
- api-contract.md (conditional): breaking changes, versioning, error format
- red-team.md (conditional): adversarial analysis, cross-cutting concerns

All use standard header with JSON output schema and NO FINDINGS fallback.

* feat: Review Army resolver — parallel specialist dispatch + merge

New resolver in review-army.ts generates template prose for:
- Stack detection and specialist selection
- Parallel Agent tool dispatch with learning-informed prompts
- JSON finding collection, fingerprint dedup, consensus highlighting
- PR quality score computation
- Red Team conditional dispatch

Registered as REVIEW_ARMY in resolvers/index.ts.

* refactor: restructure /review template for Review Army

- Replace Steps 4-4.75 with CRITICAL pass + {{REVIEW_ARMY}}
- Remove {{DESIGN_REVIEW_LITE}} and {{TEST_COVERAGE_AUDIT_REVIEW}}
  (subsumed into Design and Testing specialists respectively)
- Extract specialist-covered categories from checklist.md
- Keep CRITICAL + uncovered INFORMATIONAL in main agent pass

* test: Review Army — 14 diff-scope tests + 7 E2E tests

- test/diff-scope.test.ts: 14 tests for all 9 scope signals
- test/skill-e2e-review-army.test.ts: 7 E2E tests
  Gate: migration safety, N+1 detection, delivery audit,
        quality score, JSON findings
  Periodic: red team, consensus
- Updated gen-skill-docs tests for new review structure
- Added touchfile entries and tier classifications

* docs: update SELF_LEARNING_V0.md with Release 2 status + Release 2.5

Mark Release 2 (Review Army) as in-progress. Add Release 2.5 for
deferred expansions (E1 adaptive gating, E3 test stubs, E5 cross-review
dedup, E7 specialist tracking).

* chore: bump version and changelog (v0.14.3.0)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-30 22:07:50 -06:00
committed by GitHub
parent a0328be04c
commit a4a181ca92
24 changed files with 1666 additions and 308 deletions
+165
View File
@@ -0,0 +1,165 @@
/**
* Tests for bin/gstack-diff-scope — verifies scope signal detection.
*
* Creates temp git repos with specific file patterns and verifies
* the correct SCOPE_* variables are output.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from 'fs';
import { join } from 'path';
import { tmpdir } from 'os';
import { spawnSync } from 'child_process';
const SCRIPT = join(import.meta.dir, '..', 'bin', 'gstack-diff-scope');
const dirs: string[] = [];
function createRepo(files: string[]): string {
const dir = mkdtempSync(join(tmpdir(), 'diff-scope-test-'));
dirs.push(dir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
// Base commit
writeFileSync(join(dir, 'README.md'), '# test\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
// Feature branch with specified files
run('git', ['checkout', '-b', 'feature/test']);
for (const f of files) {
const fullPath = join(dir, f);
const dirPath = fullPath.substring(0, fullPath.lastIndexOf('/'));
if (dirPath !== dir) mkdirSync(dirPath, { recursive: true });
writeFileSync(fullPath, '# test content\n');
}
run('git', ['add', '.']);
run('git', ['commit', '-m', 'add files']);
return dir;
}
function runScope(dir: string): Record<string, string> {
const result = spawnSync('bash', [SCRIPT, 'main'], {
cwd: dir, stdio: 'pipe', timeout: 5000,
});
const output = result.stdout.toString().trim();
const vars: Record<string, string> = {};
for (const line of output.split('\n')) {
const [key, val] = line.split('=');
if (key && val) vars[key] = val;
}
return vars;
}
afterAll(() => {
for (const d of dirs) {
try { rmSync(d, { recursive: true, force: true }); } catch {}
}
});
describe('gstack-diff-scope', () => {
// --- Existing scope signals ---
test('detects frontend files', () => {
const dir = createRepo(['styles.css', 'component.tsx']);
const scope = runScope(dir);
expect(scope.SCOPE_FRONTEND).toBe('true');
});
test('detects backend files', () => {
const dir = createRepo(['app.rb', 'service.py']);
const scope = runScope(dir);
expect(scope.SCOPE_BACKEND).toBe('true');
});
test('detects test files', () => {
const dir = createRepo(['test/app.test.ts']);
const scope = runScope(dir);
expect(scope.SCOPE_TESTS).toBe('true');
});
// --- New scope signals (Review Army) ---
test('detects migrations via db/migrate/', () => {
const dir = createRepo(['db/migrate/20260330_create_users.rb']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('true');
});
test('detects migrations via generic migrations/', () => {
const dir = createRepo(['app/migrations/0001_initial.py']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('true');
});
test('detects migrations via prisma', () => {
const dir = createRepo(['prisma/migrations/20260330/migration.sql']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('true');
});
test('detects API via controller files', () => {
const dir = createRepo(['app/controllers/users_controller.rb']);
const scope = runScope(dir);
expect(scope.SCOPE_API).toBe('true');
});
test('detects API via route files', () => {
const dir = createRepo(['src/routes/api.ts']);
const scope = runScope(dir);
expect(scope.SCOPE_API).toBe('true');
});
test('detects API via GraphQL schemas', () => {
const dir = createRepo(['schema.graphql']);
const scope = runScope(dir);
expect(scope.SCOPE_API).toBe('true');
});
test('detects auth files', () => {
const dir = createRepo(['app/services/auth_service.rb']);
const scope = runScope(dir);
expect(scope.SCOPE_AUTH).toBe('true');
});
test('detects session files', () => {
const dir = createRepo(['lib/session_manager.ts']);
const scope = runScope(dir);
expect(scope.SCOPE_AUTH).toBe('true');
});
test('detects JWT files', () => {
const dir = createRepo(['utils/jwt_helper.py']);
const scope = runScope(dir);
expect(scope.SCOPE_AUTH).toBe('true');
});
test('returns false for all new signals when no matching files', () => {
const dir = createRepo(['docs/readme.md', 'config.yml']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('false');
expect(scope.SCOPE_API).toBe('false');
expect(scope.SCOPE_AUTH).toBe('false');
});
test('outputs all 9 scope variables', () => {
const dir = createRepo(['app.ts']);
const scope = runScope(dir);
expect(Object.keys(scope)).toHaveLength(9);
expect(scope).toHaveProperty('SCOPE_FRONTEND');
expect(scope).toHaveProperty('SCOPE_BACKEND');
expect(scope).toHaveProperty('SCOPE_PROMPTS');
expect(scope).toHaveProperty('SCOPE_TESTS');
expect(scope).toHaveProperty('SCOPE_DOCS');
expect(scope).toHaveProperty('SCOPE_CONFIG');
expect(scope).toHaveProperty('SCOPE_MIGRATIONS');
expect(scope).toHaveProperty('SCOPE_API');
expect(scope).toHaveProperty('SCOPE_AUTH');
});
});
+5
View File
@@ -0,0 +1,5 @@
-- Migration: Drop user email column
-- WARNING: This migration is intentionally unsafe for testing
ALTER TABLE users DROP COLUMN email;
ALTER TABLE users DROP COLUMN phone_number;
-- No backfill, no reversibility check, no data preservation
+12
View File
@@ -0,0 +1,12 @@
# N+1 query example — intentionally bad for testing
class PostsController
def index
@posts = Post.all
@posts.each do |post|
# N+1: queries Author table for every post
puts post.author.name
# N+1: queries Comments table for every post
puts post.comments.count
end
end
end
+55 -21
View File
@@ -607,7 +607,8 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
test('all three modes share codepath tracing methodology', () => {
test('plan and ship modes share codepath tracing methodology', () => {
// Review mode delegates test coverage to the Testing specialist subagent (Review Army)
const sharedPhrases = [
'Trace data flow',
'Diagram the execution',
@@ -619,33 +620,40 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
for (const phrase of sharedPhrases) {
expect(planSkill).toContain(phrase);
expect(shipSkill).toContain(phrase);
expect(reviewSkill).toContain(phrase);
}
// Plan mode traces the plan, not a git diff
expect(planSkill).toContain('Trace every codepath in the plan');
expect(planSkill).not.toContain('git diff origin');
// Ship and review modes trace the diff
// Ship mode traces the diff
expect(shipSkill).toContain('Trace every codepath changed');
expect(reviewSkill).toContain('Trace every codepath changed');
});
test('all three modes include E2E decision matrix', () => {
for (const skill of [planSkill, shipSkill, reviewSkill]) {
test('review mode uses Review Army for specialist dispatch', () => {
expect(reviewSkill).toContain('Review Army');
expect(reviewSkill).toContain('Specialist Dispatch');
expect(reviewSkill).toContain('testing.md');
});
test('plan and ship modes include E2E decision matrix', () => {
// Review mode delegates to Testing specialist
for (const skill of [planSkill, shipSkill]) {
expect(skill).toContain('E2E Test Decision Matrix');
expect(skill).toContain('→E2E');
expect(skill).toContain('→EVAL');
}
});
test('all three modes include regression rule', () => {
for (const skill of [planSkill, shipSkill, reviewSkill]) {
test('plan and ship modes include regression rule', () => {
// Review mode delegates to Testing specialist
for (const skill of [planSkill, shipSkill]) {
expect(skill).toContain('REGRESSION RULE');
expect(skill).toContain('IRON RULE');
}
});
test('all three modes include test framework detection', () => {
for (const skill of [planSkill, shipSkill, reviewSkill]) {
test('plan and ship modes include test framework detection', () => {
// Review mode delegates to Testing specialist
for (const skill of [planSkill, shipSkill]) {
expect(skill).toContain('Test Framework Detection');
expect(skill).toContain('CLAUDE.md');
}
@@ -664,11 +672,12 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
expect(shipSkill).toContain('ship-test-plan');
});
test('review mode generates via Fix-First + gaps are INFORMATIONAL', () => {
test('review mode uses Fix-First + Review Army for specialist coverage', () => {
expect(reviewSkill).toContain('Fix-First');
expect(reviewSkill).toContain('INFORMATIONAL');
expect(reviewSkill).toContain('Step 4.75');
expect(reviewSkill).toContain('subsumes the "Test Gaps" category');
// Review Army handles test coverage via Testing specialist subagent
expect(reviewSkill).toContain('Review Army');
expect(reviewSkill).toContain('Testing');
});
test('plan mode does NOT include ship-specific content', () => {
@@ -683,6 +692,35 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
expect(reviewSkill).not.toContain('ship-test-plan');
});
test('review/specialists/ directory has all expected checklist files', () => {
const specDir = path.join(ROOT, 'review', 'specialists');
const expected = [
'testing.md',
'maintainability.md',
'security.md',
'performance.md',
'data-migration.md',
'api-contract.md',
'red-team.md',
];
for (const f of expected) {
expect(fs.existsSync(path.join(specDir, f))).toBe(true);
}
});
test('each specialist file has standard header with scope and output format', () => {
const specDir = path.join(ROOT, 'review', 'specialists');
const files = fs.readdirSync(specDir).filter(f => f.endsWith('.md'));
for (const f of files) {
const content = fs.readFileSync(path.join(specDir, f), 'utf-8');
// All specialist files must have Scope and Output/JSON in header
expect(content).toContain('Scope:');
expect(content.toLowerCase()).toMatch(/output|json/);
// Must define NO FINDINGS behavior
expect(content).toContain('NO FINDINGS');
}
});
// Regression guard: ship output contains key phrases from before the refactor
test('ship SKILL.md regression guard — key phrases preserved', () => {
const regressionPhrases = [
@@ -870,12 +908,9 @@ describe('Coverage gate in ship', () => {
expect(shipSkill).toContain('could not determine percentage — skipping');
});
test('review SKILL.md contains coverage WARNING', () => {
expect(reviewSkill).toContain('COVERAGE WARNING');
expect(reviewSkill).toContain('Consider writing tests before running /ship');
});
test('review coverage warning is INFORMATIONAL', () => {
test('review SKILL.md delegates coverage to Testing specialist', () => {
// Coverage audit moved to Testing specialist subagent in Review Army
expect(reviewSkill).toContain('testing.md');
expect(reviewSkill).toContain('INFORMATIONAL');
});
});
@@ -1604,10 +1639,9 @@ describe('Codex generation (--host codex)', () => {
const content = fs.readFileSync(path.join(AGENTS_DIR, 'gstack-review', 'SKILL.md'), 'utf-8');
// Correct: references to sidecar files use gstack/review/ path
expect(content).toContain('.agents/skills/gstack/review/checklist.md');
expect(content).toContain('.agents/skills/gstack/review/design-checklist.md');
// design-checklist.md is now referenced via Review Army specialist (Claude only, stripped for Codex)
// Wrong: must NOT reference gstack-review/checklist.md (file doesn't exist there)
expect(content).not.toContain('.agents/skills/gstack-review/checklist.md');
expect(content).not.toContain('.agents/skills/gstack-review/design-checklist.md');
});
test('sidecar paths in ship skill point to gstack/review/ for pre-landing review', () => {
+18
View File
@@ -59,6 +59,15 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'review-base-branch': ['review/**'],
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
// Review Army (specialist dispatch)
'review-army-migration-safety': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'],
'review-army-perf-n-plus-one': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'],
'review-army-delivery-audit': ['review/**', 'scripts/resolvers/review.ts', 'scripts/resolvers/review-army.ts'],
'review-army-quality-score': ['review/**', 'scripts/resolvers/review-army.ts'],
'review-army-json-findings': ['review/**', 'scripts/resolvers/review-army.ts'],
'review-army-red-team': ['review/**', 'scripts/resolvers/review-army.ts'],
'review-army-consensus': ['review/**', 'scripts/resolvers/review-army.ts'],
// Office Hours
'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
@@ -204,6 +213,15 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'review-plan-completion': 'gate',
'review-dashboard-via': 'gate',
// Review Army — gate for core functionality, periodic for multi-specialist
'review-army-migration-safety': 'gate', // Specialist activation guardrail
'review-army-perf-n-plus-one': 'gate', // Specialist activation guardrail
'review-army-delivery-audit': 'gate', // Delivery integrity guardrail
'review-army-quality-score': 'gate', // Score computation
'review-army-json-findings': 'gate', // JSON schema compliance
'review-army-red-team': 'periodic', // Multi-agent coordination
'review-army-consensus': 'periodic', // Multi-specialist agreement
// Office Hours
'office-hours-spec-review': 'gate',
+562
View File
@@ -0,0 +1,562 @@
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import {
ROOT, runId, describeIfSelected, testConcurrentIfSelected,
logCost, recordE2E, createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const evalCollector = createEvalCollector('e2e-review-army');
// Helper: create a git repo with a feature branch
function setupRepo(prefix: string): { dir: string; run: (cmd: string, args: string[]) => void } {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), `skill-e2e-${prefix}-`));
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
return { dir, run };
}
// Helper: copy review skill files to test dir
function copyReviewFiles(dir: string) {
fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(dir, 'review-SKILL.md'));
fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(dir, 'review-checklist.md'));
fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(dir, 'review-greptile-triage.md'));
// Copy specialist checklists
const specDir = path.join(dir, 'review-specialists');
fs.mkdirSync(specDir, { recursive: true });
const specialistsRoot = path.join(ROOT, 'review', 'specialists');
for (const f of fs.readdirSync(specialistsRoot)) {
fs.copyFileSync(path.join(specialistsRoot, f), path.join(specDir, f));
}
}
// --- Review Army: Migration Safety ---
describeIfSelected('Review Army: Migration Safety', ['review-army-migration-safety'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-migration');
dir = repo.dir;
// Base commit
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
// Feature branch with unsafe migration
repo.run('git', ['checkout', '-b', 'feature/drop-columns']);
fs.mkdirSync(path.join(dir, 'db', 'migrate'), { recursive: true });
const migrationContent = fs.readFileSync(
path.join(ROOT, 'test', 'fixtures', 'review-army-migration.sql'), 'utf-8'
);
fs.writeFileSync(path.join(dir, 'db', 'migrate', '20260330_drop_columns.sql'), migrationContent);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'drop email and phone columns']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-migration-safety', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on a feature branch with a database migration that drops columns.
Read review-SKILL.md for instructions. Also read review-checklist.md.
The specialist checklists are in review-specialists/ (testing.md, security.md, performance.md, data-migration.md, etc.).
Skip the preamble, lake intro, telemetry sections.
Run Step 4 (Critical pass) then Step 4.5 (Review Army — Specialist Dispatch).
The base branch is main. Run gstack-diff-scope style analysis on the changed files.
Since db/migrate/ files changed, the Data Migration specialist should activate.
For the specialist dispatch, instead of launching subagents, just read review-specialists/data-migration.md
and apply it yourself against the diff (git diff main...HEAD).
Write your findings to ${dir}/review-output.md`,
workingDirectory: dir,
maxTurns: 20,
timeout: 180_000,
testName: 'review-army-migration-safety',
runId,
});
logCost('/review army migration', result);
recordE2E(evalCollector, '/review army migration safety', 'Review Army', result);
expect(result.exitReason).toBe('success');
// Verify migration issues were caught
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
const hasMigrationFinding =
content.includes('drop') ||
content.includes('data loss') ||
content.includes('reversib') ||
content.includes('migration') ||
content.includes('column');
expect(hasMigrationFinding).toBe(true);
}
}, 210_000);
});
// --- Review Army: N+1 Performance ---
describeIfSelected('Review Army: N+1 Performance', ['review-army-perf-n-plus-one'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-n-plus-one');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/add-posts-index']);
const n1Content = fs.readFileSync(
path.join(ROOT, 'test', 'fixtures', 'review-army-n-plus-one.rb'), 'utf-8'
);
fs.writeFileSync(path.join(dir, 'posts_controller.rb'), n1Content);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add posts controller']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-perf-n-plus-one', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on a feature branch with a Ruby controller that has N+1 queries.
Read review-SKILL.md for instructions. Also read review-checklist.md.
The specialist checklists are in review-specialists/ (testing.md, performance.md, etc.).
Skip the preamble, lake intro, telemetry sections.
Run Step 4 (Critical pass) then Step 4.5 (Review Army).
The base branch is main. This is a Ruby backend file, so Performance specialist should activate.
For the specialist dispatch, read review-specialists/performance.md and apply it against the diff.
Write your findings to ${dir}/review-output.md`,
workingDirectory: dir,
maxTurns: 20,
timeout: 180_000,
testName: 'review-army-perf-n-plus-one',
runId,
});
logCost('/review army n+1', result);
recordE2E(evalCollector, '/review army N+1 detection', 'Review Army', result);
expect(result.exitReason).toBe('success');
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
const hasN1Finding =
content.includes('n+1') ||
content.includes('n + 1') ||
content.includes('eager') ||
content.includes('includes') ||
content.includes('preload') ||
content.includes('query') ||
content.includes('loop');
expect(hasN1Finding).toBe(true);
}
}, 210_000);
});
// --- Review Army: Delivery Audit ---
describeIfSelected('Review Army: Delivery Audit', ['review-army-delivery-audit'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-delivery');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/three-features']);
// Write a plan file promising 3 features
fs.writeFileSync(path.join(dir, 'PLAN.md'), `# Feature Plan
## Implementation Items
1. Add user authentication with login/logout
2. Add user profile page with avatar upload
3. Add email notification system for new signups
## Test Items
- Test login flow
- Test profile page rendering
- Test email sending
`);
repo.run('git', ['add', 'PLAN.md']);
repo.run('git', ['commit', '-m', 'add plan']);
// Implement only 2 of 3 features
fs.writeFileSync(path.join(dir, 'auth.rb'), `class AuthController
def login
# authenticate user
session[:user_id] = user.id
end
def logout
session.delete(:user_id)
end
end
`);
fs.writeFileSync(path.join(dir, 'profile.rb'), `class ProfileController
def show
@user = User.find(params[:id])
end
def update_avatar
@user.avatar.attach(params[:avatar])
end
end
`);
// NOTE: email notification system is NOT implemented (intentionally missing)
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'implement auth and profile features']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-delivery-audit', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feature/three-features.
There is a PLAN.md file that promises 3 features: auth, profile, and email notifications.
The diff (git diff main...HEAD) only implements 2 of them (auth and profile).
Read review-SKILL.md for the review workflow. Focus on the Plan Completion Audit section.
The plan file is at ./PLAN.md. Cross-reference it against the diff.
For each plan item, classify as DONE, PARTIAL, NOT DONE, or CHANGED.
The email notification system should be classified as NOT DONE.
Write your completion audit to ${dir}/review-output.md`,
workingDirectory: dir,
maxTurns: 15,
timeout: 120_000,
testName: 'review-army-delivery-audit',
runId,
});
logCost('/review army delivery', result);
recordE2E(evalCollector, '/review army delivery audit', 'Review Army', result);
expect(result.exitReason).toBe('success');
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
// Should identify email notifications as NOT DONE
const hasNotDone =
content.includes('not done') ||
content.includes('not_done') ||
content.includes('missing') ||
content.includes('not implemented');
const mentionsEmail =
content.includes('email') ||
content.includes('notification');
expect(hasNotDone).toBe(true);
expect(mentionsEmail).toBe(true);
}
}, 150_000);
});
// --- Review Army: Quality Score ---
describeIfSelected('Review Army: Quality Score', ['review-army-quality-score'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-quality');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/add-controller']);
// Code with obvious issues for quality score computation
fs.writeFileSync(path.join(dir, 'user_controller.rb'), `class UserController
def create
# SQL injection
User.where("name = '#{params[:name]}'")
# Magic number
if users.count > 42
raise "too many"
end
end
end
`);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add user controller']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-quality-score', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo with a vulnerable user controller.
Read review-SKILL.md and review-checklist.md.
Skip preamble, lake intro, telemetry.
Run the Critical pass (Step 4) against the diff (git diff main...HEAD).
Then compute the PR Quality Score as described in the Review Army merge step:
quality_score = max(0, 10 - (critical_count * 2 + informational_count * 0.5))
Write your findings AND the computed quality score to ${dir}/review-output.md
Include the line: "PR Quality Score: X/10" where X is the computed score.`,
workingDirectory: dir,
maxTurns: 15,
timeout: 120_000,
testName: 'review-army-quality-score',
runId,
});
logCost('/review army quality', result);
recordE2E(evalCollector, '/review army quality score', 'Review Army', result);
expect(result.exitReason).toBe('success');
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8');
// Should contain a quality score
const hasScore =
content.toLowerCase().includes('quality score') ||
content.match(/\d+\/10/);
expect(hasScore).toBeTruthy();
}
}, 150_000);
});
// --- Review Army: JSON Findings ---
describeIfSelected('Review Army: JSON Findings', ['review-army-json-findings'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-json');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/vuln']);
fs.writeFileSync(path.join(dir, 'search.rb'), `class SearchController
def index
# SQL injection via string interpolation
results = ActiveRecord::Base.connection.execute(
"SELECT * FROM products WHERE name LIKE '%#{params[:q]}%'"
)
render json: results
end
end
`);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add search']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-json-findings', async () => {
const result = await runSkillTest({
prompt: `You are reviewing a git diff with a SQL injection vulnerability.
Read review-specialists/security.md for the security checklist.
Apply the checklist against this diff (git diff main...HEAD).
Output your findings as JSON objects, one per line, following the schema:
{"severity":"CRITICAL","confidence":9,"path":"search.rb","line":4,"category":"injection","summary":"SQL injection via string interpolation","fix":"Use parameterized query","fingerprint":"search.rb:4:injection","specialist":"security"}
Write ONLY JSON findings (no preamble) to ${dir}/findings.json`,
workingDirectory: dir,
maxTurns: 12,
timeout: 90_000,
testName: 'review-army-json-findings',
runId,
});
logCost('/review army json', result);
recordE2E(evalCollector, '/review army JSON findings', 'Review Army', result);
expect(result.exitReason).toBe('success');
const findingsPath = path.join(dir, 'findings.json');
if (fs.existsSync(findingsPath)) {
const content = fs.readFileSync(findingsPath, 'utf-8').trim();
const lines = content.split('\n').filter(l => l.trim());
// At least one finding
expect(lines.length).toBeGreaterThanOrEqual(1);
// Each line should be valid JSON with required fields
for (const line of lines) {
let parsed: any;
try { parsed = JSON.parse(line); } catch { continue; }
// Required fields per schema
expect(parsed).toHaveProperty('severity');
expect(parsed).toHaveProperty('confidence');
expect(parsed).toHaveProperty('path');
expect(parsed).toHaveProperty('category');
expect(parsed).toHaveProperty('summary');
expect(parsed).toHaveProperty('specialist');
break; // One valid line is enough for the gate test
}
}
}, 120_000);
});
// --- Review Army: Red Team (periodic) ---
describeIfSelected('Review Army: Red Team', ['review-army-red-team'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-redteam');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/large-change']);
// Create a large diff (300+ lines)
const lines: string[] = ['class LargeController'];
for (let i = 0; i < 100; i++) {
lines.push(` def method_${i}`);
lines.push(` data = params[:input_${i}]`);
lines.push(` process(data)`);
lines.push(' end');
lines.push('');
}
lines.push('end');
fs.writeFileSync(path.join(dir, 'large_controller.rb'), lines.join('\n'));
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add large controller']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-red-team', async () => {
const result = await runSkillTest({
prompt: `You are reviewing a large diff (300+ lines). Read review-SKILL.md.
Skip preamble, lake intro, telemetry.
The diff is large enough to activate the Red Team specialist.
Read review-specialists/red-team.md and apply it against the diff (git diff main...HEAD).
Focus on finding issues that other specialists might miss.
Write your red team findings to ${dir}/review-output.md
Start the file with "RED TEAM REVIEW" on the first line.`,
workingDirectory: dir,
maxTurns: 20,
timeout: 180_000,
testName: 'review-army-red-team',
runId,
});
logCost('/review army red-team', result);
recordE2E(evalCollector, '/review army red team', 'Review Army', result);
expect(result.exitReason).toBe('success');
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8');
expect(content.toLowerCase()).toMatch(/red team|adversarial/);
}
}, 210_000);
});
// --- Review Army: Consensus (periodic) ---
describeIfSelected('Review Army: Consensus', ['review-army-consensus'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-consensus');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/vuln-auth']);
// SQL injection that both security AND testing specialists should flag
fs.writeFileSync(path.join(dir, 'auth_controller.rb'), `class AuthController
def login
user = User.find_by("email = '#{params[:email]}' AND password = '#{params[:password]}'")
if user
session[:user_id] = user.id
redirect_to root_path
else
flash[:error] = "Invalid credentials"
render :login
end
end
end
`);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add auth controller']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-consensus', async () => {
const result = await runSkillTest({
prompt: `You are reviewing a git diff with a SQL injection in an auth controller.
Read review-SKILL.md, review-checklist.md, and the specialist checklists in review-specialists/.
This vulnerability should be caught by BOTH the security specialist (injection vector)
AND the testing specialist (no test for auth bypass).
Run the review. In your output, if a finding is flagged by multiple perspectives,
mark it as "MULTI-SPECIALIST CONFIRMED" with the confirming categories.
Write findings to ${dir}/review-output.md`,
workingDirectory: dir,
maxTurns: 20,
timeout: 180_000,
testName: 'review-army-consensus',
runId,
});
logCost('/review army consensus', result);
recordE2E(evalCollector, '/review army consensus', 'Review Army', result);
expect(result.exitReason).toBe('success');
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
// Should catch the SQL injection
const hasSqlFinding =
content.includes('sql') ||
content.includes('injection') ||
content.includes('interpolat');
expect(hasSqlFinding).toBe(true);
}
}, 210_000);
});
// Finalize eval collector
afterAll(async () => {
await finalizeEvalCollector(evalCollector);
});