Merge remote-tracking branch 'origin/main' into garrytan/learn-from-reviews

Resolved conflicts:
- VERSION: bumped to 0.14.6.0 (our branch on top of main's 0.14.5.0)
- CHANGELOG.md: kept our entry on top, main's 7 new entries below, updated version
- package.json: version synced to 0.14.6.0
- Regenerated all SKILL.md files from merged templates

Main brought: Review Army (parallel specialist reviewers), always-on adversarial,
CSS inspector, per-tab agents, design-to-code, comparison board, ship idempotency,
skill prefix fix, session intelligence roadmap.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-31 21:07:07 -07:00
93 changed files with 11423 additions and 821 deletions
+165
View File
@@ -0,0 +1,165 @@
/**
* Tests for bin/gstack-diff-scope — verifies scope signal detection.
*
* Creates temp git repos with specific file patterns and verifies
* the correct SCOPE_* variables are output.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from 'fs';
import { dirname, join } from 'path';
import { tmpdir } from 'os';
import { spawnSync } from 'child_process';
const SCRIPT = join(import.meta.dir, '..', 'bin', 'gstack-diff-scope');
const dirs: string[] = [];
function createRepo(files: string[]): string {
const dir = mkdtempSync(join(tmpdir(), 'diff-scope-test-'));
dirs.push(dir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
// Base commit
writeFileSync(join(dir, 'README.md'), '# test\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
// Feature branch with specified files
run('git', ['checkout', '-b', 'feature/test']);
for (const f of files) {
const fullPath = join(dir, f);
const dirPath = fullPath.substring(0, fullPath.lastIndexOf('/'));
if (dirPath !== dir) mkdirSync(dirPath, { recursive: true });
writeFileSync(fullPath, '# test content\n');
}
run('git', ['add', '.']);
run('git', ['commit', '-m', 'add files']);
return dir;
}
function runScope(dir: string): Record<string, string> {
const result = spawnSync('bash', [SCRIPT, 'main'], {
cwd: dir, stdio: 'pipe', timeout: 5000,
});
const output = result.stdout.toString().trim();
const vars: Record<string, string> = {};
for (const line of output.split('\n')) {
const [key, val] = line.split('=');
if (key && val) vars[key] = val;
}
return vars;
}
afterAll(() => {
for (const d of dirs) {
try { rmSync(d, { recursive: true, force: true }); } catch {}
}
});
describe('gstack-diff-scope', () => {
// --- Existing scope signals ---
test('detects frontend files', () => {
const dir = createRepo(['styles.css', 'component.tsx']);
const scope = runScope(dir);
expect(scope.SCOPE_FRONTEND).toBe('true');
});
test('detects backend files', () => {
const dir = createRepo(['app.rb', 'service.py']);
const scope = runScope(dir);
expect(scope.SCOPE_BACKEND).toBe('true');
});
test('detects test files', () => {
const dir = createRepo(['test/app.test.ts']);
const scope = runScope(dir);
expect(scope.SCOPE_TESTS).toBe('true');
});
// --- New scope signals (Review Army) ---
test('detects migrations via db/migrate/', () => {
const dir = createRepo(['db/migrate/20260330_create_users.rb']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('true');
});
test('detects migrations via generic migrations/', () => {
const dir = createRepo(['app/migrations/0001_initial.py']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('true');
});
test('detects migrations via prisma', () => {
const dir = createRepo(['prisma/migrations/20260330/migration.sql']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('true');
});
test('detects API via controller files', () => {
const dir = createRepo(['app/controllers/users_controller.rb']);
const scope = runScope(dir);
expect(scope.SCOPE_API).toBe('true');
});
test('detects API via route files', () => {
const dir = createRepo(['src/routes/api.ts']);
const scope = runScope(dir);
expect(scope.SCOPE_API).toBe('true');
});
test('detects API via GraphQL schemas', () => {
const dir = createRepo(['schema.graphql']);
const scope = runScope(dir);
expect(scope.SCOPE_API).toBe('true');
});
test('detects auth files', () => {
const dir = createRepo(['app/services/auth_service.rb']);
const scope = runScope(dir);
expect(scope.SCOPE_AUTH).toBe('true');
});
test('detects session files', () => {
const dir = createRepo(['lib/session_manager.ts']);
const scope = runScope(dir);
expect(scope.SCOPE_AUTH).toBe('true');
});
test('detects JWT files', () => {
const dir = createRepo(['utils/jwt_helper.py']);
const scope = runScope(dir);
expect(scope.SCOPE_AUTH).toBe('true');
});
test('returns false for all new signals when no matching files', () => {
const dir = createRepo(['docs/readme.md', 'config.yml']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('false');
expect(scope.SCOPE_API).toBe('false');
expect(scope.SCOPE_AUTH).toBe('false');
});
test('outputs all 9 scope variables', () => {
const dir = createRepo(['app.ts']);
const scope = runScope(dir);
expect(Object.keys(scope)).toHaveLength(9);
expect(scope).toHaveProperty('SCOPE_FRONTEND');
expect(scope).toHaveProperty('SCOPE_BACKEND');
expect(scope).toHaveProperty('SCOPE_PROMPTS');
expect(scope).toHaveProperty('SCOPE_TESTS');
expect(scope).toHaveProperty('SCOPE_DOCS');
expect(scope).toHaveProperty('SCOPE_CONFIG');
expect(scope).toHaveProperty('SCOPE_MIGRATIONS');
expect(scope).toHaveProperty('SCOPE_API');
expect(scope).toHaveProperty('SCOPE_AUTH');
});
});
+5
View File
@@ -0,0 +1,5 @@
-- Migration: Drop user email column
-- WARNING: This migration is intentionally unsafe for testing
ALTER TABLE users DROP COLUMN email;
ALTER TABLE users DROP COLUMN phone_number;
-- No backfill, no reversibility check, no data preservation
+12
View File
@@ -0,0 +1,12 @@
# N+1 query example — intentionally bad for testing
class PostsController
def index
@posts = Post.all
@posts.each do |post|
# N+1: queries Author table for every post
puts post.author.name
# N+1: queries Comments table for every post
puts post.comments.count
end
end
end
+104 -22
View File
@@ -595,10 +595,12 @@ describe('REVIEW_DASHBOARD resolver', () => {
expect(content).toContain('/plan-ceo-review');
});
test('plan-design-review chaining mentions eng and ceo reviews', () => {
test('plan-design-review chaining mentions eng, ceo, and design skills', () => {
const content = fs.readFileSync(path.join(ROOT, 'plan-design-review', 'SKILL.md'), 'utf-8');
expect(content).toContain('/plan-eng-review');
expect(content).toContain('/plan-ceo-review');
expect(content).toContain('/design-shotgun');
expect(content).toContain('/design-html');
});
test('ship does NOT contain review chaining', () => {
@@ -614,7 +616,8 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
test('all three modes share codepath tracing methodology', () => {
test('plan and ship modes share codepath tracing methodology', () => {
// Review mode delegates test coverage to the Testing specialist subagent (Review Army)
const sharedPhrases = [
'Trace data flow',
'Diagram the execution',
@@ -626,33 +629,40 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
for (const phrase of sharedPhrases) {
expect(planSkill).toContain(phrase);
expect(shipSkill).toContain(phrase);
expect(reviewSkill).toContain(phrase);
}
// Plan mode traces the plan, not a git diff
expect(planSkill).toContain('Trace every codepath in the plan');
expect(planSkill).not.toContain('git diff origin');
// Ship and review modes trace the diff
// Ship mode traces the diff
expect(shipSkill).toContain('Trace every codepath changed');
expect(reviewSkill).toContain('Trace every codepath changed');
});
test('all three modes include E2E decision matrix', () => {
for (const skill of [planSkill, shipSkill, reviewSkill]) {
test('review mode uses Review Army for specialist dispatch', () => {
expect(reviewSkill).toContain('Review Army');
expect(reviewSkill).toContain('Specialist Dispatch');
expect(reviewSkill).toContain('testing.md');
});
test('plan and ship modes include E2E decision matrix', () => {
// Review mode delegates to Testing specialist
for (const skill of [planSkill, shipSkill]) {
expect(skill).toContain('E2E Test Decision Matrix');
expect(skill).toContain('→E2E');
expect(skill).toContain('→EVAL');
}
});
test('all three modes include regression rule', () => {
for (const skill of [planSkill, shipSkill, reviewSkill]) {
test('plan and ship modes include regression rule', () => {
// Review mode delegates to Testing specialist
for (const skill of [planSkill, shipSkill]) {
expect(skill).toContain('REGRESSION RULE');
expect(skill).toContain('IRON RULE');
}
});
test('all three modes include test framework detection', () => {
for (const skill of [planSkill, shipSkill, reviewSkill]) {
test('plan and ship modes include test framework detection', () => {
// Review mode delegates to Testing specialist
for (const skill of [planSkill, shipSkill]) {
expect(skill).toContain('Test Framework Detection');
expect(skill).toContain('CLAUDE.md');
}
@@ -671,11 +681,12 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
expect(shipSkill).toContain('ship-test-plan');
});
test('review mode generates via Fix-First + gaps are INFORMATIONAL', () => {
test('review mode uses Fix-First + Review Army for specialist coverage', () => {
expect(reviewSkill).toContain('Fix-First');
expect(reviewSkill).toContain('INFORMATIONAL');
expect(reviewSkill).toContain('Step 4.75');
expect(reviewSkill).toContain('subsumes the "Test Gaps" category');
// Review Army handles test coverage via Testing specialist subagent
expect(reviewSkill).toContain('Review Army');
expect(reviewSkill).toContain('Testing');
});
test('plan mode does NOT include ship-specific content', () => {
@@ -690,6 +701,35 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
expect(reviewSkill).not.toContain('ship-test-plan');
});
test('review/specialists/ directory has all expected checklist files', () => {
const specDir = path.join(ROOT, 'review', 'specialists');
const expected = [
'testing.md',
'maintainability.md',
'security.md',
'performance.md',
'data-migration.md',
'api-contract.md',
'red-team.md',
];
for (const f of expected) {
expect(fs.existsSync(path.join(specDir, f))).toBe(true);
}
});
test('each specialist file has standard header with scope and output format', () => {
const specDir = path.join(ROOT, 'review', 'specialists');
const files = fs.readdirSync(specDir).filter(f => f.endsWith('.md'));
for (const f of files) {
const content = fs.readFileSync(path.join(specDir, f), 'utf-8');
// All specialist files must have Scope and Output/JSON in header
expect(content).toContain('Scope:');
expect(content.toLowerCase()).toMatch(/output|json/);
// Must define NO FINDINGS behavior
expect(content).toContain('NO FINDINGS');
}
});
// Regression guard: ship output contains key phrases from before the refactor
test('ship SKILL.md regression guard — key phrases preserved', () => {
const regressionPhrases = [
@@ -877,12 +917,9 @@ describe('Coverage gate in ship', () => {
expect(shipSkill).toContain('could not determine percentage — skipping');
});
test('review SKILL.md contains coverage WARNING', () => {
expect(reviewSkill).toContain('COVERAGE WARNING');
expect(reviewSkill).toContain('Consider writing tests before running /ship');
});
test('review coverage warning is INFORMATIONAL', () => {
test('review SKILL.md delegates coverage to Testing specialist', () => {
// Coverage audit moved to Testing specialist subagent in Review Army
expect(reviewSkill).toContain('testing.md');
expect(reviewSkill).toContain('INFORMATIONAL');
});
});
@@ -1611,10 +1648,9 @@ describe('Codex generation (--host codex)', () => {
const content = fs.readFileSync(path.join(AGENTS_DIR, 'gstack-review', 'SKILL.md'), 'utf-8');
// Correct: references to sidecar files use gstack/review/ path
expect(content).toContain('.agents/skills/gstack/review/checklist.md');
expect(content).toContain('.agents/skills/gstack/review/design-checklist.md');
// design-checklist.md is now referenced via Review Army specialist (Claude only, stripped for Codex)
// Wrong: must NOT reference gstack-review/checklist.md (file doesn't exist there)
expect(content).not.toContain('.agents/skills/gstack-review/checklist.md');
expect(content).not.toContain('.agents/skills/gstack-review/design-checklist.md');
});
test('sidecar paths in ship skill point to gstack/review/ for pre-landing review', () => {
@@ -2469,3 +2505,49 @@ describe('CONFIDENCE_CALIBRATION resolver', () => {
}
});
});
describe('gen-skill-docs prefix warning (#620/#578)', () => {
const { execSync } = require('child_process');
test('warns about skill_prefix when config has prefix=true', () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-prefix-warn-'));
try {
// Create a fake ~/.gstack/config.yaml with skill_prefix: true
const fakeHome = tmpDir;
const fakeGstack = path.join(fakeHome, '.gstack');
fs.mkdirSync(fakeGstack, { recursive: true });
fs.writeFileSync(path.join(fakeGstack, 'config.yaml'), 'skill_prefix: true\n');
const output = execSync('bun run scripts/gen-skill-docs.ts', {
cwd: ROOT,
env: { ...process.env, HOME: fakeHome },
encoding: 'utf-8',
timeout: 30000,
});
expect(output).toContain('skill_prefix is true');
expect(output).toContain('gstack-relink');
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
});
test('no warning when skill_prefix is false or absent', () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-prefix-warn-'));
try {
const fakeHome = tmpDir;
const fakeGstack = path.join(fakeHome, '.gstack');
fs.mkdirSync(fakeGstack, { recursive: true });
fs.writeFileSync(path.join(fakeGstack, 'config.yaml'), 'skill_prefix: false\n');
const output = execSync('bun run scripts/gen-skill-docs.ts', {
cwd: ROOT,
env: { ...process.env, HOME: fakeHome },
encoding: 'utf-8',
timeout: 30000,
});
expect(output).not.toContain('skill_prefix is true');
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
});
});
+22
View File
@@ -59,6 +59,15 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'review-base-branch': ['review/**'],
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
// Review Army (specialist dispatch)
'review-army-migration-safety': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'],
'review-army-perf-n-plus-one': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'],
'review-army-delivery-audit': ['review/**', 'scripts/resolvers/review.ts', 'scripts/resolvers/review-army.ts'],
'review-army-quality-score': ['review/**', 'scripts/resolvers/review-army.ts'],
'review-army-json-findings': ['review/**', 'scripts/resolvers/review-army.ts'],
'review-army-red-team': ['review/**', 'scripts/resolvers/review-army.ts'],
'review-army-consensus': ['review/**', 'scripts/resolvers/review-army.ts'],
// Office Hours
'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
@@ -122,6 +131,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Plan completion audit + verification
'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
'ship-plan-verification': ['ship/**', 'qa-only/**', 'scripts/gen-skill-docs.ts'],
'ship-idempotency': ['ship/**', 'scripts/resolvers/utility.ts'],
'review-plan-completion': ['review/**', 'scripts/gen-skill-docs.ts'],
// Design
@@ -152,6 +162,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Sidebar agent
'sidebar-navigate': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/**'],
'sidebar-url-accuracy': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/background.js'],
'sidebar-css-interaction': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/write-commands.ts', 'browse/src/read-commands.ts', 'browse/src/cdp-inspector.ts', 'extension/**'],
// Autoplan
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
@@ -203,6 +214,15 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'review-plan-completion': 'gate',
'review-dashboard-via': 'gate',
// Review Army — gate for core functionality, periodic for multi-specialist
'review-army-migration-safety': 'gate', // Specialist activation guardrail
'review-army-perf-n-plus-one': 'gate', // Specialist activation guardrail
'review-army-delivery-audit': 'gate', // Delivery integrity guardrail
'review-army-quality-score': 'gate', // Score computation
'review-army-json-findings': 'gate', // JSON schema compliance
'review-army-red-team': 'periodic', // Multi-agent coordination
'review-army-consensus': 'periodic', // Multi-specialist agreement
// Office Hours
'office-hours-spec-review': 'gate',
@@ -228,6 +248,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'ship-triage': 'gate',
'ship-plan-completion': 'gate',
'ship-plan-verification': 'gate',
'ship-idempotency': 'periodic',
// Retro — gate for cheap branch detection, periodic for full Opus retro
'retro': 'periodic',
@@ -282,6 +303,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// Sidebar agent
'sidebar-navigate': 'periodic',
'sidebar-url-accuracy': 'periodic',
'sidebar-css-interaction': 'periodic',
// Autoplan — periodic (not yet implemented)
'autoplan-core': 'periodic',
+79 -2
View File
@@ -42,11 +42,18 @@ function setupMockInstall(skills: string[]): void {
fs.copyFileSync(path.join(BIN, 'gstack-relink'), path.join(mockBin, 'gstack-relink'));
fs.chmodSync(path.join(mockBin, 'gstack-relink'), 0o755);
}
if (fs.existsSync(path.join(BIN, 'gstack-patch-names'))) {
fs.copyFileSync(path.join(BIN, 'gstack-patch-names'), path.join(mockBin, 'gstack-patch-names'));
fs.chmodSync(path.join(mockBin, 'gstack-patch-names'), 0o755);
}
// Create mock skill directories
// Create mock skill directories with proper frontmatter
for (const skill of skills) {
fs.mkdirSync(path.join(installDir, skill), { recursive: true });
fs.writeFileSync(path.join(installDir, skill, 'SKILL.md'), `# ${skill}`);
fs.writeFileSync(
path.join(installDir, skill, 'SKILL.md'),
`---\nname: ${skill}\ndescription: test\n---\n# ${skill}`
);
}
}
@@ -150,3 +157,73 @@ describe('gstack-relink (#578)', () => {
expect(fs.existsSync(path.join(skillsDir, 'gstack-ship'))).toBe(true);
});
});
describe('gstack-patch-names (#620/#578)', () => {
// Helper to read name: from SKILL.md frontmatter
function readSkillName(skillDir: string): string | null {
const content = fs.readFileSync(path.join(skillDir, 'SKILL.md'), 'utf-8');
const match = content.match(/^name:\s*(.+)$/m);
return match ? match[1].trim() : null;
}
test('prefix=true patches name: field in SKILL.md', () => {
setupMockInstall(['qa', 'ship', 'review']);
run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
GSTACK_INSTALL_DIR: installDir,
GSTACK_SKILLS_DIR: skillsDir,
});
// Verify name: field is patched with gstack- prefix
expect(readSkillName(path.join(installDir, 'qa'))).toBe('gstack-qa');
expect(readSkillName(path.join(installDir, 'ship'))).toBe('gstack-ship');
expect(readSkillName(path.join(installDir, 'review'))).toBe('gstack-review');
});
test('prefix=false restores name: field in SKILL.md', () => {
setupMockInstall(['qa', 'ship']);
// First, prefix them
run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
GSTACK_INSTALL_DIR: installDir,
GSTACK_SKILLS_DIR: skillsDir,
});
expect(readSkillName(path.join(installDir, 'qa'))).toBe('gstack-qa');
// Now switch to flat mode
run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix false`);
run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
GSTACK_INSTALL_DIR: installDir,
GSTACK_SKILLS_DIR: skillsDir,
});
// Verify name: field is restored to unprefixed
expect(readSkillName(path.join(installDir, 'qa'))).toBe('qa');
expect(readSkillName(path.join(installDir, 'ship'))).toBe('ship');
});
test('gstack-upgrade name: not double-prefixed', () => {
setupMockInstall(['qa', 'gstack-upgrade']);
run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
GSTACK_INSTALL_DIR: installDir,
GSTACK_SKILLS_DIR: skillsDir,
});
// gstack-upgrade should keep its name, NOT become gstack-gstack-upgrade
expect(readSkillName(path.join(installDir, 'gstack-upgrade'))).toBe('gstack-upgrade');
// Regular skill should be prefixed
expect(readSkillName(path.join(installDir, 'qa'))).toBe('gstack-qa');
});
test('SKILL.md without frontmatter is a no-op', () => {
setupMockInstall(['qa']);
// Overwrite qa SKILL.md with no frontmatter
fs.writeFileSync(path.join(installDir, 'qa', 'SKILL.md'), '# qa\nSome content.');
run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
// Should not crash
run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
GSTACK_INSTALL_DIR: installDir,
GSTACK_SKILLS_DIR: skillsDir,
});
// Content should be unchanged (no name: to patch)
const content = fs.readFileSync(path.join(installDir, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toBe('# qa\nSome content.');
});
});
+562
View File
@@ -0,0 +1,562 @@
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import {
ROOT, runId, describeIfSelected, testConcurrentIfSelected,
logCost, recordE2E, createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const evalCollector = createEvalCollector('e2e-review-army');
// Helper: create a git repo with a feature branch
function setupRepo(prefix: string): { dir: string; run: (cmd: string, args: string[]) => void } {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), `skill-e2e-${prefix}-`));
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
return { dir, run };
}
// Helper: copy review skill files to test dir
function copyReviewFiles(dir: string) {
fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(dir, 'review-SKILL.md'));
fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(dir, 'review-checklist.md'));
fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(dir, 'review-greptile-triage.md'));
// Copy specialist checklists
const specDir = path.join(dir, 'review-specialists');
fs.mkdirSync(specDir, { recursive: true });
const specialistsRoot = path.join(ROOT, 'review', 'specialists');
for (const f of fs.readdirSync(specialistsRoot)) {
fs.copyFileSync(path.join(specialistsRoot, f), path.join(specDir, f));
}
}
// --- Review Army: Migration Safety ---
describeIfSelected('Review Army: Migration Safety', ['review-army-migration-safety'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-migration');
dir = repo.dir;
// Base commit
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
// Feature branch with unsafe migration
repo.run('git', ['checkout', '-b', 'feature/drop-columns']);
fs.mkdirSync(path.join(dir, 'db', 'migrate'), { recursive: true });
const migrationContent = fs.readFileSync(
path.join(ROOT, 'test', 'fixtures', 'review-army-migration.sql'), 'utf-8'
);
fs.writeFileSync(path.join(dir, 'db', 'migrate', '20260330_drop_columns.sql'), migrationContent);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'drop email and phone columns']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-migration-safety', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on a feature branch with a database migration that drops columns.
Read review-SKILL.md for instructions. Also read review-checklist.md.
The specialist checklists are in review-specialists/ (testing.md, security.md, performance.md, data-migration.md, etc.).
Skip the preamble, lake intro, telemetry sections.
Run Step 4 (Critical pass) then Step 4.5 (Review Army — Specialist Dispatch).
The base branch is main. Run gstack-diff-scope style analysis on the changed files.
Since db/migrate/ files changed, the Data Migration specialist should activate.
For the specialist dispatch, instead of launching subagents, just read review-specialists/data-migration.md
and apply it yourself against the diff (git diff main...HEAD).
Write your findings to ${dir}/review-output.md`,
workingDirectory: dir,
maxTurns: 20,
timeout: 180_000,
testName: 'review-army-migration-safety',
runId,
});
logCost('/review army migration', result);
recordE2E(evalCollector, '/review army migration safety', 'Review Army', result);
expect(result.exitReason).toBe('success');
// Verify migration issues were caught
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
const hasMigrationFinding =
content.includes('drop') ||
content.includes('data loss') ||
content.includes('reversib') ||
content.includes('migration') ||
content.includes('column');
expect(hasMigrationFinding).toBe(true);
}
}, 210_000);
});
// --- Review Army: N+1 Performance ---
describeIfSelected('Review Army: N+1 Performance', ['review-army-perf-n-plus-one'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-n-plus-one');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/add-posts-index']);
const n1Content = fs.readFileSync(
path.join(ROOT, 'test', 'fixtures', 'review-army-n-plus-one.rb'), 'utf-8'
);
fs.writeFileSync(path.join(dir, 'posts_controller.rb'), n1Content);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add posts controller']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-perf-n-plus-one', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on a feature branch with a Ruby controller that has N+1 queries.
Read review-SKILL.md for instructions. Also read review-checklist.md.
The specialist checklists are in review-specialists/ (testing.md, performance.md, etc.).
Skip the preamble, lake intro, telemetry sections.
Run Step 4 (Critical pass) then Step 4.5 (Review Army).
The base branch is main. This is a Ruby backend file, so Performance specialist should activate.
For the specialist dispatch, read review-specialists/performance.md and apply it against the diff.
Write your findings to ${dir}/review-output.md`,
workingDirectory: dir,
maxTurns: 20,
timeout: 180_000,
testName: 'review-army-perf-n-plus-one',
runId,
});
logCost('/review army n+1', result);
recordE2E(evalCollector, '/review army N+1 detection', 'Review Army', result);
expect(result.exitReason).toBe('success');
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
const hasN1Finding =
content.includes('n+1') ||
content.includes('n + 1') ||
content.includes('eager') ||
content.includes('includes') ||
content.includes('preload') ||
content.includes('query') ||
content.includes('loop');
expect(hasN1Finding).toBe(true);
}
}, 210_000);
});
// --- Review Army: Delivery Audit ---
describeIfSelected('Review Army: Delivery Audit', ['review-army-delivery-audit'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-delivery');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/three-features']);
// Write a plan file promising 3 features
fs.writeFileSync(path.join(dir, 'PLAN.md'), `# Feature Plan
## Implementation Items
1. Add user authentication with login/logout
2. Add user profile page with avatar upload
3. Add email notification system for new signups
## Test Items
- Test login flow
- Test profile page rendering
- Test email sending
`);
repo.run('git', ['add', 'PLAN.md']);
repo.run('git', ['commit', '-m', 'add plan']);
// Implement only 2 of 3 features
fs.writeFileSync(path.join(dir, 'auth.rb'), `class AuthController
def login
# authenticate user
session[:user_id] = user.id
end
def logout
session.delete(:user_id)
end
end
`);
fs.writeFileSync(path.join(dir, 'profile.rb'), `class ProfileController
def show
@user = User.find(params[:id])
end
def update_avatar
@user.avatar.attach(params[:avatar])
end
end
`);
// NOTE: email notification system is NOT implemented (intentionally missing)
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'implement auth and profile features']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-delivery-audit', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feature/three-features.
There is a PLAN.md file that promises 3 features: auth, profile, and email notifications.
The diff (git diff main...HEAD) only implements 2 of them (auth and profile).
Read review-SKILL.md for the review workflow. Focus on the Plan Completion Audit section.
The plan file is at ./PLAN.md. Cross-reference it against the diff.
For each plan item, classify as DONE, PARTIAL, NOT DONE, or CHANGED.
The email notification system should be classified as NOT DONE.
Write your completion audit to ${dir}/review-output.md`,
workingDirectory: dir,
maxTurns: 15,
timeout: 120_000,
testName: 'review-army-delivery-audit',
runId,
});
logCost('/review army delivery', result);
recordE2E(evalCollector, '/review army delivery audit', 'Review Army', result);
expect(result.exitReason).toBe('success');
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
// Should identify email notifications as NOT DONE
const hasNotDone =
content.includes('not done') ||
content.includes('not_done') ||
content.includes('missing') ||
content.includes('not implemented');
const mentionsEmail =
content.includes('email') ||
content.includes('notification');
expect(hasNotDone).toBe(true);
expect(mentionsEmail).toBe(true);
}
}, 150_000);
});
// --- Review Army: Quality Score ---
// Verifies the Review Army merge step: the agent runs the Critical pass over a
// diff with planted defects (SQL injection + magic number) and must compute the
// documented PR Quality Score formula over its own findings.
describeIfSelected('Review Army: Quality Score', ['review-army-quality-score'], () => {
// Temp git repo for this suite; created in beforeAll, removed in afterAll.
let dir: string;
beforeAll(() => {
// Base commit on main so the feature branch has a merge base for `git diff main...HEAD`.
const repo = setupRepo('army-quality');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/add-controller']);
// Code with obvious issues for quality score computation
// (one CRITICAL-grade injection, one informational magic number).
fs.writeFileSync(path.join(dir, 'user_controller.rb'), `class UserController
def create
# SQL injection
User.where("name = '#{params[:name]}'")
# Magic number
if users.count > 42
raise "too many"
end
end
end
`);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add user controller']);
// Copies review-SKILL.md / checklists into the repo so the agent can read them.
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-quality-score', async () => {
// Drive a real agent run; the prompt pins the exact scoring formula so the
// output can be checked for a "quality score" / "X/10" line.
const result = await runSkillTest({
prompt: `You are in a git repo with a vulnerable user controller.
Read review-SKILL.md and review-checklist.md.
Skip preamble, lake intro, telemetry.
Run the Critical pass (Step 4) against the diff (git diff main...HEAD).
Then compute the PR Quality Score as described in the Review Army merge step:
quality_score = max(0, 10 - (critical_count * 2 + informational_count * 0.5))
Write your findings AND the computed quality score to ${dir}/review-output.md
Include the line: "PR Quality Score: X/10" where X is the computed score.`,
workingDirectory: dir,
maxTurns: 15,
timeout: 120_000,
testName: 'review-army-quality-score',
runId,
});
logCost('/review army quality', result);
recordE2E(evalCollector, '/review army quality score', 'Review Army', result);
// Hard gate: the run itself must succeed.
expect(result.exitReason).toBe('success');
// Content checks are soft-gated on the file existing — the agent may run out
// of turns before writing it, which exitReason already covers.
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8');
// Should contain a quality score (either the phrase or an "N/10" value).
const hasScore =
content.toLowerCase().includes('quality score') ||
content.match(/\d+\/10/);
expect(hasScore).toBeTruthy();
}
}, 150_000);
});
// --- Review Army: JSON Findings ---
// Verifies the security specialist can emit machine-readable findings: one
// JSON object per line, matching the documented schema, for a planted SQL
// injection in a Rails-style controller.
describeIfSelected('Review Army: JSON Findings', ['review-army-json-findings'], () => {
// Temp git repo for this suite; created in beforeAll, removed in afterAll.
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-json');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/vuln']);
// Feature-branch diff: SQL injection via string interpolation into raw SQL.
fs.writeFileSync(path.join(dir, 'search.rb'), `class SearchController
def index
# SQL injection via string interpolation
results = ActiveRecord::Base.connection.execute(
"SELECT * FROM products WHERE name LIKE '%#{params[:q]}%'"
)
render json: results
end
end
`);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add search']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-json-findings', async () => {
const result = await runSkillTest({
prompt: `You are reviewing a git diff with a SQL injection vulnerability.
Read review-specialists/security.md for the security checklist.
Apply the checklist against this diff (git diff main...HEAD).
Output your findings as JSON objects, one per line, following the schema:
{"severity":"CRITICAL","confidence":9,"path":"search.rb","line":4,"category":"injection","summary":"SQL injection via string interpolation","fix":"Use parameterized query","fingerprint":"search.rb:4:injection","specialist":"security"}
Write ONLY JSON findings (no preamble) to ${dir}/findings.json`,
workingDirectory: dir,
maxTurns: 12,
timeout: 90_000,
testName: 'review-army-json-findings',
runId,
});
logCost('/review army json', result);
recordE2E(evalCollector, '/review army JSON findings', 'Review Army', result);
expect(result.exitReason).toBe('success');
// Schema checks are soft-gated on the findings file existing.
const findingsPath = path.join(dir, 'findings.json');
if (fs.existsSync(findingsPath)) {
const content = fs.readFileSync(findingsPath, 'utf-8').trim();
const lines = content.split('\n').filter(l => l.trim());
// At least one finding
expect(lines.length).toBeGreaterThanOrEqual(1);
// Each line should be valid JSON with required fields.
// Lines that fail JSON.parse are skipped; the loop stops after the first
// parseable finding, so this is a gate check, not full validation.
for (const line of lines) {
let parsed: any;
try { parsed = JSON.parse(line); } catch { continue; }
// Required fields per schema
expect(parsed).toHaveProperty('severity');
expect(parsed).toHaveProperty('confidence');
expect(parsed).toHaveProperty('path');
expect(parsed).toHaveProperty('category');
expect(parsed).toHaveProperty('summary');
expect(parsed).toHaveProperty('specialist');
break; // One valid line is enough for the gate test
}
}
}, 120_000);
});
// --- Review Army: Red Team (periodic) ---
// Verifies the Red Team (adversarial) specialist activates on large diffs.
// The fixture generates a ~500-line controller so the diff exceeds the 300+
// line activation threshold stated in the prompt.
describeIfSelected('Review Army: Red Team', ['review-army-red-team'], () => {
// Temp git repo for this suite; created in beforeAll, removed in afterAll.
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-redteam');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/large-change']);
// Create a large diff (300+ lines): 100 generated five-line methods, each
// piping an unvalidated param into process().
const lines: string[] = ['class LargeController'];
for (let i = 0; i < 100; i++) {
lines.push(` def method_${i}`);
lines.push(` data = params[:input_${i}]`);
lines.push(` process(data)`);
lines.push(' end');
lines.push('');
}
lines.push('end');
fs.writeFileSync(path.join(dir, 'large_controller.rb'), lines.join('\n'));
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add large controller']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-red-team', async () => {
const result = await runSkillTest({
prompt: `You are reviewing a large diff (300+ lines). Read review-SKILL.md.
Skip preamble, lake intro, telemetry.
The diff is large enough to activate the Red Team specialist.
Read review-specialists/red-team.md and apply it against the diff (git diff main...HEAD).
Focus on finding issues that other specialists might miss.
Write your red team findings to ${dir}/review-output.md
Start the file with "RED TEAM REVIEW" on the first line.`,
workingDirectory: dir,
// Larger turn/time budget than the other army tests: the diff is big.
maxTurns: 20,
timeout: 180_000,
testName: 'review-army-red-team',
runId,
});
logCost('/review army red-team', result);
recordE2E(evalCollector, '/review army red team', 'Review Army', result);
expect(result.exitReason).toBe('success');
// Soft-gated on the output file: it must read as an adversarial review.
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8');
expect(content.toLowerCase()).toMatch(/red team|adversarial/);
}
}, 210_000);
});
// --- Review Army: Consensus (periodic) ---
// Verifies multi-specialist consensus: a SQL injection in an auth controller
// that both the security specialist (injection) and the testing specialist
// (untested auth bypass) should flag, ideally marked MULTI-SPECIALIST CONFIRMED.
describeIfSelected('Review Army: Consensus', ['review-army-consensus'], () => {
// Temp git repo for this suite; created in beforeAll, removed in afterAll.
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-consensus');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/vuln-auth']);
// SQL injection that both security AND testing specialists should flag:
// raw interpolation of email/password into find_by, with no covering test.
fs.writeFileSync(path.join(dir, 'auth_controller.rb'), `class AuthController
def login
user = User.find_by("email = '#{params[:email]}' AND password = '#{params[:password]}'")
if user
session[:user_id] = user.id
redirect_to root_path
else
flash[:error] = "Invalid credentials"
render :login
end
end
end
`);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add auth controller']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-consensus', async () => {
const result = await runSkillTest({
prompt: `You are reviewing a git diff with a SQL injection in an auth controller.
Read review-SKILL.md, review-checklist.md, and the specialist checklists in review-specialists/.
This vulnerability should be caught by BOTH the security specialist (injection vector)
AND the testing specialist (no test for auth bypass).
Run the review. In your output, if a finding is flagged by multiple perspectives,
mark it as "MULTI-SPECIALIST CONFIRMED" with the confirming categories.
Write findings to ${dir}/review-output.md`,
workingDirectory: dir,
maxTurns: 20,
timeout: 180_000,
testName: 'review-army-consensus',
runId,
});
logCost('/review army consensus', result);
recordE2E(evalCollector, '/review army consensus', 'Review Army', result);
expect(result.exitReason).toBe('success');
// Soft-gated content check: only requires the injection itself to be found;
// the MULTI-SPECIALIST CONFIRMED marker is requested but not asserted.
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
// Should catch the SQL injection
const hasSqlFinding =
content.includes('sql') ||
content.includes('injection') ||
content.includes('interpolat');
expect(hasSqlFinding).toBe(true);
}
}, 210_000);
});
// Finalize eval collector — module-level afterAll, runs after every suite
// above; awaited so collected results are flushed before the process exits.
afterAll(async () => {
await finalizeEvalCollector(evalCollector);
});
+190
View File
@@ -149,6 +149,196 @@ describeIfSelected('Sidebar URL accuracy E2E', ['sidebar-url-accuracy'], () => {
}, 30_000);
});
// --- Sidebar CSS Interaction E2E (real Claude + real browser) ---
// Goes to HN, reads comments, identifies the most insightful one, highlights it.
// Exercises: navigation, snapshot, text reading, LLM judgment, CSS style injection.
// E2E: drive the sidebar agent against a live Hacker News page and verify it
// can navigate, read comments, pick one, and inject a CSS highlight.
// Requires a real browser and a real Claude backend.
//
// Fixes vs. original: afterAll now awaits the async finalizeEvalCollector
// BEFORE deleting tmpDir (the fire-and-forget call raced the rmSync), and
// the unused agentText / *LogFile / *ErrFile locals are removed.
describeIfSelected('Sidebar CSS interaction E2E', ['sidebar-css-interaction'], () => {
// Child processes for the browse server and sidebar agent; killed in afterAll.
let serverProc: Subprocess | null = null;
let agentProc: Subprocess | null = null;
// Discovered from the server's state file once it boots (port 0 = auto-pick).
let serverPort: number = 0;
let authToken: string = '';
// Scratch dir holding the server state file and the sidebar message queue.
let tmpDir: string = '';
let stateFile: string = '';
let queueFile: string = '';

// Authenticated JSON fetch against the local sidebar server.
async function api(pathname: string, opts: RequestInit = {}): Promise<Response> {
const headers: Record<string, string> = {
'Content-Type': 'application/json',
...(opts.headers as Record<string, string> || {}),
};
if (!headers['Authorization'] && authToken) {
headers['Authorization'] = `Bearer ${authToken}`;
}
return fetch(`http://127.0.0.1:${serverPort}${pathname}`, { ...opts, headers });
}

beforeAll(async () => {
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'sidebar-e2e-css-'));
stateFile = path.join(tmpDir, 'browse.json');
queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
fs.mkdirSync(path.dirname(queueFile), { recursive: true });
// Start server WITH a real browser for CSS interaction
const serverScript = path.resolve(ROOT, 'browse', 'src', 'server.ts');
// Use 'pipe' stdio — closing file descriptors kills the child on macOS/bun
serverProc = spawn(['bun', 'run', serverScript], {
env: {
...process.env,
BROWSE_STATE_FILE: stateFile,
BROWSE_PORT: '0',
SIDEBAR_QUEUE_PATH: queueFile,
BROWSE_IDLE_TIMEOUT: '600000', // 10 min in ms — test takes ~3 min
},
stdio: ['ignore', 'pipe', 'pipe'],
});
// Wait (up to 30s) for the server to write its port/token into the state file.
const deadline = Date.now() + 30000;
while (Date.now() < deadline) {
if (fs.existsSync(stateFile)) {
try {
const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
if (state.port && state.token) {
serverPort = state.port;
authToken = state.token;
break;
}
} catch {} // partially-written JSON — retry next tick
}
await new Promise(r => setTimeout(r, 200));
}
if (!serverPort) throw new Error('Server did not start in time');
// Verify server is healthy before proceeding
const healthDeadline = Date.now() + 10000;
let healthy = false;
while (Date.now() < healthDeadline) {
try {
const resp = await fetch(`http://127.0.0.1:${serverPort}/health`);
if (resp.ok) { healthy = true; break; }
} catch {}
await new Promise(r => setTimeout(r, 500));
}
if (!healthy) throw new Error('Server started but health check failed');
// Start sidebar-agent with the real browse binary; falls back to 'echo' when
// the binary is not built (the test then fails on tool-use assertions, not spawn).
const agentScript = path.resolve(ROOT, 'browse', 'src', 'sidebar-agent.ts');
const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');
// Use 'pipe' stdio — closing file descriptors kills the child on macOS/bun
agentProc = spawn(['bun', 'run', agentScript], {
env: {
...process.env,
BROWSE_SERVER_PORT: String(serverPort),
BROWSE_STATE_FILE: stateFile,
SIDEBAR_QUEUE_PATH: queueFile,
SIDEBAR_AGENT_TIMEOUT: '180000', // 3 min — multi-step HN comment task
BROWSE_BIN: fs.existsSync(browseBin) ? browseBin : 'echo',
},
stdio: ['ignore', 'pipe', 'pipe'],
});
// Give the agent a moment to connect before the test fires its command.
await new Promise(r => setTimeout(r, 2000));
}, 35000);

afterAll(async () => {
// Kill children first so nothing is still writing, then flush the eval
// collector BEFORE removing tmpDir — finalize is async and must be awaited.
if (agentProc) { try { agentProc.kill(); } catch {} }
if (serverProc) { try { serverProc.kill(); } catch {} }
await finalizeEvalCollector(evalCollector);
try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
});

testIfSelected('sidebar-css-interaction', async () => {
// Fresh session + clean queue
try { await api('/sidebar-session/new', { method: 'POST' }); } catch {}
fs.writeFileSync(queueFile, '');
const startTime = Date.now();
// Ask the agent to go to HN, find the most insightful comment, and highlight it
const resp = await api('/sidebar-command', {
method: 'POST',
body: JSON.stringify({
message: 'Go to https://news.ycombinator.com. Find the top story. Click into its comments. Read the comments and find the most insightful one. Highlight that comment with a 4px solid orange outline.',
activeTabUrl: 'about:blank',
}),
});
expect(resp.status).toBe(200);
// Poll for agent_done (4 min timeout — multi-step task with opus LLM)
const deadline = Date.now() + 240000;
let entries: any[] = [];
while (Date.now() < deadline) {
try {
const chatResp = await api('/sidebar-chat?after=0');
const data = await chatResp.json();
entries = data.entries || [];
if (entries.some((e: any) => e.type === 'agent_done')) break;
} catch (err) {
// Server may be temporarily busy or restarting — retry on connection errors
const e = err as { code?: string; message?: string };
const isConnErr = e.code === 'ConnectionRefused' || e.message?.includes('ConnectionRefused') || e.message?.includes('Unable to connect');
if (!isConnErr) throw err;
}
await new Promise(r => setTimeout(r, 3000));
}
const duration = Date.now() - startTime;
const doneEntry = entries.find((e: any) => e.type === 'agent_done');
// Dump debug info on failure
if (!doneEntry || entries.length === 0) {
console.log('ENTRIES:', JSON.stringify(entries.slice(-5), null, 2));
console.log('SERVER exitCode:', serverProc?.exitCode, 'signalCode:', serverProc?.signalCode, 'killed:', serverProc?.killed);
console.log('AGENT exitCode:', agentProc?.exitCode, 'signalCode:', agentProc?.signalCode, 'killed:', agentProc?.killed);
const queueContent = fs.existsSync(queueFile) ? fs.readFileSync(queueFile, 'utf-8').slice(-500) : 'NO QUEUE';
console.log('QUEUE:', queueContent.length > 0 ? 'has entries' : 'empty');
}
// Agent should have completed
expect(doneEntry).toBeDefined();
// Agent should have run browse commands (look for tool_use entries)
const toolUses = entries.filter((e: any) => e.type === 'tool_use');
expect(toolUses.length).toBeGreaterThanOrEqual(2); // At minimum: goto + one more
// Should have navigated to HN (look for ycombinator/HN in any entry text)
const allEntryText = entries
.map((e: any) => `${e.text || ''} ${e.input || ''} ${e.message || ''}`)
.join(' ');
const navigatedToHN = allEntryText.includes('ycombinator') || allEntryText.includes('Hacker News') || allEntryText.includes('news.ycombinator');
if (!navigatedToHN) {
console.log('ALL ENTRY TEXT (first 2000):', allEntryText.slice(0, 2000));
}
expect(navigatedToHN).toBe(true);
// Should have applied a style (look for orange/outline in tool commands).
// NOTE: appliedStyle is a soft signal recorded in the eval collector only —
// it is deliberately not a hard expect() gate.
const allText = entries.map((e: any) => e.text || '').join(' ');
const appliedStyle = allText.includes('outline') || allText.includes('orange') || allText.includes('style');
evalCollector?.addTest({
name: 'sidebar-css-interaction', suite: 'Sidebar CSS interaction E2E', tier: 'e2e',
passed: !!doneEntry && navigatedToHN && appliedStyle,
duration_ms: duration,
cost_usd: 0,
exit_reason: doneEntry ? 'success' : 'timeout',
});
}, 300_000);
});
// --- Sidebar Navigate (real Claude, requires ANTHROPIC_API_KEY) ---
describeIfSelected('Sidebar navigate E2E', ['sidebar-navigate'], () => {
+96
View File
@@ -3257,6 +3257,102 @@ Write your summary to ${benefitsDir}/benefits-summary.md`,
}, 180_000);
});
// --- Ship idempotency (#649) ---
describeIfSelected('Ship idempotency', ['ship-idempotency'], () => {
let idempDir: string;
const gitRun = (args: string[], cwd: string) =>
spawnSync('git', args, { cwd, stdio: 'pipe', timeout: 5000 });
beforeAll(() => {
idempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-ship-idemp-'));
// Create git repo with initial commit on main
gitRun(['init', '-b', 'main'], idempDir);
gitRun(['config', 'user.email', 'test@test.com'], idempDir);
gitRun(['config', 'user.name', 'Test'], idempDir);
fs.writeFileSync(path.join(idempDir, 'app.ts'), 'console.log("v1");\n');
fs.writeFileSync(path.join(idempDir, 'VERSION'), '0.1.0.0\n');
fs.writeFileSync(path.join(idempDir, 'CHANGELOG.md'), '# Changelog\n');
gitRun(['add', '.'], idempDir);
gitRun(['commit', '-m', 'initial'], idempDir);
// Create feature branch with changes
gitRun(['checkout', '-b', 'feat/my-feature'], idempDir);
fs.writeFileSync(path.join(idempDir, 'app.ts'), 'console.log("v2");\n');
gitRun(['add', 'app.ts'], idempDir);
gitRun(['commit', '-m', 'feat: update to v2'], idempDir);
// Simulate prior /ship run: bump VERSION and write CHANGELOG entry
fs.writeFileSync(path.join(idempDir, 'VERSION'), '0.2.0.0\n');
fs.writeFileSync(path.join(idempDir, 'CHANGELOG.md'),
'# Changelog\n\n## [0.2.0.0] — 2026-03-30\n\n- Updated app to v2\n');
gitRun(['add', 'VERSION', 'CHANGELOG.md'], idempDir);
gitRun(['commit', '-m', 'chore: bump version to 0.2.0.0'], idempDir);
// Extract just the idempotency-relevant sections from ship/SKILL.md
const full = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const step4Start = full.indexOf('## Step 4: Version bump');
const step4End = full.indexOf('\n---\n', step4Start);
const step7Start = full.indexOf('## Step 7: Push');
const step8End = full.indexOf('## Step 8.5');
const extracted = [
full.slice(step4Start, step4End > step4Start ? step4End : step4Start + 500),
full.slice(step7Start, step8End > step7Start ? step8End : step7Start + 500),
].join('\n\n---\n\n');
fs.writeFileSync(path.join(idempDir, 'ship-steps.md'), extracted);
});
afterAll(() => {
try { fs.rmSync(idempDir, { recursive: true, force: true }); } catch {}
});
testIfSelected('ship-idempotency', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feat/my-feature. A prior /ship run already:
- Bumped VERSION from 0.1.0.0 to 0.2.0.0
- Wrote a CHANGELOG entry for 0.2.0.0
- But the push/PR step failed
Read ship-steps.md for the idempotency check instructions from the ship workflow.
Run ONLY the idempotency checks described in Steps 4 and 7. Do NOT actually push or create PRs (there is no remote).
After running the checks, write a report to ${idempDir}/idemp-result.md containing:
- Whether VERSION was detected as ALREADY_BUMPED or not
- Whether the push was detected as ALREADY_PUSHED or PUSH_NEEDED
- The current VERSION value (should still be 0.2.0.0)
Do NOT modify VERSION or CHANGELOG. Only run the detection checks and report.`,
workingDirectory: idempDir,
maxTurns: 10,
timeout: 60_000,
testName: 'ship-idempotency',
runId,
});
logCost('/ship idempotency', result);
recordE2E('/ship idempotency guard', 'Ship idempotency', result);
expect(result.exitReason).toBe('success');
// Verify VERSION was NOT modified
const version = fs.readFileSync(path.join(idempDir, 'VERSION'), 'utf-8').trim();
expect(version).toBe('0.2.0.0');
// Verify CHANGELOG was NOT duplicated
const changelog = fs.readFileSync(path.join(idempDir, 'CHANGELOG.md'), 'utf-8');
const versionEntries = (changelog.match(/## \[0\.2\.0\.0\]/g) || []).length;
expect(versionEntries).toBe(1);
// Check the result report if it was written
const reportPath = path.join(idempDir, 'idemp-result.md');
if (fs.existsSync(reportPath)) {
const report = fs.readFileSync(reportPath, 'utf-8');
expect(report.toLowerCase()).toContain('already_bumped');
}
}, 120_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
+31 -19
View File
@@ -1268,38 +1268,49 @@ describe('Codex skill', () => {
expect(content).toContain('mktemp');
});
test('adversarial review in /review auto-scales by diff size', () => {
test('adversarial review in /review always runs both passes', () => {
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
expect(content).toContain('Adversarial review (auto-scaled)');
// Diff size thresholds
expect(content).toContain('< 50');
expect(content).toContain('50199');
expect(content).toContain('200+');
// All three tiers present
expect(content).toContain('Small');
expect(content).toContain('Medium tier');
expect(content).toContain('Large tier');
expect(content).toContain('Adversarial review (always-on)');
// Always-on: both Claude and Codex adversarial
expect(content).toContain('Claude adversarial subagent (always runs)');
expect(content).toContain('Codex adversarial challenge (always runs when available)');
// Claude adversarial subagent dispatch
expect(content).toContain('Agent tool');
expect(content).toContain('FIXABLE');
expect(content).toContain('INVESTIGATE');
// Codex fallback logic
// Codex availability check
expect(content).toContain('CODEX_NOT_AVAILABLE');
expect(content).toContain('fall back to the Claude adversarial subagent');
// Review log uses new skill name
// OLD_CFG only gates Codex, not Claude
expect(content).toContain('skip Codex passes only');
// Review log
expect(content).toContain('adversarial-review');
expect(content).toContain('reasoning_effort="high"');
expect(content).toContain('ADVERSARIAL REVIEW SYNTHESIS');
// Large diff structured review still gated
expect(content).toContain('Codex structured review (large diffs only');
expect(content).toContain('200');
});
test('adversarial review in /ship auto-scales by diff size', () => {
test('adversarial review in /ship always runs both passes', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Adversarial review (auto-scaled)');
expect(content).toContain('< 50');
expect(content).toContain('200+');
expect(content).toContain('Adversarial review (always-on)');
expect(content).toContain('adversarial-review');
expect(content).toContain('reasoning_effort="high"');
expect(content).toContain('Investigate and fix');
expect(content).toContain('Claude adversarial subagent (always runs)');
});
test('scope drift detection in /review and /ship', () => {
const reviewContent = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
// Both should contain scope drift from the shared resolver
for (const content of [reviewContent, shipContent]) {
expect(content).toContain('Scope Check:');
expect(content).toContain('DRIFT DETECTED');
expect(content).toContain('SCOPE CREEP');
expect(content).toContain('MISSING REQUIREMENTS');
expect(content).toContain('stated intent');
}
});
test('codex-host ship/review do NOT contain adversarial review step', () => {
@@ -1522,12 +1533,13 @@ describe('sidebar agent (#584)', () => {
});
// #584 — Server Write: server.ts allowedTools includes Write (DRY parity)
test('server.ts allowedTools includes Write', () => {
test('server.ts allowedTools excludes Write (agent is read-only + Bash)', () => {
const content = fs.readFileSync(path.join(ROOT, 'browse', 'src', 'server.ts'), 'utf-8');
// Find the sidebar allowedTools in the headed-mode path
const match = content.match(/--allowedTools['"]\s*,\s*['"]([^'"]+)['"]/);
expect(match).not.toBeNull();
expect(match![1]).toContain('Write');
expect(match![1]).toContain('Bash');
expect(match![1]).not.toContain('Write');
});
// #584 — Sidebar stderr: stderr handler is not empty