Merge remote-tracking branch 'origin/main' into garrytan/learn-from-reviews

Resolved conflicts:
- VERSION: bumped to 0.14.6.0 (our branch on top of main's 0.14.5.0)
- CHANGELOG.md: kept our entry on top, main's 7 new entries below, updated version
- package.json: version synced to 0.14.6.0
- Regenerated all SKILL.md files from merged templates

Main brought: Review Army (parallel specialist reviewers), always-on adversarial,
CSS inspector, per-tab agents, design-to-code, comparison board, ship idempotency,
skill prefix fix, session intelligence roadmap.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-31 21:07:07 -07:00
93 changed files with 11423 additions and 821 deletions
+165
View File
@@ -0,0 +1,165 @@
/**
* Tests for bin/gstack-diff-scope — verifies scope signal detection.
*
* Creates temp git repos with specific file patterns and verifies
* the correct SCOPE_* variables are output.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from 'fs';
import { dirname, join } from 'path';
import { tmpdir } from 'os';
import { spawnSync } from 'child_process';
const SCRIPT = join(import.meta.dir, '..', 'bin', 'gstack-diff-scope');
const dirs: string[] = [];
function createRepo(files: string[]): string {
const dir = mkdtempSync(join(tmpdir(), 'diff-scope-test-'));
dirs.push(dir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
// Base commit
writeFileSync(join(dir, 'README.md'), '# test\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
// Feature branch with specified files
run('git', ['checkout', '-b', 'feature/test']);
for (const f of files) {
const fullPath = join(dir, f);
const dirPath = fullPath.substring(0, fullPath.lastIndexOf('/'));
if (dirPath !== dir) mkdirSync(dirPath, { recursive: true });
writeFileSync(fullPath, '# test content\n');
}
run('git', ['add', '.']);
run('git', ['commit', '-m', 'add files']);
return dir;
}
function runScope(dir: string): Record<string, string> {
const result = spawnSync('bash', [SCRIPT, 'main'], {
cwd: dir, stdio: 'pipe', timeout: 5000,
});
const output = result.stdout.toString().trim();
const vars: Record<string, string> = {};
for (const line of output.split('\n')) {
const [key, val] = line.split('=');
if (key && val) vars[key] = val;
}
return vars;
}
afterAll(() => {
for (const d of dirs) {
try { rmSync(d, { recursive: true, force: true }); } catch {}
}
});
describe('gstack-diff-scope', () => {
// --- Existing scope signals ---
test('detects frontend files', () => {
const dir = createRepo(['styles.css', 'component.tsx']);
const scope = runScope(dir);
expect(scope.SCOPE_FRONTEND).toBe('true');
});
test('detects backend files', () => {
const dir = createRepo(['app.rb', 'service.py']);
const scope = runScope(dir);
expect(scope.SCOPE_BACKEND).toBe('true');
});
test('detects test files', () => {
const dir = createRepo(['test/app.test.ts']);
const scope = runScope(dir);
expect(scope.SCOPE_TESTS).toBe('true');
});
// --- New scope signals (Review Army) ---
test('detects migrations via db/migrate/', () => {
const dir = createRepo(['db/migrate/20260330_create_users.rb']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('true');
});
test('detects migrations via generic migrations/', () => {
const dir = createRepo(['app/migrations/0001_initial.py']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('true');
});
test('detects migrations via prisma', () => {
const dir = createRepo(['prisma/migrations/20260330/migration.sql']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('true');
});
test('detects API via controller files', () => {
const dir = createRepo(['app/controllers/users_controller.rb']);
const scope = runScope(dir);
expect(scope.SCOPE_API).toBe('true');
});
test('detects API via route files', () => {
const dir = createRepo(['src/routes/api.ts']);
const scope = runScope(dir);
expect(scope.SCOPE_API).toBe('true');
});
test('detects API via GraphQL schemas', () => {
const dir = createRepo(['schema.graphql']);
const scope = runScope(dir);
expect(scope.SCOPE_API).toBe('true');
});
test('detects auth files', () => {
const dir = createRepo(['app/services/auth_service.rb']);
const scope = runScope(dir);
expect(scope.SCOPE_AUTH).toBe('true');
});
test('detects session files', () => {
const dir = createRepo(['lib/session_manager.ts']);
const scope = runScope(dir);
expect(scope.SCOPE_AUTH).toBe('true');
});
test('detects JWT files', () => {
const dir = createRepo(['utils/jwt_helper.py']);
const scope = runScope(dir);
expect(scope.SCOPE_AUTH).toBe('true');
});
test('returns false for all new signals when no matching files', () => {
const dir = createRepo(['docs/readme.md', 'config.yml']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('false');
expect(scope.SCOPE_API).toBe('false');
expect(scope.SCOPE_AUTH).toBe('false');
});
test('outputs all 9 scope variables', () => {
const dir = createRepo(['app.ts']);
const scope = runScope(dir);
expect(Object.keys(scope)).toHaveLength(9);
expect(scope).toHaveProperty('SCOPE_FRONTEND');
expect(scope).toHaveProperty('SCOPE_BACKEND');
expect(scope).toHaveProperty('SCOPE_PROMPTS');
expect(scope).toHaveProperty('SCOPE_TESTS');
expect(scope).toHaveProperty('SCOPE_DOCS');
expect(scope).toHaveProperty('SCOPE_CONFIG');
expect(scope).toHaveProperty('SCOPE_MIGRATIONS');
expect(scope).toHaveProperty('SCOPE_API');
expect(scope).toHaveProperty('SCOPE_AUTH');
});
});
+5
View File
@@ -0,0 +1,5 @@
-- Migration: Drop user email column
-- WARNING: This migration is intentionally unsafe for testing
ALTER TABLE users DROP COLUMN email;
ALTER TABLE users DROP COLUMN phone_number;
-- No backfill, no reversibility check, no data preservation
+12
View File
@@ -0,0 +1,12 @@
# N+1 query example — intentionally bad for testing
class PostsController
def index
@posts = Post.all
@posts.each do |post|
# N+1: queries Author table for every post
puts post.author.name
# N+1: queries Comments table for every post
puts post.comments.count
end
end
end
+104 -22
View File
@@ -595,10 +595,12 @@ describe('REVIEW_DASHBOARD resolver', () => {
expect(content).toContain('/plan-ceo-review');
});
test('plan-design-review chaining mentions eng and ceo reviews', () => {
test('plan-design-review chaining mentions eng, ceo, and design skills', () => {
const content = fs.readFileSync(path.join(ROOT, 'plan-design-review', 'SKILL.md'), 'utf-8');
expect(content).toContain('/plan-eng-review');
expect(content).toContain('/plan-ceo-review');
expect(content).toContain('/design-shotgun');
expect(content).toContain('/design-html');
});
test('ship does NOT contain review chaining', () => {
@@ -614,7 +616,8 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
test('all three modes share codepath tracing methodology', () => {
test('plan and ship modes share codepath tracing methodology', () => {
// Review mode delegates test coverage to the Testing specialist subagent (Review Army)
const sharedPhrases = [
'Trace data flow',
'Diagram the execution',
@@ -626,33 +629,40 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
for (const phrase of sharedPhrases) {
expect(planSkill).toContain(phrase);
expect(shipSkill).toContain(phrase);
expect(reviewSkill).toContain(phrase);
}
// Plan mode traces the plan, not a git diff
expect(planSkill).toContain('Trace every codepath in the plan');
expect(planSkill).not.toContain('git diff origin');
// Ship and review modes trace the diff
// Ship mode traces the diff
expect(shipSkill).toContain('Trace every codepath changed');
expect(reviewSkill).toContain('Trace every codepath changed');
});
test('all three modes include E2E decision matrix', () => {
for (const skill of [planSkill, shipSkill, reviewSkill]) {
test('review mode uses Review Army for specialist dispatch', () => {
expect(reviewSkill).toContain('Review Army');
expect(reviewSkill).toContain('Specialist Dispatch');
expect(reviewSkill).toContain('testing.md');
});
test('plan and ship modes include E2E decision matrix', () => {
// Review mode delegates to Testing specialist
for (const skill of [planSkill, shipSkill]) {
expect(skill).toContain('E2E Test Decision Matrix');
expect(skill).toContain('→E2E');
expect(skill).toContain('→EVAL');
}
});
test('all three modes include regression rule', () => {
for (const skill of [planSkill, shipSkill, reviewSkill]) {
test('plan and ship modes include regression rule', () => {
// Review mode delegates to Testing specialist
for (const skill of [planSkill, shipSkill]) {
expect(skill).toContain('REGRESSION RULE');
expect(skill).toContain('IRON RULE');
}
});
test('all three modes include test framework detection', () => {
for (const skill of [planSkill, shipSkill, reviewSkill]) {
test('plan and ship modes include test framework detection', () => {
// Review mode delegates to Testing specialist
for (const skill of [planSkill, shipSkill]) {
expect(skill).toContain('Test Framework Detection');
expect(skill).toContain('CLAUDE.md');
}
@@ -671,11 +681,12 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
expect(shipSkill).toContain('ship-test-plan');
});
test('review mode generates via Fix-First + gaps are INFORMATIONAL', () => {
test('review mode uses Fix-First + Review Army for specialist coverage', () => {
expect(reviewSkill).toContain('Fix-First');
expect(reviewSkill).toContain('INFORMATIONAL');
expect(reviewSkill).toContain('Step 4.75');
expect(reviewSkill).toContain('subsumes the "Test Gaps" category');
// Review Army handles test coverage via Testing specialist subagent
expect(reviewSkill).toContain('Review Army');
expect(reviewSkill).toContain('Testing');
});
test('plan mode does NOT include ship-specific content', () => {
@@ -690,6 +701,35 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
expect(reviewSkill).not.toContain('ship-test-plan');
});
test('review/specialists/ directory has all expected checklist files', () => {
const specDir = path.join(ROOT, 'review', 'specialists');
const expected = [
'testing.md',
'maintainability.md',
'security.md',
'performance.md',
'data-migration.md',
'api-contract.md',
'red-team.md',
];
for (const f of expected) {
expect(fs.existsSync(path.join(specDir, f))).toBe(true);
}
});
test('each specialist file has standard header with scope and output format', () => {
const specDir = path.join(ROOT, 'review', 'specialists');
const files = fs.readdirSync(specDir).filter(f => f.endsWith('.md'));
for (const f of files) {
const content = fs.readFileSync(path.join(specDir, f), 'utf-8');
// All specialist files must have Scope and Output/JSON in header
expect(content).toContain('Scope:');
expect(content.toLowerCase()).toMatch(/output|json/);
// Must define NO FINDINGS behavior
expect(content).toContain('NO FINDINGS');
}
});
// Regression guard: ship output contains key phrases from before the refactor
test('ship SKILL.md regression guard — key phrases preserved', () => {
const regressionPhrases = [
@@ -877,12 +917,9 @@ describe('Coverage gate in ship', () => {
expect(shipSkill).toContain('could not determine percentage — skipping');
});
test('review SKILL.md contains coverage WARNING', () => {
expect(reviewSkill).toContain('COVERAGE WARNING');
expect(reviewSkill).toContain('Consider writing tests before running /ship');
});
test('review coverage warning is INFORMATIONAL', () => {
test('review SKILL.md delegates coverage to Testing specialist', () => {
// Coverage audit moved to Testing specialist subagent in Review Army
expect(reviewSkill).toContain('testing.md');
expect(reviewSkill).toContain('INFORMATIONAL');
});
});
@@ -1611,10 +1648,9 @@ describe('Codex generation (--host codex)', () => {
const content = fs.readFileSync(path.join(AGENTS_DIR, 'gstack-review', 'SKILL.md'), 'utf-8');
// Correct: references to sidecar files use gstack/review/ path
expect(content).toContain('.agents/skills/gstack/review/checklist.md');
expect(content).toContain('.agents/skills/gstack/review/design-checklist.md');
// design-checklist.md is now referenced via Review Army specialist (Claude only, stripped for Codex)
// Wrong: must NOT reference gstack-review/checklist.md (file doesn't exist there)
expect(content).not.toContain('.agents/skills/gstack-review/checklist.md');
expect(content).not.toContain('.agents/skills/gstack-review/design-checklist.md');
});
test('sidecar paths in ship skill point to gstack/review/ for pre-landing review', () => {
@@ -2469,3 +2505,49 @@ describe('CONFIDENCE_CALIBRATION resolver', () => {
}
});
});
describe('gen-skill-docs prefix warning (#620/#578)', () => {
const { execSync } = require('child_process');
test('warns about skill_prefix when config has prefix=true', () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-prefix-warn-'));
try {
// Create a fake ~/.gstack/config.yaml with skill_prefix: true
const fakeHome = tmpDir;
const fakeGstack = path.join(fakeHome, '.gstack');
fs.mkdirSync(fakeGstack, { recursive: true });
fs.writeFileSync(path.join(fakeGstack, 'config.yaml'), 'skill_prefix: true\n');
const output = execSync('bun run scripts/gen-skill-docs.ts', {
cwd: ROOT,
env: { ...process.env, HOME: fakeHome },
encoding: 'utf-8',
timeout: 30000,
});
expect(output).toContain('skill_prefix is true');
expect(output).toContain('gstack-relink');
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
});
test('no warning when skill_prefix is false or absent', () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-prefix-warn-'));
try {
const fakeHome = tmpDir;
const fakeGstack = path.join(fakeHome, '.gstack');
fs.mkdirSync(fakeGstack, { recursive: true });
fs.writeFileSync(path.join(fakeGstack, 'config.yaml'), 'skill_prefix: false\n');
const output = execSync('bun run scripts/gen-skill-docs.ts', {
cwd: ROOT,
env: { ...process.env, HOME: fakeHome },
encoding: 'utf-8',
timeout: 30000,
});
expect(output).not.toContain('skill_prefix is true');
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
});
});
+22
View File
@@ -59,6 +59,15 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'review-base-branch': ['review/**'],
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
// Review Army (specialist dispatch)
'review-army-migration-safety': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'],
'review-army-perf-n-plus-one': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'],
'review-army-delivery-audit': ['review/**', 'scripts/resolvers/review.ts', 'scripts/resolvers/review-army.ts'],
'review-army-quality-score': ['review/**', 'scripts/resolvers/review-army.ts'],
'review-army-json-findings': ['review/**', 'scripts/resolvers/review-army.ts'],
'review-army-red-team': ['review/**', 'scripts/resolvers/review-army.ts'],
'review-army-consensus': ['review/**', 'scripts/resolvers/review-army.ts'],
// Office Hours
'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
@@ -122,6 +131,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Plan completion audit + verification
'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
'ship-plan-verification': ['ship/**', 'qa-only/**', 'scripts/gen-skill-docs.ts'],
'ship-idempotency': ['ship/**', 'scripts/resolvers/utility.ts'],
'review-plan-completion': ['review/**', 'scripts/gen-skill-docs.ts'],
// Design
@@ -152,6 +162,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Sidebar agent
'sidebar-navigate': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/**'],
'sidebar-url-accuracy': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/background.js'],
'sidebar-css-interaction': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/write-commands.ts', 'browse/src/read-commands.ts', 'browse/src/cdp-inspector.ts', 'extension/**'],
// Autoplan
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
@@ -203,6 +214,15 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'review-plan-completion': 'gate',
'review-dashboard-via': 'gate',
// Review Army — gate for core functionality, periodic for multi-specialist
'review-army-migration-safety': 'gate', // Specialist activation guardrail
'review-army-perf-n-plus-one': 'gate', // Specialist activation guardrail
'review-army-delivery-audit': 'gate', // Delivery integrity guardrail
'review-army-quality-score': 'gate', // Score computation
'review-army-json-findings': 'gate', // JSON schema compliance
'review-army-red-team': 'periodic', // Multi-agent coordination
'review-army-consensus': 'periodic', // Multi-specialist agreement
// Office Hours
'office-hours-spec-review': 'gate',
@@ -228,6 +248,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'ship-triage': 'gate',
'ship-plan-completion': 'gate',
'ship-plan-verification': 'gate',
'ship-idempotency': 'periodic',
// Retro — gate for cheap branch detection, periodic for full Opus retro
'retro': 'periodic',
@@ -282,6 +303,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// Sidebar agent
'sidebar-navigate': 'periodic',
'sidebar-url-accuracy': 'periodic',
'sidebar-css-interaction': 'periodic',
// Autoplan — periodic (not yet implemented)
'autoplan-core': 'periodic',
+79 -2
View File
@@ -42,11 +42,18 @@ function setupMockInstall(skills: string[]): void {
fs.copyFileSync(path.join(BIN, 'gstack-relink'), path.join(mockBin, 'gstack-relink'));
fs.chmodSync(path.join(mockBin, 'gstack-relink'), 0o755);
}
if (fs.existsSync(path.join(BIN, 'gstack-patch-names'))) {
fs.copyFileSync(path.join(BIN, 'gstack-patch-names'), path.join(mockBin, 'gstack-patch-names'));
fs.chmodSync(path.join(mockBin, 'gstack-patch-names'), 0o755);
}
// Create mock skill directories
// Create mock skill directories with proper frontmatter
for (const skill of skills) {
fs.mkdirSync(path.join(installDir, skill), { recursive: true });
fs.writeFileSync(path.join(installDir, skill, 'SKILL.md'), `# ${skill}`);
fs.writeFileSync(
path.join(installDir, skill, 'SKILL.md'),
`---\nname: ${skill}\ndescription: test\n---\n# ${skill}`
);
}
}
@@ -150,3 +157,73 @@ describe('gstack-relink (#578)', () => {
expect(fs.existsSync(path.join(skillsDir, 'gstack-ship'))).toBe(true);
});
});
describe('gstack-patch-names (#620/#578)', () => {
// Helper to read name: from SKILL.md frontmatter
function readSkillName(skillDir: string): string | null {
const content = fs.readFileSync(path.join(skillDir, 'SKILL.md'), 'utf-8');
const match = content.match(/^name:\s*(.+)$/m);
return match ? match[1].trim() : null;
}
test('prefix=true patches name: field in SKILL.md', () => {
setupMockInstall(['qa', 'ship', 'review']);
run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
GSTACK_INSTALL_DIR: installDir,
GSTACK_SKILLS_DIR: skillsDir,
});
// Verify name: field is patched with gstack- prefix
expect(readSkillName(path.join(installDir, 'qa'))).toBe('gstack-qa');
expect(readSkillName(path.join(installDir, 'ship'))).toBe('gstack-ship');
expect(readSkillName(path.join(installDir, 'review'))).toBe('gstack-review');
});
test('prefix=false restores name: field in SKILL.md', () => {
setupMockInstall(['qa', 'ship']);
// First, prefix them
run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
GSTACK_INSTALL_DIR: installDir,
GSTACK_SKILLS_DIR: skillsDir,
});
expect(readSkillName(path.join(installDir, 'qa'))).toBe('gstack-qa');
// Now switch to flat mode
run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix false`);
run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
GSTACK_INSTALL_DIR: installDir,
GSTACK_SKILLS_DIR: skillsDir,
});
// Verify name: field is restored to unprefixed
expect(readSkillName(path.join(installDir, 'qa'))).toBe('qa');
expect(readSkillName(path.join(installDir, 'ship'))).toBe('ship');
});
test('gstack-upgrade name: not double-prefixed', () => {
setupMockInstall(['qa', 'gstack-upgrade']);
run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
GSTACK_INSTALL_DIR: installDir,
GSTACK_SKILLS_DIR: skillsDir,
});
// gstack-upgrade should keep its name, NOT become gstack-gstack-upgrade
expect(readSkillName(path.join(installDir, 'gstack-upgrade'))).toBe('gstack-upgrade');
// Regular skill should be prefixed
expect(readSkillName(path.join(installDir, 'qa'))).toBe('gstack-qa');
});
test('SKILL.md without frontmatter is a no-op', () => {
setupMockInstall(['qa']);
// Overwrite qa SKILL.md with no frontmatter
fs.writeFileSync(path.join(installDir, 'qa', 'SKILL.md'), '# qa\nSome content.');
run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
// Should not crash
run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
GSTACK_INSTALL_DIR: installDir,
GSTACK_SKILLS_DIR: skillsDir,
});
// Content should be unchanged (no name: to patch)
const content = fs.readFileSync(path.join(installDir, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toBe('# qa\nSome content.');
});
});
+562
View File
@@ -0,0 +1,562 @@
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import {
ROOT, runId, describeIfSelected, testConcurrentIfSelected,
logCost, recordE2E, createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const evalCollector = createEvalCollector('e2e-review-army');
// Helper: create a git repo with a feature branch
function setupRepo(prefix: string): { dir: string; run: (cmd: string, args: string[]) => void } {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), `skill-e2e-${prefix}-`));
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
return { dir, run };
}
// Helper: copy review skill files to test dir
function copyReviewFiles(dir: string) {
fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(dir, 'review-SKILL.md'));
fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(dir, 'review-checklist.md'));
fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(dir, 'review-greptile-triage.md'));
// Copy specialist checklists
const specDir = path.join(dir, 'review-specialists');
fs.mkdirSync(specDir, { recursive: true });
const specialistsRoot = path.join(ROOT, 'review', 'specialists');
for (const f of fs.readdirSync(specialistsRoot)) {
fs.copyFileSync(path.join(specialistsRoot, f), path.join(specDir, f));
}
}
// --- Review Army: Migration Safety ---
describeIfSelected('Review Army: Migration Safety', ['review-army-migration-safety'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-migration');
dir = repo.dir;
// Base commit
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
// Feature branch with unsafe migration
repo.run('git', ['checkout', '-b', 'feature/drop-columns']);
fs.mkdirSync(path.join(dir, 'db', 'migrate'), { recursive: true });
const migrationContent = fs.readFileSync(
path.join(ROOT, 'test', 'fixtures', 'review-army-migration.sql'), 'utf-8'
);
fs.writeFileSync(path.join(dir, 'db', 'migrate', '20260330_drop_columns.sql'), migrationContent);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'drop email and phone columns']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-migration-safety', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on a feature branch with a database migration that drops columns.
Read review-SKILL.md for instructions. Also read review-checklist.md.
The specialist checklists are in review-specialists/ (testing.md, security.md, performance.md, data-migration.md, etc.).
Skip the preamble, lake intro, telemetry sections.
Run Step 4 (Critical pass) then Step 4.5 (Review Army — Specialist Dispatch).
The base branch is main. Run gstack-diff-scope style analysis on the changed files.
Since db/migrate/ files changed, the Data Migration specialist should activate.
For the specialist dispatch, instead of launching subagents, just read review-specialists/data-migration.md
and apply it yourself against the diff (git diff main...HEAD).
Write your findings to ${dir}/review-output.md`,
workingDirectory: dir,
maxTurns: 20,
timeout: 180_000,
testName: 'review-army-migration-safety',
runId,
});
logCost('/review army migration', result);
recordE2E(evalCollector, '/review army migration safety', 'Review Army', result);
expect(result.exitReason).toBe('success');
// Verify migration issues were caught
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
const hasMigrationFinding =
content.includes('drop') ||
content.includes('data loss') ||
content.includes('reversib') ||
content.includes('migration') ||
content.includes('column');
expect(hasMigrationFinding).toBe(true);
}
}, 210_000);
});
// --- Review Army: N+1 Performance ---
describeIfSelected('Review Army: N+1 Performance', ['review-army-perf-n-plus-one'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-n-plus-one');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/add-posts-index']);
const n1Content = fs.readFileSync(
path.join(ROOT, 'test', 'fixtures', 'review-army-n-plus-one.rb'), 'utf-8'
);
fs.writeFileSync(path.join(dir, 'posts_controller.rb'), n1Content);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add posts controller']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-perf-n-plus-one', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on a feature branch with a Ruby controller that has N+1 queries.
Read review-SKILL.md for instructions. Also read review-checklist.md.
The specialist checklists are in review-specialists/ (testing.md, performance.md, etc.).
Skip the preamble, lake intro, telemetry sections.
Run Step 4 (Critical pass) then Step 4.5 (Review Army).
The base branch is main. This is a Ruby backend file, so Performance specialist should activate.
For the specialist dispatch, read review-specialists/performance.md and apply it against the diff.
Write your findings to ${dir}/review-output.md`,
workingDirectory: dir,
maxTurns: 20,
timeout: 180_000,
testName: 'review-army-perf-n-plus-one',
runId,
});
logCost('/review army n+1', result);
recordE2E(evalCollector, '/review army N+1 detection', 'Review Army', result);
expect(result.exitReason).toBe('success');
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
const hasN1Finding =
content.includes('n+1') ||
content.includes('n + 1') ||
content.includes('eager') ||
content.includes('includes') ||
content.includes('preload') ||
content.includes('query') ||
content.includes('loop');
expect(hasN1Finding).toBe(true);
}
}, 210_000);
});
// --- Review Army: Delivery Audit ---
describeIfSelected('Review Army: Delivery Audit', ['review-army-delivery-audit'], () => {
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-delivery');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/three-features']);
// Write a plan file promising 3 features
fs.writeFileSync(path.join(dir, 'PLAN.md'), `# Feature Plan
## Implementation Items
1. Add user authentication with login/logout
2. Add user profile page with avatar upload
3. Add email notification system for new signups
## Test Items
- Test login flow
- Test profile page rendering
- Test email sending
`);
repo.run('git', ['add', 'PLAN.md']);
repo.run('git', ['commit', '-m', 'add plan']);
// Implement only 2 of 3 features
fs.writeFileSync(path.join(dir, 'auth.rb'), `class AuthController
def login
# authenticate user
session[:user_id] = user.id
end
def logout
session.delete(:user_id)
end
end
`);
fs.writeFileSync(path.join(dir, 'profile.rb'), `class ProfileController
def show
@user = User.find(params[:id])
end
def update_avatar
@user.avatar.attach(params[:avatar])
end
end
`);
// NOTE: email notification system is NOT implemented (intentionally missing)
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'implement auth and profile features']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-delivery-audit', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feature/three-features.
There is a PLAN.md file that promises 3 features: auth, profile, and email notifications.
The diff (git diff main...HEAD) only implements 2 of them (auth and profile).
Read review-SKILL.md for the review workflow. Focus on the Plan Completion Audit section.
The plan file is at ./PLAN.md. Cross-reference it against the diff.
For each plan item, classify as DONE, PARTIAL, NOT DONE, or CHANGED.
The email notification system should be classified as NOT DONE.
Write your completion audit to ${dir}/review-output.md`,
workingDirectory: dir,
maxTurns: 15,
timeout: 120_000,
testName: 'review-army-delivery-audit',
runId,
});
logCost('/review army delivery', result);
recordE2E(evalCollector, '/review army delivery audit', 'Review Army', result);
expect(result.exitReason).toBe('success');
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
// Should identify email notifications as NOT DONE
const hasNotDone =
content.includes('not done') ||
content.includes('not_done') ||
content.includes('missing') ||
content.includes('not implemented');
const mentionsEmail =
content.includes('email') ||
content.includes('notification');
expect(hasNotDone).toBe(true);
expect(mentionsEmail).toBe(true);
}
}, 150_000);
});
// --- Review Army: Quality Score ---
// Verifies the Review Army merge step: the agent runs the Critical pass over a
// diff with planted defects (SQL injection + magic number) and must compute the
// documented PR Quality Score formula over its own findings.
describeIfSelected('Review Army: Quality Score', ['review-army-quality-score'], () => {
// Temp git repo for this suite; created in beforeAll, removed in afterAll.
let dir: string;
beforeAll(() => {
// Base commit on main so the feature branch has a merge base for `git diff main...HEAD`.
const repo = setupRepo('army-quality');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/add-controller']);
// Code with obvious issues for quality score computation
// (one CRITICAL-grade injection, one informational magic number).
fs.writeFileSync(path.join(dir, 'user_controller.rb'), `class UserController
def create
# SQL injection
User.where("name = '#{params[:name]}'")
# Magic number
if users.count > 42
raise "too many"
end
end
end
`);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add user controller']);
// Copies review-SKILL.md / checklists into the repo so the agent can read them.
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-quality-score', async () => {
// Drive a real agent run; the prompt pins the exact scoring formula so the
// output can be checked for a "quality score" / "X/10" line.
const result = await runSkillTest({
prompt: `You are in a git repo with a vulnerable user controller.
Read review-SKILL.md and review-checklist.md.
Skip preamble, lake intro, telemetry.
Run the Critical pass (Step 4) against the diff (git diff main...HEAD).
Then compute the PR Quality Score as described in the Review Army merge step:
quality_score = max(0, 10 - (critical_count * 2 + informational_count * 0.5))
Write your findings AND the computed quality score to ${dir}/review-output.md
Include the line: "PR Quality Score: X/10" where X is the computed score.`,
workingDirectory: dir,
maxTurns: 15,
timeout: 120_000,
testName: 'review-army-quality-score',
runId,
});
logCost('/review army quality', result);
recordE2E(evalCollector, '/review army quality score', 'Review Army', result);
// Hard gate: the run itself must succeed.
expect(result.exitReason).toBe('success');
// Content checks are soft-gated on the file existing — the agent may run out
// of turns before writing it, which exitReason already covers.
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8');
// Should contain a quality score (either the phrase or an "N/10" value).
const hasScore =
content.toLowerCase().includes('quality score') ||
content.match(/\d+\/10/);
expect(hasScore).toBeTruthy();
}
}, 150_000);
});
// --- Review Army: JSON Findings ---
// Verifies the security specialist can emit machine-readable findings: one
// JSON object per line, matching the documented schema, for a planted SQL
// injection in a Rails-style controller.
describeIfSelected('Review Army: JSON Findings', ['review-army-json-findings'], () => {
// Temp git repo for this suite; created in beforeAll, removed in afterAll.
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-json');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/vuln']);
// Feature-branch diff: SQL injection via string interpolation into raw SQL.
fs.writeFileSync(path.join(dir, 'search.rb'), `class SearchController
def index
# SQL injection via string interpolation
results = ActiveRecord::Base.connection.execute(
"SELECT * FROM products WHERE name LIKE '%#{params[:q]}%'"
)
render json: results
end
end
`);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add search']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-json-findings', async () => {
const result = await runSkillTest({
prompt: `You are reviewing a git diff with a SQL injection vulnerability.
Read review-specialists/security.md for the security checklist.
Apply the checklist against this diff (git diff main...HEAD).
Output your findings as JSON objects, one per line, following the schema:
{"severity":"CRITICAL","confidence":9,"path":"search.rb","line":4,"category":"injection","summary":"SQL injection via string interpolation","fix":"Use parameterized query","fingerprint":"search.rb:4:injection","specialist":"security"}
Write ONLY JSON findings (no preamble) to ${dir}/findings.json`,
workingDirectory: dir,
maxTurns: 12,
timeout: 90_000,
testName: 'review-army-json-findings',
runId,
});
logCost('/review army json', result);
recordE2E(evalCollector, '/review army JSON findings', 'Review Army', result);
expect(result.exitReason).toBe('success');
// Schema checks are soft-gated on the findings file existing.
const findingsPath = path.join(dir, 'findings.json');
if (fs.existsSync(findingsPath)) {
const content = fs.readFileSync(findingsPath, 'utf-8').trim();
const lines = content.split('\n').filter(l => l.trim());
// At least one finding
expect(lines.length).toBeGreaterThanOrEqual(1);
// Each line should be valid JSON with required fields.
// Lines that fail JSON.parse are skipped; the loop stops after the first
// parseable finding, so this is a gate check, not full validation.
for (const line of lines) {
let parsed: any;
try { parsed = JSON.parse(line); } catch { continue; }
// Required fields per schema
expect(parsed).toHaveProperty('severity');
expect(parsed).toHaveProperty('confidence');
expect(parsed).toHaveProperty('path');
expect(parsed).toHaveProperty('category');
expect(parsed).toHaveProperty('summary');
expect(parsed).toHaveProperty('specialist');
break; // One valid line is enough for the gate test
}
}
}, 120_000);
});
// --- Review Army: Red Team (periodic) ---
// Verifies the Red Team (adversarial) specialist activates on large diffs.
// The fixture generates a ~500-line controller so the diff exceeds the 300+
// line activation threshold stated in the prompt.
describeIfSelected('Review Army: Red Team', ['review-army-red-team'], () => {
// Temp git repo for this suite; created in beforeAll, removed in afterAll.
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-redteam');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/large-change']);
// Create a large diff (300+ lines): 100 generated five-line methods, each
// piping an unvalidated param into process().
const lines: string[] = ['class LargeController'];
for (let i = 0; i < 100; i++) {
lines.push(` def method_${i}`);
lines.push(` data = params[:input_${i}]`);
lines.push(` process(data)`);
lines.push(' end');
lines.push('');
}
lines.push('end');
fs.writeFileSync(path.join(dir, 'large_controller.rb'), lines.join('\n'));
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add large controller']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-red-team', async () => {
const result = await runSkillTest({
prompt: `You are reviewing a large diff (300+ lines). Read review-SKILL.md.
Skip preamble, lake intro, telemetry.
The diff is large enough to activate the Red Team specialist.
Read review-specialists/red-team.md and apply it against the diff (git diff main...HEAD).
Focus on finding issues that other specialists might miss.
Write your red team findings to ${dir}/review-output.md
Start the file with "RED TEAM REVIEW" on the first line.`,
workingDirectory: dir,
// Larger turn/time budget than the other army tests: the diff is big.
maxTurns: 20,
timeout: 180_000,
testName: 'review-army-red-team',
runId,
});
logCost('/review army red-team', result);
recordE2E(evalCollector, '/review army red team', 'Review Army', result);
expect(result.exitReason).toBe('success');
// Soft-gated on the output file: it must read as an adversarial review.
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8');
expect(content.toLowerCase()).toMatch(/red team|adversarial/);
}
}, 210_000);
});
// --- Review Army: Consensus (periodic) ---
// Verifies multi-specialist consensus: a SQL injection in an auth controller
// that both the security specialist (injection) and the testing specialist
// (untested auth bypass) should flag, ideally marked MULTI-SPECIALIST CONFIRMED.
describeIfSelected('Review Army: Consensus', ['review-army-consensus'], () => {
// Temp git repo for this suite; created in beforeAll, removed in afterAll.
let dir: string;
beforeAll(() => {
const repo = setupRepo('army-consensus');
dir = repo.dir;
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'initial']);
repo.run('git', ['checkout', '-b', 'feature/vuln-auth']);
// SQL injection that both security AND testing specialists should flag:
// raw interpolation of email/password into find_by, with no covering test.
fs.writeFileSync(path.join(dir, 'auth_controller.rb'), `class AuthController
def login
user = User.find_by("email = '#{params[:email]}' AND password = '#{params[:password]}'")
if user
session[:user_id] = user.id
redirect_to root_path
else
flash[:error] = "Invalid credentials"
render :login
end
end
end
`);
repo.run('git', ['add', '.']);
repo.run('git', ['commit', '-m', 'add auth controller']);
copyReviewFiles(dir);
});
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
testConcurrentIfSelected('review-army-consensus', async () => {
const result = await runSkillTest({
prompt: `You are reviewing a git diff with a SQL injection in an auth controller.
Read review-SKILL.md, review-checklist.md, and the specialist checklists in review-specialists/.
This vulnerability should be caught by BOTH the security specialist (injection vector)
AND the testing specialist (no test for auth bypass).
Run the review. In your output, if a finding is flagged by multiple perspectives,
mark it as "MULTI-SPECIALIST CONFIRMED" with the confirming categories.
Write findings to ${dir}/review-output.md`,
workingDirectory: dir,
maxTurns: 20,
timeout: 180_000,
testName: 'review-army-consensus',
runId,
});
logCost('/review army consensus', result);
recordE2E(evalCollector, '/review army consensus', 'Review Army', result);
expect(result.exitReason).toBe('success');
// Soft-gated content check: only requires the injection itself to be found;
// the MULTI-SPECIALIST CONFIRMED marker is requested but not asserted.
const outputPath = path.join(dir, 'review-output.md');
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
// Should catch the SQL injection
const hasSqlFinding =
content.includes('sql') ||
content.includes('injection') ||
content.includes('interpolat');
expect(hasSqlFinding).toBe(true);
}
}, 210_000);
});
// Finalize eval collector — module-level afterAll, runs after every suite
// above; awaited so collected results are flushed before the process exits.
afterAll(async () => {
await finalizeEvalCollector(evalCollector);
});
+190
View File
@@ -149,6 +149,196 @@ describeIfSelected('Sidebar URL accuracy E2E', ['sidebar-url-accuracy'], () => {
}, 30_000);
});
// --- Sidebar CSS Interaction E2E (real Claude + real browser) ---
// Goes to HN, reads comments, identifies the most insightful one, highlights it.
// Exercises: navigation, snapshot, text reading, LLM judgment, CSS style injection.
// E2E: drive the sidebar agent against a live Hacker News page and verify it
// can navigate, read comments, pick one, and inject a CSS highlight.
// Requires a real browser and a real Claude backend.
//
// Fixes vs. original: afterAll now awaits the async finalizeEvalCollector
// BEFORE deleting tmpDir (the fire-and-forget call raced the rmSync), and
// the unused agentText / *LogFile / *ErrFile locals are removed.
describeIfSelected('Sidebar CSS interaction E2E', ['sidebar-css-interaction'], () => {
// Child processes for the browse server and sidebar agent; killed in afterAll.
let serverProc: Subprocess | null = null;
let agentProc: Subprocess | null = null;
// Discovered from the server's state file once it boots (port 0 = auto-pick).
let serverPort: number = 0;
let authToken: string = '';
// Scratch dir holding the server state file and the sidebar message queue.
let tmpDir: string = '';
let stateFile: string = '';
let queueFile: string = '';

// Authenticated JSON fetch against the local sidebar server.
async function api(pathname: string, opts: RequestInit = {}): Promise<Response> {
const headers: Record<string, string> = {
'Content-Type': 'application/json',
...(opts.headers as Record<string, string> || {}),
};
if (!headers['Authorization'] && authToken) {
headers['Authorization'] = `Bearer ${authToken}`;
}
return fetch(`http://127.0.0.1:${serverPort}${pathname}`, { ...opts, headers });
}

beforeAll(async () => {
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'sidebar-e2e-css-'));
stateFile = path.join(tmpDir, 'browse.json');
queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
fs.mkdirSync(path.dirname(queueFile), { recursive: true });
// Start server WITH a real browser for CSS interaction
const serverScript = path.resolve(ROOT, 'browse', 'src', 'server.ts');
// Use 'pipe' stdio — closing file descriptors kills the child on macOS/bun
serverProc = spawn(['bun', 'run', serverScript], {
env: {
...process.env,
BROWSE_STATE_FILE: stateFile,
BROWSE_PORT: '0',
SIDEBAR_QUEUE_PATH: queueFile,
BROWSE_IDLE_TIMEOUT: '600000', // 10 min in ms — test takes ~3 min
},
stdio: ['ignore', 'pipe', 'pipe'],
});
// Wait (up to 30s) for the server to write its port/token into the state file.
const deadline = Date.now() + 30000;
while (Date.now() < deadline) {
if (fs.existsSync(stateFile)) {
try {
const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
if (state.port && state.token) {
serverPort = state.port;
authToken = state.token;
break;
}
} catch {} // partially-written JSON — retry next tick
}
await new Promise(r => setTimeout(r, 200));
}
if (!serverPort) throw new Error('Server did not start in time');
// Verify server is healthy before proceeding
const healthDeadline = Date.now() + 10000;
let healthy = false;
while (Date.now() < healthDeadline) {
try {
const resp = await fetch(`http://127.0.0.1:${serverPort}/health`);
if (resp.ok) { healthy = true; break; }
} catch {}
await new Promise(r => setTimeout(r, 500));
}
if (!healthy) throw new Error('Server started but health check failed');
// Start sidebar-agent with the real browse binary; falls back to 'echo' when
// the binary is not built (the test then fails on tool-use assertions, not spawn).
const agentScript = path.resolve(ROOT, 'browse', 'src', 'sidebar-agent.ts');
const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');
// Use 'pipe' stdio — closing file descriptors kills the child on macOS/bun
agentProc = spawn(['bun', 'run', agentScript], {
env: {
...process.env,
BROWSE_SERVER_PORT: String(serverPort),
BROWSE_STATE_FILE: stateFile,
SIDEBAR_QUEUE_PATH: queueFile,
SIDEBAR_AGENT_TIMEOUT: '180000', // 3 min — multi-step HN comment task
BROWSE_BIN: fs.existsSync(browseBin) ? browseBin : 'echo',
},
stdio: ['ignore', 'pipe', 'pipe'],
});
// Give the agent a moment to connect before the test fires its command.
await new Promise(r => setTimeout(r, 2000));
}, 35000);

afterAll(async () => {
// Kill children first so nothing is still writing, then flush the eval
// collector BEFORE removing tmpDir — finalize is async and must be awaited.
if (agentProc) { try { agentProc.kill(); } catch {} }
if (serverProc) { try { serverProc.kill(); } catch {} }
await finalizeEvalCollector(evalCollector);
try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
});

testIfSelected('sidebar-css-interaction', async () => {
// Fresh session + clean queue
try { await api('/sidebar-session/new', { method: 'POST' }); } catch {}
fs.writeFileSync(queueFile, '');
const startTime = Date.now();
// Ask the agent to go to HN, find the most insightful comment, and highlight it
const resp = await api('/sidebar-command', {
method: 'POST',
body: JSON.stringify({
message: 'Go to https://news.ycombinator.com. Find the top story. Click into its comments. Read the comments and find the most insightful one. Highlight that comment with a 4px solid orange outline.',
activeTabUrl: 'about:blank',
}),
});
expect(resp.status).toBe(200);
// Poll for agent_done (4 min timeout — multi-step task with opus LLM)
const deadline = Date.now() + 240000;
let entries: any[] = [];
while (Date.now() < deadline) {
try {
const chatResp = await api('/sidebar-chat?after=0');
const data = await chatResp.json();
entries = data.entries || [];
if (entries.some((e: any) => e.type === 'agent_done')) break;
} catch (err) {
// Server may be temporarily busy or restarting — retry on connection errors
const e = err as { code?: string; message?: string };
const isConnErr = e.code === 'ConnectionRefused' || e.message?.includes('ConnectionRefused') || e.message?.includes('Unable to connect');
if (!isConnErr) throw err;
}
await new Promise(r => setTimeout(r, 3000));
}
const duration = Date.now() - startTime;
const doneEntry = entries.find((e: any) => e.type === 'agent_done');
// Dump debug info on failure
if (!doneEntry || entries.length === 0) {
console.log('ENTRIES:', JSON.stringify(entries.slice(-5), null, 2));
console.log('SERVER exitCode:', serverProc?.exitCode, 'signalCode:', serverProc?.signalCode, 'killed:', serverProc?.killed);
console.log('AGENT exitCode:', agentProc?.exitCode, 'signalCode:', agentProc?.signalCode, 'killed:', agentProc?.killed);
const queueContent = fs.existsSync(queueFile) ? fs.readFileSync(queueFile, 'utf-8').slice(-500) : 'NO QUEUE';
console.log('QUEUE:', queueContent.length > 0 ? 'has entries' : 'empty');
}
// Agent should have completed
expect(doneEntry).toBeDefined();
// Agent should have run browse commands (look for tool_use entries)
const toolUses = entries.filter((e: any) => e.type === 'tool_use');
expect(toolUses.length).toBeGreaterThanOrEqual(2); // At minimum: goto + one more
// Should have navigated to HN (look for ycombinator/HN in any entry text)
const allEntryText = entries
.map((e: any) => `${e.text || ''} ${e.input || ''} ${e.message || ''}`)
.join(' ');
const navigatedToHN = allEntryText.includes('ycombinator') || allEntryText.includes('Hacker News') || allEntryText.includes('news.ycombinator');
if (!navigatedToHN) {
console.log('ALL ENTRY TEXT (first 2000):', allEntryText.slice(0, 2000));
}
expect(navigatedToHN).toBe(true);
// Should have applied a style (look for orange/outline in tool commands).
// NOTE: appliedStyle is a soft signal recorded in the eval collector only —
// it is deliberately not a hard expect() gate.
const allText = entries.map((e: any) => e.text || '').join(' ');
const appliedStyle = allText.includes('outline') || allText.includes('orange') || allText.includes('style');
evalCollector?.addTest({
name: 'sidebar-css-interaction', suite: 'Sidebar CSS interaction E2E', tier: 'e2e',
passed: !!doneEntry && navigatedToHN && appliedStyle,
duration_ms: duration,
cost_usd: 0,
exit_reason: doneEntry ? 'success' : 'timeout',
});
}, 300_000);
});
// --- Sidebar Navigate (real Claude, requires ANTHROPIC_API_KEY) ---
describeIfSelected('Sidebar navigate E2E', ['sidebar-navigate'], () => {
+96
View File
@@ -3257,6 +3257,102 @@ Write your summary to ${benefitsDir}/benefits-summary.md`,
}, 180_000);
});
// --- Ship idempotency (#649) ---
describeIfSelected('Ship idempotency', ['ship-idempotency'], () => {
let idempDir: string;
const gitRun = (args: string[], cwd: string) =>
spawnSync('git', args, { cwd, stdio: 'pipe', timeout: 5000 });
beforeAll(() => {
idempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-ship-idemp-'));
// Create git repo with initial commit on main
gitRun(['init', '-b', 'main'], idempDir);
gitRun(['config', 'user.email', 'test@test.com'], idempDir);
gitRun(['config', 'user.name', 'Test'], idempDir);
fs.writeFileSync(path.join(idempDir, 'app.ts'), 'console.log("v1");\n');
fs.writeFileSync(path.join(idempDir, 'VERSION'), '0.1.0.0\n');
fs.writeFileSync(path.join(idempDir, 'CHANGELOG.md'), '# Changelog\n');
gitRun(['add', '.'], idempDir);
gitRun(['commit', '-m', 'initial'], idempDir);
// Create feature branch with changes
gitRun(['checkout', '-b', 'feat/my-feature'], idempDir);
fs.writeFileSync(path.join(idempDir, 'app.ts'), 'console.log("v2");\n');
gitRun(['add', 'app.ts'], idempDir);
gitRun(['commit', '-m', 'feat: update to v2'], idempDir);
// Simulate prior /ship run: bump VERSION and write CHANGELOG entry
fs.writeFileSync(path.join(idempDir, 'VERSION'), '0.2.0.0\n');
fs.writeFileSync(path.join(idempDir, 'CHANGELOG.md'),
'# Changelog\n\n## [0.2.0.0] — 2026-03-30\n\n- Updated app to v2\n');
gitRun(['add', 'VERSION', 'CHANGELOG.md'], idempDir);
gitRun(['commit', '-m', 'chore: bump version to 0.2.0.0'], idempDir);
// Extract just the idempotency-relevant sections from ship/SKILL.md
const full = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const step4Start = full.indexOf('## Step 4: Version bump');
const step4End = full.indexOf('\n---\n', step4Start);
const step7Start = full.indexOf('## Step 7: Push');
const step8End = full.indexOf('## Step 8.5');
const extracted = [
full.slice(step4Start, step4End > step4Start ? step4End : step4Start + 500),
full.slice(step7Start, step8End > step7Start ? step8End : step7Start + 500),
].join('\n\n---\n\n');
fs.writeFileSync(path.join(idempDir, 'ship-steps.md'), extracted);
});
afterAll(() => {
try { fs.rmSync(idempDir, { recursive: true, force: true }); } catch {}
});
testIfSelected('ship-idempotency', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feat/my-feature. A prior /ship run already:
- Bumped VERSION from 0.1.0.0 to 0.2.0.0
- Wrote a CHANGELOG entry for 0.2.0.0
- But the push/PR step failed
Read ship-steps.md for the idempotency check instructions from the ship workflow.
Run ONLY the idempotency checks described in Steps 4 and 7. Do NOT actually push or create PRs (there is no remote).
After running the checks, write a report to ${idempDir}/idemp-result.md containing:
- Whether VERSION was detected as ALREADY_BUMPED or not
- Whether the push was detected as ALREADY_PUSHED or PUSH_NEEDED
- The current VERSION value (should still be 0.2.0.0)
Do NOT modify VERSION or CHANGELOG. Only run the detection checks and report.`,
workingDirectory: idempDir,
maxTurns: 10,
timeout: 60_000,
testName: 'ship-idempotency',
runId,
});
logCost('/ship idempotency', result);
recordE2E('/ship idempotency guard', 'Ship idempotency', result);
expect(result.exitReason).toBe('success');
// Verify VERSION was NOT modified
const version = fs.readFileSync(path.join(idempDir, 'VERSION'), 'utf-8').trim();
expect(version).toBe('0.2.0.0');
// Verify CHANGELOG was NOT duplicated
const changelog = fs.readFileSync(path.join(idempDir, 'CHANGELOG.md'), 'utf-8');
const versionEntries = (changelog.match(/## \[0\.2\.0\.0\]/g) || []).length;
expect(versionEntries).toBe(1);
// Check the result report if it was written
const reportPath = path.join(idempDir, 'idemp-result.md');
if (fs.existsSync(reportPath)) {
const report = fs.readFileSync(reportPath, 'utf-8');
expect(report.toLowerCase()).toContain('already_bumped');
}
}, 120_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
+31 -19
View File
@@ -1268,38 +1268,49 @@ describe('Codex skill', () => {
expect(content).toContain('mktemp');
});
test('adversarial review in /review auto-scales by diff size', () => {
test('adversarial review in /review always runs both passes', () => {
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
expect(content).toContain('Adversarial review (auto-scaled)');
// Diff size thresholds
expect(content).toContain('< 50');
expect(content).toContain('50199');
expect(content).toContain('200+');
// All three tiers present
expect(content).toContain('Small');
expect(content).toContain('Medium tier');
expect(content).toContain('Large tier');
expect(content).toContain('Adversarial review (always-on)');
// Always-on: both Claude and Codex adversarial
expect(content).toContain('Claude adversarial subagent (always runs)');
expect(content).toContain('Codex adversarial challenge (always runs when available)');
// Claude adversarial subagent dispatch
expect(content).toContain('Agent tool');
expect(content).toContain('FIXABLE');
expect(content).toContain('INVESTIGATE');
// Codex fallback logic
// Codex availability check
expect(content).toContain('CODEX_NOT_AVAILABLE');
expect(content).toContain('fall back to the Claude adversarial subagent');
// Review log uses new skill name
// OLD_CFG only gates Codex, not Claude
expect(content).toContain('skip Codex passes only');
// Review log
expect(content).toContain('adversarial-review');
expect(content).toContain('reasoning_effort="high"');
expect(content).toContain('ADVERSARIAL REVIEW SYNTHESIS');
// Large diff structured review still gated
expect(content).toContain('Codex structured review (large diffs only');
expect(content).toContain('200');
});
test('adversarial review in /ship auto-scales by diff size', () => {
test('adversarial review in /ship always runs both passes', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Adversarial review (auto-scaled)');
expect(content).toContain('< 50');
expect(content).toContain('200+');
expect(content).toContain('Adversarial review (always-on)');
expect(content).toContain('adversarial-review');
expect(content).toContain('reasoning_effort="high"');
expect(content).toContain('Investigate and fix');
expect(content).toContain('Claude adversarial subagent (always runs)');
});
test('scope drift detection in /review and /ship', () => {
const reviewContent = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
// Both should contain scope drift from the shared resolver
for (const content of [reviewContent, shipContent]) {
expect(content).toContain('Scope Check:');
expect(content).toContain('DRIFT DETECTED');
expect(content).toContain('SCOPE CREEP');
expect(content).toContain('MISSING REQUIREMENTS');
expect(content).toContain('stated intent');
}
});
test('codex-host ship/review do NOT contain adversarial review step', () => {
@@ -1522,12 +1533,13 @@ describe('sidebar agent (#584)', () => {
});
// #584 — Server Write: server.ts allowedTools includes Write (DRY parity)
test('server.ts allowedTools includes Write', () => {
test('server.ts allowedTools excludes Write (agent is read-only + Bash)', () => {
const content = fs.readFileSync(path.join(ROOT, 'browse', 'src', 'server.ts'), 'utf-8');
// Find the sidebar allowedTools in the headed-mode path
const match = content.match(/--allowedTools['"]\s*,\s*['"]([^'"]+)['"]/);
expect(match).not.toBeNull();
expect(match![1]).toContain('Write');
expect(match![1]).toContain('Bash');
expect(match![1]).not.toContain('Write');
});
// #584 — Sidebar stderr: stderr handler is not empty