merge: incorporate origin/main into community-mode branch

Conflicts resolved:
- VERSION: accept main's 0.14.5.0 (higher than our 0.14.0.0)
- package.json: same version resolution
- CHANGELOG.md: drop duplicate 0.14.0.0 entry (already on main),
  keep main's entries for 0.14.1-0.14.5 and 0.13.7-0.13.10
- README.md: merge skill lists — keep main's /design-html + /learn,
  add our /gstack-submit to both install and troubleshooting lists
- docs/skills.md: keep all three entries (/gstack-submit, /autoplan, /learn)

Main brought in 0.14.1-0.14.5: design-to-code (/design-html),
comparison board chooser, sidebar CSS inspector + per-tab agents,
always-on adversarial review + scope drift, review army (7 parallel
specialist reviewers), ship idempotency, skill prefix fix.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-30 21:50:02 -07:00
128 changed files with 14314 additions and 1393 deletions
+34 -7
View File
@@ -45,15 +45,17 @@ describe('Audit compliance', () => {
expect(completionSection).toContain('_TEL" != "off"');
});
// Fix 3: W012 — Bun install is version-pinned
test('bun install commands use version pinning', () => {
// Round 2 Fix 1: W012 — Bun install uses checksum verification
test('bun install uses checksum-verified method', () => {
const browseResolver = readFileSync(join(ROOT, 'scripts/resolvers/browse.ts'), 'utf-8');
expect(browseResolver).toContain('BUN_VERSION');
// Should not have unpinned curl|bash (without BUN_VERSION on same line)
const lines = browseResolver.split('\n');
expect(browseResolver).toContain('shasum -a 256');
expect(browseResolver).toContain('BUN_INSTALL_SHA');
const setup = readFileSync(join(ROOT, 'setup'), 'utf-8');
// Setup error message should not have unverified curl|bash
const lines = setup.split('\n');
for (const line of lines) {
if (line.includes('bun.sh/install') && line.includes('bash') && !line.includes('BUN_VERSION') && !line.includes('command -v')) {
throw new Error(`Unpinned bun install found: ${line.trim()}`);
if (line.includes('bun.sh/install') && line.includes('| bash') && !line.includes('shasum')) {
throw new Error(`Unverified bun install found: ${line.trim()}`);
}
}
});
@@ -69,6 +71,17 @@ describe('Audit compliance', () => {
expect(between.toLowerCase()).toContain('untrusted');
});
// Round 2 Fix 2: Trust boundary markers + helper + wrapping in all paths
test('browse wraps untrusted content with trust boundary markers', () => {
const commands = readFileSync(join(ROOT, 'browse/src/commands.ts'), 'utf-8');
expect(commands).toContain('PAGE_CONTENT_COMMANDS');
expect(commands).toContain('wrapUntrustedContent');
const server = readFileSync(join(ROOT, 'browse/src/server.ts'), 'utf-8');
expect(server).toContain('wrapUntrustedContent');
const meta = readFileSync(join(ROOT, 'browse/src/meta-commands.ts'), 'utf-8');
expect(meta).toContain('wrapUntrustedContent');
});
// Fix 5: Data flow documentation in review.ts
test('review.ts has data flow documentation', () => {
const review = readFileSync(join(ROOT, 'scripts/resolvers/review.ts'), 'utf-8');
@@ -76,6 +89,20 @@ describe('Audit compliance', () => {
expect(review).toContain('Data NOT sent');
});
// Round 2 Fix 3: Extension sender validation + message type allowlist
test('extension background.js validates message sender', () => {
const bg = readFileSync(join(ROOT, 'extension/background.js'), 'utf-8');
expect(bg).toContain('sender.id !== chrome.runtime.id');
expect(bg).toContain('ALLOWED_TYPES');
});
// Round 2 Fix 4: Chrome CDP binds to localhost only
test('chrome-cdp binds to localhost only', () => {
const cdp = readFileSync(join(ROOT, 'bin/chrome-cdp'), 'utf-8');
expect(cdp).toContain('--remote-debugging-address=127.0.0.1');
expect(cdp).toContain('--remote-allow-origins=');
});
// Fix 2+6: All generated SKILL.md files with telemetry are conditional
test('all generated SKILL.md files with telemetry calls use conditional pattern', () => {
const skills = getAllSkillMds();
+165
View File
@@ -0,0 +1,165 @@
/**
* Tests for bin/gstack-diff-scope — verifies scope signal detection.
*
* Creates temp git repos with specific file patterns and verifies
* the correct SCOPE_* variables are output.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from 'fs';
import { join } from 'path';
import { tmpdir } from 'os';
import { spawnSync } from 'child_process';
const SCRIPT = join(import.meta.dir, '..', 'bin', 'gstack-diff-scope');
const dirs: string[] = [];

/**
 * Create a throwaway git repo with a `main` base branch and a
 * `feature/test` branch that adds `files`.
 *
 * The repo lives in a fresh temp directory, recorded in `dirs` so the
 * afterAll hook can remove it. Git failures are deliberately not fatal
 * here; the SCOPE_* assertions in the tests surface any breakage.
 *
 * @param files relative paths (with '/' separators) to create on the feature branch
 * @returns absolute path of the temp repo
 */
function createRepo(files: string[]): string {
  const dir = mkdtempSync(join(tmpdir(), 'diff-scope-test-'));
  dirs.push(dir);
  const run = (cmd: string, args: string[]) =>
    spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
  run('git', ['init', '-b', 'main']);
  run('git', ['config', 'user.email', 'test@test.com']);
  run('git', ['config', 'user.name', 'Test']);
  // Base commit so the feature branch has something to diff against.
  writeFileSync(join(dir, 'README.md'), '# test\n');
  run('git', ['add', '.']);
  run('git', ['commit', '-m', 'initial']);
  // Feature branch with the specified files.
  run('git', ['checkout', '-b', 'feature/test']);
  for (const f of files) {
    // Split the caller-supplied relative path on '/' (the separator the
    // tests use) instead of scanning the joined absolute path with
    // lastIndexOf('/'), so parent-dir creation also works where the
    // platform path separator is '\'. mkdirSync with recursive:true is a
    // no-op when the directory (possibly `dir` itself) already exists.
    const parts = f.split('/');
    mkdirSync(join(dir, ...parts.slice(0, -1)), { recursive: true });
    writeFileSync(join(dir, ...parts), '# test content\n');
  }
  run('git', ['add', '.']);
  run('git', ['commit', '-m', 'add files']);
  return dir;
}
/**
 * Run bin/gstack-diff-scope against `dir` (feature branch checked out,
 * diffed against `main`) and parse its KEY=VALUE stdout into a map.
 *
 * Fixes over the naive version:
 * - tolerates a failed spawn (stdout is null) instead of throwing on
 *   `.toString()`, and
 * - splits on the FIRST '=' only, so values containing '=' are not
 *   truncated by split('=') destructuring.
 *
 * @param dir repo produced by createRepo()
 * @returns map of SCOPE_* variable name -> value
 */
function runScope(dir: string): Record<string, string> {
  const result = spawnSync('bash', [SCRIPT, 'main'], {
    cwd: dir, stdio: 'pipe', timeout: 5000,
  });
  // stdout is null when the process could not be spawned at all.
  const output = (result.stdout ?? '').toString().trim();
  const vars: Record<string, string> = {};
  for (const line of output.split('\n')) {
    const eq = line.indexOf('=');
    if (eq <= 0) continue; // blank or malformed line
    const val = line.slice(eq + 1);
    // Preserve the original behavior of skipping empty values.
    if (val) vars[line.slice(0, eq)] = val;
  }
  return vars;
}
// Remove every temp repo created during the run. Cleanup is best-effort:
// failures are swallowed because leftover temp dirs are harmless and the
// OS reclaims them eventually.
afterAll(() => {
  dirs.forEach((repoDir) => {
    try {
      rmSync(repoDir, { recursive: true, force: true });
    } catch {
      // ignore — best-effort cleanup only
    }
  });
});
describe('gstack-diff-scope', () => {
// Each test builds an isolated temp git repo whose feature branch adds
// only the listed files, runs bin/gstack-diff-scope against main, and
// asserts on the SCOPE_* variables the script prints.
// --- Existing scope signals ---
test('detects frontend files', () => {
const dir = createRepo(['styles.css', 'component.tsx']);
const scope = runScope(dir);
expect(scope.SCOPE_FRONTEND).toBe('true');
});
test('detects backend files', () => {
const dir = createRepo(['app.rb', 'service.py']);
const scope = runScope(dir);
expect(scope.SCOPE_BACKEND).toBe('true');
});
test('detects test files', () => {
const dir = createRepo(['test/app.test.ts']);
const scope = runScope(dir);
expect(scope.SCOPE_TESTS).toBe('true');
});
// --- New scope signals (Review Army) ---
// Migrations are detected via three layout conventions: Rails
// (db/migrate/), generic (migrations/), and Prisma (prisma/migrations/).
test('detects migrations via db/migrate/', () => {
const dir = createRepo(['db/migrate/20260330_create_users.rb']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('true');
});
test('detects migrations via generic migrations/', () => {
const dir = createRepo(['app/migrations/0001_initial.py']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('true');
});
test('detects migrations via prisma', () => {
const dir = createRepo(['prisma/migrations/20260330/migration.sql']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('true');
});
// API surface is detected via controllers, route files, and GraphQL schemas.
test('detects API via controller files', () => {
const dir = createRepo(['app/controllers/users_controller.rb']);
const scope = runScope(dir);
expect(scope.SCOPE_API).toBe('true');
});
test('detects API via route files', () => {
const dir = createRepo(['src/routes/api.ts']);
const scope = runScope(dir);
expect(scope.SCOPE_API).toBe('true');
});
test('detects API via GraphQL schemas', () => {
const dir = createRepo(['schema.graphql']);
const scope = runScope(dir);
expect(scope.SCOPE_API).toBe('true');
});
// Auth is detected via auth/session/jwt substrings in file names —
// NOTE(review): presumably name-based matching in the script; these
// three tests cover one filename per keyword, not path-based matches.
test('detects auth files', () => {
const dir = createRepo(['app/services/auth_service.rb']);
const scope = runScope(dir);
expect(scope.SCOPE_AUTH).toBe('true');
});
test('detects session files', () => {
const dir = createRepo(['lib/session_manager.ts']);
const scope = runScope(dir);
expect(scope.SCOPE_AUTH).toBe('true');
});
test('detects JWT files', () => {
const dir = createRepo(['utils/jwt_helper.py']);
const scope = runScope(dir);
expect(scope.SCOPE_AUTH).toBe('true');
});
// Negative case: signals must be explicitly 'false' (not merely absent)
// when no matching files are in the diff.
test('returns false for all new signals when no matching files', () => {
const dir = createRepo(['docs/readme.md', 'config.yml']);
const scope = runScope(dir);
expect(scope.SCOPE_MIGRATIONS).toBe('false');
expect(scope.SCOPE_API).toBe('false');
expect(scope.SCOPE_AUTH).toBe('false');
});
// Pins the script's full output contract: exactly these 9 variables,
// no more, no fewer. A new signal must update this test.
test('outputs all 9 scope variables', () => {
const dir = createRepo(['app.ts']);
const scope = runScope(dir);
expect(Object.keys(scope)).toHaveLength(9);
expect(scope).toHaveProperty('SCOPE_FRONTEND');
expect(scope).toHaveProperty('SCOPE_BACKEND');
expect(scope).toHaveProperty('SCOPE_PROMPTS');
expect(scope).toHaveProperty('SCOPE_TESTS');
expect(scope).toHaveProperty('SCOPE_DOCS');
expect(scope).toHaveProperty('SCOPE_CONFIG');
expect(scope).toHaveProperty('SCOPE_MIGRATIONS');
expect(scope).toHaveProperty('SCOPE_API');
expect(scope).toHaveProperty('SCOPE_AUTH');
});
});
+5
View File
@@ -0,0 +1,5 @@
-- Migration: Drop user email column
-- WARNING: This migration is intentionally unsafe for testing
-- NOTE(review): test fixture — presumably consumed by the review-army
-- migration-safety eval. It must STAY unsafe; do not add backfills,
-- reversibility checks, or guards here.
ALTER TABLE users DROP COLUMN email;
ALTER TABLE users DROP COLUMN phone_number;
-- No backfill, no reversibility check, no data preservation
+12
View File
@@ -0,0 +1,12 @@
# N+1 query example — intentionally bad for testing
# NOTE(review): fixture — presumably exercised by the review-army
# performance specialist eval (perf-n-plus-one). Keep the N+1 pattern
# intact; do not "fix" it with includes/preload.
class PostsController
def index
# One query to load all posts, then one extra query per post below.
@posts = Post.all
@posts.each do |post|
# N+1: queries Author table for every post
puts post.author.name
# N+1: queries Comments table for every post
puts post.comments.count
end
end
end
+333 -24
View File
@@ -586,10 +586,12 @@ describe('REVIEW_DASHBOARD resolver', () => {
expect(content).toContain('/plan-ceo-review');
});
test('plan-design-review chaining mentions eng and ceo reviews', () => {
test('plan-design-review chaining mentions eng, ceo, and design skills', () => {
const content = fs.readFileSync(path.join(ROOT, 'plan-design-review', 'SKILL.md'), 'utf-8');
expect(content).toContain('/plan-eng-review');
expect(content).toContain('/plan-ceo-review');
expect(content).toContain('/design-shotgun');
expect(content).toContain('/design-html');
});
test('ship does NOT contain review chaining', () => {
@@ -605,7 +607,8 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
test('all three modes share codepath tracing methodology', () => {
test('plan and ship modes share codepath tracing methodology', () => {
// Review mode delegates test coverage to the Testing specialist subagent (Review Army)
const sharedPhrases = [
'Trace data flow',
'Diagram the execution',
@@ -617,33 +620,40 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
for (const phrase of sharedPhrases) {
expect(planSkill).toContain(phrase);
expect(shipSkill).toContain(phrase);
expect(reviewSkill).toContain(phrase);
}
// Plan mode traces the plan, not a git diff
expect(planSkill).toContain('Trace every codepath in the plan');
expect(planSkill).not.toContain('git diff origin');
// Ship and review modes trace the diff
// Ship mode traces the diff
expect(shipSkill).toContain('Trace every codepath changed');
expect(reviewSkill).toContain('Trace every codepath changed');
});
test('all three modes include E2E decision matrix', () => {
for (const skill of [planSkill, shipSkill, reviewSkill]) {
test('review mode uses Review Army for specialist dispatch', () => {
expect(reviewSkill).toContain('Review Army');
expect(reviewSkill).toContain('Specialist Dispatch');
expect(reviewSkill).toContain('testing.md');
});
test('plan and ship modes include E2E decision matrix', () => {
// Review mode delegates to Testing specialist
for (const skill of [planSkill, shipSkill]) {
expect(skill).toContain('E2E Test Decision Matrix');
expect(skill).toContain('→E2E');
expect(skill).toContain('→EVAL');
}
});
test('all three modes include regression rule', () => {
for (const skill of [planSkill, shipSkill, reviewSkill]) {
test('plan and ship modes include regression rule', () => {
// Review mode delegates to Testing specialist
for (const skill of [planSkill, shipSkill]) {
expect(skill).toContain('REGRESSION RULE');
expect(skill).toContain('IRON RULE');
}
});
test('all three modes include test framework detection', () => {
for (const skill of [planSkill, shipSkill, reviewSkill]) {
test('plan and ship modes include test framework detection', () => {
// Review mode delegates to Testing specialist
for (const skill of [planSkill, shipSkill]) {
expect(skill).toContain('Test Framework Detection');
expect(skill).toContain('CLAUDE.md');
}
@@ -662,11 +672,12 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
expect(shipSkill).toContain('ship-test-plan');
});
test('review mode generates via Fix-First + gaps are INFORMATIONAL', () => {
test('review mode uses Fix-First + Review Army for specialist coverage', () => {
expect(reviewSkill).toContain('Fix-First');
expect(reviewSkill).toContain('INFORMATIONAL');
expect(reviewSkill).toContain('Step 4.75');
expect(reviewSkill).toContain('subsumes the "Test Gaps" category');
// Review Army handles test coverage via Testing specialist subagent
expect(reviewSkill).toContain('Review Army');
expect(reviewSkill).toContain('Testing');
});
test('plan mode does NOT include ship-specific content', () => {
@@ -681,6 +692,35 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
expect(reviewSkill).not.toContain('ship-test-plan');
});
test('review/specialists/ directory has all expected checklist files', () => {
const specDir = path.join(ROOT, 'review', 'specialists');
const expected = [
'testing.md',
'maintainability.md',
'security.md',
'performance.md',
'data-migration.md',
'api-contract.md',
'red-team.md',
];
for (const f of expected) {
expect(fs.existsSync(path.join(specDir, f))).toBe(true);
}
});
test('each specialist file has standard header with scope and output format', () => {
const specDir = path.join(ROOT, 'review', 'specialists');
const files = fs.readdirSync(specDir).filter(f => f.endsWith('.md'));
for (const f of files) {
const content = fs.readFileSync(path.join(specDir, f), 'utf-8');
// All specialist files must have Scope and Output/JSON in header
expect(content).toContain('Scope:');
expect(content.toLowerCase()).toMatch(/output|json/);
// Must define NO FINDINGS behavior
expect(content).toContain('NO FINDINGS');
}
});
// Regression guard: ship output contains key phrases from before the refactor
test('ship SKILL.md regression guard — key phrases preserved', () => {
const regressionPhrases = [
@@ -868,12 +908,9 @@ describe('Coverage gate in ship', () => {
expect(shipSkill).toContain('could not determine percentage — skipping');
});
test('review SKILL.md contains coverage WARNING', () => {
expect(reviewSkill).toContain('COVERAGE WARNING');
expect(reviewSkill).toContain('Consider writing tests before running /ship');
});
test('review coverage warning is INFORMATIONAL', () => {
test('review SKILL.md delegates coverage to Testing specialist', () => {
// Coverage audit moved to Testing specialist subagent in Review Army
expect(reviewSkill).toContain('testing.md');
expect(reviewSkill).toContain('INFORMATIONAL');
});
});
@@ -1153,6 +1190,138 @@ describe('BENEFITS_FROM resolver', () => {
expect(ceoContent).toContain('office-hours/SKILL.md');
expect(engContent).toContain('office-hours/SKILL.md');
});
test('BENEFITS_FROM delegates to INVOKE_SKILL pattern', () => {
// Should contain the INVOKE_SKILL-style loading prose (not the old manual skip list)
expect(engContent).toContain('Follow its instructions from top to bottom');
expect(engContent).toContain('skipping these sections');
expect(ceoContent).toContain('Follow its instructions from top to bottom');
});
});
// --- {{INVOKE_SKILL}} resolver tests ---
describe('INVOKE_SKILL resolver', () => {
const ceoContent = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8');
test('plan-ceo-review uses INVOKE_SKILL for mid-session office-hours fallback', () => {
// The mid-session detection path should use INVOKE_SKILL-generated prose
expect(ceoContent).toContain('office-hours/SKILL.md');
expect(ceoContent).toContain('Follow its instructions from top to bottom');
});
test('INVOKE_SKILL output includes default skip list', () => {
expect(ceoContent).toContain('Preamble (run first)');
expect(ceoContent).toContain('Telemetry (run last)');
expect(ceoContent).toContain('AskUserQuestion Format');
});
test('INVOKE_SKILL output includes error handling', () => {
expect(ceoContent).toContain('If unreadable');
expect(ceoContent).toContain('Could not load');
});
test('template uses {{INVOKE_SKILL:office-hours}} placeholder', () => {
const tmpl = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md.tmpl'), 'utf-8');
expect(tmpl).toContain('{{INVOKE_SKILL:office-hours}}');
});
});
// --- {{CHANGELOG_WORKFLOW}} resolver tests ---
describe('CHANGELOG_WORKFLOW resolver', () => {
const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
test('ship SKILL.md contains changelog workflow', () => {
expect(shipContent).toContain('CHANGELOG (auto-generate)');
expect(shipContent).toContain('git log <base>..HEAD --oneline');
});
test('changelog workflow includes cross-check step', () => {
expect(shipContent).toContain('Cross-check');
expect(shipContent).toContain('Every commit must map to at least one bullet point');
});
test('changelog workflow includes voice guidance', () => {
expect(shipContent).toContain('Lead with what the user can now **do**');
});
test('template uses {{CHANGELOG_WORKFLOW}} placeholder', () => {
const tmpl = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md.tmpl'), 'utf-8');
expect(tmpl).toContain('{{CHANGELOG_WORKFLOW}}');
// Should NOT contain the old inline changelog content
expect(tmpl).not.toContain('Group commits by theme');
});
test('changelog workflow includes keep-changelog format', () => {
expect(shipContent).toContain('### Added');
expect(shipContent).toContain('### Fixed');
});
});
// --- Parameterized resolver infrastructure tests ---
describe('parameterized resolver support', () => {
test('gen-skill-docs regex handles colon-separated args', () => {
// Verify the template containing {{INVOKE_SKILL:office-hours}} was processed
// without leaving unresolved placeholders
const ceoContent = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8');
expect(ceoContent).not.toMatch(/\{\{INVOKE_SKILL:[^}]+\}\}/);
});
test('templates with parameterized resolvers pass unresolved check', () => {
// All generated SKILL.md files should have no unresolved {{...}} placeholders
const skillDirs = fs.readdirSync(ROOT).filter(d =>
fs.existsSync(path.join(ROOT, d, 'SKILL.md'))
);
for (const dir of skillDirs) {
const content = fs.readFileSync(path.join(ROOT, dir, 'SKILL.md'), 'utf-8');
const unresolved = content.match(/\{\{[A-Z_]+(?::[^}]*)?\}\}/g);
if (unresolved) {
throw new Error(`${dir}/SKILL.md has unresolved placeholders: ${unresolved.join(', ')}`);
}
}
});
});
// --- Preamble routing injection tests ---
describe('preamble routing injection', () => {
const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
test('preamble bash checks for routing section in CLAUDE.md', () => {
expect(shipContent).toContain('grep -q "## Skill routing" CLAUDE.md');
expect(shipContent).toContain('HAS_ROUTING');
});
test('preamble bash reads routing_declined config', () => {
expect(shipContent).toContain('routing_declined');
expect(shipContent).toContain('ROUTING_DECLINED');
});
test('preamble includes routing injection AskUserQuestion', () => {
expect(shipContent).toContain('Add routing rules to CLAUDE.md');
expect(shipContent).toContain("I'll invoke skills manually");
});
test('routing injection respects prior decline', () => {
expect(shipContent).toContain('ROUTING_DECLINED');
expect(shipContent).toMatch(/routing_declined.*true/);
});
test('routing injection only fires when all conditions met', () => {
// Must be: HAS_ROUTING=no AND ROUTING_DECLINED=false AND PROACTIVE_PROMPTED=yes
expect(shipContent).toContain('HAS_ROUTING');
expect(shipContent).toContain('ROUTING_DECLINED');
expect(shipContent).toContain('PROACTIVE_PROMPTED');
});
test('routing section content includes key routing rules', () => {
expect(shipContent).toContain('invoke office-hours');
expect(shipContent).toContain('invoke investigate');
expect(shipContent).toContain('invoke ship');
expect(shipContent).toContain('invoke qa');
});
});
// --- {{DESIGN_OUTSIDE_VOICES}} resolver tests ---
@@ -1470,10 +1639,9 @@ describe('Codex generation (--host codex)', () => {
const content = fs.readFileSync(path.join(AGENTS_DIR, 'gstack-review', 'SKILL.md'), 'utf-8');
// Correct: references to sidecar files use gstack/review/ path
expect(content).toContain('.agents/skills/gstack/review/checklist.md');
expect(content).toContain('.agents/skills/gstack/review/design-checklist.md');
// design-checklist.md is now referenced via Review Army specialist (Claude only, stripped for Codex)
// Wrong: must NOT reference gstack-review/checklist.md (file doesn't exist there)
expect(content).not.toContain('.agents/skills/gstack-review/checklist.md');
expect(content).not.toContain('.agents/skills/gstack-review/design-checklist.md');
});
test('sidecar paths in ship skill point to gstack/review/ for pre-landing review', () => {
@@ -1793,11 +1961,12 @@ describe('setup script validation', () => {
});
test('link_claude_skill_dirs creates relative symlinks', () => {
// Claude links should be relative: ln -snf "gstack/skill_name"
// Claude links should be relative: ln -snf "gstack/$dir_name"
// Uses dir_name (not skill_name) because symlink target must point to the physical directory
const fnStart = setupContent.indexOf('link_claude_skill_dirs()');
const fnEnd = setupContent.indexOf('}', setupContent.indexOf('linked[@]}', fnStart));
const fnBody = setupContent.slice(fnStart, fnEnd);
expect(fnBody).toContain('ln -snf "gstack/$skill_name"');
expect(fnBody).toContain('ln -snf "gstack/$dir_name"');
});
test('setup supports --host auto|claude|codex|kiro', () => {
@@ -2036,6 +2205,100 @@ describe('telemetry', () => {
});
});
describe('community fixes wave', () => {
// Helper to get all generated SKILL.md files
function getAllSkillMds(): Array<{ name: string; content: string }> {
const results: Array<{ name: string; content: string }> = [];
const rootPath = path.join(ROOT, 'SKILL.md');
if (fs.existsSync(rootPath)) {
results.push({ name: 'root', content: fs.readFileSync(rootPath, 'utf-8') });
}
for (const entry of fs.readdirSync(ROOT, { withFileTypes: true })) {
if (!entry.isDirectory() || entry.name.startsWith('.') || entry.name === 'node_modules') continue;
const skillPath = path.join(ROOT, entry.name, 'SKILL.md');
if (fs.existsSync(skillPath)) {
results.push({ name: entry.name, content: fs.readFileSync(skillPath, 'utf-8') });
}
}
return results;
}
// #594 — Discoverability: every SKILL.md.tmpl description contains "gstack"
test('every SKILL.md.tmpl description contains "gstack"', () => {
for (const skill of ALL_SKILLS) {
const tmplPath = skill.dir === '.' ? path.join(ROOT, 'SKILL.md.tmpl') : path.join(ROOT, skill.dir, 'SKILL.md.tmpl');
const content = fs.readFileSync(tmplPath, 'utf-8');
const desc = extractDescription(content);
expect(desc.toLowerCase()).toContain('gstack');
}
});
// #594 — Discoverability: first line of each description is under 120 chars
test('every SKILL.md.tmpl description first line is under 120 chars', () => {
for (const skill of ALL_SKILLS) {
const tmplPath = skill.dir === '.' ? path.join(ROOT, 'SKILL.md.tmpl') : path.join(ROOT, skill.dir, 'SKILL.md.tmpl');
const content = fs.readFileSync(tmplPath, 'utf-8');
const desc = extractDescription(content);
const firstLine = desc.split('\n')[0];
expect(firstLine.length).toBeLessThanOrEqual(120);
}
});
// #573 — Feature signals: ship/SKILL.md contains feature signal detection
test('ship/SKILL.md contains feature signal detection in Step 4', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content.toLowerCase()).toContain('feature signal');
});
// #510 — Context warnings: no SKILL.md contains "running low on context"
test('no generated SKILL.md contains "running low on context"', () => {
const skills = getAllSkillMds();
for (const { name, content } of skills) {
expect(content).not.toContain('running low on context');
}
});
// #510 — Context warnings: plan-eng-review has explicit anti-warning
test('plan-eng-review/SKILL.md contains "Do not preemptively warn"', () => {
const content = fs.readFileSync(path.join(ROOT, 'plan-eng-review', 'SKILL.md'), 'utf-8');
expect(content).toContain('Do not preemptively warn');
});
// #474 — Safety Net: no SKILL.md uses find with -delete
test('no generated SKILL.md contains find with -delete flag', () => {
const skills = getAllSkillMds();
for (const { name, content } of skills) {
// Match find commands that use -delete (but not prose mentioning the word "delete")
const lines = content.split('\n');
for (const line of lines) {
if (line.includes('find ') && line.includes('-delete')) {
throw new Error(`${name}/SKILL.md contains find with -delete: ${line.trim()}`);
}
}
}
});
// #467 — Telemetry: preamble JSONL writes are gated by telemetry setting
test('preamble JSONL writes are inside telemetry conditional', () => {
const preamble = fs.readFileSync(path.join(ROOT, 'scripts/resolvers/preamble.ts'), 'utf-8');
// Find all skill-usage.jsonl write lines
const lines = preamble.split('\n');
for (let i = 0; i < lines.length; i++) {
if (lines[i].includes('skill-usage.jsonl') && lines[i].includes('>>')) {
// Look backwards for a telemetry conditional within 5 lines
let foundConditional = false;
for (let j = i - 1; j >= Math.max(0, i - 5); j--) {
if (lines[j].includes('_TEL') && lines[j].includes('off')) {
foundConditional = true;
break;
}
}
expect(foundConditional).toBe(true);
}
}
});
});
describe('codex commands must not use inline $(git rev-parse --show-toplevel) for cwd', () => {
// Regression test: inline $(git rev-parse --show-toplevel) in codex exec -C
// or codex review without cd evaluates in whatever cwd the background shell
@@ -2233,3 +2496,49 @@ describe('CONFIDENCE_CALIBRATION resolver', () => {
}
});
});
describe('gen-skill-docs prefix warning (#620/#578)', () => {
// CommonJS require inside the suite — NOTE(review): assumes the runner
// (bun) allows require() alongside ESM imports in this file.
const { execSync } = require('child_process');
// Positive case: a config with skill_prefix: true must make the
// generator print the warning and point at gstack-relink. The fake HOME
// isolates the test from the developer's real ~/.gstack/config.yaml —
// NOTE(review): HOME override is POSIX-only; confirm if Windows CI matters.
test('warns about skill_prefix when config has prefix=true', () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-prefix-warn-'));
try {
// Create a fake ~/.gstack/config.yaml with skill_prefix: true
const fakeHome = tmpDir;
const fakeGstack = path.join(fakeHome, '.gstack');
fs.mkdirSync(fakeGstack, { recursive: true });
fs.writeFileSync(path.join(fakeGstack, 'config.yaml'), 'skill_prefix: true\n');
const output = execSync('bun run scripts/gen-skill-docs.ts', {
cwd: ROOT,
env: { ...process.env, HOME: fakeHome },
encoding: 'utf-8',
timeout: 30000,
});
expect(output).toContain('skill_prefix is true');
expect(output).toContain('gstack-relink');
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
});
// Negative case: an explicit skill_prefix: false must NOT trigger the
// warning (the "absent" half of the test name is covered by the same
// code path reading a config without the key).
test('no warning when skill_prefix is false or absent', () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-prefix-warn-'));
try {
const fakeHome = tmpDir;
const fakeGstack = path.join(fakeHome, '.gstack');
fs.mkdirSync(fakeGstack, { recursive: true });
fs.writeFileSync(path.join(fakeGstack, 'config.yaml'), 'skill_prefix: false\n');
const output = execSync('bun run scripts/gen-skill-docs.ts', {
cwd: ROOT,
env: { ...process.env, HOME: fakeHome },
encoding: 'utf-8',
timeout: 30000,
});
expect(output).not.toContain('skill_prefix is true');
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
});
});
+22
View File
@@ -59,6 +59,15 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'review-base-branch': ['review/**'],
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
// Review Army (specialist dispatch)
'review-army-migration-safety': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'],
'review-army-perf-n-plus-one': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'],
'review-army-delivery-audit': ['review/**', 'scripts/resolvers/review.ts', 'scripts/resolvers/review-army.ts'],
'review-army-quality-score': ['review/**', 'scripts/resolvers/review-army.ts'],
'review-army-json-findings': ['review/**', 'scripts/resolvers/review-army.ts'],
'review-army-red-team': ['review/**', 'scripts/resolvers/review-army.ts'],
'review-army-consensus': ['review/**', 'scripts/resolvers/review-army.ts'],
// Office Hours
'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
@@ -122,6 +131,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Plan completion audit + verification
'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
'ship-plan-verification': ['ship/**', 'qa-only/**', 'scripts/gen-skill-docs.ts'],
'ship-idempotency': ['ship/**', 'scripts/resolvers/utility.ts'],
'review-plan-completion': ['review/**', 'scripts/gen-skill-docs.ts'],
// Design
@@ -152,6 +162,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Sidebar agent
'sidebar-navigate': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/**'],
'sidebar-url-accuracy': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/background.js'],
'sidebar-css-interaction': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/write-commands.ts', 'browse/src/read-commands.ts', 'browse/src/cdp-inspector.ts', 'extension/**'],
// Autoplan
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
@@ -203,6 +214,15 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'review-plan-completion': 'gate',
'review-dashboard-via': 'gate',
// Review Army — gate for core functionality, periodic for multi-specialist
'review-army-migration-safety': 'gate', // Specialist activation guardrail
'review-army-perf-n-plus-one': 'gate', // Specialist activation guardrail
'review-army-delivery-audit': 'gate', // Delivery integrity guardrail
'review-army-quality-score': 'gate', // Score computation
'review-army-json-findings': 'gate', // JSON schema compliance
'review-army-red-team': 'periodic', // Multi-agent coordination
'review-army-consensus': 'periodic', // Multi-specialist agreement
// Office Hours
'office-hours-spec-review': 'gate',
@@ -228,6 +248,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'ship-triage': 'gate',
'ship-plan-completion': 'gate',
'ship-plan-verification': 'gate',
'ship-idempotency': 'periodic',
// Retro — gate for cheap branch detection, periodic for full Opus retro
'retro': 'periodic',
@@ -282,6 +303,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// Sidebar agent
'sidebar-navigate': 'periodic',
'sidebar-url-accuracy': 'periodic',
'sidebar-css-interaction': 'periodic',
// Autoplan — periodic (not yet implemented)
'autoplan-core': 'periodic',
+229
View File
@@ -0,0 +1,229 @@
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { execSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
const BIN = path.join(ROOT, 'bin');
let tmpDir: string;
let skillsDir: string;
let installDir: string;
/**
 * Run a shell command from the repo root with GSTACK_STATE_DIR pointed
 * at this test's temp dir, returning trimmed stdout.
 *
 * @param cmd full shell command line
 * @param env extra environment variables merged over process.env
 * @param expectFail when true, a non-zero exit returns the child's
 *   stderr (or stdout) instead of throwing, so callers can assert on
 *   the error message
 * @returns trimmed stdout (or captured error output when expectFail)
 * @throws the execSync error when the command fails and expectFail is false
 */
function run(cmd: string, env: Record<string, string> = {}, expectFail = false): string {
  try {
    return execSync(cmd, {
      cwd: ROOT,
      env: { ...process.env, GSTACK_STATE_DIR: tmpDir, ...env },
      encoding: 'utf-8',
      timeout: 10000,
      stdio: ['pipe', 'pipe', 'pipe'],
    }).trim();
  } catch (e: unknown) {
    // execSync's error object carries the child's captured streams;
    // narrow from unknown instead of typing the catch variable as any.
    if (expectFail) {
      const err = e as { stderr?: unknown; stdout?: unknown };
      return String(err.stderr || err.stdout || '').trim();
    }
    throw e;
  }
}
// Build a throwaway gstack install (bin scripts + skill dirs) plus an empty
// skills dir inside tmpDir, so relink behavior can be exercised without
// touching the real installation. Populates the module-level installDir and
// skillsDir used by the tests.
function setupMockInstall(skills: string[]): void {
  installDir = path.join(tmpDir, 'gstack-install');
  skillsDir = path.join(tmpDir, 'skills');
  fs.mkdirSync(installDir, { recursive: true });
  fs.mkdirSync(skillsDir, { recursive: true });

  const mockBin = path.join(installDir, 'bin');
  fs.mkdirSync(mockBin, { recursive: true });

  // Copy a real bin script into the mock install and mark it executable.
  // Required scripts are copied unconditionally (a missing source should
  // surface as an error); optional ones are skipped when absent.
  const copyScript = (name: string, required: boolean): void => {
    const src = path.join(BIN, name);
    if (!required && !fs.existsSync(src)) return;
    const dest = path.join(mockBin, name);
    fs.copyFileSync(src, dest);
    fs.chmodSync(dest, 0o755);
  };
  copyScript('gstack-config', true);
  copyScript('gstack-relink', false);
  copyScript('gstack-patch-names', false);

  // Each mock skill gets a minimal SKILL.md with valid frontmatter so the
  // name:-patching logic has something real to operate on.
  for (const skill of skills) {
    fs.mkdirSync(path.join(installDir, skill), { recursive: true });
    fs.writeFileSync(
      path.join(installDir, skill, 'SKILL.md'),
      `---\nname: ${skill}\ndescription: test\n---\n# ${skill}`
    );
  }
}
// Fresh isolated state dir per test; removed afterwards so runs don't leak.
beforeEach(() => {
  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-relink-test-'));
});
afterEach(() => {
  fs.rmSync(tmpDir, { recursive: true, force: true });
});
// Coverage for bin/gstack-relink: symlink creation in both prefix modes,
// stale-link cleanup when switching modes, double-prefix avoidance, the
// missing-install error path, and the gstack-config set hook.
describe('gstack-relink (#578)', () => {
  // Test 11: prefixed symlinks when skill_prefix=true
  test('creates gstack-* symlinks when skill_prefix=true', () => {
    setupMockInstall(['qa', 'ship', 'review']);
    // Set config to prefix mode
    run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
    // Run relink with env pointing to the mock install
    const output = run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
      GSTACK_INSTALL_DIR: installDir,
      GSTACK_SKILLS_DIR: skillsDir,
    });
    // Verify gstack-* symlinks exist
    expect(fs.existsSync(path.join(skillsDir, 'gstack-qa'))).toBe(true);
    expect(fs.existsSync(path.join(skillsDir, 'gstack-ship'))).toBe(true);
    expect(fs.existsSync(path.join(skillsDir, 'gstack-review'))).toBe(true);
    expect(output).toContain('gstack-');
  });
  // Test 12: flat symlinks when skill_prefix=false
  test('creates flat symlinks when skill_prefix=false', () => {
    setupMockInstall(['qa', 'ship', 'review']);
    run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix false`);
    const output = run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
      GSTACK_INSTALL_DIR: installDir,
      GSTACK_SKILLS_DIR: skillsDir,
    });
    expect(fs.existsSync(path.join(skillsDir, 'qa'))).toBe(true);
    expect(fs.existsSync(path.join(skillsDir, 'ship'))).toBe(true);
    expect(fs.existsSync(path.join(skillsDir, 'review'))).toBe(true);
    expect(output).toContain('flat');
  });
  // Test 13: cleans stale symlinks from opposite mode
  test('cleans up stale symlinks from opposite mode', () => {
    setupMockInstall(['qa', 'ship']);
    // Create prefixed symlinks first
    run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
    run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
      GSTACK_INSTALL_DIR: installDir,
      GSTACK_SKILLS_DIR: skillsDir,
    });
    expect(fs.existsSync(path.join(skillsDir, 'gstack-qa'))).toBe(true);
    // Switch to flat mode
    run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix false`);
    run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
      GSTACK_INSTALL_DIR: installDir,
      GSTACK_SKILLS_DIR: skillsDir,
    });
    // Flat symlinks should exist, prefixed should be gone
    expect(fs.existsSync(path.join(skillsDir, 'qa'))).toBe(true);
    expect(fs.existsSync(path.join(skillsDir, 'gstack-qa'))).toBe(false);
  });
  // Test 14: error when install dir missing
  // NOTE: uses the REAL script in BIN (not the mock install) on purpose —
  // the scenario under test is relink pointed at a nonexistent install.
  test('prints error when install dir missing', () => {
    const output = run(`${BIN}/gstack-relink`, {
      GSTACK_INSTALL_DIR: '/nonexistent/path/gstack',
      GSTACK_SKILLS_DIR: '/nonexistent/path/skills',
    }, true);
    expect(output).toContain('setup');
  });
  // Test: gstack-upgrade does NOT get double-prefixed
  test('does not double-prefix gstack-upgrade directory', () => {
    setupMockInstall(['qa', 'ship', 'gstack-upgrade']);
    run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
    run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
      GSTACK_INSTALL_DIR: installDir,
      GSTACK_SKILLS_DIR: skillsDir,
    });
    // gstack-upgrade should keep its name, NOT become gstack-gstack-upgrade
    expect(fs.existsSync(path.join(skillsDir, 'gstack-upgrade'))).toBe(true);
    expect(fs.existsSync(path.join(skillsDir, 'gstack-gstack-upgrade'))).toBe(false);
    // Regular skills still get prefixed
    expect(fs.existsSync(path.join(skillsDir, 'gstack-qa'))).toBe(true);
  });
  // Test 15: gstack-config set skill_prefix triggers relink
  test('gstack-config set skill_prefix triggers relink', () => {
    setupMockInstall(['qa', 'ship']);
    // Run gstack-config set which should auto-trigger relink
    // (no explicit gstack-relink call here — symlink presence proves the hook fired)
    run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`, {
      GSTACK_INSTALL_DIR: installDir,
      GSTACK_SKILLS_DIR: skillsDir,
    });
    // If relink was triggered, symlinks should exist
    expect(fs.existsSync(path.join(skillsDir, 'gstack-qa'))).toBe(true);
    expect(fs.existsSync(path.join(skillsDir, 'gstack-ship'))).toBe(true);
  });
});
// Coverage for the SKILL.md frontmatter name: patching that accompanies
// relink: prefixing, restoring, double-prefix avoidance, and tolerance of
// files without frontmatter.
describe('gstack-patch-names (#620/#578)', () => {
  // Helper to read name: from SKILL.md frontmatter
  // (first `name:` line anywhere in the file — frontmatter appears first in
  // the fixtures written by setupMockInstall)
  function readSkillName(skillDir: string): string | null {
    const content = fs.readFileSync(path.join(skillDir, 'SKILL.md'), 'utf-8');
    const match = content.match(/^name:\s*(.+)$/m);
    return match ? match[1].trim() : null;
  }
  test('prefix=true patches name: field in SKILL.md', () => {
    setupMockInstall(['qa', 'ship', 'review']);
    run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
    run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
      GSTACK_INSTALL_DIR: installDir,
      GSTACK_SKILLS_DIR: skillsDir,
    });
    // Verify name: field is patched with gstack- prefix
    expect(readSkillName(path.join(installDir, 'qa'))).toBe('gstack-qa');
    expect(readSkillName(path.join(installDir, 'ship'))).toBe('gstack-ship');
    expect(readSkillName(path.join(installDir, 'review'))).toBe('gstack-review');
  });
  test('prefix=false restores name: field in SKILL.md', () => {
    setupMockInstall(['qa', 'ship']);
    // First, prefix them
    run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
    run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
      GSTACK_INSTALL_DIR: installDir,
      GSTACK_SKILLS_DIR: skillsDir,
    });
    expect(readSkillName(path.join(installDir, 'qa'))).toBe('gstack-qa');
    // Now switch to flat mode
    run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix false`);
    run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
      GSTACK_INSTALL_DIR: installDir,
      GSTACK_SKILLS_DIR: skillsDir,
    });
    // Verify name: field is restored to unprefixed
    expect(readSkillName(path.join(installDir, 'qa'))).toBe('qa');
    expect(readSkillName(path.join(installDir, 'ship'))).toBe('ship');
  });
  test('gstack-upgrade name: not double-prefixed', () => {
    setupMockInstall(['qa', 'gstack-upgrade']);
    run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
    run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
      GSTACK_INSTALL_DIR: installDir,
      GSTACK_SKILLS_DIR: skillsDir,
    });
    // gstack-upgrade should keep its name, NOT become gstack-gstack-upgrade
    expect(readSkillName(path.join(installDir, 'gstack-upgrade'))).toBe('gstack-upgrade');
    // Regular skill should be prefixed
    expect(readSkillName(path.join(installDir, 'qa'))).toBe('gstack-qa');
  });
  test('SKILL.md without frontmatter is a no-op', () => {
    setupMockInstall(['qa']);
    // Overwrite qa SKILL.md with no frontmatter
    fs.writeFileSync(path.join(installDir, 'qa', 'SKILL.md'), '# qa\nSome content.');
    run(`${path.join(installDir, 'bin', 'gstack-config')} set skill_prefix true`);
    // Should not crash
    run(`${path.join(installDir, 'bin', 'gstack-relink')}`, {
      GSTACK_INSTALL_DIR: installDir,
      GSTACK_SKILLS_DIR: skillsDir,
    });
    // Content should be unchanged (no name: to patch)
    const content = fs.readFileSync(path.join(installDir, 'qa', 'SKILL.md'), 'utf-8');
    expect(content).toBe('# qa\nSome content.');
  });
});
+562
View File
@@ -0,0 +1,562 @@
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import {
ROOT, runId, describeIfSelected, testConcurrentIfSelected,
logCost, recordE2E, createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// Shared collector for eval records emitted by every suite in this file;
// flushed by the file-level afterAll at the bottom.
const evalCollector = createEvalCollector('e2e-review-army');
// Create a throwaway directory initialized as a git repo on branch "main"
// with a test identity. Returns the directory and a `run` helper that
// executes a command inside it (output piped, 5s timeout, errors not thrown).
function setupRepo(prefix: string): { dir: string; run: (cmd: string, args: string[]) => void } {
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), `skill-e2e-${prefix}-`));
  const run = (cmd: string, args: string[]) => {
    spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
  };
  const gitSetup: string[][] = [
    ['init', '-b', 'main'],
    ['config', 'user.email', 'test@test.com'],
    ['config', 'user.name', 'Test'],
  ];
  for (const args of gitSetup) run('git', args);
  return { dir, run };
}
// Stage the review skill's docs next to the test repo so prompts can
// reference them by relative name: SKILL.md, the checklist, the triage
// guide, and every specialist checklist under review-specialists/.
function copyReviewFiles(dir: string) {
  const reviewRoot = path.join(ROOT, 'review');
  const topLevel: Array<[string, string]> = [
    ['SKILL.md', 'review-SKILL.md'],
    ['checklist.md', 'review-checklist.md'],
    ['greptile-triage.md', 'review-greptile-triage.md'],
  ];
  for (const [src, dest] of topLevel) {
    fs.copyFileSync(path.join(reviewRoot, src), path.join(dir, dest));
  }
  const specSrc = path.join(reviewRoot, 'specialists');
  const specDest = path.join(dir, 'review-specialists');
  fs.mkdirSync(specDest, { recursive: true });
  for (const entry of fs.readdirSync(specSrc)) {
    fs.copyFileSync(path.join(specSrc, entry), path.join(specDest, entry));
  }
}
// --- Review Army: Migration Safety ---
// Gate test: a dropped-column migration under db/migrate/ should activate the
// Data Migration specialist, and the review output should flag the risk.
describeIfSelected('Review Army: Migration Safety', ['review-army-migration-safety'], () => {
  let dir: string;
  beforeAll(() => {
    const repo = setupRepo('army-migration');
    dir = repo.dir;
    // Base commit
    fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'initial']);
    // Feature branch with unsafe migration
    repo.run('git', ['checkout', '-b', 'feature/drop-columns']);
    fs.mkdirSync(path.join(dir, 'db', 'migrate'), { recursive: true });
    const migrationContent = fs.readFileSync(
      path.join(ROOT, 'test', 'fixtures', 'review-army-migration.sql'), 'utf-8'
    );
    fs.writeFileSync(path.join(dir, 'db', 'migrate', '20260330_drop_columns.sql'), migrationContent);
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'drop email and phone columns']);
    copyReviewFiles(dir);
  });
  afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
  testConcurrentIfSelected('review-army-migration-safety', async () => {
    const result = await runSkillTest({
      prompt: `You are in a git repo on a feature branch with a database migration that drops columns.
Read review-SKILL.md for instructions. Also read review-checklist.md.
The specialist checklists are in review-specialists/ (testing.md, security.md, performance.md, data-migration.md, etc.).
Skip the preamble, lake intro, telemetry sections.
Run Step 4 (Critical pass) then Step 4.5 (Review Army — Specialist Dispatch).
The base branch is main. Run gstack-diff-scope style analysis on the changed files.
Since db/migrate/ files changed, the Data Migration specialist should activate.
For the specialist dispatch, instead of launching subagents, just read review-specialists/data-migration.md
and apply it yourself against the diff (git diff main...HEAD).
Write your findings to ${dir}/review-output.md`,
      workingDirectory: dir,
      maxTurns: 20,
      timeout: 180_000,
      testName: 'review-army-migration-safety',
      runId,
    });
    logCost('/review army migration', result);
    recordE2E(evalCollector, '/review army migration safety', 'Review Army', result);
    expect(result.exitReason).toBe('success');
    // Verify migration issues were caught
    // NOTE: output-file check is best-effort (skipped when the agent never
    // wrote the file); the hard gate above is the successful exit.
    const outputPath = path.join(dir, 'review-output.md');
    if (fs.existsSync(outputPath)) {
      const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
      // Loose keyword scan — any of these signals the data-loss risk surfaced.
      const hasMigrationFinding =
        content.includes('drop') ||
        content.includes('data loss') ||
        content.includes('reversib') ||
        content.includes('migration') ||
        content.includes('column');
      expect(hasMigrationFinding).toBe(true);
    }
  }, 210_000);
});
// --- Review Army: N+1 Performance ---
// Gate test: a Ruby controller with N+1 query patterns (fixture file) should
// activate the Performance specialist and be flagged in the review output.
describeIfSelected('Review Army: N+1 Performance', ['review-army-perf-n-plus-one'], () => {
  let dir: string;
  beforeAll(() => {
    const repo = setupRepo('army-n-plus-one');
    dir = repo.dir;
    fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'initial']);
    repo.run('git', ['checkout', '-b', 'feature/add-posts-index']);
    // N+1 fixture lives in test/fixtures so the anti-pattern stays stable
    const n1Content = fs.readFileSync(
      path.join(ROOT, 'test', 'fixtures', 'review-army-n-plus-one.rb'), 'utf-8'
    );
    fs.writeFileSync(path.join(dir, 'posts_controller.rb'), n1Content);
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'add posts controller']);
    copyReviewFiles(dir);
  });
  afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
  testConcurrentIfSelected('review-army-perf-n-plus-one', async () => {
    const result = await runSkillTest({
      prompt: `You are in a git repo on a feature branch with a Ruby controller that has N+1 queries.
Read review-SKILL.md for instructions. Also read review-checklist.md.
The specialist checklists are in review-specialists/ (testing.md, performance.md, etc.).
Skip the preamble, lake intro, telemetry sections.
Run Step 4 (Critical pass) then Step 4.5 (Review Army).
The base branch is main. This is a Ruby backend file, so Performance specialist should activate.
For the specialist dispatch, read review-specialists/performance.md and apply it against the diff.
Write your findings to ${dir}/review-output.md`,
      workingDirectory: dir,
      maxTurns: 20,
      timeout: 180_000,
      testName: 'review-army-perf-n-plus-one',
      runId,
    });
    logCost('/review army n+1', result);
    recordE2E(evalCollector, '/review army N+1 detection', 'Review Army', result);
    expect(result.exitReason).toBe('success');
    // Best-effort keyword check on the optional output file
    const outputPath = path.join(dir, 'review-output.md');
    if (fs.existsSync(outputPath)) {
      const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
      const hasN1Finding =
        content.includes('n+1') ||
        content.includes('n + 1') ||
        content.includes('eager') ||
        content.includes('includes') ||
        content.includes('preload') ||
        content.includes('query') ||
        content.includes('loop');
      expect(hasN1Finding).toBe(true);
    }
  }, 210_000);
});
// --- Review Army: Delivery Audit ---
// Gate test: PLAN.md promises 3 features but the diff implements only 2 —
// the plan completion audit must classify the missing one as NOT DONE.
describeIfSelected('Review Army: Delivery Audit', ['review-army-delivery-audit'], () => {
  let dir: string;
  beforeAll(() => {
    const repo = setupRepo('army-delivery');
    dir = repo.dir;
    fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'initial']);
    repo.run('git', ['checkout', '-b', 'feature/three-features']);
    // Write a plan file promising 3 features
    fs.writeFileSync(path.join(dir, 'PLAN.md'), `# Feature Plan
## Implementation Items
1. Add user authentication with login/logout
2. Add user profile page with avatar upload
3. Add email notification system for new signups
## Test Items
- Test login flow
- Test profile page rendering
- Test email sending
`);
    repo.run('git', ['add', 'PLAN.md']);
    repo.run('git', ['commit', '-m', 'add plan']);
    // Implement only 2 of 3 features
    fs.writeFileSync(path.join(dir, 'auth.rb'), `class AuthController
def login
# authenticate user
session[:user_id] = user.id
end
def logout
session.delete(:user_id)
end
end
`);
    fs.writeFileSync(path.join(dir, 'profile.rb'), `class ProfileController
def show
@user = User.find(params[:id])
end
def update_avatar
@user.avatar.attach(params[:avatar])
end
end
`);
    // NOTE: email notification system is NOT implemented (intentionally missing)
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'implement auth and profile features']);
    copyReviewFiles(dir);
  });
  afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
  testConcurrentIfSelected('review-army-delivery-audit', async () => {
    const result = await runSkillTest({
      prompt: `You are in a git repo on branch feature/three-features.
There is a PLAN.md file that promises 3 features: auth, profile, and email notifications.
The diff (git diff main...HEAD) only implements 2 of them (auth and profile).
Read review-SKILL.md for the review workflow. Focus on the Plan Completion Audit section.
The plan file is at ./PLAN.md. Cross-reference it against the diff.
For each plan item, classify as DONE, PARTIAL, NOT DONE, or CHANGED.
The email notification system should be classified as NOT DONE.
Write your completion audit to ${dir}/review-output.md`,
      workingDirectory: dir,
      maxTurns: 15,
      timeout: 120_000,
      testName: 'review-army-delivery-audit',
      runId,
    });
    logCost('/review army delivery', result);
    recordE2E(evalCollector, '/review army delivery audit', 'Review Army', result);
    expect(result.exitReason).toBe('success');
    const outputPath = path.join(dir, 'review-output.md');
    if (fs.existsSync(outputPath)) {
      const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
      // Should identify email notifications as NOT DONE
      const hasNotDone =
        content.includes('not done') ||
        content.includes('not_done') ||
        content.includes('missing') ||
        content.includes('not implemented');
      const mentionsEmail =
        content.includes('email') ||
        content.includes('notification');
      expect(hasNotDone).toBe(true);
      expect(mentionsEmail).toBe(true);
    }
  }, 150_000);
});
// --- Review Army: Quality Score ---
// Gate test: seeded defects (SQL injection + magic number) should produce
// findings AND a computed "PR Quality Score: X/10" line in the output.
describeIfSelected('Review Army: Quality Score', ['review-army-quality-score'], () => {
  let dir: string;
  beforeAll(() => {
    const repo = setupRepo('army-quality');
    dir = repo.dir;
    fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'initial']);
    repo.run('git', ['checkout', '-b', 'feature/add-controller']);
    // Code with obvious issues for quality score computation
    fs.writeFileSync(path.join(dir, 'user_controller.rb'), `class UserController
def create
# SQL injection
User.where("name = '#{params[:name]}'")
# Magic number
if users.count > 42
raise "too many"
end
end
end
`);
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'add user controller']);
    copyReviewFiles(dir);
  });
  afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
  testConcurrentIfSelected('review-army-quality-score', async () => {
    const result = await runSkillTest({
      prompt: `You are in a git repo with a vulnerable user controller.
Read review-SKILL.md and review-checklist.md.
Skip preamble, lake intro, telemetry.
Run the Critical pass (Step 4) against the diff (git diff main...HEAD).
Then compute the PR Quality Score as described in the Review Army merge step:
quality_score = max(0, 10 - (critical_count * 2 + informational_count * 0.5))
Write your findings AND the computed quality score to ${dir}/review-output.md
Include the line: "PR Quality Score: X/10" where X is the computed score.`,
      workingDirectory: dir,
      maxTurns: 15,
      timeout: 120_000,
      testName: 'review-army-quality-score',
      runId,
    });
    logCost('/review army quality', result);
    recordE2E(evalCollector, '/review army quality score', 'Review Army', result);
    expect(result.exitReason).toBe('success');
    const outputPath = path.join(dir, 'review-output.md');
    if (fs.existsSync(outputPath)) {
      const content = fs.readFileSync(outputPath, 'utf-8');
      // Should contain a quality score
      // (either the literal phrase or any "N/10" shaped token)
      const hasScore =
        content.toLowerCase().includes('quality score') ||
        content.match(/\d+\/10/);
      expect(hasScore).toBeTruthy();
    }
  }, 150_000);
});
// --- Review Army: JSON Findings ---
// Gate test: the security specialist applied to a SQL-injection diff should
// emit JSONL findings that parse and carry the required schema fields.
describeIfSelected('Review Army: JSON Findings', ['review-army-json-findings'], () => {
  let dir: string;
  beforeAll(() => {
    const repo = setupRepo('army-json');
    dir = repo.dir;
    fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'initial']);
    repo.run('git', ['checkout', '-b', 'feature/vuln']);
    fs.writeFileSync(path.join(dir, 'search.rb'), `class SearchController
def index
# SQL injection via string interpolation
results = ActiveRecord::Base.connection.execute(
"SELECT * FROM products WHERE name LIKE '%#{params[:q]}%'"
)
render json: results
end
end
`);
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'add search']);
    copyReviewFiles(dir);
  });
  afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
  testConcurrentIfSelected('review-army-json-findings', async () => {
    const result = await runSkillTest({
      prompt: `You are reviewing a git diff with a SQL injection vulnerability.
Read review-specialists/security.md for the security checklist.
Apply the checklist against this diff (git diff main...HEAD).
Output your findings as JSON objects, one per line, following the schema:
{"severity":"CRITICAL","confidence":9,"path":"search.rb","line":4,"category":"injection","summary":"SQL injection via string interpolation","fix":"Use parameterized query","fingerprint":"search.rb:4:injection","specialist":"security"}
Write ONLY JSON findings (no preamble) to ${dir}/findings.json`,
      workingDirectory: dir,
      maxTurns: 12,
      timeout: 90_000,
      testName: 'review-army-json-findings',
      runId,
    });
    logCost('/review army json', result);
    recordE2E(evalCollector, '/review army JSON findings', 'Review Army', result);
    expect(result.exitReason).toBe('success');
    const findingsPath = path.join(dir, 'findings.json');
    if (fs.existsSync(findingsPath)) {
      const content = fs.readFileSync(findingsPath, 'utf-8').trim();
      const lines = content.split('\n').filter(l => l.trim());
      // At least one finding
      expect(lines.length).toBeGreaterThanOrEqual(1);
      // Each line should be valid JSON with required fields
      // (unparseable lines are skipped; the first parseable one is validated)
      for (const line of lines) {
        let parsed: any;
        try { parsed = JSON.parse(line); } catch { continue; }
        // Required fields per schema
        expect(parsed).toHaveProperty('severity');
        expect(parsed).toHaveProperty('confidence');
        expect(parsed).toHaveProperty('path');
        expect(parsed).toHaveProperty('category');
        expect(parsed).toHaveProperty('summary');
        expect(parsed).toHaveProperty('specialist');
        break; // One valid line is enough for the gate test
      }
    }
  }, 120_000);
});
// --- Review Army: Red Team (periodic) ---
// Periodic test: a synthetic 500+ line diff is generated to cross the size
// threshold for the Red Team specialist; output must read as a red-team pass.
describeIfSelected('Review Army: Red Team', ['review-army-red-team'], () => {
  let dir: string;
  beforeAll(() => {
    const repo = setupRepo('army-redteam');
    dir = repo.dir;
    fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'initial']);
    repo.run('git', ['checkout', '-b', 'feature/large-change']);
    // Create a large diff (300+ lines)
    // 100 generated methods, each reading a request param — bulk, not realism
    const lines: string[] = ['class LargeController'];
    for (let i = 0; i < 100; i++) {
      lines.push(` def method_${i}`);
      lines.push(` data = params[:input_${i}]`);
      lines.push(` process(data)`);
      lines.push(' end');
      lines.push('');
    }
    lines.push('end');
    fs.writeFileSync(path.join(dir, 'large_controller.rb'), lines.join('\n'));
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'add large controller']);
    copyReviewFiles(dir);
  });
  afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
  testConcurrentIfSelected('review-army-red-team', async () => {
    const result = await runSkillTest({
      prompt: `You are reviewing a large diff (300+ lines). Read review-SKILL.md.
Skip preamble, lake intro, telemetry.
The diff is large enough to activate the Red Team specialist.
Read review-specialists/red-team.md and apply it against the diff (git diff main...HEAD).
Focus on finding issues that other specialists might miss.
Write your red team findings to ${dir}/review-output.md
Start the file with "RED TEAM REVIEW" on the first line.`,
      workingDirectory: dir,
      maxTurns: 20,
      timeout: 180_000,
      testName: 'review-army-red-team',
      runId,
    });
    logCost('/review army red-team', result);
    recordE2E(evalCollector, '/review army red team', 'Review Army', result);
    expect(result.exitReason).toBe('success');
    const outputPath = path.join(dir, 'review-output.md');
    if (fs.existsSync(outputPath)) {
      const content = fs.readFileSync(outputPath, 'utf-8');
      expect(content.toLowerCase()).toMatch(/red team|adversarial/);
    }
  }, 210_000);
});
// --- Review Army: Consensus (periodic) ---
// Periodic test: one vulnerability visible to multiple specialists (security
// injection + missing auth tests) — checks the review still surfaces it.
describeIfSelected('Review Army: Consensus', ['review-army-consensus'], () => {
  let dir: string;
  beforeAll(() => {
    const repo = setupRepo('army-consensus');
    dir = repo.dir;
    fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'initial']);
    repo.run('git', ['checkout', '-b', 'feature/vuln-auth']);
    // SQL injection that both security AND testing specialists should flag
    fs.writeFileSync(path.join(dir, 'auth_controller.rb'), `class AuthController
def login
user = User.find_by("email = '#{params[:email]}' AND password = '#{params[:password]}'")
if user
session[:user_id] = user.id
redirect_to root_path
else
flash[:error] = "Invalid credentials"
render :login
end
end
end
`);
    repo.run('git', ['add', '.']);
    repo.run('git', ['commit', '-m', 'add auth controller']);
    copyReviewFiles(dir);
  });
  afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
  testConcurrentIfSelected('review-army-consensus', async () => {
    const result = await runSkillTest({
      prompt: `You are reviewing a git diff with a SQL injection in an auth controller.
Read review-SKILL.md, review-checklist.md, and the specialist checklists in review-specialists/.
This vulnerability should be caught by BOTH the security specialist (injection vector)
AND the testing specialist (no test for auth bypass).
Run the review. In your output, if a finding is flagged by multiple perspectives,
mark it as "MULTI-SPECIALIST CONFIRMED" with the confirming categories.
Write findings to ${dir}/review-output.md`,
      workingDirectory: dir,
      maxTurns: 20,
      timeout: 180_000,
      testName: 'review-army-consensus',
      runId,
    });
    logCost('/review army consensus', result);
    recordE2E(evalCollector, '/review army consensus', 'Review Army', result);
    expect(result.exitReason).toBe('success');
    const outputPath = path.join(dir, 'review-output.md');
    if (fs.existsSync(outputPath)) {
      const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
      // Should catch the SQL injection
      // (the MULTI-SPECIALIST marker itself is not asserted — only detection)
      const hasSqlFinding =
        content.includes('sql') ||
        content.includes('injection') ||
        content.includes('interpolat');
      expect(hasSqlFinding).toBe(true);
    }
  }, 210_000);
});
// Finalize eval collector
// File-level hook: flush all eval records collected by the suites above once
// every selected suite in this file has finished.
afterAll(async () => {
  await finalizeEvalCollector(evalCollector);
});
+190
View File
@@ -149,6 +149,196 @@ describeIfSelected('Sidebar URL accuracy E2E', ['sidebar-url-accuracy'], () => {
}, 30_000);
});
// --- Sidebar CSS Interaction E2E (real Claude + real browser) ---
// Goes to HN, reads comments, identifies the most insightful one, highlights it.
// Exercises: navigation, snapshot, text reading, LLM judgment, CSS style injection.
describeIfSelected('Sidebar CSS interaction E2E', ['sidebar-css-interaction'], () => {
let serverProc: Subprocess | null = null;
let agentProc: Subprocess | null = null;
let serverPort: number = 0;
let authToken: string = '';
let tmpDir: string = '';
let stateFile: string = '';
let queueFile: string = '';
let serverLogFile: string = '';
let serverErrFile: string = '';
let agentLogFile: string = '';
let agentErrFile: string = '';
async function api(pathname: string, opts: RequestInit = {}): Promise<Response> {
const headers: Record<string, string> = {
'Content-Type': 'application/json',
...(opts.headers as Record<string, string> || {}),
};
if (!headers['Authorization'] && authToken) {
headers['Authorization'] = `Bearer ${authToken}`;
}
return fetch(`http://127.0.0.1:${serverPort}${pathname}`, { ...opts, headers });
}
beforeAll(async () => {
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'sidebar-e2e-css-'));
stateFile = path.join(tmpDir, 'browse.json');
queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
fs.mkdirSync(path.dirname(queueFile), { recursive: true });
// Start server WITH a real browser for CSS interaction
const serverScript = path.resolve(ROOT, 'browse', 'src', 'server.ts');
serverLogFile = path.join(tmpDir, 'server.log');
serverErrFile = path.join(tmpDir, 'server.err');
// Use 'pipe' stdio — closing file descriptors kills the child on macOS/bun
serverProc = spawn(['bun', 'run', serverScript], {
env: {
...process.env,
BROWSE_STATE_FILE: stateFile,
BROWSE_PORT: '0',
SIDEBAR_QUEUE_PATH: queueFile,
BROWSE_IDLE_TIMEOUT: '600000', // 10 min in ms — test takes ~3 min
},
stdio: ['ignore', 'pipe', 'pipe'],
});
// Wait for state file with port/token
const deadline = Date.now() + 30000;
while (Date.now() < deadline) {
if (fs.existsSync(stateFile)) {
try {
const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
if (state.port && state.token) {
serverPort = state.port;
authToken = state.token;
break;
}
} catch {}
}
await new Promise(r => setTimeout(r, 200));
}
if (!serverPort) throw new Error('Server did not start in time');
// Verify server is healthy before proceeding
const healthDeadline = Date.now() + 10000;
let healthy = false;
while (Date.now() < healthDeadline) {
try {
const resp = await fetch(`http://127.0.0.1:${serverPort}/health`);
if (resp.ok) { healthy = true; break; }
} catch {}
await new Promise(r => setTimeout(r, 500));
}
if (!healthy) throw new Error('Server started but health check failed');
// Start sidebar-agent with the real browse binary
const agentScript = path.resolve(ROOT, 'browse', 'src', 'sidebar-agent.ts');
const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');
agentLogFile = path.join(tmpDir, 'agent.log');
agentErrFile = path.join(tmpDir, 'agent.err');
// Use 'pipe' stdio — closing file descriptors kills the child on macOS/bun
agentProc = spawn(['bun', 'run', agentScript], {
env: {
...process.env,
BROWSE_SERVER_PORT: String(serverPort),
BROWSE_STATE_FILE: stateFile,
SIDEBAR_QUEUE_PATH: queueFile,
SIDEBAR_AGENT_TIMEOUT: '180000', // 3 min — multi-step HN comment task
BROWSE_BIN: fs.existsSync(browseBin) ? browseBin : 'echo',
},
stdio: ['ignore', 'pipe', 'pipe'],
});
await new Promise(r => setTimeout(r, 2000));
}, 35000);
afterAll(() => {
if (agentProc) { try { agentProc.kill(); } catch {} }
if (serverProc) { try { serverProc.kill(); } catch {} }
finalizeEvalCollector(evalCollector);
try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
});
// E2E: drive the sidebar agent through a multi-step HN task (navigate to the
// site, read comments, inject a CSS highlight) and assert it completed,
// navigated, and applied a style. Fix: removed the dead `agentText` local
// that was computed but never read.
testIfSelected('sidebar-css-interaction', async () => {
  // Fresh session + clean queue so earlier runs cannot leak entries into this one.
  try { await api('/sidebar-session/new', { method: 'POST' }); } catch {}
  fs.writeFileSync(queueFile, '');
  const startTime = Date.now();
  // Ask the agent to go to HN, find the most insightful comment, and highlight it
  const resp = await api('/sidebar-command', {
    method: 'POST',
    body: JSON.stringify({
      message: 'Go to https://news.ycombinator.com. Find the top story. Click into its comments. Read the comments and find the most insightful one. Highlight that comment with a 4px solid orange outline.',
      activeTabUrl: 'about:blank',
    }),
  });
  expect(resp.status).toBe(200);
  // Poll for agent_done (4 min timeout — multi-step task with opus LLM)
  const deadline = Date.now() + 240000;
  let entries: any[] = [];
  while (Date.now() < deadline) {
    try {
      const chatResp = await api('/sidebar-chat?after=0');
      const data = await chatResp.json();
      entries = data.entries || [];
      if (entries.some((e: any) => e.type === 'agent_done')) break;
    } catch (err: any) {
      // Server may be temporarily busy or restarting — retry on connection errors
      const isConnErr = err.code === 'ConnectionRefused' || err.message?.includes('ConnectionRefused') || err.message?.includes('Unable to connect');
      if (!isConnErr) throw err;
    }
    await new Promise(r => setTimeout(r, 3000));
  }
  const duration = Date.now() - startTime;
  const doneEntry = entries.find((e: any) => e.type === 'agent_done');
  // Dump debug info on failure so CI logs are actionable without re-running.
  if (!doneEntry || entries.length === 0) {
    console.log('ENTRIES:', JSON.stringify(entries.slice(-5), null, 2));
    console.log('SERVER exitCode:', serverProc?.exitCode, 'signalCode:', serverProc?.signalCode, 'killed:', serverProc?.killed);
    console.log('AGENT exitCode:', agentProc?.exitCode, 'signalCode:', agentProc?.signalCode, 'killed:', agentProc?.killed);
    const queueContent = fs.existsSync(queueFile) ? fs.readFileSync(queueFile, 'utf-8').slice(-500) : 'NO QUEUE';
    console.log('QUEUE:', queueContent.length > 0 ? 'has entries' : 'empty');
  }
  // Agent should have completed
  expect(doneEntry).toBeDefined();
  // Agent should have run browse commands (look for tool_use entries)
  const toolUses = entries.filter((e: any) => e.type === 'tool_use');
  expect(toolUses.length).toBeGreaterThanOrEqual(2); // At minimum: goto + one more
  // Should have navigated to HN (look for ycombinator/HN in any entry text)
  const allEntryText = entries
    .map((e: any) => `${e.text || ''} ${e.input || ''} ${e.message || ''}`)
    .join(' ');
  const navigatedToHN = allEntryText.includes('ycombinator') || allEntryText.includes('Hacker News') || allEntryText.includes('news.ycombinator');
  if (!navigatedToHN) {
    console.log('ALL ENTRY TEXT (first 2000):', allEntryText.slice(0, 2000));
  }
  expect(navigatedToHN).toBe(true);
  // Should have applied a style (look for orange/outline in tool commands)
  const allText = entries.map((e: any) => e.text || '').join(' ');
  const appliedStyle = allText.includes('outline') || allText.includes('orange') || allText.includes('style');
  evalCollector?.addTest({
    name: 'sidebar-css-interaction', suite: 'Sidebar CSS interaction E2E', tier: 'e2e',
    passed: !!doneEntry && navigatedToHN && appliedStyle,
    duration_ms: duration,
    cost_usd: 0,
    exit_reason: doneEntry ? 'success' : 'timeout',
  });
}, 300_000);
});
// --- Sidebar Navigate (real Claude, requires ANTHROPIC_API_KEY) ---
describeIfSelected('Sidebar navigate E2E', ['sidebar-navigate'], () => {
+96
View File
@@ -3313,6 +3313,102 @@ Write your summary to ${benefitsDir}/benefits-summary.md`,
}, 180_000);
});
// --- Ship idempotency (#649) ---
// Verifies that re-running /ship after a partial failure (VERSION bumped and
// CHANGELOG written, but push failed) detects the prior work instead of
// redoing it. Fix: the SKILL.md section extractor now fails fast with a clear
// error when the step markers are missing, instead of silently slicing
// garbage fixture content via indexOf() === -1.
describeIfSelected('Ship idempotency', ['ship-idempotency'], () => {
  let idempDir: string;
  // Helper: run git synchronously inside the fixture repo.
  const gitRun = (args: string[], cwd: string) =>
    spawnSync('git', args, { cwd, stdio: 'pipe', timeout: 5000 });
  beforeAll(() => {
    idempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-ship-idemp-'));
    // Create git repo with initial commit on main
    gitRun(['init', '-b', 'main'], idempDir);
    gitRun(['config', 'user.email', 'test@test.com'], idempDir);
    gitRun(['config', 'user.name', 'Test'], idempDir);
    fs.writeFileSync(path.join(idempDir, 'app.ts'), 'console.log("v1");\n');
    fs.writeFileSync(path.join(idempDir, 'VERSION'), '0.1.0.0\n');
    fs.writeFileSync(path.join(idempDir, 'CHANGELOG.md'), '# Changelog\n');
    gitRun(['add', '.'], idempDir);
    gitRun(['commit', '-m', 'initial'], idempDir);
    // Create feature branch with changes
    gitRun(['checkout', '-b', 'feat/my-feature'], idempDir);
    fs.writeFileSync(path.join(idempDir, 'app.ts'), 'console.log("v2");\n');
    gitRun(['add', 'app.ts'], idempDir);
    gitRun(['commit', '-m', 'feat: update to v2'], idempDir);
    // Simulate prior /ship run: bump VERSION and write CHANGELOG entry
    fs.writeFileSync(path.join(idempDir, 'VERSION'), '0.2.0.0\n');
    fs.writeFileSync(path.join(idempDir, 'CHANGELOG.md'),
      '# Changelog\n\n## [0.2.0.0] — 2026-03-30\n\n- Updated app to v2\n');
    gitRun(['add', 'VERSION', 'CHANGELOG.md'], idempDir);
    gitRun(['commit', '-m', 'chore: bump version to 0.2.0.0'], idempDir);
    // Extract just the idempotency-relevant sections from ship/SKILL.md
    const full = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
    const step4Start = full.indexOf('## Step 4: Version bump');
    const step7Start = full.indexOf('## Step 7: Push');
    // Fail fast if SKILL.md was restructured — a -1 here would otherwise feed
    // indexOf/slice nonsense positions and produce a meaningless fixture.
    if (step4Start === -1 || step7Start === -1) {
      throw new Error('ship/SKILL.md missing "## Step 4: Version bump" or "## Step 7: Push" — update the Ship idempotency fixture extractor');
    }
    const step4End = full.indexOf('\n---\n', step4Start);
    const step8End = full.indexOf('## Step 8.5');
    // When a closing delimiter is absent, fall back to a fixed 500-char slice.
    const extracted = [
      full.slice(step4Start, step4End > step4Start ? step4End : step4Start + 500),
      full.slice(step7Start, step8End > step7Start ? step8End : step7Start + 500),
    ].join('\n\n---\n\n');
    fs.writeFileSync(path.join(idempDir, 'ship-steps.md'), extracted);
  });
  afterAll(() => {
    try { fs.rmSync(idempDir, { recursive: true, force: true }); } catch {}
  });
  testIfSelected('ship-idempotency', async () => {
    const result = await runSkillTest({
      prompt: `You are in a git repo on branch feat/my-feature. A prior /ship run already:
- Bumped VERSION from 0.1.0.0 to 0.2.0.0
- Wrote a CHANGELOG entry for 0.2.0.0
- But the push/PR step failed
Read ship-steps.md for the idempotency check instructions from the ship workflow.
Run ONLY the idempotency checks described in Steps 4 and 7. Do NOT actually push or create PRs (there is no remote).
After running the checks, write a report to ${idempDir}/idemp-result.md containing:
- Whether VERSION was detected as ALREADY_BUMPED or not
- Whether the push was detected as ALREADY_PUSHED or PUSH_NEEDED
- The current VERSION value (should still be 0.2.0.0)
Do NOT modify VERSION or CHANGELOG. Only run the detection checks and report.`,
      workingDirectory: idempDir,
      maxTurns: 10,
      timeout: 60_000,
      testName: 'ship-idempotency',
      runId,
    });
    logCost('/ship idempotency', result);
    recordE2E('/ship idempotency guard', 'Ship idempotency', result);
    expect(result.exitReason).toBe('success');
    // Verify VERSION was NOT modified
    const version = fs.readFileSync(path.join(idempDir, 'VERSION'), 'utf-8').trim();
    expect(version).toBe('0.2.0.0');
    // Verify CHANGELOG was NOT duplicated
    const changelog = fs.readFileSync(path.join(idempDir, 'CHANGELOG.md'), 'utf-8');
    const versionEntries = (changelog.match(/## \[0\.2\.0\.0\]/g) || []).length;
    expect(versionEntries).toBe(1);
    // Check the result report if it was written (agent may legitimately stop early)
    const reportPath = path.join(idempDir, 'idemp-result.md');
    if (fs.existsSync(reportPath)) {
      const report = fs.readFileSync(reportPath, 'utf-8');
      expect(report.toLowerCase()).toContain('already_bumped');
    }
  }, 120_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
+24 -5
View File
@@ -93,11 +93,30 @@ function installSkills(tmpDir: string) {
}
}
// Copy CLAUDE.md so Claude has project context for skill routing.
const claudeMdSrc = path.join(ROOT, 'CLAUDE.md');
if (fs.existsSync(claudeMdSrc)) {
fs.copyFileSync(claudeMdSrc, path.join(tmpDir, 'CLAUDE.md'));
}
// Write a CLAUDE.md with explicit routing instructions.
// The skill descriptions in system-reminder aren't strong enough to override
// Claude's default behavior of answering directly. A CLAUDE.md instruction
// puts routing rules in project context which Claude weighs more heavily.
fs.writeFileSync(path.join(tmpDir, 'CLAUDE.md'), `# Project Instructions
## Skill routing
When the user's request matches an available skill, ALWAYS invoke it using the Skill
tool as your FIRST action. Do NOT answer directly, do NOT use other tools first.
The skill has specialized workflows that produce better results than ad-hoc answers.
Key routing rules:
- Product ideas, "is this worth building", brainstorming → invoke office-hours
- Bugs, errors, "why is this broken", 500 errors → invoke investigate
- Ship, deploy, push, create PR → invoke ship
- QA, test the site, find bugs → invoke qa
- Code review, check my diff → invoke review
- Update docs after shipping → invoke document-release
- Weekly retro → invoke retro
- Design system, brand → invoke design-consultation
- Visual audit, design polish → invoke design-review
- Architecture review → invoke plan-eng-review
`);
}
/** Init a git repo with config */
+58 -19
View File
@@ -1305,38 +1305,49 @@ describe('Codex skill', () => {
expect(content).toContain('mktemp');
});
test('adversarial review in /review auto-scales by diff size', () => {
test('adversarial review in /review always runs both passes', () => {
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
expect(content).toContain('Adversarial review (auto-scaled)');
// Diff size thresholds
expect(content).toContain('< 50');
expect(content).toContain('50199');
expect(content).toContain('200+');
// All three tiers present
expect(content).toContain('Small');
expect(content).toContain('Medium tier');
expect(content).toContain('Large tier');
expect(content).toContain('Adversarial review (always-on)');
// Always-on: both Claude and Codex adversarial
expect(content).toContain('Claude adversarial subagent (always runs)');
expect(content).toContain('Codex adversarial challenge (always runs when available)');
// Claude adversarial subagent dispatch
expect(content).toContain('Agent tool');
expect(content).toContain('FIXABLE');
expect(content).toContain('INVESTIGATE');
// Codex fallback logic
// Codex availability check
expect(content).toContain('CODEX_NOT_AVAILABLE');
expect(content).toContain('fall back to the Claude adversarial subagent');
// Review log uses new skill name
// OLD_CFG only gates Codex, not Claude
expect(content).toContain('skip Codex passes only');
// Review log
expect(content).toContain('adversarial-review');
expect(content).toContain('reasoning_effort="high"');
expect(content).toContain('ADVERSARIAL REVIEW SYNTHESIS');
// Large diff structured review still gated
expect(content).toContain('Codex structured review (large diffs only');
expect(content).toContain('200');
});
test('adversarial review in /ship auto-scales by diff size', () => {
test('adversarial review in /ship always runs both passes', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Adversarial review (auto-scaled)');
expect(content).toContain('< 50');
expect(content).toContain('200+');
expect(content).toContain('Adversarial review (always-on)');
expect(content).toContain('adversarial-review');
expect(content).toContain('reasoning_effort="high"');
expect(content).toContain('Investigate and fix');
expect(content).toContain('Claude adversarial subagent (always runs)');
});
test('scope drift detection in /review and /ship', () => {
const reviewContent = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
// Both should contain scope drift from the shared resolver
for (const content of [reviewContent, shipContent]) {
expect(content).toContain('Scope Check:');
expect(content).toContain('DRIFT DETECTED');
expect(content).toContain('SCOPE CREEP');
expect(content).toContain('MISSING REQUIREMENTS');
expect(content).toContain('stated intent');
}
});
test('codex-host ship/review do NOT contain adversarial review step', () => {
@@ -1409,13 +1420,13 @@ describe('Skill trigger phrases', () => {
];
for (const skill of SKILLS_REQUIRING_PROACTIVE) {
test(`${skill}/SKILL.md has "Proactively suggest" phrase`, () => {
test(`${skill}/SKILL.md has proactive routing phrase`, () => {
const skillPath = path.join(ROOT, skill, 'SKILL.md');
if (!fs.existsSync(skillPath)) return;
const content = fs.readFileSync(skillPath, 'utf-8');
const frontmatterEnd = content.indexOf('---', 4);
const frontmatter = content.slice(0, frontmatterEnd);
expect(frontmatter).toMatch(/Proactively suggest/i);
expect(frontmatter).toMatch(/Proactively (suggest|invoke)/i);
});
}
});
@@ -1547,3 +1558,31 @@ describe('Test failure triage in ship skill', () => {
expect(content).toContain('In-branch test failures');
});
});
// Static source checks for the sidebar agent (#584): tool allowlists and
// stderr handling are verified by reading the source files, not running them.
describe('sidebar agent (#584)', () => {
// #584 — Sidebar Write: sidebar-agent.ts allowedTools includes Write
test('sidebar-agent.ts allowedTools includes Write', () => {
const content = fs.readFileSync(path.join(ROOT, 'browse', 'src', 'sidebar-agent.ts'), 'utf-8');
// Find the allowedTools line in the askClaude function.
// NOTE(review): this matches only the FIRST --allowedTools occurrence in the
// file — assumes there is a single relevant flag; confirm if more are added.
const match = content.match(/--allowedTools['"]\s*,\s*['"]([^'"]+)['"]/);
expect(match).not.toBeNull();
expect(match![1]).toContain('Write');
});
// #584 — Server allowlist: server.ts allowedTools must NOT include Write —
// the server-side agent is read-only + Bash (only sidebar-agent.ts gets Write).
test('server.ts allowedTools excludes Write (agent is read-only + Bash)', () => {
const content = fs.readFileSync(path.join(ROOT, 'browse', 'src', 'server.ts'), 'utf-8');
// Find the sidebar allowedTools in the headed-mode path
const match = content.match(/--allowedTools['"]\s*,\s*['"]([^'"]+)['"]/);
expect(match).not.toBeNull();
expect(match![1]).toContain('Bash');
expect(match![1]).not.toContain('Write');
});
// #584 — Sidebar stderr: stderr handler is not empty
test('sidebar-agent.ts stderr handler is not empty', () => {
const content = fs.readFileSync(path.join(ROOT, 'browse', 'src', 'sidebar-agent.ts'), 'utf-8');
// The stderr handler should NOT be an empty arrow function — a no-op handler
// would silently discard the agent's error output.
expect(content).not.toContain("proc.stderr.on('data', () => {})");
});
});
+22
View File
@@ -459,3 +459,25 @@ describe('gstack-community-dashboard', () => {
expect(output).not.toContain('Supabase not configured');
});
});
// Regression guard for #467: every JSONL telemetry append in the preamble
// resolver must be gated behind a _TEL opt-out conditional.
describe('preamble telemetry gating (#467)', () => {
  test('preamble source does not write JSONL unconditionally', () => {
    const source = fs.readFileSync(path.join(ROOT, 'scripts', 'resolvers', 'preamble.ts'), 'utf-8');
    const sourceLines = source.split('\n');
    sourceLines.forEach((line, idx) => {
      // Only lines that append (">>") to the telemetry JSONL are of interest.
      if (!line.includes('skill-usage.jsonl') || !line.includes('>>')) return;
      // A _TEL gating conditional must appear within the 5 preceding lines.
      const windowStart = Math.max(0, idx - 5);
      const gated = sourceLines
        .slice(windowStart, idx)
        .some(prev => prev.includes('_TEL') && prev.includes('off'));
      if (!gated) {
        throw new Error(`Unconditional JSONL write at preamble.ts line ${idx + 1}: ${line.trim()}`);
      }
    });
  });
});