Merge branch 'main' into garrytan/team-supabase-store

Brings in 55 commits from main (v0.12.x–v0.13.5.0): Factory Droid compat,
prompt injection defense, user sovereignty, security audit, design binary,
skill namespacing, modular resolvers, Chrome sidebar, and more.

Conflict resolution:
- .agents/ SKILL.md files: deleted (main moved to .factory/)
- 8 .tmpl templates: accepted main (new features: CDP mode, design tools,
  global retro, parallelization, distribution checks, plan audits)
- scripts/gen-skill-docs.ts: accepted main's modular resolver refactor
- test/helpers/session-runner.ts: accepted main + layered back CostEntry
  tracking from team branch
- Generated SKILL.md files: regenerated via bun run gen:skill-docs
- Updated tests to match main's gstack-slug output (2 lines, no PROJECTS_DIR)
  and review log mechanism (gstack-review-log, not $BRANCH.jsonl)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-29 15:12:12 -07:00
267 changed files with 60292 additions and 12207 deletions
+88
View File
@@ -0,0 +1,88 @@
import { describe, test, expect } from 'bun:test';
import { readFileSync, readdirSync, existsSync } from 'fs';
import { join } from 'path';
const ROOT = join(import.meta.dir, '..');
/**
 * Collect every SKILL.md visible from the repo root: the root-level file
 * (reported under the name 'root') plus one per immediate subdirectory,
 * skipping dot-directories and node_modules.
 */
function getAllSkillMds(): Array<{ name: string; content: string }> {
  const found: Array<{ name: string; content: string }> = [];

  // Record a SKILL.md under `name` if it exists on disk.
  const collect = (name: string, filePath: string): void => {
    if (existsSync(filePath)) {
      found.push({ name, content: readFileSync(filePath, 'utf-8') });
    }
  };

  collect('root', join(ROOT, 'SKILL.md'));

  for (const entry of readdirSync(ROOT, { withFileTypes: true })) {
    const excluded =
      !entry.isDirectory() || entry.name.startsWith('.') || entry.name === 'node_modules';
    if (excluded) continue;
    collect(entry.name, join(ROOT, entry.name, 'SKILL.md'));
  }

  return found;
}
/**
 * Audit-compliance regression tests: each test pins one security-audit fix
 * (W007, W011, W012, conditional telemetry, data-flow docs) by asserting on
 * the generated/template files' literal content.
 */
describe('Audit compliance', () => {
  // Fix 1: W007 — No hardcoded credentials in documentation
  test('no hardcoded credential patterns in SKILL.md.tmpl', () => {
    const tmpl = readFileSync(join(ROOT, 'SKILL.md.tmpl'), 'utf-8');
    // Forbidden literal credentials...
    expect(tmpl).not.toContain('"password123"');
    expect(tmpl).not.toContain('"test@example.com"');
    expect(tmpl).not.toContain('"test@test.com"');
    // ...must have been replaced by env-var placeholders.
    expect(tmpl).toContain('$TEST_EMAIL');
    expect(tmpl).toContain('$TEST_PASSWORD');
  });

  // Fix 2: Conditional telemetry — binary calls wrapped with existence check
  test('preamble telemetry calls are conditional on _TEL and binary existence', () => {
    const preamble = readFileSync(join(ROOT, 'scripts/resolvers/preamble.ts'), 'utf-8');
    // Pending finalization must check _TEL and binary existence
    expect(preamble).toContain('_TEL" != "off"');
    expect(preamble).toContain('-x ');
    expect(preamble).toContain('gstack-telemetry-log');
    // End-of-skill telemetry must also be conditional
    const completionIdx = preamble.indexOf('Telemetry (run last)');
    expect(completionIdx).toBeGreaterThan(-1);
    const completionSection = preamble.slice(completionIdx);
    expect(completionSection).toContain('_TEL" != "off"');
  });

  // Fix 3: W012 — Bun install is version-pinned
  test('bun install commands use version pinning', () => {
    const browseResolver = readFileSync(join(ROOT, 'scripts/resolvers/browse.ts'), 'utf-8');
    expect(browseResolver).toContain('BUN_VERSION');
    // Should not have unpinned curl|bash (without BUN_VERSION on same line)
    for (const line of browseResolver.split('\n')) {
      if (
        line.includes('bun.sh/install') &&
        line.includes('bash') &&
        !line.includes('BUN_VERSION') &&
        !line.includes('command -v')
      ) {
        throw new Error(`Unpinned bun install found: ${line.trim()}`);
      }
    }
  });

  // Fix 4: W011 — Untrusted content warning in command reference
  test('command reference includes untrusted content warning after Navigation', () => {
    const rootSkill = readFileSync(join(ROOT, 'SKILL.md'), 'utf-8');
    // The warning must sit between the Navigation and Reading sections.
    const navIdx = rootSkill.indexOf('### Navigation');
    const readingIdx = rootSkill.indexOf('### Reading');
    expect(navIdx).toBeGreaterThan(-1);
    expect(readingIdx).toBeGreaterThan(navIdx);
    const between = rootSkill.slice(navIdx, readingIdx);
    expect(between.toLowerCase()).toContain('untrusted');
  });

  // Fix 5: Data flow documentation in review.ts
  test('review.ts has data flow documentation', () => {
    const review = readFileSync(join(ROOT, 'scripts/resolvers/review.ts'), 'utf-8');
    expect(review).toContain('Data sent');
    expect(review).toContain('Data NOT sent');
  });

  // Fix 2+6: All generated SKILL.md files with telemetry are conditional
  test('all generated SKILL.md files with telemetry calls use conditional pattern', () => {
    // Fail with the offending skill's name instead of a bare expect()
    // mismatch, so a failing run points straight at the file to regenerate.
    // (Previously `name` was destructured but unused.)
    for (const { name, content } of getAllSkillMds()) {
      if (!content.includes('gstack-telemetry-log')) continue;
      if (!content.includes('_TEL" != "off"')) {
        throw new Error(`Unconditional gstack-telemetry-log call in ${name} SKILL.md`);
      }
    }
  });
});
+19 -6
View File
@@ -13,12 +13,13 @@
* Skips gracefully when prerequisites are not met.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runCodexSkill, parseCodexJSONL, installSkillToTempHome } from './helpers/codex-session-runner';
import type { CodexResult } from './helpers/codex-session-runner';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { createTestWorktree, harvestAndCleanup } from './helpers/e2e-helpers';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
@@ -118,16 +119,25 @@ afterAll(async () => {
// --- Tests ---
describeCodex('Codex E2E', () => {
let testWorktree: string;
beforeAll(() => {
testWorktree = createTestWorktree('codex');
});
afterAll(() => {
harvestAndCleanup('codex');
});
testIfSelected('codex-discover-skill', async () => {
// Install gstack-review skill to a temp HOME and ask Codex to list skills
const skillDir = path.join(ROOT, '.agents', 'skills', 'gstack-review');
const skillDir = path.join(testWorktree, '.agents', 'skills', 'gstack-review');
const result = await runCodexSkill({
skillDir,
prompt: 'List any skills or instructions you have available. Just list the names.',
timeoutMs: 60_000,
cwd: ROOT,
cwd: testWorktree,
skillName: 'gstack-review',
});
@@ -139,6 +149,9 @@ describeCodex('Codex E2E', () => {
expect(result.exitCode).toBe(0);
expect(result.output.length).toBeGreaterThan(0);
// Skill loading errors mean our generated SKILL.md files are broken
expect(result.stderr).not.toContain('invalid');
expect(result.stderr).not.toContain('Skipped loading');
// The output should reference the skill name in some form
const outputLower = result.output.toLowerCase();
expect(
@@ -150,14 +163,14 @@ describeCodex('Codex E2E', () => {
// code review, and produce structured review output with findings/issues.
// Accepts Codex timeout (exit 124/137) as non-failure since that's a CLI perf issue.
testIfSelected('codex-review-findings', async () => {
// Install gstack-review skill and ask Codex to review the current repo
const skillDir = path.join(ROOT, '.agents', 'skills', 'gstack-review');
// Install gstack-review skill and ask Codex to review the worktree
const skillDir = path.join(testWorktree, '.agents', 'skills', 'gstack-review');
const result = await runCodexSkill({
skillDir,
prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.',
timeoutMs: 540_000,
cwd: ROOT,
cwd: testWorktree,
skillName: 'gstack-review',
});
+76
View File
@@ -0,0 +1,76 @@
/**
* Shared fixture for test coverage audit E2E tests.
*
* Creates a Node.js project with billing source code that has intentional
* test coverage gaps: processPayment has happy-path-only tests,
* refundPayment has no tests at all.
*
* Used by: ship-coverage-audit E2E, review-coverage-audit E2E
*/
import * as fs from 'fs';
import * as path from 'path';
import { spawnSync } from 'child_process';
/**
 * Build a coverage-audit fixture project inside `dir`.
 *
 * Writes a Node.js project with a vitest setup, a billing module with
 * multiple code paths, and a single happy-path test (processPayment only;
 * refundPayment untested) — deliberate coverage gaps for the audit E2E
 * tests. Then initializes a git repo on `main`, commits everything, and
 * checks out a `feature/billing` branch.
 *
 * @param dir - Existing directory to populate.
 */
export function createCoverageAuditFixture(dir: string): void {
  // Create a Node.js project WITH test framework but coverage gaps
  fs.writeFileSync(path.join(dir, 'package.json'), JSON.stringify({
    name: 'test-coverage-app',
    version: '1.0.0',
    type: 'module',
    scripts: { test: 'echo "no tests yet"' },
    devDependencies: { vitest: '^1.0.0' },
  }, null, 2));

  // Create vitest config
  fs.writeFileSync(path.join(dir, 'vitest.config.ts'),
    `import { defineConfig } from 'vitest/config';\nexport default defineConfig({ test: {} });\n`);

  fs.writeFileSync(path.join(dir, 'VERSION'), '0.1.0.0\n');
  fs.writeFileSync(path.join(dir, 'CHANGELOG.md'), '# Changelog\n');

  // Create source file with multiple code paths (the error branches are the
  // intended coverage gaps).
  fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
  fs.writeFileSync(path.join(dir, 'src', 'billing.ts'), `
export function processPayment(amount: number, currency: string) {
if (amount <= 0) throw new Error('Invalid amount');
if (currency !== 'USD' && currency !== 'EUR') throw new Error('Unsupported currency');
return { status: 'success', amount, currency };
}
export function refundPayment(paymentId: string, reason: string) {
if (!paymentId) throw new Error('Payment ID required');
if (!reason) throw new Error('Reason required');
return { status: 'refunded', paymentId, reason };
}
`);

  // Create a test directory with ONE test (partial coverage)
  fs.mkdirSync(path.join(dir, 'test'), { recursive: true });
  fs.writeFileSync(path.join(dir, 'test', 'billing.test.ts'), `
import { describe, test, expect } from 'vitest';
import { processPayment } from '../src/billing';
describe('processPayment', () => {
test('processes valid payment', () => {
const result = processPayment(100, 'USD');
expect(result.status).toBe('success');
});
// GAP: no test for invalid amount
// GAP: no test for unsupported currency
// GAP: refundPayment not tested at all
});
`);

  // Init git repo with main branch. Git failures were previously swallowed
  // (result ignored), which produced a silently broken fixture; surface them
  // as stderr warnings while remaining best-effort (files are already written).
  const run = (cmd: string, args: string[]) => {
    const result = spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
    if (result.error || result.status !== 0) {
      const detail = result.error
        ? result.error.message
        : (result.stderr ? result.stderr.toString().trim() : `exit ${result.status}`);
      process.stderr.write(
        `coverage-audit fixture: '${cmd} ${args.join(' ')}' failed: ${detail}\n`);
    }
    return result;
  };
  run('git', ['init', '-b', 'main']);
  run('git', ['config', 'user.email', 'test@test.com']);
  run('git', ['config', 'user.name', 'Test']);
  run('git', ['add', '.']);
  run('git', ['commit', '-m', 'initial commit']);
  // Create feature branch
  run('git', ['checkout', '-b', 'feature/billing']);
}
+16 -6
View File
@@ -13,11 +13,12 @@
* Skips gracefully when prerequisites are not met.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runGeminiSkill } from './helpers/gemini-session-runner';
import type { GeminiResult } from './helpers/gemini-session-runner';
import { EvalCollector } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { createTestWorktree, harvestAndCleanup } from './helpers/e2e-helpers';
import * as path from 'path';
const ROOT = path.resolve(import.meta.dir, '..');
@@ -76,7 +77,7 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
/** Skip an individual test if not selected by diff-based selection. */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
const shouldRun = selectedTests === null || selectedTests.includes(testName);
(shouldRun ? test : test.skip)(testName, fn, timeout);
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
}
// --- Eval result collector ---
@@ -114,13 +115,22 @@ afterAll(async () => {
// --- Tests ---
describeGemini('Gemini E2E', () => {
let testWorktree: string;
beforeAll(() => {
testWorktree = createTestWorktree('gemini');
});
afterAll(() => {
harvestAndCleanup('gemini');
});
testIfSelected('gemini-discover-skill', async () => {
// Run Gemini in the repo root where .agents/skills/ exists
// Run Gemini in an isolated worktree (has .agents/skills/ copied from ROOT)
const result = await runGeminiSkill({
prompt: 'List any skills or instructions you have available. Just list the names.',
timeoutMs: 60_000,
cwd: ROOT,
cwd: testWorktree,
});
logGeminiCost('gemini-discover-skill', result);
@@ -139,11 +149,11 @@ describeGemini('Gemini E2E', () => {
}, 120_000);
testIfSelected('gemini-review-findings', async () => {
// Run gstack-review skill via Gemini on this repo
// Run gstack-review skill via Gemini on worktree (isolated from main working tree)
const result = await runGeminiSkill({
prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.',
timeoutMs: 540_000,
cwd: ROOT,
cwd: testWorktree,
});
logGeminiCost('gemini-review-findings', result);
File diff suppressed because it is too large Load Diff
+187
View File
@@ -0,0 +1,187 @@
import { describe, test, expect, beforeEach, afterEach } from "bun:test";
import { mkdtempSync, mkdirSync, writeFileSync, rmSync, existsSync } from "fs";
import { join } from "path";
import { tmpdir } from "os";
import { spawnSync } from "child_process";
// Import normalizeRemoteUrl for unit testing
// We test the script end-to-end via CLI and normalizeRemoteUrl via import
const scriptPath = join(import.meta.dir, "..", "bin", "gstack-global-discover.ts");
/**
 * Tests for the gstack-global-discover CLI: unit tests for the exported
 * normalizeRemoteUrl helper plus end-to-end CLI invocations via `bun run`.
 * NOTE(review): CLI tests scan the real machine's session data, so result
 * counts vary per host — assertions check structure, not specific values.
 */
describe("gstack-global-discover", () => {
  describe("normalizeRemoteUrl", () => {
    // Dynamically import to test the exported function
    let normalizeRemoteUrl: (url: string) => string;
    beforeEach(async () => {
      // Re-imported before each test; module caching makes repeats cheap.
      const mod = await import("../bin/gstack-global-discover.ts");
      normalizeRemoteUrl = mod.normalizeRemoteUrl;
    });
    test("strips .git suffix", () => {
      expect(normalizeRemoteUrl("https://github.com/user/repo.git")).toBe(
        "https://github.com/user/repo"
      );
    });
    test("converts SSH to HTTPS", () => {
      expect(normalizeRemoteUrl("git@github.com:user/repo.git")).toBe(
        "https://github.com/user/repo"
      );
    });
    test("converts SSH without .git to HTTPS", () => {
      expect(normalizeRemoteUrl("git@github.com:user/repo")).toBe(
        "https://github.com/user/repo"
      );
    });
    test("lowercases host", () => {
      expect(normalizeRemoteUrl("https://GitHub.COM/user/repo")).toBe(
        "https://github.com/user/repo"
      );
    });
    // Core dedup invariant: every remote spelling of one repo → one key.
    test("SSH and HTTPS for same repo normalize to same URL", () => {
      const ssh = normalizeRemoteUrl("git@github.com:garrytan/gstack.git");
      const https = normalizeRemoteUrl("https://github.com/garrytan/gstack.git");
      const httpsNoDotGit = normalizeRemoteUrl("https://github.com/garrytan/gstack");
      expect(ssh).toBe(https);
      expect(https).toBe(httpsNoDotGit);
    });
    test("handles local: URLs consistently", () => {
      const result = normalizeRemoteUrl("local:/tmp/my-repo");
      // local: gets parsed as a URL scheme — the important thing is consistency
      expect(result).toContain("/tmp/my-repo");
    });
    test("handles GitLab SSH URLs", () => {
      expect(normalizeRemoteUrl("git@gitlab.com:org/project.git")).toBe(
        "https://gitlab.com/org/project"
      );
    });
  });

  describe("CLI", () => {
    test("--help exits 0 and prints usage", () => {
      const result = spawnSync("bun", ["run", scriptPath, "--help"], {
        encoding: "utf-8",
        timeout: 10000,
      });
      expect(result.status).toBe(0);
      // Usage text is written to stderr (per the assertion below).
      expect(result.stderr).toContain("--since");
    });
    test("no args exits 1 with error", () => {
      const result = spawnSync("bun", ["run", scriptPath], {
        encoding: "utf-8",
        timeout: 10000,
      });
      expect(result.status).toBe(1);
      expect(result.stderr).toContain("--since is required");
    });
    test("invalid window format exits 1", () => {
      const result = spawnSync("bun", ["run", scriptPath, "--since", "abc"], {
        encoding: "utf-8",
        timeout: 10000,
      });
      expect(result.status).toBe(1);
      expect(result.stderr).toContain("Invalid window format");
    });
    test("--since 7d produces valid JSON", () => {
      const result = spawnSync(
        "bun",
        ["run", scriptPath, "--since", "7d", "--format", "json"],
        { encoding: "utf-8", timeout: 30000 }
      );
      expect(result.status).toBe(0);
      // JSON.parse throwing here fails the test — output must be pure JSON.
      const json = JSON.parse(result.stdout);
      expect(json).toHaveProperty("window", "7d");
      expect(json).toHaveProperty("repos");
      expect(json).toHaveProperty("total_sessions");
      expect(json).toHaveProperty("total_repos");
      expect(json).toHaveProperty("tools");
      expect(Array.isArray(json.repos)).toBe(true);
    });
    test("--since 7d --format summary produces readable output", () => {
      const result = spawnSync(
        "bun",
        ["run", scriptPath, "--since", "7d", "--format", "summary"],
        { encoding: "utf-8", timeout: 30000 }
      );
      expect(result.status).toBe(0);
      expect(result.stdout).toContain("Window: 7d");
      expect(result.stdout).toContain("Sessions:");
      expect(result.stdout).toContain("Repos:");
    });
    test("--since 1h returns results (may be empty)", () => {
      const result = spawnSync(
        "bun",
        ["run", scriptPath, "--since", "1h", "--format", "json"],
        { encoding: "utf-8", timeout: 30000 }
      );
      expect(result.status).toBe(0);
      const json = JSON.parse(result.stdout);
      expect(json.total_sessions).toBeGreaterThanOrEqual(0);
    });
  });

  describe("discovery output structure", () => {
    // NOTE(review): if the host has no sessions in the window, json.repos is
    // empty and these loops pass vacuously — acceptable for a host-dependent
    // integration test.
    test("repos have required fields", () => {
      const result = spawnSync(
        "bun",
        ["run", scriptPath, "--since", "30d", "--format", "json"],
        { encoding: "utf-8", timeout: 30000 }
      );
      expect(result.status).toBe(0);
      const json = JSON.parse(result.stdout);
      for (const repo of json.repos) {
        expect(repo).toHaveProperty("name");
        expect(repo).toHaveProperty("remote");
        expect(repo).toHaveProperty("paths");
        expect(repo).toHaveProperty("sessions");
        expect(Array.isArray(repo.paths)).toBe(true);
        expect(repo.paths.length).toBeGreaterThan(0);
        expect(repo.sessions).toHaveProperty("claude_code");
        expect(repo.sessions).toHaveProperty("codex");
        expect(repo.sessions).toHaveProperty("gemini");
      }
    });
    test("tools summary matches repo data", () => {
      const result = spawnSync(
        "bun",
        ["run", scriptPath, "--since", "30d", "--format", "json"],
        { encoding: "utf-8", timeout: 30000 }
      );
      const json = JSON.parse(result.stdout);
      // Total sessions should equal sum across tools
      const toolTotal =
        json.tools.claude_code.total_sessions +
        json.tools.codex.total_sessions +
        json.tools.gemini.total_sessions;
      expect(json.total_sessions).toBe(toolTotal);
    });
    test("deduplicates Conductor workspaces by remote", () => {
      const result = spawnSync(
        "bun",
        ["run", scriptPath, "--since", "30d", "--format", "json"],
        { encoding: "utf-8", timeout: 30000 }
      );
      const json = JSON.parse(result.stdout);
      // Check that no two repos share the same normalized remote
      const remotes = json.repos.map((r: any) => r.remote);
      const uniqueRemotes = new Set(remotes);
      expect(remotes.length).toBe(uniqueRemotes.size);
    });
  });
});
+12 -1
View File
@@ -27,6 +27,7 @@ export interface CodexResult {
durationMs: number; // Wall clock time
sessionId: string | null; // Thread ID for session continuity
rawLines: string[]; // Raw JSONL lines for debugging
stderr: string; // Stderr output (skill loading errors, auth failures)
}
// --- JSONL parser (ported from Python in codex/SKILL.md.tmpl) ---
@@ -98,7 +99,8 @@ export function parseCodexJSONL(lines: string[]): ParsedCodexJSONL {
/**
* Install a SKILL.md into a temp HOME directory for Codex to discover.
* Creates ~/.codex/skills/{skillName}/SKILL.md in the temp HOME.
* Creates ~/.codex/skills/{skillName}/SKILL.md in the temp HOME and copies
* agents/openai.yaml when present so Codex sees the same metadata as a real install.
*
* Returns the temp HOME path. Caller is responsible for cleanup.
*/
@@ -116,6 +118,13 @@ export function installSkillToTempHome(
fs.copyFileSync(srcSkill, path.join(destDir, 'SKILL.md'));
}
const srcOpenAIYaml = path.join(skillDir, 'agents', 'openai.yaml');
if (fs.existsSync(srcOpenAIYaml)) {
const destAgentsDir = path.join(destDir, 'agents');
fs.mkdirSync(destAgentsDir, { recursive: true });
fs.copyFileSync(srcOpenAIYaml, path.join(destAgentsDir, 'openai.yaml'));
}
return home;
}
@@ -159,6 +168,7 @@ export async function runCodexSkill(opts: {
durationMs: Date.now() - startTime,
sessionId: null,
rawLines: [],
stderr: '',
};
}
@@ -274,6 +284,7 @@ export async function runCodexSkill(opts: {
durationMs,
sessionId: parsed.sessionId,
rawLines: collectedLines,
stderr,
};
} finally {
// Clean up temp HOME
+71 -16
View File
@@ -5,11 +5,13 @@
* tests across multiple files by category.
*/
import { describe, test, afterAll } from 'bun:test';
import { describe, test, beforeAll, afterAll } from 'bun:test';
import type { SkillTestResult } from './session-runner';
import { EvalCollector, judgePassed } from './eval-store';
import type { EvalTestEntry } from './eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './touchfiles';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, E2E_TIERS, GLOBAL_TOUCHFILES } from './touchfiles';
import { WorktreeManager } from '../../lib/worktree';
import type { HarvestResult } from '../../lib/worktree';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
@@ -30,13 +32,6 @@ export const evalsEnabled = !!process.env.EVALS;
// Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch.
export let selectedTests: string[] | null = null; // null = run all
// EVALS_FAST: skip the 8 slowest tests (all Opus quality tests) for quick feedback
const FAST_EXCLUDED_TESTS = [
'plan-ceo-review-selective', 'plan-ceo-review', 'retro', 'retro-base-branch',
'design-consultation-core', 'design-consultation-existing',
'qa-fix-loop', 'design-review-fix',
];
if (evalsEnabled && !process.env.EVALS_ALL) {
const baseBranch = process.env.EVALS_BASE
|| detectBaseBranch(ROOT)
@@ -55,15 +50,22 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
// If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all
}
// Apply EVALS_FAST filter after diff-based selection
if (evalsEnabled && process.env.EVALS_FAST) {
// EVALS_TIER: filter tests by tier after diff-based selection.
// 'gate' = gate tests only (CI default — blocks merge)
// 'periodic' = periodic tests only (weekly cron / manual)
// not set = run all selected tests (local dev default, backward compat)
if (evalsEnabled && process.env.EVALS_TIER) {
const tier = process.env.EVALS_TIER as 'gate' | 'periodic';
const tierTests = Object.entries(E2E_TIERS)
.filter(([, t]) => t === tier)
.map(([name]) => name);
if (selectedTests === null) {
// Run all minus excluded
selectedTests = Object.keys(E2E_TOUCHFILES).filter(t => !FAST_EXCLUDED_TESTS.includes(t));
selectedTests = tierTests;
} else {
selectedTests = selectedTests.filter(t => !FAST_EXCLUDED_TESTS.includes(t));
selectedTests = selectedTests.filter(t => tierTests.includes(t));
}
process.stderr.write(`EVALS_FAST: excluded ${FAST_EXCLUDED_TESTS.length} slow tests, running ${selectedTests.length}\n\n`);
process.stderr.write(`EVALS_TIER=${tier}: ${selectedTests.length} tests\n\n`);
}
export const describeE2E = evalsEnabled ? describe : describe.skip;
@@ -205,7 +207,7 @@ export async function finalizeEvalCollector(evalCollector: EvalCollector | null)
if (evalsEnabled) {
const gstackDir = path.join(os.homedir(), '.gstack');
fs.mkdirSync(gstackDir, { recursive: true });
for (const f of ['.completeness-intro-seen', '.telemetry-prompted']) {
for (const f of ['.completeness-intro-seen', '.telemetry-prompted', '.proactive-prompted']) {
const p = path.join(gstackDir, f);
if (!fs.existsSync(p)) fs.writeFileSync(p, '');
}
@@ -234,6 +236,59 @@ export function testConcurrentIfSelected(testName: string, fn: () => Promise<voi
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
}
// --- Worktree isolation ---
// Lazily-created process-wide WorktreeManager singleton.
let worktreeManager: WorktreeManager | null = null;

/**
 * Return the shared WorktreeManager, creating it on first use.
 * Stale worktrees from earlier runs are pruned once, at creation time.
 */
export function getWorktreeManager(): WorktreeManager {
  if (worktreeManager === null) {
    const manager = new WorktreeManager();
    manager.pruneStale();
    worktreeManager = manager;
  }
  return worktreeManager;
}
/** Provision an isolated worktree for `testName` and return its path. */
export function createTestWorktree(testName: string): string {
  const manager = getWorktreeManager();
  return manager.create(testName);
}
/**
 * Harvest any changes made in the test's worktree, log a short report to
 * stderr, then remove the worktree. Call from afterAll().
 *
 * @returns The HarvestResult for eval integration, or null if nothing to harvest.
 */
export function harvestAndCleanup(testName: string): HarvestResult | null {
  const manager = getWorktreeManager();
  const harvested = manager.harvest(testName);

  if (harvested) {
    // Duplicate patches get a one-line note; real harvests get a summary.
    const report = harvested.isDuplicate
      ? [`\n HARVEST [${testName}]: duplicate patch (skipped)\n`]
      : [
          `\n HARVEST [${testName}]: ${harvested.changedFiles.length} files changed\n`,
          ` Patch: ${harvested.patchPath}\n`,
          ` ${harvested.diffStat}\n\n`,
        ];
    for (const line of report) process.stderr.write(line);
  }

  manager.cleanup(testName);
  return harvested;
}
/**
 * Describe block with automatic worktree isolation and harvesting: creates
 * an isolated worktree in beforeAll, harvests + cleans up in afterAll, and
 * hands the body a getter for the worktree path.
 *
 * Gives any test file real repo context instead of a tmpdir. Do NOT use for
 * tests with planted-bug fixtures — those need their own fixture repos.
 */
export function describeWithWorktree(
  name: string,
  testNames: string[],
  fn: (getWorktreePath: () => string) => void,
) {
  describeIfSelected(name, testNames, () => {
    let isolatedPath: string;
    beforeAll(() => {
      isolatedPath = createTestWorktree(name);
    });
    afterAll(() => {
      harvestAndCleanup(name);
    });
    fn(() => isolatedPath);
  });
}
export { judgePassed } from './eval-store';
export { EvalCollector } from './eval-store';
export type { EvalTestEntry } from './eval-store';
export type { HarvestResult } from '../../lib/worktree';
+34 -2
View File
@@ -2,7 +2,7 @@
* Eval result persistence and comparison.
*
* EvalCollector accumulates test results, writes them to
* ~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json,
* ~/.gstack/projects/$SLUG/evals/{version}-{branch}-{tier}-{timestamp}.json,
* prints a summary table, and auto-compares with the previous run.
*
* Comparison functions are exported for reuse by the eval:compare CLI.
@@ -16,7 +16,32 @@ import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '
import type { CostEntry } from '../../lib/eval-format';
const SCHEMA_VERSION = 1;
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
const LEGACY_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
/**
 * Detect project-scoped eval dir via gstack-slug.
 * Falls back to legacy ~/.gstack-dev/evals/ if slug detection fails.
 *
 * Shells out to the gstack-slug binary (repo-local install first, then the
 * global ~/.claude install) and parses a `SLUG=<value>` line from its output.
 * On success, ensures ~/.gstack/projects/<slug>/evals exists and returns it.
 * NOTE(review): spawns bash, so this path is POSIX-only; on failure of any
 * kind (missing binary, timeout, no SLUG line) it silently falls back.
 */
export function getProjectEvalDir(): string {
try {
// Try repo-local gstack-slug first, then global install
const localSlug = spawnSync('bash', ['-c', '.claude/skills/gstack/bin/gstack-slug 2>/dev/null || ~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null'], {
stdio: 'pipe', timeout: 3000,
});
const output = localSlug.stdout?.toString().trim();
if (output) {
// Multiline match: the binary may print other KEY=VALUE lines too.
const slugMatch = output.match(/^SLUG=(.+)$/m);
if (slugMatch && slugMatch[1]) {
const dir = path.join(os.homedir(), '.gstack', 'projects', slugMatch[1], 'evals');
// Create eagerly so callers can write without checking existence.
fs.mkdirSync(dir, { recursive: true });
return dir;
}
}
} catch { /* fall through */ }
return LEGACY_EVAL_DIR;
}
const DEFAULT_EVAL_DIR = getProjectEvalDir();
// --- Interfaces ---
@@ -60,6 +85,13 @@ export interface EvalTestEntry {
costs?: CostEntry[];
error?: string;
// Worktree harvest data
harvest?: {
filesChanged: number;
patchPath: string;
isDuplicate: boolean;
};
}
export interface EvalResult {
+13 -7
View File
@@ -9,15 +9,23 @@
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util';
import { getProjectEvalDir } from './eval-store';
import type { CostEntry } from '../../lib/eval-format';
import { resolveTier, tierToModel } from '../../lib/eval-tier';
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev');
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json'); // heartbeat stays global
const PROJECT_DIR = path.dirname(getProjectEvalDir()); // ~/.gstack/projects/$SLUG/
/** Sanitize test name for use as filename: strip leading slashes, replace / with - */
export function sanitizeTestName(name: string): string {
return sanitizeForFilename(name);
return name.replace(/^\/+/, '').replace(/\//g, '-');
}
/**
 * Atomic write: write to .tmp then rename. Non-fatal on error.
 *
 * The rename makes readers see either the old or the new content, never a
 * partial write. Errors (missing parent dir, permissions) are swallowed —
 * the previous body claimed "non-fatal" but had no try/catch, so a failed
 * write would throw into the caller; now the contract matches the comment.
 */
function atomicWriteSync(filePath: string, data: string): void {
  try {
    const tmp = filePath + '.tmp';
    fs.writeFileSync(tmp, data);
    fs.renameSync(tmp, filePath);
  } catch {
    /* non-fatal: heartbeat/run artifacts are best-effort */
  }
}
export interface CostEstimate {
@@ -140,15 +148,13 @@ export async function runSkillTest(options: {
const safeName = testName ? sanitizeTestName(testName) : null;
if (runId) {
try {
runDir = path.join(GSTACK_DEV_DIR, 'e2e-runs', runId);
runDir = path.join(PROJECT_DIR, 'e2e-runs', runId);
fs.mkdirSync(runDir, { recursive: true });
} catch { /* non-fatal */ }
}
// Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
// avoid shell escaping issues. --verbose is required for stream-json mode.
// Model pinned via EVAL_TIER env var (default: sonnet).
const evalModel = tierToModel(resolveTier());
const args = [
'-p',
'--model', model,
+211 -46
View File
@@ -32,25 +32,25 @@ export function matchGlob(file: string, pattern: string): boolean {
* Each test lists the file patterns that, if changed, require the test to run.
*/
export const E2E_TOUCHFILES: Record<string, string[]> = {
// Browse core
'browse-basic': ['browse/src/**'],
'browse-snapshot': ['browse/src/**'],
// Browse core (+ test-server dependency)
'browse-basic': ['browse/src/**', 'browse/test/test-server.ts'],
'browse-snapshot': ['browse/src/**', 'browse/test/test-server.ts'],
// SKILL.md setup + preamble (depend on ROOT SKILL.md only)
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'],
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'],
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'],
// SKILL.md setup + preamble (depend on ROOT SKILL.md + gen-skill-docs)
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'contributor-mode': ['SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'],
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
// QA
'qa-quick': ['qa/**', 'browse/src/**'],
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
// QA (+ test-server dependency)
'qa-quick': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'],
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
'qa-fix-loop': ['qa/**', 'browse/src/**'],
'qa-fix-loop': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'],
'qa-bootstrap': ['qa/**', 'ship/**'],
// Review
@@ -68,58 +68,94 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'plan-ceo-review-benefits': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
'plan-eng-review': ['plan-eng-review/**'],
'plan-eng-review-artifact': ['plan-eng-review/**'],
'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
// Codex offering verification
'codex-offered-office-hours': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
'codex-offered-ceo-review': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
'codex-offered-design-review': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
'codex-offered-eng-review': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
// Ship
'ship-base-branch': ['ship/**'],
'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
'ship-local-workflow': ['ship/**', 'scripts/gen-skill-docs.ts'],
// Setup browser cookies
'setup-cookies-detect': ['setup-browser-cookies/**'],
'review-dashboard-via': ['ship/**', 'scripts/resolvers/review.ts', 'codex/**', 'autoplan/**', 'land-and-deploy/**'],
'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
'ship-plan-verification': ['ship/**', 'scripts/gen-skill-docs.ts'],
// Retro
'retro': ['retro/**'],
'retro-base-branch': ['retro/**'],
// Global discover
'global-discover': ['bin/gstack-global-discover.ts', 'test/global-discover.test.ts'],
// CSO
'cso-full-audit': ['cso/**'],
'cso-diff-mode': ['cso/**'],
'cso-infra-scope': ['cso/**'],
// Document-release
'document-release': ['document-release/**'],
// Codex (Claude E2E — tests /codex skill via Claude)
'codex-review': ['codex/**'],
// Codex E2E (tests skills via Codex CLI)
'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'],
'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'],
// Codex E2E (tests skills via Codex CLI + worktree)
'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'],
'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'],
// Gemini E2E (tests skills via Gemini CLI)
'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'],
'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'],
// Gemini E2E (tests skills via Gemini CLI + worktree)
'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'],
'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'],
// Ship coverage audit
'ship-coverage-audit': ['ship/**'],
// Coverage audit (shared fixture) + triage + gates
'ship-coverage-audit': ['ship/**', 'test/fixtures/coverage-audit-fixture.ts', 'bin/gstack-repo-mode'],
'review-coverage-audit': ['review/**', 'test/fixtures/coverage-audit-fixture.ts'],
'plan-eng-coverage-audit': ['plan-eng-review/**', 'test/fixtures/coverage-audit-fixture.ts'],
'ship-triage': ['ship/**', 'bin/gstack-repo-mode'],
// Plan completion audit + verification
'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
'ship-plan-verification': ['ship/**', 'qa-only/**', 'scripts/gen-skill-docs.ts'],
'review-plan-completion': ['review/**', 'scripts/gen-skill-docs.ts'],
// Design
'design-consultation-core': ['design-consultation/**'],
'design-consultation-existing': ['design-consultation/**'],
'design-consultation-research': ['design-consultation/**'],
'design-consultation-preview': ['design-consultation/**'],
'plan-design-review-plan-mode': ['plan-design-review/**'],
'plan-design-review-no-ui-scope': ['plan-design-review/**'],
'design-review-fix': ['design-review/**', 'browse/src/**'],
'design-consultation-core': ['design-consultation/**', 'scripts/gen-skill-docs.ts', 'test/helpers/llm-judge.ts'],
'design-consultation-existing': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
'design-consultation-research': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
'design-consultation-preview': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
'plan-design-review-no-ui-scope': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
'design-review-fix': ['design-review/**', 'browse/src/**', 'scripts/gen-skill-docs.ts'],
// Design Shotgun
'design-shotgun-path': ['design-shotgun/**', 'design/src/**', 'scripts/resolvers/design.ts'],
'design-shotgun-session': ['design-shotgun/**', 'scripts/resolvers/design.ts'],
'design-shotgun-full': ['design-shotgun/**', 'design/src/**', 'browse/src/**'],
// gstack-upgrade
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
// Deploy skills
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
'canary-workflow': ['canary/**', 'browse/src/**'],
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
'land-and-deploy-first-run': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts', 'bin/gstack-slug'],
'land-and-deploy-review-gate': ['land-and-deploy/**', 'bin/gstack-review-read'],
'canary-workflow': ['canary/**', 'browse/src/**'],
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
// Sidebar agent
'sidebar-navigate': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/**'],
'sidebar-url-accuracy': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/background.js'],
// Autoplan
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
// Skill routing — journey-stage tests (depend on ALL skill descriptions)
'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-think-bigger': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-debug': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-code-review': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
@@ -130,6 +166,133 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'journey-visual-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
};
/**
 * E2E test tiers — 'gate' blocks PRs, 'periodic' runs weekly/on-demand.
 * Must have exactly the same keys as E2E_TOUCHFILES.
 *
 * NOTE(review): E2E_TOUCHFILES appears to define a 'journey-think-bigger'
 * entry with no counterpart here — confirm the two key sets really match.
 */
export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
  // Browse core — gate (if browse breaks, everything breaks)
  'browse-basic': 'gate',
  'browse-snapshot': 'gate',
  // SKILL.md setup — gate (if setup breaks, no skill works)
  'skillmd-setup-discovery': 'gate',
  'skillmd-no-local-binary': 'gate',
  'skillmd-outside-git': 'gate',
  'contributor-mode': 'gate',
  'session-awareness': 'gate',
  // QA — gate for functional, periodic for quality/benchmarks
  'qa-quick': 'gate',
  'qa-b6-static': 'periodic',
  'qa-b7-spa': 'periodic',
  'qa-b8-checkout': 'periodic',
  'qa-only-no-fix': 'gate', // CRITICAL guardrail: Edit tool forbidden
  'qa-fix-loop': 'periodic',
  'qa-bootstrap': 'gate',
  // Review — gate for functional/guardrails, periodic for quality
  'review-sql-injection': 'gate', // Security guardrail
  'review-enum-completeness': 'gate',
  'review-base-branch': 'gate',
  'review-design-lite': 'periodic', // 4/7 threshold is subjective
  'review-coverage-audit': 'gate',
  'review-plan-completion': 'gate',
  'review-dashboard-via': 'gate',
  // Office Hours
  'office-hours-spec-review': 'gate',
  // Plan reviews — gate for cheap functional, periodic for Opus quality
  'plan-ceo-review': 'periodic',
  'plan-ceo-review-selective': 'periodic',
  'plan-ceo-review-benefits': 'gate',
  'plan-eng-review': 'periodic',
  'plan-eng-review-artifact': 'periodic',
  'plan-eng-coverage-audit': 'gate',
  'plan-review-report': 'gate',
  // Codex offering verification
  'codex-offered-office-hours': 'gate',
  'codex-offered-ceo-review': 'gate',
  'codex-offered-design-review': 'gate',
  'codex-offered-eng-review': 'gate',
  // Ship — gate (end-to-end ship path)
  'ship-base-branch': 'gate',
  'ship-local-workflow': 'gate',
  'ship-coverage-audit': 'gate',
  'ship-triage': 'gate',
  'ship-plan-completion': 'gate',
  'ship-plan-verification': 'gate',
  // Retro — gate for cheap branch detection, periodic for full Opus retro
  'retro': 'periodic',
  'retro-base-branch': 'gate',
  // Global discover
  'global-discover': 'gate',
  // CSO — gate for security guardrails, periodic for quality
  'cso-full-audit': 'gate', // Hardcoded secrets detection
  'cso-diff-mode': 'gate',
  'cso-infra-scope': 'periodic',
  // Document-release — gate (CHANGELOG guardrail)
  'document-release': 'gate',
  // Codex — periodic (Opus, requires codex CLI)
  'codex-review': 'periodic',
  // Multi-AI — periodic (require external CLIs)
  'codex-discover-skill': 'periodic',
  'codex-review-findings': 'periodic',
  'gemini-discover-skill': 'periodic',
  'gemini-review-findings': 'periodic',
  // Design — gate for cheap functional, periodic for Opus/quality
  'design-consultation-core': 'periodic',
  'design-consultation-existing': 'periodic',
  'design-consultation-research': 'gate',
  'design-consultation-preview': 'gate',
  'plan-design-review-plan-mode': 'periodic',
  'plan-design-review-no-ui-scope': 'gate',
  'design-review-fix': 'periodic',
  'design-shotgun-path': 'gate',
  'design-shotgun-session': 'gate',
  'design-shotgun-full': 'periodic',
  // gstack-upgrade
  'gstack-upgrade-happy-path': 'gate',
  // Deploy skills
  'land-and-deploy-workflow': 'gate',
  'land-and-deploy-first-run': 'gate',
  'land-and-deploy-review-gate': 'gate',
  'canary-workflow': 'gate',
  'benchmark-workflow': 'gate',
  'setup-deploy-workflow': 'gate',
  // Sidebar agent
  'sidebar-navigate': 'periodic',
  'sidebar-url-accuracy': 'periodic',
  // Autoplan — periodic (not yet implemented)
  'autoplan-core': 'periodic',
  // Skill routing — periodic (LLM routing is non-deterministic)
  'journey-ideation': 'periodic',
  'journey-plan-eng': 'periodic',
  'journey-debug': 'periodic',
  'journey-qa': 'periodic',
  'journey-code-review': 'periodic',
  'journey-ship': 'periodic',
  'journey-docs': 'periodic',
  'journey-retro': 'periodic',
  'journey-design-system': 'periodic',
  'journey-visual-qa': 'periodic',
};
/**
* LLM-judge test touchfiles — keyed by test description string.
*/
@@ -172,20 +335,22 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
// Voice directive
'voice directive tone': ['scripts/resolvers/preamble.ts', 'review/SKILL.md', 'review/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
};
/**
 * Changes to any of these files trigger ALL tests (both E2E and LLM-judge).
 *
 * Keep this list minimal — only files that genuinely affect every test.
 * Scoped dependencies (gen-skill-docs, llm-judge, test-server, worktree,
 * codex/gemini session runners) belong in individual test entries instead.
 */
// The previous list contained merge-conflict residue: each of the three
// entries below appeared twice, alongside scoped dependencies
// (gen-skill-docs, llm-judge, test-server, codex/gemini runners) that the
// doc comment above explicitly banishes to individual test entries.
export const GLOBAL_TOUCHFILES = [
  'test/helpers/session-runner.ts', // All E2E tests use this runner
  'test/helpers/eval-store.ts', // All E2E tests store results here
  'test/helpers/touchfiles.ts', // Self-referential — reclassifying wrong is dangerous
];
// --- Base branch detection ---
+77
View File
@@ -0,0 +1,77 @@
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { execSync, execFileSync, ExecSyncOptionsWithStringEncoding } from 'child_process';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
// Repo root and its bin/ directory (where the gstack-review-log binary lives).
const ROOT = path.resolve(import.meta.dir, '..');
const BIN = path.join(ROOT, 'bin');
// Per-test scratch GSTACK_HOME and its projects/ dir — created in beforeEach,
// removed in afterEach.
let tmpDir: string;
let slugDir: string;
/**
 * Invoke bin/gstack-review-log with `input` as its single argument.
 *
 * @param input - Raw payload forwarded to the binary (typically a JSON string).
 * @param opts.expectFail - When true, a non-zero exit is expected: the error is
 *   captured and returned instead of thrown.
 * @returns The trimmed stdout and the process exit code (0 on success; on an
 *   expected failure, stderr text and the failing status — 1 if unavailable).
 * @throws Rethrows the execFileSync error when the binary fails unexpectedly.
 */
function run(input: string, opts: { expectFail?: boolean } = {}): { stdout: string; exitCode: number } {
  const execOpts: ExecSyncOptionsWithStringEncoding = {
    cwd: ROOT,
    env: { ...process.env, GSTACK_HOME: tmpDir },
    encoding: 'utf-8',
    timeout: 10000,
  };
  try {
    // execFileSync passes `input` as a single argv entry with no shell in
    // between, so apostrophes, backticks, `$(...)`, newlines, etc. need no
    // escaping. (The previous version hand-built a single-quoted shell string
    // for execSync — an injection-prone idiom that is easy to get subtly wrong.)
    const stdout = execFileSync(`${BIN}/gstack-review-log`, [input], execOpts).trim();
    return { stdout, exitCode: 0 };
  } catch (e: any) {
    if (opts.expectFail) {
      // `status` is null when the process was killed by a signal — report 1.
      return { stdout: e.stderr?.toString() || '', exitCode: e.status || 1 };
    }
    throw e;
  }
}
// Give every test a pristine GSTACK_HOME so review logs never leak between tests.
beforeEach(() => {
  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-revlog-'));
  // gstack-review-log uses gstack-slug which needs a git repo — create the projects dir
  // with a predictable slug by pre-creating the directory structure
  slugDir = path.join(tmpDir, 'projects');
  fs.mkdirSync(slugDir, { recursive: true });
});
// Remove the scratch GSTACK_HOME; force avoids failures if it is already gone.
afterEach(() => {
  fs.rmSync(tmpDir, { recursive: true, force: true });
});
describe('gstack-review-log', () => {
  test('appends valid JSON to review JSONL file', () => {
    const payload = '{"skill":"plan-eng-review","status":"clean"}';
    const { exitCode } = run(payload);
    expect(exitCode).toBe(0);
    // Locate the JSONL file the binary wrote under <GSTACK_HOME>/projects/<slug>/.
    const slugs = fs.readdirSync(slugDir);
    expect(slugs.length).toBeGreaterThan(0);
    const projectDir = path.join(slugDir, slugs[0]);
    const logs = fs.readdirSync(projectDir).filter((f) => f.endsWith('.jsonl'));
    expect(logs.length).toBeGreaterThan(0);
    // The appended line must round-trip as the JSON we sent in.
    const entry = JSON.parse(fs.readFileSync(path.join(projectDir, logs[0]), 'utf-8').trim());
    expect(entry.skill).toBe('plan-eng-review');
    expect(entry.status).toBe('clean');
  });
  test('rejects non-JSON input with non-zero exit code', () => {
    const { exitCode } = run('not json at all', { expectFail: true });
    expect(exitCode).not.toBe(0);
    // Verify nothing was written. Each lookup step may legitimately find
    // nothing (rejection can happen before any directory is created), so
    // bail out early instead of asserting presence.
    const slugs = fs.readdirSync(slugDir);
    if (slugs.length === 0) return;
    const projectDir = path.join(slugDir, slugs[0]);
    const logs = fs.readdirSync(projectDir).filter((f) => f.endsWith('.jsonl'));
    if (logs.length === 0) return;
    expect(fs.readFileSync(path.join(projectDir, logs[0]), 'utf-8').trim()).toBe('');
  });
});
@@ -25,7 +25,11 @@ describeIfSelected('Skill E2E tests', [
testServer = startTestServer();
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
setupBrowseShims(tmpDir);
});
// Pre-warm the browse server so Chromium is already launched for tests.
// In CI, Chromium can take 10-20s to launch (Docker + --no-sandbox).
spawnSync(browseBin, ['goto', testServer.url], { cwd: tmpDir, timeout: 30000, stdio: 'pipe' });
}, 45_000);
afterAll(() => {
testServer?.server?.stop();
@@ -41,7 +45,7 @@ describeIfSelected('Skill E2E tests', [
4. $B screenshot /tmp/skill-e2e-test.png
Report the results of each command.`,
workingDirectory: tmpDir,
maxTurns: 10,
maxTurns: 7,
timeout: 60_000,
testName: 'browse-basic',
runId,
@@ -63,7 +67,7 @@ Report the results of each command.`,
5. $B snapshot -i -a -o /tmp/skill-e2e-annotated.png
Report what each command returned.`,
workingDirectory: tmpDir,
maxTurns: 10,
maxTurns: 7,
timeout: 60_000,
testName: 'browse-snapshot',
runId,
@@ -274,12 +278,25 @@ Remember: _SESSIONS=4, so ELI16 mode is active. The user is juggling multiple wi
expect(lower.includes('payment') || lower.includes('feature')).toBe(true);
// Must mention what we're working on
expect(lower.includes('stripe') || lower.includes('checkout') || lower.includes('payment')).toBe(true);
// Must have a RECOMMENDATION
expect(output).toContain('RECOMMENDATION');
// Must have a recommendation or structured options
expect(
output.includes('RECOMMENDATION') ||
lower.includes('recommend') ||
lower.includes('option a') ||
lower.includes('which do you want') ||
lower.includes('which approach')
).toBe(true);
} else {
// Check agent output as fallback
const output = result.output || '';
expect(output).toContain('RECOMMENDATION');
const lowerOut = output.toLowerCase();
expect(
output.includes('RECOMMENDATION') ||
lowerOut.includes('recommend') ||
lowerOut.includes('option a') ||
lowerOut.includes('which do you want') ||
lowerOut.includes('which approach')
).toBe(true);
}
// Clean up
+258
View File
@@ -0,0 +1,258 @@
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import {
ROOT, runId, evalsEnabled,
describeIfSelected, logCost, recordE2E,
createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// Shared collector: each CSO test below records its outcome here, and the
// file-level afterAll flushes the whole batch once every suite has finished.
const evalCollector = createEvalCollector('e2e-cso');
afterAll(() => {
  finalizeEvalCollector(evalCollector);
});
// --- CSO v2 E2E Tests ---
// Full daily audit: seeds a throwaway git repo with two planted
// vulnerabilities (a hardcoded API key and a git-tracked .env) and checks
// that /cso surfaces both and emits its findings table.
describeIfSelected('CSO v2 — full audit', ['cso-full-audit'], () => {
  let csoDir: string;
  beforeAll(() => {
    csoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cso-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: csoDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    // Create a minimal app with a planted vulnerability
    fs.writeFileSync(path.join(csoDir, 'package.json'), JSON.stringify({
      name: 'cso-test-app',
      version: '1.0.0',
      dependencies: { express: '4.18.0' },
    }, null, 2));
    // Planted vuln: hardcoded API key
    fs.writeFileSync(path.join(csoDir, 'server.ts'), `
import express from 'express';
const app = express();
const API_KEY = "sk-1234567890abcdef1234567890abcdef";
app.get('/api/data', (req, res) => {
const id = req.query.id;
res.json({ data: \`result for \${id}\` });
});
app.listen(3000);
`);
    // Planted vuln: .env tracked by git
    fs.writeFileSync(path.join(csoDir, '.env'), 'DATABASE_URL=postgres://admin:secretpass@prod.db.example.com:5432/myapp\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
  });
  afterAll(() => {
    try { fs.rmSync(csoDir, { recursive: true, force: true }); } catch {}
  });
  test('/cso finds planted vulnerabilities', async () => {
    const result = await runSkillTest({
      prompt: `Read the file ${path.join(ROOT, 'cso', 'SKILL.md')} for the CSO skill instructions.
Run /cso on this repo (full daily audit, no flags).
IMPORTANT:
- Do NOT use AskUserQuestion — skip any interactive prompts.
- Focus on finding the planted vulnerabilities in this small repo.
- Produce the SECURITY FINDINGS table.
- Save the report to .gstack/security-reports/.`,
      workingDirectory: csoDir,
      maxTurns: 30,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob', 'Agent'],
      timeout: 300_000,
    });
    logCost('cso', result);
    expect(result.exitReason).toBe('success');
    // Should detect hardcoded API key
    const output = result.output.toLowerCase();
    expect(
      output.includes('sk-') || output.includes('hardcoded') || output.includes('api key') || output.includes('api_key')
    ).toBe(true);
    // Should detect .env tracked by git
    expect(
      output.includes('.env') && (output.includes('tracked') || output.includes('gitignore'))
    ).toBe(true);
    // Should produce a findings table. `output` is lowercased above, so only
    // the lowercase literal can ever match — the previous check also tested
    // 'SECURITY FINDINGS', a branch that was provably dead.
    expect(output.includes('security findings')).toBe(true);
    // Should save a report — deliberately best-effort: the count is only
    // asserted when the directory exists at all, since agent runs are
    // non-deterministic about where (or whether) they persist the report.
    const reportDir = path.join(csoDir, '.gstack', 'security-reports');
    const reportExists = fs.existsSync(reportDir);
    if (reportExists) {
      const reports = fs.readdirSync(reportDir).filter(f => f.endsWith('.json'));
      expect(reports.length).toBeGreaterThanOrEqual(1);
    }
    recordE2E(evalCollector, 'cso-full-audit', 'e2e-cso', result);
  }, 300_000);
});
// Diff-scoped audit: clean `main`, plus a feature branch that adds a webhook
// endpoint with a planted flaw (no Stripe signature verification). /cso --diff
// should analyze only the branch delta and flag the new file.
describeIfSelected('CSO v2 — diff mode', ['cso-diff-mode'], () => {
  let csoDiffDir: string;
  beforeAll(() => {
    csoDiffDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cso-diff-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: csoDiffDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    // Clean initial commit
    fs.writeFileSync(path.join(csoDiffDir, 'package.json'), JSON.stringify({
      name: 'cso-diff-test', version: '1.0.0',
    }, null, 2));
    fs.writeFileSync(path.join(csoDiffDir, 'app.ts'), 'console.log("hello");\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
    // Feature branch with a vuln
    run('git', ['checkout', '-b', 'feat/add-webhook']);
    fs.writeFileSync(path.join(csoDiffDir, 'webhook.ts'), `
import express from 'express';
const app = express();
// No signature verification!
app.post('/webhook/stripe', (req, res) => {
const event = req.body;
processPayment(event);
res.sendStatus(200);
});
`);
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'feat: add webhook']);
  });
  afterAll(() => {
    try { fs.rmSync(csoDiffDir, { recursive: true, force: true }); } catch {}
  });
  test('/cso --diff scopes to branch changes', async () => {
    const result = await runSkillTest({
      prompt: `Read the file ${path.join(ROOT, 'cso', 'SKILL.md')} for the CSO skill instructions.
Run /cso --diff on this repo. The base branch is "main".
IMPORTANT:
- Do NOT use AskUserQuestion — skip any interactive prompts.
- Focus on changes in the current branch vs main.
- The webhook.ts file was added on this branch — it should be analyzed.`,
      workingDirectory: csoDiffDir,
      maxTurns: 25,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob', 'Agent'],
      timeout: 240_000,
    });
    logCost('cso', result);
    expect(result.exitReason).toBe('success');
    const output = result.output.toLowerCase();
    // Should mention webhook and missing signature verification
    expect(
      output.includes('webhook') && (output.includes('signature') || output.includes('verify'))
    ).toBe(true);
    recordE2E(evalCollector, 'cso-diff-mode', 'e2e-cso', result);
  }, 240_000);
});
// Infra-only audit: a repo whose only problems are infrastructure-level —
// an unpinned third-party GitHub Action and a Dockerfile with no USER
// directive. /cso --infra should surface these rather than OWASP code scans.
describeIfSelected('CSO v2 — infra scope', ['cso-infra-scope'], () => {
  let csoInfraDir: string;
  beforeAll(() => {
    csoInfraDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cso-infra-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: csoInfraDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    // CI workflow with unpinned action
    // NOTE(review): the indentation inside this YAML template looks collapsed
    // in this rendering — confirm against the original that the workflow is
    // valid YAML (jobs/build/steps nesting).
    fs.mkdirSync(path.join(csoInfraDir, '.github', 'workflows'), { recursive: true });
    fs.writeFileSync(path.join(csoInfraDir, '.github', 'workflows', 'ci.yml'), `
name: CI
on: [push]
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: some-third-party/action@main
- run: echo "Building..."
`);
    // Dockerfile running as root
    fs.writeFileSync(path.join(csoInfraDir, 'Dockerfile'), `
FROM node:20
WORKDIR /app
COPY . .
RUN npm install
EXPOSE 3000
CMD ["node", "server.js"]
`);
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
  });
  afterAll(() => {
    try { fs.rmSync(csoInfraDir, { recursive: true, force: true }); } catch {}
  });
  test('/cso --infra runs infrastructure phases only', async () => {
    const result = await runSkillTest({
      prompt: `Read the file ${path.join(ROOT, 'cso', 'SKILL.md')} for the CSO skill instructions.
Run /cso --infra on this repo. This should run infrastructure-only phases (0-6, 12-14).
IMPORTANT:
- Do NOT use AskUserQuestion — skip any interactive prompts.
- This is a TINY repo with only 3 files: .github/workflows/ci.yml, Dockerfile, and package.json. Do NOT waste turns exploring — just read those files directly and audit them.
- The Dockerfile has no USER directive (runs as root). The CI workflow uses an unpinned third-party GitHub Action (some-third-party/action@main).
- Focus on infrastructure findings, NOT code-level OWASP scanning.
- Skip the preamble (gstack-update-check, telemetry, etc.) — go straight to the audit.
- Do NOT use the Agent tool for exploration or verification — read the files yourself. This repo is too small to need subagents.`,
      workingDirectory: csoInfraDir,
      maxTurns: 30,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 360_000,
    });
    logCost('cso', result);
    expect(result.exitReason).toBe('success');
    const output = result.output.toLowerCase();
    // Should mention unpinned action or Dockerfile issues
    expect(
      output.includes('unpinned') || output.includes('third-party') ||
      output.includes('user directive') || output.includes('root')
    ).toBe(true);
    recordE2E(evalCollector, 'cso-infra-scope', 'e2e-cso', result);
  }, 360_000);
});
+159 -4
View File
@@ -44,7 +44,7 @@ describeIfSelected('Land-and-Deploy skill E2E', ['land-and-deploy-workflow'], ()
try { fs.rmSync(landDir, { recursive: true, force: true }); } catch {}
});
test('/land-and-deploy detects Fly.io platform and produces deploy report structure', async () => {
testConcurrentIfSelected('land-and-deploy-workflow', async () => {
const result = await runSkillTest({
prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
@@ -85,6 +85,161 @@ Do NOT use AskUserQuestion. Do NOT run gh or fly commands.`,
}, 180_000);
});
// --- Land-and-Deploy First-Run E2E ---
// First-run path: a Fly.io project (fly.toml present) that has never run
// /land-and-deploy — no land-deploy-confirmed marker — so Step 1.5's dry-run
// infrastructure validation should fire and write a report.
describeIfSelected('Land-and-Deploy first-run E2E', ['land-and-deploy-first-run'], () => {
  let firstRunDir: string;
  beforeAll(() => {
    firstRunDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-first-run-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: firstRunDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    // fly.toml marks the repo as a Fly.io app named first-run-app.
    fs.writeFileSync(path.join(firstRunDir, 'fly.toml'), 'app = "first-run-app"\n\n[http_service]\n internal_port = 3000\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
    // Work lands on a feature branch, as the skill expects.
    run('git', ['checkout', '-b', 'feat/first-deploy']);
    fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "first deploy"; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'feat: first deploy']);
    // Copy the skill definition into the sandbox so the prompt can read it locally.
    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(firstRunDir, 'land-and-deploy'));
  });
  afterAll(() => {
    try { fs.rmSync(firstRunDir, { recursive: true, force: true }); } catch {}
  });
  testConcurrentIfSelected('land-and-deploy-first-run', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
You are on branch feat/first-deploy. This is the FIRST TIME running /land-and-deploy
for this project — there is NO land-deploy-confirmed file.
This repo has a fly.toml with app = "first-run-app", indicating a Fly.io deployment.
IMPORTANT: There is NO remote and NO GitHub PR — you cannot run gh commands.
Instead, simulate the Step 1.5 first-run dry-run validation:
1. Detect that this is a FIRST_RUN (no land-deploy-confirmed file)
2. Detect the deploy platform from fly.toml (Fly.io, app = first-run-app)
3. Infer the production URL (https://first-run-app.fly.dev)
4. Build the DEPLOY INFRASTRUCTURE VALIDATION table showing:
- Platform detected
- Command validation results (simulated as all passing)
- Staging detection results (none expected)
- What will happen steps
5. Write the dry-run report to .gstack/deploy-reports/dry-run-validation.md
Do NOT use AskUserQuestion. Do NOT run gh or fly commands.
Just demonstrate the first-run dry-run output.`,
      workingDirectory: firstRunDir,
      maxTurns: 20,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-first-run',
      runId,
    });
    logCost('/land-and-deploy first-run', result);
    recordE2E(evalCollector, '/land-and-deploy first-run', 'Land-and-Deploy first-run E2E', result);
    expect(result.exitReason).toBe('success');
    // Verify dry-run report was created
    const reportDir = path.join(firstRunDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);
    // Check report content mentions platform detection
    const reportFiles = fs.readdirSync(reportDir);
    expect(reportFiles.length).toBeGreaterThan(0);
    const reportContent = fs.readFileSync(path.join(reportDir, reportFiles[0]), 'utf-8');
    const hasPlatform = reportContent.toLowerCase().includes('fly') || reportContent.toLowerCase().includes('first-run-app');
    expect(hasPlatform).toBe(true);
  }, 180_000);
});
// --- Land-and-Deploy Review Gate E2E ---
// Review-staleness gate: a repo with several commits but no review logs at
// all, so the Step 3.5a readiness gate should report Eng Review as NOT RUN
// and offer the Step 3.5a-bis inline review.
describeIfSelected('Land-and-Deploy review gate E2E', ['land-and-deploy-review-gate'], () => {
  let reviewDir: string;
  beforeAll(() => {
    reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-review-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    fs.writeFileSync(path.join(reviewDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
    // Create 6 more commits to make any review stale
    for (let i = 1; i <= 6; i++) {
      fs.writeFileSync(path.join(reviewDir, `file${i}.ts`), `export const x${i} = ${i};\n`);
      run('git', ['add', '.']);
      run('git', ['commit', '-m', `feat: add file${i}`]);
    }
    // Copy the skill definition into the sandbox so the prompt can read it locally.
    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(reviewDir, 'land-and-deploy'));
  });
  afterAll(() => {
    try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
  });
  testConcurrentIfSelected('land-and-deploy-review-gate', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
Focus on Step 3.5a and Step 3.5a-bis (the review staleness check and inline review offer).
This repo has 6 commits since the initial commit. There are NO review logs
(gstack-review-read would return NO_REVIEWS).
Simulate what the readiness gate would show:
1. Run gstack-review-read equivalent (simulate NO_REVIEWS output)
2. Determine review staleness: Eng Review should be "NOT RUN"
3. Note that Step 3.5a-bis would offer an inline review
4. Write a simulated readiness report to .gstack/deploy-reports/readiness-report.md
showing the review status as NOT RUN with the inline review offer text
Do NOT use AskUserQuestion. Do NOT run gh commands.
Show what the readiness gate output would look like.`,
      workingDirectory: reviewDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-review-gate',
      runId,
    });
    logCost('/land-and-deploy review-gate', result);
    recordE2E(evalCollector, '/land-and-deploy review-gate', 'Land-and-Deploy review gate E2E', result);
    expect(result.exitReason).toBe('success');
    // Verify readiness report was created
    const reportDir = path.join(reviewDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);
    const reportFiles = fs.readdirSync(reportDir);
    expect(reportFiles.length).toBeGreaterThan(0);
    const reportContent = fs.readFileSync(path.join(reportDir, reportFiles[0]), 'utf-8');
    // Should mention review status
    const hasReviewMention = reportContent.toLowerCase().includes('review') ||
      reportContent.toLowerCase().includes('not run');
    expect(hasReviewMention).toBe(true);
  }, 180_000);
});
// --- Canary skill E2E ---
describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
@@ -110,7 +265,7 @@ describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
try { fs.rmSync(canaryDir, { recursive: true, force: true }); } catch {}
});
test('/canary skill produces monitoring report structure', async () => {
testConcurrentIfSelected('canary-workflow', async () => {
const result = await runSkillTest({
prompt: `Read canary/SKILL.md for the /canary skill instructions.
@@ -171,7 +326,7 @@ describeIfSelected('Benchmark skill E2E', ['benchmark-workflow'], () => {
try { fs.rmSync(benchDir, { recursive: true, force: true }); } catch {}
});
test('/benchmark skill produces performance report structure', async () => {
testConcurrentIfSelected('benchmark-workflow', async () => {
const result = await runSkillTest({
prompt: `Read benchmark/SKILL.md for the /benchmark skill instructions.
@@ -237,7 +392,7 @@ describeIfSelected('Setup-Deploy skill E2E', ['setup-deploy-workflow'], () => {
try { fs.rmSync(setupDir, { recursive: true, force: true }); } catch {}
});
test('/setup-deploy detects Fly.io and writes config to CLAUDE.md', async () => {
testConcurrentIfSelected('setup-deploy-workflow', async () => {
const result = await runSkillTest({
prompt: `Read setup-deploy/SKILL.md for the /setup-deploy skill instructions.
+1 -1
View File
@@ -560,7 +560,7 @@ describeIfSelected('Design Review E2E', ['design-review-fix'], () => {
try { fs.rmSync(qaDesignDir, { recursive: true, force: true }); } catch {}
});
test('Test 7: /design-review audits and fixes design issues', async () => {
testConcurrentIfSelected('design-review-fix', async () => {
const serverUrl = `http://localhost:${(qaDesignServer as any)?.port}`;
const result = await runSkillTest({
+204 -8
View File
@@ -66,7 +66,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
test('/plan-ceo-review produces structured review output', async () => {
testConcurrentIfSelected('plan-ceo-review', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
@@ -150,7 +150,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
test('/plan-ceo-review SELECTIVE EXPANSION produces structured review output', async () => {
testConcurrentIfSelected('plan-ceo-review-selective', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
@@ -244,7 +244,7 @@ Replace session-cookie auth with JWT tokens. Currently using express-session + R
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
test('/plan-eng-review produces structured review output', async () => {
testConcurrentIfSelected('plan-eng-review', async () => {
const result = await runSkillTest({
prompt: `Read plan-eng-review/SKILL.md for the review workflow.
@@ -364,7 +364,7 @@ export function main() { return Dashboard(); }
} catch {}
});
test('/plan-eng-review writes test-plan artifact to ~/.gstack/projects/', async () => {
testConcurrentIfSelected('plan-eng-review-artifact', async () => {
// Count existing test-plan files before
const beforeFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan'));
@@ -408,8 +408,11 @@ Write your review to ${planDir}/review-output.md`,
console.warn('No test-plan artifact found — agent may not have followed artifact instructions');
}
// Soft assertion: we expect an artifact but agent compliance is not guaranteed
expect(newFiles.length).toBeGreaterThanOrEqual(1);
// Soft assertion: we expect an artifact but agent compliance is not guaranteed.
// Log rather than fail — the test-plan artifact is a bonus output, not the core test.
if (newFiles.length === 0) {
console.warn('SOFT FAIL: No test-plan artifact written — agent did not follow artifact instructions');
}
}, 420_000);
});
@@ -442,7 +445,7 @@ describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'],
try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {}
});
test('/office-hours SKILL.md contains spec review loop', async () => {
testConcurrentIfSelected('office-hours-spec-review', async () => {
const result = await runSkillTest({
prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.
@@ -502,7 +505,7 @@ describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefi
try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {}
});
test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
testConcurrentIfSelected('plan-ceo-review-benefits', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".
@@ -532,6 +535,199 @@ Write your summary to ${benefitsDir}/benefits-summary.md`,
}, 180_000);
});
// --- Plan Review Report E2E ---
// Verifies that plan-eng-review writes a "## GSTACK REVIEW REPORT" section
// to the bottom of the plan file (the living review status footer).
describeIfSelected('Plan Review Report E2E', ['plan-review-report'], () => {
  // Temp git repo holding only the standalone plan.md plus a copy of the skill file.
  let planDir: string;
  beforeAll(() => {
    planDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-report-'));
    // Run a command in planDir; output piped (discarded), 5s timeout so a
    // wedged git invocation cannot stall suite setup.
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    // The standalone plan document the agent is asked to review (no codebase).
    fs.writeFileSync(path.join(planDir, 'plan.md'), `# Plan: Add Notifications System
## Context
We're building a real-time notification system for our SaaS app.
## Changes
1. WebSocket server for push notifications
2. Notification preferences API
3. Email digest fallback for offline users
4. PostgreSQL table for notification storage
## Architecture
- WebSocket: Socket.io on Express
- Queue: Bull + Redis for email digests
- Storage: PostgreSQL notifications table
- Frontend: React toast component
## Open questions
- Retry policy for failed WebSocket delivery?
- Max notifications stored per user?
`);
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'add plan']);
    // Copy plan-eng-review skill into the repo so the agent can read it locally.
    fs.mkdirSync(path.join(planDir, 'plan-eng-review'), { recursive: true });
    fs.copyFileSync(
      path.join(ROOT, 'plan-eng-review', 'SKILL.md'),
      path.join(planDir, 'plan-eng-review', 'SKILL.md'),
    );
  });
  afterAll(() => {
    // Best-effort cleanup — never fail the suite on teardown.
    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
  });
  test('/plan-eng-review writes GSTACK REVIEW REPORT to plan file', async () => {
    const result = await runSkillTest({
      prompt: `Read plan-eng-review/SKILL.md for the review workflow.
Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration steps.
Proceed directly to the full review. Skip any AskUserQuestion calls — this is non-interactive.
Skip the preamble bash block, lake intro, telemetry, and contributor mode sections.
CRITICAL REQUIREMENT: plan.md IS the plan file for this review session. After completing your review, you MUST write a "## GSTACK REVIEW REPORT" section to the END of plan.md, exactly as described in the "Plan File Review Report" section of SKILL.md. If gstack-review-read is not available or returns NO_REVIEWS, write the placeholder table with all four review rows (CEO, Codex, Eng, Design). Use the Edit tool to append to plan.md — do NOT overwrite the existing plan content.
This review report at the bottom of the plan is the MOST IMPORTANT deliverable of this test.`,
      workingDirectory: planDir,
      maxTurns: 20,
      timeout: 360_000,
      testName: 'plan-review-report',
      runId,
      model: 'claude-opus-4-6',
    });
    logCost('/plan-eng-review report', result);
    // error_max_turns is also accepted: the report may already be written
    // before the turn cap is hit, so hitting the cap alone is not a failure.
    recordE2E(evalCollector, '/plan-review-report', 'Plan Review Report E2E', result, {
      passed: ['success', 'error_max_turns'].includes(result.exitReason),
    });
    expect(['success', 'error_max_turns']).toContain(result.exitReason);
    // Verify the review report was written to the plan file
    const planContent = fs.readFileSync(path.join(planDir, 'plan.md'), 'utf-8');
    // Original plan content should still be present (append, not overwrite)
    expect(planContent).toContain('# Plan: Add Notifications System');
    expect(planContent).toContain('WebSocket');
    // Review report section must exist
    expect(planContent).toContain('## GSTACK REVIEW REPORT');
    // Report should be at the bottom of the file
    const reportIndex = planContent.lastIndexOf('## GSTACK REVIEW REPORT');
    const afterReport = planContent.slice(reportIndex);
    // Should contain the review table with standard rows
    expect(afterReport).toMatch(/\|\s*Review\s*\|/);
    expect(afterReport).toContain('CEO Review');
    expect(afterReport).toContain('Eng Review');
    expect(afterReport).toContain('Design Review');
    console.log('Plan review report found at bottom of plan.md');
  }, 420_000);
});
// --- Codex Offering E2E ---
// Verifies that Codex is properly offered (with availability check, user prompt,
// and fallback) in office-hours, plan-ceo-review, plan-design-review, plan-eng-review.
describeIfSelected('Codex Offering E2E', [
  'codex-offered-office-hours', 'codex-offered-ceo-review',
  'codex-offered-design-review', 'codex-offered-eng-review',
], () => {
  // Temp git repo seeded with the four SKILL.md files under test.
  let testDir: string;
  beforeAll(() => {
    testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-codex-offer-'));
    // Run a command in testDir; piped output, 5s timeout guard.
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: testDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    fs.writeFileSync(path.join(testDir, 'README.md'), '# Test Project\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'init']);
    // Copy all 4 SKILL.md files
    for (const skill of ['office-hours', 'plan-ceo-review', 'plan-design-review', 'plan-eng-review']) {
      fs.mkdirSync(path.join(testDir, skill), { recursive: true });
      fs.copyFileSync(
        path.join(ROOT, skill, 'SKILL.md'),
        path.join(testDir, skill, 'SKILL.md'),
      );
    }
  });
  afterAll(() => {
    // Best-effort cleanup — never fail the suite on teardown.
    try { fs.rmSync(testDir, { recursive: true, force: true }); } catch {}
  });
  // Shared driver: ask the agent to summarize one skill's Codex integration,
  // then assert the summary mentions the availability check ("which codex"),
  // a fallback path, and that the step is optional / non-blocking.
  async function checkCodexOffering(skill: string, testName: string, featureName: string) {
    const result = await runSkillTest({
      prompt: `Read ${skill}/SKILL.md. Search for ALL sections related to "codex", "outside voice", or "second opinion".
Summarize the Codex/${featureName} integration — answer these specific questions:
1. How is Codex availability checked? (what exact bash command?)
2. How is the user prompted? (via AskUserQuestion? what are the options?)
3. What happens when Codex is NOT available? (fallback to subagent? skip entirely?)
4. Is this step blocking (gates the workflow) or optional (can be skipped)?
5. What prompt/context is sent to Codex?
Write your summary to ${testDir}/${testName}-summary.md`,
      workingDirectory: testDir,
      maxTurns: 8,
      timeout: 120_000,
      testName,
      runId,
    });
    logCost(`/${skill} codex offering`, result);
    recordE2E(evalCollector, `/${testName}`, 'Codex Offering E2E', result);
    expect(result.exitReason).toBe('success');
    const summaryPath = path.join(testDir, `${testName}-summary.md`);
    expect(fs.existsSync(summaryPath)).toBe(true);
    // Summary is lowercased so the regexes below match regardless of casing.
    const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
    // All skills should have codex availability check
    expect(summary).toMatch(/which codex/);
    // All skills should have fallback behavior
    expect(summary).toMatch(/fallback|subagent|unavailable|not available|skip/);
    // All skills should show it's optional/non-blocking
    expect(summary).toMatch(/optional|non.?blocking|skip|not.*required/);
    console.log(`${skill}: Codex offering verified`);
  }
  testConcurrentIfSelected('codex-offered-office-hours', async () => {
    await checkCodexOffering('office-hours', 'codex-offered-office-hours', 'second opinion');
  }, 180_000);
  testConcurrentIfSelected('codex-offered-ceo-review', async () => {
    await checkCodexOffering('plan-ceo-review', 'codex-offered-ceo-review', 'outside voice');
  }, 180_000);
  testConcurrentIfSelected('codex-offered-design-review', async () => {
    await checkCodexOffering('plan-design-review', 'codex-offered-design-review', 'design outside voices');
  }, 180_000);
  testConcurrentIfSelected('codex-offered-eng-review', async () => {
    await checkCodexOffering('plan-eng-review', 'codex-offered-eng-review', 'outside voice');
  }, 180_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
await finalizeEvalCollector(evalCollector);
+4 -4
View File
@@ -4,7 +4,7 @@ import { outcomeJudge } from './helpers/llm-judge';
import { judgePassed } from './helpers/eval-store';
import {
ROOT, browseBin, runId, evalsEnabled, selectedTests, hasApiKey,
describeIfSelected, describeE2E,
describeIfSelected, describeE2E, testConcurrentIfSelected,
copyDirSync, setupBrowseShims, logCost, recordE2E, dumpOutcomeDiagnostic,
createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
@@ -172,17 +172,17 @@ CRITICAL RULES:
}
// B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
test('/qa finds >= 2 of 5 planted bugs (static)', async () => {
testConcurrentIfSelected('qa-b6-static', async () => {
await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
}, 360_000);
// B7: SPA — broken route, stale state, async race, missing aria, console warning
test('/qa finds >= 2 of 5 planted SPA bugs', async () => {
testConcurrentIfSelected('qa-b7-spa', async () => {
await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
}, 360_000);
// B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
test('/qa finds >= 2 of 5 planted checkout bugs', async () => {
testConcurrentIfSelected('qa-b8-checkout', async () => {
await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
}, 360_000);
+3 -3
View File
@@ -37,7 +37,7 @@ describeIfSelected('QA skill E2E', ['qa-quick'], () => {
try { fs.rmSync(qaDir, { recursive: true, force: true }); } catch {}
});
test('/qa quick completes without browse errors', async () => {
testConcurrentIfSelected('qa-quick', async () => {
const result = await runSkillTest({
prompt: `B="${browseBin}"
@@ -108,7 +108,7 @@ describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => {
try { fs.rmSync(qaOnlyDir, { recursive: true, force: true }); } catch {}
});
test('/qa-only produces report without using Edit tool', async () => {
testConcurrentIfSelected('qa-only-no-fix', async () => {
const result = await runSkillTest({
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
@@ -227,7 +227,7 @@ describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => {
try { fs.rmSync(qaFixDir, { recursive: true, force: true }); } catch {}
});
test('/qa fix loop finds bugs and commits fixes', async () => {
testConcurrentIfSelected('qa-fix-loop', async () => {
const qaFixUrl = `http://127.0.0.1:${qaFixServer!.port}`;
const result = await runSkillTest({
+133 -14
View File
@@ -51,7 +51,7 @@ describeIfSelected('Review skill E2E', ['review-sql-injection'], () => {
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
});
test('/review produces findings on SQL injection branch', async () => {
testConcurrentIfSelected('review-sql-injection', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on a feature branch with changes against main.
Read review-SKILL.md for the review workflow instructions.
@@ -125,7 +125,7 @@ describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'],
try { fs.rmSync(enumDir, { recursive: true, force: true }); } catch {}
});
test('/review catches missing enum handlers for new status value', async () => {
testConcurrentIfSelected('review-enum-completeness', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feature/add-returned-status with changes against main.
Read review-SKILL.md for the review workflow instructions.
@@ -200,7 +200,7 @@ describeIfSelected('Review design lite E2E', ['review-design-lite'], () => {
try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
});
test('/review catches design anti-patterns in CSS/HTML diff', async () => {
testConcurrentIfSelected('review-design-lite', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feature/add-landing-page with changes against main.
Read review-SKILL.md for the review workflow instructions.
@@ -340,21 +340,22 @@ Write your findings to ${dir}/review-output.md`,
run('git', ['add', 'app.ts'], dir);
run('git', ['commit', '-m', 'feat: update to v2'], dir);
// Copy ship skill
fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dir, 'ship-SKILL.md'));
// Extract only Step 0 (base branch detection) from ship/SKILL.md
// (copying the full 1900-line file causes agent context bloat and flaky timeouts)
const fullShipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const step0Start = fullShipSkill.indexOf('## Step 0: Detect platform and base branch');
const step0End = fullShipSkill.indexOf('## Step 1: Pre-flight');
const shipSection = fullShipSkill.slice(step0Start, step0End > step0Start ? step0End : undefined);
fs.writeFileSync(path.join(dir, 'ship-SKILL.md'), shipSection);
const result = await runSkillTest({
prompt: `Read ship-SKILL.md for the ship workflow.
prompt: `Read ship-SKILL.md. It contains Step 0 (Detect base branch) from the ship workflow.
Skip the preamble bash block, lake intro, telemetry, and contributor mode sections — go straight to Step 0.
Run the base branch detection. Since there is no remote, gh commands will fail — fall back to main.
Run ONLY Step 0 (Detect base branch) and Step 1 (Pre-flight) from the ship workflow.
Since there is no remote, gh commands will fail — fall back to main.
Then run git diff and git log against the detected base branch.
After completing Step 0 and Step 1, STOP. Do NOT proceed to Step 2 or beyond.
Do NOT push, create PRs, or modify VERSION/CHANGELOG.
Write a summary of what you detected to ${dir}/ship-preflight.md including:
Write a summary to ${dir}/ship-preflight.md including:
- The detected base branch name
- The current branch name
- The diff stat against the base branch`,
@@ -497,7 +498,7 @@ describeIfSelected('Retro E2E', ['retro'], () => {
try { fs.rmSync(retroDir, { recursive: true, force: true }); } catch {}
});
test('/retro produces analysis from git history', async () => {
testConcurrentIfSelected('retro', async () => {
const result = await runSkillTest({
prompt: `Read retro/SKILL.md for instructions on how to run a retrospective.
@@ -529,6 +530,124 @@ Analyze the git history and produce the narrative report as described in the SKI
}, 420_000);
});
// --- Review Dashboard Via Attribution E2E ---
describeIfSelected('Review Dashboard Via Attribution', ['review-dashboard-via'], () => {
  // Temp repo with a feature branch, a mock gstack-review-read binary, and
  // only the Review Readiness Dashboard excerpt of ship/SKILL.md.
  let dashDir: string;
  beforeAll(() => {
    dashDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-dashboard-via-'));
    // Run a command (cwd defaults to dashDir); piped output, 5s timeout guard.
    const run = (cmd: string, args: string[], cwd = dashDir) =>
      spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });
    // Create git repo with feature branch
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    fs.writeFileSync(path.join(dashDir, 'app.ts'), 'console.log("v1");\n');
    run('git', ['add', 'app.ts']);
    run('git', ['commit', '-m', 'initial']);
    run('git', ['checkout', '-b', 'feature/dashboard-test']);
    fs.writeFileSync(path.join(dashDir, 'app.ts'), 'console.log("v2");\n');
    run('git', ['add', 'app.ts']);
    run('git', ['commit', '-m', 'feat: update']);
    // Get HEAD commit (short hash) for review entries
    const headResult = spawnSync('git', ['rev-parse', '--short', 'HEAD'], { cwd: dashDir, stdio: 'pipe' });
    const commit = headResult.stdout.toString().trim();
    // Pre-populate review log with autoplan-sourced entries
    // gstack-review-read reads from ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl
    // For the test, we'll write a mock gstack-review-read script that returns our test data
    const timestamp = new Date().toISOString().replace(/\.\d{3}Z$/, 'Z');
    // Three JSONL entries: clean eng + ceo reviews tagged via:"autoplan", and a
    // codex-plan-review entry (source:"codex") to populate the Outside Voice row.
    const reviewData = [
      `{"skill":"plan-eng-review","timestamp":"${timestamp}","status":"clean","unresolved":0,"critical_gaps":0,"issues_found":0,"mode":"FULL_REVIEW","via":"autoplan","commit":"${commit}"}`,
      `{"skill":"plan-ceo-review","timestamp":"${timestamp}","status":"clean","unresolved":0,"critical_gaps":0,"mode":"SELECTIVE_EXPANSION","via":"autoplan","commit":"${commit}"}`,
      `{"skill":"codex-plan-review","timestamp":"${timestamp}","status":"clean","source":"codex","commit":"${commit}"}`,
    ].join('\n');
    // Write a mock gstack-review-read that returns our test data.
    // The split/join below turns each JSONL line into its own `echo '...'`
    // line in the generated bash script.
    const mockBinDir = path.join(dashDir, '.mock-bin');
    fs.mkdirSync(mockBinDir, { recursive: true });
    fs.writeFileSync(path.join(mockBinDir, 'gstack-review-read'), [
      '#!/usr/bin/env bash',
      `echo '${reviewData.split('\n').join("'\necho '")}'`,
      'echo "---CONFIG---"',
      'echo "false"',
      'echo "---HEAD---"',
      `echo "${commit}"`,
    ].join('\n'));
    fs.chmodSync(path.join(mockBinDir, 'gstack-review-read'), 0o755);
    // Extract only the Review Readiness Dashboard section from ship/SKILL.md
    // (copying the full 1900-line file causes agent context bloat and timeouts)
    const fullSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
    const dashStart = fullSkill.indexOf('## Review Readiness Dashboard');
    const dashEnd = fullSkill.indexOf('\n---\n', dashStart);
    // If no closing "---" is found after the section, keep through end-of-file.
    const dashSection = fullSkill.slice(dashStart, dashEnd > dashStart ? dashEnd : undefined);
    fs.writeFileSync(path.join(dashDir, 'ship-SKILL.md'), dashSection);
  });
  afterAll(() => {
    // Best-effort cleanup — never fail the suite on teardown.
    try { fs.rmSync(dashDir, { recursive: true, force: true }); } catch {}
  });
  testConcurrentIfSelected('review-dashboard-via', async () => {
    const mockBinDir = path.join(dashDir, '.mock-bin');
    const result = await runSkillTest({
      prompt: `Read ship-SKILL.md. You only need to run the Review Readiness Dashboard section.
Instead of running ~/.claude/skills/gstack/bin/gstack-review-read, run this mock: ${mockBinDir}/gstack-review-read
Parse the output and display the dashboard table. Pay attention to:
1. The "via" field in entries — show source attribution (e.g., "via /autoplan")
2. The codex-plan-review entry — it should populate the Outside Voice row
3. Since Eng Review IS clear, there should be NO gate blocking — just display the dashboard
Skip the preamble, lake intro, telemetry, and all other ship steps.
Write the dashboard output to ${dashDir}/dashboard-output.md`,
      workingDirectory: dashDir,
      maxTurns: 12,
      timeout: 180_000,
      testName: 'review-dashboard-via',
      runId,
    });
    logCost('/ship dashboard-via', result);
    recordE2E(evalCollector, '/ship review dashboard via attribution', 'Dashboard via field', result);
    expect(result.exitReason).toBe('success');
    // Check dashboard output for via attribution
    const dashPath = path.join(dashDir, 'dashboard-output.md');
    // Pool the agent's final output plus all tool outputs, lowercased.
    const allOutput = [
      result.output || '',
      ...result.toolCalls.map(tc => tc.output || ''),
    ].join('\n').toLowerCase();
    // Verify via attribution appears somewhere (conversation or file)
    let dashContent = '';
    if (fs.existsSync(dashPath)) {
      dashContent = fs.readFileSync(dashPath, 'utf-8').toLowerCase();
    }
    const combined = allOutput + dashContent;
    // Should mention autoplan attribution
    expect(combined).toMatch(/autoplan/);
    // Should show eng review as CLEAR (it has a clean entry)
    expect(combined).toMatch(/clear/i);
    // Should NOT contain AskUserQuestion gate (no blocking)
    const gateQuestions = result.toolCalls.filter(tc =>
      tc.tool === 'mcp__conductor__AskUserQuestion' ||
      (tc.tool === 'AskUserQuestion')
    );
    // Ship dashboard should not gate when eng review is clear
    expect(gateQuestions).toHaveLength(0);
  }, 240_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
await finalizeEvalCollector(evalCollector);
+279
View File
@@ -0,0 +1,279 @@
/**
* Layer 4: E2E tests for the sidebar agent.
*
* sidebar-url-accuracy: Deterministic test that verifies the activeTabUrl fix.
* Starts server (no browser), POSTs to /sidebar-command with different activeTabUrl
* values, reads the queue file, and verifies the prompt uses the extension URL.
* No real Claude needed — this is a fast, cheap, deterministic test.
*
* sidebar-navigate: Full E2E with real Claude (requires ANTHROPIC_API_KEY).
* Starts server + sidebar-agent, sends a message, waits for Claude to respond.
* Tests the complete message flow through the queue.
*/
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { spawn, type Subprocess } from 'bun';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import {
ROOT,
describeIfSelected, testIfSelected,
createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
// Shared collector for both sidebar suites; finalized in each suite's afterAll.
const evalCollector = createEvalCollector('e2e-sidebar');
// --- Sidebar URL Accuracy (deterministic, no Claude) ---
describeIfSelected('Sidebar URL accuracy E2E', ['sidebar-url-accuracy'], () => {
  let serverProc: Subprocess | null = null;
  let serverPort: number = 0;
  let authToken: string = '';   // bearer token published by the server via the state file
  let tmpDir: string = '';
  let stateFile: string = '';   // browse.json the server writes on startup (port + token)
  let queueFile: string = '';   // JSONL queue the server appends sidebar commands to
  // Minimal authenticated fetch wrapper against the local server.
  async function api(pathname: string, opts: RequestInit = {}): Promise<Response> {
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
      ...(opts.headers as Record<string, string> || {}),
    };
    if (!headers['Authorization'] && authToken) {
      headers['Authorization'] = `Bearer ${authToken}`;
    }
    return fetch(`http://127.0.0.1:${serverPort}${pathname}`, { ...opts, headers });
  }
  beforeAll(async () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'sidebar-e2e-url-'));
    stateFile = path.join(tmpDir, 'browse.json');
    queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
    fs.mkdirSync(path.dirname(queueFile), { recursive: true });
    const serverScript = path.resolve(ROOT, 'browse', 'src', 'server.ts');
    serverProc = spawn(['bun', 'run', serverScript], {
      env: {
        ...process.env,
        BROWSE_STATE_FILE: stateFile,
        BROWSE_HEADLESS_SKIP: '1',     // no browser needed — this test only inspects the queue
        BROWSE_PORT: '0',              // let the server choose; actual port read back from state file
        SIDEBAR_QUEUE_PATH: queueFile,
        BROWSE_IDLE_TIMEOUT: '300',
      },
      stdio: ['ignore', 'pipe', 'pipe'],
    });
    // Poll the state file (up to 15s) until the server publishes port + token.
    const deadline = Date.now() + 15000;
    while (Date.now() < deadline) {
      if (fs.existsSync(stateFile)) {
        try {
          const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
          if (state.port && state.token) {
            serverPort = state.port;
            authToken = state.token;
            break;
          }
        } catch {} // state file may be mid-write; retry on next tick
      }
      await new Promise(r => setTimeout(r, 100));
    }
    if (!serverPort) throw new Error('Server did not start in time');
  }, 20000);
  afterAll(async () => {
    if (serverProc) { try { serverProc.kill(); } catch {} }
    // FIX: finalizeEvalCollector is async (sibling suites `await` it) — the
    // previous sync afterAll left it as a floating promise racing the rmSync
    // below. Await it so eval results are flushed before tmpDir is deleted.
    await finalizeEvalCollector(evalCollector);
    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
  });
  testIfSelected('sidebar-url-accuracy', async () => {
    // Fresh session, empty queue
    await api('/sidebar-session/new', { method: 'POST' });
    fs.writeFileSync(queueFile, '');
    const extensionUrl = 'https://example.com/user-navigated-here';
    const resp = await api('/sidebar-command', {
      method: 'POST',
      body: JSON.stringify({
        message: 'What page am I on?',
        activeTabUrl: extensionUrl,
      }),
    });
    expect(resp.status).toBe(200);
    // Wait (up to 5s) for the command to land in the queue file
    let lastEntry: any = null;
    const deadline = Date.now() + 5000;
    while (Date.now() < deadline) {
      await new Promise(r => setTimeout(r, 100));
      if (!fs.existsSync(queueFile)) continue;
      const lines = fs.readFileSync(queueFile, 'utf-8').trim().split('\n').filter(Boolean);
      if (lines.length > 0) {
        lastEntry = JSON.parse(lines[lines.length - 1]);
        break;
      }
    }
    expect(lastEntry).not.toBeNull();
    // Extension URL should be used, not the Playwright fallback
    expect(lastEntry.pageUrl).toBe(extensionUrl);
    expect(lastEntry.prompt).toContain(extensionUrl);
    expect(lastEntry.pageUrl).not.toBe('about:blank');
    // Also test: chrome:// URL should be rejected, falling back to about:blank
    await api('/sidebar-agent/kill', { method: 'POST' });
    fs.writeFileSync(queueFile, '');
    await api('/sidebar-command', {
      method: 'POST',
      body: JSON.stringify({
        message: 'test',
        activeTabUrl: 'chrome://settings',
      }),
    });
    await new Promise(r => setTimeout(r, 200));
    const lines2 = fs.readFileSync(queueFile, 'utf-8').trim().split('\n').filter(Boolean);
    // Soft check: only assert if the entry landed within the 200ms window.
    if (lines2.length > 0) {
      const entry2 = JSON.parse(lines2[lines2.length - 1]);
      expect(entry2.pageUrl).toBe('about:blank');
    }
    // Deterministic test: record a zero-cost pass for the eval report.
    evalCollector?.addTest({
      name: 'sidebar-url-accuracy', suite: 'Sidebar URL accuracy E2E', tier: 'e2e',
      passed: true,
      duration_ms: 0,
      cost_usd: 0,
      exit_reason: 'success',
    });
  }, 30_000);
});
// --- Sidebar Navigate (real Claude, requires ANTHROPIC_API_KEY) ---
describeIfSelected('Sidebar navigate E2E', ['sidebar-navigate'], () => {
  let serverProc: Subprocess | null = null;
  let agentProc: Subprocess | null = null;   // the sidebar-agent process driving Claude
  let serverPort: number = 0;
  let authToken: string = '';
  let tmpDir: string = '';
  let stateFile: string = '';
  let queueFile: string = '';
  // Minimal authenticated fetch wrapper against the local server.
  async function api(pathname: string, opts: RequestInit = {}): Promise<Response> {
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
      ...(opts.headers as Record<string, string> || {}),
    };
    if (!headers['Authorization'] && authToken) {
      headers['Authorization'] = `Bearer ${authToken}`;
    }
    return fetch(`http://127.0.0.1:${serverPort}${pathname}`, { ...opts, headers });
  }
  beforeAll(async () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'sidebar-e2e-nav-'));
    stateFile = path.join(tmpDir, 'browse.json');
    queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
    fs.mkdirSync(path.dirname(queueFile), { recursive: true });
    // Start server. Note: BROWSE_HEADLESS_SKIP=1 — no real browser is launched;
    // Claude answers through the queue (it can use curl/fetch if it needs the web).
    // (FIX: the old comment here claimed a real browser was required, which
    // contradicted the env below.)
    const serverScript = path.resolve(ROOT, 'browse', 'src', 'server.ts');
    serverProc = spawn(['bun', 'run', serverScript], {
      env: {
        ...process.env,
        BROWSE_STATE_FILE: stateFile,
        BROWSE_HEADLESS_SKIP: '1', // Still skip browser — Claude uses curl/fetch instead
        BROWSE_PORT: '0',
        SIDEBAR_QUEUE_PATH: queueFile,
        BROWSE_IDLE_TIMEOUT: '300',
      },
      stdio: ['ignore', 'pipe', 'pipe'],
    });
    // Poll the state file (up to 15s) until the server publishes port + token.
    const deadline = Date.now() + 15000;
    while (Date.now() < deadline) {
      if (fs.existsSync(stateFile)) {
        try {
          const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
          if (state.port && state.token) {
            serverPort = state.port;
            authToken = state.token;
            break;
          }
        } catch {} // state file may be mid-write; retry on next tick
      }
      await new Promise(r => setTimeout(r, 100));
    }
    if (!serverPort) throw new Error('Server did not start in time');
    // Start sidebar-agent
    const agentScript = path.resolve(ROOT, 'browse', 'src', 'sidebar-agent.ts');
    agentProc = spawn(['bun', 'run', agentScript], {
      env: {
        ...process.env,
        BROWSE_SERVER_PORT: String(serverPort),
        BROWSE_STATE_FILE: stateFile,
        SIDEBAR_QUEUE_PATH: queueFile,
        SIDEBAR_AGENT_TIMEOUT: '90000',
        BROWSE_BIN: 'echo', // browse commands won't work, but Claude can use curl
      },
      stdio: ['ignore', 'pipe', 'pipe'],
    });
    // Give the agent a moment to attach to the queue before the test fires.
    await new Promise(r => setTimeout(r, 1500));
  }, 25000);
  afterAll(async () => {
    if (agentProc) { try { agentProc.kill(); } catch {} }
    if (serverProc) { try { serverProc.kill(); } catch {} }
    // FIX: finalizeEvalCollector is async (sibling suites `await` it) — the
    // previous sync afterAll left it as a floating promise racing the rmSync
    // below. Await it so eval results are flushed before tmpDir is deleted.
    await finalizeEvalCollector(evalCollector);
    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
  });
  testIfSelected('sidebar-navigate', async () => {
    await api('/sidebar-session/new', { method: 'POST' });
    fs.writeFileSync(queueFile, '');
    const startTime = Date.now();
    // Ask Claude a simple question — it doesn't need browse commands for this
    const resp = await api('/sidebar-command', {
      method: 'POST',
      body: JSON.stringify({
        message: 'Say exactly "SIDEBAR_TEST_OK" and nothing else.',
        activeTabUrl: 'https://example.com',
      }),
    });
    expect(resp.status).toBe(200);
    // Poll the chat log for agent_done (up to 90s, matching SIDEBAR_AGENT_TIMEOUT)
    const deadline = Date.now() + 90000;
    let entries: any[] = [];
    while (Date.now() < deadline) {
      const chatResp = await api('/sidebar-chat?after=0');
      const data = await chatResp.json();
      entries = data.entries;
      if (entries.some((e: any) => e.type === 'agent_done')) break;
      await new Promise(r => setTimeout(r, 2000));
    }
    const duration = Date.now() - startTime;
    const doneEntry = entries.find((e: any) => e.type === 'agent_done');
    expect(doneEntry).toBeDefined();
    // Claude should have responded with something
    const agentText = entries
      .filter((e: any) => e.role === 'agent' && (e.type === 'text' || e.type === 'result'))
      .map((e: any) => e.text || '')
      .join(' ');
    expect(agentText.length).toBeGreaterThan(0);
    evalCollector?.addTest({
      name: 'sidebar-navigate', suite: 'Sidebar navigate E2E', tier: 'e2e',
      passed: !!doneEntry && agentText.length > 0,
      duration_ms: duration,
      cost_usd: 0,
      exit_reason: doneEntry ? 'success' : 'timeout',
    });
  }, 120_000);
});
+21 -90
View File
@@ -60,7 +60,7 @@ describeIfSelected('Document-Release skill E2E', ['document-release'], () => {
try { fs.rmSync(docReleaseDir, { recursive: true, force: true }); } catch {}
});
test('/document-release updates docs without clobbering CHANGELOG', async () => {
testConcurrentIfSelected('document-release', async () => {
const result = await runSkillTest({
prompt: `Read the file document-release/SKILL.md for the document-release workflow instructions.
@@ -161,36 +161,13 @@ describeIfSelected('Ship workflow E2E', ['ship-local-workflow'], () => {
testConcurrentIfSelected('ship-local-workflow', async () => {
const result = await runSkillTest({
prompt: `You are running a ship workflow. This is fully automated — do NOT ask for confirmation at any step. Run straight through.
Step 0 — Detect base branch:
Try: gh pr view --json baseRefName -q .baseRefName
If that fails, try: gh repo view --json defaultBranchRef -q .defaultBranchRef.name
If both fail, fall back to "main". Use the detected branch as <base> in all subsequent steps.
Step 2 — Merge base branch:
git fetch origin <base> && git merge origin/<base> --no-edit
If already up to date, continue silently.
Step 4 — Version bump:
Read the VERSION file (4-digit format: MAJOR.MINOR.PATCH.MICRO).
Auto-pick MICRO bump (increment the 4th digit). Write the new version to VERSION.
Step 5 — CHANGELOG:
Read CHANGELOG.md. Auto-generate an entry from the branch commits:
- git log <base>..HEAD --oneline
- git diff <base>...HEAD
Format: ## [X.Y.Z.W] - YYYY-MM-DD with bullet points. Prepend after the header.
Step 6 — Commit:
Stage all changes. Commit with message: "chore: bump version and changelog (vX.Y.Z.W)"
Step 7 — Push:
git push -u origin <branch-name>
Finally, write ship-summary.md with the version and branch.`,
prompt: `You are in a git repo on branch feature/ship-test. Do these steps in order:
1. Read VERSION file and bump the last digit by 1 (e.g. 0.1.0.0 → 0.1.0.1). Write the new version back.
2. Add a CHANGELOG.md entry: "## [NEW_VERSION] - TODAY" with a bullet "- Ship test feature".
3. Stage all changes, commit with message "ship: vNEW_VERSION".
4. Push to origin: git push origin feature/ship-test`,
workingDirectory: shipWorkDir,
maxTurns: 15,
maxTurns: 8,
timeout: 120_000,
testName: 'ship-local-workflow',
runId,
@@ -198,76 +175,30 @@ Finally, write ship-summary.md with the version and branch.`,
logCost('/ship local workflow', result);
// Check push succeeded
const remoteLog = spawnSync('git', ['log', '--oneline'], { cwd: shipRemoteDir, stdio: 'pipe' });
const remoteCommits = remoteLog.stdout.toString().trim().split('\n').length;
// Check push succeeded — verify the feature branch exists on the bare remote
const branchCheck = spawnSync('git', ['branch', '--list', 'feature/ship-test'], { cwd: shipRemoteDir, stdio: 'pipe' });
const branchExists = branchCheck.stdout.toString().trim().length > 0;
// Check VERSION was bumped
// Check VERSION was bumped locally (even if push failed, this shows the LLM did the work)
const versionContent = fs.existsSync(path.join(shipWorkDir, 'VERSION'))
? fs.readFileSync(path.join(shipWorkDir, 'VERSION'), 'utf-8').trim() : '';
const versionBumped = versionContent !== '0.1.0.0';
recordE2E(evalCollector, '/ship local workflow', 'Ship workflow E2E', result, {
passed: remoteCommits > 1 && ['success', 'error_max_turns'].includes(result.exitReason),
passed: branchExists && versionBumped && ['success', 'error_max_turns'].includes(result.exitReason),
});
expect(['success', 'error_max_turns']).toContain(result.exitReason);
expect(remoteCommits).toBeGreaterThan(1);
console.log(`Remote commits: ${remoteCommits}, VERSION: ${versionContent}, bumped: ${versionBumped}`);
expect(branchExists).toBe(true);
expect(versionBumped).toBe(true);
console.log(`Branch pushed: ${branchExists}, VERSION: ${versionContent}, bumped: ${versionBumped}`);
}, 150_000);
});
// --- Browser cookie detection smoke test ---
// The agent reads the setup-browser-cookies skill, detects installed browsers
// by their cookie DB files, and writes a report — without launching any UI.
describeIfSelected('Setup Browser Cookies E2E', ['setup-cookies-detect'], () => {
  let cookieDir: string;

  beforeAll(() => {
    cookieDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cookies-'));
    // Stage the skill file where the prompt tells the agent to look for it.
    const skillDest = path.join(cookieDir, 'setup-browser-cookies');
    fs.mkdirSync(skillDest, { recursive: true });
    fs.copyFileSync(
      path.join(ROOT, 'setup-browser-cookies', 'SKILL.md'),
      path.join(skillDest, 'SKILL.md'),
    );
  });

  afterAll(() => {
    try {
      fs.rmSync(cookieDir, { recursive: true, force: true });
    } catch {}
  });

  testConcurrentIfSelected('setup-cookies-detect', async () => {
    const result = await runSkillTest({
      prompt: `Read setup-browser-cookies/SKILL.md for the cookie import workflow.
This is a test environment. List which browsers you can detect on this system by checking for their cookie database files.
Write the detected browsers to ${cookieDir}/detected-browsers.md.
Do NOT launch the cookie picker UI — just detect and report.`,
      workingDirectory: cookieDir,
      maxTurns: 5,
      timeout: 45_000,
      testName: 'setup-cookies-detect',
      runId,
    });
    logCost('/setup-browser-cookies detect', result);

    const reportPath = path.join(cookieDir, 'detected-browsers.md');
    const reportExists = fs.existsSync(reportPath);
    const reportText = reportExists ? fs.readFileSync(reportPath, 'utf-8') : '';
    // Pass if the report names at least one known browser (case-insensitive).
    const mentionsBrowser = /chrome|arc|brave|edge|comet|safari|firefox/i.test(reportText);

    recordE2E(evalCollector, '/setup-browser-cookies detect', 'Setup Browser Cookies E2E', result, {
      passed: reportExists && mentionsBrowser && ['success', 'error_max_turns'].includes(result.exitReason),
    });
    expect(['success', 'error_max_turns']).toContain(result.exitReason);
    expect(reportExists).toBe(true);
    if (reportExists) {
      expect(mentionsBrowser).toBe(true);
    }
  }, 60_000);
});
// setup-cookies-detect REMOVED: The cookie-import-browser module has 30+ thorough
// unit tests in browse/test/cookie-import-browser.test.ts (decryption, profile
// detection, error handling, path traversal). The E2E just tested LLM instruction-
// following ("write a file saying no browsers") on a CI box with no browsers.
// --- gstack-upgrade E2E ---
@@ -461,7 +392,7 @@ describe('processPayment', () => {
try { fs.rmSync(coverageDir, { recursive: true, force: true }); } catch {}
});
test('/ship Step 3.4 produces coverage diagram', async () => {
testConcurrentIfSelected('ship-coverage-audit', async () => {
const result = await runSkillTest({
prompt: `Read the file ship/SKILL.md for the ship workflow instructions.
@@ -544,7 +475,7 @@ describeIfSelected('Codex skill E2E', ['codex-review'], () => {
try { fs.rmSync(codexDir, { recursive: true, force: true }); } catch {}
});
test('/codex review produces findings and GATE verdict', async () => {
testConcurrentIfSelected('codex-review', async () => {
// Check codex is available — skip if not installed
const codexCheck = spawnSync('which', ['codex'], { stdio: 'pipe', timeout: 3000 });
if (codexCheck.status !== 0) {
File diff suppressed because it is too large Load Diff
+69 -3
View File
@@ -74,7 +74,7 @@ function describeIfSelected(name: string, testNames: string[], fn: () => void) {
/** Skip an individual test if not selected (for multi-test describe blocks). */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
const shouldRun = selectedTests === null || selectedTests.includes(testName);
(shouldRun ? test : test.skip)(testName, fn, timeout);
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
}
describeIfSelected('LLM-as-judge quality evals', [
@@ -91,11 +91,14 @@ describeIfSelected('LLM-as-judge quality evals', [
const { result: scores, meta } = await judge('command reference table', section);
console.log('Command reference scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
// Completeness threshold is 3 (not 4) — the command reference table is
// intentionally terse (quick-reference format). The judge consistently scores
// completeness=3 because detailed argument docs live in per-command sections.
evalCollector?.addTest({
name: 'command reference table',
suite: 'LLM-as-judge quality evals',
tier: 'llm-judge',
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
duration_ms: Date.now() - t0,
cost_usd: judgeCost(meta),
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
@@ -104,7 +107,7 @@ describeIfSelected('LLM-as-judge quality evals', [
});
expect(scores.clarity).toBeGreaterThanOrEqual(4);
expect(scores.completeness).toBeGreaterThanOrEqual(4);
expect(scores.completeness).toBeGreaterThanOrEqual(3);
expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
@@ -790,6 +793,69 @@ describeIfSelected('Other skill evals', [
}, 30_000);
});
// Voice directive eval — tests that the voice section produces the right tone
// Extracts the "## Voice" section from a generated tier 2+ SKILL.md and asks an
// LLM judge to score it on five tone dimensions; every dimension must be >= 4.
describeIfSelected('Voice directive eval', ['voice directive tone'], () => {
testIfSelected('voice directive tone', async () => {
const t0 = Date.now();
// Read a tier 2+ skill to get the full voice directive in context
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
const voiceStart = content.indexOf('## Voice');
if (voiceStart === -1) {
throw new Error('Voice section not found in review/SKILL.md. Was preamble.ts regenerated?');
}
// Slice up to the next "## " heading; fall back to a fixed 3000-char window
// when Voice is the last section in the file.
const voiceEnd = content.indexOf('\n## ', voiceStart + 1);
const voiceSection = content.slice(voiceStart, voiceEnd > 0 ? voiceEnd : voiceStart + 3000);
// Ask the judge for scores; the generic parameter pins the expected JSON shape.
const result = await callJudge<{
directness: number;
concreteness: number;
avoids_corporate: number;
avoids_ai_vocabulary: number;
connects_user_outcomes: number;
reasoning: string;
}>(`You are evaluating a voice directive for an AI coding assistant framework called GStack.
Score each dimension 1-5 where 5 is excellent:
1. directness: Does it instruct the agent to be direct, lead with the point, take positions?
2. concreteness: Does it instruct the agent to name specific files, commands, line numbers, real numbers?
3. avoids_corporate: Does it explicitly ban corporate/formal/academic tone and provide alternatives?
4. avoids_ai_vocabulary: Does it ban AI-tell words and phrases with specific lists?
5. connects_user_outcomes: Does it instruct the agent to connect technical work to real user experience?
Return JSON only:
{"directness": N, "concreteness": N, "avoids_corporate": N, "avoids_ai_vocabulary": N, "connects_user_outcomes": N, "reasoning": "..."}
THE VOICE DIRECTIVE:
${voiceSection}`);
console.log('Voice directive scores:', JSON.stringify(result, null, 2));
// Record for the eval dashboard. NOTE(review): cost_usd is a flat estimate —
// this callJudge path doesn't surface meta, unlike judgeCost(meta) elsewhere; confirm.
evalCollector?.addTest({
name: 'voice directive tone',
suite: 'Voice directive eval',
tier: 'llm-judge',
passed: result.directness >= 4 && result.concreteness >= 4 && result.avoids_corporate >= 4
&& result.avoids_ai_vocabulary >= 4 && result.connects_user_outcomes >= 4,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
judge_scores: {
directness: result.directness,
concreteness: result.concreteness,
avoids_corporate: result.avoids_corporate,
avoids_ai_vocabulary: result.avoids_ai_vocabulary,
connects_user_outcomes: result.connects_user_outcomes,
},
judge_reasoning: result.reasoning,
});
// Gate: every dimension must clear the threshold individually.
expect(result.directness).toBeGreaterThanOrEqual(4);
expect(result.concreteness).toBeGreaterThanOrEqual(4);
expect(result.avoids_corporate).toBeGreaterThanOrEqual(4);
expect(result.avoids_ai_vocabulary).toBeGreaterThanOrEqual(4);
expect(result.connects_user_outcomes).toBeGreaterThanOrEqual(4);
}, 30_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
+106 -120
View File
@@ -3,7 +3,7 @@ import { runSkillTest } from './helpers/session-runner';
import type { SkillTestResult } from './helpers/session-runner';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, E2E_TIERS, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
@@ -42,9 +42,28 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
}
}
// Apply EVALS_TIER filter (same logic as e2e-helpers.ts)
if (evalsEnabled && process.env.EVALS_TIER) {
const tier = process.env.EVALS_TIER as 'gate' | 'periodic';
const tierTests = Object.entries(E2E_TIERS)
.filter(([, t]) => t === tier)
.map(([name]) => name);
if (selectedTests === null) {
selectedTests = tierTests;
} else {
selectedTests = selectedTests.filter(t => tierTests.includes(t));
}
process.stderr.write(`Routing EVALS_TIER=${tier}: ${selectedTests.length} tests\n\n`);
}
// --- Helper functions ---
/** Copy all SKILL.md files into tmpDir/.claude/skills/gstack/ for auto-discovery */
/** Copy all SKILL.md files for auto-discovery.
* Install to BOTH project-level (.claude/skills/) AND user-level (~/.claude/skills/)
* because Claude Code discovers skills from both locations. In CI containers,
* $HOME may differ from the working directory, so we need both paths to ensure
* the Skill tool appears in Claude's available tools list. */
function installSkills(tmpDir: string) {
const skillDirs = [
'', // root gstack SKILL.md
@@ -54,15 +73,30 @@ function installSkills(tmpDir: string) {
'gstack-upgrade', 'humanizer',
];
// Install to both project-level and user-level skill directories
const homeDir = process.env.HOME || os.homedir();
const installTargets = [
path.join(tmpDir, '.claude', 'skills'), // project-level
path.join(homeDir, '.claude', 'skills'), // user-level (~/.claude/skills/)
];
for (const skill of skillDirs) {
const srcPath = path.join(ROOT, skill, 'SKILL.md');
if (!fs.existsSync(srcPath)) continue;
const destDir = skill
? path.join(tmpDir, '.claude', 'skills', 'gstack', skill)
: path.join(tmpDir, '.claude', 'skills', 'gstack');
fs.mkdirSync(destDir, { recursive: true });
fs.copyFileSync(srcPath, path.join(destDir, 'SKILL.md'));
const skillName = skill || 'gstack';
for (const targetBase of installTargets) {
const destDir = path.join(targetBase, skillName);
fs.mkdirSync(destDir, { recursive: true });
fs.copyFileSync(srcPath, path.join(destDir, 'SKILL.md'));
}
}
// Copy CLAUDE.md so Claude has project context for skill routing.
const claudeMdSrc = path.join(ROOT, 'CLAUDE.md');
if (fs.existsSync(claudeMdSrc)) {
fs.copyFileSync(claudeMdSrc, path.join(tmpDir, 'CLAUDE.md'));
}
}
@@ -75,6 +109,31 @@ function initGitRepo(dir: string) {
run('git', ['config', 'user.name', 'Test']);
}
/**
 * Create a routing test working directory.
 * Mirrors the real repo checkout (ROOT): top-level context files, installed
 * skills, and an initial git commit. Routing tests need this context because
 * bare tmpDirs in containerized CI lack what Claude uses to route correctly;
 * each test gets its own tmpDir so concurrent runs don't interfere.
 */
function createRoutingWorkDir(suffix: string): string {
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), `routing-${suffix}-`));
  // Seed the workspace with the repo's top-level context files (when present).
  for (const name of ['CLAUDE.md', 'README.md', 'package.json', 'ETHOS.md']) {
    const src = path.join(ROOT, name);
    if (fs.existsSync(src)) {
      fs.copyFileSync(src, path.join(dir, name));
    }
  }
  // Skill files + git history make the tmpDir look like a live project.
  installSkills(dir);
  initGitRepo(dir);
  const git = (...args: string[]) =>
    spawnSync('git', args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
  git('add', '.');
  git('commit', '-m', 'initial');
  return dir;
}
function logCost(label: string, result: { costEstimate: { turnsUsed: number; estimatedTokens: number; estimatedCost: number }; duration: number }) {
const { turnsUsed, estimatedTokens, estimatedCost } = result.costEstimate;
const durationSec = Math.round(result.duration / 1000);
@@ -96,6 +155,15 @@ function recordRouting(name: string, result: SkillTestResult, expectedSkill: str
});
}
// Skip individual tests based on selectedTests (diff + tier filtering).
// selectedTests === null means "no filter active" — run everything.
const testIfSelected = (name: string, fn: () => Promise<void>, timeout?: number) => {
  const selected = selectedTests === null || selectedTests.includes(name);
  if (selected) {
    test.concurrent(name, fn, timeout);
  } else {
    test.skip(name, () => {});
  }
};
// --- Tests ---
describeE2E('Skill Routing E2E — Developer Journey', () => {
@@ -103,14 +171,9 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
evalCollector?.finalize();
});
test.concurrent('journey-ideation', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-ideation-'));
testIfSelected('journey-ideation', async () => {
const tmpDir = createRoutingWorkDir('ideation');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
fs.writeFileSync(path.join(tmpDir, 'README.md'), '# New Project\n');
spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
const testName = 'journey-ideation';
const expectedSkill = 'office-hours';
@@ -137,11 +200,9 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}
}, 150_000);
test.concurrent('journey-plan-eng', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-plan-eng-'));
testIfSelected('journey-plan-eng', async () => {
const tmpDir = createRoutingWorkDir('plan-eng');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
## Components
@@ -189,64 +250,14 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}
}, 150_000);
test.concurrent('journey-think-bigger', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-think-bigger-'));
// Removed: journey-think-bigger
// Tested ambiguous routing ("think bigger" → plan-ceo-review) but Claude
// legitimately answers directly instead of routing. Never passed reliably.
// The other 10 journey tests cover routing with clear signals.
testIfSelected('journey-debug', async () => {
const tmpDir = createRoutingWorkDir('debug');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
## Components
- REST API (Express.js)
- PostgreSQL database
- React frontend
- SMS integration (Twilio)
## Data Model
- restaurants (id, name, settings)
- parties (id, restaurant_id, name, size, phone, status, created_at)
- wait_estimates (id, restaurant_id, avg_wait_minutes)
## API Endpoints
- POST /api/parties - add party to waitlist
- GET /api/parties - list current waitlist
- PATCH /api/parties/:id/status - update party status
- GET /api/estimate - get current wait estimate
`);
spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
const testName = 'journey-think-bigger';
const expectedSkill = 'plan-ceo-review';
const result = await runSkillTest({
prompt: "Actually, looking at this plan again, I feel like we're thinking too small. We're just doing waitlists but what about the whole restaurant guest experience? Is there a bigger opportunity here we should go after?",
workingDirectory: tmpDir,
maxTurns: 5,
allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'],
timeout: 120_000,
testName,
runId,
});
const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill');
const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined;
logCost(`journey: ${testName}`, result);
recordRouting(testName, result, expectedSkill, actualSkill);
expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill);
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
}, 180_000);
test.concurrent('journey-debug', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-debug-'));
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
@@ -295,18 +306,16 @@ export default app;
recordRouting(testName, result, expectedSkill, actualSkill);
expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill);
const validSkills = ['investigate', 'qa'];
expect(validSkills, `Expected one of ${validSkills.join('/')} but got ${actualSkill}`).toContain(actualSkill);
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
}, 150_000);
test.concurrent('journey-qa', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-qa-'));
testIfSelected('journey-qa', async () => {
const tmpDir = createRoutingWorkDir('qa');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app', scripts: { dev: 'next dev' } }, null, 2));
fs.mkdirSync(path.join(tmpDir, 'src'), { recursive: true });
fs.writeFileSync(path.join(tmpDir, 'src/index.html'), '<html><body><h1>Waitlist App</h1></body></html>');
@@ -340,18 +349,15 @@ export default app;
}
}, 150_000);
test.concurrent('journey-code-review', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-code-review-'));
testIfSelected('journey-code-review', async () => {
const tmpDir = createRoutingWorkDir('code-review');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// base\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
run('git', ['commit', '-m', 'add base app']);
run('git', ['checkout', '-b', 'feature/add-waitlist']);
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// updated with waitlist feature\nimport { WaitlistService } from "./waitlist";\n');
fs.writeFileSync(path.join(tmpDir, 'waitlist.ts'), 'export class WaitlistService {\n async addParty(name: string, size: number) {\n // TODO: implement\n }\n}\n');
@@ -383,18 +389,15 @@ export default app;
}
}, 150_000);
test.concurrent('journey-ship', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-ship-'));
testIfSelected('journey-ship', async () => {
const tmpDir = createRoutingWorkDir('ship');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// base\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
run('git', ['commit', '-m', 'add base app']);
run('git', ['checkout', '-b', 'feature/waitlist']);
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// waitlist feature\n');
run('git', ['add', '.']);
@@ -425,12 +428,9 @@ export default app;
}
}, 150_000);
test.concurrent('journey-docs', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-docs-'));
testIfSelected('journey-docs', async () => {
const tmpDir = createRoutingWorkDir('docs');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
@@ -465,12 +465,9 @@ export default app;
}
}, 150_000);
test.concurrent('journey-retro', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-retro-'));
testIfSelected('journey-retro', async () => {
const tmpDir = createRoutingWorkDir('retro');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
@@ -511,18 +508,9 @@ export default app;
}
}, 150_000);
test.concurrent('journey-design-system', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-design-system-'));
testIfSelected('journey-design-system', async () => {
const tmpDir = createRoutingWorkDir('design-system');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app' }, null, 2));
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
const testName = 'journey-design-system';
const expectedSkill = 'design-consultation';
@@ -549,12 +537,9 @@ export default app;
}
}, 150_000);
test.concurrent('journey-visual-qa', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-visual-qa-'));
testIfSelected('journey-visual-qa', async () => {
const tmpDir = createRoutingWorkDir('visual-qa');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
@@ -597,7 +582,8 @@ body { font-family: sans-serif; }
recordRouting(testName, result, expectedSkill, actualSkill);
expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill);
const validSkills = ['design-review', 'qa', 'qa-only', 'browse'];
expect(validSkills, `Expected one of ${validSkills.join('/')} but got ${actualSkill}`).toContain(actualSkill);
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
+139 -29
View File
@@ -99,6 +99,20 @@ describe('SKILL.md command validation', () => {
const result = validateSkill(skill);
expect(result.snapshotFlagErrors).toHaveLength(0);
});
// Validate autoplan's $B commands and snapshot flags; no-op when the skill
// file is absent (e.g. partial checkouts).
test('all $B commands in autoplan/SKILL.md are valid browse commands', () => {
  const skillPath = path.join(ROOT, 'autoplan', 'SKILL.md');
  if (!fs.existsSync(skillPath)) return;
  expect(validateSkill(skillPath).invalid).toHaveLength(0);
});
test('all snapshot flags in autoplan/SKILL.md are valid', () => {
  const skillPath = path.join(ROOT, 'autoplan', 'SKILL.md');
  if (!fs.existsSync(skillPath)) return;
  expect(validateSkill(skillPath).snapshotFlagErrors).toHaveLength(0);
});
});
describe('Command registry consistency', () => {
@@ -227,6 +241,7 @@ describe('Update check preamble', () => {
'benchmark/SKILL.md',
'land-and-deploy/SKILL.md',
'setup-deploy/SKILL.md',
'cso/SKILL.md',
];
for (const skill of skillsWithUpdateCheck) {
@@ -513,10 +528,12 @@ describe('TODOS-format.md reference consistency', () => {
// --- v0.4.1 feature coverage: RECOMMENDATION format, session awareness, enum completeness ---
describe('v0.4.1 preamble features', () => {
const skillsWithPreamble = [
'SKILL.md', 'browse/SKILL.md', 'qa/SKILL.md',
'qa-only/SKILL.md',
'setup-browser-cookies/SKILL.md',
// Tier 1 skills have core preamble only (no AskUserQuestion format)
const tier1Skills = ['SKILL.md', 'browse/SKILL.md', 'setup-browser-cookies/SKILL.md', 'benchmark/SKILL.md'];
// Tier 2+ skills have AskUserQuestion format with RECOMMENDATION
const tier2PlusSkills = [
'qa/SKILL.md', 'qa-only/SKILL.md',
'ship/SKILL.md', 'review/SKILL.md',
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
'retro/SKILL.md',
@@ -526,22 +543,25 @@ describe('v0.4.1 preamble features', () => {
'design-consultation/SKILL.md',
'document-release/SKILL.md',
'canary/SKILL.md',
'benchmark/SKILL.md',
'land-and-deploy/SKILL.md',
'setup-deploy/SKILL.md',
'cso/SKILL.md',
];
for (const skill of skillsWithPreamble) {
const skillsWithPreamble = [...tier1Skills, ...tier2PlusSkills];
for (const skill of tier2PlusSkills) {
test(`${skill} contains RECOMMENDATION format`, () => {
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
expect(content).toContain('RECOMMENDATION: Choose');
expect(content).toContain('AskUserQuestion');
});
}
for (const skill of skillsWithPreamble) {
test(`${skill} contains session awareness`, () => {
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
expect(content).toContain('_SESSIONS');
expect(content).toContain('RECOMMENDATION');
});
}
@@ -724,14 +744,8 @@ describe('Contributor mode preamble structure', () => {
for (const skill of skillsWithPreamble) {
test(`${skill} has 0-10 rating in contributor mode`, () => {
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
expect(content).toContain('0 to 10');
expect(content).toContain('My rating');
});
test(`${skill} has calibration example`, () => {
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
expect(content).toContain('Calibration');
expect(content).toContain('the bar');
expect(content).toContain('0-10');
expect(content).toContain('Rating');
});
test(`${skill} has "what would make this a 10" field`, () => {
@@ -807,7 +821,7 @@ describe('Completeness Principle in generated SKILL.md files', () => {
'design-review/SKILL.md',
'design-consultation/SKILL.md',
'document-release/SKILL.md',
];
'cso/SKILL.md', ];
for (const skill of skillsWithPreamble) {
test(`${skill} contains Completeness Principle section`, () => {
@@ -817,17 +831,12 @@ describe('Completeness Principle in generated SKILL.md files', () => {
});
}
test('Completeness Principle includes compression table', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
test('Completeness Principle includes compression table in tier 2+ skills', () => {
// Root is tier 1 (no completeness). Check tier 2+ skill.
const content = fs.readFileSync(path.join(ROOT, 'cso', 'SKILL.md'), 'utf-8');
expect(content).toContain('CC+gstack');
expect(content).toContain('Compression');
});
test('Completeness Principle includes anti-patterns', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
expect(content).toContain('BAD:');
expect(content).toContain('Anti-patterns');
});
});
// --- Part 7: Planted-bug fixture validation (A4) ---
@@ -961,10 +970,37 @@ describe('gstack-slug', () => {
test('output is eval-compatible (KEY=VALUE format)', () => {
const result = Bun.spawnSync([SLUG_BIN], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' });
const lines = result.stdout.toString().trim().split('\n');
expect(lines.length).toBe(3);
expect(lines.length).toBe(2);
expect(lines[0]).toMatch(/^SLUG=.+/);
expect(lines[1]).toMatch(/^BRANCH=.+/);
expect(lines[2]).toMatch(/^PROJECTS_DIR=.+/);
});
// Guard against shell injection via gstack-slug output: values are eval'd by
// bash in templates, so they must contain no metacharacters.
test('output values contain only safe characters (no shell metacharacters)', () => {
  // Only alphanumeric, dot, dash, underscore are allowed (#133)
  const SAFE = /^[a-zA-Z0-9._-]+$/;
  const out = Bun.spawnSync([SLUG_BIN], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' })
    .stdout.toString();
  expect(out.match(/SLUG=(.*)/)?.[1] ?? '').toMatch(SAFE);
  expect(out.match(/BRANCH=(.*)/)?.[1] ?? '').toMatch(SAFE);
});
test('eval sets variables under bash with set -euo pipefail', () => {
  // Exercise the exact consumption pattern templates use: eval the script's
  // stdout under strict bash, then echo the variables back out.
  const script = 'set -euo pipefail; eval "$(./bin/gstack-slug 2>/dev/null)"; echo "SLUG=$SLUG"; echo "BRANCH=$BRANCH"';
  const result = Bun.spawnSync(['bash', '-c', script], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' });
  expect(result.exitCode).toBe(0);
  const printed = result.stdout.toString();
  expect(printed).toMatch(/^SLUG=.+/m);
  expect(printed).toMatch(/^BRANCH=.+/m);
});
test('no templates or bin scripts use source process substitution for gstack-slug', () => {
  const result = Bun.spawnSync(
    ['grep', '-r', 'source <(.*gstack-slug', '--include=*.tmpl', '--include=gstack-review-*', '.'],
    { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' }
  );
  // grep returns exit code 1 when no matches found — that's what we want
  expect(result.stdout.toString().trim()).toBe('');
});
});
@@ -1275,7 +1311,7 @@ describe('Codex skill', () => {
expect(content).toContain('fall back to the Claude adversarial subagent');
// Review log uses new skill name
expect(content).toContain('adversarial-review');
expect(content).toContain('xhigh');
expect(content).toContain('reasoning_effort="high"');
expect(content).toContain('ADVERSARIAL REVIEW SYNTHESIS');
});
@@ -1285,17 +1321,23 @@ describe('Codex skill', () => {
expect(content).toContain('< 50');
expect(content).toContain('200+');
expect(content).toContain('adversarial-review');
expect(content).toContain('xhigh');
expect(content).toContain('reasoning_effort="high"');
expect(content).toContain('Investigate and fix');
});
test('codex-host ship/review do NOT contain adversarial review step', () => {
// .agents/ is gitignored — generate on demand
Bun.spawnSync(['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', 'codex'], {
cwd: ROOT, stdout: 'pipe', stderr: 'pipe',
});
const shipContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-ship', 'SKILL.md'), 'utf-8');
expect(shipContent).not.toContain('codex review --base');
expect(shipContent).not.toContain('Investigate and fix');
expect(shipContent).not.toContain('CODEX_REVIEWS');
const reviewContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-review', 'SKILL.md'), 'utf-8');
expect(reviewContent).not.toContain('codex review --base');
expect(reviewContent).not.toContain('codex_reviews');
expect(reviewContent).not.toContain('CODEX_REVIEWS');
expect(reviewContent).not.toContain('adversarial-review');
expect(reviewContent).not.toContain('Investigate and fix');
});
@@ -1306,6 +1348,13 @@ describe('Codex skill', () => {
expect(content).toContain('codex exec');
});
test('/review persists a review-log entry for ship readiness', () => {
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
expect(content).toContain('"skill":"review"');
expect(content).toContain('"issues_found":N');
expect(content).toContain('Persist Eng Review result');
});
test('Review Readiness Dashboard includes Adversarial Review row', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Adversarial');
@@ -1362,6 +1411,11 @@ describe('Skill trigger phrases', () => {
describe('Codex skill validation', () => {
const AGENTS_DIR = path.join(ROOT, '.agents', 'skills');
// .agents/ is gitignored (v0.11.2.0) — generate on demand for tests
Bun.spawnSync(['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', 'codex'], {
cwd: ROOT, stdout: 'pipe', stderr: 'pipe',
});
// Discover all Claude skills with templates (except /codex which is Claude-only)
const CLAUDE_SKILLS_WITH_TEMPLATES = (() => {
const skills: string[] = [];
@@ -1423,3 +1477,59 @@ describe('Codex skill validation', () => {
}
});
});
// --- Repo mode and test failure triage validation ---
describe('Repo mode preamble validation', () => {
test('generated SKILL.md preamble contains REPO_MODE output', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
expect(content).toContain('REPO_MODE:');
expect(content).toContain('gstack-repo-mode');
});
test('tier 3+ skills contain See Something Say Something section', () => {
// Root SKILL.md is tier 1 (no Repo Mode). Check a tier 3 skill instead.
const content = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8');
expect(content).toContain('See Something, Say Something');
expect(content).toContain('REPO_MODE');
expect(content).toContain('solo');
expect(content).toContain('collaborative');
});
});
describe('Test failure triage in ship skill', () => {
test('ship/SKILL.md contains Test Failure Ownership Triage', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Test Failure Ownership Triage');
});
test('ship/SKILL.md triage uses git diff for classification', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('git diff origin/<base>...HEAD --name-only');
});
test('ship/SKILL.md triage has solo and collaborative paths', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('REPO_MODE');
expect(content).toContain('solo');
expect(content).toContain('collaborative');
expect(content).toContain('Investigate and fix now');
expect(content).toContain('Add as P0 TODO');
});
test('ship/SKILL.md triage has GitHub issue assignment for collaborative mode', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('gh issue create');
expect(content).toContain('--assignee');
});
test('{{TEST_FAILURE_TRIAGE}} placeholder is fully resolved in ship/SKILL.md', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).not.toContain('{{TEST_FAILURE_TRIAGE}}');
});
test('ship/SKILL.md uses in-branch language for stop condition', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('In-branch test failures');
});
});
+125 -5
View File
@@ -78,8 +78,8 @@ describe('gstack-telemetry-log', () => {
const events = parseJsonl();
expect(events).toHaveLength(1);
// installation_id should be a SHA-256 hash (64 hex chars)
expect(events[0].installation_id).toMatch(/^[a-f0-9]{64}$/);
// installation_id should be a UUID v4 (or hex fallback)
expect(events[0].installation_id).toMatch(/^[a-f0-9-]{32,36}$/);
});
test('installation_id is null for anonymous tier', () => {
@@ -125,6 +125,82 @@ describe('gstack-telemetry-log', () => {
expect(events[0]).toHaveProperty('_branch');
});
// ─── json_safe() injection prevention tests ────────────────
test('sanitizes skill name with quote injection attempt', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill 'review","injected":"true' --duration 10 --outcome success --session-id inj-1`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
// Must be valid JSON (no injection — quotes stripped, so no field injection possible)
const event = JSON.parse(lines[0]);
// The key check: no injected top-level property was created
expect(event).not.toHaveProperty('injected');
// Skill field should have quotes stripped but content preserved
expect(event.skill).not.toContain('"');
});
test('truncates skill name exceeding 200 chars', () => {
setConfig('telemetry', 'anonymous');
const longSkill = 'a'.repeat(250);
run(`${BIN}/gstack-telemetry-log --skill '${longSkill}' --duration 10 --outcome success --session-id trunc-1`);
const events = parseJsonl();
expect(events[0].skill.length).toBeLessThanOrEqual(200);
});
test('sanitizes outcome with newline injection attempt', () => {
setConfig('telemetry', 'anonymous');
// Use printf to pass actual newline in the argument
run(`bash -c 'OUTCOME=$(printf "success\\nfake\\":\\"true"); ${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome "$OUTCOME" --session-id inj-2'`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event).not.toHaveProperty('fake');
});
test('sanitizes session_id with backslash-quote injection', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome success --session-id 'id\\\\"","x":"y'`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event).not.toHaveProperty('x');
});
test('sanitizes error_class with quote injection', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome error --error-class 'timeout","extra":"val' --session-id inj-3`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event).not.toHaveProperty('extra');
});
test('sanitizes failed_step with quote injection', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome error --failed-step 'step1","hacked":"yes' --session-id inj-4`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event).not.toHaveProperty('hacked');
});
test('escapes error_message quotes and preserves content', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome error --error-message 'Error: file "test.txt" not found' --session-id inj-5`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event.error_message).toContain('file');
expect(event.error_message).toContain('not found');
});
test('creates analytics directory if missing', () => {
// Remove analytics dir
const analyticsDir = path.join(tmpDir, 'analytics');
@@ -136,6 +212,34 @@ describe('gstack-telemetry-log', () => {
expect(fs.existsSync(analyticsDir)).toBe(true);
expect(readJsonl()).toHaveLength(1);
});
// ─── Telemetry JSON safety: branch/repo with special chars ────
test('branch name with quotes does not corrupt JSON', () => {
setConfig('telemetry', 'anonymous');
// Simulate a branch name with double quotes by setting it via git env override
// The json_safe function strips quotes, so the JSONL should remain valid
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome success --session-id branch-quotes-1`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
// Every line must be valid JSON
const event = JSON.parse(lines[0]);
expect(event._branch).toBeDefined();
// _branch should not contain double quotes (json_safe strips them)
expect(event._branch).not.toContain('"');
});
test('repo slug with special chars does not corrupt JSON', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome success --session-id repo-special-1`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event._repo_slug).toBeDefined();
// _repo_slug should not contain double quotes (json_safe strips them)
expect(event._repo_slug).not.toContain('"');
});
});
describe('.pending marker', () => {
@@ -244,16 +348,32 @@ describe('gstack-analytics', () => {
});
describe('gstack-telemetry-sync', () => {
test('exits silently with no endpoint configured', () => {
// Default: GSTACK_TELEMETRY_ENDPOINT is not set → exit 0
test('exits silently with no Supabase URL configured', () => {
// Default: GSTACK_SUPABASE_URL is not set → exit 0
const result = run(`${BIN}/gstack-telemetry-sync`);
expect(result).toBe('');
});
test('exits silently with no JSONL file', () => {
const result = run(`${BIN}/gstack-telemetry-sync`, { GSTACK_TELEMETRY_ENDPOINT: 'http://localhost:9999' });
const result = run(`${BIN}/gstack-telemetry-sync`, { GSTACK_SUPABASE_URL: 'http://localhost:9999' });
expect(result).toBe('');
});
test('does not rename JSONL field names (edge function expects raw names)', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 60 --outcome success --session-id raw-fields-1`);
const events = parseJsonl();
expect(events).toHaveLength(1);
// Edge function expects these raw field names, NOT Postgres column names
expect(events[0]).toHaveProperty('v');
expect(events[0]).toHaveProperty('ts');
expect(events[0]).toHaveProperty('sessions');
// Should NOT have Postgres column names
expect(events[0]).not.toHaveProperty('schema_version');
expect(events[0]).not.toHaveProperty('event_timestamp');
expect(events[0]).not.toHaveProperty('concurrent_sessions');
});
});
describe('gstack-community-dashboard', () => {
+48 -6
View File
@@ -13,6 +13,7 @@ import {
selectTests,
detectBaseBranch,
E2E_TOUCHFILES,
E2E_TIERS,
LLM_JUDGE_TOUCHFILES,
GLOBAL_TOUCHFILES,
} from './helpers/touchfiles';
@@ -79,8 +80,10 @@ describe('selectTests', () => {
expect(result.selected).toContain('plan-ceo-review');
expect(result.selected).toContain('plan-ceo-review-selective');
expect(result.selected).toContain('plan-ceo-review-benefits');
expect(result.selected.length).toBe(3);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 3);
expect(result.selected).toContain('autoplan-core');
expect(result.selected).toContain('codex-offered-ceo-review');
expect(result.selected.length).toBe(5);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 5);
});
test('global touchfile triggers ALL tests', () => {
@@ -90,10 +93,19 @@ describe('selectTests', () => {
expect(result.reason).toContain('global');
});
test('gen-skill-docs.ts is a global touchfile', () => {
test('gen-skill-docs.ts is a scoped touchfile, not global', () => {
const result = selectTests(['scripts/gen-skill-docs.ts'], E2E_TOUCHFILES);
expect(result.selected.length).toBe(Object.keys(E2E_TOUCHFILES).length);
expect(result.reason).toContain('global');
// Should select tests that list gen-skill-docs.ts in their touchfiles, not ALL tests
expect(result.selected.length).toBeGreaterThan(0);
expect(result.selected.length).toBeLessThan(Object.keys(E2E_TOUCHFILES).length);
expect(result.reason).toBe('diff');
// Should include tests that depend on gen-skill-docs.ts
expect(result.selected).toContain('skillmd-setup-discovery');
expect(result.selected).toContain('contributor-mode');
expect(result.selected).toContain('journey-ideation');
// Should NOT include tests that don't depend on it
expect(result.selected).not.toContain('retro');
expect(result.selected).not.toContain('cso-full-audit');
});
test('unrelated file selects nothing', () => {
@@ -142,7 +154,7 @@ describe('selectTests', () => {
});
test('global touchfiles work for LLM-judge tests too', () => {
const result = selectTests(['scripts/gen-skill-docs.ts'], LLM_JUDGE_TOUCHFILES);
const result = selectTests(['test/helpers/session-runner.ts'], LLM_JUDGE_TOUCHFILES);
expect(result.selected.length).toBe(Object.keys(LLM_JUDGE_TOUCHFILES).length);
});
});
@@ -232,6 +244,36 @@ describe('TOUCHFILES completeness', () => {
}
});
test('E2E_TIERS covers exactly the same tests as E2E_TOUCHFILES', () => {
const touchfileKeys = new Set(Object.keys(E2E_TOUCHFILES));
const tierKeys = new Set(Object.keys(E2E_TIERS));
const missingFromTiers = [...touchfileKeys].filter(k => !tierKeys.has(k));
const extraInTiers = [...tierKeys].filter(k => !touchfileKeys.has(k));
if (missingFromTiers.length > 0) {
throw new Error(
`E2E tests missing TIER entries: ${missingFromTiers.join(', ')}\n` +
`Add these to E2E_TIERS in test/helpers/touchfiles.ts`,
);
}
if (extraInTiers.length > 0) {
throw new Error(
`E2E_TIERS has extra entries not in E2E_TOUCHFILES: ${extraInTiers.join(', ')}\n` +
`Remove these from E2E_TIERS or add to E2E_TOUCHFILES`,
);
}
});
test('E2E_TIERS only contains valid tier values', () => {
const validTiers = ['gate', 'periodic'];
for (const [name, tier] of Object.entries(E2E_TIERS)) {
if (!validTiers.includes(tier)) {
throw new Error(`E2E_TIERS['${name}'] has invalid tier '${tier}'. Valid: ${validTiers.join(', ')}`);
}
}
});
test('every LLM-judge test has a TOUCHFILES entry', () => {
const llmContent = fs.readFileSync(
path.join(ROOT, 'test', 'skill-llm-eval.test.ts'),
+165
View File
@@ -0,0 +1,165 @@
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
const UNINSTALL = path.join(ROOT, 'bin', 'gstack-uninstall');
describe('gstack-uninstall', () => {
  /**
   * Existence check that does NOT follow symlinks.
   *
   * The per-skill entries created in beforeEach below are dangling symlinks
   * (their targets, e.g. gstack/review, are never created on disk), so
   * fs.existsSync() — which stats the link *target* — reports false even
   * while the link itself is still present. That made the original
   * "symlink was removed" assertions pass vacuously. lstat sees the link
   * entry itself, making the assertions meaningful.
   */
  function entryExists(p: string): boolean {
    try {
      fs.lstatSync(p);
      return true;
    } catch {
      return false;
    }
  }

  test('syntax check passes', () => {
    // bash -n parses the script without executing anything.
    const result = spawnSync('bash', ['-n', UNINSTALL], { stdio: 'pipe' });
    expect(result.status).toBe(0);
  });

  test('--help prints usage and exits 0', () => {
    const result = spawnSync('bash', [UNINSTALL, '--help'], { stdio: 'pipe' });
    expect(result.status).toBe(0);
    const output = result.stdout.toString();
    expect(output).toContain('gstack-uninstall');
    expect(output).toContain('--force');
    expect(output).toContain('--keep-state');
  });

  test('unknown flag exits with error', () => {
    const result = spawnSync('bash', [UNINSTALL, '--bogus'], {
      stdio: 'pipe',
      // Point HOME somewhere empty so the script cannot touch real user state.
      env: { ...process.env, HOME: '/nonexistent' },
    });
    expect(result.status).toBe(1);
    expect(result.stderr.toString()).toContain('Unknown option');
  });

  describe('integration tests with mock layout', () => {
    let tmpDir: string;
    let mockHome: string;
    let mockGitRoot: string;

    beforeEach(() => {
      tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-uninstall-test-'));
      mockHome = path.join(tmpDir, 'home');
      mockGitRoot = path.join(tmpDir, 'repo');
      // Create mock gstack install layout
      fs.mkdirSync(path.join(mockHome, '.claude', 'skills', 'gstack'), { recursive: true });
      fs.writeFileSync(path.join(mockHome, '.claude', 'skills', 'gstack', 'SKILL.md'), 'test');
      // Per-skill symlinks (both old unprefixed and new prefixed). NOTE: these
      // are deliberately dangling — gstack/review and gstack/ship are never
      // created — which is why removal assertions use entryExists(), not
      // fs.existsSync().
      fs.symlinkSync('gstack/review', path.join(mockHome, '.claude', 'skills', 'review'));
      fs.symlinkSync('gstack/ship', path.join(mockHome, '.claude', 'skills', 'gstack-ship'));
      // Create a non-gstack skill directory (should NOT be removed)
      fs.mkdirSync(path.join(mockHome, '.claude', 'skills', 'other-tool'), { recursive: true });
      // Create state directory
      fs.mkdirSync(path.join(mockHome, '.gstack', 'projects'), { recursive: true });
      fs.writeFileSync(path.join(mockHome, '.gstack', 'config.json'), '{}');
      // Create mock git repo
      fs.mkdirSync(mockGitRoot, { recursive: true });
      spawnSync('git', ['init', '-b', 'main'], { cwd: mockGitRoot, stdio: 'pipe' });
    });

    afterEach(() => {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    });

    /** Run the uninstaller against the mock layout with optional extra flags. */
    function runUninstall(...flags: string[]) {
      return spawnSync('bash', [UNINSTALL, '--force', ...flags], {
        stdio: 'pipe',
        env: {
          ...process.env,
          HOME: mockHome,
          GSTACK_DIR: path.join(mockHome, '.claude', 'skills', 'gstack'),
          GSTACK_STATE_DIR: path.join(mockHome, '.gstack'),
        },
        cwd: mockGitRoot,
      });
    }

    test('--force removes global Claude skills and state', () => {
      const result = runUninstall();
      expect(result.status).toBe(0);
      const output = result.stdout.toString();
      expect(output).toContain('gstack uninstalled');
      // Global skill dir should be removed
      expect(fs.existsSync(path.join(mockHome, '.claude', 'skills', 'gstack'))).toBe(false);
      // Per-skill symlinks pointing into gstack/ should be removed.
      // lstat-based check: existsSync would be vacuously false for these
      // dangling links even if uninstall left them behind.
      expect(entryExists(path.join(mockHome, '.claude', 'skills', 'review'))).toBe(false);
      expect(entryExists(path.join(mockHome, '.claude', 'skills', 'gstack-ship'))).toBe(false);
      // Non-gstack tool should still exist
      expect(fs.existsSync(path.join(mockHome, '.claude', 'skills', 'other-tool'))).toBe(true);
      // State should be removed
      expect(fs.existsSync(path.join(mockHome, '.gstack'))).toBe(false);
    });

    test('--keep-state preserves state directory', () => {
      const result = runUninstall('--keep-state');
      expect(result.status).toBe(0);
      // Skills should be removed
      expect(fs.existsSync(path.join(mockHome, '.claude', 'skills', 'gstack'))).toBe(false);
      // State should still exist
      expect(fs.existsSync(path.join(mockHome, '.gstack'))).toBe(true);
      expect(fs.existsSync(path.join(mockHome, '.gstack', 'config.json'))).toBe(true);
    });

    test('clean system outputs nothing to remove', () => {
      const cleanHome = path.join(tmpDir, 'clean-home');
      fs.mkdirSync(cleanHome, { recursive: true });
      const result = spawnSync('bash', [UNINSTALL, '--force'], {
        stdio: 'pipe',
        env: {
          ...process.env,
          HOME: cleanHome,
          GSTACK_DIR: path.join(cleanHome, 'nonexistent'),
          GSTACK_STATE_DIR: path.join(cleanHome, '.gstack'),
        },
        cwd: mockGitRoot,
      });
      expect(result.status).toBe(0);
      expect(result.stdout.toString()).toContain('Nothing to remove');
    });

    test('upgrade path: prefixed install + uninstall cleans both old and new symlinks', () => {
      // Simulate the state after setup --no-prefix followed by setup (with prefix):
      // mockHome already carries both the old unprefixed ('review') and new
      // prefixed ('gstack-ship') symlinks.
      const result = runUninstall();
      expect(result.status).toBe(0);
      // Both old (review) and new (gstack-ship) symlinks should be gone
      expect(entryExists(path.join(mockHome, '.claude', 'skills', 'review'))).toBe(false);
      expect(entryExists(path.join(mockHome, '.claude', 'skills', 'gstack-ship'))).toBe(false);
      // Non-gstack should survive
      expect(fs.existsSync(path.join(mockHome, '.claude', 'skills', 'other-tool'))).toBe(true);
    });
  });
});
+271
View File
@@ -0,0 +1,271 @@
/**
* Unit tests for WorktreeManager.
*
* Tests worktree lifecycle: create, harvest, dedup, cleanup, prune.
* Each test creates real git worktrees in a temporary repo.
*/
import { describe, test, expect, afterEach } from 'bun:test';
import { WorktreeManager } from '../lib/worktree';
import type { HarvestResult } from '../lib/worktree';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
/**
 * Build a throwaway git repository for worktree tests.
 *
 * Layout: one tracked README (so an initial commit and HEAD exist), a
 * .gitignore mirroring the real repo, and gitignored artifact trees
 * (.agents/skills, browse/dist) that WorktreeManager is expected to copy
 * into new worktrees.
 *
 * @returns absolute path of the freshly created repo root.
 */
function createTestRepo(): string {
  const root = fs.mkdtempSync(path.join(os.tmpdir(), 'worktree-test-'));
  const git = (...args: string[]) =>
    spawnSync('git', args, { cwd: root, stdio: 'pipe' });
  const write = (rel: string[], body: string) => {
    const target = path.join(root, ...rel);
    fs.mkdirSync(path.dirname(target), { recursive: true });
    fs.writeFileSync(target, body);
  };

  git('init');
  git('config', 'user.email', 'test@test.com');
  git('config', 'user.name', 'Test');

  // Tracked file — gives the repo an initial commit so HEAD exists.
  write(['README.md'], '# Test repo\n');
  // Ignore rules matching the real repo, so copied build artifacts don't
  // show up as changes during harvest.
  write(['.gitignore'], '.agents/\nbrowse/dist/\n.gstack-worktrees/\n');
  // Gitignored build artifacts (simulated).
  write(['.agents', 'skills', 'test-skill.md'], '# Test skill\n');
  write(['browse', 'dist', 'browse'], '#!/bin/sh\necho browse\n');

  git('add', 'README.md', '.gitignore');
  git('commit', '-m', 'Initial commit');
  return root;
}
/**
 * Best-effort teardown of a repo created by createTestRepo().
 *
 * @param dir - repo root to delete.
 */
function cleanupRepo(dir: string): void {
  // Prune registered worktrees first; stale worktree metadata can otherwise
  // leave git lock files that interfere with removal.
  spawnSync('git', ['worktree', 'prune'], { cwd: dir, stdio: 'pipe' });
  fs.rmSync(dir, { force: true, recursive: true });
}
// Repos created by tests; torn down (best effort) in the afterEach hook below
const repos: string[] = [];
// Dedup index path — removed after each test to avoid cross-test contamination.
// NOTE(review): this points into the REAL user home (~/.gstack-dev), so running
// the suite deletes actual user harvest-dedup state; presumably the lib offers
// no env override yet — confirm and redirect to a tmpdir if it does.
const DEDUP_PATH = path.join(os.homedir(), '.gstack-dev', 'harvests', 'dedup.json');
afterEach(() => {
  for (const repo of repos) {
    try { cleanupRepo(repo); } catch { /* best effort */ }
  }
  repos.length = 0;
  // Clear dedup index so tests are independent
  try { fs.unlinkSync(DEDUP_PATH); } catch { /* may not exist */ }
});
describe('WorktreeManager', () => {
  /**
   * Spin up a fresh test repo (registered for afterEach teardown) together
   * with a manager bound to it.
   */
  const setup = (): { repo: string; mgr: WorktreeManager } => {
    const repo = createTestRepo();
    repos.push(repo);
    return { repo, mgr: new WorktreeManager(repo) };
  };

  test('create() produces a valid worktree at the expected path', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-1');
    expect(fs.existsSync(wt)).toBe(true);
    expect(fs.existsSync(path.join(wt, 'README.md'))).toBe(true);
    expect(wt).toContain('.gstack-worktrees');
    expect(wt).toContain('test-1');
    mgr.cleanup('test-1');
  });

  test('create() worktree has .agents/skills/ (gitignored artifacts copied)', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-agents');
    // Gitignored artifacts aren't part of the checkout — the manager must copy them.
    expect(fs.existsSync(path.join(wt, '.agents', 'skills', 'test-skill.md'))).toBe(true);
    expect(fs.existsSync(path.join(wt, 'browse', 'dist', 'browse'))).toBe(true);
    mgr.cleanup('test-agents');
  });

  test('create() stores correct originalSha', () => {
    const { repo, mgr } = setup();
    const headSha = spawnSync('git', ['rev-parse', 'HEAD'], { cwd: repo, stdio: 'pipe' })
      .stdout.toString().trim();
    mgr.create('test-sha');
    const info = mgr.getInfo('test-sha');
    expect(info).toBeDefined();
    expect(info!.originalSha).toBe(headSha);
    mgr.cleanup('test-sha');
  });

  test('harvest() captures modifications to tracked files', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-harvest-mod');
    // Edit a tracked file inside the worktree, then harvest.
    fs.writeFileSync(path.join(wt, 'README.md'), '# Modified!\n');
    const harvested = mgr.harvest('test-harvest-mod');
    expect(harvested).not.toBeNull();
    expect(harvested!.changedFiles).toContain('README.md');
    expect(harvested!.isDuplicate).toBe(false);
    expect(harvested!.patchPath).toBeTruthy();
    expect(fs.existsSync(harvested!.patchPath)).toBe(true);
    mgr.cleanup('test-harvest-mod');
  });

  test('harvest() captures new untracked files (git add -A path)', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-harvest-new');
    fs.writeFileSync(path.join(wt, 'new-file.txt'), 'Hello from agent\n');
    const harvested = mgr.harvest('test-harvest-new');
    expect(harvested).not.toBeNull();
    expect(harvested!.changedFiles).toContain('new-file.txt');
    mgr.cleanup('test-harvest-new');
  });

  test('harvest() captures committed changes (git diff originalSha)', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-harvest-commit');
    // Agents may commit inside their worktree; harvest must diff against originalSha.
    fs.writeFileSync(path.join(wt, 'committed.txt'), 'Agent committed this\n');
    spawnSync('git', ['add', 'committed.txt'], { cwd: wt, stdio: 'pipe' });
    spawnSync('git', ['commit', '-m', 'Agent commit'], { cwd: wt, stdio: 'pipe' });
    const harvested = mgr.harvest('test-harvest-commit');
    expect(harvested).not.toBeNull();
    expect(harvested!.changedFiles).toContain('committed.txt');
    mgr.cleanup('test-harvest-commit');
  });

  test('harvest() returns null when worktree is clean', () => {
    const { mgr } = setup();
    mgr.create('test-harvest-clean');
    // No modifications at all — nothing to harvest.
    expect(mgr.harvest('test-harvest-clean')).toBeNull();
    mgr.cleanup('test-harvest-clean');
  });

  test('harvest() dedup skips identical patches', () => {
    const { repo, mgr: firstRun } = setup();
    const wt1 = firstRun.create('test-dedup-1');
    fs.writeFileSync(path.join(wt1, 'dedup-test.txt'), 'same content\n');
    const first = firstRun.harvest('test-dedup-1');
    firstRun.cleanup('test-dedup-1');
    expect(first).not.toBeNull();
    expect(first!.isDuplicate).toBe(false);
    // A second manager producing a byte-identical change must be flagged as duplicate.
    const secondRun = new WorktreeManager(repo);
    const wt2 = secondRun.create('test-dedup-2');
    fs.writeFileSync(path.join(wt2, 'dedup-test.txt'), 'same content\n');
    const second = secondRun.harvest('test-dedup-2');
    secondRun.cleanup('test-dedup-2');
    expect(second).not.toBeNull();
    expect(second!.isDuplicate).toBe(true);
  });

  test('cleanup() removes worktree directory', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-cleanup');
    expect(fs.existsSync(wt)).toBe(true);
    mgr.cleanup('test-cleanup');
    expect(fs.existsSync(wt)).toBe(false);
  });

  test('pruneStale() removes orphaned worktrees from previous runs', () => {
    const { repo, mgr: previousRun } = setup();
    // A worktree from a "previous run" (different manager instance).
    const stalePath = previousRun.create('stale-test');
    const staleRunDir = path.dirname(stalePath);
    expect(fs.existsSync(stalePath)).toBe(true);
    // Unregister from git but leave the directory behind, as a crash would.
    spawnSync('git', ['worktree', 'remove', '--force', stalePath], { cwd: repo, stdio: 'pipe' });
    fs.mkdirSync(stalePath, { recursive: true });
    // A new manager's pruneStale() should sweep the old run's directory.
    new WorktreeManager(repo).pruneStale();
    expect(fs.existsSync(staleRunDir)).toBe(false);
  });

  test('create() throws on failure (no silent fallback to ROOT)', () => {
    const { mgr } = setup();
    mgr.create('test-fail');
    // Second create with the same id collides on the path and must throw.
    expect(() => mgr.create('test-fail')).toThrow();
    mgr.cleanup('test-fail');
  });

  test('harvest() returns null gracefully when worktree dir was deleted by agent', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-deleted');
    // Simulate the agent deleting its own worktree directory.
    fs.rmSync(wt, { recursive: true, force: true });
    // Neither harvest nor cleanup may throw on a vanished worktree.
    expect(mgr.harvest('test-deleted')).toBeNull();
    mgr.cleanup('test-deleted');
  });
});