mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-06 13:45:35 +02:00
Merge branch 'main' into garrytan/team-supabase-store
Brings in 55 commits from main (v0.12.x–v0.13.5.0): Factory Droid compat, prompt injection defense, user sovereignty, security audit, design binary, skill namespacing, modular resolvers, Chrome sidebar, and more. Conflict resolution: - .agents/ SKILL.md files: deleted (main moved to .factory/) - 8 .tmpl templates: accepted main (new features: CDP mode, design tools, global retro, parallelization, distribution checks, plan audits) - scripts/gen-skill-docs.ts: accepted main's modular resolver refactor - test/helpers/session-runner.ts: accepted main + layered back CostEntry tracking from team branch - Generated SKILL.md files: regenerated via bun run gen:skill-docs - Updated tests to match main's gstack-slug output (2 lines, no PROJECTS_DIR) and review log mechanism (gstack-review-log, not $BRANCH.jsonl) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,88 @@
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { readFileSync, readdirSync, existsSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
|
||||
const ROOT = join(import.meta.dir, '..');
|
||||
|
||||
/**
 * Gather the text of every SKILL.md one level below the repo root.
 *
 * The root-level SKILL.md (if present) is reported first under the name
 * 'root'. Each immediate subdirectory of ROOT that holds a SKILL.md follows,
 * named after its directory. Dot-directories and node_modules are ignored.
 */
function getAllSkillMds(): Array<{ name: string; content: string }> {
  // Yields a single entry when the file exists, nothing otherwise.
  const loadIfPresent = (name: string, file: string): Array<{ name: string; content: string }> =>
    existsSync(file) ? [{ name, content: readFileSync(file, 'utf-8') }] : [];

  const collected = loadIfPresent('root', join(ROOT, 'SKILL.md'));

  const skillDirs = readdirSync(ROOT, { withFileTypes: true }).filter(
    (dirent) => dirent.isDirectory() && !dirent.name.startsWith('.') && dirent.name !== 'node_modules',
  );
  for (const dirent of skillDirs) {
    collected.push(...loadIfPresent(dirent.name, join(ROOT, dirent.name, 'SKILL.md')));
  }

  return collected;
}
|
||||
|
||||
/**
 * Regression checks for the audit fixes. Each test reads a file under ROOT
 * and asserts on its text, so a fix cannot be reverted without a failure here.
 */
describe('Audit compliance', () => {
  // Fix 1: W007 — No hardcoded credentials in documentation
  test('no hardcoded credential patterns in SKILL.md.tmpl', () => {
    const tmpl = readFileSync(join(ROOT, 'SKILL.md.tmpl'), 'utf-8');
    expect(tmpl).not.toContain('"password123"');
    expect(tmpl).not.toContain('"test@example.com"');
    expect(tmpl).not.toContain('"test@test.com"');
    expect(tmpl).toContain('$TEST_EMAIL');
    expect(tmpl).toContain('$TEST_PASSWORD');
  });

  // Fix 2: Conditional telemetry — binary calls wrapped with existence check
  test('preamble telemetry calls are conditional on _TEL and binary existence', () => {
    const preamble = readFileSync(join(ROOT, 'scripts/resolvers/preamble.ts'), 'utf-8');
    // Pending finalization must check _TEL and binary existence
    expect(preamble).toContain('_TEL" != "off"');
    expect(preamble).toContain('-x ');
    expect(preamble).toContain('gstack-telemetry-log');
    // End-of-skill telemetry must also be conditional
    const completionIdx = preamble.indexOf('Telemetry (run last)');
    expect(completionIdx).toBeGreaterThan(-1);
    const completionSection = preamble.slice(completionIdx);
    expect(completionSection).toContain('_TEL" != "off"');
  });

  // Fix 3: W012 — Bun install is version-pinned
  test('bun install commands use version pinning', () => {
    const browseResolver = readFileSync(join(ROOT, 'scripts/resolvers/browse.ts'), 'utf-8');
    expect(browseResolver).toContain('BUN_VERSION');
    // Should not have unpinned curl|bash (without BUN_VERSION on same line)
    const lines = browseResolver.split('\n');
    for (const line of lines) {
      if (line.includes('bun.sh/install') && line.includes('bash') && !line.includes('BUN_VERSION') && !line.includes('command -v')) {
        throw new Error(`Unpinned bun install found: ${line.trim()}`);
      }
    }
  });

  // Fix 4: W011 — Untrusted content warning in command reference
  test('command reference includes untrusted content warning after Navigation', () => {
    const rootSkill = readFileSync(join(ROOT, 'SKILL.md'), 'utf-8');
    const navIdx = rootSkill.indexOf('### Navigation');
    const readingIdx = rootSkill.indexOf('### Reading');
    expect(navIdx).toBeGreaterThan(-1);
    expect(readingIdx).toBeGreaterThan(navIdx);
    // Only the text between the two headings counts as "after Navigation".
    const between = rootSkill.slice(navIdx, readingIdx);
    expect(between.toLowerCase()).toContain('untrusted');
  });

  // Fix 5: Data flow documentation in review.ts
  test('review.ts has data flow documentation', () => {
    const review = readFileSync(join(ROOT, 'scripts/resolvers/review.ts'), 'utf-8');
    expect(review).toContain('Data sent');
    expect(review).toContain('Data NOT sent');
  });

  // Fix 2+6: All generated SKILL.md files with telemetry are conditional
  test('all generated SKILL.md files with telemetry calls use conditional pattern', () => {
    // Collect the names of every offending skill so a failure reports all of
    // them at once instead of an anonymous first mismatch.
    const unconditional = getAllSkillMds()
      .filter(({ content }) => content.includes('gstack-telemetry-log') && !content.includes('_TEL" != "off"'))
      .map(({ name }) => name);
    expect(unconditional).toEqual([]);
  });
});
|
||||
+19
-6
@@ -13,12 +13,13 @@
|
||||
* Skips gracefully when prerequisites are not met.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, afterAll } from 'bun:test';
|
||||
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||
import { runCodexSkill, parseCodexJSONL, installSkillToTempHome } from './helpers/codex-session-runner';
|
||||
import type { CodexResult } from './helpers/codex-session-runner';
|
||||
import { EvalCollector } from './helpers/eval-store';
|
||||
import type { EvalTestEntry } from './helpers/eval-store';
|
||||
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
|
||||
import { createTestWorktree, harvestAndCleanup } from './helpers/e2e-helpers';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
@@ -118,16 +119,25 @@ afterAll(async () => {
|
||||
// --- Tests ---
|
||||
|
||||
describeCodex('Codex E2E', () => {
|
||||
let testWorktree: string;
|
||||
|
||||
beforeAll(() => {
|
||||
testWorktree = createTestWorktree('codex');
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
harvestAndCleanup('codex');
|
||||
});
|
||||
|
||||
testIfSelected('codex-discover-skill', async () => {
|
||||
// Install gstack-review skill to a temp HOME and ask Codex to list skills
|
||||
const skillDir = path.join(ROOT, '.agents', 'skills', 'gstack-review');
|
||||
const skillDir = path.join(testWorktree, '.agents', 'skills', 'gstack-review');
|
||||
|
||||
const result = await runCodexSkill({
|
||||
skillDir,
|
||||
prompt: 'List any skills or instructions you have available. Just list the names.',
|
||||
timeoutMs: 60_000,
|
||||
cwd: ROOT,
|
||||
cwd: testWorktree,
|
||||
skillName: 'gstack-review',
|
||||
});
|
||||
|
||||
@@ -139,6 +149,9 @@ describeCodex('Codex E2E', () => {
|
||||
|
||||
expect(result.exitCode).toBe(0);
|
||||
expect(result.output.length).toBeGreaterThan(0);
|
||||
// Skill loading errors mean our generated SKILL.md files are broken
|
||||
expect(result.stderr).not.toContain('invalid');
|
||||
expect(result.stderr).not.toContain('Skipped loading');
|
||||
// The output should reference the skill name in some form
|
||||
const outputLower = result.output.toLowerCase();
|
||||
expect(
|
||||
@@ -150,14 +163,14 @@ describeCodex('Codex E2E', () => {
|
||||
// code review, and produce structured review output with findings/issues.
|
||||
// Accepts Codex timeout (exit 124/137) as non-failure since that's a CLI perf issue.
|
||||
testIfSelected('codex-review-findings', async () => {
|
||||
// Install gstack-review skill and ask Codex to review the current repo
|
||||
const skillDir = path.join(ROOT, '.agents', 'skills', 'gstack-review');
|
||||
// Install gstack-review skill and ask Codex to review the worktree
|
||||
const skillDir = path.join(testWorktree, '.agents', 'skills', 'gstack-review');
|
||||
|
||||
const result = await runCodexSkill({
|
||||
skillDir,
|
||||
prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.',
|
||||
timeoutMs: 540_000,
|
||||
cwd: ROOT,
|
||||
cwd: testWorktree,
|
||||
skillName: 'gstack-review',
|
||||
});
|
||||
|
||||
|
||||
+76
@@ -0,0 +1,76 @@
|
||||
/**
|
||||
* Shared fixture for test coverage audit E2E tests.
|
||||
*
|
||||
* Creates a Node.js project with billing source code that has intentional
|
||||
* test coverage gaps: processPayment has happy-path-only tests,
|
||||
* refundPayment has no tests at all.
|
||||
*
|
||||
* Used by: ship-coverage-audit E2E, review-coverage-audit E2E
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { spawnSync } from 'child_process';
|
||||
|
||||
/**
 * Populate `dir` with a small Node.js billing project that has deliberate
 * test coverage gaps, then turn it into a git repo.
 *
 * Layout written: package.json, vitest.config.ts, VERSION, CHANGELOG.md,
 * src/billing.ts (two functions) and test/billing.test.ts (one happy-path
 * test). The repo gets a single commit on `main` and is left checked out on
 * a new `feature/billing` branch.
 *
 * @param dir Target directory; created (recursively) if it does not exist.
 * @throws Error if any git command fails to spawn, times out, or exits non-zero.
 */
export function createCoverageAuditFixture(dir: string): void {
  fs.mkdirSync(dir, { recursive: true });

  // Create a Node.js project WITH test framework but coverage gaps
  fs.writeFileSync(path.join(dir, 'package.json'), JSON.stringify({
    name: 'test-coverage-app',
    version: '1.0.0',
    type: 'module',
    scripts: { test: 'echo "no tests yet"' },
    devDependencies: { vitest: '^1.0.0' },
  }, null, 2));

  // Create vitest config
  fs.writeFileSync(path.join(dir, 'vitest.config.ts'),
    `import { defineConfig } from 'vitest/config';\nexport default defineConfig({ test: {} });\n`);

  fs.writeFileSync(path.join(dir, 'VERSION'), '0.1.0.0\n');
  fs.writeFileSync(path.join(dir, 'CHANGELOG.md'), '# Changelog\n');

  // Create source file with multiple code paths
  fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
  fs.writeFileSync(path.join(dir, 'src', 'billing.ts'), `
export function processPayment(amount: number, currency: string) {
  if (amount <= 0) throw new Error('Invalid amount');
  if (currency !== 'USD' && currency !== 'EUR') throw new Error('Unsupported currency');
  return { status: 'success', amount, currency };
}

export function refundPayment(paymentId: string, reason: string) {
  if (!paymentId) throw new Error('Payment ID required');
  if (!reason) throw new Error('Reason required');
  return { status: 'refunded', paymentId, reason };
}
`);

  // Create a test directory with ONE test (partial coverage)
  fs.mkdirSync(path.join(dir, 'test'), { recursive: true });
  fs.writeFileSync(path.join(dir, 'test', 'billing.test.ts'), `
import { describe, test, expect } from 'vitest';
import { processPayment } from '../src/billing';

describe('processPayment', () => {
  test('processes valid payment', () => {
    const result = processPayment(100, 'USD');
    expect(result.status).toBe('success');
  });
  // GAP: no test for invalid amount
  // GAP: no test for unsupported currency
  // GAP: refundPayment not tested at all
});
`);

  // Run one git command inside the fixture. A failure is surfaced right away:
  // a half-initialised repo would otherwise only show up later as a confusing
  // failure in whichever test consumes the fixture.
  const git = (...args: string[]): void => {
    const res = spawnSync('git', args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
    if (res.error || res.status !== 0) {
      const detail = res.error?.message ?? res.stderr?.toString().trim() ?? '';
      throw new Error(`git ${args.join(' ')} failed in ${dir}: ${detail}`);
    }
  };

  // Init git repo with main branch
  git('init', '-b', 'main');
  git('config', 'user.email', 'test@test.com');
  git('config', 'user.name', 'Test');
  git('add', '.');
  git('commit', '-m', 'initial commit');

  // Create feature branch
  git('checkout', '-b', 'feature/billing');
}
|
||||
+16
-6
@@ -13,11 +13,12 @@
|
||||
* Skips gracefully when prerequisites are not met.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, afterAll } from 'bun:test';
|
||||
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||
import { runGeminiSkill } from './helpers/gemini-session-runner';
|
||||
import type { GeminiResult } from './helpers/gemini-session-runner';
|
||||
import { EvalCollector } from './helpers/eval-store';
|
||||
import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
|
||||
import { createTestWorktree, harvestAndCleanup } from './helpers/e2e-helpers';
|
||||
import * as path from 'path';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
@@ -76,7 +77,7 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
|
||||
/** Skip an individual test if not selected by diff-based selection. */
|
||||
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
|
||||
const shouldRun = selectedTests === null || selectedTests.includes(testName);
|
||||
(shouldRun ? test : test.skip)(testName, fn, timeout);
|
||||
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
|
||||
}
|
||||
|
||||
// --- Eval result collector ---
|
||||
@@ -114,13 +115,22 @@ afterAll(async () => {
|
||||
// --- Tests ---
|
||||
|
||||
describeGemini('Gemini E2E', () => {
|
||||
let testWorktree: string;
|
||||
|
||||
beforeAll(() => {
|
||||
testWorktree = createTestWorktree('gemini');
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
harvestAndCleanup('gemini');
|
||||
});
|
||||
|
||||
testIfSelected('gemini-discover-skill', async () => {
|
||||
// Run Gemini in the repo root where .agents/skills/ exists
|
||||
// Run Gemini in an isolated worktree (has .agents/skills/ copied from ROOT)
|
||||
const result = await runGeminiSkill({
|
||||
prompt: 'List any skills or instructions you have available. Just list the names.',
|
||||
timeoutMs: 60_000,
|
||||
cwd: ROOT,
|
||||
cwd: testWorktree,
|
||||
});
|
||||
|
||||
logGeminiCost('gemini-discover-skill', result);
|
||||
@@ -139,11 +149,11 @@ describeGemini('Gemini E2E', () => {
|
||||
}, 120_000);
|
||||
|
||||
testIfSelected('gemini-review-findings', async () => {
|
||||
// Run gstack-review skill via Gemini on this repo
|
||||
// Run gstack-review skill via Gemini on worktree (isolated from main working tree)
|
||||
const result = await runGeminiSkill({
|
||||
prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.',
|
||||
timeoutMs: 540_000,
|
||||
cwd: ROOT,
|
||||
cwd: testWorktree,
|
||||
});
|
||||
|
||||
logGeminiCost('gemini-review-findings', result);
|
||||
|
||||
+1237
-17
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,187 @@
|
||||
import { describe, test, expect, beforeEach, afterEach } from "bun:test";
|
||||
import { mkdtempSync, mkdirSync, writeFileSync, rmSync, existsSync } from "fs";
|
||||
import { join } from "path";
|
||||
import { tmpdir } from "os";
|
||||
import { spawnSync } from "child_process";
|
||||
|
||||
// Import normalizeRemoteUrl for unit testing
|
||||
// We test the script end-to-end via CLI and normalizeRemoteUrl via import
|
||||
const scriptPath = join(import.meta.dir, "..", "bin", "gstack-global-discover.ts");
|
||||
|
||||
describe("gstack-global-discover", () => {
|
||||
describe("normalizeRemoteUrl", () => {
  // The function under test is loaded via dynamic import before each test.
  let normalizeRemoteUrl: (url: string) => string;

  beforeEach(async () => {
    ({ normalizeRemoteUrl } = await import("../bin/gstack-global-discover.ts"));
  });

  // Shared assertion: `input` must normalize to exactly `expected`.
  const expectNormalized = (input: string, expected: string): void => {
    expect(normalizeRemoteUrl(input)).toBe(expected);
  };

  test("strips .git suffix", () => {
    expectNormalized("https://github.com/user/repo.git", "https://github.com/user/repo");
  });

  test("converts SSH to HTTPS", () => {
    expectNormalized("git@github.com:user/repo.git", "https://github.com/user/repo");
  });

  test("converts SSH without .git to HTTPS", () => {
    expectNormalized("git@github.com:user/repo", "https://github.com/user/repo");
  });

  test("lowercases host", () => {
    expectNormalized("https://GitHub.COM/user/repo", "https://github.com/user/repo");
  });

  test("SSH and HTTPS for same repo normalize to same URL", () => {
    const [fromSsh, fromHttps, fromHttpsNoSuffix] = [
      "git@github.com:garrytan/gstack.git",
      "https://github.com/garrytan/gstack.git",
      "https://github.com/garrytan/gstack",
    ].map((url) => normalizeRemoteUrl(url));
    expect(fromSsh).toBe(fromHttps);
    expect(fromHttps).toBe(fromHttpsNoSuffix);
  });

  test("handles local: URLs consistently", () => {
    // local: gets parsed as a URL scheme — only the path part is asserted on
    const normalized = normalizeRemoteUrl("local:/tmp/my-repo");
    expect(normalized).toContain("/tmp/my-repo");
  });

  test("handles GitLab SSH URLs", () => {
    expectNormalized("git@gitlab.com:org/project.git", "https://gitlab.com/org/project");
  });
});
|
||||
|
||||
describe("CLI", () => {
  // Run the discover script through `bun run` and capture its output as text.
  const runCli = (cliArgs: string[], timeout: number) =>
    spawnSync("bun", ["run", scriptPath, ...cliArgs], { encoding: "utf-8", timeout });

  test("--help exits 0 and prints usage", () => {
    const { status, stderr } = runCli(["--help"], 10000);
    expect(status).toBe(0);
    expect(stderr).toContain("--since");
  });

  test("no args exits 1 with error", () => {
    const { status, stderr } = runCli([], 10000);
    expect(status).toBe(1);
    expect(stderr).toContain("--since is required");
  });

  test("invalid window format exits 1", () => {
    const { status, stderr } = runCli(["--since", "abc"], 10000);
    expect(status).toBe(1);
    expect(stderr).toContain("Invalid window format");
  });

  test("--since 7d produces valid JSON", () => {
    const { status, stdout } = runCli(["--since", "7d", "--format", "json"], 30000);
    expect(status).toBe(0);
    const report = JSON.parse(stdout);
    expect(report).toHaveProperty("window", "7d");
    for (const key of ["repos", "total_sessions", "total_repos", "tools"]) {
      expect(report).toHaveProperty(key);
    }
    expect(Array.isArray(report.repos)).toBe(true);
  });

  test("--since 7d --format summary produces readable output", () => {
    const { status, stdout } = runCli(["--since", "7d", "--format", "summary"], 30000);
    expect(status).toBe(0);
    for (const fragment of ["Window: 7d", "Sessions:", "Repos:"]) {
      expect(stdout).toContain(fragment);
    }
  });

  test("--since 1h returns results (may be empty)", () => {
    const { status, stdout } = runCli(["--since", "1h", "--format", "json"], 30000);
    expect(status).toBe(0);
    const report = JSON.parse(stdout);
    expect(report.total_sessions).toBeGreaterThanOrEqual(0);
  });
});
|
||||
|
||||
describe("discovery output structure", () => {
  /**
   * Run a 30-day discovery and return the parsed JSON report.
   * The exit status is asserted first so a CLI failure is reported as such
   * rather than as a JSON parse error on empty stdout.
   */
  const discover30d = () => {
    const result = spawnSync(
      "bun",
      ["run", scriptPath, "--since", "30d", "--format", "json"],
      { encoding: "utf-8", timeout: 30000 }
    );
    expect(result.status).toBe(0);
    return JSON.parse(result.stdout);
  };

  test("repos have required fields", () => {
    const json = discover30d();

    for (const repo of json.repos) {
      expect(repo).toHaveProperty("name");
      expect(repo).toHaveProperty("remote");
      expect(repo).toHaveProperty("paths");
      expect(repo).toHaveProperty("sessions");
      expect(Array.isArray(repo.paths)).toBe(true);
      expect(repo.paths.length).toBeGreaterThan(0);
      expect(repo.sessions).toHaveProperty("claude_code");
      expect(repo.sessions).toHaveProperty("codex");
      expect(repo.sessions).toHaveProperty("gemini");
    }
  });

  test("tools summary matches repo data", () => {
    const json = discover30d();

    // Total sessions should equal sum across tools
    const toolTotal =
      json.tools.claude_code.total_sessions +
      json.tools.codex.total_sessions +
      json.tools.gemini.total_sessions;
    expect(json.total_sessions).toBe(toolTotal);
  });

  test("deduplicates Conductor workspaces by remote", () => {
    const json = discover30d();

    // Check that no two repos share the same normalized remote
    const remotes: string[] = json.repos.map((r: { remote: string }) => r.remote);
    const uniqueRemotes = new Set(remotes);
    expect(remotes.length).toBe(uniqueRemotes.size);
  });
});
|
||||
});
|
||||
@@ -27,6 +27,7 @@ export interface CodexResult {
|
||||
durationMs: number; // Wall clock time
|
||||
sessionId: string | null; // Thread ID for session continuity
|
||||
rawLines: string[]; // Raw JSONL lines for debugging
|
||||
stderr: string; // Stderr output (skill loading errors, auth failures)
|
||||
}
|
||||
|
||||
// --- JSONL parser (ported from Python in codex/SKILL.md.tmpl) ---
|
||||
@@ -98,7 +99,8 @@ export function parseCodexJSONL(lines: string[]): ParsedCodexJSONL {
|
||||
|
||||
/**
|
||||
* Install a SKILL.md into a temp HOME directory for Codex to discover.
|
||||
* Creates ~/.codex/skills/{skillName}/SKILL.md in the temp HOME.
|
||||
* Creates ~/.codex/skills/{skillName}/SKILL.md in the temp HOME and copies
|
||||
* agents/openai.yaml when present so Codex sees the same metadata as a real install.
|
||||
*
|
||||
* Returns the temp HOME path. Caller is responsible for cleanup.
|
||||
*/
|
||||
@@ -116,6 +118,13 @@ export function installSkillToTempHome(
|
||||
fs.copyFileSync(srcSkill, path.join(destDir, 'SKILL.md'));
|
||||
}
|
||||
|
||||
const srcOpenAIYaml = path.join(skillDir, 'agents', 'openai.yaml');
|
||||
if (fs.existsSync(srcOpenAIYaml)) {
|
||||
const destAgentsDir = path.join(destDir, 'agents');
|
||||
fs.mkdirSync(destAgentsDir, { recursive: true });
|
||||
fs.copyFileSync(srcOpenAIYaml, path.join(destAgentsDir, 'openai.yaml'));
|
||||
}
|
||||
|
||||
return home;
|
||||
}
|
||||
|
||||
@@ -159,6 +168,7 @@ export async function runCodexSkill(opts: {
|
||||
durationMs: Date.now() - startTime,
|
||||
sessionId: null,
|
||||
rawLines: [],
|
||||
stderr: '',
|
||||
};
|
||||
}
|
||||
|
||||
@@ -274,6 +284,7 @@ export async function runCodexSkill(opts: {
|
||||
durationMs,
|
||||
sessionId: parsed.sessionId,
|
||||
rawLines: collectedLines,
|
||||
stderr,
|
||||
};
|
||||
} finally {
|
||||
// Clean up temp HOME
|
||||
|
||||
+71
-16
@@ -5,11 +5,13 @@
|
||||
* tests across multiple files by category.
|
||||
*/
|
||||
|
||||
import { describe, test, afterAll } from 'bun:test';
|
||||
import { describe, test, beforeAll, afterAll } from 'bun:test';
|
||||
import type { SkillTestResult } from './session-runner';
|
||||
import { EvalCollector, judgePassed } from './eval-store';
|
||||
import type { EvalTestEntry } from './eval-store';
|
||||
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './touchfiles';
|
||||
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, E2E_TIERS, GLOBAL_TOUCHFILES } from './touchfiles';
|
||||
import { WorktreeManager } from '../../lib/worktree';
|
||||
import type { HarvestResult } from '../../lib/worktree';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
@@ -30,13 +32,6 @@ export const evalsEnabled = !!process.env.EVALS;
|
||||
// Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch.
|
||||
export let selectedTests: string[] | null = null; // null = run all
|
||||
|
||||
// EVALS_FAST: skip the 8 slowest tests (all Opus quality tests) for quick feedback
|
||||
const FAST_EXCLUDED_TESTS = [
|
||||
'plan-ceo-review-selective', 'plan-ceo-review', 'retro', 'retro-base-branch',
|
||||
'design-consultation-core', 'design-consultation-existing',
|
||||
'qa-fix-loop', 'design-review-fix',
|
||||
];
|
||||
|
||||
if (evalsEnabled && !process.env.EVALS_ALL) {
|
||||
const baseBranch = process.env.EVALS_BASE
|
||||
|| detectBaseBranch(ROOT)
|
||||
@@ -55,15 +50,22 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
|
||||
// If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all
|
||||
}
|
||||
|
||||
// Apply EVALS_FAST filter after diff-based selection
|
||||
if (evalsEnabled && process.env.EVALS_FAST) {
|
||||
// EVALS_TIER: filter tests by tier after diff-based selection.
|
||||
// 'gate' = gate tests only (CI default — blocks merge)
|
||||
// 'periodic' = periodic tests only (weekly cron / manual)
|
||||
// not set = run all selected tests (local dev default, backward compat)
|
||||
if (evalsEnabled && process.env.EVALS_TIER) {
|
||||
const tier = process.env.EVALS_TIER as 'gate' | 'periodic';
|
||||
const tierTests = Object.entries(E2E_TIERS)
|
||||
.filter(([, t]) => t === tier)
|
||||
.map(([name]) => name);
|
||||
|
||||
if (selectedTests === null) {
|
||||
// Run all minus excluded
|
||||
selectedTests = Object.keys(E2E_TOUCHFILES).filter(t => !FAST_EXCLUDED_TESTS.includes(t));
|
||||
selectedTests = tierTests;
|
||||
} else {
|
||||
selectedTests = selectedTests.filter(t => !FAST_EXCLUDED_TESTS.includes(t));
|
||||
selectedTests = selectedTests.filter(t => tierTests.includes(t));
|
||||
}
|
||||
process.stderr.write(`EVALS_FAST: excluded ${FAST_EXCLUDED_TESTS.length} slow tests, running ${selectedTests.length}\n\n`);
|
||||
process.stderr.write(`EVALS_TIER=${tier}: ${selectedTests.length} tests\n\n`);
|
||||
}
|
||||
|
||||
export const describeE2E = evalsEnabled ? describe : describe.skip;
|
||||
@@ -205,7 +207,7 @@ export async function finalizeEvalCollector(evalCollector: EvalCollector | null)
|
||||
if (evalsEnabled) {
|
||||
const gstackDir = path.join(os.homedir(), '.gstack');
|
||||
fs.mkdirSync(gstackDir, { recursive: true });
|
||||
for (const f of ['.completeness-intro-seen', '.telemetry-prompted']) {
|
||||
for (const f of ['.completeness-intro-seen', '.telemetry-prompted', '.proactive-prompted']) {
|
||||
const p = path.join(gstackDir, f);
|
||||
if (!fs.existsSync(p)) fs.writeFileSync(p, '');
|
||||
}
|
||||
@@ -234,6 +236,59 @@ export function testConcurrentIfSelected(testName: string, fn: () => Promise<voi
|
||||
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
|
||||
}
|
||||
|
||||
// --- Worktree isolation ---
|
||||
|
||||
let worktreeManager: WorktreeManager | null = null;
|
||||
|
||||
/**
 * Return the module-wide WorktreeManager, constructing it on first use.
 * Stale worktrees are pruned once, right after the manager is created.
 */
export function getWorktreeManager(): WorktreeManager {
  if (worktreeManager) return worktreeManager;

  const created = new WorktreeManager();
  // Store before pruning so the instance is kept even if pruneStale() throws.
  worktreeManager = created;
  created.pruneStale();
  return created;
}
|
||||
|
||||
/**
 * Create an isolated worktree for the named test via the shared manager.
 *
 * @param testName Identifier passed through to WorktreeManager.create().
 * @returns The worktree path.
 */
export function createTestWorktree(testName: string): string {
  const manager = getWorktreeManager();
  return manager.create(testName);
}
|
||||
|
||||
/**
 * Harvest changes from the named test's worktree, log a short summary to
 * stderr, and clean the worktree up. Call in afterAll().
 *
 * Cleanup runs in a `finally` block so the worktree is removed even when
 * harvesting throws; the harvest error still propagates to the caller.
 *
 * @param testName Identifier the worktree was created under.
 * @returns The HarvestResult for eval integration, or null when harvest() returns null.
 */
export function harvestAndCleanup(testName: string): HarvestResult | null {
  const mgr = getWorktreeManager();
  try {
    const result = mgr.harvest(testName);
    if (result) {
      if (result.isDuplicate) {
        process.stderr.write(`\n HARVEST [${testName}]: duplicate patch (skipped)\n`);
      } else {
        process.stderr.write(`\n HARVEST [${testName}]: ${result.changedFiles.length} files changed\n`);
        process.stderr.write(` Patch: ${result.patchPath}\n`);
        process.stderr.write(` ${result.diffStat}\n\n`);
      }
    }
    return result;
  } finally {
    mgr.cleanup(testName);
  }
}
|
||||
|
||||
/**
 * Describe-block wrapper that gives its tests an isolated worktree.
 *
 * A worktree named after the suite is created in beforeAll() and harvested
 * and cleaned up in afterAll(). The suite body receives a getter instead of
 * the path itself, because the path is only assigned once beforeAll() runs.
 *
 * Note: tests with planted-bug fixtures should NOT use this — they need
 * their fixture repos.
 *
 * @param name Suite name; also used as the worktree's test name.
 * @param testNames Test names forwarded to describeIfSelected().
 * @param fn Suite body; call the supplied getter inside tests to read the path.
 */
export function describeWithWorktree(
  name: string,
  testNames: string[],
  fn: (getWorktreePath: () => string) => void,
) {
  const suiteBody = (): void => {
    let worktreePath: string;
    const readWorktreePath = (): string => worktreePath;

    beforeAll(() => {
      worktreePath = createTestWorktree(name);
    });
    afterAll(() => {
      harvestAndCleanup(name);
    });

    fn(readWorktreePath);
  };

  describeIfSelected(name, testNames, suiteBody);
}
|
||||
|
||||
export { judgePassed } from './eval-store';
|
||||
export { EvalCollector } from './eval-store';
|
||||
export type { EvalTestEntry } from './eval-store';
|
||||
export type { HarvestResult } from '../../lib/worktree';
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
* Eval result persistence and comparison.
|
||||
*
|
||||
* EvalCollector accumulates test results, writes them to
|
||||
* ~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json,
|
||||
* ~/.gstack/projects/$SLUG/evals/{version}-{branch}-{tier}-{timestamp}.json,
|
||||
* prints a summary table, and auto-compares with the previous run.
|
||||
*
|
||||
* Comparison functions are exported for reuse by the eval:compare CLI.
|
||||
@@ -16,7 +16,32 @@ import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '
|
||||
import type { CostEntry } from '../../lib/eval-format';
|
||||
|
||||
const SCHEMA_VERSION = 1;
|
||||
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
|
||||
const LEGACY_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
|
||||
|
||||
/**
 * Resolve the directory eval results are stored in.
 *
 * Runs gstack-slug (repo-local copy first, then the one under ~/.claude) and
 * looks for a `SLUG=<value>` line in its output. When found, the directory
 * ~/.gstack/projects/<value>/evals is created if needed and returned.
 * In every other case, including any thrown error, the legacy
 * ~/.gstack-dev/evals path is returned without being created here.
 */
export function getProjectEvalDir(): string {
  try {
    // Try repo-local gstack-slug first, then global install
    const probe = spawnSync(
      'bash',
      ['-c', '.claude/skills/gstack/bin/gstack-slug 2>/dev/null || ~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null'],
      { stdio: 'pipe', timeout: 3000 },
    );
    const stdoutText = probe.stdout?.toString().trim();
    const slug = stdoutText ? stdoutText.match(/^SLUG=(.+)$/m)?.[1] : undefined;
    if (slug) {
      const evalDir = path.join(os.homedir(), '.gstack', 'projects', slug, 'evals');
      fs.mkdirSync(evalDir, { recursive: true });
      return evalDir;
    }
  } catch { /* fall through */ }
  return LEGACY_EVAL_DIR;
}
|
||||
|
||||
const DEFAULT_EVAL_DIR = getProjectEvalDir();
|
||||
|
||||
// --- Interfaces ---
|
||||
|
||||
@@ -60,6 +85,13 @@ export interface EvalTestEntry {
|
||||
costs?: CostEntry[];
|
||||
|
||||
error?: string;
|
||||
|
||||
// Worktree harvest data
|
||||
harvest?: {
|
||||
filesChanged: number;
|
||||
patchPath: string;
|
||||
isDuplicate: boolean;
|
||||
};
|
||||
}
|
||||
|
||||
export interface EvalResult {
|
||||
|
||||
@@ -9,15 +9,23 @@
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util';
|
||||
import { getProjectEvalDir } from './eval-store';
|
||||
import type { CostEntry } from '../../lib/eval-format';
|
||||
import { resolveTier, tierToModel } from '../../lib/eval-tier';
|
||||
|
||||
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
|
||||
const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev');
|
||||
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json'); // heartbeat stays global
|
||||
const PROJECT_DIR = path.dirname(getProjectEvalDir()); // ~/.gstack/projects/$SLUG/
|
||||
|
||||
/** Sanitize test name for use as filename: strip leading slashes, replace / with - */
|
||||
export function sanitizeTestName(name: string): string {
|
||||
return sanitizeForFilename(name);
|
||||
return name.replace(/^\/+/, '').replace(/\//g, '-');
|
||||
}
|
||||
|
||||
/**
 * Write `data` to `filePath` via a temp file: the content is written to
 * `<filePath>.tmp` and then renamed onto `filePath`, so readers never see a
 * half-written destination file.
 *
 * This function does NOT swallow errors — a failed write or rename is rethrown
 * to the caller, who decides whether the failure is fatal. Before rethrowing,
 * the temp file is removed so a failed attempt does not leave `<filePath>.tmp`
 * behind.
 *
 * @param filePath Destination path.
 * @param data     Text to write.
 * @throws Whatever `fs.writeFileSync` / `fs.renameSync` threw.
 */
function atomicWriteSync(filePath: string, data: string): void {
  const tmp = filePath + '.tmp';
  try {
    fs.writeFileSync(tmp, data);
    fs.renameSync(tmp, filePath);
  } catch (err) {
    // Clean up the partial temp file; it may not exist if the write itself failed.
    try {
      fs.unlinkSync(tmp);
    } catch {
      /* nothing to clean up */
    }
    throw err;
  }
}
|
||||
|
||||
export interface CostEstimate {
|
||||
@@ -140,15 +148,13 @@ export async function runSkillTest(options: {
|
||||
const safeName = testName ? sanitizeTestName(testName) : null;
|
||||
if (runId) {
|
||||
try {
|
||||
runDir = path.join(GSTACK_DEV_DIR, 'e2e-runs', runId);
|
||||
runDir = path.join(PROJECT_DIR, 'e2e-runs', runId);
|
||||
fs.mkdirSync(runDir, { recursive: true });
|
||||
} catch { /* non-fatal */ }
|
||||
}
|
||||
|
||||
// Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
|
||||
// avoid shell escaping issues. --verbose is required for stream-json mode.
|
||||
// Model pinned via EVAL_TIER env var (default: sonnet).
|
||||
const evalModel = tierToModel(resolveTier());
|
||||
const args = [
|
||||
'-p',
|
||||
'--model', model,
|
||||
|
||||
+211
-46
@@ -32,25 +32,25 @@ export function matchGlob(file: string, pattern: string): boolean {
|
||||
* Each test lists the file patterns that, if changed, require the test to run.
|
||||
*/
|
||||
export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// Browse core
|
||||
'browse-basic': ['browse/src/**'],
|
||||
'browse-snapshot': ['browse/src/**'],
|
||||
// Browse core (+ test-server dependency)
|
||||
'browse-basic': ['browse/src/**', 'browse/test/test-server.ts'],
|
||||
'browse-snapshot': ['browse/src/**', 'browse/test/test-server.ts'],
|
||||
|
||||
// SKILL.md setup + preamble (depend on ROOT SKILL.md only)
|
||||
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
// SKILL.md setup + preamble (depend on ROOT SKILL.md + gen-skill-docs)
|
||||
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
'contributor-mode': ['SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// QA
|
||||
'qa-quick': ['qa/**', 'browse/src/**'],
|
||||
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
|
||||
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
|
||||
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
|
||||
// QA (+ test-server dependency)
|
||||
'qa-quick': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'],
|
||||
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
|
||||
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
|
||||
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
|
||||
'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
|
||||
'qa-fix-loop': ['qa/**', 'browse/src/**'],
|
||||
'qa-fix-loop': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'],
|
||||
'qa-bootstrap': ['qa/**', 'ship/**'],
|
||||
|
||||
// Review
|
||||
@@ -68,58 +68,94 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'plan-ceo-review-benefits': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
'plan-eng-review': ['plan-eng-review/**'],
|
||||
'plan-eng-review-artifact': ['plan-eng-review/**'],
|
||||
'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Codex offering verification
|
||||
'codex-offered-office-hours': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
|
||||
'codex-offered-ceo-review': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
'codex-offered-design-review': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
'codex-offered-eng-review': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Ship
|
||||
'ship-base-branch': ['ship/**'],
|
||||
'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
|
||||
'ship-local-workflow': ['ship/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Setup browser cookies
|
||||
'setup-cookies-detect': ['setup-browser-cookies/**'],
|
||||
'review-dashboard-via': ['ship/**', 'scripts/resolvers/review.ts', 'codex/**', 'autoplan/**', 'land-and-deploy/**'],
|
||||
'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
|
||||
'ship-plan-verification': ['ship/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Retro
|
||||
'retro': ['retro/**'],
|
||||
'retro-base-branch': ['retro/**'],
|
||||
|
||||
// Global discover
|
||||
'global-discover': ['bin/gstack-global-discover.ts', 'test/global-discover.test.ts'],
|
||||
|
||||
// CSO
|
||||
'cso-full-audit': ['cso/**'],
|
||||
'cso-diff-mode': ['cso/**'],
|
||||
'cso-infra-scope': ['cso/**'],
|
||||
|
||||
// Document-release
|
||||
'document-release': ['document-release/**'],
|
||||
|
||||
// Codex (Claude E2E — tests /codex skill via Claude)
|
||||
'codex-review': ['codex/**'],
|
||||
|
||||
// Codex E2E (tests skills via Codex CLI)
|
||||
'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'],
|
||||
'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'],
|
||||
// Codex E2E (tests skills via Codex CLI + worktree)
|
||||
'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'],
|
||||
'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'],
|
||||
|
||||
// Gemini E2E (tests skills via Gemini CLI)
|
||||
'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'],
|
||||
'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'],
|
||||
// Gemini E2E (tests skills via Gemini CLI + worktree)
|
||||
'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'],
|
||||
'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'],
|
||||
|
||||
|
||||
// Ship coverage audit
|
||||
'ship-coverage-audit': ['ship/**'],
|
||||
// Coverage audit (shared fixture) + triage + gates
|
||||
'ship-coverage-audit': ['ship/**', 'test/fixtures/coverage-audit-fixture.ts', 'bin/gstack-repo-mode'],
|
||||
'review-coverage-audit': ['review/**', 'test/fixtures/coverage-audit-fixture.ts'],
|
||||
'plan-eng-coverage-audit': ['plan-eng-review/**', 'test/fixtures/coverage-audit-fixture.ts'],
|
||||
'ship-triage': ['ship/**', 'bin/gstack-repo-mode'],
|
||||
|
||||
// Plan completion audit + verification
|
||||
'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
|
||||
'ship-plan-verification': ['ship/**', 'qa-only/**', 'scripts/gen-skill-docs.ts'],
|
||||
'review-plan-completion': ['review/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Design
|
||||
'design-consultation-core': ['design-consultation/**'],
|
||||
'design-consultation-existing': ['design-consultation/**'],
|
||||
'design-consultation-research': ['design-consultation/**'],
|
||||
'design-consultation-preview': ['design-consultation/**'],
|
||||
'plan-design-review-plan-mode': ['plan-design-review/**'],
|
||||
'plan-design-review-no-ui-scope': ['plan-design-review/**'],
|
||||
'design-review-fix': ['design-review/**', 'browse/src/**'],
|
||||
'design-consultation-core': ['design-consultation/**', 'scripts/gen-skill-docs.ts', 'test/helpers/llm-judge.ts'],
|
||||
'design-consultation-existing': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
|
||||
'design-consultation-research': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
|
||||
'design-consultation-preview': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
|
||||
'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
'plan-design-review-no-ui-scope': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
'design-review-fix': ['design-review/**', 'browse/src/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Design Shotgun
|
||||
'design-shotgun-path': ['design-shotgun/**', 'design/src/**', 'scripts/resolvers/design.ts'],
|
||||
'design-shotgun-session': ['design-shotgun/**', 'scripts/resolvers/design.ts'],
|
||||
'design-shotgun-full': ['design-shotgun/**', 'design/src/**', 'browse/src/**'],
|
||||
|
||||
// gstack-upgrade
|
||||
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
|
||||
|
||||
// Deploy skills
|
||||
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
|
||||
'canary-workflow': ['canary/**', 'browse/src/**'],
|
||||
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
|
||||
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
|
||||
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
|
||||
'land-and-deploy-first-run': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts', 'bin/gstack-slug'],
|
||||
'land-and-deploy-review-gate': ['land-and-deploy/**', 'bin/gstack-review-read'],
|
||||
'canary-workflow': ['canary/**', 'browse/src/**'],
|
||||
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
|
||||
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Sidebar agent
|
||||
'sidebar-navigate': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/**'],
|
||||
'sidebar-url-accuracy': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/background.js'],
|
||||
|
||||
// Autoplan
|
||||
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
|
||||
|
||||
// Skill routing — journey-stage tests (depend on ALL skill descriptions)
|
||||
'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'journey-think-bigger': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'journey-debug': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'journey-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'journey-code-review': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
@@ -130,6 +166,133 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'journey-visual-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
};
|
||||
|
||||
/**
 * Tier assignment for every E2E test, keyed by test name.
 *
 *   - 'gate'     — blocks PRs.
 *   - 'periodic' — runs weekly / on demand.
 *
 * The key set must be exactly the key set of E2E_TOUCHFILES.
 */
export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
  // Browse core: gate — if browse breaks, everything breaks.
  'browse-basic':                   'gate',
  'browse-snapshot':                'gate',

  // SKILL.md setup: gate — if setup breaks, no skill works.
  'skillmd-setup-discovery':        'gate',
  'skillmd-no-local-binary':        'gate',
  'skillmd-outside-git':            'gate',
  'contributor-mode':               'gate',
  'session-awareness':              'gate',

  // QA: functional checks gate; quality/benchmark runs are periodic.
  'qa-quick':                       'gate',
  'qa-b6-static':                   'periodic',
  'qa-b7-spa':                      'periodic',
  'qa-b8-checkout':                 'periodic',
  'qa-only-no-fix':                 'gate',      // CRITICAL guardrail: Edit tool forbidden
  'qa-fix-loop':                    'periodic',
  'qa-bootstrap':                   'gate',

  // Review: functional checks and guardrails gate; quality runs are periodic.
  'review-sql-injection':           'gate',      // Security guardrail
  'review-enum-completeness':       'gate',
  'review-base-branch':             'gate',
  'review-design-lite':             'periodic',  // 4/7 threshold is subjective
  'review-coverage-audit':          'gate',
  'review-plan-completion':         'gate',
  'review-dashboard-via':           'gate',

  // Office Hours
  'office-hours-spec-review':       'gate',

  // Plan reviews: cheap functional checks gate; Opus quality runs are periodic.
  'plan-ceo-review':                'periodic',
  'plan-ceo-review-selective':      'periodic',
  'plan-ceo-review-benefits':       'gate',
  'plan-eng-review':                'periodic',
  'plan-eng-review-artifact':       'periodic',
  'plan-eng-coverage-audit':        'gate',
  'plan-review-report':             'gate',

  // Codex offering verification
  'codex-offered-office-hours':     'gate',
  'codex-offered-ceo-review':       'gate',
  'codex-offered-design-review':    'gate',
  'codex-offered-eng-review':       'gate',

  // Ship: gate — end-to-end ship path.
  'ship-base-branch':               'gate',
  'ship-local-workflow':            'gate',
  'ship-coverage-audit':            'gate',
  'ship-triage':                    'gate',
  'ship-plan-completion':           'gate',
  'ship-plan-verification':         'gate',

  // Retro: cheap branch detection gates; the full Opus retro is periodic.
  'retro':                          'periodic',
  'retro-base-branch':              'gate',

  // Global discover
  'global-discover':                'gate',

  // CSO: security guardrails gate; quality runs are periodic.
  'cso-full-audit':                 'gate',      // Hardcoded secrets detection
  'cso-diff-mode':                  'gate',
  'cso-infra-scope':                'periodic',

  // Document-release: gate — CHANGELOG guardrail.
  'document-release':               'gate',

  // Codex: periodic — Opus, requires codex CLI.
  'codex-review':                   'periodic',

  // Multi-AI: periodic — these require external CLIs.
  'codex-discover-skill':           'periodic',
  'codex-review-findings':          'periodic',
  'gemini-discover-skill':          'periodic',
  'gemini-review-findings':         'periodic',

  // Design: cheap functional checks gate; Opus/quality runs are periodic.
  'design-consultation-core':       'periodic',
  'design-consultation-existing':   'periodic',
  'design-consultation-research':   'gate',
  'design-consultation-preview':    'gate',
  'plan-design-review-plan-mode':   'periodic',
  'plan-design-review-no-ui-scope': 'gate',
  'design-review-fix':              'periodic',
  'design-shotgun-path':            'gate',
  'design-shotgun-session':         'gate',
  'design-shotgun-full':            'periodic',

  // gstack-upgrade
  'gstack-upgrade-happy-path':      'gate',

  // Deploy skills
  'land-and-deploy-workflow':       'gate',
  'land-and-deploy-first-run':      'gate',
  'land-and-deploy-review-gate':    'gate',
  'canary-workflow':                'gate',
  'benchmark-workflow':             'gate',
  'setup-deploy-workflow':          'gate',

  // Sidebar agent
  'sidebar-navigate':               'periodic',
  'sidebar-url-accuracy':           'periodic',

  // Autoplan: periodic — not yet implemented.
  'autoplan-core':                  'periodic',

  // Skill routing: periodic — LLM routing is non-deterministic.
  'journey-ideation':               'periodic',
  'journey-plan-eng':               'periodic',
  'journey-debug':                  'periodic',
  'journey-qa':                     'periodic',
  'journey-code-review':            'periodic',
  'journey-ship':                   'periodic',
  'journey-docs':                   'periodic',
  'journey-retro':                  'periodic',
  'journey-design-system':          'periodic',
  'journey-visual-qa':              'periodic',
};
|
||||
|
||||
/**
|
||||
* LLM-judge test touchfiles — keyed by test description string.
|
||||
*/
|
||||
@@ -172,20 +335,22 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
|
||||
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
|
||||
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
|
||||
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
|
||||
|
||||
// Voice directive
|
||||
'voice directive tone': ['scripts/resolvers/preamble.ts', 'review/SKILL.md', 'review/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
};
|
||||
|
||||
/**
|
||||
* Changes to any of these files trigger ALL tests (both E2E and LLM-judge).
|
||||
*
|
||||
* Keep this list minimal — only files that genuinely affect every test.
|
||||
* Scoped dependencies (gen-skill-docs, llm-judge, test-server, worktree,
|
||||
* codex/gemini session runners) belong in individual test entries instead.
|
||||
*/
|
||||
export const GLOBAL_TOUCHFILES = [
|
||||
'test/helpers/session-runner.ts',
|
||||
'test/helpers/codex-session-runner.ts',
|
||||
'test/helpers/gemini-session-runner.ts',
|
||||
'test/helpers/eval-store.ts',
|
||||
'test/helpers/llm-judge.ts',
|
||||
'scripts/gen-skill-docs.ts',
|
||||
'test/helpers/touchfiles.ts',
|
||||
'browse/test/test-server.ts',
|
||||
'test/helpers/session-runner.ts', // All E2E tests use this runner
|
||||
'test/helpers/eval-store.ts', // All E2E tests store results here
|
||||
'test/helpers/touchfiles.ts', // Self-referential — reclassifying wrong is dangerous
|
||||
];
|
||||
|
||||
// --- Base branch detection ---
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
|
||||
import { execSync, ExecSyncOptionsWithStringEncoding } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const BIN = path.join(ROOT, 'bin');
|
||||
|
||||
let tmpDir: string;
|
||||
let slugDir: string;
|
||||
|
||||
/**
 * Invoke `bin/gstack-review-log` with `input` as its single (shell-quoted)
 * argument, from the repo root, with `GSTACK_HOME` pointed at the per-test
 * temp directory.
 *
 * @param input Raw argument text; embedded single quotes are escaped for the shell.
 * @param opts  `expectFail: true` converts a failing invocation into a result
 *              instead of rethrowing.
 * @returns On success, trimmed stdout with `exitCode: 0`. On an expected
 *          failure, the `stdout` field carries the captured stderr text and
 *          `exitCode` is the process status (1 when the status is missing or 0).
 */
function run(input: string, opts: { expectFail?: boolean } = {}): { stdout: string; exitCode: number } {
  const execOpts: ExecSyncOptionsWithStringEncoding = {
    cwd: ROOT,
    env: { ...process.env, GSTACK_HOME: tmpDir },
    encoding: 'utf-8',
    timeout: 10000,
  };

  try {
    // Wrap the payload in single quotes, closing and reopening around each literal '.
    const quotedInput = `'${input.replace(/'/g, "'\\''")}'`;
    const printed = execSync(`${BIN}/gstack-review-log ${quotedInput}`, execOpts);
    return { stdout: printed.trim(), exitCode: 0 };
  } catch (e: any) {
    if (!opts.expectFail) {
      throw e;
    }
    return { stdout: e.stderr?.toString() || '', exitCode: e.status || 1 };
  }
}
|
||||
|
||||
/**
 * Create a fresh temp directory to act as GSTACK_HOME and pre-create its
 * `projects/` subdirectory, which the tests below list to find what the
 * script wrote.
 */
function createScratchHome(): void {
  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-revlog-'));
  slugDir = path.join(tmpDir, 'projects');
  fs.mkdirSync(slugDir, { recursive: true });
}

/** Delete the per-test temp directory and everything under it. */
function removeScratchHome(): void {
  fs.rmSync(tmpDir, { recursive: true, force: true });
}

beforeEach(createScratchHome);

afterEach(removeScratchHome);
|
||||
|
||||
describe('gstack-review-log', () => {
  // Happy path: a JSON payload ends up in a .jsonl file under projects/<dir>/.
  test('appends valid JSON to review JSONL file', () => {
    const { exitCode } = run('{"skill":"plan-eng-review","status":"clean"}');
    expect(exitCode).toBe(0);

    // Locate whatever project directory and JSONL file the script created.
    const projects = fs.readdirSync(slugDir);
    expect(projects.length).toBeGreaterThan(0);
    const firstProject = path.join(slugDir, projects[0]);
    const logs = fs.readdirSync(firstProject).filter((entry) => entry.endsWith('.jsonl'));
    expect(logs.length).toBeGreaterThan(0);

    const raw = fs.readFileSync(path.join(firstProject, logs[0]), 'utf-8').trim();
    const record = JSON.parse(raw);
    expect(record.skill).toBe('plan-eng-review');
    expect(record.status).toBe('clean');
  });

  // Invalid payload: the script must fail and must not record anything.
  test('rejects non-JSON input with non-zero exit code', () => {
    const { exitCode } = run('not json at all', { expectFail: true });
    expect(exitCode).not.toBe(0);

    // Nothing written at all is fine; if a log file exists it has to be empty.
    const projects = fs.readdirSync(slugDir);
    if (projects.length === 0) return;

    const firstProject = path.join(slugDir, projects[0]);
    const logs = fs.readdirSync(firstProject).filter((entry) => entry.endsWith('.jsonl'));
    if (logs.length === 0) return;

    const raw = fs.readFileSync(path.join(firstProject, logs[0]), 'utf-8').trim();
    expect(raw).toBe('');
  });
});
|
||||
@@ -25,7 +25,11 @@ describeIfSelected('Skill E2E tests', [
|
||||
testServer = startTestServer();
|
||||
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
|
||||
setupBrowseShims(tmpDir);
|
||||
});
|
||||
|
||||
// Pre-warm the browse server so Chromium is already launched for tests.
|
||||
// In CI, Chromium can take 10-20s to launch (Docker + --no-sandbox).
|
||||
spawnSync(browseBin, ['goto', testServer.url], { cwd: tmpDir, timeout: 30000, stdio: 'pipe' });
|
||||
}, 45_000);
|
||||
|
||||
afterAll(() => {
|
||||
testServer?.server?.stop();
|
||||
@@ -41,7 +45,7 @@ describeIfSelected('Skill E2E tests', [
|
||||
4. $B screenshot /tmp/skill-e2e-test.png
|
||||
Report the results of each command.`,
|
||||
workingDirectory: tmpDir,
|
||||
maxTurns: 10,
|
||||
maxTurns: 7,
|
||||
timeout: 60_000,
|
||||
testName: 'browse-basic',
|
||||
runId,
|
||||
@@ -63,7 +67,7 @@ Report the results of each command.`,
|
||||
5. $B snapshot -i -a -o /tmp/skill-e2e-annotated.png
|
||||
Report what each command returned.`,
|
||||
workingDirectory: tmpDir,
|
||||
maxTurns: 10,
|
||||
maxTurns: 7,
|
||||
timeout: 60_000,
|
||||
testName: 'browse-snapshot',
|
||||
runId,
|
||||
@@ -274,12 +278,25 @@ Remember: _SESSIONS=4, so ELI16 mode is active. The user is juggling multiple wi
|
||||
expect(lower.includes('payment') || lower.includes('feature')).toBe(true);
|
||||
// Must mention what we're working on
|
||||
expect(lower.includes('stripe') || lower.includes('checkout') || lower.includes('payment')).toBe(true);
|
||||
// Must have a RECOMMENDATION
|
||||
expect(output).toContain('RECOMMENDATION');
|
||||
// Must have a recommendation or structured options
|
||||
expect(
|
||||
output.includes('RECOMMENDATION') ||
|
||||
lower.includes('recommend') ||
|
||||
lower.includes('option a') ||
|
||||
lower.includes('which do you want') ||
|
||||
lower.includes('which approach')
|
||||
).toBe(true);
|
||||
} else {
|
||||
// Check agent output as fallback
|
||||
const output = result.output || '';
|
||||
expect(output).toContain('RECOMMENDATION');
|
||||
const lowerOut = output.toLowerCase();
|
||||
expect(
|
||||
output.includes('RECOMMENDATION') ||
|
||||
lowerOut.includes('recommend') ||
|
||||
lowerOut.includes('option a') ||
|
||||
lowerOut.includes('which do you want') ||
|
||||
lowerOut.includes('which approach')
|
||||
).toBe(true);
|
||||
}
|
||||
|
||||
// Clean up
|
||||
@@ -0,0 +1,258 @@
|
||||
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||
import { runSkillTest } from './helpers/session-runner';
|
||||
import {
|
||||
ROOT, runId, evalsEnabled,
|
||||
describeIfSelected, logCost, recordE2E,
|
||||
createEvalCollector, finalizeEvalCollector,
|
||||
} from './helpers/e2e-helpers';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
// One collector, labelled 'e2e-cso', shared by every test in this file.
const evalCollector = createEvalCollector('e2e-cso');

// Finalize the shared collector once all tests in this file have finished.
afterAll(() => {
  finalizeEvalCollector(evalCollector);
});
|
||||
|
||||
// --- CSO v2 E2E Tests ---
|
||||
|
||||
/**
 * E2E: full `/cso` audit against a throwaway git repo seeded with two planted
 * problems — a hardcoded API key in server.ts and a committed .env file.
 */
describeIfSelected('CSO v2 — full audit', ['cso-full-audit'], () => {
  let csoDir: string;

  beforeAll(() => {
    csoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cso-'));

    // Run a command inside the fixture repo; results are not inspected.
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: csoDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    // Minimal app manifest.
    const manifest = {
      name: 'cso-test-app',
      version: '1.0.0',
      dependencies: { express: '4.18.0' },
    };
    fs.writeFileSync(path.join(csoDir, 'package.json'), JSON.stringify(manifest, null, 2));

    // Planted vuln: hardcoded API key
    fs.writeFileSync(path.join(csoDir, 'server.ts'), `
import express from 'express';
const app = express();
const API_KEY = "sk-1234567890abcdef1234567890abcdef";
app.get('/api/data', (req, res) => {
const id = req.query.id;
res.json({ data: \`result for \${id}\` });
});
app.listen(3000);
`);

    // Planted vuln: .env tracked by git
    fs.writeFileSync(path.join(csoDir, '.env'), 'DATABASE_URL=postgres://admin:secretpass@prod.db.example.com:5432/myapp\n');

    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
  });

  afterAll(() => {
    // Best-effort cleanup of the fixture repo.
    try { fs.rmSync(csoDir, { recursive: true, force: true }); } catch {}
  });

  test('/cso finds planted vulnerabilities', async () => {
    const result = await runSkillTest({
      prompt: `Read the file ${path.join(ROOT, 'cso', 'SKILL.md')} for the CSO skill instructions.

Run /cso on this repo (full daily audit, no flags).

IMPORTANT:
- Do NOT use AskUserQuestion — skip any interactive prompts.
- Focus on finding the planted vulnerabilities in this small repo.
- Produce the SECURITY FINDINGS table.
- Save the report to .gstack/security-reports/.`,
      workingDirectory: csoDir,
      maxTurns: 30,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob', 'Agent'],
      timeout: 300_000,
    });

    logCost('cso', result);
    expect(result.exitReason).toBe('success');

    // All text checks run against the lowercased output, so every needle is lowercase.
    const output = result.output.toLowerCase();

    // Should detect hardcoded API key
    const mentionsApiKey =
      output.includes('sk-') ||
      output.includes('hardcoded') ||
      output.includes('api key') ||
      output.includes('api_key');
    expect(mentionsApiKey).toBe(true);

    // Should detect .env tracked by git
    const mentionsTrackedEnv =
      output.includes('.env') && (output.includes('tracked') || output.includes('gitignore'));
    expect(mentionsTrackedEnv).toBe(true);

    // Should produce a findings table. The previous uppercase 'SECURITY FINDINGS'
    // alternative could never match lowercased text, so only the lowercase needle is kept.
    expect(output.includes('security findings')).toBe(true);

    // Report check is conditional: only asserted when the report directory exists.
    const reportDir = path.join(csoDir, '.gstack', 'security-reports');
    if (fs.existsSync(reportDir)) {
      const reports = fs.readdirSync(reportDir).filter((f) => f.endsWith('.json'));
      expect(reports.length).toBeGreaterThanOrEqual(1);
    }

    recordE2E(evalCollector, 'cso-full-audit', 'e2e-cso', result);
  }, 300_000);
});
|
||||
|
||||
/**
 * E2E: `/cso --diff` against a throwaway repo whose `main` branch is clean and
 * whose feature branch adds a webhook handler with no signature verification.
 */
describeIfSelected('CSO v2 — diff mode', ['cso-diff-mode'], () => {
  let csoDiffDir: string;

  beforeAll(() => {
    csoDiffDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cso-diff-'));

    // Run a git subcommand inside the fixture repo; results are not inspected.
    const git = (...gitArgs: string[]) =>
      spawnSync('git', gitArgs, { cwd: csoDiffDir, stdio: 'pipe', timeout: 5000 });
    const commitAll = (message: string) => {
      git('add', '.');
      git('commit', '-m', message);
    };

    git('init', '-b', 'main');
    git('config', 'user.email', 'test@test.com');
    git('config', 'user.name', 'Test');

    // Clean initial commit on main.
    const manifest = { name: 'cso-diff-test', version: '1.0.0' };
    fs.writeFileSync(path.join(csoDiffDir, 'package.json'), JSON.stringify(manifest, null, 2));
    fs.writeFileSync(path.join(csoDiffDir, 'app.ts'), 'console.log("hello");\n');
    commitAll('initial');

    // Feature branch that introduces the vulnerable webhook handler.
    git('checkout', '-b', 'feat/add-webhook');
    fs.writeFileSync(path.join(csoDiffDir, 'webhook.ts'), `
import express from 'express';
const app = express();
// No signature verification!
app.post('/webhook/stripe', (req, res) => {
const event = req.body;
processPayment(event);
res.sendStatus(200);
});
`);
    commitAll('feat: add webhook');
  });

  afterAll(() => {
    // Best-effort cleanup of the fixture repo.
    try { fs.rmSync(csoDiffDir, { recursive: true, force: true }); } catch {}
  });

  test('/cso --diff scopes to branch changes', async () => {
    const skillDoc = path.join(ROOT, 'cso', 'SKILL.md');
    const prompt = `Read the file ${skillDoc} for the CSO skill instructions.

Run /cso --diff on this repo. The base branch is "main".

IMPORTANT:
- Do NOT use AskUserQuestion — skip any interactive prompts.
- Focus on changes in the current branch vs main.
- The webhook.ts file was added on this branch — it should be analyzed.`;

    const result = await runSkillTest({
      prompt,
      workingDirectory: csoDiffDir,
      maxTurns: 25,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob', 'Agent'],
      timeout: 240_000,
    });

    logCost('cso', result);
    expect(result.exitReason).toBe('success');

    // The report must mention the webhook together with signature/verification wording.
    const output = result.output.toLowerCase();
    const mentionsWebhook = output.includes('webhook');
    const mentionsVerification = output.includes('signature') || output.includes('verify');
    expect(mentionsWebhook && mentionsVerification).toBe(true);

    recordE2E(evalCollector, 'cso-diff-mode', 'e2e-cso', result);
  }, 240_000);
});
|
||||
|
||||
/**
 * E2E: `/cso --infra` against a tiny throwaway git repo.
 *
 * The fixture contains exactly the three files the prompt promises:
 *   - .github/workflows/ci.yml  (uses a third-party action pinned to a branch)
 *   - Dockerfile                (no USER directive)
 *   - package.json              (minimal manifest so the prompt's file list is true)
 *
 * The test passes when the run exits successfully and the output mentions at least
 * one of the planted infrastructure findings.
 */
describeIfSelected('CSO v2 — infra scope', ['cso-infra-scope'], () => {
  let csoInfraDir: string;

  beforeAll(() => {
    csoInfraDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cso-infra-'));

    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: csoInfraDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    // CI workflow with unpinned action.
    // Nesting matters here: `build` must sit under `jobs`, and the steps under `steps`,
    // otherwise the file is not a meaningful workflow for the audit to inspect.
    fs.mkdirSync(path.join(csoInfraDir, '.github', 'workflows'), { recursive: true });
    fs.writeFileSync(path.join(csoInfraDir, '.github', 'workflows', 'ci.yml'), `
name: CI
on: [push]
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: some-third-party/action@main
      - run: echo "Building..."
`);

    // Dockerfile running as root
    fs.writeFileSync(path.join(csoInfraDir, 'Dockerfile'), `
FROM node:20
WORKDIR /app
COPY . .
RUN npm install
EXPOSE 3000
CMD ["node", "server.js"]
`);

    // Minimal package.json — the prompt tells the agent this file exists, so create it
    // rather than sending the agent after a missing file.
    fs.writeFileSync(
      path.join(csoInfraDir, 'package.json'),
      JSON.stringify({ name: 'cso-infra-fixture', version: '1.0.0', private: true }, null, 2) + '\n',
    );

    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
  });

  afterAll(() => {
    // Best-effort cleanup of the temp fixture.
    try { fs.rmSync(csoInfraDir, { recursive: true, force: true }); } catch {}
  });

  test('/cso --infra runs infrastructure phases only', async () => {
    const result = await runSkillTest({
      prompt: `Read the file ${path.join(ROOT, 'cso', 'SKILL.md')} for the CSO skill instructions.

Run /cso --infra on this repo. This should run infrastructure-only phases (0-6, 12-14).

IMPORTANT:
- Do NOT use AskUserQuestion — skip any interactive prompts.
- This is a TINY repo with only 3 files: .github/workflows/ci.yml, Dockerfile, and package.json. Do NOT waste turns exploring — just read those files directly and audit them.
- The Dockerfile has no USER directive (runs as root). The CI workflow uses an unpinned third-party GitHub Action (some-third-party/action@main).
- Focus on infrastructure findings, NOT code-level OWASP scanning.
- Skip the preamble (gstack-update-check, telemetry, etc.) — go straight to the audit.
- Do NOT use the Agent tool for exploration or verification — read the files yourself. This repo is too small to need subagents.`,
      workingDirectory: csoInfraDir,
      maxTurns: 30,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 360_000,
    });

    logCost('cso', result);

    // Guard against a missing output on failed runs so the exitReason assertion
    // (not a TypeError) is what reports the failure.
    const output = (result.output ?? '').toLowerCase();
    // Should mention unpinned action or Dockerfile issues
    const mentionsInfraFinding =
      output.includes('unpinned') || output.includes('third-party') ||
      output.includes('user directive') || output.includes('root');

    // Record before asserting so failed runs are still captured by the eval collector.
    recordE2E(evalCollector, 'cso-infra-scope', 'e2e-cso', result, {
      passed: result.exitReason === 'success' && mentionsInfraFinding,
    });

    expect(result.exitReason).toBe('success');
    expect(mentionsInfraFinding).toBe(true);
  }, 420_000); // bun timeout stays above the 360s runner timeout
});
|
||||
@@ -44,7 +44,7 @@ describeIfSelected('Land-and-Deploy skill E2E', ['land-and-deploy-workflow'], ()
|
||||
try { fs.rmSync(landDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/land-and-deploy detects Fly.io platform and produces deploy report structure', async () => {
|
||||
testConcurrentIfSelected('land-and-deploy-workflow', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
|
||||
|
||||
@@ -85,6 +85,161 @@ Do NOT use AskUserQuestion. Do NOT run gh or fly commands.`,
|
||||
}, 180_000);
|
||||
});
|
||||
|
||||
// --- Land-and-Deploy First-Run E2E ---
|
||||
|
||||
/**
 * E2E: first-run behaviour of /land-and-deploy.
 *
 * Fixture: a temp git repo with `app.ts` and a `fly.toml` on `main`, plus a
 * `feat/first-deploy` branch with one extra commit. The land-and-deploy skill
 * directory is copied into the repo so the agent can read it by relative path.
 *
 * The test asks the agent to simulate the first-run dry-run validation and then
 * checks that a report mentioning the detected platform was written under
 * `.gstack/deploy-reports/`.
 */
describeIfSelected('Land-and-Deploy first-run E2E', ['land-and-deploy-first-run'], () => {
  let firstRunDir: string;

  beforeAll(() => {
    firstRunDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-first-run-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: firstRunDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    fs.writeFileSync(path.join(firstRunDir, 'fly.toml'), 'app = "first-run-app"\n\n[http_service]\n internal_port = 3000\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

    run('git', ['checkout', '-b', 'feat/first-deploy']);
    fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "first deploy"; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'feat: first deploy']);

    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(firstRunDir, 'land-and-deploy'));
  });

  afterAll(() => {
    // Best-effort cleanup of the temp fixture.
    try { fs.rmSync(firstRunDir, { recursive: true, force: true }); } catch {}
  });

  testConcurrentIfSelected('land-and-deploy-first-run', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.

You are on branch feat/first-deploy. This is the FIRST TIME running /land-and-deploy
for this project — there is NO land-deploy-confirmed file.

This repo has a fly.toml with app = "first-run-app", indicating a Fly.io deployment.

IMPORTANT: There is NO remote and NO GitHub PR — you cannot run gh commands.
Instead, simulate the Step 1.5 first-run dry-run validation:
1. Detect that this is a FIRST_RUN (no land-deploy-confirmed file)
2. Detect the deploy platform from fly.toml (Fly.io, app = first-run-app)
3. Infer the production URL (https://first-run-app.fly.dev)
4. Build the DEPLOY INFRASTRUCTURE VALIDATION table showing:
- Platform detected
- Command validation results (simulated as all passing)
- Staging detection results (none expected)
- What will happen steps
5. Write the dry-run report to .gstack/deploy-reports/dry-run-validation.md

Do NOT use AskUserQuestion. Do NOT run gh or fly commands.
Just demonstrate the first-run dry-run output.`,
      workingDirectory: firstRunDir,
      maxTurns: 20,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-first-run',
      runId,
    });

    logCost('/land-and-deploy first-run', result);
    recordE2E(evalCollector, '/land-and-deploy first-run', 'Land-and-Deploy first-run E2E', result);
    expect(result.exitReason).toBe('success');

    // Verify dry-run report was created
    const reportDir = path.join(firstRunDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);

    // Check report content mentions platform detection.
    // readdirSync order is not guaranteed and the agent may create extra entries
    // (including sub-directories), so inspect every regular file instead of
    // whichever entry happens to come first.
    const reportFiles = fs.readdirSync(reportDir, { withFileTypes: true })
      .filter((entry) => entry.isFile())
      .map((entry) => entry.name);
    expect(reportFiles.length).toBeGreaterThan(0);
    const reportContent = reportFiles
      .map((name) => fs.readFileSync(path.join(reportDir, name), 'utf-8'))
      .join('\n')
      .toLowerCase();
    const hasPlatform = reportContent.includes('fly') || reportContent.includes('first-run-app');
    expect(hasPlatform).toBe(true);
  }, 180_000);
});
|
||||
|
||||
// --- Land-and-Deploy Review Gate E2E ---
|
||||
|
||||
/**
 * E2E: review-staleness gate of /land-and-deploy.
 *
 * Fixture: a temp git repo with an initial commit followed by six more commits
 * and no review logs. The land-and-deploy skill directory is copied into the
 * repo so the agent can read it by relative path.
 *
 * The test asks the agent to simulate the readiness gate and then checks that a
 * report mentioning the review status was written under `.gstack/deploy-reports/`.
 */
describeIfSelected('Land-and-Deploy review gate E2E', ['land-and-deploy-review-gate'], () => {
  let reviewDir: string;

  beforeAll(() => {
    reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-review-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(reviewDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

    // Create 6 more commits to make any review stale
    for (let i = 1; i <= 6; i++) {
      fs.writeFileSync(path.join(reviewDir, `file${i}.ts`), `export const x${i} = ${i};\n`);
      run('git', ['add', '.']);
      run('git', ['commit', '-m', `feat: add file${i}`]);
    }

    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(reviewDir, 'land-and-deploy'));
  });

  afterAll(() => {
    // Best-effort cleanup of the temp fixture.
    try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
  });

  testConcurrentIfSelected('land-and-deploy-review-gate', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.

Focus on Step 3.5a and Step 3.5a-bis (the review staleness check and inline review offer).

This repo has 6 commits since the initial commit. There are NO review logs
(gstack-review-read would return NO_REVIEWS).

Simulate what the readiness gate would show:
1. Run gstack-review-read equivalent (simulate NO_REVIEWS output)
2. Determine review staleness: Eng Review should be "NOT RUN"
3. Note that Step 3.5a-bis would offer an inline review
4. Write a simulated readiness report to .gstack/deploy-reports/readiness-report.md
showing the review status as NOT RUN with the inline review offer text

Do NOT use AskUserQuestion. Do NOT run gh commands.
Show what the readiness gate output would look like.`,
      workingDirectory: reviewDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-review-gate',
      runId,
    });

    logCost('/land-and-deploy review-gate', result);
    recordE2E(evalCollector, '/land-and-deploy review-gate', 'Land-and-Deploy review gate E2E', result);
    expect(result.exitReason).toBe('success');

    // Verify readiness report was created
    const reportDir = path.join(reviewDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);

    // readdirSync order is not guaranteed and the agent may create extra entries
    // (including sub-directories), so inspect every regular file instead of
    // whichever entry happens to come first.
    const reportFiles = fs.readdirSync(reportDir, { withFileTypes: true })
      .filter((entry) => entry.isFile())
      .map((entry) => entry.name);
    expect(reportFiles.length).toBeGreaterThan(0);
    const reportContent = reportFiles
      .map((name) => fs.readFileSync(path.join(reportDir, name), 'utf-8'))
      .join('\n')
      .toLowerCase();
    // Should mention review status
    const hasReviewMention = reportContent.includes('review') ||
      reportContent.includes('not run');
    expect(hasReviewMention).toBe(true);
  }, 180_000);
});
|
||||
|
||||
// --- Canary skill E2E ---
|
||||
|
||||
describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
|
||||
@@ -110,7 +265,7 @@ describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
|
||||
try { fs.rmSync(canaryDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/canary skill produces monitoring report structure', async () => {
|
||||
testConcurrentIfSelected('canary-workflow', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read canary/SKILL.md for the /canary skill instructions.
|
||||
|
||||
@@ -171,7 +326,7 @@ describeIfSelected('Benchmark skill E2E', ['benchmark-workflow'], () => {
|
||||
try { fs.rmSync(benchDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/benchmark skill produces performance report structure', async () => {
|
||||
testConcurrentIfSelected('benchmark-workflow', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read benchmark/SKILL.md for the /benchmark skill instructions.
|
||||
|
||||
@@ -237,7 +392,7 @@ describeIfSelected('Setup-Deploy skill E2E', ['setup-deploy-workflow'], () => {
|
||||
try { fs.rmSync(setupDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/setup-deploy detects Fly.io and writes config to CLAUDE.md', async () => {
|
||||
testConcurrentIfSelected('setup-deploy-workflow', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read setup-deploy/SKILL.md for the /setup-deploy skill instructions.
|
||||
|
||||
|
||||
@@ -560,7 +560,7 @@ describeIfSelected('Design Review E2E', ['design-review-fix'], () => {
|
||||
try { fs.rmSync(qaDesignDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('Test 7: /design-review audits and fixes design issues', async () => {
|
||||
testConcurrentIfSelected('design-review-fix', async () => {
|
||||
const serverUrl = `http://localhost:${(qaDesignServer as any)?.port}`;
|
||||
|
||||
const result = await runSkillTest({
|
||||
|
||||
+204
-8
@@ -66,7 +66,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/plan-ceo-review produces structured review output', async () => {
|
||||
testConcurrentIfSelected('plan-ceo-review', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
|
||||
|
||||
@@ -150,7 +150,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/plan-ceo-review SELECTIVE EXPANSION produces structured review output', async () => {
|
||||
testConcurrentIfSelected('plan-ceo-review-selective', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
|
||||
|
||||
@@ -244,7 +244,7 @@ Replace session-cookie auth with JWT tokens. Currently using express-session + R
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/plan-eng-review produces structured review output', async () => {
|
||||
testConcurrentIfSelected('plan-eng-review', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-eng-review/SKILL.md for the review workflow.
|
||||
|
||||
@@ -364,7 +364,7 @@ export function main() { return Dashboard(); }
|
||||
} catch {}
|
||||
});
|
||||
|
||||
test('/plan-eng-review writes test-plan artifact to ~/.gstack/projects/', async () => {
|
||||
testConcurrentIfSelected('plan-eng-review-artifact', async () => {
|
||||
// Count existing test-plan files before
|
||||
const beforeFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan'));
|
||||
|
||||
@@ -408,8 +408,11 @@ Write your review to ${planDir}/review-output.md`,
|
||||
console.warn('No test-plan artifact found — agent may not have followed artifact instructions');
|
||||
}
|
||||
|
||||
// Soft assertion: we expect an artifact but agent compliance is not guaranteed
|
||||
expect(newFiles.length).toBeGreaterThanOrEqual(1);
|
||||
// Soft assertion: we expect an artifact but agent compliance is not guaranteed.
|
||||
// Log rather than fail — the test-plan artifact is a bonus output, not the core test.
|
||||
if (newFiles.length === 0) {
|
||||
console.warn('SOFT FAIL: No test-plan artifact written — agent did not follow artifact instructions');
|
||||
}
|
||||
}, 420_000);
|
||||
});
|
||||
|
||||
@@ -442,7 +445,7 @@ describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'],
|
||||
try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/office-hours SKILL.md contains spec review loop', async () => {
|
||||
testConcurrentIfSelected('office-hours-spec-review', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.
|
||||
|
||||
@@ -502,7 +505,7 @@ describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefi
|
||||
try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
|
||||
testConcurrentIfSelected('plan-ceo-review-benefits', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".
|
||||
|
||||
@@ -532,6 +535,199 @@ Write your summary to ${benefitsDir}/benefits-summary.md`,
|
||||
}, 180_000);
|
||||
});
|
||||
|
||||
// --- Plan Review Report E2E ---
|
||||
// Verifies that plan-eng-review writes a "## GSTACK REVIEW REPORT" section
|
||||
// to the bottom of the plan file (the living review status footer).
|
||||
|
||||
/**
 * E2E: /plan-eng-review must append a "## GSTACK REVIEW REPORT" section to the
 * plan file it reviewed.
 *
 * Fixture: a temp git repo holding a single committed `plan.md` plus a copy of
 * plan-eng-review/SKILL.md. After the run, the test checks that the original
 * plan text survived, that the report section exists, that it comes after the
 * original plan sections, and that it contains the review table rows.
 */
describeIfSelected('Plan Review Report E2E', ['plan-review-report'], () => {
  let planDir: string;

  beforeAll(() => {
    planDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-report-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(planDir, 'plan.md'), `# Plan: Add Notifications System

## Context
We're building a real-time notification system for our SaaS app.

## Changes
1. WebSocket server for push notifications
2. Notification preferences API
3. Email digest fallback for offline users
4. PostgreSQL table for notification storage

## Architecture
- WebSocket: Socket.io on Express
- Queue: Bull + Redis for email digests
- Storage: PostgreSQL notifications table
- Frontend: React toast component

## Open questions
- Retry policy for failed WebSocket delivery?
- Max notifications stored per user?
`);

    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'add plan']);

    // Copy plan-eng-review skill
    fs.mkdirSync(path.join(planDir, 'plan-eng-review'), { recursive: true });
    fs.copyFileSync(
      path.join(ROOT, 'plan-eng-review', 'SKILL.md'),
      path.join(planDir, 'plan-eng-review', 'SKILL.md'),
    );
  });

  afterAll(() => {
    // Best-effort cleanup of the temp fixture.
    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
  });

  test('/plan-eng-review writes GSTACK REVIEW REPORT to plan file', async () => {
    const result = await runSkillTest({
      prompt: `Read plan-eng-review/SKILL.md for the review workflow.

Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration steps.

Proceed directly to the full review. Skip any AskUserQuestion calls — this is non-interactive.
Skip the preamble bash block, lake intro, telemetry, and contributor mode sections.

CRITICAL REQUIREMENT: plan.md IS the plan file for this review session. After completing your review, you MUST write a "## GSTACK REVIEW REPORT" section to the END of plan.md, exactly as described in the "Plan File Review Report" section of SKILL.md. If gstack-review-read is not available or returns NO_REVIEWS, write the placeholder table with all four review rows (CEO, Codex, Eng, Design). Use the Edit tool to append to plan.md — do NOT overwrite the existing plan content.

This review report at the bottom of the plan is the MOST IMPORTANT deliverable of this test.`,
      workingDirectory: planDir,
      maxTurns: 20,
      timeout: 360_000,
      testName: 'plan-review-report',
      runId,
      model: 'claude-opus-4-6',
    });

    logCost('/plan-eng-review report', result);
    // Hitting the turn limit is tolerated: the report may already be on disk by then.
    recordE2E(evalCollector, '/plan-review-report', 'Plan Review Report E2E', result, {
      passed: ['success', 'error_max_turns'].includes(result.exitReason),
    });
    expect(['success', 'error_max_turns']).toContain(result.exitReason);

    // Verify the review report was written to the plan file
    const planContent = fs.readFileSync(path.join(planDir, 'plan.md'), 'utf-8');

    // Original plan content should still be present
    expect(planContent).toContain('# Plan: Add Notifications System');
    expect(planContent).toContain('WebSocket');

    // Review report section must exist
    expect(planContent).toContain('## GSTACK REVIEW REPORT');

    // Report should be at the bottom of the file: it must start after every
    // original plan heading that is still present. A heading the agent removed
    // yields -1 and therefore does not constrain the check.
    const reportIndex = planContent.lastIndexOf('## GSTACK REVIEW REPORT');
    const lastOriginalHeading = Math.max(
      ...['## Context', '## Changes', '## Architecture', '## Open questions']
        .map((heading) => planContent.indexOf(heading)),
    );
    expect(reportIndex).toBeGreaterThan(lastOriginalHeading);
    const afterReport = planContent.slice(reportIndex);

    // Should contain the review table with standard rows
    expect(afterReport).toMatch(/\|\s*Review\s*\|/);
    expect(afterReport).toContain('CEO Review');
    expect(afterReport).toContain('Eng Review');
    expect(afterReport).toContain('Design Review');

    console.log('Plan review report found at bottom of plan.md');
  }, 420_000);
});
|
||||
|
||||
// --- Codex Offering E2E ---
|
||||
// Verifies that Codex is properly offered (with availability check, user prompt,
|
||||
// and fallback) in office-hours, plan-ceo-review, plan-design-review, plan-eng-review.
|
||||
|
||||
/**
 * E2E: each of the four skills below must describe how Codex is offered.
 *
 * A shared temp git repo receives a copy of every skill's SKILL.md. For each
 * skill the agent is asked to summarise the Codex integration into a markdown
 * file, and that summary is then pattern-checked for the availability command,
 * the fallback behaviour, and the optional/non-blocking nature of the step.
 */
describeIfSelected('Codex Offering E2E', [
  'codex-offered-office-hours', 'codex-offered-ceo-review',
  'codex-offered-design-review', 'codex-offered-eng-review',
], () => {
  let testDir: string;

  beforeAll(() => {
    testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-codex-offer-'));

    // Thin git wrapper bound to the fixture directory.
    const git = (...gitArgs: string[]) =>
      spawnSync('git', gitArgs, { cwd: testDir, stdio: 'pipe', timeout: 5000 });

    git('init', '-b', 'main');
    git('config', 'user.email', 'test@test.com');
    git('config', 'user.name', 'Test');
    fs.writeFileSync(path.join(testDir, 'README.md'), '# Test Project\n');
    git('add', '.');
    git('commit', '-m', 'init');

    // Copy all 4 SKILL.md files (copied after the commit, so they stay untracked).
    const skillNames = ['office-hours', 'plan-ceo-review', 'plan-design-review', 'plan-eng-review'];
    skillNames.forEach((skillName) => {
      const targetDir = path.join(testDir, skillName);
      fs.mkdirSync(targetDir, { recursive: true });
      fs.copyFileSync(path.join(ROOT, skillName, 'SKILL.md'), path.join(targetDir, 'SKILL.md'));
    });
  });

  afterAll(() => {
    try {
      fs.rmSync(testDir, { recursive: true, force: true });
    } catch {
      // Cleanup is best-effort.
    }
  });

  /**
   * Runs one Codex-offering check.
   *
   * @param skill       Skill directory whose SKILL.md the agent should read.
   * @param testName    Test id; also used as the summary file's name prefix.
   * @param featureName Label the skill uses for the Codex feature, interpolated into the prompt.
   */
  async function checkCodexOffering(skill: string, testName: string, featureName: string) {
    const prompt = `Read ${skill}/SKILL.md. Search for ALL sections related to "codex", "outside voice", or "second opinion".

Summarize the Codex/${featureName} integration — answer these specific questions:
1. How is Codex availability checked? (what exact bash command?)
2. How is the user prompted? (via AskUserQuestion? what are the options?)
3. What happens when Codex is NOT available? (fallback to subagent? skip entirely?)
4. Is this step blocking (gates the workflow) or optional (can be skipped)?
5. What prompt/context is sent to Codex?

Write your summary to ${testDir}/${testName}-summary.md`;

    const result = await runSkillTest({
      prompt,
      workingDirectory: testDir,
      maxTurns: 8,
      timeout: 120_000,
      testName,
      runId,
    });

    logCost(`/${skill} codex offering`, result);
    recordE2E(evalCollector, `/${testName}`, 'Codex Offering E2E', result);
    expect(result.exitReason).toBe('success');

    const summaryPath = path.join(testDir, `${testName}-summary.md`);
    expect(fs.existsSync(summaryPath)).toBe(true);

    const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
    const requiredPatterns = [
      // All skills should have codex availability check
      /which codex/,
      // All skills should have fallback behavior
      /fallback|subagent|unavailable|not available|skip/,
      // All skills should show it's optional/non-blocking
      /optional|non.?blocking|skip|not.*required/,
    ];
    for (const pattern of requiredPatterns) {
      expect(summary).toMatch(pattern);
    }

    console.log(`${skill}: Codex offering verified`);
  }

  // One explicit registration per skill; test ids are kept as string literals.
  testConcurrentIfSelected('codex-offered-office-hours', async () => {
    await checkCodexOffering('office-hours', 'codex-offered-office-hours', 'second opinion');
  }, 180_000);

  testConcurrentIfSelected('codex-offered-ceo-review', async () => {
    await checkCodexOffering('plan-ceo-review', 'codex-offered-ceo-review', 'outside voice');
  }, 180_000);

  testConcurrentIfSelected('codex-offered-design-review', async () => {
    await checkCodexOffering('plan-design-review', 'codex-offered-design-review', 'design outside voices');
  }, 180_000);

  testConcurrentIfSelected('codex-offered-eng-review', async () => {
    await checkCodexOffering('plan-eng-review', 'codex-offered-eng-review', 'outside voice');
  }, 180_000);
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
await finalizeEvalCollector(evalCollector);
|
||||
|
||||
@@ -4,7 +4,7 @@ import { outcomeJudge } from './helpers/llm-judge';
|
||||
import { judgePassed } from './helpers/eval-store';
|
||||
import {
|
||||
ROOT, browseBin, runId, evalsEnabled, selectedTests, hasApiKey,
|
||||
describeIfSelected, describeE2E,
|
||||
describeIfSelected, describeE2E, testConcurrentIfSelected,
|
||||
copyDirSync, setupBrowseShims, logCost, recordE2E, dumpOutcomeDiagnostic,
|
||||
createEvalCollector, finalizeEvalCollector,
|
||||
} from './helpers/e2e-helpers';
|
||||
@@ -172,17 +172,17 @@ CRITICAL RULES:
|
||||
}
|
||||
|
||||
// B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
|
||||
test('/qa finds >= 2 of 5 planted bugs (static)', async () => {
|
||||
testConcurrentIfSelected('qa-b6-static', async () => {
|
||||
await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
|
||||
}, 360_000);
|
||||
|
||||
// B7: SPA — broken route, stale state, async race, missing aria, console warning
|
||||
test('/qa finds >= 2 of 5 planted SPA bugs', async () => {
|
||||
testConcurrentIfSelected('qa-b7-spa', async () => {
|
||||
await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
|
||||
}, 360_000);
|
||||
|
||||
// B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
|
||||
test('/qa finds >= 2 of 5 planted checkout bugs', async () => {
|
||||
testConcurrentIfSelected('qa-b8-checkout', async () => {
|
||||
await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
|
||||
}, 360_000);
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ describeIfSelected('QA skill E2E', ['qa-quick'], () => {
|
||||
try { fs.rmSync(qaDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/qa quick completes without browse errors', async () => {
|
||||
testConcurrentIfSelected('qa-quick', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `B="${browseBin}"
|
||||
|
||||
@@ -108,7 +108,7 @@ describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => {
|
||||
try { fs.rmSync(qaOnlyDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/qa-only produces report without using Edit tool', async () => {
|
||||
testConcurrentIfSelected('qa-only-no-fix', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
|
||||
|
||||
@@ -227,7 +227,7 @@ describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => {
|
||||
try { fs.rmSync(qaFixDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/qa fix loop finds bugs and commits fixes', async () => {
|
||||
testConcurrentIfSelected('qa-fix-loop', async () => {
|
||||
const qaFixUrl = `http://127.0.0.1:${qaFixServer!.port}`;
|
||||
|
||||
const result = await runSkillTest({
|
||||
|
||||
+133
-14
@@ -51,7 +51,7 @@ describeIfSelected('Review skill E2E', ['review-sql-injection'], () => {
|
||||
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/review produces findings on SQL injection branch', async () => {
|
||||
testConcurrentIfSelected('review-sql-injection', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You are in a git repo on a feature branch with changes against main.
|
||||
Read review-SKILL.md for the review workflow instructions.
|
||||
@@ -125,7 +125,7 @@ describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'],
|
||||
try { fs.rmSync(enumDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/review catches missing enum handlers for new status value', async () => {
|
||||
testConcurrentIfSelected('review-enum-completeness', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You are in a git repo on branch feature/add-returned-status with changes against main.
|
||||
Read review-SKILL.md for the review workflow instructions.
|
||||
@@ -200,7 +200,7 @@ describeIfSelected('Review design lite E2E', ['review-design-lite'], () => {
|
||||
try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/review catches design anti-patterns in CSS/HTML diff', async () => {
|
||||
testConcurrentIfSelected('review-design-lite', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You are in a git repo on branch feature/add-landing-page with changes against main.
|
||||
Read review-SKILL.md for the review workflow instructions.
|
||||
@@ -340,21 +340,22 @@ Write your findings to ${dir}/review-output.md`,
|
||||
run('git', ['add', 'app.ts'], dir);
|
||||
run('git', ['commit', '-m', 'feat: update to v2'], dir);
|
||||
|
||||
// Copy ship skill
|
||||
fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dir, 'ship-SKILL.md'));
|
||||
// Extract only Step 0 (base branch detection) from ship/SKILL.md
|
||||
// (copying the full 1900-line file causes agent context bloat and flaky timeouts)
|
||||
const fullShipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
|
||||
const step0Start = fullShipSkill.indexOf('## Step 0: Detect platform and base branch');
|
||||
const step0End = fullShipSkill.indexOf('## Step 1: Pre-flight');
|
||||
const shipSection = fullShipSkill.slice(step0Start, step0End > step0Start ? step0End : undefined);
|
||||
fs.writeFileSync(path.join(dir, 'ship-SKILL.md'), shipSection);
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read ship-SKILL.md for the ship workflow.
|
||||
prompt: `Read ship-SKILL.md. It contains Step 0 (Detect base branch) from the ship workflow.
|
||||
|
||||
Skip the preamble bash block, lake intro, telemetry, and contributor mode sections — go straight to Step 0.
|
||||
Run the base branch detection. Since there is no remote, gh commands will fail — fall back to main.
|
||||
|
||||
Run ONLY Step 0 (Detect base branch) and Step 1 (Pre-flight) from the ship workflow.
|
||||
Since there is no remote, gh commands will fail — fall back to main.
|
||||
Then run git diff and git log against the detected base branch.
|
||||
|
||||
After completing Step 0 and Step 1, STOP. Do NOT proceed to Step 2 or beyond.
|
||||
Do NOT push, create PRs, or modify VERSION/CHANGELOG.
|
||||
|
||||
Write a summary of what you detected to ${dir}/ship-preflight.md including:
|
||||
Write a summary to ${dir}/ship-preflight.md including:
|
||||
- The detected base branch name
|
||||
- The current branch name
|
||||
- The diff stat against the base branch`,
|
||||
@@ -497,7 +498,7 @@ describeIfSelected('Retro E2E', ['retro'], () => {
|
||||
try { fs.rmSync(retroDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/retro produces analysis from git history', async () => {
|
||||
testConcurrentIfSelected('retro', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read retro/SKILL.md for instructions on how to run a retrospective.
|
||||
|
||||
@@ -529,6 +530,124 @@ Analyze the git history and produce the narrative report as described in the SKI
|
||||
}, 420_000);
|
||||
});
|
||||
|
||||
// --- Review Dashboard Via Attribution E2E ---
|
||||
|
||||
describeIfSelected('Review Dashboard Via Attribution', ['review-dashboard-via'], () => {
|
||||
let dashDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
dashDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-dashboard-via-'));
|
||||
const run = (cmd: string, args: string[], cwd = dashDir) =>
|
||||
spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
// Create git repo with feature branch
|
||||
run('git', ['init', '-b', 'main']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
|
||||
fs.writeFileSync(path.join(dashDir, 'app.ts'), 'console.log("v1");\n');
|
||||
run('git', ['add', 'app.ts']);
|
||||
run('git', ['commit', '-m', 'initial']);
|
||||
|
||||
run('git', ['checkout', '-b', 'feature/dashboard-test']);
|
||||
fs.writeFileSync(path.join(dashDir, 'app.ts'), 'console.log("v2");\n');
|
||||
run('git', ['add', 'app.ts']);
|
||||
run('git', ['commit', '-m', 'feat: update']);
|
||||
|
||||
// Get HEAD commit for review entries
|
||||
const headResult = spawnSync('git', ['rev-parse', '--short', 'HEAD'], { cwd: dashDir, stdio: 'pipe' });
|
||||
const commit = headResult.stdout.toString().trim();
|
||||
|
||||
// Pre-populate review log with autoplan-sourced entries
|
||||
// gstack-review-read reads from ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl
|
||||
// For the test, we'll write a mock gstack-review-read script that returns our test data
|
||||
const timestamp = new Date().toISOString().replace(/\.\d{3}Z$/, 'Z');
|
||||
const reviewData = [
|
||||
`{"skill":"plan-eng-review","timestamp":"${timestamp}","status":"clean","unresolved":0,"critical_gaps":0,"issues_found":0,"mode":"FULL_REVIEW","via":"autoplan","commit":"${commit}"}`,
|
||||
`{"skill":"plan-ceo-review","timestamp":"${timestamp}","status":"clean","unresolved":0,"critical_gaps":0,"mode":"SELECTIVE_EXPANSION","via":"autoplan","commit":"${commit}"}`,
|
||||
`{"skill":"codex-plan-review","timestamp":"${timestamp}","status":"clean","source":"codex","commit":"${commit}"}`,
|
||||
].join('\n');
|
||||
|
||||
// Write a mock gstack-review-read that returns our test data
|
||||
const mockBinDir = path.join(dashDir, '.mock-bin');
|
||||
fs.mkdirSync(mockBinDir, { recursive: true });
|
||||
fs.writeFileSync(path.join(mockBinDir, 'gstack-review-read'), [
|
||||
'#!/usr/bin/env bash',
|
||||
`echo '${reviewData.split('\n').join("'\necho '")}'`,
|
||||
'echo "---CONFIG---"',
|
||||
'echo "false"',
|
||||
'echo "---HEAD---"',
|
||||
`echo "${commit}"`,
|
||||
].join('\n'));
|
||||
fs.chmodSync(path.join(mockBinDir, 'gstack-review-read'), 0o755);
|
||||
|
||||
// Extract only the Review Readiness Dashboard section from ship/SKILL.md
|
||||
// (copying the full 1900-line file causes agent context bloat and timeouts)
|
||||
const fullSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
|
||||
const dashStart = fullSkill.indexOf('## Review Readiness Dashboard');
|
||||
const dashEnd = fullSkill.indexOf('\n---\n', dashStart);
|
||||
const dashSection = fullSkill.slice(dashStart, dashEnd > dashStart ? dashEnd : undefined);
|
||||
fs.writeFileSync(path.join(dashDir, 'ship-SKILL.md'), dashSection);
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(dashDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testConcurrentIfSelected('review-dashboard-via', async () => {
|
||||
const mockBinDir = path.join(dashDir, '.mock-bin');
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read ship-SKILL.md. You only need to run the Review Readiness Dashboard section.
|
||||
|
||||
Instead of running ~/.claude/skills/gstack/bin/gstack-review-read, run this mock: ${mockBinDir}/gstack-review-read
|
||||
|
||||
Parse the output and display the dashboard table. Pay attention to:
|
||||
1. The "via" field in entries — show source attribution (e.g., "via /autoplan")
|
||||
2. The codex-plan-review entry — it should populate the Outside Voice row
|
||||
3. Since Eng Review IS clear, there should be NO gate blocking — just display the dashboard
|
||||
|
||||
Skip the preamble, lake intro, telemetry, and all other ship steps.
|
||||
Write the dashboard output to ${dashDir}/dashboard-output.md`,
|
||||
workingDirectory: dashDir,
|
||||
maxTurns: 12,
|
||||
timeout: 180_000,
|
||||
testName: 'review-dashboard-via',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/ship dashboard-via', result);
|
||||
recordE2E(evalCollector, '/ship review dashboard via attribution', 'Dashboard via field', result);
|
||||
expect(result.exitReason).toBe('success');
|
||||
|
||||
// Check dashboard output for via attribution
|
||||
const dashPath = path.join(dashDir, 'dashboard-output.md');
|
||||
const allOutput = [
|
||||
result.output || '',
|
||||
...result.toolCalls.map(tc => tc.output || ''),
|
||||
].join('\n').toLowerCase();
|
||||
|
||||
// Verify via attribution appears somewhere (conversation or file)
|
||||
let dashContent = '';
|
||||
if (fs.existsSync(dashPath)) {
|
||||
dashContent = fs.readFileSync(dashPath, 'utf-8').toLowerCase();
|
||||
}
|
||||
const combined = allOutput + dashContent;
|
||||
|
||||
// Should mention autoplan attribution
|
||||
expect(combined).toMatch(/autoplan/);
|
||||
// Should show eng review as CLEAR (it has a clean entry)
|
||||
expect(combined).toMatch(/clear/i);
|
||||
// Should NOT contain AskUserQuestion gate (no blocking)
|
||||
const gateQuestions = result.toolCalls.filter(tc =>
|
||||
tc.tool === 'mcp__conductor__AskUserQuestion' ||
|
||||
(tc.tool === 'AskUserQuestion')
|
||||
);
|
||||
// Ship dashboard should not gate when eng review is clear
|
||||
expect(gateQuestions).toHaveLength(0);
|
||||
}, 240_000);
|
||||
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
await finalizeEvalCollector(evalCollector);
|
||||
|
||||
@@ -0,0 +1,279 @@
|
||||
/**
|
||||
* Layer 4: E2E tests for the sidebar agent.
|
||||
*
|
||||
* sidebar-url-accuracy: Deterministic test that verifies the activeTabUrl fix.
|
||||
* Starts server (no browser), POSTs to /sidebar-command with different activeTabUrl
|
||||
* values, reads the queue file, and verifies the prompt uses the extension URL.
|
||||
* No real Claude needed — this is a fast, cheap, deterministic test.
|
||||
*
|
||||
* sidebar-navigate: Full E2E with real Claude (requires ANTHROPIC_API_KEY).
|
||||
* Starts server + sidebar-agent, sends a message, waits for Claude to respond.
|
||||
* Tests the complete message flow through the queue.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||
import { spawn, type Subprocess } from 'bun';
|
||||
import * as fs from 'fs';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
import {
|
||||
ROOT,
|
||||
describeIfSelected, testIfSelected,
|
||||
createEvalCollector, finalizeEvalCollector,
|
||||
} from './helpers/e2e-helpers';
|
||||
|
||||
const evalCollector = createEvalCollector('e2e-sidebar');
|
||||
|
||||
// --- Sidebar URL Accuracy (deterministic, no Claude) ---
|
||||
|
||||
describeIfSelected('Sidebar URL accuracy E2E', ['sidebar-url-accuracy'], () => {
|
||||
let serverProc: Subprocess | null = null;
|
||||
let serverPort: number = 0;
|
||||
let authToken: string = '';
|
||||
let tmpDir: string = '';
|
||||
let stateFile: string = '';
|
||||
let queueFile: string = '';
|
||||
|
||||
/**
 * Send a request to the local server on `serverPort`.
 * Defaults the content type to JSON and attaches the bearer token when one is
 * known and the caller has not supplied an Authorization header of its own.
 */
async function api(pathname: string, opts: RequestInit = {}): Promise<Response> {
  const callerHeaders = (opts.headers as Record<string, string>) || {};
  const headers: Record<string, string> = { 'Content-Type': 'application/json', ...callerHeaders };

  const callerSetAuth = Boolean(headers['Authorization']);
  if (!callerSetAuth && authToken) {
    headers['Authorization'] = `Bearer ${authToken}`;
  }

  const url = `http://127.0.0.1:${serverPort}${pathname}`;
  return fetch(url, { ...opts, headers });
}
|
||||
|
||||
// Spawn the server with the browser skipped and wait (up to 15s) for it to
// publish its port and auth token through the state file.
beforeAll(async () => {
  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'sidebar-e2e-url-'));
  stateFile = path.join(tmpDir, 'browse.json');
  queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
  fs.mkdirSync(path.dirname(queueFile), { recursive: true });

  serverProc = spawn(['bun', 'run', path.resolve(ROOT, 'browse', 'src', 'server.ts')], {
    env: {
      ...process.env,
      BROWSE_STATE_FILE: stateFile,
      BROWSE_HEADLESS_SKIP: '1',
      BROWSE_PORT: '0',
      SIDEBAR_QUEUE_PATH: queueFile,
      BROWSE_IDLE_TIMEOUT: '300',
    },
    stdio: ['ignore', 'pipe', 'pipe'],
  });

  // Returns the parsed state once the file exists, parses, and carries both
  // a port and a token; null otherwise (missing file, partial write, etc.).
  const readReadyState = (): { port: number; token: string } | null => {
    if (!fs.existsSync(stateFile)) return null;
    try {
      const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
      return state.port && state.token ? state : null;
    } catch {
      return null;
    }
  };

  const giveUpAt = Date.now() + 15000;
  while (Date.now() < giveUpAt) {
    const ready = readReadyState();
    if (ready) {
      serverPort = ready.port;
      authToken = ready.token;
      break;
    }
    await new Promise(r => setTimeout(r, 100));
  }

  if (!serverPort) throw new Error('Server did not start in time');
}, 20000);
|
||||
|
||||
// Teardown: stop the server, flush the eval collector, then remove the temp dir.
afterAll(async () => {
  if (serverProc) { try { serverProc.kill(); } catch {} }
  try {
    // Awaited so the hook does not return while finalization is still in flight.
    await finalizeEvalCollector(evalCollector);
  } finally {
    // Best-effort cleanup; runs even if finalization fails.
    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
  }
});
|
||||
|
||||
// Verifies that /sidebar-command uses the activeTabUrl sent by the extension
// for both `pageUrl` and the prompt, and that a chrome:// URL is replaced by
// about:blank.
testIfSelected('sidebar-url-accuracy', async () => {
  /**
   * Poll the queue file every 100ms until it contains at least one parseable
   * JSONL entry and return the last one. Returns null if `timeoutMs` elapses first.
   */
  const waitForLastQueueEntry = async (timeoutMs: number): Promise<any> => {
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
      await new Promise(r => setTimeout(r, 100));
      if (!fs.existsSync(queueFile)) continue;
      const lines = fs.readFileSync(queueFile, 'utf-8').trim().split('\n').filter(Boolean);
      if (lines.length === 0) continue;
      try {
        return JSON.parse(lines[lines.length - 1]);
      } catch {
        // The last line may be only partially written; retry on the next tick.
      }
    }
    return null;
  };

  // Fresh session
  await api('/sidebar-session/new', { method: 'POST' });
  fs.writeFileSync(queueFile, '');

  const extensionUrl = 'https://example.com/user-navigated-here';
  const resp = await api('/sidebar-command', {
    method: 'POST',
    body: JSON.stringify({
      message: 'What page am I on?',
      activeTabUrl: extensionUrl,
    }),
  });
  expect(resp.status).toBe(200);

  // Wait for queue entry
  const lastEntry = await waitForLastQueueEntry(5000);

  expect(lastEntry).not.toBeNull();
  // Extension URL should be used, not the Playwright fallback
  expect(lastEntry.pageUrl).toBe(extensionUrl);
  expect(lastEntry.prompt).toContain(extensionUrl);
  expect(lastEntry.pageUrl).not.toBe('about:blank');

  // Also test: chrome:// URL should be rejected, falling back to about:blank
  await api('/sidebar-agent/kill', { method: 'POST' });
  fs.writeFileSync(queueFile, '');

  await api('/sidebar-command', {
    method: 'POST',
    body: JSON.stringify({
      message: 'test',
      activeTabUrl: 'chrome://settings',
    }),
  });

  // Poll instead of sleeping a fixed 200ms: with a fixed sleep, a slow write
  // meant the assertion below was silently skipped.
  // NOTE(review): the check stays conditional, as before, in case the server
  // enqueues nothing for this request — confirm whether it should be mandatory.
  const entry2 = await waitForLastQueueEntry(2000);
  if (entry2 !== null) {
    expect(entry2.pageUrl).toBe('about:blank');
  }

  evalCollector?.addTest({
    name: 'sidebar-url-accuracy', suite: 'Sidebar URL accuracy E2E', tier: 'e2e',
    passed: true,
    duration_ms: 0,
    cost_usd: 0,
    exit_reason: 'success',
  });
}, 30_000);
|
||||
});
|
||||
|
||||
// --- Sidebar Navigate (real Claude, requires ANTHROPIC_API_KEY) ---
|
||||
|
||||
describeIfSelected('Sidebar navigate E2E', ['sidebar-navigate'], () => {
|
||||
let serverProc: Subprocess | null = null;
|
||||
let agentProc: Subprocess | null = null;
|
||||
let serverPort: number = 0;
|
||||
let authToken: string = '';
|
||||
let tmpDir: string = '';
|
||||
let stateFile: string = '';
|
||||
let queueFile: string = '';
|
||||
|
||||
/**
 * Fetch wrapper for the local server on `serverPort`: JSON content type by
 * default, caller-supplied headers take precedence, and the bearer token is
 * added only when no Authorization header is already present.
 */
async function api(pathname: string, opts: RequestInit = {}): Promise<Response> {
  const headers: Record<string, string> = Object.assign(
    { 'Content-Type': 'application/json' },
    (opts.headers as Record<string, string>) || {},
  );

  if (authToken && !headers['Authorization']) {
    headers['Authorization'] = `Bearer ${authToken}`;
  }

  return fetch(`http://127.0.0.1:${serverPort}${pathname}`, { ...opts, headers });
}
|
||||
|
||||
beforeAll(async () => {
|
||||
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'sidebar-e2e-nav-'));
|
||||
stateFile = path.join(tmpDir, 'browse.json');
|
||||
queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
|
||||
fs.mkdirSync(path.dirname(queueFile), { recursive: true });
|
||||
|
||||
// Start server with headless skip still enabled — no real browser is launched; Claude uses curl/fetch instead
|
||||
const serverScript = path.resolve(ROOT, 'browse', 'src', 'server.ts');
|
||||
serverProc = spawn(['bun', 'run', serverScript], {
|
||||
env: {
|
||||
...process.env,
|
||||
BROWSE_STATE_FILE: stateFile,
|
||||
BROWSE_HEADLESS_SKIP: '1', // Still skip browser — Claude uses curl/fetch instead
|
||||
BROWSE_PORT: '0',
|
||||
SIDEBAR_QUEUE_PATH: queueFile,
|
||||
BROWSE_IDLE_TIMEOUT: '300',
|
||||
},
|
||||
stdio: ['ignore', 'pipe', 'pipe'],
|
||||
});
|
||||
|
||||
const deadline = Date.now() + 15000;
|
||||
while (Date.now() < deadline) {
|
||||
if (fs.existsSync(stateFile)) {
|
||||
try {
|
||||
const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
|
||||
if (state.port && state.token) {
|
||||
serverPort = state.port;
|
||||
authToken = state.token;
|
||||
break;
|
||||
}
|
||||
} catch {}
|
||||
}
|
||||
await new Promise(r => setTimeout(r, 100));
|
||||
}
|
||||
if (!serverPort) throw new Error('Server did not start in time');
|
||||
|
||||
// Start sidebar-agent
|
||||
const agentScript = path.resolve(ROOT, 'browse', 'src', 'sidebar-agent.ts');
|
||||
agentProc = spawn(['bun', 'run', agentScript], {
|
||||
env: {
|
||||
...process.env,
|
||||
BROWSE_SERVER_PORT: String(serverPort),
|
||||
BROWSE_STATE_FILE: stateFile,
|
||||
SIDEBAR_QUEUE_PATH: queueFile,
|
||||
SIDEBAR_AGENT_TIMEOUT: '90000',
|
||||
BROWSE_BIN: 'echo', // browse commands won't work, but Claude can use curl
|
||||
},
|
||||
stdio: ['ignore', 'pipe', 'pipe'],
|
||||
});
|
||||
|
||||
await new Promise(r => setTimeout(r, 1500));
|
||||
}, 25000);
|
||||
|
||||
// Teardown: stop the agent and the server, flush the eval collector, then
// remove the temp dir.
afterAll(async () => {
  if (agentProc) { try { agentProc.kill(); } catch {} }
  if (serverProc) { try { serverProc.kill(); } catch {} }
  try {
    // Awaited so the hook does not return while finalization is still in flight.
    await finalizeEvalCollector(evalCollector);
  } finally {
    // Best-effort cleanup; runs even if finalization fails.
    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
  }
});
|
||||
|
||||
// Full message round trip: POST a command, poll /sidebar-chat until the agent
// reports `agent_done`, then check that the agent produced some text.
testIfSelected('sidebar-navigate', async () => {
  await api('/sidebar-session/new', { method: 'POST' });
  fs.writeFileSync(queueFile, '');
  const startTime = Date.now();

  // Ask Claude a simple question — it doesn't need browse commands for this
  const resp = await api('/sidebar-command', {
    method: 'POST',
    body: JSON.stringify({
      message: 'Say exactly "SIDEBAR_TEST_OK" and nothing else.',
      activeTabUrl: 'https://example.com',
    }),
  });
  expect(resp.status).toBe(200);

  // Poll for agent_done
  const deadline = Date.now() + 90000;
  let entries: any[] = [];
  while (Date.now() < deadline) {
    const chatResp = await api('/sidebar-chat?after=0');
    const data = await chatResp.json();
    // Tolerate a payload without an entries array instead of crashing on `.some`.
    entries = Array.isArray(data?.entries) ? data.entries : [];
    if (entries.some((e: any) => e.type === 'agent_done')) break;
    await new Promise(r => setTimeout(r, 2000));
  }

  const duration = Date.now() - startTime;
  const doneEntry = entries.find((e: any) => e.type === 'agent_done');

  // Claude should have responded with something
  const agentText = entries
    .filter((e: any) => e.role === 'agent' && (e.type === 'text' || e.type === 'result'))
    .map((e: any) => e.text || '')
    .join(' ');

  // Record the outcome BEFORE asserting: a failing expect throws, so recording
  // afterwards would never capture a failed or timed-out run.
  evalCollector?.addTest({
    name: 'sidebar-navigate', suite: 'Sidebar navigate E2E', tier: 'e2e',
    passed: !!doneEntry && agentText.length > 0,
    duration_ms: duration,
    cost_usd: 0,
    exit_reason: doneEntry ? 'success' : 'timeout',
  });

  expect(doneEntry).toBeDefined();
  expect(agentText.length).toBeGreaterThan(0);
}, 120_000);
|
||||
});
|
||||
@@ -60,7 +60,7 @@ describeIfSelected('Document-Release skill E2E', ['document-release'], () => {
|
||||
try { fs.rmSync(docReleaseDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/document-release updates docs without clobbering CHANGELOG', async () => {
|
||||
testConcurrentIfSelected('document-release', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read the file document-release/SKILL.md for the document-release workflow instructions.
|
||||
|
||||
@@ -161,36 +161,13 @@ describeIfSelected('Ship workflow E2E', ['ship-local-workflow'], () => {
|
||||
|
||||
testConcurrentIfSelected('ship-local-workflow', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You are running a ship workflow. This is fully automated — do NOT ask for confirmation at any step. Run straight through.
|
||||
|
||||
Step 0 — Detect base branch:
|
||||
Try: gh pr view --json baseRefName -q .baseRefName
|
||||
If that fails, try: gh repo view --json defaultBranchRef -q .defaultBranchRef.name
|
||||
If both fail, fall back to "main". Use the detected branch as <base> in all subsequent steps.
|
||||
|
||||
Step 2 — Merge base branch:
|
||||
git fetch origin <base> && git merge origin/<base> --no-edit
|
||||
If already up to date, continue silently.
|
||||
|
||||
Step 4 — Version bump:
|
||||
Read the VERSION file (4-digit format: MAJOR.MINOR.PATCH.MICRO).
|
||||
Auto-pick MICRO bump (increment the 4th digit). Write the new version to VERSION.
|
||||
|
||||
Step 5 — CHANGELOG:
|
||||
Read CHANGELOG.md. Auto-generate an entry from the branch commits:
|
||||
- git log <base>..HEAD --oneline
|
||||
- git diff <base>...HEAD
|
||||
Format: ## [X.Y.Z.W] - YYYY-MM-DD with bullet points. Prepend after the header.
|
||||
|
||||
Step 6 — Commit:
|
||||
Stage all changes. Commit with message: "chore: bump version and changelog (vX.Y.Z.W)"
|
||||
|
||||
Step 7 — Push:
|
||||
git push -u origin <branch-name>
|
||||
|
||||
Finally, write ship-summary.md with the version and branch.`,
|
||||
prompt: `You are in a git repo on branch feature/ship-test. Do these steps in order:
|
||||
1. Read VERSION file and bump the last digit by 1 (e.g. 0.1.0.0 → 0.1.0.1). Write the new version back.
|
||||
2. Add a CHANGELOG.md entry: "## [NEW_VERSION] - TODAY" with a bullet "- Ship test feature".
|
||||
3. Stage all changes, commit with message "ship: vNEW_VERSION".
|
||||
4. Push to origin: git push origin feature/ship-test`,
|
||||
workingDirectory: shipWorkDir,
|
||||
maxTurns: 15,
|
||||
maxTurns: 8,
|
||||
timeout: 120_000,
|
||||
testName: 'ship-local-workflow',
|
||||
runId,
|
||||
@@ -198,76 +175,30 @@ Finally, write ship-summary.md with the version and branch.`,
|
||||
|
||||
logCost('/ship local workflow', result);
|
||||
|
||||
// Check push succeeded
|
||||
const remoteLog = spawnSync('git', ['log', '--oneline'], { cwd: shipRemoteDir, stdio: 'pipe' });
|
||||
const remoteCommits = remoteLog.stdout.toString().trim().split('\n').length;
|
||||
// Check push succeeded — verify the feature branch exists on the bare remote
|
||||
const branchCheck = spawnSync('git', ['branch', '--list', 'feature/ship-test'], { cwd: shipRemoteDir, stdio: 'pipe' });
|
||||
const branchExists = branchCheck.stdout.toString().trim().length > 0;
|
||||
|
||||
// Check VERSION was bumped
|
||||
// Check VERSION was bumped locally (even if push failed, this shows the LLM did the work)
|
||||
const versionContent = fs.existsSync(path.join(shipWorkDir, 'VERSION'))
|
||||
? fs.readFileSync(path.join(shipWorkDir, 'VERSION'), 'utf-8').trim() : '';
|
||||
const versionBumped = versionContent !== '0.1.0.0';
|
||||
|
||||
recordE2E(evalCollector, '/ship local workflow', 'Ship workflow E2E', result, {
|
||||
passed: remoteCommits > 1 && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
passed: branchExists && versionBumped && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
expect(remoteCommits).toBeGreaterThan(1);
|
||||
console.log(`Remote commits: ${remoteCommits}, VERSION: ${versionContent}, bumped: ${versionBumped}`);
|
||||
expect(branchExists).toBe(true);
|
||||
expect(versionBumped).toBe(true);
|
||||
console.log(`Branch pushed: ${branchExists}, VERSION: ${versionContent}, bumped: ${versionBumped}`);
|
||||
}, 150_000);
|
||||
});
|
||||
|
||||
// --- Browser cookie detection smoke test ---
|
||||
|
||||
describeIfSelected('Setup Browser Cookies E2E', ['setup-cookies-detect'], () => {
|
||||
let cookieDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
cookieDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cookies-'));
|
||||
// Copy skill files
|
||||
fs.mkdirSync(path.join(cookieDir, 'setup-browser-cookies'), { recursive: true });
|
||||
fs.copyFileSync(
|
||||
path.join(ROOT, 'setup-browser-cookies', 'SKILL.md'),
|
||||
path.join(cookieDir, 'setup-browser-cookies', 'SKILL.md'),
|
||||
);
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(cookieDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testConcurrentIfSelected('setup-cookies-detect', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read setup-browser-cookies/SKILL.md for the cookie import workflow.
|
||||
|
||||
This is a test environment. List which browsers you can detect on this system by checking for their cookie database files.
|
||||
Write the detected browsers to ${cookieDir}/detected-browsers.md.
|
||||
Do NOT launch the cookie picker UI — just detect and report.`,
|
||||
workingDirectory: cookieDir,
|
||||
maxTurns: 5,
|
||||
timeout: 45_000,
|
||||
testName: 'setup-cookies-detect',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/setup-browser-cookies detect', result);
|
||||
|
||||
const detectPath = path.join(cookieDir, 'detected-browsers.md');
|
||||
const detectExists = fs.existsSync(detectPath);
|
||||
const detectContent = detectExists ? fs.readFileSync(detectPath, 'utf-8') : '';
|
||||
const hasBrowserName = /chrome|arc|brave|edge|comet|safari|firefox/i.test(detectContent);
|
||||
|
||||
recordE2E(evalCollector, '/setup-browser-cookies detect', 'Setup Browser Cookies E2E', result, {
|
||||
passed: detectExists && hasBrowserName && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
expect(detectExists).toBe(true);
|
||||
if (detectExists) {
|
||||
expect(hasBrowserName).toBe(true);
|
||||
}
|
||||
}, 60_000);
|
||||
});
|
||||
// setup-cookies-detect REMOVED: The cookie-import-browser module has 30+ thorough
|
||||
// unit tests in browse/test/cookie-import-browser.test.ts (decryption, profile
|
||||
// detection, error handling, path traversal). The E2E just tested LLM instruction-
|
||||
// following ("write a file saying no browsers") on a CI box with no browsers.
|
||||
|
||||
// --- gstack-upgrade E2E ---
|
||||
|
||||
@@ -461,7 +392,7 @@ describe('processPayment', () => {
|
||||
try { fs.rmSync(coverageDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/ship Step 3.4 produces coverage diagram', async () => {
|
||||
testConcurrentIfSelected('ship-coverage-audit', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read the file ship/SKILL.md for the ship workflow instructions.
|
||||
|
||||
@@ -544,7 +475,7 @@ describeIfSelected('Codex skill E2E', ['codex-review'], () => {
|
||||
try { fs.rmSync(codexDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/codex review produces findings and GATE verdict', async () => {
|
||||
testConcurrentIfSelected('codex-review', async () => {
|
||||
// Check codex is available — skip if not installed
|
||||
const codexCheck = spawnSync('which', ['codex'], { stdio: 'pipe', timeout: 3000 });
|
||||
if (codexCheck.status !== 0) {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -74,7 +74,7 @@ function describeIfSelected(name: string, testNames: string[], fn: () => void) {
|
||||
/** Skip an individual test if not selected (for multi-test describe blocks). */
|
||||
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
|
||||
const shouldRun = selectedTests === null || selectedTests.includes(testName);
|
||||
(shouldRun ? test : test.skip)(testName, fn, timeout);
|
||||
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
|
||||
}
|
||||
|
||||
describeIfSelected('LLM-as-judge quality evals', [
|
||||
@@ -91,11 +91,14 @@ describeIfSelected('LLM-as-judge quality evals', [
|
||||
const { result: scores, meta } = await judge('command reference table', section);
|
||||
console.log('Command reference scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
|
||||
|
||||
// Completeness threshold is 3 (not 4) — the command reference table is
|
||||
// intentionally terse (quick-reference format). The judge consistently scores
|
||||
// completeness=3 because detailed argument docs live in per-command sections.
|
||||
evalCollector?.addTest({
|
||||
name: 'command reference table',
|
||||
suite: 'LLM-as-judge quality evals',
|
||||
tier: 'llm-judge',
|
||||
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
|
||||
passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
|
||||
duration_ms: Date.now() - t0,
|
||||
cost_usd: judgeCost(meta),
|
||||
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
||||
@@ -104,7 +107,7 @@ describeIfSelected('LLM-as-judge quality evals', [
|
||||
});
|
||||
|
||||
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
||||
expect(scores.completeness).toBeGreaterThanOrEqual(4);
|
||||
expect(scores.completeness).toBeGreaterThanOrEqual(3);
|
||||
expect(scores.actionability).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
|
||||
@@ -790,6 +793,69 @@ describeIfSelected('Other skill evals', [
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Voice directive eval — tests that the voice section produces the right tone
|
||||
describeIfSelected('Voice directive eval', ['voice directive tone'], () => {
|
||||
testIfSelected('voice directive tone', async () => {
|
||||
const t0 = Date.now();
|
||||
// Read a tier 2+ skill to get the full voice directive in context
|
||||
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
|
||||
const voiceStart = content.indexOf('## Voice');
|
||||
if (voiceStart === -1) {
|
||||
throw new Error('Voice section not found in review/SKILL.md. Was preamble.ts regenerated?');
|
||||
}
|
||||
const voiceEnd = content.indexOf('\n## ', voiceStart + 1);
|
||||
const voiceSection = content.slice(voiceStart, voiceEnd > 0 ? voiceEnd : voiceStart + 3000);
|
||||
|
||||
const result = await callJudge<{
|
||||
directness: number;
|
||||
concreteness: number;
|
||||
avoids_corporate: number;
|
||||
avoids_ai_vocabulary: number;
|
||||
connects_user_outcomes: number;
|
||||
reasoning: string;
|
||||
}>(`You are evaluating a voice directive for an AI coding assistant framework called GStack.
|
||||
Score each dimension 1-5 where 5 is excellent:
|
||||
|
||||
1. directness: Does it instruct the agent to be direct, lead with the point, take positions?
|
||||
2. concreteness: Does it instruct the agent to name specific files, commands, line numbers, real numbers?
|
||||
3. avoids_corporate: Does it explicitly ban corporate/formal/academic tone and provide alternatives?
|
||||
4. avoids_ai_vocabulary: Does it ban AI-tell words and phrases with specific lists?
|
||||
5. connects_user_outcomes: Does it instruct the agent to connect technical work to real user experience?
|
||||
|
||||
Return JSON only:
|
||||
{"directness": N, "concreteness": N, "avoids_corporate": N, "avoids_ai_vocabulary": N, "connects_user_outcomes": N, "reasoning": "..."}
|
||||
|
||||
THE VOICE DIRECTIVE:
|
||||
${voiceSection}`);
|
||||
|
||||
console.log('Voice directive scores:', JSON.stringify(result, null, 2));
|
||||
|
||||
evalCollector?.addTest({
|
||||
name: 'voice directive tone',
|
||||
suite: 'Voice directive eval',
|
||||
tier: 'llm-judge',
|
||||
passed: result.directness >= 4 && result.concreteness >= 4 && result.avoids_corporate >= 4
|
||||
&& result.avoids_ai_vocabulary >= 4 && result.connects_user_outcomes >= 4,
|
||||
duration_ms: Date.now() - t0,
|
||||
cost_usd: 0.02,
|
||||
judge_scores: {
|
||||
directness: result.directness,
|
||||
concreteness: result.concreteness,
|
||||
avoids_corporate: result.avoids_corporate,
|
||||
avoids_ai_vocabulary: result.avoids_ai_vocabulary,
|
||||
connects_user_outcomes: result.connects_user_outcomes,
|
||||
},
|
||||
judge_reasoning: result.reasoning,
|
||||
});
|
||||
|
||||
expect(result.directness).toBeGreaterThanOrEqual(4);
|
||||
expect(result.concreteness).toBeGreaterThanOrEqual(4);
|
||||
expect(result.avoids_corporate).toBeGreaterThanOrEqual(4);
|
||||
expect(result.avoids_ai_vocabulary).toBeGreaterThanOrEqual(4);
|
||||
expect(result.connects_user_outcomes).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
if (evalCollector) {
|
||||
|
||||
+106
-120
@@ -3,7 +3,7 @@ import { runSkillTest } from './helpers/session-runner';
|
||||
import type { SkillTestResult } from './helpers/session-runner';
|
||||
import { EvalCollector } from './helpers/eval-store';
|
||||
import type { EvalTestEntry } from './helpers/eval-store';
|
||||
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
|
||||
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, E2E_TIERS, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
@@ -42,9 +42,28 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
|
||||
}
|
||||
}
|
||||
|
||||
// Narrow the selection to tests tagged with the tier named by EVALS_TIER
// (same logic as e2e-helpers.ts). With no prior selection the tier's tests
// become the selection; otherwise the two sets are intersected.
if (evalsEnabled && process.env.EVALS_TIER) {
  const tier = process.env.EVALS_TIER as 'gate' | 'periodic';

  const tierTests: string[] = [];
  for (const [name, t] of Object.entries(E2E_TIERS)) {
    if (t === tier) tierTests.push(name);
  }

  selectedTests = selectedTests === null
    ? tierTests
    : selectedTests.filter(t => tierTests.includes(t));

  process.stderr.write(`Routing EVALS_TIER=${tier}: ${selectedTests.length} tests\n\n`);
}
|
||||
|
||||
// --- Helper functions ---
|
||||
|
||||
/** Copy all SKILL.md files into tmpDir/.claude/skills/gstack/ for auto-discovery */
|
||||
/** Copy all SKILL.md files for auto-discovery.
|
||||
* Install to BOTH project-level (.claude/skills/) AND user-level (~/.claude/skills/)
|
||||
* because Claude Code discovers skills from both locations. In CI containers,
|
||||
* $HOME may differ from the working directory, so we need both paths to ensure
|
||||
* the Skill tool appears in Claude's available tools list. */
|
||||
function installSkills(tmpDir: string) {
|
||||
const skillDirs = [
|
||||
'', // root gstack SKILL.md
|
||||
@@ -54,15 +73,30 @@ function installSkills(tmpDir: string) {
|
||||
'gstack-upgrade', 'humanizer',
|
||||
];
|
||||
|
||||
// Install to both project-level and user-level skill directories
|
||||
const homeDir = process.env.HOME || os.homedir();
|
||||
const installTargets = [
|
||||
path.join(tmpDir, '.claude', 'skills'), // project-level
|
||||
path.join(homeDir, '.claude', 'skills'), // user-level (~/.claude/skills/)
|
||||
];
|
||||
|
||||
for (const skill of skillDirs) {
|
||||
const srcPath = path.join(ROOT, skill, 'SKILL.md');
|
||||
if (!fs.existsSync(srcPath)) continue;
|
||||
|
||||
const destDir = skill
|
||||
? path.join(tmpDir, '.claude', 'skills', 'gstack', skill)
|
||||
: path.join(tmpDir, '.claude', 'skills', 'gstack');
|
||||
fs.mkdirSync(destDir, { recursive: true });
|
||||
fs.copyFileSync(srcPath, path.join(destDir, 'SKILL.md'));
|
||||
const skillName = skill || 'gstack';
|
||||
|
||||
for (const targetBase of installTargets) {
|
||||
const destDir = path.join(targetBase, skillName);
|
||||
fs.mkdirSync(destDir, { recursive: true });
|
||||
fs.copyFileSync(srcPath, path.join(destDir, 'SKILL.md'));
|
||||
}
|
||||
}
|
||||
|
||||
// Copy CLAUDE.md so Claude has project context for skill routing.
|
||||
const claudeMdSrc = path.join(ROOT, 'CLAUDE.md');
|
||||
if (fs.existsSync(claudeMdSrc)) {
|
||||
fs.copyFileSync(claudeMdSrc, path.join(tmpDir, 'CLAUDE.md'));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -75,6 +109,31 @@ function initGitRepo(dir: string) {
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
}
|
||||
|
||||
/**
 * Create an isolated working directory for a routing test.
 *
 * A fresh temp directory is created per call (so concurrent tests don't
 * interfere) and seeded with whichever of CLAUDE.md, README.md, package.json
 * and ETHOS.md exist in the repo checkout (ROOT), plus the skill files via
 * installSkills(). The directory is then initialised as a git repo and an
 * "initial" commit is attempted.
 *
 * Why: in containerized CI, bare tmpDirs lack the context Claude needs to
 * make correct routing decisions.
 *
 * @param suffix label embedded in the temp directory name (`routing-<suffix>-…`)
 * @returns path of the new directory; the caller removes it when done
 * @throws rethrows any setup error after removing the partially-built directory
 */
function createRoutingWorkDir(suffix: string): string {
  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), `routing-${suffix}-`));
  try {
    // Copy essential context files (missing ones are skipped)
    const filesToCopy = ['CLAUDE.md', 'README.md', 'package.json', 'ETHOS.md'];
    for (const f of filesToCopy) {
      const src = path.join(ROOT, f);
      if (fs.existsSync(src)) fs.copyFileSync(src, path.join(tmpDir, f));
    }
    // Copy skill files
    installSkills(tmpDir);
    // Init git and snapshot the seeded tree; the git exit statuses are not inspected
    initGitRepo(tmpDir);
    spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
    spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
    return tmpDir;
  } catch (err) {
    // Callers only enter their try/finally after this function returns, so a
    // failure here would otherwise leave the temp directory behind.
    fs.rmSync(tmpDir, { recursive: true, force: true });
    throw err;
  }
}
|
||||
|
||||
function logCost(label: string, result: { costEstimate: { turnsUsed: number; estimatedTokens: number; estimatedCost: number }; duration: number }) {
|
||||
const { turnsUsed, estimatedTokens, estimatedCost } = result.costEstimate;
|
||||
const durationSec = Math.round(result.duration / 1000);
|
||||
@@ -96,6 +155,15 @@ function recordRouting(name: string, result: SkillTestResult, expectedSkill: str
|
||||
});
|
||||
}
|
||||
|
||||
// Register `name` as a concurrent test when it is part of the current selection
// (diff + tier filtering); otherwise register it as skipped. A null selection
// means nothing was filtered, so every test runs.
const testIfSelected = (name: string, fn: () => Promise<void>, timeout?: number) => {
  const isSelected = selectedTests === null || selectedTests.includes(name);
  if (isSelected) {
    test.concurrent(name, fn, timeout);
    return;
  }
  test.skip(name, () => {});
};
|
||||
|
||||
// --- Tests ---
|
||||
|
||||
describeE2E('Skill Routing E2E — Developer Journey', () => {
|
||||
@@ -103,14 +171,9 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
|
||||
evalCollector?.finalize();
|
||||
});
|
||||
|
||||
test.concurrent('journey-ideation', async () => {
|
||||
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-ideation-'));
|
||||
testIfSelected('journey-ideation', async () => {
|
||||
const tmpDir = createRoutingWorkDir('ideation');
|
||||
try {
|
||||
initGitRepo(tmpDir);
|
||||
installSkills(tmpDir);
|
||||
fs.writeFileSync(path.join(tmpDir, 'README.md'), '# New Project\n');
|
||||
spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
|
||||
spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
const testName = 'journey-ideation';
|
||||
const expectedSkill = 'office-hours';
|
||||
@@ -137,11 +200,9 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
|
||||
}
|
||||
}, 150_000);
|
||||
|
||||
test.concurrent('journey-plan-eng', async () => {
|
||||
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-plan-eng-'));
|
||||
testIfSelected('journey-plan-eng', async () => {
|
||||
const tmpDir = createRoutingWorkDir('plan-eng');
|
||||
try {
|
||||
initGitRepo(tmpDir);
|
||||
installSkills(tmpDir);
|
||||
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
|
||||
|
||||
## Components
|
||||
@@ -189,64 +250,14 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
|
||||
}
|
||||
}, 150_000);
|
||||
|
||||
test.concurrent('journey-think-bigger', async () => {
|
||||
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-think-bigger-'));
|
||||
// Removed: journey-think-bigger
|
||||
// Tested ambiguous routing ("think bigger" → plan-ceo-review) but Claude
|
||||
// legitimately answers directly instead of routing. Never passed reliably.
|
||||
// The other 10 journey tests cover routing with clear signals.
|
||||
|
||||
testIfSelected('journey-debug', async () => {
|
||||
const tmpDir = createRoutingWorkDir('debug');
|
||||
try {
|
||||
initGitRepo(tmpDir);
|
||||
installSkills(tmpDir);
|
||||
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
|
||||
|
||||
## Components
|
||||
- REST API (Express.js)
|
||||
- PostgreSQL database
|
||||
- React frontend
|
||||
- SMS integration (Twilio)
|
||||
|
||||
## Data Model
|
||||
- restaurants (id, name, settings)
|
||||
- parties (id, restaurant_id, name, size, phone, status, created_at)
|
||||
- wait_estimates (id, restaurant_id, avg_wait_minutes)
|
||||
|
||||
## API Endpoints
|
||||
- POST /api/parties - add party to waitlist
|
||||
- GET /api/parties - list current waitlist
|
||||
- PATCH /api/parties/:id/status - update party status
|
||||
- GET /api/estimate - get current wait estimate
|
||||
`);
|
||||
spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
|
||||
spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
const testName = 'journey-think-bigger';
|
||||
const expectedSkill = 'plan-ceo-review';
|
||||
const result = await runSkillTest({
|
||||
prompt: "Actually, looking at this plan again, I feel like we're thinking too small. We're just doing waitlists but what about the whole restaurant guest experience? Is there a bigger opportunity here we should go after?",
|
||||
workingDirectory: tmpDir,
|
||||
maxTurns: 5,
|
||||
allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'],
|
||||
timeout: 120_000,
|
||||
testName,
|
||||
runId,
|
||||
});
|
||||
|
||||
const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill');
|
||||
const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined;
|
||||
|
||||
logCost(`journey: ${testName}`, result);
|
||||
recordRouting(testName, result, expectedSkill, actualSkill);
|
||||
|
||||
expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
|
||||
expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill);
|
||||
} finally {
|
||||
fs.rmSync(tmpDir, { recursive: true, force: true });
|
||||
}
|
||||
}, 180_000);
|
||||
|
||||
test.concurrent('journey-debug', async () => {
|
||||
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-debug-'));
|
||||
try {
|
||||
initGitRepo(tmpDir);
|
||||
installSkills(tmpDir);
|
||||
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
@@ -295,18 +306,16 @@ export default app;
|
||||
recordRouting(testName, result, expectedSkill, actualSkill);
|
||||
|
||||
expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
|
||||
expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill);
|
||||
const validSkills = ['investigate', 'qa'];
|
||||
expect(validSkills, `Expected one of ${validSkills.join('/')} but got ${actualSkill}`).toContain(actualSkill);
|
||||
} finally {
|
||||
fs.rmSync(tmpDir, { recursive: true, force: true });
|
||||
}
|
||||
}, 150_000);
|
||||
|
||||
test.concurrent('journey-qa', async () => {
|
||||
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-qa-'));
|
||||
testIfSelected('journey-qa', async () => {
|
||||
const tmpDir = createRoutingWorkDir('qa');
|
||||
try {
|
||||
initGitRepo(tmpDir);
|
||||
installSkills(tmpDir);
|
||||
|
||||
fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app', scripts: { dev: 'next dev' } }, null, 2));
|
||||
fs.mkdirSync(path.join(tmpDir, 'src'), { recursive: true });
|
||||
fs.writeFileSync(path.join(tmpDir, 'src/index.html'), '<html><body><h1>Waitlist App</h1></body></html>');
|
||||
@@ -340,18 +349,15 @@ export default app;
|
||||
}
|
||||
}, 150_000);
|
||||
|
||||
test.concurrent('journey-code-review', async () => {
|
||||
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-code-review-'));
|
||||
testIfSelected('journey-code-review', async () => {
|
||||
const tmpDir = createRoutingWorkDir('code-review');
|
||||
try {
|
||||
initGitRepo(tmpDir);
|
||||
installSkills(tmpDir);
|
||||
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// base\n');
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial']);
|
||||
run('git', ['commit', '-m', 'add base app']);
|
||||
run('git', ['checkout', '-b', 'feature/add-waitlist']);
|
||||
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// updated with waitlist feature\nimport { WaitlistService } from "./waitlist";\n');
|
||||
fs.writeFileSync(path.join(tmpDir, 'waitlist.ts'), 'export class WaitlistService {\n async addParty(name: string, size: number) {\n // TODO: implement\n }\n}\n');
|
||||
@@ -383,18 +389,15 @@ export default app;
|
||||
}
|
||||
}, 150_000);
|
||||
|
||||
test.concurrent('journey-ship', async () => {
|
||||
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-ship-'));
|
||||
testIfSelected('journey-ship', async () => {
|
||||
const tmpDir = createRoutingWorkDir('ship');
|
||||
try {
|
||||
initGitRepo(tmpDir);
|
||||
installSkills(tmpDir);
|
||||
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// base\n');
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial']);
|
||||
run('git', ['commit', '-m', 'add base app']);
|
||||
run('git', ['checkout', '-b', 'feature/waitlist']);
|
||||
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// waitlist feature\n');
|
||||
run('git', ['add', '.']);
|
||||
@@ -425,12 +428,9 @@ export default app;
|
||||
}
|
||||
}, 150_000);
|
||||
|
||||
test.concurrent('journey-docs', async () => {
|
||||
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-docs-'));
|
||||
testIfSelected('journey-docs', async () => {
|
||||
const tmpDir = createRoutingWorkDir('docs');
|
||||
try {
|
||||
initGitRepo(tmpDir);
|
||||
installSkills(tmpDir);
|
||||
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
@@ -465,12 +465,9 @@ export default app;
|
||||
}
|
||||
}, 150_000);
|
||||
|
||||
test.concurrent('journey-retro', async () => {
|
||||
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-retro-'));
|
||||
testIfSelected('journey-retro', async () => {
|
||||
const tmpDir = createRoutingWorkDir('retro');
|
||||
try {
|
||||
initGitRepo(tmpDir);
|
||||
installSkills(tmpDir);
|
||||
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
@@ -511,18 +508,9 @@ export default app;
|
||||
}
|
||||
}, 150_000);
|
||||
|
||||
test.concurrent('journey-design-system', async () => {
|
||||
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-design-system-'));
|
||||
testIfSelected('journey-design-system', async () => {
|
||||
const tmpDir = createRoutingWorkDir('design-system');
|
||||
try {
|
||||
initGitRepo(tmpDir);
|
||||
installSkills(tmpDir);
|
||||
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app' }, null, 2));
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial']);
|
||||
|
||||
const testName = 'journey-design-system';
|
||||
const expectedSkill = 'design-consultation';
|
||||
@@ -549,12 +537,9 @@ export default app;
|
||||
}
|
||||
}, 150_000);
|
||||
|
||||
test.concurrent('journey-visual-qa', async () => {
|
||||
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-visual-qa-'));
|
||||
testIfSelected('journey-visual-qa', async () => {
|
||||
const tmpDir = createRoutingWorkDir('visual-qa');
|
||||
try {
|
||||
initGitRepo(tmpDir);
|
||||
installSkills(tmpDir);
|
||||
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
@@ -597,7 +582,8 @@ body { font-family: sans-serif; }
|
||||
recordRouting(testName, result, expectedSkill, actualSkill);
|
||||
|
||||
expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
|
||||
expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill);
|
||||
const validSkills = ['design-review', 'qa', 'qa-only', 'browse'];
|
||||
expect(validSkills, `Expected one of ${validSkills.join('/')} but got ${actualSkill}`).toContain(actualSkill);
|
||||
} finally {
|
||||
fs.rmSync(tmpDir, { recursive: true, force: true });
|
||||
}
|
||||
|
||||
+139
-29
@@ -99,6 +99,20 @@ describe('SKILL.md command validation', () => {
|
||||
const result = validateSkill(skill);
|
||||
expect(result.snapshotFlagErrors).toHaveLength(0);
|
||||
});
|
||||
|
||||
test('all $B commands in autoplan/SKILL.md are valid browse commands', () => {
  const autoplanSkill = path.join(ROOT, 'autoplan', 'SKILL.md');
  // Nothing to validate when the autoplan skill file is absent.
  if (fs.existsSync(autoplanSkill)) {
    const { invalid } = validateSkill(autoplanSkill);
    expect(invalid).toHaveLength(0);
  }
});
|
||||
|
||||
test('all snapshot flags in autoplan/SKILL.md are valid', () => {
  const autoplanSkill = path.join(ROOT, 'autoplan', 'SKILL.md');
  // Nothing to validate when the autoplan skill file is absent.
  if (fs.existsSync(autoplanSkill)) {
    const { snapshotFlagErrors } = validateSkill(autoplanSkill);
    expect(snapshotFlagErrors).toHaveLength(0);
  }
});
|
||||
});
|
||||
|
||||
describe('Command registry consistency', () => {
|
||||
@@ -227,6 +241,7 @@ describe('Update check preamble', () => {
|
||||
'benchmark/SKILL.md',
|
||||
'land-and-deploy/SKILL.md',
|
||||
'setup-deploy/SKILL.md',
|
||||
'cso/SKILL.md',
|
||||
];
|
||||
|
||||
for (const skill of skillsWithUpdateCheck) {
|
||||
@@ -513,10 +528,12 @@ describe('TODOS-format.md reference consistency', () => {
|
||||
// --- v0.4.1 feature coverage: RECOMMENDATION format, session awareness, enum completeness ---
|
||||
|
||||
describe('v0.4.1 preamble features', () => {
|
||||
const skillsWithPreamble = [
|
||||
'SKILL.md', 'browse/SKILL.md', 'qa/SKILL.md',
|
||||
'qa-only/SKILL.md',
|
||||
'setup-browser-cookies/SKILL.md',
|
||||
// Tier 1 skills have core preamble only (no AskUserQuestion format)
|
||||
const tier1Skills = ['SKILL.md', 'browse/SKILL.md', 'setup-browser-cookies/SKILL.md', 'benchmark/SKILL.md'];
|
||||
|
||||
// Tier 2+ skills have AskUserQuestion format with RECOMMENDATION
|
||||
const tier2PlusSkills = [
|
||||
'qa/SKILL.md', 'qa-only/SKILL.md',
|
||||
'ship/SKILL.md', 'review/SKILL.md',
|
||||
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
|
||||
'retro/SKILL.md',
|
||||
@@ -526,22 +543,25 @@ describe('v0.4.1 preamble features', () => {
|
||||
'design-consultation/SKILL.md',
|
||||
'document-release/SKILL.md',
|
||||
'canary/SKILL.md',
|
||||
'benchmark/SKILL.md',
|
||||
'land-and-deploy/SKILL.md',
|
||||
'setup-deploy/SKILL.md',
|
||||
'cso/SKILL.md',
|
||||
];
|
||||
|
||||
for (const skill of skillsWithPreamble) {
|
||||
const skillsWithPreamble = [...tier1Skills, ...tier2PlusSkills];
|
||||
|
||||
for (const skill of tier2PlusSkills) {
|
||||
test(`${skill} contains RECOMMENDATION format`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('RECOMMENDATION: Choose');
|
||||
expect(content).toContain('AskUserQuestion');
|
||||
});
|
||||
}
|
||||
|
||||
for (const skill of skillsWithPreamble) {
|
||||
test(`${skill} contains session awareness`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('_SESSIONS');
|
||||
expect(content).toContain('RECOMMENDATION');
|
||||
});
|
||||
}
|
||||
|
||||
@@ -724,14 +744,8 @@ describe('Contributor mode preamble structure', () => {
|
||||
for (const skill of skillsWithPreamble) {
|
||||
test(`${skill} has 0-10 rating in contributor mode`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('0 to 10');
|
||||
expect(content).toContain('My rating');
|
||||
});
|
||||
|
||||
test(`${skill} has calibration example`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('Calibration');
|
||||
expect(content).toContain('the bar');
|
||||
expect(content).toContain('0-10');
|
||||
expect(content).toContain('Rating');
|
||||
});
|
||||
|
||||
test(`${skill} has "what would make this a 10" field`, () => {
|
||||
@@ -807,7 +821,7 @@ describe('Completeness Principle in generated SKILL.md files', () => {
|
||||
'design-review/SKILL.md',
|
||||
'design-consultation/SKILL.md',
|
||||
'document-release/SKILL.md',
|
||||
];
|
||||
'cso/SKILL.md', ];
|
||||
|
||||
for (const skill of skillsWithPreamble) {
|
||||
test(`${skill} contains Completeness Principle section`, () => {
|
||||
@@ -817,17 +831,12 @@ describe('Completeness Principle in generated SKILL.md files', () => {
|
||||
});
|
||||
}
|
||||
|
||||
test('Completeness Principle includes compression table', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
test('Completeness Principle includes compression table in tier 2+ skills', () => {
|
||||
// Root is tier 1 (no completeness). Check tier 2+ skill.
|
||||
const content = fs.readFileSync(path.join(ROOT, 'cso', 'SKILL.md'), 'utf-8');
|
||||
expect(content).toContain('CC+gstack');
|
||||
expect(content).toContain('Compression');
|
||||
});
|
||||
|
||||
test('Completeness Principle includes anti-patterns', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
expect(content).toContain('BAD:');
|
||||
expect(content).toContain('Anti-patterns');
|
||||
});
|
||||
});
|
||||
|
||||
// --- Part 7: Planted-bug fixture validation (A4) ---
|
||||
@@ -961,10 +970,37 @@ describe('gstack-slug', () => {
|
||||
test('output is eval-compatible (KEY=VALUE format)', () => {
|
||||
const result = Bun.spawnSync([SLUG_BIN], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' });
|
||||
const lines = result.stdout.toString().trim().split('\n');
|
||||
expect(lines.length).toBe(3);
|
||||
expect(lines.length).toBe(2);
|
||||
expect(lines[0]).toMatch(/^SLUG=.+/);
|
||||
expect(lines[1]).toMatch(/^BRANCH=.+/);
|
||||
expect(lines[2]).toMatch(/^PROJECTS_DIR=.+/);
|
||||
});
|
||||
|
||||
test('output values contain only safe characters (no shell metacharacters)', () => {
  const { stdout } = Bun.spawnSync([SLUG_BIN], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' });
  const printed = stdout.toString();
  // A missing key falls back to '', which then fails the pattern check below.
  const [, slug = ''] = printed.match(/SLUG=(.*)/) ?? [];
  const [, branch = ''] = printed.match(/BRANCH=(.*)/) ?? [];
  // Only alphanumeric, dot, dash, underscore are allowed (#133)
  for (const value of [slug, branch]) {
    expect(value).toMatch(/^[a-zA-Z0-9._-]+$/);
  }
});
|
||||
test('eval sets variables under bash with set -euo pipefail', () => {
  // Evaluate the slug output in strict-mode bash, then echo the two variables back.
  const script = 'set -euo pipefail; eval "$(./bin/gstack-slug 2>/dev/null)"; echo "SLUG=$SLUG"; echo "BRANCH=$BRANCH"';
  const { exitCode, stdout } = Bun.spawnSync(['bash', '-c', script], {
    cwd: ROOT,
    stdout: 'pipe',
    stderr: 'pipe',
  });
  expect(exitCode).toBe(0);
  const echoed = stdout.toString();
  expect(echoed).toMatch(/^SLUG=.+/m);
  expect(echoed).toMatch(/^BRANCH=.+/m);
});
|
||||
|
||||
test('no templates or bin scripts use source process substitution for gstack-slug', () => {
  const grepArgs = ['grep', '-r', 'source <(.*gstack-slug', '--include=*.tmpl', '--include=gstack-review-*', '.'];
  const { stdout } = Bun.spawnSync(grepArgs, { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' });
  // grep exits 1 when nothing matches, which is the desired outcome; the check
  // itself is that no matching lines were printed.
  const matches = stdout.toString().trim();
  expect(matches).toBe('');
});
|
||||
});
|
||||
|
||||
@@ -1275,7 +1311,7 @@ describe('Codex skill', () => {
|
||||
expect(content).toContain('fall back to the Claude adversarial subagent');
|
||||
// Review log uses new skill name
|
||||
expect(content).toContain('adversarial-review');
|
||||
expect(content).toContain('xhigh');
|
||||
expect(content).toContain('reasoning_effort="high"');
|
||||
expect(content).toContain('ADVERSARIAL REVIEW SYNTHESIS');
|
||||
});
|
||||
|
||||
@@ -1285,17 +1321,23 @@ describe('Codex skill', () => {
|
||||
expect(content).toContain('< 50');
|
||||
expect(content).toContain('200+');
|
||||
expect(content).toContain('adversarial-review');
|
||||
expect(content).toContain('xhigh');
|
||||
expect(content).toContain('reasoning_effort="high"');
|
||||
expect(content).toContain('Investigate and fix');
|
||||
});
|
||||
|
||||
test('codex-host ship/review do NOT contain adversarial review step', () => {
|
||||
// .agents/ is gitignored — generate on demand
|
||||
Bun.spawnSync(['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', 'codex'], {
|
||||
cwd: ROOT, stdout: 'pipe', stderr: 'pipe',
|
||||
});
|
||||
const shipContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-ship', 'SKILL.md'), 'utf-8');
|
||||
expect(shipContent).not.toContain('codex review --base');
|
||||
expect(shipContent).not.toContain('Investigate and fix');
|
||||
expect(shipContent).not.toContain('CODEX_REVIEWS');
|
||||
|
||||
const reviewContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-review', 'SKILL.md'), 'utf-8');
|
||||
expect(reviewContent).not.toContain('codex review --base');
|
||||
expect(reviewContent).not.toContain('codex_reviews');
|
||||
expect(reviewContent).not.toContain('CODEX_REVIEWS');
|
||||
expect(reviewContent).not.toContain('adversarial-review');
|
||||
expect(reviewContent).not.toContain('Investigate and fix');
|
||||
});
|
||||
@@ -1306,6 +1348,13 @@ describe('Codex skill', () => {
|
||||
expect(content).toContain('codex exec');
|
||||
});
|
||||
|
||||
test('/review persists a review-log entry for ship readiness', () => {
  const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
  // Fragments the generated review skill is expected to contain.
  const requiredFragments = ['"skill":"review"', '"issues_found":N', 'Persist Eng Review result'];
  for (const fragment of requiredFragments) {
    expect(reviewSkill).toContain(fragment);
  }
});
|
||||
|
||||
test('Review Readiness Dashboard includes Adversarial Review row', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
|
||||
expect(content).toContain('Adversarial');
|
||||
@@ -1362,6 +1411,11 @@ describe('Skill trigger phrases', () => {
|
||||
describe('Codex skill validation', () => {
|
||||
const AGENTS_DIR = path.join(ROOT, '.agents', 'skills');
|
||||
|
||||
// .agents/ is gitignored (v0.11.2.0) — generate on demand for tests
|
||||
Bun.spawnSync(['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', 'codex'], {
|
||||
cwd: ROOT, stdout: 'pipe', stderr: 'pipe',
|
||||
});
|
||||
|
||||
// Discover all Claude skills with templates (except /codex which is Claude-only)
|
||||
const CLAUDE_SKILLS_WITH_TEMPLATES = (() => {
|
||||
const skills: string[] = [];
|
||||
@@ -1423,3 +1477,59 @@ describe('Codex skill validation', () => {
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// --- Repo mode and test failure triage validation ---
|
||||
|
||||
describe('Repo mode preamble validation', () => {
  // Read a generated skill file relative to the repo root.
  const readSkill = (...segments: string[]): string =>
    fs.readFileSync(path.join(ROOT, ...segments), 'utf-8');

  test('generated SKILL.md preamble contains REPO_MODE output', () => {
    const rootSkill = readSkill('SKILL.md');
    for (const marker of ['REPO_MODE:', 'gstack-repo-mode']) {
      expect(rootSkill).toContain(marker);
    }
  });

  test('tier 3+ skills contain See Something Say Something section', () => {
    // Root SKILL.md is tier 1 (no Repo Mode), so a tier 3 skill is checked instead.
    const ceoReviewSkill = readSkill('plan-ceo-review', 'SKILL.md');
    for (const marker of ['See Something, Say Something', 'REPO_MODE', 'solo', 'collaborative']) {
      expect(ceoReviewSkill).toContain(marker);
    }
  });
});
|
||||
|
||||
describe('Test failure triage in ship skill', () => {
  // Each test reads ship/SKILL.md afresh inside its own body.
  const loadShipSkill = (): string =>
    fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');

  test('ship/SKILL.md contains Test Failure Ownership Triage', () => {
    expect(loadShipSkill()).toContain('Test Failure Ownership Triage');
  });

  test('ship/SKILL.md triage uses git diff for classification', () => {
    expect(loadShipSkill()).toContain('git diff origin/<base>...HEAD --name-only');
  });

  test('ship/SKILL.md triage has solo and collaborative paths', () => {
    const shipSkill = loadShipSkill();
    const phrases = ['REPO_MODE', 'solo', 'collaborative', 'Investigate and fix now', 'Add as P0 TODO'];
    for (const phrase of phrases) {
      expect(shipSkill).toContain(phrase);
    }
  });

  test('ship/SKILL.md triage has GitHub issue assignment for collaborative mode', () => {
    const shipSkill = loadShipSkill();
    for (const phrase of ['gh issue create', '--assignee']) {
      expect(shipSkill).toContain(phrase);
    }
  });

  test('{{TEST_FAILURE_TRIAGE}} placeholder is fully resolved in ship/SKILL.md', () => {
    // A leftover placeholder would mean the template was not expanded.
    expect(loadShipSkill()).not.toContain('{{TEST_FAILURE_TRIAGE}}');
  });

  test('ship/SKILL.md uses in-branch language for stop condition', () => {
    expect(loadShipSkill()).toContain('In-branch test failures');
  });
});
|
||||
|
||||
+125
-5
@@ -78,8 +78,8 @@ describe('gstack-telemetry-log', () => {
|
||||
|
||||
const events = parseJsonl();
|
||||
expect(events).toHaveLength(1);
|
||||
// installation_id should be a SHA-256 hash (64 hex chars)
|
||||
expect(events[0].installation_id).toMatch(/^[a-f0-9]{64}$/);
|
||||
// installation_id should be a UUID v4 (or hex fallback)
|
||||
expect(events[0].installation_id).toMatch(/^[a-f0-9-]{32,36}$/);
|
||||
});
|
||||
|
||||
test('installation_id is null for anonymous tier', () => {
|
||||
@@ -125,6 +125,82 @@ describe('gstack-telemetry-log', () => {
|
||||
expect(events[0]).toHaveProperty('_branch');
|
||||
});
|
||||
|
||||
// ─── json_safe() injection prevention tests ────────────────
|
||||
test('sanitizes skill name with quote injection attempt', () => {
  setConfig('telemetry', 'anonymous');
  // The --skill value tries to close the JSON string and append an extra field.
  run(`${BIN}/gstack-telemetry-log --skill 'review","injected":"true' --duration 10 --outcome success --session-id inj-1`);

  const jsonlLines = readJsonl();
  expect(jsonlLines).toHaveLength(1);
  // JSON.parse throws if the logged line is not valid JSON.
  const parsed = JSON.parse(jsonlLines[0]);
  // Key check: the payload must not have produced a new top-level property.
  expect(parsed).not.toHaveProperty('injected');
  // The stored skill value must not carry any double quote.
  expect(parsed.skill).not.toContain('"');
});
|
||||
|
||||
test('truncates skill name exceeding 200 chars', () => {
  setConfig('telemetry', 'anonymous');
  // 250 characters, i.e. longer than the 200-character limit asserted below.
  const oversizedSkill = 'a'.repeat(250);
  run(`${BIN}/gstack-telemetry-log --skill '${oversizedSkill}' --duration 10 --outcome success --session-id trunc-1`);

  const [firstEvent] = parseJsonl();
  expect(firstEvent.skill.length).toBeLessThanOrEqual(200);
});
|
||||
|
||||
test('sanitizes outcome with newline injection attempt', () => {
  setConfig('telemetry', 'anonymous');
  // printf builds the --outcome value so that it contains a real newline character.
  run(`bash -c 'OUTCOME=$(printf "success\\nfake\\":\\"true"); ${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome "$OUTCOME" --session-id inj-2'`);

  const jsonlLines = readJsonl();
  // The embedded newline must not split the record into two lines.
  expect(jsonlLines).toHaveLength(1);
  const parsed = JSON.parse(jsonlLines[0]);
  expect(parsed).not.toHaveProperty('fake');
});
|
||||
|
||||
test('sanitizes session_id with backslash-quote injection', () => {
  setConfig('telemetry', 'anonymous');
  // The session id combines backslashes and quotes to try to break out of the string.
  run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome success --session-id 'id\\\\"","x":"y'`);

  const jsonlLines = readJsonl();
  expect(jsonlLines).toHaveLength(1);
  const parsed = JSON.parse(jsonlLines[0]);
  expect(parsed).not.toHaveProperty('x');
});
|
||||
|
||||
test('sanitizes error_class with quote injection', () => {
  setConfig('telemetry', 'anonymous');
  // The --error-class value tries to append an "extra" field to the record.
  run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome error --error-class 'timeout","extra":"val' --session-id inj-3`);

  const jsonlLines = readJsonl();
  expect(jsonlLines).toHaveLength(1);
  const parsed = JSON.parse(jsonlLines[0]);
  expect(parsed).not.toHaveProperty('extra');
});
|
||||
|
||||
// Quotes inside --failed-step must not produce a top-level "hacked" key.
test('sanitizes failed_step with quote injection', () => {
  setConfig('telemetry', 'anonymous');

  const command = `${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome error --failed-step 'step1","hacked":"yes' --session-id inj-4`;
  run(command);

  const written = readJsonl();
  expect(written).toHaveLength(1);

  const record = JSON.parse(written[0]);
  expect(record).not.toHaveProperty('hacked');
});
|
||||
|
||||
// An error message containing double quotes must still be logged as one valid
// JSON line whose error_message keeps the surrounding words.
test('escapes error_message quotes and preserves content', () => {
  setConfig('telemetry', 'anonymous');

  const command = `${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome error --error-message 'Error: file "test.txt" not found' --session-id inj-5`;
  run(command);

  const written = readJsonl();
  expect(written).toHaveLength(1);

  const record = JSON.parse(written[0]);
  for (const fragment of ['file', 'not found']) {
    expect(record.error_message).toContain(fragment);
  }
});
|
||||
|
||||
test('creates analytics directory if missing', () => {
|
||||
// Remove analytics dir
|
||||
const analyticsDir = path.join(tmpDir, 'analytics');
|
||||
@@ -136,6 +212,34 @@ describe('gstack-telemetry-log', () => {
|
||||
expect(fs.existsSync(analyticsDir)).toBe(true);
|
||||
expect(readJsonl()).toHaveLength(1);
|
||||
});
|
||||
|
||||
// ─── Telemetry JSON safety: branch/repo with special chars ────
|
||||
// Checks that the logged `_branch` field is present and quote-free.
// NOTE(review): nothing in this test overrides the branch name, so it runs
// against whatever branch the checkout is on — confirm whether a branch name
// containing double quotes was meant to be injected here.
test('branch name with quotes does not corrupt JSON', () => {
  setConfig('telemetry', 'anonymous');

  const command = `${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome success --session-id branch-quotes-1`;
  run(command);

  const written = readJsonl();
  expect(written).toHaveLength(1);

  // JSON.parse throws on a malformed line, so getting past it proves the line is valid JSON
  const record = JSON.parse(written[0]);
  const branch = record._branch;
  expect(branch).toBeDefined();
  // The logger is expected to strip double quotes from this field (json_safe, per the original note)
  expect(branch).not.toContain('"');
});
|
||||
|
||||
// Checks that the logged `_repo_slug` field is present and quote-free.
// NOTE(review): the repo slug is not overridden here, so only the current
// repository's slug is exercised — confirm whether special characters were
// meant to be injected.
test('repo slug with special chars does not corrupt JSON', () => {
  setConfig('telemetry', 'anonymous');

  const command = `${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome success --session-id repo-special-1`;
  run(command);

  const written = readJsonl();
  expect(written).toHaveLength(1);

  const record = JSON.parse(written[0]);
  const slug = record._repo_slug;
  expect(slug).toBeDefined();
  // The logger is expected to strip double quotes from this field (json_safe, per the original note)
  expect(slug).not.toContain('"');
});
|
||||
});
|
||||
|
||||
describe('.pending marker', () => {
|
||||
@@ -244,16 +348,32 @@ describe('gstack-analytics', () => {
|
||||
});
|
||||
|
||||
// Tests for the gstack-telemetry-sync binary and the JSONL field names it consumes.
// This span contained both sides of a diff (GSTACK_TELEMETRY_ENDPOINT and
// GSTACK_SUPABASE_URL variants of the same lines); only the added,
// GSTACK_SUPABASE_URL side is kept so each test has one header and one `result`.
describe('gstack-telemetry-sync', () => {
  test('exits silently with no Supabase URL configured', () => {
    // Default: GSTACK_SUPABASE_URL is not set → exit 0
    const result = run(`${BIN}/gstack-telemetry-sync`);
    expect(result).toBe('');
  });

  test('exits silently with no JSONL file', () => {
    // URL is configured, but no events have been logged in this test
    const result = run(`${BIN}/gstack-telemetry-sync`, { GSTACK_SUPABASE_URL: 'http://localhost:9999' });
    expect(result).toBe('');
  });

  test('does not rename JSONL field names (edge function expects raw names)', () => {
    setConfig('telemetry', 'anonymous');
    run(`${BIN}/gstack-telemetry-log --skill qa --duration 60 --outcome success --session-id raw-fields-1`);

    const events = parseJsonl();
    expect(events).toHaveLength(1);

    // Edge function expects these raw field names, NOT Postgres column names
    expect(events[0]).toHaveProperty('v');
    expect(events[0]).toHaveProperty('ts');
    expect(events[0]).toHaveProperty('sessions');

    // Should NOT have Postgres column names
    expect(events[0]).not.toHaveProperty('schema_version');
    expect(events[0]).not.toHaveProperty('event_timestamp');
    expect(events[0]).not.toHaveProperty('concurrent_sessions');
  });
});
|
||||
|
||||
describe('gstack-community-dashboard', () => {
|
||||
|
||||
+48
-6
@@ -13,6 +13,7 @@ import {
|
||||
selectTests,
|
||||
detectBaseBranch,
|
||||
E2E_TOUCHFILES,
|
||||
E2E_TIERS,
|
||||
LLM_JUDGE_TOUCHFILES,
|
||||
GLOBAL_TOUCHFILES,
|
||||
} from './helpers/touchfiles';
|
||||
@@ -79,8 +80,10 @@ describe('selectTests', () => {
|
||||
expect(result.selected).toContain('plan-ceo-review');
|
||||
expect(result.selected).toContain('plan-ceo-review-selective');
|
||||
expect(result.selected).toContain('plan-ceo-review-benefits');
|
||||
expect(result.selected.length).toBe(3);
|
||||
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 3);
|
||||
expect(result.selected).toContain('autoplan-core');
|
||||
expect(result.selected).toContain('codex-offered-ceo-review');
|
||||
expect(result.selected.length).toBe(5);
|
||||
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 5);
|
||||
});
|
||||
|
||||
test('global touchfile triggers ALL tests', () => {
|
||||
@@ -90,10 +93,19 @@ describe('selectTests', () => {
|
||||
expect(result.reason).toContain('global');
|
||||
});
|
||||
|
||||
// Changing scripts/gen-skill-docs.ts must select only the E2E tests that list it
// in their touchfiles. The superseded "global touchfile" header and assertions
// that were interleaved in this span are dropped; they contradicted the checks below.
test('gen-skill-docs.ts is a scoped touchfile, not global', () => {
  const result = selectTests(['scripts/gen-skill-docs.ts'], E2E_TOUCHFILES);

  // Should select tests that list gen-skill-docs.ts in their touchfiles, not ALL tests
  expect(result.selected.length).toBeGreaterThan(0);
  expect(result.selected.length).toBeLessThan(Object.keys(E2E_TOUCHFILES).length);
  expect(result.reason).toBe('diff');

  // Should include tests that depend on gen-skill-docs.ts
  expect(result.selected).toContain('skillmd-setup-discovery');
  expect(result.selected).toContain('contributor-mode');
  expect(result.selected).toContain('journey-ideation');

  // Should NOT include tests that don't depend on it
  expect(result.selected).not.toContain('retro');
  expect(result.selected).not.toContain('cso-full-audit');
});
|
||||
|
||||
test('unrelated file selects nothing', () => {
|
||||
@@ -142,7 +154,7 @@ describe('selectTests', () => {
|
||||
});
|
||||
|
||||
// A global touchfile must select every LLM-judge test. The span held two
// `const result` declarations (both sides of a diff); the added one, using
// test/helpers/session-runner.ts as the changed file, is kept.
test('global touchfiles work for LLM-judge tests too', () => {
  const result = selectTests(['test/helpers/session-runner.ts'], LLM_JUDGE_TOUCHFILES);
  expect(result.selected.length).toBe(Object.keys(LLM_JUDGE_TOUCHFILES).length);
});
|
||||
});
|
||||
@@ -232,6 +244,36 @@ describe('TOUCHFILES completeness', () => {
|
||||
}
|
||||
});
|
||||
|
||||
// E2E_TIERS and E2E_TOUCHFILES must have identical key sets; the error names
// the offending keys and tells the developer which table to edit.
test('E2E_TIERS covers exactly the same tests as E2E_TOUCHFILES', () => {
  const touchfileNames = Object.keys(E2E_TOUCHFILES);
  const tierNames = Object.keys(E2E_TIERS);
  const knownTouchfiles = new Set(touchfileNames);
  const knownTiers = new Set(tierNames);

  // Both differences are computed up front; the "missing" case is reported first
  const missingFromTiers = touchfileNames.filter((name) => !knownTiers.has(name));
  const extraInTiers = tierNames.filter((name) => !knownTouchfiles.has(name));

  if (missingFromTiers.length > 0) {
    const message =
      `E2E tests missing TIER entries: ${missingFromTiers.join(', ')}\n` +
      `Add these to E2E_TIERS in test/helpers/touchfiles.ts`;
    throw new Error(message);
  }

  if (extraInTiers.length > 0) {
    const message =
      `E2E_TIERS has extra entries not in E2E_TOUCHFILES: ${extraInTiers.join(', ')}\n` +
      `Remove these from E2E_TIERS or add to E2E_TOUCHFILES`;
    throw new Error(message);
  }
});
|
||||
|
||||
// Every value in E2E_TIERS must be 'gate' or 'periodic'; the first offender is reported.
test('E2E_TIERS only contains valid tier values', () => {
  const validTiers = ['gate', 'periodic'];

  const offender = Object.entries(E2E_TIERS).find(([, tier]) => !validTiers.includes(tier));
  if (offender !== undefined) {
    const [name, tier] = offender;
    throw new Error(`E2E_TIERS['${name}'] has invalid tier '${tier}'. Valid: ${validTiers.join(', ')}`);
  }
});
|
||||
|
||||
test('every LLM-judge test has a TOUCHFILES entry', () => {
|
||||
const llmContent = fs.readFileSync(
|
||||
path.join(ROOT, 'test', 'skill-llm-eval.test.ts'),
|
||||
|
||||
@@ -0,0 +1,165 @@
|
||||
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
const UNINSTALL = path.join(ROOT, 'bin', 'gstack-uninstall');

// Tests for bin/gstack-uninstall: flag handling first, then runs against a mock
// HOME / git repo layout built in a temp directory.
describe('gstack-uninstall', () => {
  test('syntax check passes', () => {
    // bash -n parses the script without running it
    const result = spawnSync('bash', ['-n', UNINSTALL], { stdio: 'pipe' });
    expect(result.status).toBe(0);
  });

  test('--help prints usage and exits 0', () => {
    const result = spawnSync('bash', [UNINSTALL, '--help'], { stdio: 'pipe' });
    expect(result.status).toBe(0);
    const output = result.stdout.toString();
    expect(output).toContain('gstack-uninstall');
    expect(output).toContain('--force');
    expect(output).toContain('--keep-state');
  });

  test('unknown flag exits with error', () => {
    const result = spawnSync('bash', [UNINSTALL, '--bogus'], {
      stdio: 'pipe',
      env: { ...process.env, HOME: '/nonexistent' },
    });
    expect(result.status).toBe(1);
    expect(result.stderr.toString()).toContain('Unknown option');
  });

  describe('integration tests with mock layout', () => {
    let tmpDir: string;
    let mockHome: string;
    let mockGitRoot: string;

    /** Path of an entry inside the mock `~/.claude/skills` directory. */
    const skillPath = (name: string): string => path.join(mockHome, '.claude', 'skills', name);

    /**
     * True when a directory entry exists at `p`, WITHOUT following symlinks.
     *
     * fs.existsSync() follows symlinks and reports false for a dangling link.
     * The per-skill links in this fixture point at targets that are never
     * created, so existsSync() would report them "gone" whether or not the
     * uninstaller removed them. lstat inspects the link itself.
     */
    const entryExists = (p: string): boolean => {
      try {
        fs.lstatSync(p);
        return true;
      } catch (err: unknown) {
        if ((err as NodeJS.ErrnoException).code === 'ENOENT') return false;
        throw err;
      }
    };

    /**
     * Run the uninstall script from inside the mock git repo.
     * @param args      flags passed to the script
     * @param home      directory used as HOME; state dir is `<home>/.gstack`
     * @param gstackDir value for GSTACK_DIR
     */
    const runUninstall = (args: string[], home: string, gstackDir: string) =>
      spawnSync('bash', [UNINSTALL, ...args], {
        stdio: 'pipe',
        env: {
          ...process.env,
          HOME: home,
          GSTACK_DIR: gstackDir,
          GSTACK_STATE_DIR: path.join(home, '.gstack'),
        },
        cwd: mockGitRoot,
      });

    beforeEach(() => {
      tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-uninstall-test-'));
      mockHome = path.join(tmpDir, 'home');
      mockGitRoot = path.join(tmpDir, 'repo');

      // Create mock gstack install layout
      fs.mkdirSync(skillPath('gstack'), { recursive: true });
      fs.writeFileSync(path.join(skillPath('gstack'), 'SKILL.md'), 'test');

      // Create per-skill symlinks (both old unprefixed and new prefixed)
      fs.symlinkSync('gstack/review', skillPath('review'));
      fs.symlinkSync('gstack/ship', skillPath('gstack-ship'));

      // Create a non-gstack skill directory (should NOT be removed)
      fs.mkdirSync(skillPath('other-tool'), { recursive: true });

      // Create state directory
      fs.mkdirSync(path.join(mockHome, '.gstack', 'projects'), { recursive: true });
      fs.writeFileSync(path.join(mockHome, '.gstack', 'config.json'), '{}');

      // Create mock git repo
      fs.mkdirSync(mockGitRoot, { recursive: true });
      spawnSync('git', ['init', '-b', 'main'], { cwd: mockGitRoot, stdio: 'pipe' });
    });

    afterEach(() => {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    });

    test('--force removes global Claude skills and state', () => {
      const result = runUninstall(['--force'], mockHome, skillPath('gstack'));

      expect(result.status).toBe(0);
      const output = result.stdout.toString();
      expect(output).toContain('gstack uninstalled');

      // Global skill dir should be removed
      expect(fs.existsSync(skillPath('gstack'))).toBe(false);

      // Per-skill symlinks pointing into gstack/ should be removed (checked via lstat)
      expect(entryExists(skillPath('review'))).toBe(false);
      expect(entryExists(skillPath('gstack-ship'))).toBe(false);

      // Non-gstack tool should still exist
      expect(fs.existsSync(skillPath('other-tool'))).toBe(true);

      // State should be removed
      expect(fs.existsSync(path.join(mockHome, '.gstack'))).toBe(false);
    });

    test('--keep-state preserves state directory', () => {
      const result = runUninstall(['--force', '--keep-state'], mockHome, skillPath('gstack'));

      expect(result.status).toBe(0);

      // Skills should be removed
      expect(fs.existsSync(skillPath('gstack'))).toBe(false);

      // State should still exist
      expect(fs.existsSync(path.join(mockHome, '.gstack'))).toBe(true);
      expect(fs.existsSync(path.join(mockHome, '.gstack', 'config.json'))).toBe(true);
    });

    test('clean system outputs nothing to remove', () => {
      const cleanHome = path.join(tmpDir, 'clean-home');
      fs.mkdirSync(cleanHome, { recursive: true });

      const result = runUninstall(['--force'], cleanHome, path.join(cleanHome, 'nonexistent'));

      expect(result.status).toBe(0);
      expect(result.stdout.toString()).toContain('Nothing to remove');
    });

    test('upgrade path: prefixed install + uninstall cleans both old and new symlinks', () => {
      // Simulate the state after setup --no-prefix followed by setup (with prefix):
      // both old unprefixed and new prefixed symlinks exist
      // (mockHome already has both 'review' and 'gstack-ship' symlinks)
      const result = runUninstall(['--force'], mockHome, skillPath('gstack'));

      expect(result.status).toBe(0);

      // Both old (review) and new (gstack-ship) symlinks should be gone (checked via lstat)
      expect(entryExists(skillPath('review'))).toBe(false);
      expect(entryExists(skillPath('gstack-ship'))).toBe(false);

      // Non-gstack should survive
      expect(fs.existsSync(skillPath('other-tool'))).toBe(true);
    });
  });
});
|
||||
@@ -0,0 +1,271 @@
|
||||
/**
|
||||
* Unit tests for WorktreeManager.
|
||||
*
|
||||
* Tests worktree lifecycle: create, harvest, dedup, cleanup, prune.
|
||||
* Each test creates real git worktrees in a temporary repo.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, afterEach } from 'bun:test';
|
||||
import { WorktreeManager } from '../lib/worktree';
|
||||
import type { HarvestResult } from '../lib/worktree';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
/**
 * Create a minimal git repo in a tmpdir for testing.
 *
 * The repo gets one commit containing README.md and .gitignore, plus two
 * gitignored, uncommitted artifact trees (`.agents/skills/` and `browse/dist/`).
 *
 * @returns path of the new repository
 * @throws Error if any git command cannot be started or exits non-zero; the
 *         temp directory is removed before the error propagates
 */
function createTestRepo(): string {
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'worktree-test-'));

  // Run one git command in the repo and fail loudly: a silently failed
  // init/commit would leave a repo without HEAD and surface later as an
  // unrelated-looking error.
  const git = (...args: string[]): void => {
    const res = spawnSync('git', args, { cwd: dir, stdio: 'pipe' });
    if (res.error !== undefined || res.status !== 0) {
      const detail = res.error?.message ?? res.stderr?.toString().trim() ?? '';
      throw new Error(`git ${args.join(' ')} failed in ${dir}: ${detail}`);
    }
  };

  try {
    git('init');
    git('config', 'user.email', 'test@test.com');
    git('config', 'user.name', 'Test');
    // Keep the commit below independent of a global commit-signing setup
    git('config', 'commit.gpgsign', 'false');

    // Create initial commit so HEAD exists
    fs.writeFileSync(path.join(dir, 'README.md'), '# Test repo\n');
    // Add .gitignore matching real repo (so copied build artifacts don't appear as changes)
    fs.writeFileSync(path.join(dir, '.gitignore'), '.agents/\nbrowse/dist/\n.gstack-worktrees/\n');
    // Create a .agents directory (simulating gitignored build artifacts)
    fs.mkdirSync(path.join(dir, '.agents', 'skills'), { recursive: true });
    fs.writeFileSync(path.join(dir, '.agents', 'skills', 'test-skill.md'), '# Test skill\n');
    // Create browse/dist (simulating build artifacts)
    fs.mkdirSync(path.join(dir, 'browse', 'dist'), { recursive: true });
    fs.writeFileSync(path.join(dir, 'browse', 'dist', 'browse'), '#!/bin/sh\necho browse\n');

    git('add', 'README.md', '.gitignore');
    git('commit', '-m', 'Initial commit');
  } catch (err: unknown) {
    // Don't leak the half-built temp directory
    fs.rmSync(dir, { recursive: true, force: true });
    throw err;
  }

  return dir;
}
|
||||
|
||||
/**
 * Delete a test repo directory.
 *
 * `git worktree prune` runs first to avoid git lock issues; its result is not
 * inspected. The directory is then removed recursively with `force`, so a
 * path that is already gone is not an error.
 */
function cleanupRepo(dir: string): void {
  const pruneArgs = ['worktree', 'prune'];
  spawnSync('git', pruneArgs, { cwd: dir, stdio: 'pipe' });

  const removal = { recursive: true, force: true };
  fs.rmSync(dir, removal);
}
|
||||
|
||||
// Repos created by the running test; emptied by the afterEach hook below
const repos: string[] = [];

// Dedup index path — deleted after each test to avoid cross-test contamination.
// NOTE(review): this resolves under the real home directory (os.homedir()), so
// running the suite removes that file if it exists — confirm that is acceptable.
const DEDUP_PATH = path.join(os.homedir(), '.gstack-dev', 'harvests', 'dedup.json');

afterEach(() => {
  // splice(0) empties the tracking array and returns what it held
  for (const repo of repos.splice(0)) {
    try {
      cleanupRepo(repo);
    } catch {
      /* best effort */
    }
  }

  // Clear dedup index so tests are independent
  try {
    fs.unlinkSync(DEDUP_PATH);
  } catch {
    /* may not exist */
  }
});
|
||||
|
||||
// Lifecycle tests for WorktreeManager. Every test builds its own temporary repo;
// repos are registered in `repos` so the afterEach hook removes them.
describe('WorktreeManager', () => {
  /** Create a throwaway repo and register it for afterEach cleanup. */
  const freshRepo = (): string => {
    const repo = createTestRepo();
    repos.push(repo);
    return repo;
  };

  /** Manager bound to a brand-new throwaway repo. */
  const freshManager = (): WorktreeManager => new WorktreeManager(freshRepo());

  test('create() produces a valid worktree at the expected path', () => {
    const mgr = freshManager();

    const wt = mgr.create('test-1');

    expect(fs.existsSync(wt)).toBe(true);
    expect(fs.existsSync(path.join(wt, 'README.md'))).toBe(true);
    for (const segment of ['.gstack-worktrees', 'test-1']) {
      expect(wt).toContain(segment);
    }

    mgr.cleanup('test-1');
  });

  test('create() worktree has .agents/skills/ (gitignored artifacts copied)', () => {
    const mgr = freshManager();

    const wt = mgr.create('test-agents');

    const skillCopy = path.join(wt, '.agents', 'skills', 'test-skill.md');
    const browseCopy = path.join(wt, 'browse', 'dist', 'browse');
    expect(fs.existsSync(skillCopy)).toBe(true);
    expect(fs.existsSync(browseCopy)).toBe(true);

    mgr.cleanup('test-agents');
  });

  test('create() stores correct originalSha', () => {
    const repo = freshRepo();
    const mgr = new WorktreeManager(repo);

    // HEAD of the source repo, read before the worktree is created
    const head = spawnSync('git', ['rev-parse', 'HEAD'], { cwd: repo, stdio: 'pipe' });
    const expectedSha = head.stdout.toString().trim();

    mgr.create('test-sha');

    const info = mgr.getInfo('test-sha');
    expect(info).toBeDefined();
    expect(info!.originalSha).toBe(expectedSha);

    mgr.cleanup('test-sha');
  });

  test('harvest() captures modifications to tracked files', () => {
    const mgr = freshManager();
    const wt = mgr.create('test-harvest-mod');

    // Overwrite a tracked file inside the worktree
    fs.writeFileSync(path.join(wt, 'README.md'), '# Modified!\n');

    const harvested = mgr.harvest('test-harvest-mod');

    expect(harvested).not.toBeNull();
    expect(harvested!.changedFiles).toContain('README.md');
    expect(harvested!.isDuplicate).toBe(false);
    expect(harvested!.patchPath).toBeTruthy();
    expect(fs.existsSync(harvested!.patchPath)).toBe(true);

    mgr.cleanup('test-harvest-mod');
  });

  test('harvest() captures new untracked files (git add -A path)', () => {
    const mgr = freshManager();
    const wt = mgr.create('test-harvest-new');

    // Drop a brand-new file into the worktree
    fs.writeFileSync(path.join(wt, 'new-file.txt'), 'Hello from agent\n');

    const harvested = mgr.harvest('test-harvest-new');

    expect(harvested).not.toBeNull();
    expect(harvested!.changedFiles).toContain('new-file.txt');

    mgr.cleanup('test-harvest-new');
  });

  test('harvest() captures committed changes (git diff originalSha)', () => {
    const mgr = freshManager();
    const wt = mgr.create('test-harvest-commit');

    // Commit inside the worktree (simulating an agent running git commit)
    const inWorktree = { cwd: wt, stdio: 'pipe' as const };
    fs.writeFileSync(path.join(wt, 'committed.txt'), 'Agent committed this\n');
    spawnSync('git', ['add', 'committed.txt'], inWorktree);
    spawnSync('git', ['commit', '-m', 'Agent commit'], inWorktree);

    const harvested = mgr.harvest('test-harvest-commit');

    expect(harvested).not.toBeNull();
    expect(harvested!.changedFiles).toContain('committed.txt');

    mgr.cleanup('test-harvest-commit');
  });

  test('harvest() returns null when worktree is clean', () => {
    const mgr = freshManager();
    mgr.create('test-harvest-clean');

    // Nothing is touched between create() and harvest()
    const harvested = mgr.harvest('test-harvest-clean');

    expect(harvested).toBeNull();

    mgr.cleanup('test-harvest-clean');
  });

  test('harvest() dedup skips identical patches', () => {
    const repo = freshRepo();

    // First run
    const firstMgr = new WorktreeManager(repo);
    const firstWt = firstMgr.create('test-dedup-1');
    fs.writeFileSync(path.join(firstWt, 'dedup-test.txt'), 'same content\n');
    const first = firstMgr.harvest('test-dedup-1');
    firstMgr.cleanup('test-dedup-1');

    expect(first).not.toBeNull();
    expect(first!.isDuplicate).toBe(false);

    // Second run: a separate manager makes the same change
    const secondMgr = new WorktreeManager(repo);
    const secondWt = secondMgr.create('test-dedup-2');
    fs.writeFileSync(path.join(secondWt, 'dedup-test.txt'), 'same content\n');
    const second = secondMgr.harvest('test-dedup-2');
    secondMgr.cleanup('test-dedup-2');

    expect(second).not.toBeNull();
    expect(second!.isDuplicate).toBe(true);
  });

  test('cleanup() removes worktree directory', () => {
    const mgr = freshManager();

    const wt = mgr.create('test-cleanup');
    expect(fs.existsSync(wt)).toBe(true);

    mgr.cleanup('test-cleanup');
    expect(fs.existsSync(wt)).toBe(false);
  });

  test('pruneStale() removes orphaned worktrees from previous runs', () => {
    const repo = freshRepo();

    // A worktree made by an earlier manager stands in for a previous run
    const oldMgr = new WorktreeManager(repo);
    const oldPath = oldMgr.create('stale-test');
    const oldRunDir = path.dirname(oldPath);
    expect(fs.existsSync(oldPath)).toBe(true);

    // Remove the worktree via git, then recreate the bare directory so it is
    // orphaned on disk (simulating a crash)
    spawnSync('git', ['worktree', 'remove', '--force', oldPath], { cwd: repo, stdio: 'pipe' });
    fs.mkdirSync(oldPath, { recursive: true });

    // A new manager should prune the old run's directory
    const newMgr = new WorktreeManager(repo);
    newMgr.pruneStale();

    expect(fs.existsSync(oldRunDir)).toBe(false);
  });

  test('create() throws on failure (no silent fallback to ROOT)', () => {
    const mgr = freshManager();

    // The second create() of the same name is expected to fail
    mgr.create('test-fail');
    const createAgain = (): string => mgr.create('test-fail');
    expect(createAgain).toThrow();

    mgr.cleanup('test-fail');
  });

  test('harvest() returns null gracefully when worktree dir was deleted by agent', () => {
    const mgr = freshManager();
    const wt = mgr.create('test-deleted');

    // Simulate an agent deleting its own worktree directory
    fs.rmSync(wt, { recursive: true, force: true });

    // harvest() must report "nothing" instead of throwing
    const harvested = mgr.harvest('test-deleted');
    expect(harvested).toBeNull();

    // cleanup() must also be non-fatal
    mgr.cleanup('test-deleted');
  });
});
|
||||
Reference in New Issue
Block a user