Card Title
+Some content here with tight line height.
+Another Card
+Different spacing and colors for no reason.
+
+
+
+
+`);
+
+ // Init git repo with clean working tree
+ const { spawnSync } = require('child_process');
+ const run = (cmd: string, args: string[]) =>
+ spawnSync(cmd, args, { cwd: qaFixDir, stdio: 'pipe', timeout: 5000 });
+
+ run('git', ['init', '-b', 'main']);
+ run('git', ['config', 'user.email', 'test@test.com']);
+ run('git', ['config', 'user.name', 'Test']);
+ run('git', ['add', '.']);
+ run('git', ['commit', '-m', 'initial commit']);
+
+ // Start a local server serving from the working directory so fixes are reflected on refresh
+ qaFixServer = Bun.serve({
+ port: 0,
+ hostname: '127.0.0.1',
+ fetch(req) {
+ const url = new URL(req.url);
+ let filePath = url.pathname === '/' ? '/index.html' : url.pathname;
+ filePath = filePath.replace(/^\//, '');
+ const fullPath = path.join(qaFixDir, filePath);
+ if (!fs.existsSync(fullPath)) {
+ return new Response('Not Found', { status: 404 });
+ }
+ const content = fs.readFileSync(fullPath, 'utf-8');
+ return new Response(content, {
+ headers: { 'Content-Type': 'text/html' },
+ });
+ },
+ });
+ });
+
+ afterAll(() => {
+ qaFixServer?.stop();
+ try { fs.rmSync(qaFixDir, { recursive: true, force: true }); } catch {}
+ });
+
+ test('/qa fix loop finds bugs and commits fixes', async () => {
+ const qaFixUrl = `http://127.0.0.1:${qaFixServer!.port}`;
+
+ const result = await runSkillTest({
+ prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
+
+Read the file qa/SKILL.md for the QA workflow instructions.
+
+Run a Quick-tier QA test on ${qaFixUrl}
+The source code for this page is at ${qaFixDir}/index.html — you can fix bugs there.
+Do NOT use AskUserQuestion — run Quick tier directly.
+Write your report to ${qaFixDir}/qa-reports/qa-report.md
+
+This is a test+fix loop: find bugs, fix them in the source code, commit each fix, and re-verify.`,
+ workingDirectory: qaFixDir,
+ maxTurns: 40,
+ allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob', 'Grep'],
+ timeout: 300_000,
+ testName: 'qa-fix-loop',
+ runId,
+ });
+
+ logCost('/qa fix loop', result);
+ recordE2E('/qa fix loop', 'QA Fix Loop E2E', result, {
+ passed: ['success', 'error_max_turns'].includes(result.exitReason),
+ });
+
+ // Accept error_max_turns — fix loop may use many turns
+ expect(['success', 'error_max_turns']).toContain(result.exitReason);
+
+ // Verify at least one fix commit was made beyond the initial commit
+ const gitLog = spawnSync('git', ['log', '--oneline'], {
+ cwd: qaFixDir, stdio: 'pipe',
+ });
+ const commits = gitLog.stdout.toString().trim().split('\n');
+ console.log(`/qa fix loop: ${commits.length} commits total (1 initial + ${commits.length - 1} fixes)`);
+ expect(commits.length).toBeGreaterThan(1);
+
+ // Verify Edit tool was used (agent actually modified source code)
+ const editCalls = result.toolCalls.filter(tc => tc.tool === 'Edit');
+ expect(editCalls.length).toBeGreaterThan(0);
+ }, 360_000);
+});
+
+// --- Plan-Eng-Review Test-Plan Artifact E2E ---
+
+/**
+ * E2E suite: runs /plan-eng-review against a throwaway git repo (an initial
+ * commit on main, a feature branch adding dashboard.ts, and a plan.md) and
+ * checks that the run leaves a new "test-plan" file in
+ * ~/.gstack/projects/test-project.
+ */
+describeIfSelected('Plan-Eng-Review Test-Plan Artifact E2E', ['plan-eng-review-artifact'], () => {
+ let planDir: string;
+ let projectDir: string;
+
+ beforeAll(() => {
+ planDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-artifact-'));
+ // Uses the spawnSync already in file scope (other suites in this file call it
+ // directly), so no local require('child_process') is needed here.
+ const run = (cmd: string, args: string[]) =>
+ spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });
+
+ run('git', ['init', '-b', 'main']);
+ run('git', ['config', 'user.email', 'test@test.com']);
+ run('git', ['config', 'user.name', 'Test']);
+
+ // Create base commit on main
+ fs.writeFileSync(path.join(planDir, 'app.ts'), 'export function greet() { return "hello"; }\n');
+ run('git', ['add', '.']);
+ run('git', ['commit', '-m', 'initial']);
+
+ // Create feature branch with changes
+ run('git', ['checkout', '-b', 'feature/add-dashboard']);
+ fs.writeFileSync(path.join(planDir, 'dashboard.ts'), `export function Dashboard() {
+ const data = fetchStats();
+ return { users: data.users, revenue: data.revenue };
+}
+function fetchStats() {
+ return fetch('/api/stats').then(r => r.json());
+}
+`);
+ fs.writeFileSync(path.join(planDir, 'app.ts'), `import { Dashboard } from "./dashboard";
+export function greet() { return "hello"; }
+export function main() { return Dashboard(); }
+`);
+ run('git', ['add', '.']);
+ run('git', ['commit', '-m', 'feat: add dashboard']);
+
+ // Plan document
+ fs.writeFileSync(path.join(planDir, 'plan.md'), `# Plan: Add Dashboard
+
+## Changes
+1. New \`dashboard.ts\` with Dashboard component and fetchStats API call
+2. Updated \`app.ts\` to import and use Dashboard
+
+## Architecture
+- Dashboard fetches from \`/api/stats\` endpoint
+- Returns user count and revenue metrics
+`);
+ run('git', ['add', 'plan.md']);
+ run('git', ['commit', '-m', 'add plan']);
+
+ // Copy plan-eng-review skill
+ fs.mkdirSync(path.join(planDir, 'plan-eng-review'), { recursive: true });
+ fs.copyFileSync(
+ path.join(ROOT, 'plan-eng-review', 'SKILL.md'),
+ path.join(planDir, 'plan-eng-review', 'SKILL.md'),
+ );
+
+ // Set up remote-slug shim and browse shims (plan-eng-review uses remote-slug for artifact path)
+ setupBrowseShims(planDir);
+
+ // Create project directory for artifacts.
+ // NOTE(review): presumably 'test-project' is the slug the remote-slug shim reports — confirm in setupBrowseShims.
+ projectDir = path.join(os.homedir(), '.gstack', 'projects', 'test-project');
+ fs.mkdirSync(projectDir, { recursive: true });
+ });
+
+ afterAll(() => {
+ try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
+ // Clean up test-plan artifacts (but not the project dir itself)
+ try {
+ const files = fs.readdirSync(projectDir);
+ for (const f of files) {
+ if (f.includes('test-plan')) {
+ fs.unlinkSync(path.join(projectDir, f));
+ }
+ }
+ } catch {}
+ });
+
+ test('/plan-eng-review writes test-plan artifact to ~/.gstack/projects/', async () => {
+ // Snapshot existing test-plan files so only files created by this run count as new
+ const beforeFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan'));
+
+ const result = await runSkillTest({
+ prompt: `Read plan-eng-review/SKILL.md for the review workflow.
+
+Read plan.md — that's the plan to review. This is a standalone plan with source code in app.ts and dashboard.ts.
+
+Proceed directly to the full review. Skip any AskUserQuestion calls — this is non-interactive.
+
+IMPORTANT: After your review, you MUST write the test-plan artifact as described in the "Test Plan Artifact" section of SKILL.md. The remote-slug shim is at ${planDir}/browse/bin/remote-slug.
+
+Write your review to ${planDir}/review-output.md`,
+ workingDirectory: planDir,
+ maxTurns: 20,
+ allowedTools: ['Bash', 'Read', 'Write', 'Glob', 'Grep'],
+ timeout: 360_000,
+ testName: 'plan-eng-review-artifact',
+ runId,
+ });
+
+ logCost('/plan-eng-review artifact', result);
+ recordE2E('/plan-eng-review test-plan artifact', 'Plan-Eng-Review Test-Plan Artifact E2E', result, {
+ passed: ['success', 'error_max_turns'].includes(result.exitReason),
+ });
+
+ expect(['success', 'error_max_turns']).toContain(result.exitReason);
+
+ // Verify test-plan artifact was written
+ const afterFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan'));
+ const newFiles = afterFiles.filter(f => !beforeFiles.includes(f));
+ console.log(`Test-plan artifacts: ${beforeFiles.length} before, ${afterFiles.length} after, ${newFiles.length} new`);
+
+ if (newFiles.length > 0) {
+ const content = fs.readFileSync(path.join(projectDir, newFiles[0]), 'utf-8');
+ console.log(`Test-plan artifact (${newFiles[0]}): ${content.length} chars`);
+ expect(content.length).toBeGreaterThan(50);
+ } else {
+ console.warn('No test-plan artifact found — agent may not have followed artifact instructions');
+ }
+
+ // Hard assertion: the warning above is diagnostic only — a run that produced
+ // no new artifact fails the test here.
+ expect(newFiles.length).toBeGreaterThanOrEqual(1);
+ }, 420_000);
+});
+
+// --- Base branch detection smoke tests ---
+
+/**
+ * Smoke tests for base/default-branch detection in the review, ship and retro
+ * skills. Each test builds its own local-only git repo (no remote) under a
+ * shared temp directory and tells the agent to fall back to "main".
+ *
+ * Every fixture repo is created with `git init -b main` so that a branch named
+ * "main" exists regardless of the machine's init.defaultBranch setting.
+ */
+describeIfSelected('Base branch detection', ['review-base-branch', 'ship-base-branch', 'retro-base-branch'], () => {
+ let baseBranchDir: string;
+ // Runs a command synchronously in `cwd`; exit status is not checked.
+ const run = (cmd: string, args: string[], cwd: string) =>
+ spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });
+
+ beforeAll(() => {
+ baseBranchDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-basebranch-'));
+ });
+
+ afterAll(() => {
+ try { fs.rmSync(baseBranchDir, { recursive: true, force: true }); } catch {}
+ });
+
+ testIfSelected('review-base-branch', async () => {
+ const dir = path.join(baseBranchDir, 'review-base');
+ fs.mkdirSync(dir, { recursive: true });
+
+ // Create git repo with a feature branch off main
+ run('git', ['init', '-b', 'main'], dir);
+ run('git', ['config', 'user.email', 'test@test.com'], dir);
+ run('git', ['config', 'user.name', 'Test'], dir);
+
+ fs.writeFileSync(path.join(dir, 'app.rb'), '# clean base\nclass App\nend\n');
+ run('git', ['add', 'app.rb'], dir);
+ run('git', ['commit', '-m', 'initial commit'], dir);
+
+ // Create feature branch with a change
+ run('git', ['checkout', '-b', 'feature/test-review'], dir);
+ fs.writeFileSync(path.join(dir, 'app.rb'), '# clean base\nclass App\n def hello; "world"; end\nend\n');
+ run('git', ['add', 'app.rb'], dir);
+ run('git', ['commit', '-m', 'feat: add hello method'], dir);
+
+ // Copy review skill files
+ fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(dir, 'review-SKILL.md'));
+ fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(dir, 'review-checklist.md'));
+ fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(dir, 'review-greptile-triage.md'));
+
+ const result = await runSkillTest({
+ prompt: `You are in a git repo on a feature branch with changes.
+Read review-SKILL.md for the review workflow instructions.
+Also read review-checklist.md and apply it.
+
+IMPORTANT: Follow Step 0 to detect the base branch. Since there is no remote, gh commands will fail — fall back to main.
+Then run the review against the detected base branch.
+Write your findings to ${dir}/review-output.md`,
+ workingDirectory: dir,
+ maxTurns: 15,
+ timeout: 90_000,
+ testName: 'review-base-branch',
+ runId,
+ });
+
+ logCost('/review base-branch', result);
+ recordE2E('/review base branch detection', 'Base branch detection', result);
+ expect(result.exitReason).toBe('success');
+
+ // The agent should have run git diff against main (the fallback)
+ // NOTE(review): this only matches when tc.input is a string — confirm the tool-call input shape.
+ const usedGitDiff = result.toolCalls.some(tc =>
+ tc.tool === 'Bash' && typeof tc.input === 'string' && tc.input.includes('git diff')
+ );
+ expect(usedGitDiff).toBe(true);
+ }, 120_000);
+
+ testIfSelected('ship-base-branch', async () => {
+ const dir = path.join(baseBranchDir, 'ship-base');
+ fs.mkdirSync(dir, { recursive: true });
+
+ // Create git repo with feature branch
+ run('git', ['init', '-b', 'main'], dir);
+ run('git', ['config', 'user.email', 'test@test.com'], dir);
+ run('git', ['config', 'user.name', 'Test'], dir);
+
+ fs.writeFileSync(path.join(dir, 'app.ts'), 'console.log("v1");\n');
+ run('git', ['add', 'app.ts'], dir);
+ run('git', ['commit', '-m', 'initial'], dir);
+
+ run('git', ['checkout', '-b', 'feature/ship-test'], dir);
+ fs.writeFileSync(path.join(dir, 'app.ts'), 'console.log("v2");\n');
+ run('git', ['add', 'app.ts'], dir);
+ run('git', ['commit', '-m', 'feat: update to v2'], dir);
+
+ // Copy ship skill
+ fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dir, 'ship-SKILL.md'));
+
+ const result = await runSkillTest({
+ prompt: `Read ship-SKILL.md for the ship workflow.
+
+Run ONLY Step 0 (Detect base branch) and Step 1 (Pre-flight) from the ship workflow.
+Since there is no remote, gh commands will fail — fall back to main.
+
+After completing Step 0 and Step 1, STOP. Do NOT proceed to Step 2 or beyond.
+Do NOT push, create PRs, or modify VERSION/CHANGELOG.
+
+Write a summary of what you detected to ${dir}/ship-preflight.md including:
+- The detected base branch name
+- The current branch name
+- The diff stat against the base branch`,
+ workingDirectory: dir,
+ maxTurns: 10,
+ timeout: 60_000,
+ testName: 'ship-base-branch',
+ runId,
+ });
+
+ logCost('/ship base-branch', result);
+ recordE2E('/ship base branch detection', 'Base branch detection', result);
+ expect(result.exitReason).toBe('success');
+
+ // Check the preflight summary when present; a missing file is only warned about
+ const preflightPath = path.join(dir, 'ship-preflight.md');
+ if (fs.existsSync(preflightPath)) {
+ const content = fs.readFileSync(preflightPath, 'utf-8');
+ expect(content.length).toBeGreaterThan(20);
+ // Should mention the branch name
+ expect(content.toLowerCase()).toMatch(/main|base/);
+ } else {
+ console.warn('ship-preflight.md not found — agent may not have written the preflight summary');
+ }
+
+ // Verify no destructive actions — no push, no PR creation
+ const destructiveTools = result.toolCalls.filter(tc =>
+ tc.tool === 'Bash' && typeof tc.input === 'string' &&
+ (tc.input.includes('git push') || tc.input.includes('gh pr create'))
+ );
+ expect(destructiveTools).toHaveLength(0);
+ }, 90_000);
+
+ testIfSelected('retro-base-branch', async () => {
+ const dir = path.join(baseBranchDir, 'retro-base');
+ fs.mkdirSync(dir, { recursive: true });
+
+ // Create git repo with commit history
+ run('git', ['init', '-b', 'main'], dir);
+ run('git', ['config', 'user.email', 'dev@example.com'], dir);
+ run('git', ['config', 'user.name', 'Dev'], dir);
+
+ fs.writeFileSync(path.join(dir, 'app.ts'), 'console.log("hello");\n');
+ run('git', ['add', 'app.ts'], dir);
+ run('git', ['commit', '-m', 'feat: initial app', '--date', '2026-03-14T09:00:00'], dir);
+
+ fs.writeFileSync(path.join(dir, 'auth.ts'), 'export function login() {}\n');
+ run('git', ['add', 'auth.ts'], dir);
+ run('git', ['commit', '-m', 'feat: add auth', '--date', '2026-03-15T10:00:00'], dir);
+
+ fs.writeFileSync(path.join(dir, 'test.ts'), 'test("it works", () => {});\n');
+ run('git', ['add', 'test.ts'], dir);
+ run('git', ['commit', '-m', 'test: add tests', '--date', '2026-03-16T11:00:00'], dir);
+
+ // Copy retro skill
+ fs.mkdirSync(path.join(dir, 'retro'), { recursive: true });
+ fs.copyFileSync(path.join(ROOT, 'retro', 'SKILL.md'), path.join(dir, 'retro', 'SKILL.md'));
+
+ const result = await runSkillTest({
+ prompt: `Read retro/SKILL.md for instructions on how to run a retrospective.
+
+IMPORTANT: Follow the "Detect default branch" step first. Since there is no remote, gh will fail — fall back to main.
+Then use the detected branch name for all git queries.
+
+Run /retro for the last 7 days of this git repo. Skip any AskUserQuestion calls — this is non-interactive.
+This is a local-only repo so use the local branch (main) instead of origin/main for all git log commands.
+
+Write your retrospective to ${dir}/retro-output.md`,
+ workingDirectory: dir,
+ maxTurns: 25,
+ timeout: 240_000,
+ testName: 'retro-base-branch',
+ runId,
+ });
+
+ logCost('/retro base-branch', result);
+ recordE2E('/retro default branch detection', 'Base branch detection', result, {
+ passed: ['success', 'error_max_turns'].includes(result.exitReason),
+ });
+ expect(['success', 'error_max_turns']).toContain(result.exitReason);
+
+ // Check the retro output when present; a missing file is only warned about
+ const retroPath = path.join(dir, 'retro-output.md');
+ if (fs.existsSync(retroPath)) {
+ const content = fs.readFileSync(retroPath, 'utf-8');
+ expect(content.length).toBeGreaterThan(100);
+ } else {
+ console.warn('retro-output.md not found — agent may not have written the retrospective');
+ }
+ }, 300_000);
+});
+
+// --- Document-Release skill E2E ---
+
+/**
+ * E2E suite for the /document-release skill. Builds a temp git repo with
+ * README.md, CHANGELOG.md and VERSION on main plus a feature branch that adds
+ * feature-c.ts, then runs the skill and checks that the original CHANGELOG
+ * entries are still present afterwards.
+ */
+describeIfSelected('Document-Release skill E2E', ['document-release'], () => {
+ let docReleaseDir: string;
+
+ beforeAll(() => {
+ docReleaseDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-doc-release-'));
+
+ // Copy document-release skill files
+ copyDirSync(path.join(ROOT, 'document-release'), path.join(docReleaseDir, 'document-release'));
+
+ // Init git repo with initial docs
+ // (helper runs each command inside docReleaseDir; exit status is not checked)
+ const run = (cmd: string, args: string[]) =>
+ spawnSync(cmd, args, { cwd: docReleaseDir, stdio: 'pipe', timeout: 5000 });
+
+ run('git', ['init', '-b', 'main']);
+ run('git', ['config', 'user.email', 'test@test.com']);
+ run('git', ['config', 'user.name', 'Test']);
+
+ // Create initial README with a features list
+ fs.writeFileSync(path.join(docReleaseDir, 'README.md'),
+ '# Test Project\n\n## Features\n\n- Feature A\n- Feature B\n\n## Install\n\n```bash\nnpm install\n```\n');
+
+ // Create initial CHANGELOG that must NOT be clobbered
+ fs.writeFileSync(path.join(docReleaseDir, 'CHANGELOG.md'),
+ '# Changelog\n\n## 1.0.0 — 2026-03-01\n\n- Initial release with Feature A and Feature B\n- Setup CI pipeline\n');
+
+ // Create VERSION file (already bumped)
+ fs.writeFileSync(path.join(docReleaseDir, 'VERSION'), '1.1.0\n');
+
+ run('git', ['add', '.']);
+ run('git', ['commit', '-m', 'initial']);
+
+ // Create feature branch with a code change
+ // (also bumps VERSION to 1.1.1 and prepends a 1.1.1 CHANGELOG entry above the 1.0.0 one)
+ run('git', ['checkout', '-b', 'feat/add-feature-c']);
+ fs.writeFileSync(path.join(docReleaseDir, 'feature-c.ts'), 'export function featureC() { return "C"; }\n');
+ fs.writeFileSync(path.join(docReleaseDir, 'VERSION'), '1.1.1\n');
+ fs.writeFileSync(path.join(docReleaseDir, 'CHANGELOG.md'),
+ '# Changelog\n\n## 1.1.1 — 2026-03-16\n\n- Added Feature C\n\n## 1.0.0 — 2026-03-01\n\n- Initial release with Feature A and Feature B\n- Setup CI pipeline\n');
+ run('git', ['add', '.']);
+ run('git', ['commit', '-m', 'feat: add feature C']);
+ });
+
+ afterAll(() => {
+ // Best-effort removal of the temp repo; errors are ignored
+ try { fs.rmSync(docReleaseDir, { recursive: true, force: true }); } catch {}
+ });
+
+ test('/document-release updates docs without clobbering CHANGELOG', async () => {
+ const result = await runSkillTest({
+ prompt: `Read the file document-release/SKILL.md for the document-release workflow instructions.
+
+Run the /document-release workflow on this repo. The base branch is "main".
+
+IMPORTANT:
+- Do NOT use AskUserQuestion — auto-approve everything or skip if unsure.
+- Do NOT push or create PRs (there is no remote).
+- Do NOT run gh commands (no remote).
+- Focus on updating README.md to reflect the new Feature C.
+- Do NOT overwrite or regenerate CHANGELOG entries.
+- Skip VERSION bump (it's already bumped).
+- After editing, just commit the changes locally.`,
+ workingDirectory: docReleaseDir,
+ maxTurns: 30,
+ allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
+ timeout: 180_000,
+ testName: 'document-release',
+ runId,
+ });
+
+ logCost('/document-release', result);
+
+ // Read CHANGELOG to verify it was NOT clobbered
+ // (all three markers from the original 1.0.0 entry must still be present)
+ const changelog = fs.readFileSync(path.join(docReleaseDir, 'CHANGELOG.md'), 'utf-8');
+ const hasOriginalEntries = changelog.includes('Initial release with Feature A and Feature B')
+ && changelog.includes('Setup CI pipeline')
+ && changelog.includes('1.0.0');
+ if (!hasOriginalEntries) {
+ console.warn('CHANGELOG CLOBBERED — original entries missing!');
+ }
+
+ // Check if README was updated
+ // (accepts any of three spellings; the result is only logged below, never asserted)
+ const readme = fs.readFileSync(path.join(docReleaseDir, 'README.md'), 'utf-8');
+ const readmeUpdated = readme.includes('Feature C') || readme.includes('feature-c') || readme.includes('feature C');
+
+ // Recorded result passes only when the exit reason is acceptable AND the CHANGELOG is intact
+ const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
+ recordE2E('/document-release', 'Document-Release skill E2E', result, {
+ passed: exitOk && hasOriginalEntries,
+ });
+
+ // Critical guardrail: CHANGELOG must not be clobbered
+ expect(hasOriginalEntries).toBe(true);
+
+ // Accept error_max_turns — thorough doc review is not a failure
+ expect(['success', 'error_max_turns']).toContain(result.exitReason);
+
+ // Informational: did it update README?
+ if (readmeUpdated) {
+ console.log('README updated to include Feature C');
+ } else {
+ console.warn('README was NOT updated — agent may not have found the feature');
+ }
+ }, 240_000);
+});
+
+// --- Deferred skill E2E tests (destructive or require interactive UI) ---
+
+// Deferred tests — only test.todo entries, no selection needed
+// Placeholder suite: the skills below are registered with test.todo only, so
+// no test body exists for them yet.
+describeE2E('Deferred skill E2E', () => {
+ // Ship is destructive: pushes to remote, creates PRs, modifies VERSION/CHANGELOG
+ test.todo('/ship completes full workflow');
+
+ // Setup-browser-cookies requires interactive browser picker UI
+ test.todo('/setup-browser-cookies imports cookies');
+
+});
+
+// --- gstack-upgrade E2E ---
+
+/**
+ * E2E suite for the /gstack-upgrade skill (happy path).
+ *
+ * Fixture: a project repo containing a mock gstack install at
+ * .claude/skills/gstack, which is itself a git repo whose `origin` is a local
+ * bare repository. Versions 0.5.0 and 0.6.0 are both pushed to origin's main,
+ * then the install's working copy is reset back to 0.5.0. The test passes when
+ * the agent run leaves the install's VERSION file at 0.6.0.
+ */
+describeIfSelected('gstack-upgrade E2E', ['gstack-upgrade-happy-path'], () => {
+ let upgradeDir: string;
+ let remoteDir: string;
+
+ beforeAll(() => {
+ upgradeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-upgrade-'));
+ remoteDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-remote-'));
+
+ // Runs a command synchronously in `cwd`; exit status is not checked.
+ const run = (cmd: string, args: string[], cwd: string) =>
+ spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });
+
+ // Init the "project" repo
+ run('git', ['init'], upgradeDir);
+ run('git', ['config', 'user.email', 'test@test.com'], upgradeDir);
+ run('git', ['config', 'user.name', 'Test'], upgradeDir);
+
+ // Create mock gstack install directory (local-git type)
+ const mockGstack = path.join(upgradeDir, '.claude', 'skills', 'gstack');
+ fs.mkdirSync(mockGstack, { recursive: true });
+
+ // Init as a git repo
+ run('git', ['init'], mockGstack);
+ run('git', ['config', 'user.email', 'test@test.com'], mockGstack);
+ run('git', ['config', 'user.name', 'Test'], mockGstack);
+
+ // Create bare remote
+ run('git', ['init', '--bare'], remoteDir);
+ run('git', ['remote', 'add', 'origin', remoteDir], mockGstack);
+
+ // Write old version files
+ fs.writeFileSync(path.join(mockGstack, 'VERSION'), '0.5.0\n');
+ fs.writeFileSync(path.join(mockGstack, 'CHANGELOG.md'),
+ '# Changelog\n\n## 0.5.0 — 2026-03-01\n\n- Initial release\n');
+ fs.writeFileSync(path.join(mockGstack, 'setup'),
+ '#!/bin/bash\necho "Setup completed"\n', { mode: 0o755 });
+
+ // Initial commit + push (HEAD:main targets origin's main whatever the local branch is called)
+ run('git', ['add', '.'], mockGstack);
+ run('git', ['commit', '-m', 'initial'], mockGstack);
+ run('git', ['push', '-u', 'origin', 'HEAD:main'], mockGstack);
+
+ // Create new version (simulate upstream release)
+ fs.writeFileSync(path.join(mockGstack, 'VERSION'), '0.6.0\n');
+ fs.writeFileSync(path.join(mockGstack, 'CHANGELOG.md'),
+ '# Changelog\n\n## 0.6.0 — 2026-03-15\n\n- New feature: interactive design review\n- Fix: snapshot flag validation\n\n## 0.5.0 — 2026-03-01\n\n- Initial release\n');
+ run('git', ['add', '.'], mockGstack);
+ run('git', ['commit', '-m', 'release 0.6.0'], mockGstack);
+ run('git', ['push', 'origin', 'HEAD:main'], mockGstack);
+
+ // Reset working copy back to old version
+ run('git', ['reset', '--hard', 'HEAD~1'], mockGstack);
+
+ // Copy gstack-upgrade skill
+ fs.mkdirSync(path.join(upgradeDir, 'gstack-upgrade'), { recursive: true });
+ fs.copyFileSync(
+ path.join(ROOT, 'gstack-upgrade', 'SKILL.md'),
+ path.join(upgradeDir, 'gstack-upgrade', 'SKILL.md'),
+ );
+
+ // Commit so git repo is clean
+ run('git', ['add', '.'], upgradeDir);
+ run('git', ['commit', '-m', 'initial project'], upgradeDir);
+ });
+
+ afterAll(() => {
+ // Best-effort removal of both temp directories; errors are ignored
+ try { fs.rmSync(upgradeDir, { recursive: true, force: true }); } catch {}
+ try { fs.rmSync(remoteDir, { recursive: true, force: true }); } catch {}
+ });
+
+ testIfSelected('gstack-upgrade-happy-path', async () => {
+ const mockGstack = path.join(upgradeDir, '.claude', 'skills', 'gstack');
+ const result = await runSkillTest({
+ prompt: `Read gstack-upgrade/SKILL.md for the upgrade workflow.
+
+You are running /gstack-upgrade standalone. The gstack installation is at ./.claude/skills/gstack (local-git type — it has a .git directory with an origin remote).
+
+Current version: 0.5.0. A new version 0.6.0 is available on origin/main.
+
+Follow the standalone upgrade flow:
+1. Detect install type (local-git)
+2. Run git fetch origin && git reset --hard origin/main in the install directory
+3. Run the setup script
+4. Show what's new from CHANGELOG
+
+Skip any AskUserQuestion calls — auto-approve the upgrade. Write a summary of what you did to stdout.
+
+IMPORTANT: The install directory is at ./.claude/skills/gstack — use that exact path.`,
+ workingDirectory: upgradeDir,
+ maxTurns: 20,
+ timeout: 180_000,
+ testName: 'gstack-upgrade-happy-path',
+ runId,
+ });
+
+ logCost('/gstack-upgrade happy path', result);
+
+ // The VERSION file on disk decides pass/fail; the agent's textual output is not inspected
+ const versionAfter = fs.readFileSync(path.join(mockGstack, 'VERSION'), 'utf-8').trim();
+
+ recordE2E('/gstack-upgrade happy path', 'gstack-upgrade E2E', result, {
+ passed: versionAfter === '0.6.0' && ['success', 'error_max_turns'].includes(result.exitReason),
+ });
+
+ expect(['success', 'error_max_turns']).toContain(result.exitReason);
+ expect(versionAfter).toBe('0.6.0');
+ }, 240_000);
+});
+
+// --- Design Consultation E2E ---
+
+/**
+ * Sends a generated DESIGN.md to the LLM judge and returns its verdict.
+ *
+ * The prompt lists seven criteria (primary-font blacklist, aesthetic/color
+ * coherence, specific font names, real hex values, stated rationale, no AI
+ * slop patterns, product context reflected) and asks for a JSON object with
+ * `passed` and `reasoning`.
+ *
+ * @param designMd - Full text of the DESIGN.md file to evaluate.
+ * @returns Whatever callJudge resolves to for this prompt.
+ */
+async function designQualityJudge(designMd: string): Promise<{ passed: boolean; reasoning: string }> {
+ // Build the full judge prompt first so the call below stays a one-liner.
+ const judgePrompt = `You are evaluating a generated DESIGN.md file for quality.
+
+Evaluate against these criteria — ALL must pass for an overall "passed: true":
+1. Does NOT recommend Inter, Roboto, Arial, Helvetica, Open Sans, Lato, Montserrat, or Poppins as primary fonts
+2. Aesthetic direction is coherent with color approach (e.g., brutalist aesthetic doesn't pair with expressive color without explanation)
+3. Font recommendations include specific font names (not generic like "a sans-serif font")
+4. Color palette includes actual hex values, not placeholders like "[hex]"
+5. Rationale is provided for major decisions (not just "because it looks good")
+6. No AI slop patterns: purple gradients mentioned positively, "3-column feature grid" language, generic marketing speak
+7. Product context is reflected in design choices (civic tech → should have appropriate, professional aesthetic)
+
+DESIGN.md content:
+\`\`\`
+${designMd}
+\`\`\`
+
+Return JSON: { "passed": true/false, "reasoning": "one paragraph explaining your evaluation" }`;
+
+ return callJudge<{ passed: boolean; reasoning: string }>(judgePrompt);
+}
+
+describeIfSelected('Design Consultation E2E', [
+ 'design-consultation-core', 'design-consultation-research',
+ 'design-consultation-existing', 'design-consultation-preview',
+], () => {
+ let designDir: string;
+
+ beforeAll(() => {
+ designDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-design-consultation-'));
+ const { spawnSync } = require('child_process');
+ const run = (cmd: string, args: string[]) =>
+ spawnSync(cmd, args, { cwd: designDir, stdio: 'pipe', timeout: 5000 });
+
+ run('git', ['init', '-b', 'main']);
+ run('git', ['config', 'user.email', 'test@test.com']);
+ run('git', ['config', 'user.name', 'Test']);
+
+ // Create a realistic project context
+ fs.writeFileSync(path.join(designDir, 'README.md'), `# CivicPulse
+
+A civic tech data platform for government employees to access, visualize, and share public data. Built with Next.js and PostgreSQL.
+
+## Features
+- Real-time data dashboards for municipal budgets
+- Public records search with faceted filtering
+- Data export and sharing tools for inter-department collaboration
+`);
+ fs.writeFileSync(path.join(designDir, 'package.json'), JSON.stringify({
+ name: 'civicpulse',
+ version: '0.1.0',
+ dependencies: { next: '^14.0.0', react: '^18.2.0', 'tailwindcss': '^3.4.0' },
+ }, null, 2));
+
+ run('git', ['add', '.']);
+ run('git', ['commit', '-m', 'initial project setup']);
+
+ // Copy design-consultation skill
+ fs.mkdirSync(path.join(designDir, 'design-consultation'), { recursive: true });
+ fs.copyFileSync(
+ path.join(ROOT, 'design-consultation', 'SKILL.md'),
+ path.join(designDir, 'design-consultation', 'SKILL.md'),
+ );
+ });
+
+ afterAll(() => {
+ try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
+ });
+
+ testIfSelected('design-consultation-core', async () => {
+ const result = await runSkillTest({
+ prompt: `Read design-consultation/SKILL.md for the design consultation workflow.
+
+This is a civic tech data platform called CivicPulse for government employees who need to access public data. Read the README.md for details.
+
+Skip research — work from your design knowledge. Skip the font preview page. Skip any AskUserQuestion calls — this is non-interactive. Accept your first design system proposal.
+
+Write DESIGN.md and CLAUDE.md (or update it) in the working directory.`,
+ workingDirectory: designDir,
+ maxTurns: 20,
+ timeout: 360_000,
+ testName: 'design-consultation-core',
+ runId,
+ });
+
+ logCost('/design-consultation core', result);
+
+ const designPath = path.join(designDir, 'DESIGN.md');
+ const claudePath = path.join(designDir, 'CLAUDE.md');
+ const designExists = fs.existsSync(designPath);
+ const claudeExists = fs.existsSync(claudePath);
+ let designContent = '';
+
+ if (designExists) {
+ designContent = fs.readFileSync(designPath, 'utf-8');
+ }
+
+ // Structural checks
+ const requiredSections = ['Product Context', 'Aesthetic', 'Typography', 'Color', 'Spacing', 'Layout', 'Motion'];
+ const missingSections = requiredSections.filter(s => !designContent.toLowerCase().includes(s.toLowerCase()));
+
+ // LLM judge for quality
+ let judgeResult = { passed: false, reasoning: 'judge not run' };
+ if (designExists && designContent.length > 100) {
+ try {
+ judgeResult = await designQualityJudge(designContent);
+ console.log('Design quality judge:', JSON.stringify(judgeResult, null, 2));
+ } catch (err) {
+ console.warn('Judge failed:', err);
+ judgeResult = { passed: true, reasoning: 'judge error — defaulting to pass' };
+ }
+ }
+
+ const structuralPass = designExists && claudeExists && missingSections.length === 0;
+ recordE2E('/design-consultation core', 'Design Consultation E2E', result, {
+ passed: structuralPass && judgeResult.passed && ['success', 'error_max_turns'].includes(result.exitReason),
+ });
+
+ expect(['success', 'error_max_turns']).toContain(result.exitReason);
+ expect(designExists).toBe(true);
+ if (designExists) {
+ expect(missingSections).toHaveLength(0);
+ }
+ if (claudeExists) {
+ const claude = fs.readFileSync(claudePath, 'utf-8');
+ expect(claude.toLowerCase()).toContain('design.md');
+ }
+ }, 420_000);
+
+ // Research variant: the agent is asked to look at comparable products before writing DESIGN.md.
+ testIfSelected('design-consultation-research', async () => {
+   // Start from a clean slate: drop artifacts a previous test may have left behind.
+   for (const stale of ['DESIGN.md', 'CLAUDE.md']) {
+     try { fs.unlinkSync(path.join(designDir, stale)); } catch {}
+   }
+
+   const result = await runSkillTest({
+     prompt: `Read design-consultation/SKILL.md for the design consultation workflow.
+
+This is a civic tech data platform called CivicPulse. Read the README.md.
+
+DO research what's out there before proposing — search for civic tech and government data platform designs. Skip the font preview page. Skip any AskUserQuestion calls — this is non-interactive.
+
+Write DESIGN.md to the working directory.`,
+     workingDirectory: designDir,
+     maxTurns: 30,
+     timeout: 360_000,
+     testName: 'design-consultation-research',
+     runId,
+   });
+
+   logCost('/design-consultation research', result);
+
+   // Exit reasons treated as acceptable for this run (hitting the turn cap is tolerated).
+   const acceptableExits = ['success', 'error_max_turns'];
+
+   const designPath = path.join(designDir, 'DESIGN.md');
+   const designExists = fs.existsSync(designPath);
+   const designContent = designExists ? fs.readFileSync(designPath, 'utf-8') : '';
+
+   // WebSearch usage is informational only — the tool may be missing in some environments.
+   const webSearchCount = result.toolCalls.filter(tc => tc.tool === 'WebSearch').length;
+   if (webSearchCount === 0) {
+     console.warn('WebSearch not used — may be unavailable in test env');
+   } else {
+     console.log(`WebSearch used ${webSearchCount} times`);
+   }
+
+   // LLM judge: consulted only when a non-trivial DESIGN.md exists. Its verdict is logged
+   // here but does not feed into the recorded result or the assertions below.
+   let judgeResult = { passed: false, reasoning: 'judge not run' };
+   const worthJudging = designExists && designContent.length > 100;
+   if (worthJudging) {
+     try {
+       judgeResult = await designQualityJudge(designContent);
+       console.log('Design quality judge (research):', JSON.stringify(judgeResult, null, 2));
+     } catch (err) {
+       console.warn('Judge failed:', err);
+       judgeResult = { passed: true, reasoning: 'judge error — defaulting to pass' };
+     }
+   }
+
+   const exitedAcceptably = acceptableExits.includes(result.exitReason);
+   recordE2E('/design-consultation research', 'Design Consultation E2E', result, {
+     passed: designExists && exitedAcceptably,
+   });
+
+   expect(acceptableExits).toContain(result.exitReason);
+   expect(designExists).toBe(true);
+ }, 420_000);
+
+ // Existing-file variant: the agent must extend a stub DESIGN.md instead of creating one.
+ testIfSelected('design-consultation-existing', async () => {
+   const designPath = path.join(designDir, 'DESIGN.md');
+
+   // Seed the working directory with a deliberately thin DESIGN.md (typography only)
+   fs.writeFileSync(designPath, `# Design System — CivicPulse
+
+## Typography
+Body: system-ui
+`);
+
+   const result = await runSkillTest({
+     prompt: `Read design-consultation/SKILL.md for the design consultation workflow.
+
+There is already a DESIGN.md in this repo. Update it with a complete design system for CivicPulse, a civic tech data platform for government employees.
+
+Skip research. Skip font preview. Skip any AskUserQuestion calls — this is non-interactive.`,
+     workingDirectory: designDir,
+     maxTurns: 20,
+     timeout: 360_000,
+     testName: 'design-consultation-existing',
+     runId,
+   });
+
+   logCost('/design-consultation existing', result);
+
+   const designExists = fs.existsSync(designPath);
+   const loweredDesign = (designExists ? fs.readFileSync(designPath, 'utf-8') : '').toLowerCase();
+
+   // The seeded stub mentions neither topic, so finding both means content was added
+   const hasColor = loweredDesign.includes('color');
+   const hasSpacing = loweredDesign.includes('spacing');
+
+   const acceptableExits = ['success', 'error_max_turns'];
+   const exitedAcceptably = acceptableExits.includes(result.exitReason);
+
+   recordE2E('/design-consultation existing', 'Design Consultation E2E', result, {
+     passed: designExists && hasColor && hasSpacing && exitedAcceptably,
+   });
+
+   expect(acceptableExits).toContain(result.exitReason);
+   expect(designExists).toBe(true);
+   // Content checks are only meaningful once the file is known to exist
+   if (designExists) {
+     expect(hasColor).toBe(true);
+     expect(hasSpacing).toBe(true);
+   }
+ }, 420_000);
+
+ // Preview variant: the agent is told to write design-preview.html into the working
+ // directory (instead of /tmp/) and then write DESIGN.md; both files are checked below.
+ testIfSelected('design-consultation-preview', async () => {
+ // Clean up: remove any DESIGN.md already in designDir (any unlink error is ignored)
+ try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {}
+
+ const result = await runSkillTest({
+ prompt: `Read design-consultation/SKILL.md for the design consultation workflow.
+
+This is CivicPulse, a civic tech data platform. Read the README.md.
+
+Skip research. Skip any AskUserQuestion calls — this is non-interactive. Generate the font and color preview page but write it to ./design-preview.html instead of /tmp/ (do NOT run the open command). Then write DESIGN.md.`,
+ workingDirectory: designDir,
+ maxTurns: 20,
+ timeout: 360_000,
+ testName: 'design-consultation-preview',
+ runId,
+ });
+
+ logCost('/design-consultation preview', result);
+
+ // Locate the two artifacts this test expects in the working directory
+ const previewPath = path.join(designDir, 'design-preview.html');
+ const designPath = path.join(designDir, 'DESIGN.md');
+ const previewExists = fs.existsSync(previewPath);
+ const designExists = fs.existsSync(designPath);
+
+ // Preview content stays '' when the file is missing, so later string checks are safe
+ let previewContent = '';
+ if (previewExists) {
+ previewContent = fs.readFileSync(previewPath, 'utf-8');
+ }
+
+ // NOTE(review): the next line looks truncated — `hasFontRef`, `designContent` and
+ // `judgeResult` are used further down but never declared in this test, and the
+ // braces do not balance. Restore the missing statements from the original source.
+ const hasHtml = previewContent.includes(' 100) {
+ try {
+ judgeResult = await designQualityJudge(designContent);
+ console.log('Design quality judge (preview):', JSON.stringify(judgeResult, null, 2));
+ } catch (err) {
+ // A judge failure is logged and treated as a pass rather than failing the test
+ console.warn('Judge failed:', err);
+ judgeResult = { passed: true, reasoning: 'judge error — defaulting to pass' };
+ }
+ }
+ }
+
+ // Recorded pass requires both files, the HTML check, and an accepted exit reason
+ recordE2E('/design-consultation preview', 'Design Consultation E2E', result, {
+ passed: previewExists && designExists && hasHtml && ['success', 'error_max_turns'].includes(result.exitReason),
+ });
+
+ expect(['success', 'error_max_turns']).toContain(result.exitReason);
+ expect(previewExists).toBe(true);
+ // hasFontRef is asserted here although it is not part of the recorded `passed` flag above
+ if (previewExists) {
+ expect(hasHtml).toBe(true);
+ expect(hasFontRef).toBe(true);
+ }
+ expect(designExists).toBe(true);
+ }, 420_000);
+});
+
+// --- Plan Design Review E2E (plan-mode) ---
+
+describeIfSelected('Plan Design Review E2E', ['plan-design-review-plan-mode', 'plan-design-review-no-ui-scope'], () => {
+ let reviewDir: string;
+
+ // Builds a throwaway git repo holding the skill file and a plan with intentional design gaps.
+ beforeAll(() => {
+   reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-design-'));
+
+   const { spawnSync } = require('child_process');
+   // Every git call runs inside the temp repo with piped stdio and a 5s cap
+   const git = (...args: string[]) =>
+     spawnSync('git', args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });
+
+   git('init', '-b', 'main');
+   git('config', 'user.email', 'test@test.com');
+   git('config', 'user.name', 'Test');
+
+   // Copy plan-design-review skill
+   const skillDir = path.join(reviewDir, 'plan-design-review');
+   fs.mkdirSync(skillDir, { recursive: true });
+   fs.copyFileSync(
+     path.join(ROOT, 'plan-design-review', 'SKILL.md'),
+     path.join(skillDir, 'SKILL.md'),
+   );
+
+   // Create a plan file with intentional design gaps
+   const planPath = path.join(reviewDir, 'plan.md');
+   fs.writeFileSync(planPath, `# Plan: User Dashboard
+
+## Context
+Build a user dashboard that shows account stats, recent activity, and settings.
+
+## Implementation
+1. Create a dashboard page at /dashboard
+2. Show user stats (posts, followers, engagement rate)
+3. Add a recent activity feed
+4. Add a settings panel
+5. Use a clean, modern UI with cards and icons
+6. Add a hero section at the top with a gradient background
+
+## Technical Details
+- React components with Tailwind CSS
+- API endpoint: GET /api/dashboard
+- WebSocket for real-time activity updates
+`);
+
+   // Commit the skill file and the plan together as the repo's first commit
+   git('add', '.');
+   git('commit', '-m', 'initial plan');
+ });
+
+ // Best-effort removal of the temp repo; any error from rmSync is swallowed.
+ afterAll(() => {
+   try {
+     fs.rmSync(reviewDir, { force: true, recursive: true });
+   } catch {}
+ });
+
+ // Plan-mode review: the agent must critique plan.md and edit it in place to close the
+ // design gaps (states, responsiveness, accessibility, ...).
+ testIfSelected('plan-design-review-plan-mode', async () => {
+   const planPath = path.join(reviewDir, 'plan.md');
+   // Snapshot the plan before the agent runs. The previous check (`length > 300`) was
+   // already satisfied by the untouched seed plan, so it could never detect a missing edit.
+   const planBefore = fs.readFileSync(planPath, 'utf-8');
+
+   const result = await runSkillTest({
+     prompt: `Read plan-design-review/SKILL.md for the design review workflow.
+
+Review the plan in ./plan.md. This plan has several design gaps — it uses vague language like "clean, modern UI" and "cards and icons", mentions a "hero section with gradient" (AI slop), and doesn't specify empty states, error states, loading states, responsive behavior, or accessibility.
+
+Skip the preamble bash block. Skip any AskUserQuestion calls — this is non-interactive. Rate each design dimension 0-10 and explain what would make it a 10. Then EDIT plan.md to add the missing design decisions (interaction state table, empty states, responsive behavior, etc.).
+
+IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan review, not a live site audit. Just read the plan file, review it, and edit it to fix the gaps.`,
+     workingDirectory: reviewDir,
+     maxTurns: 15,
+     timeout: 300_000,
+     testName: 'plan-design-review-plan-mode',
+     runId,
+   });
+
+   logCost('/plan-design-review plan-mode', result);
+
+   const output = result.output || '';
+   const loweredOutput = output.toLowerCase();
+
+   // Ratings on the 0-10 scale are informational: warn when absent, but do not fail on it.
+   const hasRatings = /\d+\/10/.test(output);
+   if (!hasRatings) {
+     console.warn('No N/10 design ratings found in agent output');
+   }
+
+   // The agent's output should talk about at least one design-review topic
+   const designTopics = ['information architecture', 'interaction state', 'ai slop', 'hierarchy'];
+   const hasDesignContent = designTopics.some(topic => loweredOutput.includes(topic));
+
+   // Check that the plan file was edited (the core new behavior): it must have grown
+   // relative to the snapshot taken before the run.
+   const planAfter = fs.readFileSync(planPath, 'utf-8');
+   const planWasEdited = planAfter.length > planBefore.length;
+
+   // None of these keywords occur in the seeded plan, so a hit means design detail was added
+   const loweredPlan = planAfter.toLowerCase();
+   const designKeywords = ['empty', 'loading', 'error', 'state', 'responsive', 'accessibility'];
+   const planHasDesignAdditions = designKeywords.some(keyword => loweredPlan.includes(keyword));
+
+   const acceptableExits = ['success', 'error_max_turns'];
+
+   // Keep the recorded verdict in step with every assertion below, so the recorded result
+   // cannot say "passed" for a run that the test itself fails.
+   recordE2E('/plan-design-review plan-mode', 'Plan Design Review E2E', result, {
+     passed: hasDesignContent &&
+       planWasEdited &&
+       planHasDesignAdditions &&
+       acceptableExits.includes(result.exitReason),
+   });
+
+   expect(acceptableExits).toContain(result.exitReason);
+   // Agent should produce design-relevant output about the plan
+   expect(hasDesignContent).toBe(true);
+   // Agent should have edited the plan file to add missing design decisions
+   expect(planWasEdited).toBe(true);
+   expect(planHasDesignAdditions).toBe(true);
+ }, 360_000);
+
+ // No-UI variant: given a backend-only plan, the agent's output should say there is no UI to review.
+ testIfSelected('plan-design-review-no-ui-scope', async () => {
+   // Write a backend-only plan
+   const backendPlanPath = path.join(reviewDir, 'backend-plan.md');
+   fs.writeFileSync(backendPlanPath, `# Plan: Database Migration
+
+## Context
+Migrate user records from PostgreSQL to a new schema with better indexing.
+
+## Implementation
+1. Create migration to add new columns to users table
+2. Backfill data from legacy columns
+3. Add database indexes for common query patterns
+4. Update ActiveRecord models
+5. Run migration in staging first, then production
+`);
+
+   const result = await runSkillTest({
+     prompt: `Read plan-design-review/SKILL.md for the design review workflow.
+
+Review the plan in ./backend-plan.md. This is a pure backend database migration plan with no UI changes.
+
+Skip the preamble bash block. Skip any AskUserQuestion calls — this is non-interactive. Write your findings directly to stdout.
+
+IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan review, not a live site audit.`,
+     workingDirectory: reviewDir,
+     maxTurns: 10,
+     timeout: 180_000,
+     testName: 'plan-design-review-no-ui-scope',
+     runId,
+   });
+
+   logCost('/plan-design-review no-ui-scope', result);
+
+   // Agent should detect no UI scope and exit early: any one of these phrases counts
+   const loweredOutput = (result.output || '').toLowerCase();
+   const noUiMarkers = ['no ui', 'no frontend', 'no design', 'not applicable', 'backend'];
+   const detectsNoUI = noUiMarkers.some(marker => loweredOutput.includes(marker));
+
+   const acceptableExits = ['success', 'error_max_turns'];
+   const exitedAcceptably = acceptableExits.includes(result.exitReason);
+
+   recordE2E('/plan-design-review no-ui-scope', 'Plan Design Review E2E', result, {
+     passed: detectsNoUI && exitedAcceptably,
+   });
+
+   expect(acceptableExits).toContain(result.exitReason);
+   expect(detectsNoUI).toBe(true);
+ }, 240_000);
+});
+
+// --- Design Review E2E (live-site audit + fix) ---
+
+describeIfSelected('Design Review E2E', ['design-review-fix'], () => {
+ let qaDesignDir: string;
+ let qaDesignServer: ReturnTypeSome content here with tight line height.
+Different spacing and colors for no reason.
+