fix: resolve merge conflicts with origin/main (v0.6.1 qa-design-review → design-review rename)

Conflicts resolved:
- README.md: kept install section + office-hours/debug skills, adopted
  main's design-review rename and restructured footer
- design-review/SKILL.md: took main's version (renamed from qa-design-review)
- plan-design-review/SKILL.md: took main's version, which includes base-branch detection
- Updated install instructions to use /design-review (not /qa-design-review)
- Updated skill count to 15 in footer

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-17 22:59:16 -07:00
36 changed files with 2552 additions and 864 deletions
+86
View File
@@ -0,0 +1,86 @@
/* Planted design anti-patterns for E2E eval — 7 issues */
/* Issue 1: [HIGH] Blacklisted font (Papyrus) */
/* Issue 2: [HIGH] Body text < 16px (14px) */
body {
font-family: 'Papyrus', sans-serif;
font-size: 14px;
margin: 0;
padding: 0;
}
/* Issue 5: [MEDIUM] Purple/violet gradient background */
.hero {
background: linear-gradient(135deg, #6366f1, #8b5cf6);
text-align: center;
padding: 80px 20px;
color: white;
}
.hero h1 {
text-align: center;
font-size: 48px;
}
.hero p {
text-align: center;
font-size: 20px;
}
/* Issue 7: [LOW] 3-column feature grid with icon circles */
.features {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 24px;
padding: 60px 40px;
text-align: center;
}
.feature-card {
border-radius: 24px;
padding: 32px;
text-align: center;
background: #f9fafb;
}
/* Icon in colored circle — AI slop pattern */
.icon-circle {
width: 60px;
height: 60px;
border-radius: 50%;
background: #ede9fe;
display: flex;
align-items: center;
justify-content: center;
margin: 0 auto 16px;
font-size: 24px;
}
/* Issue 3: [HIGH] outline: none without replacement */
button {
outline: none;
background: #6366f1;
color: white;
border: none;
padding: 12px 24px;
border-radius: 24px;
cursor: pointer;
}
.small-link {
font-size: 11px;
padding: 4px 8px;
}
/* Issue 4: [HIGH] !important usage */
.override {
color: red !important;
margin-left: 10px !important;
}
.footer {
text-align: center;
padding: 40px;
background: #1e1b4b;
color: white;
}
+41
View File
@@ -0,0 +1,41 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="styles.css">
<title>Our Platform</title>
</head>
<body>
<!-- Issue 6: [MEDIUM] Generic hero copy ("Welcome to...", "all-in-one solution") -->
<div class="hero">
<h1>Welcome to Our Platform</h1>
<p>Your all-in-one solution for everything you need</p>
<button>Get Started</button>
</div>
<!-- Issue 7: [LOW] 3-column feature grid with icon-in-circle + title + description -->
<div class="features">
<div class="feature-card">
<div class="icon-circle">&#9733;</div>
<h3>Feature One</h3>
<p>A short description of this amazing feature that will change your life.</p>
</div>
<div class="feature-card">
<div class="icon-circle">&#9889;</div>
<h3>Feature Two</h3>
<p>Another incredible capability that sets us apart from the competition.</p>
</div>
<div class="feature-card">
<div class="icon-circle">&#9881;</div>
<h3>Feature Three</h3>
<p>Yet another powerful tool to streamline your workflow effortlessly.</p>
</div>
</div>
<div class="footer">
<p class="override">Unlock the power of our platform today</p>
<a href="#" class="small-link">Terms of Service</a>
</div>
</body>
</html>
+1 -1
View File
@@ -70,7 +70,7 @@ describe('gen-skill-docs', () => {
{ dir: 'setup-browser-cookies', name: 'setup-browser-cookies' },
{ dir: 'gstack-upgrade', name: 'gstack-upgrade' },
{ dir: 'plan-design-review', name: 'plan-design-review' },
{ dir: 'qa-design-review', name: 'qa-design-review' },
{ dir: 'design-review', name: 'design-review' },
{ dir: 'design-consultation', name: 'design-consultation' },
];
+200
View File
@@ -0,0 +1,200 @@
/**
* Diff-based test selection for E2E and LLM-judge evals.
*
* Each test declares which source files it depends on ("touchfiles").
* The test runner checks `git diff` and only runs tests whose
* dependencies were modified. Override with EVALS_ALL=1 to run everything.
*/
import { spawnSync } from 'child_process';
// --- Glob matching ---
/**
 * Match a file path against a glob pattern.
 *
 * Supported wildcards:
 *   ** — match any run of characters, including `/` (spans path segments)
 *   *  — match any run of characters within a single segment (no `/`)
 *
 * Every other character in the pattern is matched literally. Regex
 * metacharacters such as `+`, `?`, `(`, `)`, `[`, `]`, `{`, `}`, `^`, `$`,
 * `|` and `\` are escaped, so a path like `app/(group)/page.tsx` or
 * `src/c++/main.ts` can be used as a pattern without being interpreted as
 * regex syntax.
 *
 * @param file    Path to test.
 * @param pattern Glob pattern to test it against.
 * @returns true when the whole of `file` matches `pattern`.
 */
export function matchGlob(file: string, pattern: string): boolean {
  // Single pass over the pattern: `**` is listed before `*` in the alternation
  // so a double star is consumed as one token instead of two single stars.
  const regexStr = pattern.replace(/\*\*|\*|[.+?^${}()|[\]\\]/g, token => {
    if (token === '**') return '.*';
    if (token === '*') return '[^/]*';
    return `\\${token}`;
  });
  // Anchor both ends so the pattern must cover the entire path.
  return new RegExp(`^${regexStr}$`).test(file);
}
// --- Touchfile maps ---
/**
 * E2E test touchfiles — keyed by testName (the string passed to runSkillTest).
 * Each test lists the file patterns that, if changed, require the test to run.
 *
 * Patterns are evaluated with matchGlob (via selectTests), so `**` spans
 * directories and `*` stays within one path segment. A test whose name is
 * missing from this map is never in the `selected` list returned by
 * selectTests unless the caller runs everything.
 */
export const E2E_TOUCHFILES: Record<string, string[]> = {
// Browse core
'browse-basic': ['browse/src/**'],
'browse-snapshot': ['browse/src/**'],
// SKILL.md setup + preamble (depend on ROOT SKILL.md only)
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'],
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'],
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'],
'contributor-mode': ['SKILL.md', 'SKILL.md.tmpl'],
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'],
// QA (the planted-bug evals also depend on their fixture page + ground-truth file)
'qa-quick': ['qa/**', 'browse/src/**'],
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
'qa-fix-loop': ['qa/**', 'browse/src/**'],
// Review (fixture-backed tests also list the fixture files they read)
'review-sql-injection': ['review/**', 'test/fixtures/review-eval-vuln.rb'],
'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'],
'review-base-branch': ['review/**'],
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
// Plan reviews
'plan-ceo-review': ['plan-ceo-review/**'],
'plan-ceo-review-selective': ['plan-ceo-review/**'],
'plan-eng-review': ['plan-eng-review/**'],
'plan-eng-review-artifact': ['plan-eng-review/**'],
// Ship
'ship-base-branch': ['ship/**'],
// Retro
'retro': ['retro/**'],
'retro-base-branch': ['retro/**'],
// Document-release
'document-release': ['document-release/**'],
// QA bootstrap
'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'],
// Ship coverage audit
'ship-coverage-audit': ['ship/**'],
// Design
'design-consultation-core': ['design-consultation/**'],
'design-consultation-research': ['design-consultation/**'],
'design-consultation-existing': ['design-consultation/**'],
'design-consultation-preview': ['design-consultation/**'],
'plan-design-review-plan-mode': ['plan-design-review/**'],
'plan-design-review-no-ui-scope': ['plan-design-review/**'],
'design-review-fix': ['design-review/**', 'browse/src/**'],
// gstack-upgrade
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
};
/**
 * LLM-judge test touchfiles — keyed by test description string.
 *
 * Same shape as E2E_TOUCHFILES: each entry maps a test to the file patterns
 * whose modification should cause that test to run. Most entries list the
 * generated SKILL.md next to its SKILL.md.tmpl template.
 */
export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
// Root SKILL.md / browse reference material
'command reference table': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts'],
'snapshot flags reference': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/snapshot.ts'],
'browse/SKILL.md reference': ['browse/SKILL.md', 'browse/SKILL.md.tmpl', 'browse/src/**'],
'setup block': ['SKILL.md', 'SKILL.md.tmpl'],
'regression vs baseline': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts', 'test/fixtures/eval-baselines.json'],
'qa/SKILL.md workflow': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
'qa/SKILL.md health rubric': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'baseline score pinning': ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'],
// Ship & Release
'ship/SKILL.md workflow': ['ship/SKILL.md', 'ship/SKILL.md.tmpl'],
'document-release/SKILL.md workflow': ['document-release/SKILL.md', 'document-release/SKILL.md.tmpl'],
// Plan Reviews
'plan-ceo-review/SKILL.md modes': ['plan-ceo-review/SKILL.md', 'plan-ceo-review/SKILL.md.tmpl'],
'plan-eng-review/SKILL.md sections': ['plan-eng-review/SKILL.md', 'plan-eng-review/SKILL.md.tmpl'],
'plan-design-review/SKILL.md passes': ['plan-design-review/SKILL.md', 'plan-design-review/SKILL.md.tmpl'],
// Design skills
'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
// Other skills
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
};
/**
 * Changes to any of these files trigger ALL tests (both E2E and LLM-judge).
 *
 * Used as the default `globalTouchfiles` argument of selectTests: a changed
 * file matching any entry short-circuits per-test matching and selects every
 * test in the map being evaluated.
 */
export const GLOBAL_TOUCHFILES = [
'test/helpers/session-runner.ts',
'test/helpers/eval-store.ts',
'test/helpers/llm-judge.ts',
'scripts/gen-skill-docs.ts',
// This module itself: editing the touchfile maps re-runs everything.
'test/helpers/touchfiles.ts',
'browse/test/test-server.ts',
];
// --- Base branch detection ---
/**
 * Find a ref that can serve as the diff base.
 *
 * Candidates are probed in priority order with `git rev-parse --verify`;
 * probing stops at the first one git accepts.
 *
 * @param cwd Directory in which git is invoked.
 * @returns The first ref that resolves, or null when none of them do.
 */
export function detectBaseBranch(cwd: string): string | null {
  const candidates = ['origin/main', 'origin/master', 'main', 'master'];
  const resolved = candidates.find(candidate => {
    const probe = spawnSync('git', ['rev-parse', '--verify', candidate], {
      cwd,
      stdio: 'pipe',
      timeout: 3000,
    });
    return probe.status === 0;
  });
  return resolved === undefined ? null : resolved;
}
/**
 * List the files that differ between the base branch and HEAD.
 *
 * Runs `git diff --name-only <baseBranch>...HEAD` in `cwd`.
 *
 * @param baseBranch Ref to diff against.
 * @param cwd        Directory in which git is invoked.
 * @returns One path per changed file; an empty array when git exits non-zero
 *          or reports no changes.
 */
export function getChangedFiles(baseBranch: string, cwd: string): string[] {
  const range = `${baseBranch}...HEAD`;
  const diff = spawnSync('git', ['diff', '--name-only', range], {
    cwd,
    stdio: 'pipe',
    timeout: 5000,
  });
  if (diff.status !== 0) {
    return [];
  }
  const lines = diff.stdout.toString().trim().split('\n');
  // Empty output splits into a single empty string — drop it.
  return lines.filter(line => Boolean(line));
}
// --- Test selection ---
/**
 * Decide which tests to run for a given set of changed files.
 *
 * Steps:
 *   1. The first changed file that matches a global touchfile selects every
 *      test and is reported in `reason` as `global: <file>`.
 *   2. Otherwise each test is selected when at least one changed file matches
 *      at least one of its patterns, and skipped when none do.
 *
 * @param changedFiles     Paths that changed.
 * @param touchfiles       Map of test name → patterns that trigger it.
 * @param globalTouchfiles Patterns that trigger every test.
 * @returns The selected and skipped test names plus the reason for the result.
 */
export function selectTests(
  changedFiles: string[],
  touchfiles: Record<string, string[]>,
  globalTouchfiles: string[] = GLOBAL_TOUCHFILES,
): { selected: string[]; skipped: string[]; reason: string } {
  const matchesAny = (file: string, patterns: string[]): boolean =>
    patterns.some(pattern => matchGlob(file, pattern));

  // A single global hit is enough to run the whole suite.
  const globalHit = changedFiles.find(file => matchesAny(file, globalTouchfiles));
  if (globalHit !== undefined) {
    return { selected: Object.keys(touchfiles), skipped: [], reason: `global: ${globalHit}` };
  }

  // No global hit: partition tests by whether any changed file touches them.
  const selected: string[] = [];
  const skipped: string[] = [];
  Object.entries(touchfiles).forEach(([testName, patterns]) => {
    const isTouched = changedFiles.some(file => matchesAny(file, patterns));
    if (isTouched) {
      selected.push(testName);
    } else {
      skipped.push(testName);
    }
  });
  return { selected, skipped, reason: 'diff' };
}
+386 -106
View File
@@ -1,10 +1,11 @@
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import type { SkillTestResult } from './helpers/session-runner';
import { outcomeJudge } from './helpers/llm-judge';
import { outcomeJudge, callJudge } from './helpers/llm-judge';
import { EvalCollector, judgePassed } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { startTestServer } from '../browse/test/test-server';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
@@ -21,6 +22,41 @@ const ROOT = path.resolve(import.meta.dir, '..');
const evalsEnabled = !!process.env.EVALS;
const describeE2E = evalsEnabled ? describe : describe.skip;
// --- Diff-based test selection ---
// Unless EVALS_ALL is set, restrict the run to tests whose touchfiles changed
// relative to the base branch. EVALS_BASE overrides the detected base branch;
// 'main' is the last-resort fallback when detection finds nothing.
let selectedTests: string[] | null = null; // null = run all

if (evalsEnabled && !process.env.EVALS_ALL) {
  const base = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
  const diffFiles = getChangedFiles(base, ROOT);

  // An empty diff (e.g., on main branch) leaves selectedTests null → run all.
  if (diffFiles.length > 0) {
    const { selected, skipped, reason } = selectTests(diffFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
    const totalTests = Object.keys(E2E_TOUCHFILES).length;
    selectedTests = selected;

    process.stderr.write(`\nE2E selection (${reason}): ${selected.length}/${totalTests} tests\n`);
    if (skipped.length > 0) {
      process.stderr.write(` Skipped: ${skipped.join(', ')}\n`);
    }
    process.stderr.write('\n');
  }
}
/**
 * Register a describe block that is skipped as a whole when none of the tests
 * it contains were picked by diff-based selection.
 *
 * @param name      Suite title.
 * @param testNames Touchfile keys of the tests inside the suite.
 * @param fn        Suite body.
 */
function describeIfSelected(name: string, testNames: string[], fn: () => void) {
  let anySelected = true; // no selection in effect → run everything
  if (selectedTests !== null) {
    const chosen = selectedTests;
    anySelected = testNames.some(testName => chosen.includes(testName));
  }
  const register = anySelected ? describeE2E : describe.skip;
  register(name, fn);
}
/**
 * Register a single test, skipping it when diff-based selection is in effect
 * and did not pick it (for describe blocks that hold several tests).
 *
 * @param testName Touchfile key; also used as the test title.
 * @param fn       Async test body.
 * @param timeout  Per-test timeout passed through to the test runner.
 */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  let shouldRun = true; // no selection in effect → run everything
  if (selectedTests !== null) {
    shouldRun = selectedTests.includes(testName);
  }
  const register = shouldRun ? test : test.skip;
  register(testName, fn, timeout);
}
// Eval result collector — accumulates test results, writes to ~/.gstack-dev/evals/ on finalize
const evalCollector = evalsEnabled ? new EvalCollector('e2e') : null;
@@ -133,7 +169,10 @@ if (evalsEnabled) {
}
}
describeE2E('Skill E2E tests', () => {
describeIfSelected('Skill E2E tests', [
'browse-basic', 'browse-snapshot', 'skillmd-setup-discovery',
'skillmd-no-local-binary', 'skillmd-outside-git', 'contributor-mode', 'session-awareness',
], () => {
beforeAll(() => {
testServer = startTestServer();
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
@@ -145,7 +184,7 @@ describeE2E('Skill E2E tests', () => {
try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
});
test('browse basic commands work without errors', async () => {
testIfSelected('browse-basic', async () => {
const result = await runSkillTest({
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run these commands in sequence:
1. $B goto ${testServer.url}
@@ -166,7 +205,7 @@ Report the results of each command.`,
expect(result.exitReason).toBe('success');
}, 90_000);
test('browse snapshot flags all work', async () => {
testIfSelected('browse-snapshot', async () => {
const result = await runSkillTest({
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run:
1. $B goto ${testServer.url}
@@ -191,7 +230,7 @@ Report what each command returned.`,
expect(result.exitReason).toBe('success');
}, 90_000);
test('agent discovers browse binary via SKILL.md setup block', async () => {
testIfSelected('skillmd-setup-discovery', async () => {
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = skillMd.indexOf('## SETUP');
const setupEnd = skillMd.indexOf('## IMPORTANT');
@@ -220,7 +259,7 @@ Report whether it worked.`,
expect(result.exitReason).toBe('success');
}, 90_000);
test('SKILL.md setup block handles missing local binary gracefully', async () => {
testIfSelected('skillmd-no-local-binary', async () => {
// Create a tmpdir with no browse binary — no local .claude/skills/gstack/browse/dist/browse
const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
@@ -255,7 +294,7 @@ Report the exact output. Do NOT try to fix or install anything — just report w
try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
}, 60_000);
test('SKILL.md setup block works outside git repo', async () => {
testIfSelected('skillmd-outside-git', async () => {
// Create a tmpdir outside any git repo
const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
@@ -286,7 +325,7 @@ Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
}, 60_000);
test('contributor mode files a report on gstack error', async () => {
testIfSelected('contributor-mode', async () => {
const contribDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-contrib-'));
const logsDir = path.join(contribDir, 'contributor-logs');
fs.mkdirSync(logsDir, { recursive: true });
@@ -342,7 +381,7 @@ File a contributor report about this issue. Then tell me what you filed.`,
try { fs.rmSync(contribDir, { recursive: true, force: true }); } catch {}
}, 90_000);
test('session awareness adds ELI16 context when _SESSIONS >= 3', async () => {
testIfSelected('session-awareness', async () => {
const sessionDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-session-'));
// Set up a git repo so there's project/branch context to reference
@@ -413,7 +452,7 @@ Remember: _SESSIONS=4, so ELI16 mode is active. The user is juggling multiple wi
// --- B4: QA skill E2E ---
describeE2E('QA skill E2E', () => {
describeIfSelected('QA skill E2E', ['qa-quick'], () => {
let qaDir: string;
beforeAll(() => {
@@ -468,7 +507,7 @@ Write your report to ${qaDir}/qa-reports/qa-report.md`,
// --- B5: Review skill E2E ---
describeE2E('Review skill E2E', () => {
describeIfSelected('Review skill E2E', ['review-sql-injection'], () => {
let reviewDir: string;
beforeAll(() => {
@@ -527,7 +566,7 @@ Write your review findings to ${reviewDir}/review-output.md`,
// --- Review: Enum completeness E2E ---
describeE2E('Review enum completeness E2E', () => {
describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'], () => {
let enumDir: string;
beforeAll(() => {
@@ -597,13 +636,107 @@ The diff adds a new "returned" status to the Order model. Your job is to check i
}, 120_000);
});
// --- Review: Design review lite E2E ---
// Builds a throwaway git repo whose feature branch adds the design-slop CSS/HTML
// fixtures, asks /review to review the diff, then keyword-scans the written
// review for the 7 planted design issues.
//
// Registered through describeIfSelected/testIfSelected under the touchfile key
// 'review-design-lite' so it takes part in diff-based selection like the other
// suites in this file, instead of running on every eval invocation.
describeIfSelected('Review design lite E2E', ['review-design-lite'], () => {
  let designDir: string;

  beforeAll(() => {
    designDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-design-lite-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: designDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    // Commit clean base on main
    fs.writeFileSync(path.join(designDir, 'index.html'), '<h1>Clean</h1>\n');
    fs.writeFileSync(path.join(designDir, 'styles.css'), 'body { font-size: 16px; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

    // Feature branch adds AI slop CSS + HTML
    run('git', ['checkout', '-b', 'feature/add-landing-page']);
    const slopCss = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-design-slop.css'), 'utf-8');
    const slopHtml = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-design-slop.html'), 'utf-8');
    fs.writeFileSync(path.join(designDir, 'styles.css'), slopCss);
    fs.writeFileSync(path.join(designDir, 'landing.html'), slopHtml);
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'add landing page']);

    // Copy review skill files (written after the commit, so they stay out of the diff)
    fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(designDir, 'review-SKILL.md'));
    fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(designDir, 'review-checklist.md'));
    fs.copyFileSync(path.join(ROOT, 'review', 'design-checklist.md'), path.join(designDir, 'review-design-checklist.md'));
    fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(designDir, 'review-greptile-triage.md'));
  });

  afterAll(() => {
    // Best-effort cleanup of the scratch repo.
    try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
  });

  testIfSelected('review-design-lite', async () => {
    const result = await runSkillTest({
      prompt: `You are in a git repo on branch feature/add-landing-page with changes against main.
Read review-SKILL.md for the review workflow instructions.
Read review-checklist.md for the code review checklist.
Read review-design-checklist.md for the design review checklist.
Run /review on the current diff (git diff main...HEAD).
The diff adds a landing page with CSS and HTML. Check for both code issues AND design anti-patterns.
Write your review findings to ${designDir}/review-output.md
Important: The design checklist should catch issues like blacklisted fonts, small font sizes, outline:none, !important, AI slop patterns (purple gradients, generic hero copy, 3-column feature grid), etc.`,
      workingDirectory: designDir,
      maxTurns: 15,
      timeout: 120_000,
      testName: 'review-design-lite',
      runId,
    });

    logCost('/review design lite', result);
    recordE2E('/review design lite', 'Review design lite E2E', result);
    expect(result.exitReason).toBe('success');

    // Verify the review caught at least 4 of 7 planted design issues.
    // The scan only runs when the review file was actually written.
    const reviewPath = path.join(designDir, 'review-output.md');
    if (fs.existsSync(reviewPath)) {
      // Lower-cased once so every keyword check below is case-insensitive.
      const review = fs.readFileSync(reviewPath, 'utf-8').toLowerCase();
      let detected = 0;

      // Issue 1: Blacklisted font (Papyrus) — HIGH
      if (review.includes('papyrus') || review.includes('blacklisted font') || review.includes('font family')) detected++;
      // Issue 2: Body text < 16px — HIGH
      if (review.includes('14px') || review.includes('font-size') || review.includes('font size') || review.includes('body text')) detected++;
      // Issue 3: outline: none — HIGH
      if (review.includes('outline') || review.includes('focus')) detected++;
      // Issue 4: !important — HIGH
      if (review.includes('!important') || review.includes('important')) detected++;
      // Issue 5: Purple gradient — MEDIUM
      if (review.includes('gradient') || review.includes('purple') || review.includes('violet') || review.includes('#6366f1') || review.includes('#8b5cf6')) detected++;
      // Issue 6: Generic hero copy — MEDIUM
      if (review.includes('welcome to') || review.includes('all-in-one') || review.includes('generic') || review.includes('hero copy') || review.includes('ai slop')) detected++;
      // Issue 7: 3-column feature grid — LOW
      if (review.includes('3-column') || review.includes('three-column') || review.includes('feature grid') || review.includes('icon') || review.includes('circle')) detected++;

      console.log(`Design review detected ${detected}/7 planted issues`);
      expect(detected).toBeGreaterThanOrEqual(4);
    }
  }, 150_000);
});
// --- B6/B7/B8: Planted-bug outcome evals ---
// Outcome evals also need ANTHROPIC_API_KEY for the LLM judge
const hasApiKey = !!process.env.ANTHROPIC_API_KEY;
const describeOutcome = (evalsEnabled && hasApiKey) ? describe : describe.skip;
describeOutcome('Planted-bug outcome evals', () => {
// Wrap describeOutcome with selection — skip if no planted-bug tests are selected
const outcomeTestNames = ['qa-b6-static', 'qa-b7-spa', 'qa-b8-checkout'];
const anyOutcomeSelected = selectedTests === null || outcomeTestNames.some(t => selectedTests!.includes(t));
(anyOutcomeSelected ? describeOutcome : describe.skip)('Planted-bug outcome evals', () => {
let outcomeDir: string;
beforeAll(() => {
@@ -767,7 +900,7 @@ CRITICAL RULES:
// --- Plan CEO Review E2E ---
describeE2E('Plan CEO Review E2E', () => {
describeIfSelected('Plan CEO Review E2E', ['plan-ceo-review'], () => {
let planDir: string;
beforeAll(() => {
@@ -854,7 +987,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
// --- Plan CEO Review (SELECTIVE EXPANSION) E2E ---
describeE2E('Plan CEO Review SELECTIVE EXPANSION E2E', () => {
describeIfSelected('Plan CEO Review SELECTIVE EXPANSION E2E', ['plan-ceo-review-selective'], () => {
let planDir: string;
beforeAll(() => {
@@ -937,7 +1070,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
// --- Plan Eng Review E2E ---
describeE2E('Plan Eng Review E2E', () => {
describeIfSelected('Plan Eng Review E2E', ['plan-eng-review'], () => {
let planDir: string;
beforeAll(() => {
@@ -1031,7 +1164,7 @@ Focus on architecture, code quality, tests, and performance sections.`,
// --- Retro E2E ---
describeE2E('Retro E2E', () => {
describeIfSelected('Retro E2E', ['retro'], () => {
let retroDir: string;
beforeAll(() => {
@@ -1117,7 +1250,7 @@ Analyze the git history and produce the narrative report as described in the SKI
// --- QA-Only E2E (report-only, no fixes) ---
describeE2E('QA-Only skill E2E', () => {
describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => {
let qaOnlyDir: string;
beforeAll(() => {
@@ -1203,7 +1336,7 @@ Write your report to ${qaOnlyDir}/qa-reports/qa-only-report.md`,
// --- QA Fix Loop E2E ---
describeE2E('QA Fix Loop E2E', () => {
describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => {
let qaFixDir: string;
let qaFixServer: ReturnType<typeof Bun.serve> | null = null;
@@ -1317,7 +1450,7 @@ This is a test+fix loop: find bugs, fix them in the source code, commit each fix
// --- Plan-Eng-Review Test-Plan Artifact E2E ---
describeE2E('Plan-Eng-Review Test-Plan Artifact E2E', () => {
describeIfSelected('Plan-Eng-Review Test-Plan Artifact E2E', ['plan-eng-review-artifact'], () => {
let planDir: string;
let projectDir: string;
@@ -1444,7 +1577,7 @@ Write your review to ${planDir}/review-output.md`,
// --- Base branch detection smoke tests ---
describeE2E('Base branch detection', () => {
describeIfSelected('Base branch detection', ['review-base-branch', 'ship-base-branch', 'retro-base-branch'], () => {
let baseBranchDir: string;
const run = (cmd: string, args: string[], cwd: string) =>
spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });
@@ -1457,7 +1590,7 @@ describeE2E('Base branch detection', () => {
try { fs.rmSync(baseBranchDir, { recursive: true, force: true }); } catch {}
});
test('/review detects base branch and diffs against it', async () => {
testIfSelected('review-base-branch', async () => {
const dir = path.join(baseBranchDir, 'review-base');
fs.mkdirSync(dir, { recursive: true });
@@ -1510,7 +1643,7 @@ Write your findings to ${dir}/review-output.md`,
expect(usedGitDiff).toBe(true);
}, 120_000);
test('/ship Step 0-1 detects base branch without destructive actions', async () => {
testIfSelected('ship-base-branch', async () => {
const dir = path.join(baseBranchDir, 'ship-base');
fs.mkdirSync(dir, { recursive: true });
@@ -1572,7 +1705,7 @@ Write a summary of what you detected to ${dir}/ship-preflight.md including:
expect(destructiveTools).toHaveLength(0);
}, 90_000);
test('/retro detects default branch for git queries', async () => {
testIfSelected('retro-base-branch', async () => {
const dir = path.join(baseBranchDir, 'retro-base');
fs.mkdirSync(dir, { recursive: true });
@@ -1631,7 +1764,7 @@ Write your retrospective to ${dir}/retro-output.md`,
// --- Document-Release skill E2E ---
describeE2E('Document-Release skill E2E', () => {
describeIfSelected('Document-Release skill E2E', ['document-release'], () => {
let docReleaseDir: string;
beforeAll(() => {
@@ -1735,6 +1868,7 @@ IMPORTANT:
// --- Deferred skill E2E tests (destructive or require interactive UI) ---
// Deferred tests — only test.todo entries, no selection needed
describeE2E('Deferred skill E2E', () => {
// Ship is destructive: pushes to remote, creates PRs, modifies VERSION/CHANGELOG
test.todo('/ship completes full workflow');
@@ -1742,8 +1876,120 @@ describeE2E('Deferred skill E2E', () => {
// Setup-browser-cookies requires interactive browser picker UI
test.todo('/setup-browser-cookies imports cookies');
// Gstack-upgrade is destructive: modifies skill installation directory
test.todo('/gstack-upgrade completes upgrade flow');
});
// --- gstack-upgrade E2E ---
// Simulates a local-git gstack install that is one release behind its remote:
// a bare repo acts as origin and holds 0.6.0, while the working copy under
// .claude/skills/gstack is reset back to 0.5.0. The agent is then asked to run
// the standalone upgrade flow, and the test checks that VERSION ends up at 0.6.0.
describeIfSelected('gstack-upgrade E2E', ['gstack-upgrade-happy-path'], () => {
  let upgradeDir: string;
  let remoteDir: string;

  beforeAll(() => {
    upgradeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-upgrade-'));
    remoteDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-remote-'));
    const run = (cmd: string, args: string[], cwd: string) =>
      spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });

    // Init the "project" repo
    run('git', ['init'], upgradeDir);
    run('git', ['config', 'user.email', 'test@test.com'], upgradeDir);
    run('git', ['config', 'user.name', 'Test'], upgradeDir);

    // Create mock gstack install directory (local-git type)
    const mockGstack = path.join(upgradeDir, '.claude', 'skills', 'gstack');
    fs.mkdirSync(mockGstack, { recursive: true });

    // Init as a git repo
    run('git', ['init'], mockGstack);
    run('git', ['config', 'user.email', 'test@test.com'], mockGstack);
    run('git', ['config', 'user.name', 'Test'], mockGstack);

    // Create bare remote
    run('git', ['init', '--bare'], remoteDir);
    run('git', ['remote', 'add', 'origin', remoteDir], mockGstack);

    // Write old version files
    fs.writeFileSync(path.join(mockGstack, 'VERSION'), '0.5.0\n');
    fs.writeFileSync(path.join(mockGstack, 'CHANGELOG.md'),
      '# Changelog\n\n## 0.5.0 — 2026-03-01\n\n- Initial release\n');
    fs.writeFileSync(path.join(mockGstack, 'setup'),
      '#!/bin/bash\necho "Setup completed"\n', { mode: 0o755 });

    // Initial commit + push
    run('git', ['add', '.'], mockGstack);
    run('git', ['commit', '-m', 'initial'], mockGstack);
    run('git', ['push', '-u', 'origin', 'HEAD:main'], mockGstack);

    // Create new version (simulate upstream release)
    fs.writeFileSync(path.join(mockGstack, 'VERSION'), '0.6.0\n');
    fs.writeFileSync(path.join(mockGstack, 'CHANGELOG.md'),
      '# Changelog\n\n## 0.6.0 — 2026-03-15\n\n- New feature: interactive design review\n- Fix: snapshot flag validation\n\n## 0.5.0 — 2026-03-01\n\n- Initial release\n');
    run('git', ['add', '.'], mockGstack);
    run('git', ['commit', '-m', 'release 0.6.0'], mockGstack);
    run('git', ['push', 'origin', 'HEAD:main'], mockGstack);

    // Reset working copy back to old version (the remote keeps 0.6.0)
    run('git', ['reset', '--hard', 'HEAD~1'], mockGstack);

    // Copy gstack-upgrade skill
    fs.mkdirSync(path.join(upgradeDir, 'gstack-upgrade'), { recursive: true });
    fs.copyFileSync(
      path.join(ROOT, 'gstack-upgrade', 'SKILL.md'),
      path.join(upgradeDir, 'gstack-upgrade', 'SKILL.md'),
    );

    // Commit so git repo is clean
    run('git', ['add', '.'], upgradeDir);
    run('git', ['commit', '-m', 'initial project'], upgradeDir);
  });

  afterAll(() => {
    // Best-effort cleanup of both scratch directories.
    try { fs.rmSync(upgradeDir, { recursive: true, force: true }); } catch {}
    try { fs.rmSync(remoteDir, { recursive: true, force: true }); } catch {}
  });

  testIfSelected('gstack-upgrade-happy-path', async () => {
    const mockGstack = path.join(upgradeDir, '.claude', 'skills', 'gstack');

    const result = await runSkillTest({
      prompt: `Read gstack-upgrade/SKILL.md for the upgrade workflow.
You are running /gstack-upgrade standalone. The gstack installation is at ./.claude/skills/gstack (local-git type — it has a .git directory with an origin remote).
Current version: 0.5.0. A new version 0.6.0 is available on origin/main.
Follow the standalone upgrade flow:
1. Detect install type (local-git)
2. Run git fetch origin && git reset --hard origin/main in the install directory
3. Run the setup script
4. Show what's new from CHANGELOG
Skip any AskUserQuestion calls — auto-approve the upgrade. Write a summary of what you did to stdout.
IMPORTANT: The install directory is at ./.claude/skills/gstack — use that exact path.`,
      workingDirectory: upgradeDir,
      maxTurns: 20,
      timeout: 180_000,
      testName: 'gstack-upgrade-happy-path',
      runId,
    });

    logCost('/gstack-upgrade happy path', result);

    // Check that the version was updated. The VERSION file on disk is the
    // source of truth; the agent's textual output is not inspected.
    const versionAfter = fs.readFileSync(path.join(mockGstack, 'VERSION'), 'utf-8').trim();

    // Running out of turns is tolerated as long as the upgrade itself landed.
    recordE2E('/gstack-upgrade happy path', 'gstack-upgrade E2E', result, {
      passed: versionAfter === '0.6.0' && ['success', 'error_max_turns'].includes(result.exitReason),
    });

    expect(['success', 'error_max_turns']).toContain(result.exitReason);
    expect(versionAfter).toBe('0.6.0');
  }, 240_000);
});
// --- Design Consultation E2E ---
@@ -1772,7 +2018,10 @@ ${designMd}
Return JSON: { "passed": true/false, "reasoning": "one paragraph explaining your evaluation" }`);
}
describeE2E('Design Consultation E2E', () => {
describeIfSelected('Design Consultation E2E', [
'design-consultation-core', 'design-consultation-research',
'design-consultation-existing', 'design-consultation-preview',
], () => {
let designDir: string;
beforeAll(() => {
@@ -1816,7 +2065,7 @@ A civic tech data platform for government employees to access, visualize, and sh
try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
});
test('Test 1: core flow produces valid DESIGN.md + CLAUDE.md', async () => {
testIfSelected('design-consultation-core', async () => {
const result = await runSkillTest({
prompt: `Read design-consultation/SKILL.md for the design consultation workflow.
@@ -1876,7 +2125,7 @@ Write DESIGN.md and CLAUDE.md (or update it) in the working directory.`,
}
}, 420_000);
test('Test 2: research integration uses WebSearch', async () => {
testIfSelected('design-consultation-research', async () => {
// Clean up from previous test
try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {}
try { fs.unlinkSync(path.join(designDir, 'CLAUDE.md')); } catch {}
@@ -1933,7 +2182,7 @@ Write DESIGN.md to the working directory.`,
expect(designExists).toBe(true);
}, 420_000);
test('Test 3: handles existing DESIGN.md', async () => {
testIfSelected('design-consultation-existing', async () => {
// Pre-create a minimal DESIGN.md
fs.writeFileSync(path.join(designDir, 'DESIGN.md'), `# Design System — CivicPulse
@@ -1979,7 +2228,7 @@ Skip research. Skip font preview. Skip any AskUserQuestion calls — this is non
}
}, 420_000);
test('Test 4: generates font + color preview HTML', async () => {
testIfSelected('design-consultation-preview', async () => {
// Clean up
try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {}
@@ -2041,15 +2290,13 @@ Skip research. Skip any AskUserQuestion calls — this is non-interactive. Gener
}, 420_000);
});
// --- Plan Design Review E2E ---
// --- Plan Design Review E2E (plan-mode) ---
describeE2E('Plan Design Review E2E', () => {
describeIfSelected('Plan Design Review E2E', ['plan-design-review-plan-mode', 'plan-design-review-no-ui-scope'], () => {
let reviewDir: string;
beforeAll(() => {
testServer = testServer || startTestServer();
reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-design-'));
setupBrowseShims(reviewDir);
const { spawnSync } = require('child_process');
const run = (cmd: string, args: string[]) =>
@@ -2058,9 +2305,6 @@ describeE2E('Plan Design Review E2E', () => {
run('git', ['init']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(reviewDir, 'index.html'), '<h1>Test</h1>\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
// Copy plan-design-review skill
fs.mkdirSync(path.join(reviewDir, 'plan-design-review'), { recursive: true });
@@ -2068,100 +2312,136 @@ describeE2E('Plan Design Review E2E', () => {
path.join(ROOT, 'plan-design-review', 'SKILL.md'),
path.join(reviewDir, 'plan-design-review', 'SKILL.md'),
);
// Create a plan file with intentional design gaps
fs.writeFileSync(path.join(reviewDir, 'plan.md'), `# Plan: User Dashboard
## Context
Build a user dashboard that shows account stats, recent activity, and settings.
## Implementation
1. Create a dashboard page at /dashboard
2. Show user stats (posts, followers, engagement rate)
3. Add a recent activity feed
4. Add a settings panel
5. Use a clean, modern UI with cards and icons
6. Add a hero section at the top with a gradient background
## Technical Details
- React components with Tailwind CSS
- API endpoint: GET /api/dashboard
- WebSocket for real-time activity updates
`);
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial plan']);
});
afterAll(() => {
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
});
test('Test 5: /plan-design-review produces audit report', async () => {
testIfSelected('plan-design-review-plan-mode', async () => {
const result = await runSkillTest({
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
prompt: `Read plan-design-review/SKILL.md for the design review workflow.
B="${browseBin}"
Review the plan in ./plan.md. This plan has several design gaps — it uses vague language like "clean, modern UI" and "cards and icons", mentions a "hero section with gradient" (AI slop), and doesn't specify empty states, error states, loading states, responsive behavior, or accessibility.
Read plan-design-review/SKILL.md for the design review workflow.
Skip the preamble bash block. Skip any AskUserQuestion calls — this is non-interactive. Rate each design dimension 0-10 and explain what would make it a 10. Then EDIT plan.md to add the missing design decisions (interaction state table, empty states, responsive behavior, etc.).
Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Skip any AskUserQuestion calls — this is non-interactive. Write your audit report to ./design-audit.md. Do not offer to create DESIGN.md.`,
IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan review, not a live site audit. Just read the plan file, review it, and edit it to fix the gaps.`,
workingDirectory: reviewDir,
maxTurns: 20,
timeout: 360_000,
testName: 'plan-design-review-audit',
maxTurns: 15,
timeout: 300_000,
testName: 'plan-design-review-plan-mode',
runId,
});
logCost('/plan-design-review audit', result);
logCost('/plan-design-review plan-mode', result);
const reportPath = path.join(reviewDir, 'design-audit.md');
const reportExists = fs.existsSync(reportPath);
let reportContent = '';
if (reportExists) {
reportContent = fs.readFileSync(reportPath, 'utf-8');
}
// Check that the agent produced design ratings (0-10 scale)
const output = result.output || '';
const hasRatings = /\d+\/10/.test(output);
const hasDesignContent = output.toLowerCase().includes('information architecture') ||
output.toLowerCase().includes('interaction state') ||
output.toLowerCase().includes('ai slop') ||
output.toLowerCase().includes('hierarchy');
const hasFirstImpression = reportContent.toLowerCase().includes('first impression') ||
reportContent.toLowerCase().includes('impression');
// Check that the plan file was edited (the core new behavior)
const planAfter = fs.readFileSync(path.join(reviewDir, 'plan.md'), 'utf-8');
const planOriginal = `# Plan: User Dashboard`;
const planWasEdited = planAfter.length > 300; // Original is ~450 chars, edited should be much longer
const planHasDesignAdditions = planAfter.toLowerCase().includes('empty') ||
planAfter.toLowerCase().includes('loading') ||
planAfter.toLowerCase().includes('error') ||
planAfter.toLowerCase().includes('state') ||
planAfter.toLowerCase().includes('responsive') ||
planAfter.toLowerCase().includes('accessibility');
recordE2E('/plan-design-review audit', 'Plan Design Review E2E', result, {
passed: reportExists && ['success', 'error_max_turns'].includes(result.exitReason),
recordE2E('/plan-design-review plan-mode', 'Plan Design Review E2E', result, {
passed: hasDesignContent && planWasEdited && ['success', 'error_max_turns'].includes(result.exitReason),
});
expect(['success', 'error_max_turns']).toContain(result.exitReason);
expect(reportExists).toBe(true);
if (reportExists) {
expect(reportContent.length).toBeGreaterThan(200);
}
}, 420_000);
// Agent should produce design-relevant output about the plan
expect(hasDesignContent).toBe(true);
// Agent should have edited the plan file to add missing design decisions
expect(planWasEdited).toBe(true);
expect(planHasDesignAdditions).toBe(true);
}, 360_000);
test('Test 6: /plan-design-review exports DESIGN.md', async () => {
// Clean up previous test artifacts
try { fs.unlinkSync(path.join(reviewDir, 'design-audit.md')); } catch {}
testIfSelected('plan-design-review-no-ui-scope', async () => {
// Write a backend-only plan
fs.writeFileSync(path.join(reviewDir, 'backend-plan.md'), `# Plan: Database Migration
## Context
Migrate user records from PostgreSQL to a new schema with better indexing.
## Implementation
1. Create migration to add new columns to users table
2. Backfill data from legacy columns
3. Add database indexes for common query patterns
4. Update ActiveRecord models
5. Run migration in staging first, then production
`);
const result = await runSkillTest({
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
prompt: `Read plan-design-review/SKILL.md for the design review workflow.
B="${browseBin}"
Review the plan in ./backend-plan.md. This is a pure backend database migration plan with no UI changes.
Read plan-design-review/SKILL.md for the design review workflow.
Skip the preamble bash block. Skip any AskUserQuestion calls — this is non-interactive. Write your findings directly to stdout.
Review ${testServer.url} with --quick mode. Skip any AskUserQuestion calls — this is non-interactive. After Phase 2 (Design System Extraction), write a DESIGN.md to the working directory. Also write the audit report to ./design-audit.md.`,
IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan review, not a live site audit.`,
workingDirectory: reviewDir,
maxTurns: 25,
timeout: 360_000,
testName: 'plan-design-review-export',
maxTurns: 10,
timeout: 180_000,
testName: 'plan-design-review-no-ui-scope',
runId,
});
logCost('/plan-design-review export', result);
logCost('/plan-design-review no-ui-scope', result);
const designPath = path.join(reviewDir, 'DESIGN.md');
const reportPath = path.join(reviewDir, 'design-audit.md');
const designExists = fs.existsSync(designPath);
const reportExists = fs.existsSync(reportPath);
// Agent should detect no UI scope and exit early
const output = result.output || '';
const detectsNoUI = output.toLowerCase().includes('no ui') ||
output.toLowerCase().includes('no frontend') ||
output.toLowerCase().includes('no design') ||
output.toLowerCase().includes('not applicable') ||
output.toLowerCase().includes('backend');
let designContent = '';
if (designExists) {
designContent = fs.readFileSync(designPath, 'utf-8');
}
const hasTypography = designContent.toLowerCase().includes('typography') || designContent.toLowerCase().includes('font');
const hasColor = designContent.toLowerCase().includes('color');
recordE2E('/plan-design-review export', 'Plan Design Review E2E', result, {
passed: designExists && ['success', 'error_max_turns'].includes(result.exitReason),
recordE2E('/plan-design-review no-ui-scope', 'Plan Design Review E2E', result, {
passed: detectsNoUI && ['success', 'error_max_turns'].includes(result.exitReason),
});
expect(['success', 'error_max_turns']).toContain(result.exitReason);
// DESIGN.md export is best-effort — agent may not always produce it
if (designExists) {
expect(hasTypography || hasColor).toBe(true);
}
}, 420_000);
expect(detectsNoUI).toBe(true);
}, 240_000);
});
// --- QA Design Review E2E ---
// --- Design Review E2E (live-site audit + fix) ---
describeE2E('QA Design Review E2E', () => {
describeIfSelected('Design Review E2E', ['design-review-fix'], () => {
let qaDesignDir: string;
let qaDesignServer: ReturnType<typeof Bun.serve> | null = null;
@@ -2237,11 +2517,11 @@ describeE2E('QA Design Review E2E', () => {
},
});
// Copy qa-design-review skill
fs.mkdirSync(path.join(qaDesignDir, 'qa-design-review'), { recursive: true });
// Copy design-review skill
fs.mkdirSync(path.join(qaDesignDir, 'design-review'), { recursive: true });
fs.copyFileSync(
path.join(ROOT, 'qa-design-review', 'SKILL.md'),
path.join(qaDesignDir, 'qa-design-review', 'SKILL.md'),
path.join(ROOT, 'design-review', 'SKILL.md'),
path.join(qaDesignDir, 'design-review', 'SKILL.md'),
);
});
@@ -2250,7 +2530,7 @@ describeE2E('QA Design Review E2E', () => {
try { fs.rmSync(qaDesignDir, { recursive: true, force: true }); } catch {}
});
test('Test 7: /qa-design-review audits and fixes design issues', async () => {
test('Test 7: /design-review audits and fixes design issues', async () => {
const serverUrl = `http://localhost:${(qaDesignServer as any)?.port}`;
const result = await runSkillTest({
@@ -2258,17 +2538,17 @@ describeE2E('QA Design Review E2E', () => {
B="${browseBin}"
Read qa-design-review/SKILL.md for the design review + fix workflow.
Read design-review/SKILL.md for the design review + fix workflow.
Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion calls — this is non-interactive. Fix up to 3 issues max. Write your report to ./design-audit.md.`,
workingDirectory: qaDesignDir,
maxTurns: 30,
timeout: 360_000,
testName: 'qa-design-review-fix',
testName: 'design-review-fix',
runId,
});
logCost('/qa-design-review fix', result);
logCost('/design-review fix', result);
const reportPath = path.join(qaDesignDir, 'design-audit.md');
const reportExists = fs.existsSync(reportPath);
@@ -2280,7 +2560,7 @@ Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion call
const commits = gitLog.stdout.toString().trim().split('\n');
const designFixCommits = commits.filter((c: string) => c.includes('style(design)'));
recordE2E('/qa-design-review fix', 'QA Design Review E2E', result, {
recordE2E('/design-review fix', 'Design Review E2E', result, {
passed: ['success', 'error_max_turns'].includes(result.exitReason),
});
@@ -2300,7 +2580,7 @@ Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion call
// --- Test Bootstrap E2E ---
describeE2E('Test Bootstrap E2E', () => {
describeIfSelected('Test Bootstrap E2E', ['qa-bootstrap'], () => {
let bootstrapDir: string;
let bootstrapServer: ReturnType<typeof Bun.serve>;
@@ -2437,7 +2717,7 @@ This is a test+fix loop: find bugs, fix them, write regression tests, commit eac
// --- Test Coverage Audit E2E ---
describeE2E('Test Coverage Audit E2E', () => {
describeIfSelected('Test Coverage Audit E2E', ['ship-coverage-audit'], () => {
let coverageDir: string;
beforeAll(() => {
+253 -13
View File
@@ -17,6 +17,7 @@ import * as path from 'path';
import { callJudge, judge } from './helpers/llm-judge';
import type { JudgeScore } from './helpers/llm-judge';
import { EvalCollector } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
const ROOT = path.resolve(import.meta.dir, '..');
// Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
@@ -26,8 +27,43 @@ const describeEval = evalsEnabled ? describe : describe.skip;
// Eval result collector
const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null;
describeEval('LLM-as-judge quality evals', () => {
test('command reference table scores >= 4 on all dimensions', async () => {
// --- Diff-based test selection ---
// `null` means "no filtering": every test runs. It stays null when evals are
// disabled, when EVALS_ALL is set, or when no changed files are reported.
let selectedTests: string[] | null = null;

if (evalsEnabled && !process.env.EVALS_ALL) {
  // Base branch precedence: explicit EVALS_BASE, then detection, then 'main'.
  const baseBranch = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
  const changedFiles = getChangedFiles(baseBranch, ROOT);

  if (changedFiles.length > 0) {
    const { selected, skipped, reason } = selectTests(changedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES);
    selectedTests = selected;

    const totalTests = Object.keys(LLM_JUDGE_TOUCHFILES).length;
    process.stderr.write(`\nLLM-judge selection (${reason}): ${selected.length}/${totalTests} tests\n`);
    if (skipped.length > 0) {
      process.stderr.write(` Skipped: ${skipped.join(', ')}\n`);
    }
    process.stderr.write('\n');
  }
}
/**
 * Registers a describe block under `describeEval`, or under `describe.skip`
 * when a test selection is active and none of `testNames` is in it.
 */
function describeIfSelected(name: string, testNames: string[], fn: () => void) {
  const selection = selectedTests;
  const runBlock = selection === null || testNames.some(t => selection.includes(t));
  const register = runBlock ? describeEval : describe.skip;
  register(name, fn);
}
/**
 * Registers one test by name, using `test.skip` when a test selection is
 * active and `testName` is not part of it (for multi-test describe blocks).
 */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  const excluded = selectedTests !== null && !selectedTests.includes(testName);
  const register = excluded ? test.skip : test;
  register(testName, fn, timeout);
}
describeIfSelected('LLM-as-judge quality evals', [
'command reference table', 'snapshot flags reference',
'browse/SKILL.md reference', 'setup block', 'regression vs baseline',
], () => {
testIfSelected('command reference table', async () => {
const t0 = Date.now();
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const start = content.indexOf('## Command Reference');
@@ -53,7 +89,7 @@ describeEval('LLM-as-judge quality evals', () => {
expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
test('snapshot flags section scores >= 4 on all dimensions', async () => {
testIfSelected('snapshot flags reference', async () => {
const t0 = Date.now();
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const start = content.indexOf('## Snapshot System');
@@ -79,7 +115,7 @@ describeEval('LLM-as-judge quality evals', () => {
expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
test('browse/SKILL.md overall scores >= 4', async () => {
testIfSelected('browse/SKILL.md reference', async () => {
const t0 = Date.now();
const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8');
const start = content.indexOf('## Snapshot Flags');
@@ -104,7 +140,7 @@ describeEval('LLM-as-judge quality evals', () => {
expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
test('setup block scores >= 3 on actionability and clarity', async () => {
testIfSelected('setup block', async () => {
const t0 = Date.now();
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = content.indexOf('## SETUP');
@@ -131,7 +167,7 @@ describeEval('LLM-as-judge quality evals', () => {
expect(scores.clarity).toBeGreaterThanOrEqual(3);
}, 30_000);
test('regression check: compare branch vs baseline quality', async () => {
testIfSelected('regression vs baseline', async () => {
const t0 = Date.now();
const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const genStart = generated.indexOf('## Command Reference');
@@ -220,10 +256,10 @@ Scores are 1-5 overall quality.`,
// --- Part 7: QA skill quality evals (C6) ---
describeEval('QA skill quality evals', () => {
describeIfSelected('QA skill quality evals', ['qa/SKILL.md workflow', 'qa/SKILL.md health rubric'], () => {
const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
test('qa/SKILL.md workflow quality scores >= 4', async () => {
testIfSelected('qa/SKILL.md workflow', async () => {
const t0 = Date.now();
const start = qaContent.indexOf('## Workflow');
const end = qaContent.indexOf('## Health Score Rubric');
@@ -266,7 +302,7 @@ ${section}`);
expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
test('qa/SKILL.md health score rubric is unambiguous', async () => {
testIfSelected('qa/SKILL.md health rubric', async () => {
const t0 = Date.now();
const start = qaContent.indexOf('## Health Score Rubric');
const section = qaContent.slice(start);
@@ -310,8 +346,8 @@ ${section}`);
// --- Part 7: Cross-skill consistency judge (C7) ---
describeEval('Cross-skill consistency evals', () => {
test('greptile-history patterns are consistent across all skills', async () => {
describeIfSelected('Cross-skill consistency evals', ['cross-skill greptile consistency'], () => {
testIfSelected('cross-skill greptile consistency', async () => {
const t0 = Date.now();
const reviewContent = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
@@ -375,10 +411,10 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`);
// --- Part 7: Baseline score pinning (C9) ---
describeEval('Baseline score pinning', () => {
describeIfSelected('Baseline score pinning', ['baseline score pinning'], () => {
const baselinesPath = path.join(ROOT, 'test', 'fixtures', 'eval-baselines.json');
test('LLM eval scores do not regress below baselines', async () => {
testIfSelected('baseline score pinning', async () => {
const t0 = Date.now();
if (!fs.existsSync(baselinesPath)) {
console.log('No baseline file found — skipping pinning check');
@@ -428,6 +464,210 @@ describeEval('Baseline score pinning', () => {
}, 60_000);
});
// --- Workflow SKILL.md quality evals (10 new tests for 100% coverage) ---
/**
 * Shared driver for the workflow SKILL.md judge tests.
 *
 * Reads `opts.skillPath` (relative to ROOT), cuts out the text from
 * `startMarker` up to (not including) `endMarker` — or to end of file when
 * `endMarker` is null — asks the judge to score it, records the result on the
 * eval collector, and asserts each score meets its threshold.
 *
 * Thresholds default to clarity 4, completeness 3, actionability 4.
 * Throws if either marker cannot be found in the file.
 */
async function runWorkflowJudge(opts: {
  testName: string;
  suite: string;
  skillPath: string;
  startMarker: string;
  endMarker: string | null;
  judgeContext: string;
  judgeGoal: string;
  thresholds?: { clarity: number; completeness: number; actionability: number };
}) {
  const startedAt = Date.now();
  const minimums = { clarity: 4, completeness: 3, actionability: 4, ...opts.thresholds };

  const doc = fs.readFileSync(path.join(ROOT, opts.skillPath), 'utf-8');
  const from = doc.indexOf(opts.startMarker);
  if (from === -1) throw new Error(`Start marker not found in ${opts.skillPath}: "${opts.startMarker}"`);

  // An undefined end index makes slice() run to the end of the document.
  let to: number | undefined;
  if (opts.endMarker) {
    // Search only after the start marker so an earlier occurrence is not picked up.
    to = doc.indexOf(opts.endMarker, from);
    if (to === -1) throw new Error(`End marker not found in ${opts.skillPath}: "${opts.endMarker}"`);
  }
  const section = doc.slice(from, to);

  const scores = await callJudge<JudgeScore>(`You are evaluating the quality of ${opts.judgeContext} for an AI coding agent.
The agent reads this document to learn ${opts.judgeGoal}. It references external tools and files
that are documented separately do NOT penalize for missing external definitions.
Rate on three dimensions (1-5 scale):
- **clarity** (1-5): Can an agent follow the instructions without ambiguity?
- **completeness** (1-5): Are all steps, decision points, and outputs well-defined?
- **actionability** (1-5): Can an agent execute this workflow and produce the expected deliverables?
Respond with ONLY valid JSON:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}
Here is the document to evaluate:
${section}`);
  console.log(`${opts.testName} scores:`, JSON.stringify(scores, null, 2));

  const { clarity, completeness, actionability } = scores;
  const meetsAll =
    clarity >= minimums.clarity &&
    completeness >= minimums.completeness &&
    actionability >= minimums.actionability;

  evalCollector?.addTest({
    name: opts.testName,
    suite: opts.suite,
    tier: 'llm-judge',
    passed: meetsAll,
    duration_ms: Date.now() - startedAt,
    cost_usd: 0.02,
    judge_scores: { clarity, completeness, actionability },
    judge_reasoning: scores.reasoning,
  });

  expect(clarity).toBeGreaterThanOrEqual(minimums.clarity);
  expect(completeness).toBeGreaterThanOrEqual(minimums.completeness);
  expect(actionability).toBeGreaterThanOrEqual(minimums.actionability);
}
// Block 1: Ship & Release skills
describeIfSelected('Ship & Release skill evals', ['ship/SKILL.md workflow', 'document-release/SKILL.md workflow'], () => {
  const suite = 'Ship & Release skill evals';

  // One entry per judged SKILL.md section; each is registered as its own selectable test.
  const cases = [
    {
      testName: 'ship/SKILL.md workflow',
      skillPath: 'ship/SKILL.md',
      startMarker: '# Ship:',
      endMarker: '## Important Rules',
      judgeContext: 'a ship/release workflow document',
      judgeGoal: 'how to create a PR: merge base branch, run tests, review diff, bump version, update changelog, push, and open PR',
    },
    {
      testName: 'document-release/SKILL.md workflow',
      skillPath: 'document-release/SKILL.md',
      startMarker: '# Document Release:',
      endMarker: '## Important Rules',
      judgeContext: 'a post-ship documentation update workflow',
      judgeGoal: 'how to audit and update project documentation after code ships: README, ARCHITECTURE, CONTRIBUTING, CLAUDE.md, CHANGELOG, TODOS',
    },
  ];

  for (const spec of cases) {
    testIfSelected(spec.testName, async () => {
      await runWorkflowJudge({ ...spec, suite });
    }, 30_000);
  }
});
// Block 2: Plan Review skills
describeIfSelected('Plan Review skill evals', [
  'plan-ceo-review/SKILL.md modes', 'plan-eng-review/SKILL.md sections', 'plan-design-review/SKILL.md passes',
], () => {
  const suite = 'Plan Review skill evals';

  // One entry per judged SKILL.md section; each is registered as its own selectable test.
  const cases = [
    {
      testName: 'plan-ceo-review/SKILL.md modes',
      skillPath: 'plan-ceo-review/SKILL.md',
      startMarker: '## Step 0: Nuclear Scope Challenge',
      endMarker: '## Review Sections',
      judgeContext: 'a CEO/founder plan review framework with 4 scope modes',
      judgeGoal: 'how to conduct a CEO-perspective plan review: challenge scope, select a mode (Expansion, Selective Expansion, Hold Scope, Reduction), then review sections interactively',
    },
    {
      testName: 'plan-eng-review/SKILL.md sections',
      skillPath: 'plan-eng-review/SKILL.md',
      startMarker: '## BEFORE YOU START:',
      endMarker: '## CRITICAL RULE',
      judgeContext: 'an engineering plan review framework with 4 review sections',
      judgeGoal: 'how to review a plan for architecture quality, code quality, test coverage, and performance — walking through each section interactively with AskUserQuestion',
    },
    {
      testName: 'plan-design-review/SKILL.md passes',
      skillPath: 'plan-design-review/SKILL.md',
      startMarker: '## Review Sections',
      endMarker: '## CRITICAL RULE',
      judgeContext: 'a design plan review framework with 7 review passes',
      judgeGoal: 'how to review a plan for design quality using a 0-10 rating method: rate each dimension, explain what a 10 looks like, edit the plan to fix gaps, then re-rate',
    },
  ];

  for (const spec of cases) {
    testIfSelected(spec.testName, async () => {
      await runWorkflowJudge({ ...spec, suite });
    }, 30_000);
  }
});
// Block 3: Design skills
describeIfSelected('Design skill evals', ['design-review/SKILL.md fix loop', 'design-consultation/SKILL.md research'], () => {
  const suite = 'Design skill evals';

  // One entry per judged SKILL.md section; each is registered as its own selectable test.
  const cases = [
    {
      testName: 'design-review/SKILL.md fix loop',
      skillPath: 'design-review/SKILL.md',
      startMarker: '## Phase 7:',
      endMarker: '## Additional Rules',
      judgeContext: 'a design audit triage and fix loop workflow',
      judgeGoal: 'how to triage design issues by severity, fix them atomically in source code, commit each fix, and re-verify with before/after screenshots',
    },
    {
      testName: 'design-consultation/SKILL.md research',
      skillPath: 'design-consultation/SKILL.md',
      startMarker: '## Phase 1:',
      endMarker: '## Phase 4:',
      judgeContext: 'a design consultation research and proposal workflow',
      judgeGoal: 'how to gather product context, research the competitive landscape, and produce a complete design system proposal with typography, color, spacing, and motion specifications',
    },
  ];

  for (const spec of cases) {
    testIfSelected(spec.testName, async () => {
      await runWorkflowJudge({ ...spec, suite });
    }, 30_000);
  }
});
// Block 4: Other skills
describeIfSelected('Other skill evals', [
  'retro/SKILL.md instructions', 'qa-only/SKILL.md workflow', 'gstack-upgrade/SKILL.md upgrade flow',
], () => {
  const suite = 'Other skill evals';

  // One entry per judged SKILL.md section; each is registered as its own selectable test.
  const cases = [
    {
      testName: 'retro/SKILL.md instructions',
      skillPath: 'retro/SKILL.md',
      startMarker: '## Instructions',
      endMarker: '## Compare Mode',
      judgeContext: 'an engineering retrospective data gathering and analysis workflow',
      judgeGoal: 'how to gather git metrics (commit history, test counts, work patterns), analyze them, produce a structured retro report with praise, growth areas, and trend tracking',
    },
    {
      testName: 'qa-only/SKILL.md workflow',
      skillPath: 'qa-only/SKILL.md',
      startMarker: '## Workflow',
      endMarker: '## Important Rules',
      judgeContext: 'a report-only QA testing workflow',
      judgeGoal: 'how to systematically QA test a web application and produce a structured report with health score, screenshots, and repro steps — without fixing anything',
    },
    {
      testName: 'gstack-upgrade/SKILL.md upgrade flow',
      skillPath: 'gstack-upgrade/SKILL.md',
      startMarker: '## Inline upgrade flow',
      endMarker: '## Standalone usage',
      judgeContext: 'a version upgrade detection and execution workflow',
      judgeGoal: 'how to detect install type, compare versions, back up current install, upgrade via git or fresh clone, run setup, and show what changed',
    },
  ];

  for (const spec of cases) {
    testIfSelected(spec.testName, async () => {
      await runWorkflowJudge({ ...spec, suite });
    }, 30_000);
  }
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
+33 -13
View File
@@ -72,15 +72,29 @@ describe('SKILL.md command validation', () => {
expect(result.snapshotFlagErrors).toHaveLength(0);
});
test('all $B commands in qa-design-review/SKILL.md are valid browse commands', () => {
const skill = path.join(ROOT, 'qa-design-review', 'SKILL.md');
test('all $B commands in design-review/SKILL.md are valid browse commands', () => {
const skill = path.join(ROOT, 'design-review', 'SKILL.md');
if (!fs.existsSync(skill)) return;
const result = validateSkill(skill);
expect(result.invalid).toHaveLength(0);
});
test('all snapshot flags in qa-design-review/SKILL.md are valid', () => {
const skill = path.join(ROOT, 'qa-design-review', 'SKILL.md');
test('all snapshot flags in design-review/SKILL.md are valid', () => {
const skill = path.join(ROOT, 'design-review', 'SKILL.md');
if (!fs.existsSync(skill)) return;
const result = validateSkill(skill);
expect(result.snapshotFlagErrors).toHaveLength(0);
});
// Asserts validateSkill reports no invalid entries for design-consultation/SKILL.md.
// When the skill file does not exist the body is a no-op, so the test passes.
test('all $B commands in design-consultation/SKILL.md are valid browse commands', () => {
  const skillFile = path.join(ROOT, 'design-consultation', 'SKILL.md');
  if (fs.existsSync(skillFile)) {
    const { invalid } = validateSkill(skillFile);
    expect(invalid).toHaveLength(0);
  }
});
test('all snapshot flags in design-consultation/SKILL.md are valid', () => {
const skill = path.join(ROOT, 'design-consultation', 'SKILL.md');
if (!fs.existsSync(skill)) return;
const result = validateSkill(skill);
expect(result.snapshotFlagErrors).toHaveLength(0);
@@ -206,7 +220,7 @@ describe('Update check preamble', () => {
'retro/SKILL.md',
'office-hours/SKILL.md', 'debug/SKILL.md',
'plan-design-review/SKILL.md',
'qa-design-review/SKILL.md',
'design-review/SKILL.md',
'design-consultation/SKILL.md',
'document-release/SKILL.md',
];
@@ -431,6 +445,8 @@ describe('No hardcoded branch names in SKILL templates', () => {
'plan-ceo-review/SKILL.md.tmpl',
'retro/SKILL.md.tmpl',
'document-release/SKILL.md.tmpl',
'plan-eng-review/SKILL.md.tmpl',
'plan-design-review/SKILL.md.tmpl',
];
// Patterns that indicate hardcoded 'main' in git commands
@@ -515,7 +531,7 @@ describe('v0.4.1 preamble features', () => {
'retro/SKILL.md',
'office-hours/SKILL.md', 'debug/SKILL.md',
'plan-design-review/SKILL.md',
'qa-design-review/SKILL.md',
'design-review/SKILL.md',
'design-consultation/SKILL.md',
'document-release/SKILL.md',
];
@@ -616,6 +632,10 @@ describe('Contributor mode preamble structure', () => {
'ship/SKILL.md', 'review/SKILL.md',
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
'retro/SKILL.md',
'plan-design-review/SKILL.md',
'design-review/SKILL.md',
'design-consultation/SKILL.md',
'document-release/SKILL.md',
];
for (const skill of skillsWithPreamble) {
@@ -701,7 +721,7 @@ describe('Completeness Principle in generated SKILL.md files', () => {
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
'retro/SKILL.md',
'plan-design-review/SKILL.md',
'qa-design-review/SKILL.md',
'design-review/SKILL.md',
'design-consultation/SKILL.md',
'document-release/SKILL.md',
];
@@ -874,8 +894,8 @@ describe('Test Bootstrap ({{TEST_BOOTSTRAP}}) integration', () => {
expect(content).toContain('Step 2.5');
});
test('TEST_BOOTSTRAP appears in qa-design-review/SKILL.md', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8');
test('TEST_BOOTSTRAP appears in design-review/SKILL.md', () => {
  // Reads the generated skill file directly, so a missing file fails the test.
  const skillPath = path.join(ROOT, 'design-review', 'SKILL.md');
  const skillText = fs.readFileSync(skillPath, 'utf-8');
  expect(skillText).toContain('Test Framework Bootstrap');
});
@@ -916,10 +936,10 @@ describe('Test Bootstrap ({{TEST_BOOTSTRAP}}) integration', () => {
expect(content).toContain('100% test coverage');
});
test('WebSearch is in allowed-tools for qa, ship, qa-design-review', () => {
test('WebSearch is in allowed-tools for qa, ship, design-review', () => {
const qa = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
const ship = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const qaDesign = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8');
const qaDesign = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8');
expect(qa).toContain('WebSearch');
expect(ship).toContain('WebSearch');
expect(qaDesign).toContain('WebSearch');
@@ -942,8 +962,8 @@ describe('Phase 8e.5 regression test generation', () => {
expect(content).not.toContain('Never modify tests or CI configuration');
});
test('qa-design-review has CSS-aware Phase 8e.5 variant', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8');
test('design-review has CSS-aware Phase 8e.5 variant', () => {
const content = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8');
expect(content).toContain('8e.5. Regression Test (design-review variant)');
expect(content).toContain('CSS-only');
expect(content).toContain('test(design): regression test');
+253
View File
@@ -0,0 +1,253 @@
/**
* Unit tests for diff-based test selection.
* Free (no API calls), runs with `bun test`.
*/
import { describe, test, expect } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import {
matchGlob,
selectTests,
detectBaseBranch,
E2E_TOUCHFILES,
LLM_JUDGE_TOUCHFILES,
GLOBAL_TOUCHFILES,
} from './helpers/touchfiles';
const ROOT = path.resolve(import.meta.dir, '..');
// --- matchGlob ---
describe('matchGlob', () => {
  /** Asserts that every path in `files` produces `expected` when matched against `pattern`. */
  const expectAll = (pattern: string, files: string[], expected: boolean): void => {
    for (const file of files) {
      expect(matchGlob(file, pattern)).toBe(expected);
    }
  };

  test('** matches any depth of path segments', () => {
    expectAll(
      'browse/src/**',
      ['browse/src/commands.ts', 'browse/src/deep/nested/file.ts', 'browse/src/cli.ts'],
      true,
    );
  });

  test('** does not match unrelated paths', () => {
    expectAll('qa/**', ['browse/src/commands.ts', 'review/SKILL.md'], false);
  });

  test('exact match works', () => {
    expectAll('SKILL.md', ['SKILL.md'], true);
    // Neither a longer file name nor a nested path counts as an exact match.
    expectAll('SKILL.md', ['SKILL.md.tmpl', 'qa/SKILL.md'], false);
  });

  test('* matches within a single segment', () => {
    const pattern = 'test/fixtures/review-eval-enum*.rb';
    expectAll(
      pattern,
      ['test/fixtures/review-eval-enum.rb', 'test/fixtures/review-eval-enum-diff.rb'],
      true,
    );
    expectAll(pattern, ['test/fixtures/review-eval-vuln.rb'], false);
  });

  test('dots in patterns are escaped correctly', () => {
    expectAll('SKILL.md', ['SKILL.md'], true);
    // A literal '.' in the pattern must not behave like a regex wildcard.
    expectAll('SKILL.md', ['SKILLxmd'], false);
  });

  test('** at end matches files in the directory', () => {
    expectAll('qa/**', ['qa/SKILL.md', 'qa/SKILL.md.tmpl', 'qa/templates/report.md'], true);
  });
});
// --- selectTests ---
describe('selectTests', () => {
  // Size of each touchfile map; used wherever a test expects "everything" or "everything else".
  const e2eTotal = Object.keys(E2E_TOUCHFILES).length;
  const llmJudgeTotal = Object.keys(LLM_JUDGE_TOUCHFILES).length;

  test('browse/src change selects browse and qa tests', () => {
    const { selected, reason } = selectTests(['browse/src/commands.ts'], E2E_TOUCHFILES);
    const expectedNames = [
      'browse-basic',
      'browse-snapshot',
      'qa-quick',
      'qa-fix-loop',
      'design-review-fix',
    ];
    for (const name of expectedNames) {
      expect(selected).toContain(name);
    }
    expect(reason).toBe('diff');
    // Skills that do not depend on browse/src must stay out of the selection.
    for (const name of ['plan-ceo-review', 'retro', 'document-release']) {
      expect(selected).not.toContain(name);
    }
  });

  test('skill-specific change selects only that skill and related tests', () => {
    const { selected, skipped } = selectTests(['plan-ceo-review/SKILL.md'], E2E_TOUCHFILES);
    expect(selected).toContain('plan-ceo-review');
    expect(selected).toContain('plan-ceo-review-selective');
    expect(selected).toHaveLength(2);
    expect(skipped).toHaveLength(e2eTotal - 2);
  });

  test('global touchfile triggers ALL tests', () => {
    const { selected, skipped, reason } = selectTests(
      ['test/helpers/session-runner.ts'],
      E2E_TOUCHFILES,
    );
    expect(selected).toHaveLength(e2eTotal);
    expect(skipped).toHaveLength(0);
    expect(reason).toContain('global');
  });

  test('gen-skill-docs.ts is a global touchfile', () => {
    const { selected, reason } = selectTests(['scripts/gen-skill-docs.ts'], E2E_TOUCHFILES);
    expect(selected).toHaveLength(e2eTotal);
    expect(reason).toContain('global');
  });

  test('unrelated file selects nothing', () => {
    const { selected, skipped } = selectTests(['README.md'], E2E_TOUCHFILES);
    expect(selected).toEqual([]);
    expect(skipped).toHaveLength(e2eTotal);
  });

  test('empty changed files selects nothing', () => {
    const { selected } = selectTests([], E2E_TOUCHFILES);
    expect(selected).toEqual([]);
  });

  test('multiple changed files union their selections', () => {
    const changedFiles = ['plan-ceo-review/SKILL.md', 'retro/SKILL.md.tmpl'];
    const { selected } = selectTests(changedFiles, E2E_TOUCHFILES);
    for (const name of ['plan-ceo-review', 'plan-ceo-review-selective', 'retro', 'retro-base-branch']) {
      expect(selected).toContain(name);
    }
    expect(selected).toHaveLength(4);
  });

  test('works with LLM_JUDGE_TOUCHFILES', () => {
    const { selected } = selectTests(['qa/SKILL.md'], LLM_JUDGE_TOUCHFILES);
    expect(selected).toContain('qa/SKILL.md workflow');
    expect(selected).toContain('qa/SKILL.md health rubric');
    expect(selected).toHaveLength(2);
  });

  test('SKILL.md.tmpl root template only selects root-dependent tests', () => {
    const { selected } = selectTests(['SKILL.md.tmpl'], E2E_TOUCHFILES);
    // Spot-check tests that depend on the root SKILL.md ...
    for (const name of ['skillmd-setup-discovery', 'contributor-mode', 'session-awareness']) {
      expect(selected).toContain(name);
    }
    // ... and confirm skill-specific tests are not pulled in.
    for (const name of ['plan-ceo-review', 'retro']) {
      expect(selected).not.toContain(name);
    }
  });

  test('global touchfiles work for LLM-judge tests too', () => {
    const { selected } = selectTests(['scripts/gen-skill-docs.ts'], LLM_JUDGE_TOUCHFILES);
    expect(selected).toHaveLength(llmJudgeTotal);
  });
});
// --- detectBaseBranch ---
describe('detectBaseBranch', () => {
  /**
   * Creates a fresh temp directory, hands it to `body`, and removes it afterwards.
   * Cleanup sits in `finally` so a failing assertion inside `body` does not leak
   * the directory; removal itself stays best-effort.
   */
  const withTempDir = (body: (dir: string) => void): void => {
    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-'));
    try {
      body(dir);
    } finally {
      try { fs.rmSync(dir, { recursive: true, force: true }); } catch {}
    }
  };

  /** Runs `git <args>` inside `dir` with piped stdio and a 5s timeout. */
  const git = (dir: string, args: string[]) =>
    spawnSync('git', args, { cwd: dir, stdio: 'pipe', timeout: 5000 });

  test('detects local main branch', () => {
    withTempDir((dir) => {
      git(dir, ['init']);
      // Pin the unborn branch to `main` so the result does not depend on the
      // machine's `init.defaultBranch` setting (which may be neither main nor master).
      git(dir, ['symbolic-ref', 'HEAD', 'refs/heads/main']);
      git(dir, ['config', 'user.email', 'test@test.com']);
      git(dir, ['config', 'user.name', 'Test']);
      fs.writeFileSync(path.join(dir, 'test.txt'), 'hello\n');
      git(dir, ['add', '.']);
      git(dir, ['commit', '-m', 'init']);

      const result = detectBaseBranch(dir);
      // Matcher kept as permissive as the original assertion.
      expect(result).toMatch(/^(main|master)$/);
    });
  });

  test('returns null for empty repo with no branches', () => {
    withTempDir((dir) => {
      git(dir, ['init']);
      // No commits = no branches
      const result = detectBaseBranch(dir);
      expect(result).toBeNull();
    });
  });

  test('returns null for non-git directory', () => {
    withTempDir((dir) => {
      const result = detectBaseBranch(dir);
      expect(result).toBeNull();
    });
  });
});
// --- Completeness: every testName in skill-e2e.test.ts has a TOUCHFILES entry ---
describe('TOUCHFILES completeness', () => {
  test('every E2E testName has a TOUCHFILES entry', () => {
    const e2eSource = fs.readFileSync(path.join(ROOT, 'test', 'skill-e2e.test.ts'), 'utf-8');

    // Literal `testName: '...'` values. Template-literal names such as `qa-${label}`
    // are dropped here; their expanded forms come from the runPlantedBugEval calls below.
    const literalNames = [...e2eSource.matchAll(/testName:\s*['"`]([^'"`]+)['"`]/g)]
      .map((m) => m[1])
      .filter((name) => !name.includes('${'));

    // The third argument of each runPlantedBugEval(...) call is the label that gets
    // prefixed with `qa-` to form the test name.
    const plantedBugNames = [
      ...e2eSource.matchAll(/runPlantedBugEval\([^,]+,\s*[^,]+,\s*['"`]([^'"`]+)['"`]\)/g),
    ].map((m) => `qa-${m[1]}`);

    const testNames: string[] = [...literalNames, ...plantedBugNames];
    expect(testNames.length).toBeGreaterThan(0);

    const missing = testNames.filter((name) => !(name in E2E_TOUCHFILES));
    if (missing.length > 0) {
      throw new Error(
        `E2E tests missing TOUCHFILES entries: ${missing.join(', ')}\n` +
        `Add these to E2E_TOUCHFILES in test/helpers/touchfiles.ts`,
      );
    }
  });

  test('every LLM-judge test has a TOUCHFILES entry', () => {
    const llmSource = fs.readFileSync(path.join(ROOT, 'test', 'skill-llm-eval.test.ts'), 'utf-8');

    // Collect every `name: '...'` value, then drop repeats (the same name can appear more than once).
    const allNames = Array.from(
      llmSource.matchAll(/name:\s*['"`]([^'"`]+)['"`]/g),
      (m) => m[1],
    );
    const unique = [...new Set(allNames)];
    expect(unique.length).toBeGreaterThan(0);

    const missing = unique.filter((name) => !(name in LLM_JUDGE_TOUCHFILES));
    if (missing.length > 0) {
      throw new Error(
        `LLM-judge tests missing TOUCHFILES entries: ${missing.join(', ')}\n` +
        `Add these to LLM_JUDGE_TOUCHFILES in test/helpers/touchfiles.ts`,
      );
    }
  });
});