mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 05:05:08 +02:00
fix: resolve merge conflicts with origin/main (v0.6.1 qa-design-review → design-review rename)
Conflicts resolved: - README.md: kept install section + office-hours/debug skills, adopted main's design-review rename and restructured footer - design-review/SKILL.md: took main's version (renamed from qa-design-review) - plan-design-review/SKILL.md: took main's version with base branch detect - Updated install instructions to use /design-review (not /qa-design-review) - Updated skill count to 15 in footer Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+86
@@ -0,0 +1,86 @@
|
||||
/* Planted design anti-patterns for E2E eval — 7 issues */
|
||||
|
||||
/* Issue 1: [HIGH] Blacklisted font (Papyrus) */
|
||||
/* Issue 2: [HIGH] Body text < 16px (14px) */
|
||||
body {
|
||||
font-family: 'Papyrus', sans-serif;
|
||||
font-size: 14px;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
/* Issue 5: [MEDIUM] Purple/violet gradient background */
|
||||
.hero {
|
||||
background: linear-gradient(135deg, #6366f1, #8b5cf6);
|
||||
text-align: center;
|
||||
padding: 80px 20px;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.hero h1 {
|
||||
text-align: center;
|
||||
font-size: 48px;
|
||||
}
|
||||
|
||||
.hero p {
|
||||
text-align: center;
|
||||
font-size: 20px;
|
||||
}
|
||||
|
||||
/* Issue 7: [LOW] 3-column feature grid with icon circles */
|
||||
.features {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
gap: 24px;
|
||||
padding: 60px 40px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.feature-card {
|
||||
border-radius: 24px;
|
||||
padding: 32px;
|
||||
text-align: center;
|
||||
background: #f9fafb;
|
||||
}
|
||||
|
||||
/* Icon in colored circle — AI slop pattern */
|
||||
.icon-circle {
|
||||
width: 60px;
|
||||
height: 60px;
|
||||
border-radius: 50%;
|
||||
background: #ede9fe;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
margin: 0 auto 16px;
|
||||
font-size: 24px;
|
||||
}
|
||||
|
||||
/* Issue 3: [HIGH] outline: none without replacement */
|
||||
button {
|
||||
outline: none;
|
||||
background: #6366f1;
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 12px 24px;
|
||||
border-radius: 24px;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.small-link {
|
||||
font-size: 11px;
|
||||
padding: 4px 8px;
|
||||
}
|
||||
|
||||
/* Issue 4: [HIGH] !important usage */
|
||||
.override {
|
||||
color: red !important;
|
||||
margin-left: 10px !important;
|
||||
}
|
||||
|
||||
.footer {
|
||||
text-align: center;
|
||||
padding: 40px;
|
||||
background: #1e1b4b;
|
||||
color: white;
|
||||
}
|
||||
+41
@@ -0,0 +1,41 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<link rel="stylesheet" href="styles.css">
|
||||
<title>Our Platform</title>
|
||||
</head>
|
||||
<body>
|
||||
<!-- Issue 6: [MEDIUM] Generic hero copy ("Welcome to...", "all-in-one solution") -->
|
||||
<div class="hero">
|
||||
<h1>Welcome to Our Platform</h1>
|
||||
<p>Your all-in-one solution for everything you need</p>
|
||||
<button>Get Started</button>
|
||||
</div>
|
||||
|
||||
<!-- Issue 7: [LOW] 3-column feature grid with icon-in-circle + title + description -->
|
||||
<div class="features">
|
||||
<div class="feature-card">
|
||||
<div class="icon-circle">★</div>
|
||||
<h3>Feature One</h3>
|
||||
<p>A short description of this amazing feature that will change your life.</p>
|
||||
</div>
|
||||
<div class="feature-card">
|
||||
<div class="icon-circle">⚡</div>
|
||||
<h3>Feature Two</h3>
|
||||
<p>Another incredible capability that sets us apart from the competition.</p>
|
||||
</div>
|
||||
<div class="feature-card">
|
||||
<div class="icon-circle">⚙</div>
|
||||
<h3>Feature Three</h3>
|
||||
<p>Yet another powerful tool to streamline your workflow effortlessly.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="footer">
|
||||
<p class="override">Unlock the power of our platform today</p>
|
||||
<a href="#" class="small-link">Terms of Service</a>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -70,7 +70,7 @@ describe('gen-skill-docs', () => {
|
||||
{ dir: 'setup-browser-cookies', name: 'setup-browser-cookies' },
|
||||
{ dir: 'gstack-upgrade', name: 'gstack-upgrade' },
|
||||
{ dir: 'plan-design-review', name: 'plan-design-review' },
|
||||
{ dir: 'qa-design-review', name: 'qa-design-review' },
|
||||
{ dir: 'design-review', name: 'design-review' },
|
||||
{ dir: 'design-consultation', name: 'design-consultation' },
|
||||
];
|
||||
|
||||
|
||||
@@ -0,0 +1,200 @@
|
||||
/**
|
||||
* Diff-based test selection for E2E and LLM-judge evals.
|
||||
*
|
||||
* Each test declares which source files it depends on ("touchfiles").
|
||||
* The test runner checks `git diff` and only runs tests whose
|
||||
* dependencies were modified. Override with EVALS_ALL=1 to run everything.
|
||||
*/
|
||||
|
||||
import { spawnSync } from 'child_process';
|
||||
|
||||
// --- Glob matching ---
|
||||
|
||||
/**
 * Match a file path against a glob pattern.
 *
 * Supported wildcards:
 *   ** — matches any run of characters, including "/" (any number of path segments)
 *   *  — matches any run of characters inside a single segment (never "/")
 *
 * Every other character is matched literally. Regex metacharacters such as
 * "+", "(", ")", "[", "?" or "$" are escaped, so a pattern containing them is
 * treated as plain text instead of throwing or matching the wrong files.
 *
 * @param file - Path to test (e.g. one line of `git diff --name-only` output).
 * @param pattern - Glob pattern; it must match the whole path, not a substring.
 * @returns true when `file` matches `pattern` in full.
 */
export function matchGlob(file: string, pattern: string): boolean {
  // Escape everything that has a special meaning in a RegExp source string.
  const escapeLiteral = (text: string): string =>
    text.replace(/[.+?^${}()|[\]\\]/g, '\\$&');

  // Split on "**" first so a globstar is never mistaken for two single stars,
  // then split each remaining chunk on "*" and escape only the literal pieces.
  const regexStr = pattern
    .split('**')
    .map(chunk => chunk.split('*').map(escapeLiteral).join('[^/]*'))
    .join('.*');

  return new RegExp(`^${regexStr}$`).test(file);
}
|
||||
|
||||
// --- Touchfile maps ---
|
||||
|
||||
/**
|
||||
* E2E test touchfiles — keyed by testName (the string passed to runSkillTest).
|
||||
* Each test lists the file patterns that, if changed, require the test to run.
|
||||
*/
|
||||
export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// Browse core
|
||||
'browse-basic': ['browse/src/**'],
|
||||
'browse-snapshot': ['browse/src/**'],
|
||||
|
||||
// SKILL.md setup + preamble (depend on ROOT SKILL.md only)
|
||||
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
'contributor-mode': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
|
||||
// QA
|
||||
'qa-quick': ['qa/**', 'browse/src/**'],
|
||||
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
|
||||
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
|
||||
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
|
||||
'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
|
||||
'qa-fix-loop': ['qa/**', 'browse/src/**'],
|
||||
|
||||
// Review
|
||||
'review-sql-injection': ['review/**', 'test/fixtures/review-eval-vuln.rb'],
|
||||
'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'],
|
||||
'review-base-branch': ['review/**'],
|
||||
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
|
||||
|
||||
// Plan reviews
|
||||
'plan-ceo-review': ['plan-ceo-review/**'],
|
||||
'plan-ceo-review-selective': ['plan-ceo-review/**'],
|
||||
'plan-eng-review': ['plan-eng-review/**'],
|
||||
'plan-eng-review-artifact': ['plan-eng-review/**'],
|
||||
|
||||
// Ship
|
||||
'ship-base-branch': ['ship/**'],
|
||||
|
||||
// Retro
|
||||
'retro': ['retro/**'],
|
||||
'retro-base-branch': ['retro/**'],
|
||||
|
||||
// Document-release
|
||||
'document-release': ['document-release/**'],
|
||||
|
||||
// QA bootstrap
|
||||
'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'],
|
||||
|
||||
// Ship coverage audit
|
||||
'ship-coverage-audit': ['ship/**'],
|
||||
|
||||
// Design
|
||||
'design-consultation-core': ['design-consultation/**'],
|
||||
'design-consultation-research': ['design-consultation/**'],
|
||||
'design-consultation-existing': ['design-consultation/**'],
|
||||
'design-consultation-preview': ['design-consultation/**'],
|
||||
'plan-design-review-plan-mode': ['plan-design-review/**'],
|
||||
'plan-design-review-no-ui-scope': ['plan-design-review/**'],
|
||||
'design-review-fix': ['design-review/**', 'browse/src/**'],
|
||||
|
||||
// gstack-upgrade
|
||||
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
|
||||
};
|
||||
|
||||
/**
|
||||
* LLM-judge test touchfiles — keyed by test description string.
|
||||
*/
|
||||
export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
|
||||
'command reference table': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts'],
|
||||
'snapshot flags reference': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/snapshot.ts'],
|
||||
'browse/SKILL.md reference': ['browse/SKILL.md', 'browse/SKILL.md.tmpl', 'browse/src/**'],
|
||||
'setup block': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
'regression vs baseline': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts', 'test/fixtures/eval-baselines.json'],
|
||||
'qa/SKILL.md workflow': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
|
||||
'qa/SKILL.md health rubric': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
|
||||
'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'],
|
||||
'baseline score pinning': ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'],
|
||||
|
||||
// Ship & Release
|
||||
'ship/SKILL.md workflow': ['ship/SKILL.md', 'ship/SKILL.md.tmpl'],
|
||||
'document-release/SKILL.md workflow': ['document-release/SKILL.md', 'document-release/SKILL.md.tmpl'],
|
||||
|
||||
// Plan Reviews
|
||||
'plan-ceo-review/SKILL.md modes': ['plan-ceo-review/SKILL.md', 'plan-ceo-review/SKILL.md.tmpl'],
|
||||
'plan-eng-review/SKILL.md sections': ['plan-eng-review/SKILL.md', 'plan-eng-review/SKILL.md.tmpl'],
|
||||
'plan-design-review/SKILL.md passes': ['plan-design-review/SKILL.md', 'plan-design-review/SKILL.md.tmpl'],
|
||||
|
||||
// Design skills
|
||||
'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
|
||||
'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
|
||||
|
||||
// Other skills
|
||||
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
|
||||
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
|
||||
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
|
||||
};
|
||||
|
||||
/**
|
||||
* Changes to any of these files trigger ALL tests (both E2E and LLM-judge).
|
||||
*/
|
||||
export const GLOBAL_TOUCHFILES = [
|
||||
'test/helpers/session-runner.ts',
|
||||
'test/helpers/eval-store.ts',
|
||||
'test/helpers/llm-judge.ts',
|
||||
'scripts/gen-skill-docs.ts',
|
||||
'test/helpers/touchfiles.ts',
|
||||
'browse/test/test-server.ts',
|
||||
];
|
||||
|
||||
// --- Base branch detection ---
|
||||
|
||||
/**
 * Find the base branch to diff against.
 *
 * Candidate refs are probed in a fixed order with `git rev-parse --verify`;
 * the first one git accepts (exit status 0) wins.
 *
 * @param cwd - Directory in which git is invoked.
 * @returns The first ref that resolves, or null when none of them do.
 */
export function detectBaseBranch(cwd: string): string | null {
  const candidates = ['origin/main', 'origin/master', 'main', 'master'];

  // `find` stops at the first hit, so later refs are not probed once one resolves.
  const resolved = candidates.find(ref => {
    const probe = spawnSync('git', ['rev-parse', '--verify', ref], {
      cwd,
      stdio: 'pipe',
      timeout: 3000,
    });
    return probe.status === 0;
  });

  return resolved === undefined ? null : resolved;
}
|
||||
|
||||
/**
 * List the files that differ between the base branch and HEAD.
 *
 * Runs `git diff --name-only <baseBranch>...HEAD` in `cwd`.
 *
 * @param baseBranch - Ref to compare HEAD against.
 * @param cwd - Directory in which git is invoked.
 * @returns One entry per non-empty output line; an empty array when git
 *          does not exit with status 0.
 */
export function getChangedFiles(baseBranch: string, cwd: string): string[] {
  const range = `${baseBranch}...HEAD`;
  const diff = spawnSync('git', ['diff', '--name-only', range], {
    cwd,
    stdio: 'pipe',
    timeout: 5000,
  });

  if (diff.status === 0) {
    const names: string[] = [];
    for (const line of diff.stdout.toString().trim().split('\n')) {
      // Drop empty entries (an empty diff yields a single empty string).
      if (line) names.push(line);
    }
    return names;
  }

  return [];
}
|
||||
|
||||
// --- Test selection ---
|
||||
|
||||
/**
 * Decide which tests to run for a given set of changed files.
 *
 * 1. If any changed file matches a global touchfile, every test is selected
 *    and the reason names the first such file.
 * 2. Otherwise a test is selected when at least one changed file matches at
 *    least one of its patterns; all remaining tests are reported as skipped.
 *
 * @param changedFiles - Paths reported as changed.
 * @param touchfiles - Map of test name → glob patterns the test depends on.
 * @param globalTouchfiles - Patterns that force the full suite to run.
 * @returns The selected and skipped test names plus the reason for the outcome.
 */
export function selectTests(
  changedFiles: string[],
  touchfiles: Record<string, string[]>,
  globalTouchfiles: string[] = GLOBAL_TOUCHFILES,
): { selected: string[]; skipped: string[]; reason: string } {
  // Global touchfile hit → run all
  const globalHit = changedFiles.find(file =>
    globalTouchfiles.some(pattern => matchGlob(file, pattern)),
  );
  if (globalHit !== undefined) {
    return { selected: Object.keys(touchfiles), skipped: [], reason: `global: ${globalHit}` };
  }

  // Per-test matching
  const selected: string[] = [];
  const skipped: string[] = [];
  for (const testName of Object.keys(touchfiles)) {
    const patterns = touchfiles[testName];
    const touched = changedFiles.some(file =>
      patterns.some(pattern => matchGlob(file, pattern)),
    );
    if (touched) {
      selected.push(testName);
    } else {
      skipped.push(testName);
    }
  }

  return { selected, skipped, reason: 'diff' };
}
|
||||
+386
-106
@@ -1,10 +1,11 @@
|
||||
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||
import { runSkillTest } from './helpers/session-runner';
|
||||
import type { SkillTestResult } from './helpers/session-runner';
|
||||
import { outcomeJudge } from './helpers/llm-judge';
|
||||
import { outcomeJudge, callJudge } from './helpers/llm-judge';
|
||||
import { EvalCollector, judgePassed } from './helpers/eval-store';
|
||||
import type { EvalTestEntry } from './helpers/eval-store';
|
||||
import { startTestServer } from '../browse/test/test-server';
|
||||
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
@@ -21,6 +22,41 @@ const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const evalsEnabled = !!process.env.EVALS;
|
||||
const describeE2E = evalsEnabled ? describe : describe.skip;
|
||||
|
||||
// --- Diff-based test selection ---
// Unless EVALS_ALL is set, only tests whose touchfiles were modified are run.
// EVALS_ALL=1 forces every test; EVALS_BASE overrides the base branch.
let selectedTests: string[] | null = null; // null = run all

if (evalsEnabled && !process.env.EVALS_ALL) {
  const baseBranch = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
  const changedFiles = getChangedFiles(baseBranch, ROOT);

  // An empty diff (e.g., on main branch) leaves selectedTests null → run all.
  if (changedFiles.length > 0) {
    const { selected, skipped, reason } = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
    selectedTests = selected;

    const totalTests = Object.keys(E2E_TOUCHFILES).length;
    process.stderr.write(`\nE2E selection (${reason}): ${selected.length}/${totalTests} tests\n`);
    if (skipped.length > 0) {
      process.stderr.write(`  Skipped: ${skipped.join(', ')}\n`);
    }
    process.stderr.write('\n');
  }
}
|
||||
|
||||
/**
 * Register a describe block, skipping it entirely when none of the listed
 * tests is part of the current selection. A null selection means "run all".
 */
function describeIfSelected(name: string, testNames: string[], fn: () => void) {
  const selection = selectedTests;
  const anySelected = selection === null || testNames.some(t => selection.includes(t));
  const register = anySelected ? describeE2E : describe.skip;
  register(name, fn);
}
|
||||
|
||||
/**
 * Register a single test under its touchfile key, skipping it when the key is
 * not in the current selection (used inside multi-test describe blocks).
 * A null selection means "run all".
 */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  const selection = selectedTests;
  const excluded = selection !== null && !selection.includes(testName);
  const register = excluded ? test.skip : test;
  register(testName, fn, timeout);
}
|
||||
|
||||
// Eval result collector — accumulates test results, writes to ~/.gstack-dev/evals/ on finalize
|
||||
const evalCollector = evalsEnabled ? new EvalCollector('e2e') : null;
|
||||
|
||||
@@ -133,7 +169,10 @@ if (evalsEnabled) {
|
||||
}
|
||||
}
|
||||
|
||||
describeE2E('Skill E2E tests', () => {
|
||||
describeIfSelected('Skill E2E tests', [
|
||||
'browse-basic', 'browse-snapshot', 'skillmd-setup-discovery',
|
||||
'skillmd-no-local-binary', 'skillmd-outside-git', 'contributor-mode', 'session-awareness',
|
||||
], () => {
|
||||
beforeAll(() => {
|
||||
testServer = startTestServer();
|
||||
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
|
||||
@@ -145,7 +184,7 @@ describeE2E('Skill E2E tests', () => {
|
||||
try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('browse basic commands work without errors', async () => {
|
||||
testIfSelected('browse-basic', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run these commands in sequence:
|
||||
1. $B goto ${testServer.url}
|
||||
@@ -166,7 +205,7 @@ Report the results of each command.`,
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
test('browse snapshot flags all work', async () => {
|
||||
testIfSelected('browse-snapshot', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run:
|
||||
1. $B goto ${testServer.url}
|
||||
@@ -191,7 +230,7 @@ Report what each command returned.`,
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
test('agent discovers browse binary via SKILL.md setup block', async () => {
|
||||
testIfSelected('skillmd-setup-discovery', async () => {
|
||||
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const setupStart = skillMd.indexOf('## SETUP');
|
||||
const setupEnd = skillMd.indexOf('## IMPORTANT');
|
||||
@@ -220,7 +259,7 @@ Report whether it worked.`,
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
test('SKILL.md setup block handles missing local binary gracefully', async () => {
|
||||
testIfSelected('skillmd-no-local-binary', async () => {
|
||||
// Create a tmpdir with no browse binary — no local .claude/skills/gstack/browse/dist/browse
|
||||
const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
|
||||
|
||||
@@ -255,7 +294,7 @@ Report the exact output. Do NOT try to fix or install anything — just report w
|
||||
try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
|
||||
}, 60_000);
|
||||
|
||||
test('SKILL.md setup block works outside git repo', async () => {
|
||||
testIfSelected('skillmd-outside-git', async () => {
|
||||
// Create a tmpdir outside any git repo
|
||||
const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
|
||||
|
||||
@@ -286,7 +325,7 @@ Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
|
||||
try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
|
||||
}, 60_000);
|
||||
|
||||
test('contributor mode files a report on gstack error', async () => {
|
||||
testIfSelected('contributor-mode', async () => {
|
||||
const contribDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-contrib-'));
|
||||
const logsDir = path.join(contribDir, 'contributor-logs');
|
||||
fs.mkdirSync(logsDir, { recursive: true });
|
||||
@@ -342,7 +381,7 @@ File a contributor report about this issue. Then tell me what you filed.`,
|
||||
try { fs.rmSync(contribDir, { recursive: true, force: true }); } catch {}
|
||||
}, 90_000);
|
||||
|
||||
test('session awareness adds ELI16 context when _SESSIONS >= 3', async () => {
|
||||
testIfSelected('session-awareness', async () => {
|
||||
const sessionDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-session-'));
|
||||
|
||||
// Set up a git repo so there's project/branch context to reference
|
||||
@@ -413,7 +452,7 @@ Remember: _SESSIONS=4, so ELI16 mode is active. The user is juggling multiple wi
|
||||
|
||||
// --- B4: QA skill E2E ---
|
||||
|
||||
describeE2E('QA skill E2E', () => {
|
||||
describeIfSelected('QA skill E2E', ['qa-quick'], () => {
|
||||
let qaDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -468,7 +507,7 @@ Write your report to ${qaDir}/qa-reports/qa-report.md`,
|
||||
|
||||
// --- B5: Review skill E2E ---
|
||||
|
||||
describeE2E('Review skill E2E', () => {
|
||||
describeIfSelected('Review skill E2E', ['review-sql-injection'], () => {
|
||||
let reviewDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -527,7 +566,7 @@ Write your review findings to ${reviewDir}/review-output.md`,
|
||||
|
||||
// --- Review: Enum completeness E2E ---
|
||||
|
||||
describeE2E('Review enum completeness E2E', () => {
|
||||
describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'], () => {
|
||||
let enumDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -597,13 +636,107 @@ The diff adds a new "returned" status to the Order model. Your job is to check i
|
||||
}, 120_000);
|
||||
});
|
||||
|
||||
// --- Review: Design review lite E2E ---
|
||||
|
||||
// Registered through describeIfSelected so the suite honors diff-based
// selection via its 'review-design-lite' touchfile key, like its sibling suites.
describeIfSelected('Review design lite E2E', ['review-design-lite'], () => {
  let designDir: string;

  // Build a throwaway git repo: a clean base commit on main, then a feature
  // branch that swaps in the planted design-slop fixtures.
  beforeAll(() => {
    designDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-design-lite-'));

    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: designDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    // Commit clean base on main
    fs.writeFileSync(path.join(designDir, 'index.html'), '<h1>Clean</h1>\n');
    fs.writeFileSync(path.join(designDir, 'styles.css'), 'body { font-size: 16px; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
    // The prompt diffs against `main`; do not rely on git's configured
    // default branch name (it may be `master`).
    run('git', ['branch', '-M', 'main']);

    // Feature branch adds AI slop CSS + HTML
    run('git', ['checkout', '-b', 'feature/add-landing-page']);
    const slopCss = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-design-slop.css'), 'utf-8');
    const slopHtml = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-design-slop.html'), 'utf-8');
    fs.writeFileSync(path.join(designDir, 'styles.css'), slopCss);
    fs.writeFileSync(path.join(designDir, 'landing.html'), slopHtml);
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'add landing page']);

    // Copy review skill files
    fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(designDir, 'review-SKILL.md'));
    fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(designDir, 'review-checklist.md'));
    fs.copyFileSync(path.join(ROOT, 'review', 'design-checklist.md'), path.join(designDir, 'review-design-checklist.md'));
    fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(designDir, 'review-greptile-triage.md'));
  });

  afterAll(() => {
    // Best-effort cleanup of the temp repo.
    try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
  });

  test('/review catches design anti-patterns in CSS/HTML diff', async () => {
    const result = await runSkillTest({
      prompt: `You are in a git repo on branch feature/add-landing-page with changes against main.
Read review-SKILL.md for the review workflow instructions.
Read review-checklist.md for the code review checklist.
Read review-design-checklist.md for the design review checklist.
Run /review on the current diff (git diff main...HEAD).

The diff adds a landing page with CSS and HTML. Check for both code issues AND design anti-patterns.
Write your review findings to ${designDir}/review-output.md

Important: The design checklist should catch issues like blacklisted fonts, small font sizes, outline:none, !important, AI slop patterns (purple gradients, generic hero copy, 3-column feature grid), etc.`,
      workingDirectory: designDir,
      maxTurns: 15,
      timeout: 120_000,
      testName: 'review-design-lite',
      runId,
    });

    logCost('/review design lite', result);
    recordE2E('/review design lite', 'Review design lite E2E', result);
    expect(result.exitReason).toBe('success');

    // Verify the review caught at least 4 of 7 planted design issues.
    // NOTE: when the report file was not written, this check is skipped.
    const reviewPath = path.join(designDir, 'review-output.md');
    if (fs.existsSync(reviewPath)) {
      const review = fs.readFileSync(reviewPath, 'utf-8').toLowerCase();

      // One entry per planted issue; an issue counts as detected when the
      // lower-cased report contains any of its keywords.
      const plantedIssues: Array<{ issue: string; keywords: string[] }> = [
        { issue: 'Issue 1: Blacklisted font (Papyrus) — HIGH', keywords: ['papyrus', 'blacklisted font', 'font family'] },
        { issue: 'Issue 2: Body text < 16px — HIGH', keywords: ['14px', 'font-size', 'font size', 'body text'] },
        { issue: 'Issue 3: outline: none — HIGH', keywords: ['outline', 'focus'] },
        { issue: 'Issue 4: !important — HIGH', keywords: ['!important', 'important'] },
        { issue: 'Issue 5: Purple gradient — MEDIUM', keywords: ['gradient', 'purple', 'violet', '#6366f1', '#8b5cf6'] },
        { issue: 'Issue 6: Generic hero copy — MEDIUM', keywords: ['welcome to', 'all-in-one', 'generic', 'hero copy', 'ai slop'] },
        { issue: 'Issue 7: 3-column feature grid — LOW', keywords: ['3-column', 'three-column', 'feature grid', 'icon', 'circle'] },
      ];

      const detected = plantedIssues.filter(({ keywords }) =>
        keywords.some(keyword => review.includes(keyword)),
      ).length;

      console.log(`Design review detected ${detected}/7 planted issues`);
      expect(detected).toBeGreaterThanOrEqual(4);
    }
  }, 150_000);
});
|
||||
|
||||
// --- B6/B7/B8: Planted-bug outcome evals ---
|
||||
|
||||
// Outcome evals also need ANTHROPIC_API_KEY for the LLM judge
|
||||
const hasApiKey = !!process.env.ANTHROPIC_API_KEY;
|
||||
const describeOutcome = (evalsEnabled && hasApiKey) ? describe : describe.skip;
|
||||
|
||||
describeOutcome('Planted-bug outcome evals', () => {
|
||||
// Wrap describeOutcome with selection — skip if no planted-bug tests are selected
|
||||
const outcomeTestNames = ['qa-b6-static', 'qa-b7-spa', 'qa-b8-checkout'];
|
||||
const anyOutcomeSelected = selectedTests === null || outcomeTestNames.some(t => selectedTests!.includes(t));
|
||||
(anyOutcomeSelected ? describeOutcome : describe.skip)('Planted-bug outcome evals', () => {
|
||||
let outcomeDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -767,7 +900,7 @@ CRITICAL RULES:
|
||||
|
||||
// --- Plan CEO Review E2E ---
|
||||
|
||||
describeE2E('Plan CEO Review E2E', () => {
|
||||
describeIfSelected('Plan CEO Review E2E', ['plan-ceo-review'], () => {
|
||||
let planDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -854,7 +987,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
|
||||
|
||||
// --- Plan CEO Review (SELECTIVE EXPANSION) E2E ---
|
||||
|
||||
describeE2E('Plan CEO Review SELECTIVE EXPANSION E2E', () => {
|
||||
describeIfSelected('Plan CEO Review SELECTIVE EXPANSION E2E', ['plan-ceo-review-selective'], () => {
|
||||
let planDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -937,7 +1070,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
|
||||
|
||||
// --- Plan Eng Review E2E ---
|
||||
|
||||
describeE2E('Plan Eng Review E2E', () => {
|
||||
describeIfSelected('Plan Eng Review E2E', ['plan-eng-review'], () => {
|
||||
let planDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -1031,7 +1164,7 @@ Focus on architecture, code quality, tests, and performance sections.`,
|
||||
|
||||
// --- Retro E2E ---
|
||||
|
||||
describeE2E('Retro E2E', () => {
|
||||
describeIfSelected('Retro E2E', ['retro'], () => {
|
||||
let retroDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -1117,7 +1250,7 @@ Analyze the git history and produce the narrative report as described in the SKI
|
||||
|
||||
// --- QA-Only E2E (report-only, no fixes) ---
|
||||
|
||||
describeE2E('QA-Only skill E2E', () => {
|
||||
describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => {
|
||||
let qaOnlyDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -1203,7 +1336,7 @@ Write your report to ${qaOnlyDir}/qa-reports/qa-only-report.md`,
|
||||
|
||||
// --- QA Fix Loop E2E ---
|
||||
|
||||
describeE2E('QA Fix Loop E2E', () => {
|
||||
describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => {
|
||||
let qaFixDir: string;
|
||||
let qaFixServer: ReturnType<typeof Bun.serve> | null = null;
|
||||
|
||||
@@ -1317,7 +1450,7 @@ This is a test+fix loop: find bugs, fix them in the source code, commit each fix
|
||||
|
||||
// --- Plan-Eng-Review Test-Plan Artifact E2E ---
|
||||
|
||||
describeE2E('Plan-Eng-Review Test-Plan Artifact E2E', () => {
|
||||
describeIfSelected('Plan-Eng-Review Test-Plan Artifact E2E', ['plan-eng-review-artifact'], () => {
|
||||
let planDir: string;
|
||||
let projectDir: string;
|
||||
|
||||
@@ -1444,7 +1577,7 @@ Write your review to ${planDir}/review-output.md`,
|
||||
|
||||
// --- Base branch detection smoke tests ---
|
||||
|
||||
describeE2E('Base branch detection', () => {
|
||||
describeIfSelected('Base branch detection', ['review-base-branch', 'ship-base-branch', 'retro-base-branch'], () => {
|
||||
let baseBranchDir: string;
|
||||
const run = (cmd: string, args: string[], cwd: string) =>
|
||||
spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });
|
||||
@@ -1457,7 +1590,7 @@ describeE2E('Base branch detection', () => {
|
||||
try { fs.rmSync(baseBranchDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/review detects base branch and diffs against it', async () => {
|
||||
testIfSelected('review-base-branch', async () => {
|
||||
const dir = path.join(baseBranchDir, 'review-base');
|
||||
fs.mkdirSync(dir, { recursive: true });
|
||||
|
||||
@@ -1510,7 +1643,7 @@ Write your findings to ${dir}/review-output.md`,
|
||||
expect(usedGitDiff).toBe(true);
|
||||
}, 120_000);
|
||||
|
||||
test('/ship Step 0-1 detects base branch without destructive actions', async () => {
|
||||
testIfSelected('ship-base-branch', async () => {
|
||||
const dir = path.join(baseBranchDir, 'ship-base');
|
||||
fs.mkdirSync(dir, { recursive: true });
|
||||
|
||||
@@ -1572,7 +1705,7 @@ Write a summary of what you detected to ${dir}/ship-preflight.md including:
|
||||
expect(destructiveTools).toHaveLength(0);
|
||||
}, 90_000);
|
||||
|
||||
test('/retro detects default branch for git queries', async () => {
|
||||
testIfSelected('retro-base-branch', async () => {
|
||||
const dir = path.join(baseBranchDir, 'retro-base');
|
||||
fs.mkdirSync(dir, { recursive: true });
|
||||
|
||||
@@ -1631,7 +1764,7 @@ Write your retrospective to ${dir}/retro-output.md`,
|
||||
|
||||
// --- Document-Release skill E2E ---
|
||||
|
||||
describeE2E('Document-Release skill E2E', () => {
|
||||
describeIfSelected('Document-Release skill E2E', ['document-release'], () => {
|
||||
let docReleaseDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -1735,6 +1868,7 @@ IMPORTANT:
|
||||
|
||||
// --- Deferred skill E2E tests (destructive or require interactive UI) ---
|
||||
|
||||
// Deferred tests — only test.todo entries, no selection needed
|
||||
describeE2E('Deferred skill E2E', () => {
|
||||
// Ship is destructive: pushes to remote, creates PRs, modifies VERSION/CHANGELOG
|
||||
test.todo('/ship completes full workflow');
|
||||
@@ -1742,8 +1876,120 @@ describeE2E('Deferred skill E2E', () => {
|
||||
// Setup-browser-cookies requires interactive browser picker UI
|
||||
test.todo('/setup-browser-cookies imports cookies');
|
||||
|
||||
// Gstack-upgrade is destructive: modifies skill installation directory
|
||||
test.todo('/gstack-upgrade completes upgrade flow');
|
||||
});
|
||||
|
||||
// --- gstack-upgrade E2E ---
|
||||
|
||||
/**
 * E2E coverage for the /gstack-upgrade skill (happy path, local-git install).
 *
 * Fixture built in beforeAll:
 *   - upgradeDir: throwaway "project" git repo holding a copy of gstack-upgrade/SKILL.md
 *   - upgradeDir/.claude/skills/gstack: mock gstack install, its own git repo, checked out at VERSION 0.5.0
 *   - remoteDir: bare repo registered as `origin`; its `main` is one commit ahead (VERSION 0.6.0)
 *
 * The test passes when the agent exits with 'success' or 'error_max_turns' and the
 * install's VERSION file reads 0.6.0 afterwards.
 */
describeIfSelected('gstack-upgrade E2E', ['gstack-upgrade-happy-path'], () => {
  let upgradeDir: string;
  let remoteDir: string;

  beforeAll(() => {
    upgradeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-upgrade-'));
    remoteDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-remote-'));

    // Synchronous command runner; the result is returned but callers below do not inspect it.
    const run = (cmd: string, args: string[], cwd: string) =>
      spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });

    // Init the "project" repo
    run('git', ['init'], upgradeDir);
    run('git', ['config', 'user.email', 'test@test.com'], upgradeDir);
    run('git', ['config', 'user.name', 'Test'], upgradeDir);

    // Create mock gstack install directory (local-git type)
    const mockGstack = path.join(upgradeDir, '.claude', 'skills', 'gstack');
    fs.mkdirSync(mockGstack, { recursive: true });

    // Init as a git repo
    run('git', ['init'], mockGstack);
    run('git', ['config', 'user.email', 'test@test.com'], mockGstack);
    run('git', ['config', 'user.name', 'Test'], mockGstack);

    // Create bare remote
    run('git', ['init', '--bare'], remoteDir);
    run('git', ['remote', 'add', 'origin', remoteDir], mockGstack);

    // Write old version files
    fs.writeFileSync(path.join(mockGstack, 'VERSION'), '0.5.0\n');
    fs.writeFileSync(path.join(mockGstack, 'CHANGELOG.md'),
      '# Changelog\n\n## 0.5.0 — 2026-03-01\n\n- Initial release\n');
    fs.writeFileSync(path.join(mockGstack, 'setup'),
      '#!/bin/bash\necho "Setup completed"\n', { mode: 0o755 });

    // Initial commit + push
    run('git', ['add', '.'], mockGstack);
    run('git', ['commit', '-m', 'initial'], mockGstack);
    run('git', ['push', '-u', 'origin', 'HEAD:main'], mockGstack);

    // Create new version (simulate upstream release)
    fs.writeFileSync(path.join(mockGstack, 'VERSION'), '0.6.0\n');
    fs.writeFileSync(path.join(mockGstack, 'CHANGELOG.md'),
      '# Changelog\n\n## 0.6.0 — 2026-03-15\n\n- New feature: interactive design review\n- Fix: snapshot flag validation\n\n## 0.5.0 — 2026-03-01\n\n- Initial release\n');
    run('git', ['add', '.'], mockGstack);
    run('git', ['commit', '-m', 'release 0.6.0'], mockGstack);
    run('git', ['push', 'origin', 'HEAD:main'], mockGstack);

    // Reset working copy back to old version (origin/main keeps the 0.6.0 commit)
    run('git', ['reset', '--hard', 'HEAD~1'], mockGstack);

    // Copy gstack-upgrade skill
    fs.mkdirSync(path.join(upgradeDir, 'gstack-upgrade'), { recursive: true });
    fs.copyFileSync(
      path.join(ROOT, 'gstack-upgrade', 'SKILL.md'),
      path.join(upgradeDir, 'gstack-upgrade', 'SKILL.md'),
    );

    // Commit so git repo is clean
    run('git', ['add', '.'], upgradeDir);
    run('git', ['commit', '-m', 'initial project'], upgradeDir);
  });

  afterAll(() => {
    // Best-effort cleanup: a failed removal must not fail the suite.
    try { fs.rmSync(upgradeDir, { recursive: true, force: true }); } catch {}
    try { fs.rmSync(remoteDir, { recursive: true, force: true }); } catch {}
  });

  testIfSelected('gstack-upgrade-happy-path', async () => {
    const mockGstack = path.join(upgradeDir, '.claude', 'skills', 'gstack');
    const result = await runSkillTest({
      prompt: `Read gstack-upgrade/SKILL.md for the upgrade workflow.

You are running /gstack-upgrade standalone. The gstack installation is at ./.claude/skills/gstack (local-git type — it has a .git directory with an origin remote).

Current version: 0.5.0. A new version 0.6.0 is available on origin/main.

Follow the standalone upgrade flow:
1. Detect install type (local-git)
2. Run git fetch origin && git reset --hard origin/main in the install directory
3. Run the setup script
4. Show what's new from CHANGELOG

Skip any AskUserQuestion calls — auto-approve the upgrade. Write a summary of what you did to stdout.

IMPORTANT: The install directory is at ./.claude/skills/gstack — use that exact path.`,
      workingDirectory: upgradeDir,
      maxTurns: 20,
      timeout: 180_000,
      testName: 'gstack-upgrade-happy-path',
      runId,
    });

    logCost('/gstack-upgrade happy path', result);

    // The on-disk VERSION file is the source of truth for whether the upgrade happened;
    // the agent's textual output is not part of the pass criterion.
    const versionAfter = fs.readFileSync(path.join(mockGstack, 'VERSION'), 'utf-8').trim();

    recordE2E('/gstack-upgrade happy path', 'gstack-upgrade E2E', result, {
      passed: versionAfter === '0.6.0' && ['success', 'error_max_turns'].includes(result.exitReason),
    });

    expect(['success', 'error_max_turns']).toContain(result.exitReason);
    expect(versionAfter).toBe('0.6.0');
  }, 240_000);
});
|
||||
|
||||
// --- Design Consultation E2E ---
|
||||
@@ -1772,7 +2018,10 @@ ${designMd}
|
||||
Return JSON: { "passed": true/false, "reasoning": "one paragraph explaining your evaluation" }`);
|
||||
}
|
||||
|
||||
describeE2E('Design Consultation E2E', () => {
|
||||
describeIfSelected('Design Consultation E2E', [
|
||||
'design-consultation-core', 'design-consultation-research',
|
||||
'design-consultation-existing', 'design-consultation-preview',
|
||||
], () => {
|
||||
let designDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
@@ -1816,7 +2065,7 @@ A civic tech data platform for government employees to access, visualize, and sh
|
||||
try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('Test 1: core flow produces valid DESIGN.md + CLAUDE.md', async () => {
|
||||
testIfSelected('design-consultation-core', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read design-consultation/SKILL.md for the design consultation workflow.
|
||||
|
||||
@@ -1876,7 +2125,7 @@ Write DESIGN.md and CLAUDE.md (or update it) in the working directory.`,
|
||||
}
|
||||
}, 420_000);
|
||||
|
||||
test('Test 2: research integration uses WebSearch', async () => {
|
||||
testIfSelected('design-consultation-research', async () => {
|
||||
// Clean up from previous test
|
||||
try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {}
|
||||
try { fs.unlinkSync(path.join(designDir, 'CLAUDE.md')); } catch {}
|
||||
@@ -1933,7 +2182,7 @@ Write DESIGN.md to the working directory.`,
|
||||
expect(designExists).toBe(true);
|
||||
}, 420_000);
|
||||
|
||||
test('Test 3: handles existing DESIGN.md', async () => {
|
||||
testIfSelected('design-consultation-existing', async () => {
|
||||
// Pre-create a minimal DESIGN.md
|
||||
fs.writeFileSync(path.join(designDir, 'DESIGN.md'), `# Design System — CivicPulse
|
||||
|
||||
@@ -1979,7 +2228,7 @@ Skip research. Skip font preview. Skip any AskUserQuestion calls — this is non
|
||||
}
|
||||
}, 420_000);
|
||||
|
||||
test('Test 4: generates font + color preview HTML', async () => {
|
||||
testIfSelected('design-consultation-preview', async () => {
|
||||
// Clean up
|
||||
try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {}
|
||||
|
||||
@@ -2041,15 +2290,13 @@ Skip research. Skip any AskUserQuestion calls — this is non-interactive. Gener
|
||||
}, 420_000);
|
||||
});
|
||||
|
||||
// --- Plan Design Review E2E ---
|
||||
// --- Plan Design Review E2E (plan-mode) ---
|
||||
|
||||
describeE2E('Plan Design Review E2E', () => {
|
||||
describeIfSelected('Plan Design Review E2E', ['plan-design-review-plan-mode', 'plan-design-review-no-ui-scope'], () => {
|
||||
let reviewDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
testServer = testServer || startTestServer();
|
||||
reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-design-'));
|
||||
setupBrowseShims(reviewDir);
|
||||
|
||||
const { spawnSync } = require('child_process');
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
@@ -2058,9 +2305,6 @@ describeE2E('Plan Design Review E2E', () => {
|
||||
run('git', ['init']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
fs.writeFileSync(path.join(reviewDir, 'index.html'), '<h1>Test</h1>\n');
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial']);
|
||||
|
||||
// Copy plan-design-review skill
|
||||
fs.mkdirSync(path.join(reviewDir, 'plan-design-review'), { recursive: true });
|
||||
@@ -2068,100 +2312,136 @@ describeE2E('Plan Design Review E2E', () => {
|
||||
path.join(ROOT, 'plan-design-review', 'SKILL.md'),
|
||||
path.join(reviewDir, 'plan-design-review', 'SKILL.md'),
|
||||
);
|
||||
|
||||
// Create a plan file with intentional design gaps
|
||||
fs.writeFileSync(path.join(reviewDir, 'plan.md'), `# Plan: User Dashboard
|
||||
|
||||
## Context
|
||||
Build a user dashboard that shows account stats, recent activity, and settings.
|
||||
|
||||
## Implementation
|
||||
1. Create a dashboard page at /dashboard
|
||||
2. Show user stats (posts, followers, engagement rate)
|
||||
3. Add a recent activity feed
|
||||
4. Add a settings panel
|
||||
5. Use a clean, modern UI with cards and icons
|
||||
6. Add a hero section at the top with a gradient background
|
||||
|
||||
## Technical Details
|
||||
- React components with Tailwind CSS
|
||||
- API endpoint: GET /api/dashboard
|
||||
- WebSocket for real-time activity updates
|
||||
`);
|
||||
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial plan']);
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('Test 5: /plan-design-review produces audit report', async () => {
|
||||
testIfSelected('plan-design-review-plan-mode', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
|
||||
prompt: `Read plan-design-review/SKILL.md for the design review workflow.
|
||||
|
||||
B="${browseBin}"
|
||||
Review the plan in ./plan.md. This plan has several design gaps — it uses vague language like "clean, modern UI" and "cards and icons", mentions a "hero section with gradient" (AI slop), and doesn't specify empty states, error states, loading states, responsive behavior, or accessibility.
|
||||
|
||||
Read plan-design-review/SKILL.md for the design review workflow.
|
||||
Skip the preamble bash block. Skip any AskUserQuestion calls — this is non-interactive. Rate each design dimension 0-10 and explain what would make it a 10. Then EDIT plan.md to add the missing design decisions (interaction state table, empty states, responsive behavior, etc.).
|
||||
|
||||
Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Skip any AskUserQuestion calls — this is non-interactive. Write your audit report to ./design-audit.md. Do not offer to create DESIGN.md.`,
|
||||
IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan review, not a live site audit. Just read the plan file, review it, and edit it to fix the gaps.`,
|
||||
workingDirectory: reviewDir,
|
||||
maxTurns: 20,
|
||||
timeout: 360_000,
|
||||
testName: 'plan-design-review-audit',
|
||||
maxTurns: 15,
|
||||
timeout: 300_000,
|
||||
testName: 'plan-design-review-plan-mode',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/plan-design-review audit', result);
|
||||
logCost('/plan-design-review plan-mode', result);
|
||||
|
||||
const reportPath = path.join(reviewDir, 'design-audit.md');
|
||||
const reportExists = fs.existsSync(reportPath);
|
||||
let reportContent = '';
|
||||
if (reportExists) {
|
||||
reportContent = fs.readFileSync(reportPath, 'utf-8');
|
||||
}
|
||||
// Check that the agent produced design ratings (0-10 scale)
|
||||
const output = result.output || '';
|
||||
const hasRatings = /\d+\/10/.test(output);
|
||||
const hasDesignContent = output.toLowerCase().includes('information architecture') ||
|
||||
output.toLowerCase().includes('interaction state') ||
|
||||
output.toLowerCase().includes('ai slop') ||
|
||||
output.toLowerCase().includes('hierarchy');
|
||||
|
||||
const hasFirstImpression = reportContent.toLowerCase().includes('first impression') ||
|
||||
reportContent.toLowerCase().includes('impression');
|
||||
// Check that the plan file was edited (the core new behavior)
|
||||
const planAfter = fs.readFileSync(path.join(reviewDir, 'plan.md'), 'utf-8');
|
||||
const planOriginal = `# Plan: User Dashboard`;
|
||||
const planWasEdited = planAfter.length > 300; // Original is ~450 chars, edited should be much longer
|
||||
const planHasDesignAdditions = planAfter.toLowerCase().includes('empty') ||
|
||||
planAfter.toLowerCase().includes('loading') ||
|
||||
planAfter.toLowerCase().includes('error') ||
|
||||
planAfter.toLowerCase().includes('state') ||
|
||||
planAfter.toLowerCase().includes('responsive') ||
|
||||
planAfter.toLowerCase().includes('accessibility');
|
||||
|
||||
recordE2E('/plan-design-review audit', 'Plan Design Review E2E', result, {
|
||||
passed: reportExists && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
recordE2E('/plan-design-review plan-mode', 'Plan Design Review E2E', result, {
|
||||
passed: hasDesignContent && planWasEdited && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
expect(reportExists).toBe(true);
|
||||
if (reportExists) {
|
||||
expect(reportContent.length).toBeGreaterThan(200);
|
||||
}
|
||||
}, 420_000);
|
||||
// Agent should produce design-relevant output about the plan
|
||||
expect(hasDesignContent).toBe(true);
|
||||
// Agent should have edited the plan file to add missing design decisions
|
||||
expect(planWasEdited).toBe(true);
|
||||
expect(planHasDesignAdditions).toBe(true);
|
||||
}, 360_000);
|
||||
|
||||
test('Test 6: /plan-design-review exports DESIGN.md', async () => {
|
||||
// Clean up previous test artifacts
|
||||
try { fs.unlinkSync(path.join(reviewDir, 'design-audit.md')); } catch {}
|
||||
testIfSelected('plan-design-review-no-ui-scope', async () => {
|
||||
// Write a backend-only plan
|
||||
fs.writeFileSync(path.join(reviewDir, 'backend-plan.md'), `# Plan: Database Migration
|
||||
|
||||
## Context
|
||||
Migrate user records from PostgreSQL to a new schema with better indexing.
|
||||
|
||||
## Implementation
|
||||
1. Create migration to add new columns to users table
|
||||
2. Backfill data from legacy columns
|
||||
3. Add database indexes for common query patterns
|
||||
4. Update ActiveRecord models
|
||||
5. Run migration in staging first, then production
|
||||
`);
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
|
||||
prompt: `Read plan-design-review/SKILL.md for the design review workflow.
|
||||
|
||||
B="${browseBin}"
|
||||
Review the plan in ./backend-plan.md. This is a pure backend database migration plan with no UI changes.
|
||||
|
||||
Read plan-design-review/SKILL.md for the design review workflow.
|
||||
Skip the preamble bash block. Skip any AskUserQuestion calls — this is non-interactive. Write your findings directly to stdout.
|
||||
|
||||
Review ${testServer.url} with --quick mode. Skip any AskUserQuestion calls — this is non-interactive. After Phase 2 (Design System Extraction), write a DESIGN.md to the working directory. Also write the audit report to ./design-audit.md.`,
|
||||
IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan review, not a live site audit.`,
|
||||
workingDirectory: reviewDir,
|
||||
maxTurns: 25,
|
||||
timeout: 360_000,
|
||||
testName: 'plan-design-review-export',
|
||||
maxTurns: 10,
|
||||
timeout: 180_000,
|
||||
testName: 'plan-design-review-no-ui-scope',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/plan-design-review export', result);
|
||||
logCost('/plan-design-review no-ui-scope', result);
|
||||
|
||||
const designPath = path.join(reviewDir, 'DESIGN.md');
|
||||
const reportPath = path.join(reviewDir, 'design-audit.md');
|
||||
const designExists = fs.existsSync(designPath);
|
||||
const reportExists = fs.existsSync(reportPath);
|
||||
// Agent should detect no UI scope and exit early
|
||||
const output = result.output || '';
|
||||
const detectsNoUI = output.toLowerCase().includes('no ui') ||
|
||||
output.toLowerCase().includes('no frontend') ||
|
||||
output.toLowerCase().includes('no design') ||
|
||||
output.toLowerCase().includes('not applicable') ||
|
||||
output.toLowerCase().includes('backend');
|
||||
|
||||
let designContent = '';
|
||||
if (designExists) {
|
||||
designContent = fs.readFileSync(designPath, 'utf-8');
|
||||
}
|
||||
|
||||
const hasTypography = designContent.toLowerCase().includes('typography') || designContent.toLowerCase().includes('font');
|
||||
const hasColor = designContent.toLowerCase().includes('color');
|
||||
|
||||
recordE2E('/plan-design-review export', 'Plan Design Review E2E', result, {
|
||||
passed: designExists && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
recordE2E('/plan-design-review no-ui-scope', 'Plan Design Review E2E', result, {
|
||||
passed: detectsNoUI && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
// DESIGN.md export is best-effort — agent may not always produce it
|
||||
if (designExists) {
|
||||
expect(hasTypography || hasColor).toBe(true);
|
||||
}
|
||||
}, 420_000);
|
||||
expect(detectsNoUI).toBe(true);
|
||||
}, 240_000);
|
||||
});
|
||||
|
||||
// --- QA Design Review E2E ---
|
||||
// --- Design Review E2E (live-site audit + fix) ---
|
||||
|
||||
describeE2E('QA Design Review E2E', () => {
|
||||
describeIfSelected('Design Review E2E', ['design-review-fix'], () => {
|
||||
let qaDesignDir: string;
|
||||
let qaDesignServer: ReturnType<typeof Bun.serve> | null = null;
|
||||
|
||||
@@ -2237,11 +2517,11 @@ describeE2E('QA Design Review E2E', () => {
|
||||
},
|
||||
});
|
||||
|
||||
// Copy qa-design-review skill
|
||||
fs.mkdirSync(path.join(qaDesignDir, 'qa-design-review'), { recursive: true });
|
||||
// Copy design-review skill
|
||||
fs.mkdirSync(path.join(qaDesignDir, 'design-review'), { recursive: true });
|
||||
fs.copyFileSync(
|
||||
path.join(ROOT, 'qa-design-review', 'SKILL.md'),
|
||||
path.join(qaDesignDir, 'qa-design-review', 'SKILL.md'),
|
||||
path.join(ROOT, 'design-review', 'SKILL.md'),
|
||||
path.join(qaDesignDir, 'design-review', 'SKILL.md'),
|
||||
);
|
||||
});
|
||||
|
||||
@@ -2250,7 +2530,7 @@ describeE2E('QA Design Review E2E', () => {
|
||||
try { fs.rmSync(qaDesignDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('Test 7: /qa-design-review audits and fixes design issues', async () => {
|
||||
test('Test 7: /design-review audits and fixes design issues', async () => {
|
||||
const serverUrl = `http://localhost:${(qaDesignServer as any)?.port}`;
|
||||
|
||||
const result = await runSkillTest({
|
||||
@@ -2258,17 +2538,17 @@ describeE2E('QA Design Review E2E', () => {
|
||||
|
||||
B="${browseBin}"
|
||||
|
||||
Read qa-design-review/SKILL.md for the design review + fix workflow.
|
||||
Read design-review/SKILL.md for the design review + fix workflow.
|
||||
|
||||
Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion calls — this is non-interactive. Fix up to 3 issues max. Write your report to ./design-audit.md.`,
|
||||
workingDirectory: qaDesignDir,
|
||||
maxTurns: 30,
|
||||
timeout: 360_000,
|
||||
testName: 'qa-design-review-fix',
|
||||
testName: 'design-review-fix',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/qa-design-review fix', result);
|
||||
logCost('/design-review fix', result);
|
||||
|
||||
const reportPath = path.join(qaDesignDir, 'design-audit.md');
|
||||
const reportExists = fs.existsSync(reportPath);
|
||||
@@ -2280,7 +2560,7 @@ Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion call
|
||||
const commits = gitLog.stdout.toString().trim().split('\n');
|
||||
const designFixCommits = commits.filter((c: string) => c.includes('style(design)'));
|
||||
|
||||
recordE2E('/qa-design-review fix', 'QA Design Review E2E', result, {
|
||||
recordE2E('/design-review fix', 'Design Review E2E', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
|
||||
@@ -2300,7 +2580,7 @@ Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion call
|
||||
|
||||
// --- Test Bootstrap E2E ---
|
||||
|
||||
describeE2E('Test Bootstrap E2E', () => {
|
||||
describeIfSelected('Test Bootstrap E2E', ['qa-bootstrap'], () => {
|
||||
let bootstrapDir: string;
|
||||
let bootstrapServer: ReturnType<typeof Bun.serve>;
|
||||
|
||||
@@ -2437,7 +2717,7 @@ This is a test+fix loop: find bugs, fix them, write regression tests, commit eac
|
||||
|
||||
// --- Test Coverage Audit E2E ---
|
||||
|
||||
describeE2E('Test Coverage Audit E2E', () => {
|
||||
describeIfSelected('Test Coverage Audit E2E', ['ship-coverage-audit'], () => {
|
||||
let coverageDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
|
||||
+253
-13
@@ -17,6 +17,7 @@ import * as path from 'path';
|
||||
import { callJudge, judge } from './helpers/llm-judge';
|
||||
import type { JudgeScore } from './helpers/llm-judge';
|
||||
import { EvalCollector } from './helpers/eval-store';
|
||||
import { selectTests, detectBaseBranch, getChangedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
// Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
|
||||
@@ -26,8 +27,43 @@ const describeEval = evalsEnabled ? describe : describe.skip;
|
||||
// Eval result collector
|
||||
const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null;
|
||||
|
||||
describeEval('LLM-as-judge quality evals', () => {
|
||||
test('command reference table scores >= 4 on all dimensions', async () => {
|
||||
// --- Diff-based test selection ---
// `selectedTests` stays null unless evals are enabled, EVALS_ALL is unset, and the
// diff against the base branch contains at least one changed file. Otherwise it
// holds the test names returned by selectTests().
let selectedTests: string[] | null = null;

if (evalsEnabled && !process.env.EVALS_ALL) {
  // Base branch precedence: explicit EVALS_BASE, then auto-detection, then 'main'.
  const base = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
  const changed = getChangedFiles(base, ROOT);

  if (changed.length > 0) {
    const picked = selectTests(changed, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES);
    selectedTests = picked.selected;

    // Report the selection on stderr.
    const total = Object.keys(LLM_JUDGE_TOUCHFILES).length;
    process.stderr.write(`\nLLM-judge selection (${picked.reason}): ${picked.selected.length}/${total} tests\n`);
    if (picked.skipped.length > 0) {
      const skippedList = picked.skipped.join(', ');
      process.stderr.write(` Skipped: ${skippedList}\n`);
    }
    process.stderr.write('\n');
  }
}
|
||||
|
||||
/**
 * Register a describe block, skipping it when none of its tests are selected.
 *
 * @param name      Suite title passed through to the describe function.
 * @param testNames Test names belonging to this suite; the suite runs if any one is selected.
 * @param fn        Suite body.
 *
 * When `selectedTests` is null the suite is registered through `describeEval`
 * without consulting `testNames`.
 */
function describeIfSelected(name: string, testNames: string[], fn: () => void) {
  // Snapshot the module-level selection so the callback below can use it without a non-null assertion.
  const selection = selectedTests;
  const hasSelectedTest = selection === null || testNames.some(candidate => selection.includes(candidate));
  const register = hasSelectedTest ? describeEval : describe.skip;
  register(name, fn);
}
|
||||
|
||||
/**
 * Register a single test, skipping it when it is not selected (for multi-test describe blocks).
 *
 * @param testName Test title; also the key looked up in `selectedTests`.
 * @param fn       Async test body.
 * @param timeout  Timeout forwarded to the test runner.
 *
 * A null `selectedTests` means the test is always registered to run.
 */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  const deselected = selectedTests !== null && !selectedTests.includes(testName);
  const register = deselected ? test.skip : test;
  register(testName, fn, timeout);
}
|
||||
|
||||
describeIfSelected('LLM-as-judge quality evals', [
|
||||
'command reference table', 'snapshot flags reference',
|
||||
'browse/SKILL.md reference', 'setup block', 'regression vs baseline',
|
||||
], () => {
|
||||
testIfSelected('command reference table', async () => {
|
||||
const t0 = Date.now();
|
||||
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const start = content.indexOf('## Command Reference');
|
||||
@@ -53,7 +89,7 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||
expect(scores.actionability).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
|
||||
test('snapshot flags section scores >= 4 on all dimensions', async () => {
|
||||
testIfSelected('snapshot flags reference', async () => {
|
||||
const t0 = Date.now();
|
||||
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const start = content.indexOf('## Snapshot System');
|
||||
@@ -79,7 +115,7 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||
expect(scores.actionability).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
|
||||
test('browse/SKILL.md overall scores >= 4', async () => {
|
||||
testIfSelected('browse/SKILL.md reference', async () => {
|
||||
const t0 = Date.now();
|
||||
const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8');
|
||||
const start = content.indexOf('## Snapshot Flags');
|
||||
@@ -104,7 +140,7 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||
expect(scores.actionability).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
|
||||
test('setup block scores >= 3 on actionability and clarity', async () => {
|
||||
testIfSelected('setup block', async () => {
|
||||
const t0 = Date.now();
|
||||
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const setupStart = content.indexOf('## SETUP');
|
||||
@@ -131,7 +167,7 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||
expect(scores.clarity).toBeGreaterThanOrEqual(3);
|
||||
}, 30_000);
|
||||
|
||||
test('regression check: compare branch vs baseline quality', async () => {
|
||||
testIfSelected('regression vs baseline', async () => {
|
||||
const t0 = Date.now();
|
||||
const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const genStart = generated.indexOf('## Command Reference');
|
||||
@@ -220,10 +256,10 @@ Scores are 1-5 overall quality.`,
|
||||
|
||||
// --- Part 7: QA skill quality evals (C6) ---
|
||||
|
||||
describeEval('QA skill quality evals', () => {
|
||||
describeIfSelected('QA skill quality evals', ['qa/SKILL.md workflow', 'qa/SKILL.md health rubric'], () => {
|
||||
const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
|
||||
|
||||
test('qa/SKILL.md workflow quality scores >= 4', async () => {
|
||||
testIfSelected('qa/SKILL.md workflow', async () => {
|
||||
const t0 = Date.now();
|
||||
const start = qaContent.indexOf('## Workflow');
|
||||
const end = qaContent.indexOf('## Health Score Rubric');
|
||||
@@ -266,7 +302,7 @@ ${section}`);
|
||||
expect(scores.actionability).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
|
||||
test('qa/SKILL.md health score rubric is unambiguous', async () => {
|
||||
testIfSelected('qa/SKILL.md health rubric', async () => {
|
||||
const t0 = Date.now();
|
||||
const start = qaContent.indexOf('## Health Score Rubric');
|
||||
const section = qaContent.slice(start);
|
||||
@@ -310,8 +346,8 @@ ${section}`);
|
||||
|
||||
// --- Part 7: Cross-skill consistency judge (C7) ---
|
||||
|
||||
describeEval('Cross-skill consistency evals', () => {
|
||||
test('greptile-history patterns are consistent across all skills', async () => {
|
||||
describeIfSelected('Cross-skill consistency evals', ['cross-skill greptile consistency'], () => {
|
||||
testIfSelected('cross-skill greptile consistency', async () => {
|
||||
const t0 = Date.now();
|
||||
const reviewContent = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
|
||||
const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
|
||||
@@ -375,10 +411,10 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`);
|
||||
|
||||
// --- Part 7: Baseline score pinning (C9) ---
|
||||
|
||||
describeEval('Baseline score pinning', () => {
|
||||
describeIfSelected('Baseline score pinning', ['baseline score pinning'], () => {
|
||||
const baselinesPath = path.join(ROOT, 'test', 'fixtures', 'eval-baselines.json');
|
||||
|
||||
test('LLM eval scores do not regress below baselines', async () => {
|
||||
testIfSelected('baseline score pinning', async () => {
|
||||
const t0 = Date.now();
|
||||
if (!fs.existsSync(baselinesPath)) {
|
||||
console.log('No baseline file found — skipping pinning check');
|
||||
@@ -428,6 +464,210 @@ describeEval('Baseline score pinning', () => {
|
||||
}, 60_000);
|
||||
});
|
||||
|
||||
// --- Workflow SKILL.md quality evals (10 new tests for 100% coverage) ---
|
||||
|
||||
/**
 * DRY helper for workflow SKILL.md judge tests.
 * Extracts a section from a SKILL.md file and judges its quality as an agent workflow.
 *
 * @param opts.testName     Name used in the console log and in the collected eval record.
 * @param opts.suite        Suite name stored in the collected eval record.
 * @param opts.skillPath    Path of the SKILL.md file, relative to ROOT.
 * @param opts.startMarker  Text marking where the judged section begins (included in the section).
 * @param opts.endMarker    Text marking where the section ends (excluded), or null to judge
 *                          everything from the start marker to the end of the file.
 * @param opts.judgeContext Phrase interpolated into the prompt describing what is being judged.
 * @param opts.judgeGoal    Phrase interpolated into the prompt describing what the agent learns.
 * @param opts.thresholds   Minimum scores; defaults to clarity 4, completeness 3, actionability 4.
 * @throws Error when the start marker, or a non-null end marker, cannot be found.
 */
async function runWorkflowJudge(opts: {
  testName: string;
  suite: string;
  skillPath: string;
  startMarker: string;
  endMarker: string | null;
  judgeContext: string;
  judgeGoal: string;
  thresholds?: { clarity: number; completeness: number; actionability: number };
}) {
  const t0 = Date.now();
  const defaults = { clarity: 4, completeness: 3, actionability: 4 };
  const thresholds = { ...defaults, ...opts.thresholds };

  const content = fs.readFileSync(path.join(ROOT, opts.skillPath), 'utf-8');
  const startIdx = content.indexOf(opts.startMarker);
  if (startIdx === -1) throw new Error(`Start marker not found in ${opts.skillPath}: "${opts.startMarker}"`);

  let section: string;
  if (opts.endMarker) {
    // Search strictly after the start position: an end marker that equals, or is a
    // prefix of, the start marker would otherwise match at startIdx and yield an
    // empty section for the judge to score.
    const endIdx = content.indexOf(opts.endMarker, startIdx + 1);
    if (endIdx === -1) throw new Error(`End marker not found in ${opts.skillPath}: "${opts.endMarker}"`);
    section = content.slice(startIdx, endIdx);
  } else {
    section = content.slice(startIdx);
  }

  const scores = await callJudge<JudgeScore>(`You are evaluating the quality of ${opts.judgeContext} for an AI coding agent.

The agent reads this document to learn ${opts.judgeGoal}. It references external tools and files
that are documented separately — do NOT penalize for missing external definitions.

Rate on three dimensions (1-5 scale):
- **clarity** (1-5): Can an agent follow the instructions without ambiguity?
- **completeness** (1-5): Are all steps, decision points, and outputs well-defined?
- **actionability** (1-5): Can an agent execute this workflow and produce the expected deliverables?

Respond with ONLY valid JSON:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the document to evaluate:

${section}`);

  console.log(`${opts.testName} scores:`, JSON.stringify(scores, null, 2));

  // Single source of truth for the recorded pass/fail flag; the expect() calls
  // below enforce the same three thresholds individually.
  const passed =
    scores.clarity >= thresholds.clarity &&
    scores.completeness >= thresholds.completeness &&
    scores.actionability >= thresholds.actionability;

  evalCollector?.addTest({
    name: opts.testName,
    suite: opts.suite,
    tier: 'llm-judge',
    passed,
    duration_ms: Date.now() - t0,
    cost_usd: 0.02, // hard-coded value, not read from the judge response
    judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
    judge_reasoning: scores.reasoning,
  });

  expect(scores.clarity).toBeGreaterThanOrEqual(thresholds.clarity);
  expect(scores.completeness).toBeGreaterThanOrEqual(thresholds.completeness);
  expect(scores.actionability).toBeGreaterThanOrEqual(thresholds.actionability);
}
|
||||
|
||||
// Block 1: Ship & Release skills — each row below becomes one LLM-judge test.
const shipReleaseSuite = 'Ship & Release skill evals';
const shipReleaseCases = [
  {
    testName: 'ship/SKILL.md workflow',
    skillPath: 'ship/SKILL.md',
    startMarker: '# Ship:',
    endMarker: '## Important Rules',
    judgeContext: 'a ship/release workflow document',
    judgeGoal: 'how to create a PR: merge base branch, run tests, review diff, bump version, update changelog, push, and open PR',
  },
  {
    testName: 'document-release/SKILL.md workflow',
    skillPath: 'document-release/SKILL.md',
    startMarker: '# Document Release:',
    endMarker: '## Important Rules',
    judgeContext: 'a post-ship documentation update workflow',
    judgeGoal: 'how to audit and update project documentation after code ships: README, ARCHITECTURE, CONTRIBUTING, CLAUDE.md, CHANGELOG, TODOS',
  },
];

describeIfSelected(shipReleaseSuite, shipReleaseCases.map((judgeCase) => judgeCase.testName), () => {
  for (const judgeCase of shipReleaseCases) {
    testIfSelected(judgeCase.testName, async () => {
      await runWorkflowJudge({ ...judgeCase, suite: shipReleaseSuite });
    }, 30_000);
  }
});
|
||||
|
||||
// Block 2: Plan Review skills — each row below becomes one LLM-judge test.
const planReviewSuite = 'Plan Review skill evals';
const planReviewCases = [
  {
    testName: 'plan-ceo-review/SKILL.md modes',
    skillPath: 'plan-ceo-review/SKILL.md',
    startMarker: '## Step 0: Nuclear Scope Challenge',
    endMarker: '## Review Sections',
    judgeContext: 'a CEO/founder plan review framework with 4 scope modes',
    judgeGoal: 'how to conduct a CEO-perspective plan review: challenge scope, select a mode (Expansion, Selective Expansion, Hold Scope, Reduction), then review sections interactively',
  },
  {
    testName: 'plan-eng-review/SKILL.md sections',
    skillPath: 'plan-eng-review/SKILL.md',
    startMarker: '## BEFORE YOU START:',
    endMarker: '## CRITICAL RULE',
    judgeContext: 'an engineering plan review framework with 4 review sections',
    judgeGoal: 'how to review a plan for architecture quality, code quality, test coverage, and performance — walking through each section interactively with AskUserQuestion',
  },
  {
    testName: 'plan-design-review/SKILL.md passes',
    skillPath: 'plan-design-review/SKILL.md',
    startMarker: '## Review Sections',
    endMarker: '## CRITICAL RULE',
    judgeContext: 'a design plan review framework with 7 review passes',
    judgeGoal: 'how to review a plan for design quality using a 0-10 rating method: rate each dimension, explain what a 10 looks like, edit the plan to fix gaps, then re-rate',
  },
];

describeIfSelected(planReviewSuite, planReviewCases.map((judgeCase) => judgeCase.testName), () => {
  for (const judgeCase of planReviewCases) {
    testIfSelected(judgeCase.testName, async () => {
      await runWorkflowJudge({ ...judgeCase, suite: planReviewSuite });
    }, 30_000);
  }
});
|
||||
|
||||
// Block 3: Design skills — each row below becomes one LLM-judge test.
const designSuite = 'Design skill evals';
const designCases = [
  {
    testName: 'design-review/SKILL.md fix loop',
    skillPath: 'design-review/SKILL.md',
    startMarker: '## Phase 7:',
    endMarker: '## Additional Rules',
    judgeContext: 'a design audit triage and fix loop workflow',
    judgeGoal: 'how to triage design issues by severity, fix them atomically in source code, commit each fix, and re-verify with before/after screenshots',
  },
  {
    testName: 'design-consultation/SKILL.md research',
    skillPath: 'design-consultation/SKILL.md',
    startMarker: '## Phase 1:',
    endMarker: '## Phase 4:',
    judgeContext: 'a design consultation research and proposal workflow',
    judgeGoal: 'how to gather product context, research the competitive landscape, and produce a complete design system proposal with typography, color, spacing, and motion specifications',
  },
];

describeIfSelected(designSuite, designCases.map((judgeCase) => judgeCase.testName), () => {
  for (const judgeCase of designCases) {
    testIfSelected(judgeCase.testName, async () => {
      await runWorkflowJudge({ ...judgeCase, suite: designSuite });
    }, 30_000);
  }
});
|
||||
|
||||
// Block 4: Other skills — each row below becomes one LLM-judge test.
const otherSkillsSuite = 'Other skill evals';
const otherSkillsCases = [
  {
    testName: 'retro/SKILL.md instructions',
    skillPath: 'retro/SKILL.md',
    startMarker: '## Instructions',
    endMarker: '## Compare Mode',
    judgeContext: 'an engineering retrospective data gathering and analysis workflow',
    judgeGoal: 'how to gather git metrics (commit history, test counts, work patterns), analyze them, produce a structured retro report with praise, growth areas, and trend tracking',
  },
  {
    testName: 'qa-only/SKILL.md workflow',
    skillPath: 'qa-only/SKILL.md',
    startMarker: '## Workflow',
    endMarker: '## Important Rules',
    judgeContext: 'a report-only QA testing workflow',
    judgeGoal: 'how to systematically QA test a web application and produce a structured report with health score, screenshots, and repro steps — without fixing anything',
  },
  {
    testName: 'gstack-upgrade/SKILL.md upgrade flow',
    skillPath: 'gstack-upgrade/SKILL.md',
    startMarker: '## Inline upgrade flow',
    endMarker: '## Standalone usage',
    judgeContext: 'a version upgrade detection and execution workflow',
    judgeGoal: 'how to detect install type, compare versions, back up current install, upgrade via git or fresh clone, run setup, and show what changed',
  },
];

describeIfSelected(otherSkillsSuite, otherSkillsCases.map((judgeCase) => judgeCase.testName), () => {
  for (const judgeCase of otherSkillsCases) {
    testIfSelected(judgeCase.testName, async () => {
      await runWorkflowJudge({ ...judgeCase, suite: otherSkillsSuite });
    }, 30_000);
  }
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
if (evalCollector) {
|
||||
|
||||
@@ -72,15 +72,29 @@ describe('SKILL.md command validation', () => {
|
||||
expect(result.snapshotFlagErrors).toHaveLength(0);
|
||||
});
|
||||
|
||||
test('all $B commands in qa-design-review/SKILL.md are valid browse commands', () => {
|
||||
const skill = path.join(ROOT, 'qa-design-review', 'SKILL.md');
|
||||
test('all $B commands in design-review/SKILL.md are valid browse commands', () => {
|
||||
const skill = path.join(ROOT, 'design-review', 'SKILL.md');
|
||||
if (!fs.existsSync(skill)) return;
|
||||
const result = validateSkill(skill);
|
||||
expect(result.invalid).toHaveLength(0);
|
||||
});
|
||||
|
||||
test('all snapshot flags in qa-design-review/SKILL.md are valid', () => {
|
||||
const skill = path.join(ROOT, 'qa-design-review', 'SKILL.md');
|
||||
test('all snapshot flags in design-review/SKILL.md are valid', () => {
|
||||
const skill = path.join(ROOT, 'design-review', 'SKILL.md');
|
||||
if (!fs.existsSync(skill)) return;
|
||||
const result = validateSkill(skill);
|
||||
expect(result.snapshotFlagErrors).toHaveLength(0);
|
||||
});
|
||||
|
||||
test('all $B commands in design-consultation/SKILL.md are valid browse commands', () => {
  const skillFile = path.join(ROOT, 'design-consultation', 'SKILL.md');
  // When the file is absent there is nothing to validate and the test passes as-is.
  if (fs.existsSync(skillFile)) {
    const { invalid } = validateSkill(skillFile);
    expect(invalid).toHaveLength(0);
  }
});
|
||||
|
||||
test('all snapshot flags in design-consultation/SKILL.md are valid', () => {
|
||||
const skill = path.join(ROOT, 'design-consultation', 'SKILL.md');
|
||||
if (!fs.existsSync(skill)) return;
|
||||
const result = validateSkill(skill);
|
||||
expect(result.snapshotFlagErrors).toHaveLength(0);
|
||||
@@ -206,7 +220,7 @@ describe('Update check preamble', () => {
|
||||
'retro/SKILL.md',
|
||||
'office-hours/SKILL.md', 'debug/SKILL.md',
|
||||
'plan-design-review/SKILL.md',
|
||||
'qa-design-review/SKILL.md',
|
||||
'design-review/SKILL.md',
|
||||
'design-consultation/SKILL.md',
|
||||
'document-release/SKILL.md',
|
||||
];
|
||||
@@ -431,6 +445,8 @@ describe('No hardcoded branch names in SKILL templates', () => {
|
||||
'plan-ceo-review/SKILL.md.tmpl',
|
||||
'retro/SKILL.md.tmpl',
|
||||
'document-release/SKILL.md.tmpl',
|
||||
'plan-eng-review/SKILL.md.tmpl',
|
||||
'plan-design-review/SKILL.md.tmpl',
|
||||
];
|
||||
|
||||
// Patterns that indicate hardcoded 'main' in git commands
|
||||
@@ -515,7 +531,7 @@ describe('v0.4.1 preamble features', () => {
|
||||
'retro/SKILL.md',
|
||||
'office-hours/SKILL.md', 'debug/SKILL.md',
|
||||
'plan-design-review/SKILL.md',
|
||||
'qa-design-review/SKILL.md',
|
||||
'design-review/SKILL.md',
|
||||
'design-consultation/SKILL.md',
|
||||
'document-release/SKILL.md',
|
||||
];
|
||||
@@ -616,6 +632,10 @@ describe('Contributor mode preamble structure', () => {
|
||||
'ship/SKILL.md', 'review/SKILL.md',
|
||||
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
|
||||
'retro/SKILL.md',
|
||||
'plan-design-review/SKILL.md',
|
||||
'design-review/SKILL.md',
|
||||
'design-consultation/SKILL.md',
|
||||
'document-release/SKILL.md',
|
||||
];
|
||||
|
||||
for (const skill of skillsWithPreamble) {
|
||||
@@ -701,7 +721,7 @@ describe('Completeness Principle in generated SKILL.md files', () => {
|
||||
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
|
||||
'retro/SKILL.md',
|
||||
'plan-design-review/SKILL.md',
|
||||
'qa-design-review/SKILL.md',
|
||||
'design-review/SKILL.md',
|
||||
'design-consultation/SKILL.md',
|
||||
'document-release/SKILL.md',
|
||||
];
|
||||
@@ -874,8 +894,8 @@ describe('Test Bootstrap ({{TEST_BOOTSTRAP}}) integration', () => {
|
||||
expect(content).toContain('Step 2.5');
|
||||
});
|
||||
|
||||
test('TEST_BOOTSTRAP appears in qa-design-review/SKILL.md', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8');
|
||||
test('TEST_BOOTSTRAP appears in design-review/SKILL.md', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8');
|
||||
expect(content).toContain('Test Framework Bootstrap');
|
||||
});
|
||||
|
||||
@@ -916,10 +936,10 @@ describe('Test Bootstrap ({{TEST_BOOTSTRAP}}) integration', () => {
|
||||
expect(content).toContain('100% test coverage');
|
||||
});
|
||||
|
||||
test('WebSearch is in allowed-tools for qa, ship, qa-design-review', () => {
|
||||
test('WebSearch is in allowed-tools for qa, ship, design-review', () => {
|
||||
const qa = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
|
||||
const ship = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
|
||||
const qaDesign = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8');
|
||||
const qaDesign = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8');
|
||||
expect(qa).toContain('WebSearch');
|
||||
expect(ship).toContain('WebSearch');
|
||||
expect(qaDesign).toContain('WebSearch');
|
||||
@@ -942,8 +962,8 @@ describe('Phase 8e.5 regression test generation', () => {
|
||||
expect(content).not.toContain('Never modify tests or CI configuration');
|
||||
});
|
||||
|
||||
test('qa-design-review has CSS-aware Phase 8e.5 variant', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8');
|
||||
test('design-review has CSS-aware Phase 8e.5 variant', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8');
|
||||
expect(content).toContain('8e.5. Regression Test (design-review variant)');
|
||||
expect(content).toContain('CSS-only');
|
||||
expect(content).toContain('test(design): regression test');
|
||||
|
||||
@@ -0,0 +1,253 @@
|
||||
/**
|
||||
* Unit tests for diff-based test selection.
|
||||
* Free (no API calls), runs with `bun test`.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import {
|
||||
matchGlob,
|
||||
selectTests,
|
||||
detectBaseBranch,
|
||||
E2E_TOUCHFILES,
|
||||
LLM_JUDGE_TOUCHFILES,
|
||||
GLOBAL_TOUCHFILES,
|
||||
} from './helpers/touchfiles';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
// --- matchGlob ---

describe('matchGlob', () => {
  /** Asserts `matchGlob(file, pattern) === expected` for every `[file, pattern, expected]` row. */
  const expectRows = (rows: Array<[string, string, boolean]>): void => {
    for (const [file, pattern, expected] of rows) {
      expect(matchGlob(file, pattern)).toBe(expected);
    }
  };

  test('** matches any depth of path segments', () => {
    expectRows([
      ['browse/src/commands.ts', 'browse/src/**', true],
      ['browse/src/deep/nested/file.ts', 'browse/src/**', true],
      ['browse/src/cli.ts', 'browse/src/**', true],
    ]);
  });

  test('** does not match unrelated paths', () => {
    expectRows([
      ['browse/src/commands.ts', 'qa/**', false],
      ['review/SKILL.md', 'qa/**', false],
    ]);
  });

  test('exact match works', () => {
    expectRows([
      ['SKILL.md', 'SKILL.md', true],
      ['SKILL.md.tmpl', 'SKILL.md', false],
      ['qa/SKILL.md', 'SKILL.md', false],
    ]);
  });

  test('* matches within a single segment', () => {
    const pattern = 'test/fixtures/review-eval-enum*.rb';
    expectRows([
      ['test/fixtures/review-eval-enum.rb', pattern, true],
      ['test/fixtures/review-eval-enum-diff.rb', pattern, true],
      ['test/fixtures/review-eval-vuln.rb', pattern, false],
    ]);
  });

  test('dots in patterns are escaped correctly', () => {
    // A literal "." in the pattern must not behave like a regex wildcard.
    expectRows([
      ['SKILL.md', 'SKILL.md', true],
      ['SKILLxmd', 'SKILL.md', false],
    ]);
  });

  test('** at end matches files in the directory', () => {
    expectRows([
      ['qa/SKILL.md', 'qa/**', true],
      ['qa/SKILL.md.tmpl', 'qa/**', true],
      ['qa/templates/report.md', 'qa/**', true],
    ]);
  });
});
|
||||
|
||||
// --- selectTests ---

describe('selectTests', () => {
  test('browse/src change selects browse and qa tests', () => {
    const { selected, reason } = selectTests(['browse/src/commands.ts'], E2E_TOUCHFILES);

    for (const expected of ['browse-basic', 'browse-snapshot', 'qa-quick', 'qa-fix-loop', 'design-review-fix']) {
      expect(selected).toContain(expected);
    }
    expect(reason).toBe('diff');

    // Tests unrelated to browse must stay out of the selection.
    for (const unrelated of ['plan-ceo-review', 'retro', 'document-release']) {
      expect(selected).not.toContain(unrelated);
    }
  });

  test('skill-specific change selects only that skill and related tests', () => {
    const { selected, skipped } = selectTests(['plan-ceo-review/SKILL.md'], E2E_TOUCHFILES);
    const totalTests = Object.keys(E2E_TOUCHFILES).length;

    expect(selected).toContain('plan-ceo-review');
    expect(selected).toContain('plan-ceo-review-selective');
    expect(selected.length).toBe(2);
    expect(skipped.length).toBe(totalTests - 2);
  });

  test('global touchfile triggers ALL tests', () => {
    const { selected, skipped, reason } = selectTests(['test/helpers/session-runner.ts'], E2E_TOUCHFILES);
    const totalTests = Object.keys(E2E_TOUCHFILES).length;

    expect(selected.length).toBe(totalTests);
    expect(skipped.length).toBe(0);
    expect(reason).toContain('global');
  });

  test('gen-skill-docs.ts is a global touchfile', () => {
    const { selected, reason } = selectTests(['scripts/gen-skill-docs.ts'], E2E_TOUCHFILES);

    expect(selected.length).toBe(Object.keys(E2E_TOUCHFILES).length);
    expect(reason).toContain('global');
  });

  test('unrelated file selects nothing', () => {
    const { selected, skipped } = selectTests(['README.md'], E2E_TOUCHFILES);

    expect(selected).toEqual([]);
    expect(skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length);
  });

  test('empty changed files selects nothing', () => {
    const { selected } = selectTests([], E2E_TOUCHFILES);

    expect(selected).toEqual([]);
  });

  test('multiple changed files union their selections', () => {
    const changedFiles = ['plan-ceo-review/SKILL.md', 'retro/SKILL.md.tmpl'];
    const { selected } = selectTests(changedFiles, E2E_TOUCHFILES);

    for (const expected of ['plan-ceo-review', 'plan-ceo-review-selective', 'retro', 'retro-base-branch']) {
      expect(selected).toContain(expected);
    }
    expect(selected.length).toBe(4);
  });

  test('works with LLM_JUDGE_TOUCHFILES', () => {
    const { selected } = selectTests(['qa/SKILL.md'], LLM_JUDGE_TOUCHFILES);

    expect(selected).toContain('qa/SKILL.md workflow');
    expect(selected).toContain('qa/SKILL.md health rubric');
    expect(selected.length).toBe(2);
  });

  test('SKILL.md.tmpl root template only selects root-dependent tests', () => {
    const { selected } = selectTests(['SKILL.md.tmpl'], E2E_TOUCHFILES);

    // Spot-check tests that depend on the root SKILL.md ...
    for (const expected of ['skillmd-setup-discovery', 'contributor-mode', 'session-awareness']) {
      expect(selected).toContain(expected);
    }
    // ... and confirm skill-specific tests are left out.
    for (const unrelated of ['plan-ceo-review', 'retro']) {
      expect(selected).not.toContain(unrelated);
    }
  });

  test('global touchfiles work for LLM-judge tests too', () => {
    const { selected } = selectTests(['scripts/gen-skill-docs.ts'], LLM_JUDGE_TOUCHFILES);

    expect(selected.length).toBe(Object.keys(LLM_JUDGE_TOUCHFILES).length);
  });
});
|
||||
|
||||
// --- detectBaseBranch ---

describe('detectBaseBranch', () => {
  /**
   * Runs `body` against a fresh temp directory and always removes that directory
   * afterwards — including when an assertion inside `body` throws, which would
   * otherwise leave the directory behind in os.tmpdir().
   */
  const withTempDir = (body: (dir: string) => void): void => {
    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-'));
    try {
      body(dir);
    } finally {
      // Best-effort cleanup: a directory that cannot be removed must not fail the test.
      try { fs.rmSync(dir, { recursive: true, force: true }); } catch {}
    }
  };

  /** Runs `git <args>` inside `dir` with piped stdio and a 5 s timeout. */
  const git = (dir: string, args: string[]) =>
    spawnSync('git', args, { cwd: dir, stdio: 'pipe', timeout: 5000 });

  test('detects local main branch', () => {
    withTempDir((dir) => {
      git(dir, ['init']);
      // Pin the unborn branch to 'main' so the outcome does not depend on the
      // machine's init.defaultBranch setting (which may be neither main nor master).
      git(dir, ['symbolic-ref', 'HEAD', 'refs/heads/main']);
      git(dir, ['config', 'user.email', 'test@test.com']);
      git(dir, ['config', 'user.name', 'Test']);
      fs.writeFileSync(path.join(dir, 'test.txt'), 'hello\n');
      git(dir, ['add', '.']);
      git(dir, ['commit', '-m', 'init']);

      const result = detectBaseBranch(dir);
      // 'master' stays accepted so the assertion is no stricter than before.
      expect(result).toMatch(/^(main|master)$/);
    });
  });

  test('returns null for empty repo with no branches', () => {
    withTempDir((dir) => {
      git(dir, ['init']);
      // No commits = no branches
      expect(detectBaseBranch(dir)).toBeNull();
    });
  });

  test('returns null for non-git directory', () => {
    withTempDir((dir) => {
      expect(detectBaseBranch(dir)).toBeNull();
    });
  });
});
|
||||
|
||||
// --- Completeness: every testName in skill-e2e.test.ts has a TOUCHFILES entry ---

describe('TOUCHFILES completeness', () => {
  /** Reads a file under ROOT/test as UTF-8 text. */
  const readTestFile = (fileName: string): string =>
    fs.readFileSync(path.join(ROOT, 'test', fileName), 'utf-8');

  test('every E2E testName has a TOUCHFILES entry', () => {
    const e2eSource = readTestFile('skill-e2e.test.ts');

    // Literal `testName: '...'` values. Entries that still contain `${` are
    // template literals (e.g. `qa-${label}`); their expanded forms are added below.
    const literalNames = [...e2eSource.matchAll(/testName:\s*['"`]([^'"`]+)['"`]/g)]
      .map((found) => found[1])
      .filter((name) => !name.includes('${'));

    // Expanded names derived from the third argument of each runPlantedBugEval(...) call.
    const plantedBugNames = [
      ...e2eSource.matchAll(/runPlantedBugEval\([^,]+,\s*[^,]+,\s*['"`]([^'"`]+)['"`]\)/g),
    ].map((found) => `qa-${found[1]}`);

    const testNames: string[] = [...literalNames, ...plantedBugNames];
    expect(testNames.length).toBeGreaterThan(0);

    const missing = testNames.filter((name) => !(name in E2E_TOUCHFILES));
    if (missing.length > 0) {
      throw new Error(
        `E2E tests missing TOUCHFILES entries: ${missing.join(', ')}\nAdd these to E2E_TOUCHFILES in test/helpers/touchfiles.ts`,
      );
    }
  });

  test('every LLM-judge test has a TOUCHFILES entry', () => {
    const llmSource = readTestFile('skill-llm-eval.test.ts');

    // Every quoted `name: '...'` value in the file, de-duplicated because the
    // same name can appear in more than one addTest call.
    const unique = [
      ...new Set([...llmSource.matchAll(/name:\s*['"`]([^'"`]+)['"`]/g)].map((found) => found[1])),
    ];
    expect(unique.length).toBeGreaterThan(0);

    const missing = unique.filter((name) => !(name in LLM_JUDGE_TOUCHFILES));
    if (missing.length > 0) {
      throw new Error(
        `LLM-judge tests missing TOUCHFILES entries: ${missing.join(', ')}\nAdd these to LLM_JUDGE_TOUCHFILES in test/helpers/touchfiles.ts`,
      );
    }
  });
});
|
||||
Reference in New Issue
Block a user