merge: integrate origin/main (v0.5.1-v0.6.4) into team-supabase-store

Resolves conflicts in package.json (keep unified cli-eval.ts + add
eval:select) and test/skill-llm-eval.test.ts (keep judgeCost/judgeCosts
helpers + add diff-based test selection).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-17 23:01:35 -07:00
58 changed files with 6736 additions and 1719 deletions
+86
View File
@@ -0,0 +1,86 @@
/* Planted design anti-patterns for E2E eval — 7 issues */
/* Issue 1: [HIGH] Blacklisted font (Papyrus) */
/* Issue 2: [HIGH] Body text < 16px (14px) */
body {
font-family: 'Papyrus', sans-serif;
font-size: 14px;
margin: 0;
padding: 0;
}
/* Issue 5: [MEDIUM] Purple/violet gradient background */
.hero {
background: linear-gradient(135deg, #6366f1, #8b5cf6);
text-align: center;
padding: 80px 20px;
color: white;
}
.hero h1 {
text-align: center;
font-size: 48px;
}
.hero p {
text-align: center;
font-size: 20px;
}
/* Issue 7: [LOW] 3-column feature grid with icon circles */
.features {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 24px;
padding: 60px 40px;
text-align: center;
}
.feature-card {
border-radius: 24px;
padding: 32px;
text-align: center;
background: #f9fafb;
}
/* Icon in colored circle — AI slop pattern */
.icon-circle {
width: 60px;
height: 60px;
border-radius: 50%;
background: #ede9fe;
display: flex;
align-items: center;
justify-content: center;
margin: 0 auto 16px;
font-size: 24px;
}
/* Issue 3: [HIGH] outline: none without replacement */
button {
outline: none;
background: #6366f1;
color: white;
border: none;
padding: 12px 24px;
border-radius: 24px;
cursor: pointer;
}
.small-link {
font-size: 11px;
padding: 4px 8px;
}
/* Issue 4: [HIGH] !important usage */
.override {
color: red !important;
margin-left: 10px !important;
}
.footer {
text-align: center;
padding: 40px;
background: #1e1b4b;
color: white;
}
+41
View File
@@ -0,0 +1,41 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="styles.css">
<title>Our Platform</title>
</head>
<body>
<!-- Issue 6: [MEDIUM] Generic hero copy ("Welcome to...", "all-in-one solution") -->
<div class="hero">
<h1>Welcome to Our Platform</h1>
<p>Your all-in-one solution for everything you need</p>
<button>Get Started</button>
</div>
<!-- Issue 7: [LOW] 3-column feature grid with icon-in-circle + title + description -->
<div class="features">
<div class="feature-card">
<div class="icon-circle">&#9733;</div>
<h3>Feature One</h3>
<p>A short description of this amazing feature that will change your life.</p>
</div>
<div class="feature-card">
<div class="icon-circle">&#9889;</div>
<h3>Feature Two</h3>
<p>Another incredible capability that sets us apart from the competition.</p>
</div>
<div class="feature-card">
<div class="icon-circle">&#9881;</div>
<h3>Feature Three</h3>
<p>Yet another powerful tool to streamline your workflow effortlessly.</p>
</div>
</div>
<div class="footer">
<p class="override">Unlock the power of our platform today</p>
<a href="#" class="small-link">Terms of Service</a>
</div>
</body>
</html>
+29 -1
View File
@@ -70,7 +70,7 @@ describe('gen-skill-docs', () => {
{ dir: 'setup-browser-cookies', name: 'setup-browser-cookies' },
{ dir: 'gstack-upgrade', name: 'gstack-upgrade' },
{ dir: 'plan-design-review', name: 'plan-design-review' },
{ dir: 'qa-design-review', name: 'qa-design-review' },
{ dir: 'design-review', name: 'design-review' },
{ dir: 'design-consultation', name: 'design-consultation' },
];
@@ -322,3 +322,31 @@ describe('description quality evals', () => {
expect(tipsSection).not.toContain('->');
});
});
describe('REVIEW_DASHBOARD resolver', () => {
const REVIEW_SKILLS = ['plan-ceo-review', 'plan-eng-review', 'plan-design-review'];
for (const skill of REVIEW_SKILLS) {
test(`review dashboard appears in ${skill} generated file`, () => {
const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8');
expect(content).toContain('reviews.jsonl');
expect(content).toContain('REVIEW READINESS DASHBOARD');
});
}
test('review dashboard appears in ship generated file', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('reviews.jsonl');
expect(content).toContain('REVIEW READINESS DASHBOARD');
});
test('resolver output contains key dashboard elements', () => {
const content = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8');
expect(content).toContain('VERDICT');
expect(content).toContain('CLEARED');
expect(content).toContain('Eng Review');
expect(content).toContain('7 days');
expect(content).toContain('Design Review');
expect(content).toContain('skip_eng_review');
});
});
+200
View File
@@ -0,0 +1,200 @@
/**
* Diff-based test selection for E2E and LLM-judge evals.
*
* Each test declares which source files it depends on ("touchfiles").
* The test runner checks `git diff` and only runs tests whose
* dependencies were modified. Override with EVALS_ALL=1 to run everything.
*/
import { spawnSync } from 'child_process';
// --- Glob matching ---
/**
* Match a file path against a glob pattern.
* Supports:
* ** — match any number of path segments
* * — match within a single segment (no /)
*/
export function matchGlob(file: string, pattern: string): boolean {
const regexStr = pattern
.replace(/\./g, '\\.')
.replace(/\*\*/g, '{{GLOBSTAR}}')
.replace(/\*/g, '[^/]*')
.replace(/\{\{GLOBSTAR\}\}/g, '.*');
return new RegExp(`^${regexStr}$`).test(file);
}
// --- Touchfile maps ---
/**
* E2E test touchfiles — keyed by testName (the string passed to runSkillTest).
* Each test lists the file patterns that, if changed, require the test to run.
*/
export const E2E_TOUCHFILES: Record<string, string[]> = {
// Browse core
'browse-basic': ['browse/src/**'],
'browse-snapshot': ['browse/src/**'],
// SKILL.md setup + preamble (depend on ROOT SKILL.md only)
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'],
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'],
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'],
'contributor-mode': ['SKILL.md', 'SKILL.md.tmpl'],
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'],
// QA
'qa-quick': ['qa/**', 'browse/src/**'],
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
'qa-fix-loop': ['qa/**', 'browse/src/**'],
// Review
'review-sql-injection': ['review/**', 'test/fixtures/review-eval-vuln.rb'],
'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'],
'review-base-branch': ['review/**'],
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
// Plan reviews
'plan-ceo-review': ['plan-ceo-review/**'],
'plan-ceo-review-selective': ['plan-ceo-review/**'],
'plan-eng-review': ['plan-eng-review/**'],
'plan-eng-review-artifact': ['plan-eng-review/**'],
// Ship
'ship-base-branch': ['ship/**'],
// Retro
'retro': ['retro/**'],
'retro-base-branch': ['retro/**'],
// Document-release
'document-release': ['document-release/**'],
// QA bootstrap
'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'],
// Ship coverage audit
'ship-coverage-audit': ['ship/**'],
// Design
'design-consultation-core': ['design-consultation/**'],
'design-consultation-research': ['design-consultation/**'],
'design-consultation-existing': ['design-consultation/**'],
'design-consultation-preview': ['design-consultation/**'],
'plan-design-review-plan-mode': ['plan-design-review/**'],
'plan-design-review-no-ui-scope': ['plan-design-review/**'],
'design-review-fix': ['design-review/**', 'browse/src/**'],
// gstack-upgrade
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
};
/**
* LLM-judge test touchfiles — keyed by test description string.
*/
export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
'command reference table': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts'],
'snapshot flags reference': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/snapshot.ts'],
'browse/SKILL.md reference': ['browse/SKILL.md', 'browse/SKILL.md.tmpl', 'browse/src/**'],
'setup block': ['SKILL.md', 'SKILL.md.tmpl'],
'regression vs baseline': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts', 'test/fixtures/eval-baselines.json'],
'qa/SKILL.md workflow': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
'qa/SKILL.md health rubric': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'baseline score pinning': ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'],
// Ship & Release
'ship/SKILL.md workflow': ['ship/SKILL.md', 'ship/SKILL.md.tmpl'],
'document-release/SKILL.md workflow': ['document-release/SKILL.md', 'document-release/SKILL.md.tmpl'],
// Plan Reviews
'plan-ceo-review/SKILL.md modes': ['plan-ceo-review/SKILL.md', 'plan-ceo-review/SKILL.md.tmpl'],
'plan-eng-review/SKILL.md sections': ['plan-eng-review/SKILL.md', 'plan-eng-review/SKILL.md.tmpl'],
'plan-design-review/SKILL.md passes': ['plan-design-review/SKILL.md', 'plan-design-review/SKILL.md.tmpl'],
// Design skills
'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
// Other skills
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
};
/**
* Changes to any of these files trigger ALL tests (both E2E and LLM-judge).
*/
export const GLOBAL_TOUCHFILES = [
'test/helpers/session-runner.ts',
'test/helpers/eval-store.ts',
'test/helpers/llm-judge.ts',
'scripts/gen-skill-docs.ts',
'test/helpers/touchfiles.ts',
'browse/test/test-server.ts',
];
// --- Base branch detection ---
/**
* Detect the base branch by trying refs in order.
* Returns the first valid ref, or null if none found.
*/
export function detectBaseBranch(cwd: string): string | null {
for (const ref of ['origin/main', 'origin/master', 'main', 'master']) {
const result = spawnSync('git', ['rev-parse', '--verify', ref], {
cwd, stdio: 'pipe', timeout: 3000,
});
if (result.status === 0) return ref;
}
return null;
}
/**
* Get list of files changed between base branch and HEAD.
*/
export function getChangedFiles(baseBranch: string, cwd: string): string[] {
const result = spawnSync('git', ['diff', '--name-only', `${baseBranch}...HEAD`], {
cwd, stdio: 'pipe', timeout: 5000,
});
if (result.status !== 0) return [];
return result.stdout.toString().trim().split('\n').filter(Boolean);
}
// --- Test selection ---
/**
* Select tests to run based on changed files.
*
* Algorithm:
* 1. If any changed file matches a global touchfile → run ALL tests
* 2. Otherwise, for each test, check if any changed file matches its patterns
* 3. Return selected + skipped lists with reason
*/
export function selectTests(
changedFiles: string[],
touchfiles: Record<string, string[]>,
globalTouchfiles: string[] = GLOBAL_TOUCHFILES,
): { selected: string[]; skipped: string[]; reason: string } {
const allTestNames = Object.keys(touchfiles);
// Global touchfile hit → run all
for (const file of changedFiles) {
if (globalTouchfiles.some(g => matchGlob(file, g))) {
return { selected: allTestNames, skipped: [], reason: `global: ${file}` };
}
}
// Per-test matching
const selected: string[] = [];
const skipped: string[] = [];
for (const [testName, patterns] of Object.entries(touchfiles)) {
const hit = changedFiles.some(f => patterns.some(p => matchGlob(f, p)));
(hit ? selected : skipped).push(testName);
}
return { selected, skipped, reason: 'diff' };
}
+732 -106
View File
File diff suppressed because it is too large Load Diff
+253 -13
View File
@@ -19,6 +19,7 @@ import { callJudge, judge } from './helpers/llm-judge';
import type { JudgeMeta } from './helpers/llm-judge';
import { EvalCollector } from './helpers/eval-store';
import { MODEL_PRICING } from '../lib/eval-cost';
import { selectTests, detectBaseBranch, getChangedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
const ROOT = path.resolve(import.meta.dir, '..');
// Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
@@ -44,8 +45,43 @@ function judgeCosts(meta: JudgeMeta) {
}];
}
describeEval('LLM-as-judge quality evals', () => {
test('command reference table scores >= 4 on all dimensions', async () => {
// --- Diff-based test selection ---
let selectedTests: string[] | null = null;
if (evalsEnabled && !process.env.EVALS_ALL) {
const baseBranch = process.env.EVALS_BASE
|| detectBaseBranch(ROOT)
|| 'main';
const changedFiles = getChangedFiles(baseBranch, ROOT);
if (changedFiles.length > 0) {
const selection = selectTests(changedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES);
selectedTests = selection.selected;
process.stderr.write(`\nLLM-judge selection (${selection.reason}): ${selection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length} tests\n`);
if (selection.skipped.length > 0) {
process.stderr.write(` Skipped: ${selection.skipped.join(', ')}\n`);
}
process.stderr.write('\n');
}
}
/** Wrap a describe block to skip if none of its tests are selected. */
function describeIfSelected(name: string, testNames: string[], fn: () => void) {
const anySelected = selectedTests === null || testNames.some(t => selectedTests!.includes(t));
(anySelected ? describeEval : describe.skip)(name, fn);
}
/** Skip an individual test if not selected (for multi-test describe blocks). */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
const shouldRun = selectedTests === null || selectedTests.includes(testName);
(shouldRun ? test : test.skip)(testName, fn, timeout);
}
describeIfSelected('LLM-as-judge quality evals', [
'command reference table', 'snapshot flags reference',
'browse/SKILL.md reference', 'setup block', 'regression vs baseline',
], () => {
testIfSelected('command reference table', async () => {
const t0 = Date.now();
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const start = content.indexOf('## Command Reference');
@@ -72,7 +108,7 @@ describeEval('LLM-as-judge quality evals', () => {
expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
test('snapshot flags section scores >= 4 on all dimensions', async () => {
testIfSelected('snapshot flags reference', async () => {
const t0 = Date.now();
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const start = content.indexOf('## Snapshot System');
@@ -99,7 +135,7 @@ describeEval('LLM-as-judge quality evals', () => {
expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
test('browse/SKILL.md overall scores >= 4', async () => {
testIfSelected('browse/SKILL.md reference', async () => {
const t0 = Date.now();
const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8');
const start = content.indexOf('## Snapshot Flags');
@@ -125,7 +161,7 @@ describeEval('LLM-as-judge quality evals', () => {
expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
test('setup block scores >= 3 on actionability and clarity', async () => {
testIfSelected('setup block', async () => {
const t0 = Date.now();
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = content.indexOf('## SETUP');
@@ -153,7 +189,7 @@ describeEval('LLM-as-judge quality evals', () => {
expect(scores.clarity).toBeGreaterThanOrEqual(3);
}, 30_000);
test('regression check: compare branch vs baseline quality', async () => {
testIfSelected('regression vs baseline', async () => {
const t0 = Date.now();
const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const genStart = generated.indexOf('## Command Reference');
@@ -231,10 +267,10 @@ Scores are 1-5 overall quality.`);
// --- Part 7: QA skill quality evals (C6) ---
describeEval('QA skill quality evals', () => {
describeIfSelected('QA skill quality evals', ['qa/SKILL.md workflow', 'qa/SKILL.md health rubric'], () => {
const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
test('qa/SKILL.md workflow quality scores >= 4', async () => {
testIfSelected('qa/SKILL.md workflow', async () => {
const t0 = Date.now();
const start = qaContent.indexOf('## Workflow');
const end = qaContent.indexOf('## Health Score Rubric');
@@ -278,7 +314,7 @@ ${section}`);
expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
test('qa/SKILL.md health score rubric is unambiguous', async () => {
testIfSelected('qa/SKILL.md health rubric', async () => {
const t0 = Date.now();
const start = qaContent.indexOf('## Health Score Rubric');
const section = qaContent.slice(start);
@@ -323,8 +359,8 @@ ${section}`);
// --- Part 7: Cross-skill consistency judge (C7) ---
describeEval('Cross-skill consistency evals', () => {
test('greptile-history patterns are consistent across all skills', async () => {
describeIfSelected('Cross-skill consistency evals', ['cross-skill greptile consistency'], () => {
testIfSelected('cross-skill greptile consistency', async () => {
const t0 = Date.now();
const reviewContent = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
@@ -389,10 +425,10 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`);
// --- Part 7: Baseline score pinning (C9) ---
describeEval('Baseline score pinning', () => {
describeIfSelected('Baseline score pinning', ['baseline score pinning'], () => {
const baselinesPath = path.join(ROOT, 'test', 'fixtures', 'eval-baselines.json');
test('LLM eval scores do not regress below baselines', async () => {
testIfSelected('baseline score pinning', async () => {
const t0 = Date.now();
if (!fs.existsSync(baselinesPath)) {
console.log('No baseline file found — skipping pinning check');
@@ -443,6 +479,210 @@ describeEval('Baseline score pinning', () => {
}, 60_000);
});
// --- Workflow SKILL.md quality evals (10 new tests for 100% coverage) ---
/**
* DRY helper for workflow SKILL.md judge tests.
* Extracts a section from a SKILL.md file and judges its quality as an agent workflow.
*/
async function runWorkflowJudge(opts: {
testName: string;
suite: string;
skillPath: string;
startMarker: string;
endMarker: string | null;
judgeContext: string;
judgeGoal: string;
thresholds?: { clarity: number; completeness: number; actionability: number };
}) {
const t0 = Date.now();
const defaults = { clarity: 4, completeness: 3, actionability: 4 };
const thresholds = { ...defaults, ...opts.thresholds };
const content = fs.readFileSync(path.join(ROOT, opts.skillPath), 'utf-8');
const startIdx = content.indexOf(opts.startMarker);
if (startIdx === -1) throw new Error(`Start marker not found in ${opts.skillPath}: "${opts.startMarker}"`);
let section: string;
if (opts.endMarker) {
const endIdx = content.indexOf(opts.endMarker, startIdx);
if (endIdx === -1) throw new Error(`End marker not found in ${opts.skillPath}: "${opts.endMarker}"`);
section = content.slice(startIdx, endIdx);
} else {
section = content.slice(startIdx);
}
const scores = await callJudge<JudgeScore>(`You are evaluating the quality of ${opts.judgeContext} for an AI coding agent.
The agent reads this document to learn ${opts.judgeGoal}. It references external tools and files
that are documented separately — do NOT penalize for missing external definitions.
Rate on three dimensions (1-5 scale):
- **clarity** (1-5): Can an agent follow the instructions without ambiguity?
- **completeness** (1-5): Are all steps, decision points, and outputs well-defined?
- **actionability** (1-5): Can an agent execute this workflow and produce the expected deliverables?
Respond with ONLY valid JSON:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}
Here is the document to evaluate:
${section}`);
console.log(`${opts.testName} scores:`, JSON.stringify(scores, null, 2));
evalCollector?.addTest({
name: opts.testName,
suite: opts.suite,
tier: 'llm-judge',
passed: scores.clarity >= thresholds.clarity && scores.completeness >= thresholds.completeness && scores.actionability >= thresholds.actionability,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
judge_reasoning: scores.reasoning,
});
expect(scores.clarity).toBeGreaterThanOrEqual(thresholds.clarity);
expect(scores.completeness).toBeGreaterThanOrEqual(thresholds.completeness);
expect(scores.actionability).toBeGreaterThanOrEqual(thresholds.actionability);
}
// Block 1: Ship & Release skills
describeIfSelected('Ship & Release skill evals', ['ship/SKILL.md workflow', 'document-release/SKILL.md workflow'], () => {
testIfSelected('ship/SKILL.md workflow', async () => {
await runWorkflowJudge({
testName: 'ship/SKILL.md workflow',
suite: 'Ship & Release skill evals',
skillPath: 'ship/SKILL.md',
startMarker: '# Ship:',
endMarker: '## Important Rules',
judgeContext: 'a ship/release workflow document',
judgeGoal: 'how to create a PR: merge base branch, run tests, review diff, bump version, update changelog, push, and open PR',
});
}, 30_000);
testIfSelected('document-release/SKILL.md workflow', async () => {
await runWorkflowJudge({
testName: 'document-release/SKILL.md workflow',
suite: 'Ship & Release skill evals',
skillPath: 'document-release/SKILL.md',
startMarker: '# Document Release:',
endMarker: '## Important Rules',
judgeContext: 'a post-ship documentation update workflow',
judgeGoal: 'how to audit and update project documentation after code ships: README, ARCHITECTURE, CONTRIBUTING, CLAUDE.md, CHANGELOG, TODOS',
});
}, 30_000);
});
// Block 2: Plan Review skills
describeIfSelected('Plan Review skill evals', [
'plan-ceo-review/SKILL.md modes', 'plan-eng-review/SKILL.md sections', 'plan-design-review/SKILL.md passes',
], () => {
testIfSelected('plan-ceo-review/SKILL.md modes', async () => {
await runWorkflowJudge({
testName: 'plan-ceo-review/SKILL.md modes',
suite: 'Plan Review skill evals',
skillPath: 'plan-ceo-review/SKILL.md',
startMarker: '## Step 0: Nuclear Scope Challenge',
endMarker: '## Review Sections',
judgeContext: 'a CEO/founder plan review framework with 4 scope modes',
judgeGoal: 'how to conduct a CEO-perspective plan review: challenge scope, select a mode (Expansion, Selective Expansion, Hold Scope, Reduction), then review sections interactively',
});
}, 30_000);
testIfSelected('plan-eng-review/SKILL.md sections', async () => {
await runWorkflowJudge({
testName: 'plan-eng-review/SKILL.md sections',
suite: 'Plan Review skill evals',
skillPath: 'plan-eng-review/SKILL.md',
startMarker: '## BEFORE YOU START:',
endMarker: '## CRITICAL RULE',
judgeContext: 'an engineering plan review framework with 4 review sections',
judgeGoal: 'how to review a plan for architecture quality, code quality, test coverage, and performance — walking through each section interactively with AskUserQuestion',
});
}, 30_000);
testIfSelected('plan-design-review/SKILL.md passes', async () => {
await runWorkflowJudge({
testName: 'plan-design-review/SKILL.md passes',
suite: 'Plan Review skill evals',
skillPath: 'plan-design-review/SKILL.md',
startMarker: '## Review Sections',
endMarker: '## CRITICAL RULE',
judgeContext: 'a design plan review framework with 7 review passes',
judgeGoal: 'how to review a plan for design quality using a 0-10 rating method: rate each dimension, explain what a 10 looks like, edit the plan to fix gaps, then re-rate',
});
}, 30_000);
});
// Block 3: Design skills
describeIfSelected('Design skill evals', ['design-review/SKILL.md fix loop', 'design-consultation/SKILL.md research'], () => {
testIfSelected('design-review/SKILL.md fix loop', async () => {
await runWorkflowJudge({
testName: 'design-review/SKILL.md fix loop',
suite: 'Design skill evals',
skillPath: 'design-review/SKILL.md',
startMarker: '## Phase 7:',
endMarker: '## Additional Rules',
judgeContext: 'a design audit triage and fix loop workflow',
judgeGoal: 'how to triage design issues by severity, fix them atomically in source code, commit each fix, and re-verify with before/after screenshots',
});
}, 30_000);
testIfSelected('design-consultation/SKILL.md research', async () => {
await runWorkflowJudge({
testName: 'design-consultation/SKILL.md research',
suite: 'Design skill evals',
skillPath: 'design-consultation/SKILL.md',
startMarker: '## Phase 1:',
endMarker: '## Phase 4:',
judgeContext: 'a design consultation research and proposal workflow',
judgeGoal: 'how to gather product context, research the competitive landscape, and produce a complete design system proposal with typography, color, spacing, and motion specifications',
});
}, 30_000);
});
// Block 4: Other skills
describeIfSelected('Other skill evals', [
'retro/SKILL.md instructions', 'qa-only/SKILL.md workflow', 'gstack-upgrade/SKILL.md upgrade flow',
], () => {
testIfSelected('retro/SKILL.md instructions', async () => {
await runWorkflowJudge({
testName: 'retro/SKILL.md instructions',
suite: 'Other skill evals',
skillPath: 'retro/SKILL.md',
startMarker: '## Instructions',
endMarker: '## Compare Mode',
judgeContext: 'an engineering retrospective data gathering and analysis workflow',
judgeGoal: 'how to gather git metrics (commit history, test counts, work patterns), analyze them, produce a structured retro report with praise, growth areas, and trend tracking',
});
}, 30_000);
testIfSelected('qa-only/SKILL.md workflow', async () => {
await runWorkflowJudge({
testName: 'qa-only/SKILL.md workflow',
suite: 'Other skill evals',
skillPath: 'qa-only/SKILL.md',
startMarker: '## Workflow',
endMarker: '## Important Rules',
judgeContext: 'a report-only QA testing workflow',
judgeGoal: 'how to systematically QA test a web application and produce a structured report with health score, screenshots, and repro steps — without fixing anything',
});
}, 30_000);
testIfSelected('gstack-upgrade/SKILL.md upgrade flow', async () => {
await runWorkflowJudge({
testName: 'gstack-upgrade/SKILL.md upgrade flow',
suite: 'Other skill evals',
skillPath: 'gstack-upgrade/SKILL.md',
startMarker: '## Inline upgrade flow',
endMarker: '## Standalone usage',
judgeContext: 'a version upgrade detection and execution workflow',
judgeGoal: 'how to detect install type, compare versions, back up current install, upgrade via git or fresh clone, run setup, and show what changed',
});
}, 30_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
+357 -6
View File
@@ -72,15 +72,29 @@ describe('SKILL.md command validation', () => {
expect(result.snapshotFlagErrors).toHaveLength(0);
});
test('all $B commands in qa-design-review/SKILL.md are valid browse commands', () => {
const skill = path.join(ROOT, 'qa-design-review', 'SKILL.md');
test('all $B commands in design-review/SKILL.md are valid browse commands', () => {
const skill = path.join(ROOT, 'design-review', 'SKILL.md');
if (!fs.existsSync(skill)) return;
const result = validateSkill(skill);
expect(result.invalid).toHaveLength(0);
});
test('all snapshot flags in qa-design-review/SKILL.md are valid', () => {
const skill = path.join(ROOT, 'qa-design-review', 'SKILL.md');
test('all snapshot flags in design-review/SKILL.md are valid', () => {
const skill = path.join(ROOT, 'design-review', 'SKILL.md');
if (!fs.existsSync(skill)) return;
const result = validateSkill(skill);
expect(result.snapshotFlagErrors).toHaveLength(0);
});
test('all $B commands in design-consultation/SKILL.md are valid browse commands', () => {
const skill = path.join(ROOT, 'design-consultation', 'SKILL.md');
if (!fs.existsSync(skill)) return;
const result = validateSkill(skill);
expect(result.invalid).toHaveLength(0);
});
test('all snapshot flags in design-consultation/SKILL.md are valid', () => {
const skill = path.join(ROOT, 'design-consultation', 'SKILL.md');
if (!fs.existsSync(skill)) return;
const result = validateSkill(skill);
expect(result.snapshotFlagErrors).toHaveLength(0);
@@ -205,7 +219,7 @@ describe('Update check preamble', () => {
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
'retro/SKILL.md',
'plan-design-review/SKILL.md',
'qa-design-review/SKILL.md',
'design-review/SKILL.md',
'design-consultation/SKILL.md',
'document-release/SKILL.md',
];
@@ -430,6 +444,8 @@ describe('No hardcoded branch names in SKILL templates', () => {
'plan-ceo-review/SKILL.md.tmpl',
'retro/SKILL.md.tmpl',
'document-release/SKILL.md.tmpl',
'plan-eng-review/SKILL.md.tmpl',
'plan-design-review/SKILL.md.tmpl',
];
// Patterns that indicate hardcoded 'main' in git commands
@@ -513,7 +529,7 @@ describe('v0.4.1 preamble features', () => {
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
'retro/SKILL.md',
'plan-design-review/SKILL.md',
'qa-design-review/SKILL.md',
'design-review/SKILL.md',
'design-consultation/SKILL.md',
'document-release/SKILL.md',
];
@@ -543,6 +559,10 @@ describe('Contributor mode preamble structure', () => {
'ship/SKILL.md', 'review/SKILL.md',
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
'retro/SKILL.md',
'plan-design-review/SKILL.md',
'design-review/SKILL.md',
'design-consultation/SKILL.md',
'document-release/SKILL.md',
];
for (const skill of skillsWithPreamble) {
@@ -617,6 +637,43 @@ describe('Enum & Value Completeness in review checklist', () => {
});
});
// --- Completeness Principle spot-check ---
describe('Completeness Principle in generated SKILL.md files', () => {
const skillsWithPreamble = [
'SKILL.md', 'browse/SKILL.md', 'qa/SKILL.md',
'qa-only/SKILL.md',
'setup-browser-cookies/SKILL.md',
'ship/SKILL.md', 'review/SKILL.md',
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
'retro/SKILL.md',
'plan-design-review/SKILL.md',
'design-review/SKILL.md',
'design-consultation/SKILL.md',
'document-release/SKILL.md',
];
for (const skill of skillsWithPreamble) {
test(`${skill} contains Completeness Principle section`, () => {
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
expect(content).toContain('Completeness Principle');
expect(content).toContain('Boil the Lake');
});
}
test('Completeness Principle includes compression table', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
expect(content).toContain('CC+gstack');
expect(content).toContain('Compression');
});
test('Completeness Principle includes anti-patterns', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
expect(content).toContain('BAD:');
expect(content).toContain('Anti-patterns');
});
});
// --- Part 7: Planted-bug fixture validation (A4) ---
describe('Planted-bug fixture validation', () => {
@@ -665,3 +722,297 @@ describe('Planted-bug fixture validation', () => {
expect(content).toContain('update_column');
});
});
// --- CEO review mode validation ---
describe('CEO review mode validation', () => {
const content = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8');
test('has all four CEO review modes defined', () => {
const modes = ['SCOPE EXPANSION', 'SELECTIVE EXPANSION', 'HOLD SCOPE', 'SCOPE REDUCTION'];
for (const mode of modes) {
expect(content).toContain(mode);
}
});
test('has CEO plan persistence step', () => {
expect(content).toContain('ceo-plans');
expect(content).toContain('status: ACTIVE');
});
test('has docs/designs promotion section', () => {
expect(content).toContain('docs/designs');
expect(content).toContain('PROMOTED');
});
test('mode quick reference has four columns', () => {
expect(content).toContain('EXPANSION');
expect(content).toContain('SELECTIVE');
expect(content).toContain('HOLD SCOPE');
expect(content).toContain('REDUCTION');
});
});
// --- gstack-slug helper ---
describe('gstack-slug', () => {
const SLUG_BIN = path.join(ROOT, 'bin', 'gstack-slug');
test('binary exists and is executable', () => {
expect(fs.existsSync(SLUG_BIN)).toBe(true);
const stat = fs.statSync(SLUG_BIN);
expect(stat.mode & 0o111).toBeGreaterThan(0);
});
test('outputs SLUG and BRANCH lines in a git repo', () => {
const result = Bun.spawnSync([SLUG_BIN], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' });
expect(result.exitCode).toBe(0);
const output = result.stdout.toString();
expect(output).toContain('SLUG=');
expect(output).toContain('BRANCH=');
});
test('SLUG does not contain forward slashes', () => {
const result = Bun.spawnSync([SLUG_BIN], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' });
const slug = result.stdout.toString().match(/SLUG=(.*)/)?.[1] ?? '';
expect(slug).not.toContain('/');
expect(slug.length).toBeGreaterThan(0);
});
test('BRANCH does not contain forward slashes', () => {
const result = Bun.spawnSync([SLUG_BIN], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' });
const branch = result.stdout.toString().match(/BRANCH=(.*)/)?.[1] ?? '';
expect(branch).not.toContain('/');
expect(branch.length).toBeGreaterThan(0);
});
test('output is eval-compatible (KEY=VALUE format)', () => {
const result = Bun.spawnSync([SLUG_BIN], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' });
const lines = result.stdout.toString().trim().split('\n');
expect(lines.length).toBe(2);
expect(lines[0]).toMatch(/^SLUG=.+/);
expect(lines[1]).toMatch(/^BRANCH=.+/);
});
});
// --- Test Bootstrap validation ---
describe('Test Bootstrap ({{TEST_BOOTSTRAP}}) integration', () => {
test('TEST_BOOTSTRAP resolver produces valid content', () => {
const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(qaContent).toContain('Test Framework Bootstrap');
expect(qaContent).toContain('RUNTIME:ruby');
expect(qaContent).toContain('RUNTIME:node');
expect(qaContent).toContain('RUNTIME:python');
expect(qaContent).toContain('no-test-bootstrap');
expect(qaContent).toContain('BOOTSTRAP_DECLINED');
});
test('TEST_BOOTSTRAP appears in qa/SKILL.md', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('Test Framework Bootstrap');
expect(content).toContain('TESTING.md');
expect(content).toContain('CLAUDE.md');
});
test('TEST_BOOTSTRAP appears in ship/SKILL.md', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Test Framework Bootstrap');
expect(content).toContain('Step 2.5');
});
test('TEST_BOOTSTRAP appears in design-review/SKILL.md', () => {
const content = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8');
expect(content).toContain('Test Framework Bootstrap');
});
test('TEST_BOOTSTRAP does NOT appear in qa-only/SKILL.md', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa-only', 'SKILL.md'), 'utf-8');
expect(content).not.toContain('Test Framework Bootstrap');
// But should have the recommendation note
expect(content).toContain('No test framework detected');
expect(content).toContain('Run `/qa` to bootstrap');
});
test('bootstrap includes framework knowledge table', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('vitest');
expect(content).toContain('minitest');
expect(content).toContain('pytest');
expect(content).toContain('cargo test');
expect(content).toContain('phpunit');
expect(content).toContain('ExUnit');
});
test('bootstrap includes CI/CD pipeline generation', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('.github/workflows/test.yml');
expect(content).toContain('GitHub Actions');
});
test('bootstrap includes first real tests step', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('First real tests');
expect(content).toContain('git log --since=30.days');
expect(content).toContain('Prioritize by risk');
});
test('bootstrap includes vibe coding philosophy', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('vibe coding');
expect(content).toContain('100% test coverage');
});
test('WebSearch is in allowed-tools for qa, ship, design-review', () => {
const qa = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
const ship = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const qaDesign = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8');
expect(qa).toContain('WebSearch');
expect(ship).toContain('WebSearch');
expect(qaDesign).toContain('WebSearch');
});
});
// --- Phase 8e.5 regression test validation ---
describe('Phase 8e.5 regression test generation', () => {
test('qa/SKILL.md contains Phase 8e.5', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('8e.5. Regression Test');
expect(content).toContain('test(qa): regression test');
expect(content).toContain('WTF-likelihood exclusion');
});
test('qa/SKILL.md Rule 13 is amended for regression tests', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('Only modify tests when generating regression tests in Phase 8e.5');
expect(content).not.toContain('Never modify tests or CI configuration');
});
test('design-review has CSS-aware Phase 8e.5 variant', () => {
const content = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8');
expect(content).toContain('8e.5. Regression Test (design-review variant)');
expect(content).toContain('CSS-only');
expect(content).toContain('test(design): regression test');
});
test('regression test includes full attribution comment format', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('// Regression: ISSUE-NNN');
expect(content).toContain('// Found by /qa on');
expect(content).toContain('// Report: .gstack/qa-reports/');
});
test('regression test uses auto-incrementing names', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('auto-incrementing');
expect(content).toContain('max number + 1');
});
});
// --- Step 3.4 coverage audit validation ---
describe('Step 3.4 test coverage audit', () => {
test('ship/SKILL.md contains Step 3.4', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Step 3.4: Test Coverage Audit');
expect(content).toContain('CODE PATH COVERAGE');
});
test('Step 3.4 includes quality scoring rubric', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('★★★');
expect(content).toContain('★★');
expect(content).toContain('edge cases AND error paths');
expect(content).toContain('happy path only');
});
test('Step 3.4 includes before/after test count', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Count test files before');
expect(content).toContain('Count test files after');
});
test('ship PR body includes Test Coverage section', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('## Test Coverage');
});
test('ship rules include test generation rule', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Step 3.4 generates coverage tests');
expect(content).toContain('Never commit failing tests');
});
test('Step 3.4 includes vibe coding philosophy', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('vibe coding becomes yolo coding');
});
test('Step 3.4 traces actual codepaths, not just syntax', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Trace every codepath');
expect(content).toContain('Trace data flow');
expect(content).toContain('Diagram the execution');
});
test('Step 3.4 maps user flows and interaction edge cases', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Map user flows');
expect(content).toContain('Interaction edge cases');
expect(content).toContain('Double-click');
expect(content).toContain('Navigate away');
expect(content).toContain('Error states the user can see');
expect(content).toContain('Empty/zero/boundary states');
});
test('Step 3.4 diagram includes USER FLOW COVERAGE section', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('USER FLOW COVERAGE');
expect(content).toContain('Code paths:');
expect(content).toContain('User flows:');
});
});
// --- Retro test health validation ---
describe('Retro test health tracking', () => {
test('retro/SKILL.md has test health data gathering commands', () => {
const content = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');
expect(content).toContain('# 10. Test file count');
expect(content).toContain('# 11. Regression test commits');
expect(content).toContain('# 12. Test files changed');
});
test('retro/SKILL.md has Test Health metrics row', () => {
const content = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');
expect(content).toContain('Test Health');
expect(content).toContain('regression tests');
});
test('retro/SKILL.md has Test Health narrative section', () => {
const content = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');
expect(content).toContain('### Test Health');
expect(content).toContain('Total test files');
expect(content).toContain('vibe coding safe');
});
test('retro JSON schema includes test_health field', () => {
const content = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');
expect(content).toContain('test_health');
expect(content).toContain('total_test_files');
expect(content).toContain('regression_test_commits');
});
});
// --- QA report template regression tests section ---
describe('QA report template', () => {
test('qa-report-template.md has Regression Tests section', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'templates', 'qa-report-template.md'), 'utf-8');
expect(content).toContain('## Regression Tests');
expect(content).toContain('committed / deferred / skipped');
expect(content).toContain('### Deferred Tests');
expect(content).toContain('**Precondition:**');
});
});
+253
View File
@@ -0,0 +1,253 @@
/**
* Unit tests for diff-based test selection.
* Free (no API calls), runs with `bun test`.
*/
import { describe, test, expect } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import {
matchGlob,
selectTests,
detectBaseBranch,
E2E_TOUCHFILES,
LLM_JUDGE_TOUCHFILES,
GLOBAL_TOUCHFILES,
} from './helpers/touchfiles';
const ROOT = path.resolve(import.meta.dir, '..');
// --- matchGlob ---
describe('matchGlob', () => {
test('** matches any depth of path segments', () => {
expect(matchGlob('browse/src/commands.ts', 'browse/src/**')).toBe(true);
expect(matchGlob('browse/src/deep/nested/file.ts', 'browse/src/**')).toBe(true);
expect(matchGlob('browse/src/cli.ts', 'browse/src/**')).toBe(true);
});
test('** does not match unrelated paths', () => {
expect(matchGlob('browse/src/commands.ts', 'qa/**')).toBe(false);
expect(matchGlob('review/SKILL.md', 'qa/**')).toBe(false);
});
test('exact match works', () => {
expect(matchGlob('SKILL.md', 'SKILL.md')).toBe(true);
expect(matchGlob('SKILL.md.tmpl', 'SKILL.md')).toBe(false);
expect(matchGlob('qa/SKILL.md', 'SKILL.md')).toBe(false);
});
test('* matches within a single segment', () => {
expect(matchGlob('test/fixtures/review-eval-enum.rb', 'test/fixtures/review-eval-enum*.rb')).toBe(true);
expect(matchGlob('test/fixtures/review-eval-enum-diff.rb', 'test/fixtures/review-eval-enum*.rb')).toBe(true);
expect(matchGlob('test/fixtures/review-eval-vuln.rb', 'test/fixtures/review-eval-enum*.rb')).toBe(false);
});
test('dots in patterns are escaped correctly', () => {
expect(matchGlob('SKILL.md', 'SKILL.md')).toBe(true);
expect(matchGlob('SKILLxmd', 'SKILL.md')).toBe(false);
});
test('** at end matches files in the directory', () => {
expect(matchGlob('qa/SKILL.md', 'qa/**')).toBe(true);
expect(matchGlob('qa/SKILL.md.tmpl', 'qa/**')).toBe(true);
expect(matchGlob('qa/templates/report.md', 'qa/**')).toBe(true);
});
});
// --- selectTests ---
describe('selectTests', () => {
test('browse/src change selects browse and qa tests', () => {
const result = selectTests(['browse/src/commands.ts'], E2E_TOUCHFILES);
expect(result.selected).toContain('browse-basic');
expect(result.selected).toContain('browse-snapshot');
expect(result.selected).toContain('qa-quick');
expect(result.selected).toContain('qa-fix-loop');
expect(result.selected).toContain('design-review-fix');
expect(result.reason).toBe('diff');
// Should NOT include unrelated tests
expect(result.selected).not.toContain('plan-ceo-review');
expect(result.selected).not.toContain('retro');
expect(result.selected).not.toContain('document-release');
});
test('skill-specific change selects only that skill and related tests', () => {
const result = selectTests(['plan-ceo-review/SKILL.md'], E2E_TOUCHFILES);
expect(result.selected).toContain('plan-ceo-review');
expect(result.selected).toContain('plan-ceo-review-selective');
expect(result.selected.length).toBe(2);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 2);
});
test('global touchfile triggers ALL tests', () => {
const result = selectTests(['test/helpers/session-runner.ts'], E2E_TOUCHFILES);
expect(result.selected.length).toBe(Object.keys(E2E_TOUCHFILES).length);
expect(result.skipped.length).toBe(0);
expect(result.reason).toContain('global');
});
test('gen-skill-docs.ts is a global touchfile', () => {
const result = selectTests(['scripts/gen-skill-docs.ts'], E2E_TOUCHFILES);
expect(result.selected.length).toBe(Object.keys(E2E_TOUCHFILES).length);
expect(result.reason).toContain('global');
});
test('unrelated file selects nothing', () => {
const result = selectTests(['README.md'], E2E_TOUCHFILES);
expect(result.selected).toEqual([]);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length);
});
test('empty changed files selects nothing', () => {
const result = selectTests([], E2E_TOUCHFILES);
expect(result.selected).toEqual([]);
});
test('multiple changed files union their selections', () => {
const result = selectTests(
['plan-ceo-review/SKILL.md', 'retro/SKILL.md.tmpl'],
E2E_TOUCHFILES,
);
expect(result.selected).toContain('plan-ceo-review');
expect(result.selected).toContain('plan-ceo-review-selective');
expect(result.selected).toContain('retro');
expect(result.selected).toContain('retro-base-branch');
expect(result.selected.length).toBe(4);
});
test('works with LLM_JUDGE_TOUCHFILES', () => {
const result = selectTests(['qa/SKILL.md'], LLM_JUDGE_TOUCHFILES);
expect(result.selected).toContain('qa/SKILL.md workflow');
expect(result.selected).toContain('qa/SKILL.md health rubric');
expect(result.selected.length).toBe(2);
});
test('SKILL.md.tmpl root template only selects root-dependent tests', () => {
const result = selectTests(['SKILL.md.tmpl'], E2E_TOUCHFILES);
// Should select the 7 tests that depend on root SKILL.md
expect(result.selected).toContain('skillmd-setup-discovery');
expect(result.selected).toContain('contributor-mode');
expect(result.selected).toContain('session-awareness');
// Should NOT select unrelated tests
expect(result.selected).not.toContain('plan-ceo-review');
expect(result.selected).not.toContain('retro');
});
test('global touchfiles work for LLM-judge tests too', () => {
const result = selectTests(['scripts/gen-skill-docs.ts'], LLM_JUDGE_TOUCHFILES);
expect(result.selected.length).toBe(Object.keys(LLM_JUDGE_TOUCHFILES).length);
});
});
// --- detectBaseBranch ---
describe('detectBaseBranch', () => {
test('detects local main branch', () => {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-'));
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
run('git', ['init']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(dir, 'test.txt'), 'hello\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'init']);
const result = detectBaseBranch(dir);
// Should find 'main' (or 'master' depending on git default)
expect(result).toMatch(/^(main|master)$/);
try { fs.rmSync(dir, { recursive: true, force: true }); } catch {}
});
test('returns null for empty repo with no branches', () => {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-'));
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
run('git', ['init']);
// No commits = no branches
const result = detectBaseBranch(dir);
expect(result).toBeNull();
try { fs.rmSync(dir, { recursive: true, force: true }); } catch {}
});
test('returns null for non-git directory', () => {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-'));
const result = detectBaseBranch(dir);
expect(result).toBeNull();
try { fs.rmSync(dir, { recursive: true, force: true }); } catch {}
});
});
// --- Completeness: every testName in skill-e2e.test.ts has a TOUCHFILES entry ---
describe('TOUCHFILES completeness', () => {
test('every E2E testName has a TOUCHFILES entry', () => {
const e2eContent = fs.readFileSync(
path.join(ROOT, 'test', 'skill-e2e.test.ts'),
'utf-8',
);
// Extract all testName: 'value' entries
const testNameRegex = /testName:\s*['"`]([^'"`]+)['"`]/g;
const testNames: string[] = [];
let match;
while ((match = testNameRegex.exec(e2eContent)) !== null) {
let name = match[1];
// Handle template literals like `qa-${label}` — these expand to
// qa-b6-static, qa-b7-spa, qa-b8-checkout
if (name.includes('${')) continue; // skip template literals, check expanded forms below
testNames.push(name);
}
// Add the template-expanded testNames from runPlantedBugEval calls
const plantedBugRegex = /runPlantedBugEval\([^,]+,\s*[^,]+,\s*['"`]([^'"`]+)['"`]\)/g;
while ((match = plantedBugRegex.exec(e2eContent)) !== null) {
testNames.push(`qa-${match[1]}`);
}
expect(testNames.length).toBeGreaterThan(0);
const missing = testNames.filter(name => !(name in E2E_TOUCHFILES));
if (missing.length > 0) {
throw new Error(
`E2E tests missing TOUCHFILES entries: ${missing.join(', ')}\n` +
`Add these to E2E_TOUCHFILES in test/helpers/touchfiles.ts`,
);
}
});
test('every LLM-judge test has a TOUCHFILES entry', () => {
const llmContent = fs.readFileSync(
path.join(ROOT, 'test', 'skill-llm-eval.test.ts'),
'utf-8',
);
// Extract test names from addTest({ name: '...' }) calls
const nameRegex = /name:\s*['"`]([^'"`]+)['"`]/g;
const testNames: string[] = [];
let match;
while ((match = nameRegex.exec(llmContent)) !== null) {
testNames.push(match[1]);
}
// Deduplicate (some tests call addTest with the same name)
const unique = [...new Set(testNames)];
expect(unique.length).toBeGreaterThan(0);
const missing = unique.filter(name => !(name in LLM_JUDGE_TOUCHFILES));
if (missing.length > 0) {
throw new Error(
`LLM-judge tests missing TOUCHFILES entries: ${missing.join(', ')}\n` +
`Add these to LLM_JUDGE_TOUCHFILES in test/helpers/touchfiles.ts`,
);
}
});
});