feat: test bootstrap, regression tests, coverage audit, retro test health

- Add {{TEST_BOOTSTRAP}} resolver to gen-skill-docs.ts
- Add Phase 8e.5 regression test generation to /qa and /qa-design-review
- Add Step 3.4 test coverage audit with quality scoring to /ship
- Add test health tracking to /retro
- Add 2 E2E evals (bootstrap + coverage audit)
- Add 26 validation tests
- Update ARCHITECTURE.md placeholder table
- Add 2 P3 TODOs (CI/CD non-GitHub, auto-upgrade weak tests)
This commit is contained in:
Garry Tan
2026-03-17 10:41:40 -07:00
parent 73b00b4e29
commit 04eb7824bd
16 changed files with 1512 additions and 6 deletions
+263
View File
@@ -2215,6 +2215,269 @@ Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion call
}, 420_000);
});
// --- Test Bootstrap E2E ---
describeE2E('Test Bootstrap E2E', () => {
let bootstrapDir: string;
let bootstrapServer: ReturnType<typeof Bun.serve>;
beforeAll(() => {
bootstrapDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-bootstrap-'));
setupBrowseShims(bootstrapDir);
// Copy qa skill files
copyDirSync(path.join(ROOT, 'qa'), path.join(bootstrapDir, 'qa'));
// Create a minimal Node.js project with NO test framework
fs.writeFileSync(path.join(bootstrapDir, 'package.json'), JSON.stringify({
name: 'test-bootstrap-app',
version: '1.0.0',
type: 'module',
}, null, 2));
// Create a simple app file with a bug
fs.writeFileSync(path.join(bootstrapDir, 'app.js'), `
export function add(a, b) { return a + b; }
export function subtract(a, b) { return a - b; }
export function divide(a, b) { return a / b; } // BUG: no zero check
`);
// Create a simple HTML page with a bug
fs.writeFileSync(path.join(bootstrapDir, 'index.html'), `<!DOCTYPE html>
<html lang="en">
<head><meta charset="utf-8"><title>Bootstrap Test</title></head>
<body>
<h1>Test App</h1>
<a href="/nonexistent-page">Broken Link</a>
<script>console.error("ReferenceError: undefinedVar is not defined");</script>
</body>
</html>
`);
// Init git repo
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: bootstrapDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial commit']);
// Serve from working directory
bootstrapServer = Bun.serve({
port: 0,
hostname: '127.0.0.1',
fetch(req) {
const url = new URL(req.url);
let filePath = url.pathname === '/' ? '/index.html' : url.pathname;
filePath = filePath.replace(/^\//, '');
const fullPath = path.join(bootstrapDir, filePath);
if (!fs.existsSync(fullPath)) {
return new Response('Not Found', { status: 404 });
}
const content = fs.readFileSync(fullPath, 'utf-8');
return new Response(content, {
headers: { 'Content-Type': 'text/html' },
});
},
});
});
afterAll(() => {
bootstrapServer?.stop();
try { fs.rmSync(bootstrapDir, { recursive: true, force: true }); } catch {}
});
test('/qa bootstrap + regression test on zero-test project', async () => {
const serverUrl = `http://127.0.0.1:${bootstrapServer!.port}`;
const result = await runSkillTest({
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
Read the file qa/SKILL.md for the QA workflow instructions.
Run a Quick-tier QA test on ${serverUrl}
The source code for this page is at ${bootstrapDir}/index.html — you can fix bugs there.
Do NOT use AskUserQuestion — for any AskUserQuestion prompts, choose the RECOMMENDED option automatically.
Write your report to ${bootstrapDir}/qa-reports/qa-report.md
This project has NO test framework. When the bootstrap asks, pick vitest (option A).
This is a test+fix loop: find bugs, fix them, write regression tests, commit each fix.`,
workingDirectory: bootstrapDir,
maxTurns: 50,
allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob', 'Grep'],
timeout: 420_000,
testName: 'qa-bootstrap',
runId,
});
logCost('/qa bootstrap', result);
recordE2E('/qa bootstrap + regression test', 'Test Bootstrap E2E', result, {
passed: ['success', 'error_max_turns'].includes(result.exitReason),
});
expect(['success', 'error_max_turns']).toContain(result.exitReason);
// Verify bootstrap created test infrastructure
const hasTestConfig = fs.existsSync(path.join(bootstrapDir, 'vitest.config.ts'))
|| fs.existsSync(path.join(bootstrapDir, 'vitest.config.js'))
|| fs.existsSync(path.join(bootstrapDir, 'jest.config.js'))
|| fs.existsSync(path.join(bootstrapDir, 'jest.config.ts'));
console.log(`Test config created: ${hasTestConfig}`);
const hasTestingMd = fs.existsSync(path.join(bootstrapDir, 'TESTING.md'));
console.log(`TESTING.md created: ${hasTestingMd}`);
// Check for bootstrap commit
const gitLog = spawnSync('git', ['log', '--oneline', '--grep=bootstrap'], {
cwd: bootstrapDir, stdio: 'pipe',
});
const bootstrapCommits = gitLog.stdout.toString().trim();
console.log(`Bootstrap commits: ${bootstrapCommits || 'none'}`);
// Check for regression test commits
const regressionLog = spawnSync('git', ['log', '--oneline', '--grep=test(qa)'], {
cwd: bootstrapDir, stdio: 'pipe',
});
const regressionCommits = regressionLog.stdout.toString().trim();
console.log(`Regression test commits: ${regressionCommits || 'none'}`);
// Verify at least the bootstrap happened (fix commits are bonus)
const allCommits = spawnSync('git', ['log', '--oneline'], {
cwd: bootstrapDir, stdio: 'pipe',
});
const totalCommits = allCommits.stdout.toString().trim().split('\n').length;
console.log(`Total commits: ${totalCommits}`);
expect(totalCommits).toBeGreaterThan(1); // At least initial + bootstrap
}, 420_000);
});
// --- Test Coverage Audit E2E ---
describeE2E('Test Coverage Audit E2E', () => {
let coverageDir: string;
beforeAll(() => {
coverageDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-coverage-'));
// Copy ship skill files
copyDirSync(path.join(ROOT, 'ship'), path.join(coverageDir, 'ship'));
copyDirSync(path.join(ROOT, 'review'), path.join(coverageDir, 'review'));
// Create a Node.js project WITH test framework but coverage gaps
fs.writeFileSync(path.join(coverageDir, 'package.json'), JSON.stringify({
name: 'test-coverage-app',
version: '1.0.0',
type: 'module',
scripts: { test: 'echo "no tests yet"' },
devDependencies: { vitest: '^1.0.0' },
}, null, 2));
// Create vitest config
fs.writeFileSync(path.join(coverageDir, 'vitest.config.ts'),
`import { defineConfig } from 'vitest/config';\nexport default defineConfig({ test: {} });\n`);
fs.writeFileSync(path.join(coverageDir, 'VERSION'), '0.1.0.0\n');
fs.writeFileSync(path.join(coverageDir, 'CHANGELOG.md'), '# Changelog\n');
// Create source file with multiple code paths
fs.mkdirSync(path.join(coverageDir, 'src'), { recursive: true });
fs.writeFileSync(path.join(coverageDir, 'src', 'billing.ts'), `
export function processPayment(amount: number, currency: string) {
if (amount <= 0) throw new Error('Invalid amount');
if (currency !== 'USD' && currency !== 'EUR') throw new Error('Unsupported currency');
return { status: 'success', amount, currency };
}
export function refundPayment(paymentId: string, reason: string) {
if (!paymentId) throw new Error('Payment ID required');
if (!reason) throw new Error('Reason required');
return { status: 'refunded', paymentId, reason };
}
`);
// Create a test directory with ONE test (partial coverage)
fs.mkdirSync(path.join(coverageDir, 'test'), { recursive: true });
fs.writeFileSync(path.join(coverageDir, 'test', 'billing.test.ts'), `
import { describe, test, expect } from 'vitest';
import { processPayment } from '../src/billing';
describe('processPayment', () => {
test('processes valid payment', () => {
const result = processPayment(100, 'USD');
expect(result.status).toBe('success');
});
// GAP: no test for invalid amount
// GAP: no test for unsupported currency
// GAP: refundPayment not tested at all
});
`);
// Init git repo with main branch
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: coverageDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial commit']);
// Create feature branch
run('git', ['checkout', '-b', 'feature/billing']);
});
afterAll(() => {
try { fs.rmSync(coverageDir, { recursive: true, force: true }); } catch {}
});
test('/ship Step 3.4 produces coverage diagram', async () => {
const result = await runSkillTest({
prompt: `Read the file ship/SKILL.md for the ship workflow instructions.
You are on the feature/billing branch. The base branch is main.
This is a test project — there is no remote, no PR to create.
ONLY run Step 3.4 (Test Coverage Audit) from the ship workflow.
Skip all other steps (tests, evals, review, version, changelog, commit, push, PR).
The source code is in ${coverageDir}/src/billing.ts.
Existing tests are in ${coverageDir}/test/billing.test.ts.
The test command is: echo "tests pass" (mocked — just pretend tests pass).
Produce the ASCII coverage diagram showing which code paths are tested and which have gaps.
Do NOT generate new tests — just produce the diagram and coverage summary.
Output the diagram directly.`,
workingDirectory: coverageDir,
maxTurns: 15,
allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob', 'Grep'],
timeout: 120_000,
testName: 'ship-coverage-audit',
runId,
});
logCost('/ship coverage audit', result);
recordE2E('/ship Step 3.4 coverage audit', 'Test Coverage Audit E2E', result, {
passed: result.exitReason === 'success',
});
expect(result.exitReason).toBe('success');
// Check output contains coverage diagram elements
const output = result.output || '';
const hasGap = output.includes('GAP') || output.includes('gap') || output.includes('NO TEST');
const hasTested = output.includes('TESTED') || output.includes('tested') || output.includes('✓');
const hasCoverage = output.includes('COVERAGE') || output.includes('coverage') || output.includes('paths tested');
console.log(`Output has GAP markers: ${hasGap}`);
console.log(`Output has TESTED markers: ${hasTested}`);
console.log(`Output has coverage summary: ${hasCoverage}`);
// At minimum, the agent should have read the source and test files
const readCalls = result.toolCalls.filter(tc => tc.tool === 'Read');
expect(readCalls.length).toBeGreaterThan(0);
}, 180_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
+198
View File
@@ -707,3 +707,201 @@ describe('gstack-slug', () => {
expect(lines[1]).toMatch(/^BRANCH=.+/);
});
});
// --- Test Bootstrap validation ---
describe('Test Bootstrap ({{TEST_BOOTSTRAP}}) integration', () => {
test('TEST_BOOTSTRAP resolver produces valid content', () => {
const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(qaContent).toContain('Test Framework Bootstrap');
expect(qaContent).toContain('RUNTIME:ruby');
expect(qaContent).toContain('RUNTIME:node');
expect(qaContent).toContain('RUNTIME:python');
expect(qaContent).toContain('no-test-bootstrap');
expect(qaContent).toContain('BOOTSTRAP_DECLINED');
});
test('TEST_BOOTSTRAP appears in qa/SKILL.md', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('Test Framework Bootstrap');
expect(content).toContain('TESTING.md');
expect(content).toContain('CLAUDE.md');
});
test('TEST_BOOTSTRAP appears in ship/SKILL.md', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Test Framework Bootstrap');
expect(content).toContain('Step 2.5');
});
test('TEST_BOOTSTRAP appears in qa-design-review/SKILL.md', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8');
expect(content).toContain('Test Framework Bootstrap');
});
test('TEST_BOOTSTRAP does NOT appear in qa-only/SKILL.md', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa-only', 'SKILL.md'), 'utf-8');
expect(content).not.toContain('Test Framework Bootstrap');
// But should have the recommendation note
expect(content).toContain('No test framework detected');
expect(content).toContain('Run `/qa` to bootstrap');
});
test('bootstrap includes framework knowledge table', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('vitest');
expect(content).toContain('minitest');
expect(content).toContain('pytest');
expect(content).toContain('cargo test');
expect(content).toContain('phpunit');
expect(content).toContain('ExUnit');
});
test('bootstrap includes CI/CD pipeline generation', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('.github/workflows/test.yml');
expect(content).toContain('GitHub Actions');
});
test('bootstrap includes first real tests step', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('First real tests');
expect(content).toContain('git log --since=30.days');
expect(content).toContain('Prioritize by risk');
});
test('bootstrap includes vibe coding philosophy', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('vibe coding');
expect(content).toContain('100% test coverage');
});
test('WebSearch is in allowed-tools for qa, ship, qa-design-review', () => {
const qa = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
const ship = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const qaDesign = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8');
expect(qa).toContain('WebSearch');
expect(ship).toContain('WebSearch');
expect(qaDesign).toContain('WebSearch');
});
});
// --- Phase 8e.5 regression test validation ---
describe('Phase 8e.5 regression test generation', () => {
test('qa/SKILL.md contains Phase 8e.5', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('8e.5. Regression Test');
expect(content).toContain('test(qa): regression test');
expect(content).toContain('WTF-likelihood exclusion');
});
test('qa/SKILL.md Rule 13 is amended for regression tests', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('Only modify tests when generating regression tests in Phase 8e.5');
expect(content).not.toContain('Never modify tests or CI configuration');
});
test('qa-design-review has CSS-aware Phase 8e.5 variant', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8');
expect(content).toContain('8e.5. Regression Test (design-review variant)');
expect(content).toContain('CSS-only');
expect(content).toContain('test(design): regression test');
});
test('regression test includes full attribution comment format', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('// Regression: ISSUE-NNN');
expect(content).toContain('// Found by /qa on');
expect(content).toContain('// Report: .gstack/qa-reports/');
});
test('regression test uses auto-incrementing names', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
expect(content).toContain('auto-incrementing');
expect(content).toContain('max number + 1');
});
});
// --- Step 3.4 coverage audit validation ---
describe('Step 3.4 test coverage audit', () => {
test('ship/SKILL.md contains Step 3.4', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Step 3.4: Test Coverage Audit');
expect(content).toContain('CODE PATH COVERAGE MAP');
});
test('Step 3.4 includes quality scoring rubric', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('★★★');
expect(content).toContain('★★');
expect(content).toContain('edge cases AND error paths');
expect(content).toContain('happy path only');
});
test('Step 3.4 includes before/after test count', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Count test files before');
expect(content).toContain('Count test files after');
});
test('ship PR body includes Test Coverage section', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('## Test Coverage');
});
test('ship rules include test generation rule', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Step 3.4 generates coverage tests');
expect(content).toContain('Never commit failing tests');
});
test('Step 3.4 includes vibe coding philosophy', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('vibe coding becomes yolo coding');
});
});
// --- Retro test health validation ---
describe('Retro test health tracking', () => {
test('retro/SKILL.md has test health data gathering commands', () => {
const content = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');
expect(content).toContain('# 10. Test file count');
expect(content).toContain('# 11. Regression test commits');
expect(content).toContain('# 12. Test files changed');
});
test('retro/SKILL.md has Test Health metrics row', () => {
const content = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');
expect(content).toContain('Test Health');
expect(content).toContain('regression tests');
});
test('retro/SKILL.md has Test Health narrative section', () => {
const content = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');
expect(content).toContain('### Test Health');
expect(content).toContain('Total test files');
expect(content).toContain('vibe coding safe');
});
test('retro JSON schema includes test_health field', () => {
const content = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');
expect(content).toContain('test_health');
expect(content).toContain('total_test_files');
expect(content).toContain('regression_test_commits');
});
});
// --- QA report template regression tests section ---
describe('QA report template', () => {
test('qa-report-template.md has Regression Tests section', () => {
const content = fs.readFileSync(path.join(ROOT, 'qa', 'templates', 'qa-report-template.md'), 'utf-8');
expect(content).toContain('## Regression Tests');
expect(content).toContain('committed / deferred / skipped');
expect(content).toContain('### Deferred Tests');
expect(content).toContain('**Precondition:**');
});
});