mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-01 19:25:10 +02:00
feat: interactive /plan-design-review + CEO invokes designer + 100% coverage (v0.6.4) (#149)
* refactor: rename qa-design-review → design-review The "qa-" prefix was confusing — this is the live-site design audit with fix loop, not a QA-only report. Rename directory and update all references across docs, tests, scripts, and skill templates. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: interactive /plan-design-review + CEO invokes designer Rewrite /plan-design-review from report-only grading to an interactive plan-fixer that rates each design dimension 0-10, explains what a 10 looks like, and edits the plan to get there. Parallel structure with /plan-ceo-review and /plan-eng-review — one issue = one AskUserQuestion. CEO review now detects UI scope and invokes the designer perspective when the plan has frontend/UX work, so you get design review automatically when it matters. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * test: validation + touchfile entries for 100% coverage Add design-consultation to command/snapshot flag validation. Add 4 skills to contributor mode validation (plan-design-review, design-review, design-consultation, document-release). Add 2 templates to hardcoded branch check. Register touchfile entries for 10 new LLM-judge tests and 1 new E2E test. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * test: LLM-judge for 10 skills + gstack-upgrade E2E Add LLM-judge quality evals for all uncovered skills using a DRY runWorkflowJudge helper with section marker guards. Add real E2E test for gstack-upgrade using mock git remote (replaces test.todo). Add plan-edit assertion to plan-design-review E2E. 14/15 skills now at full coverage. setup-browser-cookies remains deferred (needs real browser). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * docs: add bisect commit style to CLAUDE.md All commits should be single logical changes, split before pushing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * chore: bump version and changelog (v0.6.4.0) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -70,7 +70,7 @@ describe('gen-skill-docs', () => {
|
||||
{ dir: 'setup-browser-cookies', name: 'setup-browser-cookies' },
|
||||
{ dir: 'gstack-upgrade', name: 'gstack-upgrade' },
|
||||
{ dir: 'plan-design-review', name: 'plan-design-review' },
|
||||
{ dir: 'qa-design-review', name: 'qa-design-review' },
|
||||
{ dir: 'design-review', name: 'design-review' },
|
||||
{ dir: 'design-consultation', name: 'design-consultation' },
|
||||
];
|
||||
|
||||
|
||||
@@ -84,9 +84,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'design-consultation-research': ['design-consultation/**'],
|
||||
'design-consultation-existing': ['design-consultation/**'],
|
||||
'design-consultation-preview': ['design-consultation/**'],
|
||||
'plan-design-review-audit': ['plan-design-review/**'],
|
||||
'plan-design-review-export': ['plan-design-review/**'],
|
||||
'qa-design-review-fix': ['qa-design-review/**', 'browse/src/**'],
|
||||
'plan-design-review-plan-mode': ['plan-design-review/**'],
|
||||
'plan-design-review-no-ui-scope': ['plan-design-review/**'],
|
||||
'design-review-fix': ['design-review/**', 'browse/src/**'],
|
||||
|
||||
// gstack-upgrade
|
||||
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -102,6 +105,24 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
|
||||
'qa/SKILL.md health rubric': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
|
||||
'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'],
|
||||
'baseline score pinning': ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'],
|
||||
|
||||
// Ship & Release
|
||||
'ship/SKILL.md workflow': ['ship/SKILL.md', 'ship/SKILL.md.tmpl'],
|
||||
'document-release/SKILL.md workflow': ['document-release/SKILL.md', 'document-release/SKILL.md.tmpl'],
|
||||
|
||||
// Plan Reviews
|
||||
'plan-ceo-review/SKILL.md modes': ['plan-ceo-review/SKILL.md', 'plan-ceo-review/SKILL.md.tmpl'],
|
||||
'plan-eng-review/SKILL.md sections': ['plan-eng-review/SKILL.md', 'plan-eng-review/SKILL.md.tmpl'],
|
||||
'plan-design-review/SKILL.md passes': ['plan-design-review/SKILL.md', 'plan-design-review/SKILL.md.tmpl'],
|
||||
|
||||
// Design skills
|
||||
'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
|
||||
'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
|
||||
|
||||
// Other skills
|
||||
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
|
||||
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
|
||||
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
+217
-76
@@ -1876,8 +1876,120 @@ describeE2E('Deferred skill E2E', () => {
|
||||
// Setup-browser-cookies requires interactive browser picker UI
|
||||
test.todo('/setup-browser-cookies imports cookies');
|
||||
|
||||
// Gstack-upgrade is destructive: modifies skill installation directory
|
||||
test.todo('/gstack-upgrade completes upgrade flow');
|
||||
});
|
||||
|
||||
// --- gstack-upgrade E2E ---
|
||||
|
||||
describeIfSelected('gstack-upgrade E2E', ['gstack-upgrade-happy-path'], () => {
|
||||
let upgradeDir: string;
|
||||
let remoteDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
upgradeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-upgrade-'));
|
||||
remoteDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-remote-'));
|
||||
|
||||
const run = (cmd: string, args: string[], cwd: string) =>
|
||||
spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
// Init the "project" repo
|
||||
run('git', ['init'], upgradeDir);
|
||||
run('git', ['config', 'user.email', 'test@test.com'], upgradeDir);
|
||||
run('git', ['config', 'user.name', 'Test'], upgradeDir);
|
||||
|
||||
// Create mock gstack install directory (local-git type)
|
||||
const mockGstack = path.join(upgradeDir, '.claude', 'skills', 'gstack');
|
||||
fs.mkdirSync(mockGstack, { recursive: true });
|
||||
|
||||
// Init as a git repo
|
||||
run('git', ['init'], mockGstack);
|
||||
run('git', ['config', 'user.email', 'test@test.com'], mockGstack);
|
||||
run('git', ['config', 'user.name', 'Test'], mockGstack);
|
||||
|
||||
// Create bare remote
|
||||
run('git', ['init', '--bare'], remoteDir);
|
||||
run('git', ['remote', 'add', 'origin', remoteDir], mockGstack);
|
||||
|
||||
// Write old version files
|
||||
fs.writeFileSync(path.join(mockGstack, 'VERSION'), '0.5.0\n');
|
||||
fs.writeFileSync(path.join(mockGstack, 'CHANGELOG.md'),
|
||||
'# Changelog\n\n## 0.5.0 — 2026-03-01\n\n- Initial release\n');
|
||||
fs.writeFileSync(path.join(mockGstack, 'setup'),
|
||||
'#!/bin/bash\necho "Setup completed"\n', { mode: 0o755 });
|
||||
|
||||
// Initial commit + push
|
||||
run('git', ['add', '.'], mockGstack);
|
||||
run('git', ['commit', '-m', 'initial'], mockGstack);
|
||||
run('git', ['push', '-u', 'origin', 'HEAD:main'], mockGstack);
|
||||
|
||||
// Create new version (simulate upstream release)
|
||||
fs.writeFileSync(path.join(mockGstack, 'VERSION'), '0.6.0\n');
|
||||
fs.writeFileSync(path.join(mockGstack, 'CHANGELOG.md'),
|
||||
'# Changelog\n\n## 0.6.0 — 2026-03-15\n\n- New feature: interactive design review\n- Fix: snapshot flag validation\n\n## 0.5.0 — 2026-03-01\n\n- Initial release\n');
|
||||
run('git', ['add', '.'], mockGstack);
|
||||
run('git', ['commit', '-m', 'release 0.6.0'], mockGstack);
|
||||
run('git', ['push', 'origin', 'HEAD:main'], mockGstack);
|
||||
|
||||
// Reset working copy back to old version
|
||||
run('git', ['reset', '--hard', 'HEAD~1'], mockGstack);
|
||||
|
||||
// Copy gstack-upgrade skill
|
||||
fs.mkdirSync(path.join(upgradeDir, 'gstack-upgrade'), { recursive: true });
|
||||
fs.copyFileSync(
|
||||
path.join(ROOT, 'gstack-upgrade', 'SKILL.md'),
|
||||
path.join(upgradeDir, 'gstack-upgrade', 'SKILL.md'),
|
||||
);
|
||||
|
||||
// Commit so git repo is clean
|
||||
run('git', ['add', '.'], upgradeDir);
|
||||
run('git', ['commit', '-m', 'initial project'], upgradeDir);
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(upgradeDir, { recursive: true, force: true }); } catch {}
|
||||
try { fs.rmSync(remoteDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testIfSelected('gstack-upgrade-happy-path', async () => {
|
||||
const mockGstack = path.join(upgradeDir, '.claude', 'skills', 'gstack');
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read gstack-upgrade/SKILL.md for the upgrade workflow.
|
||||
|
||||
You are running /gstack-upgrade standalone. The gstack installation is at ./.claude/skills/gstack (local-git type — it has a .git directory with an origin remote).
|
||||
|
||||
Current version: 0.5.0. A new version 0.6.0 is available on origin/main.
|
||||
|
||||
Follow the standalone upgrade flow:
|
||||
1. Detect install type (local-git)
|
||||
2. Run git fetch origin && git reset --hard origin/main in the install directory
|
||||
3. Run the setup script
|
||||
4. Show what's new from CHANGELOG
|
||||
|
||||
Skip any AskUserQuestion calls — auto-approve the upgrade. Write a summary of what you did to stdout.
|
||||
|
||||
IMPORTANT: The install directory is at ./.claude/skills/gstack — use that exact path.`,
|
||||
workingDirectory: upgradeDir,
|
||||
maxTurns: 20,
|
||||
timeout: 180_000,
|
||||
testName: 'gstack-upgrade-happy-path',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/gstack-upgrade happy path', result);
|
||||
|
||||
// Check that the version was updated
|
||||
const versionAfter = fs.readFileSync(path.join(mockGstack, 'VERSION'), 'utf-8').trim();
|
||||
const output = result.output || '';
|
||||
const mentionsUpgrade = output.toLowerCase().includes('0.6.0') ||
|
||||
output.toLowerCase().includes('upgrade') ||
|
||||
output.toLowerCase().includes('updated');
|
||||
|
||||
recordE2E('/gstack-upgrade happy path', 'gstack-upgrade E2E', result, {
|
||||
passed: versionAfter === '0.6.0' && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
expect(versionAfter).toBe('0.6.0');
|
||||
}, 240_000);
|
||||
});
|
||||
|
||||
// --- Design Consultation E2E ---
|
||||
@@ -2178,15 +2290,13 @@ Skip research. Skip any AskUserQuestion calls — this is non-interactive. Gener
|
||||
}, 420_000);
|
||||
});
|
||||
|
||||
// --- Plan Design Review E2E ---
|
||||
// --- Plan Design Review E2E (plan-mode) ---
|
||||
|
||||
describeIfSelected('Plan Design Review E2E', ['plan-design-review-audit', 'plan-design-review-export'], () => {
|
||||
describeIfSelected('Plan Design Review E2E', ['plan-design-review-plan-mode', 'plan-design-review-no-ui-scope'], () => {
|
||||
let reviewDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
testServer = testServer || startTestServer();
|
||||
reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-design-'));
|
||||
setupBrowseShims(reviewDir);
|
||||
|
||||
const { spawnSync } = require('child_process');
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
@@ -2195,9 +2305,6 @@ describeIfSelected('Plan Design Review E2E', ['plan-design-review-audit', 'plan-
|
||||
run('git', ['init']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
fs.writeFileSync(path.join(reviewDir, 'index.html'), '<h1>Test</h1>\n');
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial']);
|
||||
|
||||
// Copy plan-design-review skill
|
||||
fs.mkdirSync(path.join(reviewDir, 'plan-design-review'), { recursive: true });
|
||||
@@ -2205,102 +2312,136 @@ describeIfSelected('Plan Design Review E2E', ['plan-design-review-audit', 'plan-
|
||||
path.join(ROOT, 'plan-design-review', 'SKILL.md'),
|
||||
path.join(reviewDir, 'plan-design-review', 'SKILL.md'),
|
||||
);
|
||||
|
||||
// Create a plan file with intentional design gaps
|
||||
fs.writeFileSync(path.join(reviewDir, 'plan.md'), `# Plan: User Dashboard
|
||||
|
||||
## Context
|
||||
Build a user dashboard that shows account stats, recent activity, and settings.
|
||||
|
||||
## Implementation
|
||||
1. Create a dashboard page at /dashboard
|
||||
2. Show user stats (posts, followers, engagement rate)
|
||||
3. Add a recent activity feed
|
||||
4. Add a settings panel
|
||||
5. Use a clean, modern UI with cards and icons
|
||||
6. Add a hero section at the top with a gradient background
|
||||
|
||||
## Technical Details
|
||||
- React components with Tailwind CSS
|
||||
- API endpoint: GET /api/dashboard
|
||||
- WebSocket for real-time activity updates
|
||||
`);
|
||||
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial plan']);
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testIfSelected('plan-design-review-audit', async () => {
|
||||
testIfSelected('plan-design-review-plan-mode', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
|
||||
prompt: `Read plan-design-review/SKILL.md for the design review workflow.
|
||||
|
||||
B="${browseBin}"
|
||||
Review the plan in ./plan.md. This plan has several design gaps — it uses vague language like "clean, modern UI" and "cards and icons", mentions a "hero section with gradient" (AI slop), and doesn't specify empty states, error states, loading states, responsive behavior, or accessibility.
|
||||
|
||||
Read plan-design-review/SKILL.md for the design review workflow.
|
||||
Skip the preamble bash block. Skip any AskUserQuestion calls — this is non-interactive. Rate each design dimension 0-10 and explain what would make it a 10. Then EDIT plan.md to add the missing design decisions (interaction state table, empty states, responsive behavior, etc.).
|
||||
|
||||
Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Skip any AskUserQuestion calls — this is non-interactive. Write your audit report to ./design-audit.md. Do not offer to create DESIGN.md.
|
||||
|
||||
EFFICIENCY: Skip the preamble bash block. Combine multiple browse commands into single bash blocks (e.g. run all Phase 2 JS extractions in one block). Write the report as soon as you have enough data — do not over-explore.`,
|
||||
IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan review, not a live site audit. Just read the plan file, review it, and edit it to fix the gaps.`,
|
||||
workingDirectory: reviewDir,
|
||||
maxTurns: 30,
|
||||
timeout: 360_000,
|
||||
testName: 'plan-design-review-audit',
|
||||
maxTurns: 15,
|
||||
timeout: 300_000,
|
||||
testName: 'plan-design-review-plan-mode',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/plan-design-review audit', result);
|
||||
logCost('/plan-design-review plan-mode', result);
|
||||
|
||||
const reportPath = path.join(reviewDir, 'design-audit.md');
|
||||
const reportExists = fs.existsSync(reportPath);
|
||||
let reportContent = '';
|
||||
if (reportExists) {
|
||||
reportContent = fs.readFileSync(reportPath, 'utf-8');
|
||||
}
|
||||
// Check that the agent produced design ratings (0-10 scale)
|
||||
const output = result.output || '';
|
||||
const hasRatings = /\d+\/10/.test(output);
|
||||
const hasDesignContent = output.toLowerCase().includes('information architecture') ||
|
||||
output.toLowerCase().includes('interaction state') ||
|
||||
output.toLowerCase().includes('ai slop') ||
|
||||
output.toLowerCase().includes('hierarchy');
|
||||
|
||||
const hasFirstImpression = reportContent.toLowerCase().includes('first impression') ||
|
||||
reportContent.toLowerCase().includes('impression');
|
||||
// Check that the plan file was edited (the core new behavior)
|
||||
const planAfter = fs.readFileSync(path.join(reviewDir, 'plan.md'), 'utf-8');
|
||||
const planOriginal = `# Plan: User Dashboard`;
|
||||
const planWasEdited = planAfter.length > 300; // Original is ~450 chars, edited should be much longer
|
||||
const planHasDesignAdditions = planAfter.toLowerCase().includes('empty') ||
|
||||
planAfter.toLowerCase().includes('loading') ||
|
||||
planAfter.toLowerCase().includes('error') ||
|
||||
planAfter.toLowerCase().includes('state') ||
|
||||
planAfter.toLowerCase().includes('responsive') ||
|
||||
planAfter.toLowerCase().includes('accessibility');
|
||||
|
||||
recordE2E('/plan-design-review audit', 'Plan Design Review E2E', result, {
|
||||
passed: reportExists && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
recordE2E('/plan-design-review plan-mode', 'Plan Design Review E2E', result, {
|
||||
passed: hasDesignContent && planWasEdited && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
expect(reportExists).toBe(true);
|
||||
if (reportExists) {
|
||||
expect(reportContent.length).toBeGreaterThan(200);
|
||||
}
|
||||
}, 420_000);
|
||||
// Agent should produce design-relevant output about the plan
|
||||
expect(hasDesignContent).toBe(true);
|
||||
// Agent should have edited the plan file to add missing design decisions
|
||||
expect(planWasEdited).toBe(true);
|
||||
expect(planHasDesignAdditions).toBe(true);
|
||||
}, 360_000);
|
||||
|
||||
testIfSelected('plan-design-review-export', async () => {
|
||||
// Clean up previous test artifacts
|
||||
try { fs.unlinkSync(path.join(reviewDir, 'design-audit.md')); } catch {}
|
||||
testIfSelected('plan-design-review-no-ui-scope', async () => {
|
||||
// Write a backend-only plan
|
||||
fs.writeFileSync(path.join(reviewDir, 'backend-plan.md'), `# Plan: Database Migration
|
||||
|
||||
## Context
|
||||
Migrate user records from PostgreSQL to a new schema with better indexing.
|
||||
|
||||
## Implementation
|
||||
1. Create migration to add new columns to users table
|
||||
2. Backfill data from legacy columns
|
||||
3. Add database indexes for common query patterns
|
||||
4. Update ActiveRecord models
|
||||
5. Run migration in staging first, then production
|
||||
`);
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
|
||||
prompt: `Read plan-design-review/SKILL.md for the design review workflow.
|
||||
|
||||
B="${browseBin}"
|
||||
Review the plan in ./backend-plan.md. This is a pure backend database migration plan with no UI changes.
|
||||
|
||||
Read plan-design-review/SKILL.md for the design review workflow.
|
||||
Skip the preamble bash block. Skip any AskUserQuestion calls — this is non-interactive. Write your findings directly to stdout.
|
||||
|
||||
Review ${testServer.url} with --quick mode. Skip any AskUserQuestion calls — this is non-interactive. After Phase 2 (Design System Extraction), write a DESIGN.md to the working directory. Also write the audit report to ./design-audit.md.`,
|
||||
IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan review, not a live site audit.`,
|
||||
workingDirectory: reviewDir,
|
||||
maxTurns: 25,
|
||||
timeout: 360_000,
|
||||
testName: 'plan-design-review-export',
|
||||
maxTurns: 10,
|
||||
timeout: 180_000,
|
||||
testName: 'plan-design-review-no-ui-scope',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/plan-design-review export', result);
|
||||
logCost('/plan-design-review no-ui-scope', result);
|
||||
|
||||
const designPath = path.join(reviewDir, 'DESIGN.md');
|
||||
const reportPath = path.join(reviewDir, 'design-audit.md');
|
||||
const designExists = fs.existsSync(designPath);
|
||||
const reportExists = fs.existsSync(reportPath);
|
||||
// Agent should detect no UI scope and exit early
|
||||
const output = result.output || '';
|
||||
const detectsNoUI = output.toLowerCase().includes('no ui') ||
|
||||
output.toLowerCase().includes('no frontend') ||
|
||||
output.toLowerCase().includes('no design') ||
|
||||
output.toLowerCase().includes('not applicable') ||
|
||||
output.toLowerCase().includes('backend');
|
||||
|
||||
let designContent = '';
|
||||
if (designExists) {
|
||||
designContent = fs.readFileSync(designPath, 'utf-8');
|
||||
}
|
||||
|
||||
const hasTypography = designContent.toLowerCase().includes('typography') || designContent.toLowerCase().includes('font');
|
||||
const hasColor = designContent.toLowerCase().includes('color');
|
||||
|
||||
recordE2E('/plan-design-review export', 'Plan Design Review E2E', result, {
|
||||
passed: designExists && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
recordE2E('/plan-design-review no-ui-scope', 'Plan Design Review E2E', result, {
|
||||
passed: detectsNoUI && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
// DESIGN.md export is best-effort — agent may not always produce it
|
||||
if (designExists) {
|
||||
expect(hasTypography || hasColor).toBe(true);
|
||||
}
|
||||
}, 420_000);
|
||||
expect(detectsNoUI).toBe(true);
|
||||
}, 240_000);
|
||||
});
|
||||
|
||||
// --- QA Design Review E2E ---
|
||||
// --- Design Review E2E (live-site audit + fix) ---
|
||||
|
||||
describeIfSelected('QA Design Review E2E', ['qa-design-review-fix'], () => {
|
||||
describeIfSelected('Design Review E2E', ['design-review-fix'], () => {
|
||||
let qaDesignDir: string;
|
||||
let qaDesignServer: ReturnType<typeof Bun.serve> | null = null;
|
||||
|
||||
@@ -2376,11 +2517,11 @@ describeIfSelected('QA Design Review E2E', ['qa-design-review-fix'], () => {
|
||||
},
|
||||
});
|
||||
|
||||
// Copy qa-design-review skill
|
||||
fs.mkdirSync(path.join(qaDesignDir, 'qa-design-review'), { recursive: true });
|
||||
// Copy design-review skill
|
||||
fs.mkdirSync(path.join(qaDesignDir, 'design-review'), { recursive: true });
|
||||
fs.copyFileSync(
|
||||
path.join(ROOT, 'qa-design-review', 'SKILL.md'),
|
||||
path.join(qaDesignDir, 'qa-design-review', 'SKILL.md'),
|
||||
path.join(ROOT, 'design-review', 'SKILL.md'),
|
||||
path.join(qaDesignDir, 'design-review', 'SKILL.md'),
|
||||
);
|
||||
});
|
||||
|
||||
@@ -2389,7 +2530,7 @@ describeIfSelected('QA Design Review E2E', ['qa-design-review-fix'], () => {
|
||||
try { fs.rmSync(qaDesignDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('Test 7: /qa-design-review audits and fixes design issues', async () => {
|
||||
test('Test 7: /design-review audits and fixes design issues', async () => {
|
||||
const serverUrl = `http://localhost:${(qaDesignServer as any)?.port}`;
|
||||
|
||||
const result = await runSkillTest({
|
||||
@@ -2397,17 +2538,17 @@ describeIfSelected('QA Design Review E2E', ['qa-design-review-fix'], () => {
|
||||
|
||||
B="${browseBin}"
|
||||
|
||||
Read qa-design-review/SKILL.md for the design review + fix workflow.
|
||||
Read design-review/SKILL.md for the design review + fix workflow.
|
||||
|
||||
Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion calls — this is non-interactive. Fix up to 3 issues max. Write your report to ./design-audit.md.`,
|
||||
workingDirectory: qaDesignDir,
|
||||
maxTurns: 30,
|
||||
timeout: 360_000,
|
||||
testName: 'qa-design-review-fix',
|
||||
testName: 'design-review-fix',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/qa-design-review fix', result);
|
||||
logCost('/design-review fix', result);
|
||||
|
||||
const reportPath = path.join(qaDesignDir, 'design-audit.md');
|
||||
const reportExists = fs.existsSync(reportPath);
|
||||
@@ -2419,7 +2560,7 @@ Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion call
|
||||
const commits = gitLog.stdout.toString().trim().split('\n');
|
||||
const designFixCommits = commits.filter((c: string) => c.includes('style(design)'));
|
||||
|
||||
recordE2E('/qa-design-review fix', 'QA Design Review E2E', result, {
|
||||
recordE2E('/design-review fix', 'Design Review E2E', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
|
||||
|
||||
@@ -464,6 +464,210 @@ describeIfSelected('Baseline score pinning', ['baseline score pinning'], () => {
|
||||
}, 60_000);
|
||||
});
|
||||
|
||||
// --- Workflow SKILL.md quality evals (10 new tests for 100% coverage) ---
|
||||
|
||||
/**
|
||||
* DRY helper for workflow SKILL.md judge tests.
|
||||
* Extracts a section from a SKILL.md file and judges its quality as an agent workflow.
|
||||
*/
|
||||
async function runWorkflowJudge(opts: {
|
||||
testName: string;
|
||||
suite: string;
|
||||
skillPath: string;
|
||||
startMarker: string;
|
||||
endMarker: string | null;
|
||||
judgeContext: string;
|
||||
judgeGoal: string;
|
||||
thresholds?: { clarity: number; completeness: number; actionability: number };
|
||||
}) {
|
||||
const t0 = Date.now();
|
||||
const defaults = { clarity: 4, completeness: 3, actionability: 4 };
|
||||
const thresholds = { ...defaults, ...opts.thresholds };
|
||||
|
||||
const content = fs.readFileSync(path.join(ROOT, opts.skillPath), 'utf-8');
|
||||
const startIdx = content.indexOf(opts.startMarker);
|
||||
if (startIdx === -1) throw new Error(`Start marker not found in ${opts.skillPath}: "${opts.startMarker}"`);
|
||||
|
||||
let section: string;
|
||||
if (opts.endMarker) {
|
||||
const endIdx = content.indexOf(opts.endMarker, startIdx);
|
||||
if (endIdx === -1) throw new Error(`End marker not found in ${opts.skillPath}: "${opts.endMarker}"`);
|
||||
section = content.slice(startIdx, endIdx);
|
||||
} else {
|
||||
section = content.slice(startIdx);
|
||||
}
|
||||
|
||||
const scores = await callJudge<JudgeScore>(`You are evaluating the quality of ${opts.judgeContext} for an AI coding agent.
|
||||
|
||||
The agent reads this document to learn ${opts.judgeGoal}. It references external tools and files
|
||||
that are documented separately — do NOT penalize for missing external definitions.
|
||||
|
||||
Rate on three dimensions (1-5 scale):
|
||||
- **clarity** (1-5): Can an agent follow the instructions without ambiguity?
|
||||
- **completeness** (1-5): Are all steps, decision points, and outputs well-defined?
|
||||
- **actionability** (1-5): Can an agent execute this workflow and produce the expected deliverables?
|
||||
|
||||
Respond with ONLY valid JSON:
|
||||
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}
|
||||
|
||||
Here is the document to evaluate:
|
||||
|
||||
${section}`);
|
||||
|
||||
console.log(`${opts.testName} scores:`, JSON.stringify(scores, null, 2));
|
||||
|
||||
evalCollector?.addTest({
|
||||
name: opts.testName,
|
||||
suite: opts.suite,
|
||||
tier: 'llm-judge',
|
||||
passed: scores.clarity >= thresholds.clarity && scores.completeness >= thresholds.completeness && scores.actionability >= thresholds.actionability,
|
||||
duration_ms: Date.now() - t0,
|
||||
cost_usd: 0.02,
|
||||
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
||||
judge_reasoning: scores.reasoning,
|
||||
});
|
||||
|
||||
expect(scores.clarity).toBeGreaterThanOrEqual(thresholds.clarity);
|
||||
expect(scores.completeness).toBeGreaterThanOrEqual(thresholds.completeness);
|
||||
expect(scores.actionability).toBeGreaterThanOrEqual(thresholds.actionability);
|
||||
}
|
||||
|
||||
// Block 1: Ship & Release skills
|
||||
describeIfSelected('Ship & Release skill evals', ['ship/SKILL.md workflow', 'document-release/SKILL.md workflow'], () => {
|
||||
testIfSelected('ship/SKILL.md workflow', async () => {
|
||||
await runWorkflowJudge({
|
||||
testName: 'ship/SKILL.md workflow',
|
||||
suite: 'Ship & Release skill evals',
|
||||
skillPath: 'ship/SKILL.md',
|
||||
startMarker: '# Ship:',
|
||||
endMarker: '## Important Rules',
|
||||
judgeContext: 'a ship/release workflow document',
|
||||
judgeGoal: 'how to create a PR: merge base branch, run tests, review diff, bump version, update changelog, push, and open PR',
|
||||
});
|
||||
}, 30_000);
|
||||
|
||||
testIfSelected('document-release/SKILL.md workflow', async () => {
|
||||
await runWorkflowJudge({
|
||||
testName: 'document-release/SKILL.md workflow',
|
||||
suite: 'Ship & Release skill evals',
|
||||
skillPath: 'document-release/SKILL.md',
|
||||
startMarker: '# Document Release:',
|
||||
endMarker: '## Important Rules',
|
||||
judgeContext: 'a post-ship documentation update workflow',
|
||||
judgeGoal: 'how to audit and update project documentation after code ships: README, ARCHITECTURE, CONTRIBUTING, CLAUDE.md, CHANGELOG, TODOS',
|
||||
});
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Block 2: Plan Review skills
|
||||
describeIfSelected('Plan Review skill evals', [
|
||||
'plan-ceo-review/SKILL.md modes', 'plan-eng-review/SKILL.md sections', 'plan-design-review/SKILL.md passes',
|
||||
], () => {
|
||||
testIfSelected('plan-ceo-review/SKILL.md modes', async () => {
|
||||
await runWorkflowJudge({
|
||||
testName: 'plan-ceo-review/SKILL.md modes',
|
||||
suite: 'Plan Review skill evals',
|
||||
skillPath: 'plan-ceo-review/SKILL.md',
|
||||
startMarker: '## Step 0: Nuclear Scope Challenge',
|
||||
endMarker: '## Review Sections',
|
||||
judgeContext: 'a CEO/founder plan review framework with 4 scope modes',
|
||||
judgeGoal: 'how to conduct a CEO-perspective plan review: challenge scope, select a mode (Expansion, Selective Expansion, Hold Scope, Reduction), then review sections interactively',
|
||||
});
|
||||
}, 30_000);
|
||||
|
||||
testIfSelected('plan-eng-review/SKILL.md sections', async () => {
|
||||
await runWorkflowJudge({
|
||||
testName: 'plan-eng-review/SKILL.md sections',
|
||||
suite: 'Plan Review skill evals',
|
||||
skillPath: 'plan-eng-review/SKILL.md',
|
||||
startMarker: '## BEFORE YOU START:',
|
||||
endMarker: '## CRITICAL RULE',
|
||||
judgeContext: 'an engineering plan review framework with 4 review sections',
|
||||
judgeGoal: 'how to review a plan for architecture quality, code quality, test coverage, and performance — walking through each section interactively with AskUserQuestion',
|
||||
});
|
||||
}, 30_000);
|
||||
|
||||
testIfSelected('plan-design-review/SKILL.md passes', async () => {
|
||||
await runWorkflowJudge({
|
||||
testName: 'plan-design-review/SKILL.md passes',
|
||||
suite: 'Plan Review skill evals',
|
||||
skillPath: 'plan-design-review/SKILL.md',
|
||||
startMarker: '## Review Sections',
|
||||
endMarker: '## CRITICAL RULE',
|
||||
judgeContext: 'a design plan review framework with 7 review passes',
|
||||
judgeGoal: 'how to review a plan for design quality using a 0-10 rating method: rate each dimension, explain what a 10 looks like, edit the plan to fix gaps, then re-rate',
|
||||
});
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Block 3: Design skills
|
||||
describeIfSelected('Design skill evals', ['design-review/SKILL.md fix loop', 'design-consultation/SKILL.md research'], () => {
|
||||
testIfSelected('design-review/SKILL.md fix loop', async () => {
|
||||
await runWorkflowJudge({
|
||||
testName: 'design-review/SKILL.md fix loop',
|
||||
suite: 'Design skill evals',
|
||||
skillPath: 'design-review/SKILL.md',
|
||||
startMarker: '## Phase 7:',
|
||||
endMarker: '## Additional Rules',
|
||||
judgeContext: 'a design audit triage and fix loop workflow',
|
||||
judgeGoal: 'how to triage design issues by severity, fix them atomically in source code, commit each fix, and re-verify with before/after screenshots',
|
||||
});
|
||||
}, 30_000);
|
||||
|
||||
testIfSelected('design-consultation/SKILL.md research', async () => {
|
||||
await runWorkflowJudge({
|
||||
testName: 'design-consultation/SKILL.md research',
|
||||
suite: 'Design skill evals',
|
||||
skillPath: 'design-consultation/SKILL.md',
|
||||
startMarker: '## Phase 1:',
|
||||
endMarker: '## Phase 4:',
|
||||
judgeContext: 'a design consultation research and proposal workflow',
|
||||
judgeGoal: 'how to gather product context, research the competitive landscape, and produce a complete design system proposal with typography, color, spacing, and motion specifications',
|
||||
});
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Block 4: Other skills
|
||||
describeIfSelected('Other skill evals', [
|
||||
'retro/SKILL.md instructions', 'qa-only/SKILL.md workflow', 'gstack-upgrade/SKILL.md upgrade flow',
|
||||
], () => {
|
||||
testIfSelected('retro/SKILL.md instructions', async () => {
|
||||
await runWorkflowJudge({
|
||||
testName: 'retro/SKILL.md instructions',
|
||||
suite: 'Other skill evals',
|
||||
skillPath: 'retro/SKILL.md',
|
||||
startMarker: '## Instructions',
|
||||
endMarker: '## Compare Mode',
|
||||
judgeContext: 'an engineering retrospective data gathering and analysis workflow',
|
||||
judgeGoal: 'how to gather git metrics (commit history, test counts, work patterns), analyze them, produce a structured retro report with praise, growth areas, and trend tracking',
|
||||
});
|
||||
}, 30_000);
|
||||
|
||||
testIfSelected('qa-only/SKILL.md workflow', async () => {
|
||||
await runWorkflowJudge({
|
||||
testName: 'qa-only/SKILL.md workflow',
|
||||
suite: 'Other skill evals',
|
||||
skillPath: 'qa-only/SKILL.md',
|
||||
startMarker: '## Workflow',
|
||||
endMarker: '## Important Rules',
|
||||
judgeContext: 'a report-only QA testing workflow',
|
||||
judgeGoal: 'how to systematically QA test a web application and produce a structured report with health score, screenshots, and repro steps — without fixing anything',
|
||||
});
|
||||
}, 30_000);
|
||||
|
||||
testIfSelected('gstack-upgrade/SKILL.md upgrade flow', async () => {
|
||||
await runWorkflowJudge({
|
||||
testName: 'gstack-upgrade/SKILL.md upgrade flow',
|
||||
suite: 'Other skill evals',
|
||||
skillPath: 'gstack-upgrade/SKILL.md',
|
||||
startMarker: '## Inline upgrade flow',
|
||||
endMarker: '## Standalone usage',
|
||||
judgeContext: 'a version upgrade detection and execution workflow',
|
||||
judgeGoal: 'how to detect install type, compare versions, back up current install, upgrade via git or fresh clone, run setup, and show what changed',
|
||||
});
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
if (evalCollector) {
|
||||
|
||||
@@ -72,15 +72,29 @@ describe('SKILL.md command validation', () => {
|
||||
expect(result.snapshotFlagErrors).toHaveLength(0);
|
||||
});
|
||||
|
||||
test('all $B commands in qa-design-review/SKILL.md are valid browse commands', () => {
|
||||
const skill = path.join(ROOT, 'qa-design-review', 'SKILL.md');
|
||||
test('all $B commands in design-review/SKILL.md are valid browse commands', () => {
|
||||
const skill = path.join(ROOT, 'design-review', 'SKILL.md');
|
||||
if (!fs.existsSync(skill)) return;
|
||||
const result = validateSkill(skill);
|
||||
expect(result.invalid).toHaveLength(0);
|
||||
});
|
||||
|
||||
test('all snapshot flags in qa-design-review/SKILL.md are valid', () => {
|
||||
const skill = path.join(ROOT, 'qa-design-review', 'SKILL.md');
|
||||
test('all snapshot flags in design-review/SKILL.md are valid', () => {
|
||||
const skill = path.join(ROOT, 'design-review', 'SKILL.md');
|
||||
if (!fs.existsSync(skill)) return;
|
||||
const result = validateSkill(skill);
|
||||
expect(result.snapshotFlagErrors).toHaveLength(0);
|
||||
});
|
||||
|
||||
test('all $B commands in design-consultation/SKILL.md are valid browse commands', () => {
|
||||
const skill = path.join(ROOT, 'design-consultation', 'SKILL.md');
|
||||
if (!fs.existsSync(skill)) return;
|
||||
const result = validateSkill(skill);
|
||||
expect(result.invalid).toHaveLength(0);
|
||||
});
|
||||
|
||||
test('all snapshot flags in design-consultation/SKILL.md are valid', () => {
|
||||
const skill = path.join(ROOT, 'design-consultation', 'SKILL.md');
|
||||
if (!fs.existsSync(skill)) return;
|
||||
const result = validateSkill(skill);
|
||||
expect(result.snapshotFlagErrors).toHaveLength(0);
|
||||
@@ -205,7 +219,7 @@ describe('Update check preamble', () => {
|
||||
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
|
||||
'retro/SKILL.md',
|
||||
'plan-design-review/SKILL.md',
|
||||
'qa-design-review/SKILL.md',
|
||||
'design-review/SKILL.md',
|
||||
'design-consultation/SKILL.md',
|
||||
'document-release/SKILL.md',
|
||||
];
|
||||
@@ -430,6 +444,8 @@ describe('No hardcoded branch names in SKILL templates', () => {
|
||||
'plan-ceo-review/SKILL.md.tmpl',
|
||||
'retro/SKILL.md.tmpl',
|
||||
'document-release/SKILL.md.tmpl',
|
||||
'plan-eng-review/SKILL.md.tmpl',
|
||||
'plan-design-review/SKILL.md.tmpl',
|
||||
];
|
||||
|
||||
// Patterns that indicate hardcoded 'main' in git commands
|
||||
@@ -513,7 +529,7 @@ describe('v0.4.1 preamble features', () => {
|
||||
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
|
||||
'retro/SKILL.md',
|
||||
'plan-design-review/SKILL.md',
|
||||
'qa-design-review/SKILL.md',
|
||||
'design-review/SKILL.md',
|
||||
'design-consultation/SKILL.md',
|
||||
'document-release/SKILL.md',
|
||||
];
|
||||
@@ -543,6 +559,10 @@ describe('Contributor mode preamble structure', () => {
|
||||
'ship/SKILL.md', 'review/SKILL.md',
|
||||
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
|
||||
'retro/SKILL.md',
|
||||
'plan-design-review/SKILL.md',
|
||||
'design-review/SKILL.md',
|
||||
'design-consultation/SKILL.md',
|
||||
'document-release/SKILL.md',
|
||||
];
|
||||
|
||||
for (const skill of skillsWithPreamble) {
|
||||
@@ -628,7 +648,7 @@ describe('Completeness Principle in generated SKILL.md files', () => {
|
||||
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
|
||||
'retro/SKILL.md',
|
||||
'plan-design-review/SKILL.md',
|
||||
'qa-design-review/SKILL.md',
|
||||
'design-review/SKILL.md',
|
||||
'design-consultation/SKILL.md',
|
||||
'document-release/SKILL.md',
|
||||
];
|
||||
@@ -801,8 +821,8 @@ describe('Test Bootstrap ({{TEST_BOOTSTRAP}}) integration', () => {
|
||||
expect(content).toContain('Step 2.5');
|
||||
});
|
||||
|
||||
test('TEST_BOOTSTRAP appears in qa-design-review/SKILL.md', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8');
|
||||
test('TEST_BOOTSTRAP appears in design-review/SKILL.md', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8');
|
||||
expect(content).toContain('Test Framework Bootstrap');
|
||||
});
|
||||
|
||||
@@ -843,10 +863,10 @@ describe('Test Bootstrap ({{TEST_BOOTSTRAP}}) integration', () => {
|
||||
expect(content).toContain('100% test coverage');
|
||||
});
|
||||
|
||||
test('WebSearch is in allowed-tools for qa, ship, qa-design-review', () => {
|
||||
test('WebSearch is in allowed-tools for qa, ship, design-review', () => {
|
||||
const qa = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
|
||||
const ship = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
|
||||
const qaDesign = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8');
|
||||
const qaDesign = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8');
|
||||
expect(qa).toContain('WebSearch');
|
||||
expect(ship).toContain('WebSearch');
|
||||
expect(qaDesign).toContain('WebSearch');
|
||||
@@ -869,8 +889,8 @@ describe('Phase 8e.5 regression test generation', () => {
|
||||
expect(content).not.toContain('Never modify tests or CI configuration');
|
||||
});
|
||||
|
||||
test('qa-design-review has CSS-aware Phase 8e.5 variant', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8');
|
||||
test('design-review has CSS-aware Phase 8e.5 variant', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8');
|
||||
expect(content).toContain('8e.5. Regression Test (design-review variant)');
|
||||
expect(content).toContain('CSS-only');
|
||||
expect(content).toContain('test(design): regression test');
|
||||
|
||||
@@ -66,7 +66,7 @@ describe('selectTests', () => {
|
||||
expect(result.selected).toContain('browse-snapshot');
|
||||
expect(result.selected).toContain('qa-quick');
|
||||
expect(result.selected).toContain('qa-fix-loop');
|
||||
expect(result.selected).toContain('qa-design-review-fix');
|
||||
expect(result.selected).toContain('design-review-fix');
|
||||
expect(result.reason).toBe('diff');
|
||||
// Should NOT include unrelated tests
|
||||
expect(result.selected).not.toContain('plan-ceo-review');
|
||||
|
||||
Reference in New Issue
Block a user