mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
feat: adversarial spec review loop + skill chaining (v0.9.1.0) (#249)
* feat: add {{SPEC_REVIEW_LOOP}}, {{DESIGN_SKETCH}}, benefits-from resolvers
Three new resolvers in gen-skill-docs.ts:
- {{SPEC_REVIEW_LOOP}}: adversarial subagent reviews documents on 5
dimensions (completeness, consistency, clarity, scope, feasibility)
with convergence guard, quality score, and JSONL metrics
- {{DESIGN_SKETCH}}: generates rough HTML wireframes for UI ideas using
DESIGN.md constraints and design principles, renders via $B
- {{BENEFITS_FROM}}: parses benefits-from frontmatter and generates
skill chaining offer prose (one-hop-max, never blocks)
Also extends TemplateContext with benefitsFrom field and adds inline
YAML frontmatter parsing for the new field.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
* feat: /office-hours spec review loop + visual sketch phases
- Phase 4.5 ({{DESIGN_SKETCH}}): for UI ideas, generates rough HTML
wireframe using design principles from {{DESIGN_METHODOLOGY}} and
DESIGN.md, renders via $B, presents screenshot for iteration
- Phase 5.5 ({{SPEC_REVIEW_LOOP}}): adversarial subagent reviews the
design doc before user sees it — catches gaps in completeness,
consistency, clarity, scope, and feasibility
- Adds {{BROWSE_SETUP}} for $B availability in sketch phase
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
* feat: skill chaining — plan reviews offer /office-hours
- plan-ceo-review: benefits-from office-hours, offers /office-hours when
no design doc found, mid-session detection when user seems lost,
spec review loop on CEO plan documents
- plan-eng-review: benefits-from office-hours, offers /office-hours when
no design doc found
- One-hop-max chaining: never blocks, max one offer per session
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
* test: add validation + E2E tests for spec review, sketch, benefits-from
Unit tests (32 new assertions):
- SPEC_REVIEW_LOOP: 5 dimensions, Agent dispatch, 3 iterations, quality
score, metrics path, convergence guard, graceful failure
- DESIGN_SKETCH: DESIGN.md awareness, wireframe, $B goto/screenshot,
rough aesthetic, skip conditions
- BENEFITS_FROM: prerequisite offer in CEO + eng review, graceful
decline, skills without benefits-from don't get offer
- office-hours structure: spec review loop, adversarial dimensions,
visual sketch section
E2E tests (2 new):
- office-hours-spec-review: verifies agent understands the spec review
loop from SKILL.md
- plan-ceo-review-benefits: verifies agent understands the skill
chaining offer
Touchfiles updated for diff-based test selection.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
* chore: bump version and changelog (v0.9.1.0)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---------
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -416,6 +416,98 @@ describe('REVIEW_DASHBOARD resolver', () => {
|
||||
});
|
||||
});
|
||||
|
||||
// --- {{SPEC_REVIEW_LOOP}} resolver tests ---
|
||||
|
||||
describe('SPEC_REVIEW_LOOP resolver', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'office-hours', 'SKILL.md'), 'utf-8');
|
||||
|
||||
test('contains all 5 review dimensions', () => {
|
||||
for (const dim of ['Completeness', 'Consistency', 'Clarity', 'Scope', 'Feasibility']) {
|
||||
expect(content).toContain(dim);
|
||||
}
|
||||
});
|
||||
|
||||
test('references Agent tool for subagent dispatch', () => {
|
||||
expect(content).toMatch(/Agent.*tool/i);
|
||||
});
|
||||
|
||||
test('specifies max 3 iterations', () => {
|
||||
expect(content).toMatch(/3.*iteration|maximum.*3/i);
|
||||
});
|
||||
|
||||
test('includes quality score', () => {
|
||||
expect(content).toContain('quality score');
|
||||
});
|
||||
|
||||
test('includes metrics path', () => {
|
||||
expect(content).toContain('spec-review.jsonl');
|
||||
});
|
||||
|
||||
test('includes convergence guard', () => {
|
||||
expect(content).toMatch(/[Cc]onvergence/);
|
||||
});
|
||||
|
||||
test('includes graceful failure handling', () => {
|
||||
expect(content).toMatch(/skip.*review|unavailable/i);
|
||||
});
|
||||
});
|
||||
|
||||
// --- {{DESIGN_SKETCH}} resolver tests ---
|
||||
|
||||
describe('DESIGN_SKETCH resolver', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'office-hours', 'SKILL.md'), 'utf-8');
|
||||
|
||||
test('references DESIGN.md for design system constraints', () => {
|
||||
expect(content).toContain('DESIGN.md');
|
||||
});
|
||||
|
||||
test('contains wireframe or sketch terminology', () => {
|
||||
expect(content).toMatch(/wireframe|sketch/i);
|
||||
});
|
||||
|
||||
test('references browse binary for rendering', () => {
|
||||
expect(content).toContain('$B goto');
|
||||
});
|
||||
|
||||
test('references screenshot capture', () => {
|
||||
expect(content).toContain('$B screenshot');
|
||||
});
|
||||
|
||||
test('specifies rough aesthetic', () => {
|
||||
expect(content).toMatch(/[Rr]ough|hand-drawn/);
|
||||
});
|
||||
|
||||
test('includes skip conditions', () => {
|
||||
expect(content).toMatch(/no UI component|skip/i);
|
||||
});
|
||||
});
|
||||
|
||||
// --- {{BENEFITS_FROM}} resolver tests ---
|
||||
|
||||
describe('BENEFITS_FROM resolver', () => {
|
||||
const ceoContent = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8');
|
||||
const engContent = fs.readFileSync(path.join(ROOT, 'plan-eng-review', 'SKILL.md'), 'utf-8');
|
||||
|
||||
test('plan-ceo-review contains prerequisite skill offer', () => {
|
||||
expect(ceoContent).toContain('Prerequisite Skill Offer');
|
||||
expect(ceoContent).toContain('/office-hours');
|
||||
});
|
||||
|
||||
test('plan-eng-review contains prerequisite skill offer', () => {
|
||||
expect(engContent).toContain('Prerequisite Skill Offer');
|
||||
expect(engContent).toContain('/office-hours');
|
||||
});
|
||||
|
||||
test('offer includes graceful decline', () => {
|
||||
expect(ceoContent).toContain('No worries');
|
||||
});
|
||||
|
||||
test('skills without benefits-from do NOT have prerequisite offer', () => {
|
||||
const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
|
||||
expect(qaContent).not.toContain('Prerequisite Skill Offer');
|
||||
});
|
||||
});
|
||||
|
||||
// ─── Codex Generation Tests ─────────────────────────────────
|
||||
|
||||
describe('Codex generation (--host codex)', () => {
|
||||
|
||||
@@ -57,9 +57,13 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'review-base-branch': ['review/**'],
|
||||
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
|
||||
|
||||
// Office Hours
|
||||
'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Plan reviews
|
||||
'plan-ceo-review': ['plan-ceo-review/**'],
|
||||
'plan-ceo-review-selective': ['plan-ceo-review/**'],
|
||||
'plan-ceo-review-benefits': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
'plan-eng-review': ['plan-eng-review/**'],
|
||||
'plan-eng-review-artifact': ['plan-eng-review/**'],
|
||||
|
||||
@@ -140,6 +144,10 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
|
||||
'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
|
||||
'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
|
||||
|
||||
// Office Hours
|
||||
'office-hours/SKILL.md spec review': ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'office-hours/SKILL.md design sketch': ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Other skills
|
||||
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
|
||||
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
|
||||
|
||||
@@ -2911,6 +2911,128 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
// --- Office Hours Spec Review E2E ---
|
||||
|
||||
describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'], () => {
|
||||
let ohDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
ohDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-oh-spec-'));
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: ohDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
run('git', ['init', '-b', 'main']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
fs.writeFileSync(path.join(ohDir, 'README.md'), '# Test Project\n');
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'init']);
|
||||
|
||||
// Copy office-hours skill
|
||||
fs.mkdirSync(path.join(ohDir, 'office-hours'), { recursive: true });
|
||||
fs.copyFileSync(
|
||||
path.join(ROOT, 'office-hours', 'SKILL.md'),
|
||||
path.join(ohDir, 'office-hours', 'SKILL.md'),
|
||||
);
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/office-hours SKILL.md contains spec review loop', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.
|
||||
|
||||
Summarize what the "Spec Review Loop" section does — specifically:
|
||||
1. How many dimensions does the reviewer check?
|
||||
2. What tool is used to dispatch the reviewer?
|
||||
3. What's the maximum number of iterations?
|
||||
4. What metrics are tracked?
|
||||
|
||||
Write your summary to ${ohDir}/spec-review-summary.md`,
|
||||
workingDirectory: ohDir,
|
||||
maxTurns: 8,
|
||||
timeout: 120_000,
|
||||
testName: 'office-hours-spec-review',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/office-hours spec review', result);
|
||||
recordE2E('/office-hours-spec-review', 'Office Hours Spec Review E2E', result);
|
||||
expect(result.exitReason).toBe('success');
|
||||
|
||||
const summaryPath = path.join(ohDir, 'spec-review-summary.md');
|
||||
if (fs.existsSync(summaryPath)) {
|
||||
const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
|
||||
// Verify the agent understood the key concepts
|
||||
expect(summary).toMatch(/5.*dimension|dimension.*5|completeness|consistency|clarity|scope|feasibility/);
|
||||
expect(summary).toMatch(/agent|subagent/);
|
||||
expect(summary).toMatch(/3.*iteration|iteration.*3|maximum.*3/);
|
||||
}
|
||||
}, 180_000);
|
||||
});
|
||||
|
||||
// --- Plan CEO Review Benefits-From E2E ---
|
||||
|
||||
describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefits'], () => {
|
||||
let benefitsDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
benefitsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-benefits-'));
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: benefitsDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
run('git', ['init', '-b', 'main']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
fs.writeFileSync(path.join(benefitsDir, 'README.md'), '# Test Project\n');
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'init']);
|
||||
|
||||
// Copy plan-ceo-review skill
|
||||
fs.mkdirSync(path.join(benefitsDir, 'plan-ceo-review'), { recursive: true });
|
||||
fs.copyFileSync(
|
||||
path.join(ROOT, 'plan-ceo-review', 'SKILL.md'),
|
||||
path.join(benefitsDir, 'plan-ceo-review', 'SKILL.md'),
|
||||
);
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".
|
||||
|
||||
Summarize what happens when no design doc is found — specifically:
|
||||
1. Is /office-hours offered as a prerequisite?
|
||||
2. What options does the user get?
|
||||
3. Is there a mid-session detection for when the user seems lost?
|
||||
|
||||
Write your summary to ${benefitsDir}/benefits-summary.md`,
|
||||
workingDirectory: benefitsDir,
|
||||
maxTurns: 8,
|
||||
timeout: 120_000,
|
||||
testName: 'plan-ceo-review-benefits',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/plan-ceo-review benefits-from', result);
|
||||
recordE2E('/plan-ceo-review-benefits', 'Plan CEO Review Benefits-From E2E', result);
|
||||
expect(result.exitReason).toBe('success');
|
||||
|
||||
const summaryPath = path.join(benefitsDir, 'benefits-summary.md');
|
||||
if (fs.existsSync(summaryPath)) {
|
||||
const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
|
||||
// Verify the agent understood the skill chaining
|
||||
expect(summary).toMatch(/office.hours/);
|
||||
expect(summary).toMatch(/design doc|no design/i);
|
||||
}
|
||||
}, 180_000);
|
||||
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
if (evalCollector) {
|
||||
|
||||
@@ -644,6 +644,59 @@ describe('office-hours skill structure', () => {
|
||||
test('contains builder operating principles', () => {
|
||||
expect(content).toContain('Delight is the currency');
|
||||
});
|
||||
|
||||
// Spec Review Loop (Phase 5.5)
|
||||
test('contains spec review loop', () => {
|
||||
expect(content).toContain('Spec Review Loop');
|
||||
});
|
||||
|
||||
test('contains adversarial review dimensions', () => {
|
||||
for (const dim of ['Completeness', 'Consistency', 'Clarity', 'Scope', 'Feasibility']) {
|
||||
expect(content).toContain(dim);
|
||||
}
|
||||
});
|
||||
|
||||
test('contains subagent dispatch instruction', () => {
|
||||
expect(content).toMatch(/Agent.*tool|subagent/i);
|
||||
});
|
||||
|
||||
test('contains max 3 iterations', () => {
|
||||
expect(content).toMatch(/3.*iteration|maximum.*3/i);
|
||||
});
|
||||
|
||||
test('contains quality score', () => {
|
||||
expect(content).toContain('quality score');
|
||||
});
|
||||
|
||||
test('contains spec review metrics path', () => {
|
||||
expect(content).toContain('spec-review.jsonl');
|
||||
});
|
||||
|
||||
test('contains convergence guard', () => {
|
||||
expect(content).toMatch(/convergence/i);
|
||||
});
|
||||
|
||||
// Visual Sketch (Phase 4.5)
|
||||
test('contains visual sketch section', () => {
|
||||
expect(content).toContain('Visual Sketch');
|
||||
});
|
||||
|
||||
test('contains wireframe generation', () => {
|
||||
expect(content).toMatch(/wireframe|sketch/i);
|
||||
});
|
||||
|
||||
test('contains DESIGN.md awareness', () => {
|
||||
expect(content).toContain('DESIGN.md');
|
||||
});
|
||||
|
||||
test('contains browse rendering', () => {
|
||||
expect(content).toContain('$B goto');
|
||||
expect(content).toContain('$B screenshot');
|
||||
});
|
||||
|
||||
test('contains rough aesthetic instruction', () => {
|
||||
expect(content).toMatch(/rough|hand-drawn/i);
|
||||
});
|
||||
});
|
||||
|
||||
describe('investigate skill structure', () => {
|
||||
@@ -856,6 +909,22 @@ describe('CEO review mode validation', () => {
|
||||
expect(content).toContain('HOLD SCOPE');
|
||||
expect(content).toContain('REDUCTION');
|
||||
});
|
||||
|
||||
// Skill chaining (benefits-from)
|
||||
test('contains prerequisite skill offer for office-hours', () => {
|
||||
expect(content).toContain('Prerequisite Skill Offer');
|
||||
expect(content).toContain('/office-hours');
|
||||
});
|
||||
|
||||
test('contains mid-session detection', () => {
|
||||
expect(content).toContain('Mid-session detection');
|
||||
expect(content).toMatch(/still figuring out|seems lost/i);
|
||||
});
|
||||
|
||||
// Spec review on CEO plans
|
||||
test('contains spec review loop for CEO plan documents', () => {
|
||||
expect(content).toContain('Spec Review Loop');
|
||||
});
|
||||
});
|
||||
|
||||
// --- gstack-slug helper ---
|
||||
|
||||
Reference in New Issue
Block a user