diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts
index 995648a1..41436d6a 100644
--- a/test/helpers/touchfiles.ts
+++ b/test/helpers/touchfiles.ts
@@ -90,6 +90,10 @@ export const E2E_TOUCHFILES: Record = {
 
   // gstack-upgrade
   'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
+
+  // Debug escalation
+  'qa-debug-prompt-logic': ['qa/**', 'debug/**'],
+  'qa-debug-escalation': ['qa/**', 'debug/**', 'browse/src/**'],
 };
 
 /**
@@ -123,6 +127,9 @@ export const LLM_JUDGE_TOUCHFILES: Record = {
   'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
   'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
   'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
+
+  // Debug escalation
+  'qa/SKILL.md debug escalation': ['qa/SKILL.md', 'qa/SKILL.md.tmpl', 'debug/SKILL.md', 'debug/SKILL.md.tmpl'],
 };
 
 /**
diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts
index 13539278..f02b6b6f 100644
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@@ -2841,6 +2841,69 @@ Output the diagram directly.`,
   }, 180_000);
 });
 
+// --- Debug escalation E2E tests ---
+
+describeIfSelected('Debug escalation', ['qa-debug-prompt-logic', 'qa-debug-escalation'], () => {
+  // Test A: Prompt-level deterministic — verify the template produces correct escalation behavior
+  testIfSelected('qa-debug-prompt-logic', async () => {
+    const result = await runSkillTest({
+      prompt: `You are in Phase 8g of the /qa workflow. The following has happened:
+- ISSUE-007: "Submit button does nothing on checkout page"
+  - Severity: critical
+  - URL: http://localhost:3000/checkout
+  - Fix attempt 1: Added click handler to button → caused JS error on payment page (reverted)
+  - Fix attempt 2: Fixed form action attribute → broke form validation (reverted)
+  - Files investigated: src/components/Checkout.tsx, src/pages/checkout.ts
+  - Console errors: "TypeError: Cannot read property 'submit' of null"
+
+Read qa/SKILL.md and follow Phase 8g exactly. Show the Agent prompt you would use to spawn the debug sub-agent. Do NOT actually spawn the agent — just output the prompt you would use.`,
+      maxTurns: 10,
+    });
+
+    recordE2E('qa-debug-prompt-logic', 'Debug escalation', result);
+
+    // Verify the output contains a well-formed structured brief
+    const output = result.output ?? '';
+    const hasIssueId = /ISSUE-007/i.test(output);
+    const hasSymptom = /submit.*button|does nothing|checkout/i.test(output);
+    const hasRepro = /localhost.*3000|checkout/i.test(output);
+    const hasFailedAttempts = /fix attempt|click handler|form action/i.test(output);
+    const hasFiles = /Checkout\.tsx|checkout\.ts/i.test(output);
+    const hasDebugSkillRef = /debug\/SKILL\.md/i.test(output);
+
+    console.log(`Has Issue ID: ${hasIssueId}`);
+    console.log(`Has Symptom: ${hasSymptom}`);
+    console.log(`Has Repro: ${hasRepro}`);
+    console.log(`Has Failed Attempts: ${hasFailedAttempts}`);
+    console.log(`Has Files: ${hasFiles}`);
+    console.log(`Has Debug Skill Ref: ${hasDebugSkillRef}`);
+
+    // The output should contain all the structured handoff fields
+    expect(hasIssueId).toBe(true);
+    expect(hasSymptom).toBe(true);
+    expect(hasRepro).toBe(true);
+    expect(hasFailedAttempts).toBe(true);
+    expect(hasFiles).toBe(true);
+    expect(hasDebugSkillRef).toBe(true);
+  }, 120_000);
+
+  // Test B: Full E2E with planted regression
+  // This test requires a fixture app with a deliberately hard-to-fix bug.
+  // The bug should resist at least 2 fix attempts to trigger escalation.
+  // TODO: Create fixture at browse/test/fixtures/qa-eval-debug-escalation/
+  testIfSelected('qa-debug-escalation', async () => {
+    // Skip until fixture is created — this is a placeholder for the full flow
+    console.log('SKIP: qa-debug-escalation — fixture not yet created');
+    console.log('TODO: Create browse/test/fixtures/qa-eval-debug-escalation/ with a deliberately hard-to-fix bug');
+    // When implemented, this test should:
+    // 1. Start test server serving the fixture
+    // 2. Run /qa against it
+    // 3. Verify fix attempts are made and reverted
+    // 4. Verify Phase 8g triggers (Agent tool call appears in transcript)
+    // 5. Verify debug report appears in QA output
+  }, 300_000);
+});
+
 // Module-level afterAll — finalize eval collector after all tests complete
 afterAll(async () => {
   if (evalCollector) {
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index 528d5115..058f52a0 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -668,6 +668,23 @@ describeIfSelected('Other skill evals', [
   }, 30_000);
 });
 
+// Block 5: Debug escalation
+describeIfSelected('Debug escalation evals', [
+  'qa/SKILL.md debug escalation',
+], () => {
+  testIfSelected('qa/SKILL.md debug escalation', async () => {
+    await runWorkflowJudge({
+      testName: 'qa/SKILL.md debug escalation',
+      suite: 'Debug escalation evals',
+      skillPath: 'qa/SKILL.md',
+      startMarker: '### 8g. Debug Escalation',
+      endMarker: '## Phase 9: Final QA',
+      judgeContext: 'a debug sub-agent escalation workflow within a QA fix loop',
+      judgeGoal: 'when and how to spawn a debug sub-agent for bugs that resisted multiple fix attempts, including structured bug brief format (issue ID, symptom, reproduction, failed fix attempts, files), result handling for all four statuses (DONE, DONE_WITH_CONCERNS, BLOCKED, agent failure), working tree cleanup, and sequential processing',
+    });
+  }, 30_000);
+});
+
 // Module-level afterAll — finalize eval collector after all tests complete
 afterAll(async () => {
   if (evalCollector) {
diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts
index e63a4b67..f02c27b6 100644
--- a/test/skill-validation.test.ts
+++ b/test/skill-validation.test.ts
@@ -620,6 +620,74 @@ describe('debug skill structure', () => {
     'DEBUG REPORT', '3-strike', 'BLOCKED']) {
     test(`contains ${section}`, () => expect(content).toContain(section));
   }
+
+  test('has browse setup for visual reproduction', () => {
+    expect(content).toContain('Visual reproduction');
+    expect(content).toContain('$B goto');
+    expect(content).toContain('$B screenshot');
+  });
+
+  test('has visual verification in Phase 5', () => {
+    expect(content).toContain('Visual verification');
+    expect(content).toContain('debug-ISSUE-NNN-fixed');
+  });
+});
+
+// --- Debug sub-agent escalation validation ---
+
+describe('Debug sub-agent escalation', () => {
+  test('qa/SKILL.md has Agent in allowed-tools', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
+    // Check frontmatter allowed-tools section contains Agent
+    const frontmatter = content.split('---')[1];
+    expect(frontmatter).toContain('Agent');
+  });
+
+  test('qa/SKILL.md has Phase 8g Debug Escalation', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
+    expect(content).toContain('8g. Debug Escalation');
+    expect(content).toContain('reverted at least twice');
+    expect(content).toContain('Bug Brief');
+    expect(content).toContain('debug/SKILL.md');
+  });
+
+  test('qa/SKILL.md has structured handoff in debug prompt', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
+    expect(content).toContain('Issue ID');
+    expect(content).toContain('Symptom');
+    expect(content).toContain('Reproduction');
+    expect(content).toContain('Failed fix attempts');
+    expect(content).toContain('Files investigated');
+  });
+
+  test('qa/SKILL.md has all four agent result handlers', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
+    expect(content).toContain('DONE status');
+    expect(content).toContain('DONE_WITH_CONCERNS');
+    expect(content).toContain('git checkout .');
+    expect(content).toContain('deferred (debug unavailable)');
+  });
+
+  test('qa/SKILL.md has debug escalation summary in Phase 10', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
+    expect(content).toContain('DEBUG ESCALATION');
+    expect(content).toContain('Issues escalated');
+    expect(content).toContain('Per-issue details');
+  });
+
+  test('review/SKILL.md has Step 5.7 pre-existing bug detection', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
+    expect(content).toContain('5.7: Pre-existing bug detection');
+    expect(content).toContain('/debug');
+    expect(content).toContain('pre-existing issue');
+  });
+
+  test('ship/SKILL.md has reverted QA commit detection', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
+    expect(content).toContain('Reverted QA fix detection');
+    expect(content).toContain('revert.*fix(qa)');
+    expect(content).toContain('/debug');
+  });
 });
 
 // --- Contributor mode preamble structure validation ---
diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts
index 48613d64..84a4a5a8 100644
--- a/test/touchfiles.test.ts
+++ b/test/touchfiles.test.ts
@@ -122,7 +122,8 @@ describe('selectTests', () => {
     const result = selectTests(['qa/SKILL.md'], LLM_JUDGE_TOUCHFILES);
     expect(result.selected).toContain('qa/SKILL.md workflow');
     expect(result.selected).toContain('qa/SKILL.md health rubric');
-    expect(result.selected.length).toBe(2);
+    expect(result.selected).toContain('qa/SKILL.md debug escalation');
+    expect(result.selected.length).toBe(3);
   });
 
   test('SKILL.md.tmpl root template only selects root-dependent tests', () => {