mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 11:45:20 +02:00
feat: add debug escalation tests (validation + LLM judge + E2E)
Skill validation: 11 new assertions covering Phase 8g trigger, structured handoff fields, agent result handlers, debug escalation summary, Step 5.7 recommendation, ship reverted QA detection, and debug browse setup. LLM judge: evaluates Phase 8g template quality — structured brief format, result handling, working tree cleanup, sequential processing. E2E: prompt-level deterministic test (verifies escalation prompt has all required fields) + full flow stub (fixture TODO for planted regression). Touchfile entries for diff-based test selection. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -90,6 +90,10 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
|
||||
// gstack-upgrade
|
||||
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
|
||||
|
||||
// Debug escalation
|
||||
'qa-debug-prompt-logic': ['qa/**', 'debug/**'],
|
||||
'qa-debug-escalation': ['qa/**', 'debug/**', 'browse/src/**'],
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -123,6 +127,9 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
|
||||
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
|
||||
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
|
||||
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
|
||||
|
||||
// Debug escalation
|
||||
'qa/SKILL.md debug escalation': ['qa/SKILL.md', 'qa/SKILL.md.tmpl', 'debug/SKILL.md', 'debug/SKILL.md.tmpl'],
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
@@ -2841,6 +2841,67 @@ Output the diagram directly.`,
|
||||
}, 180_000);
|
||||
});
|
||||
|
||||
// --- Debug escalation E2E tests ---
|
||||
|
||||
describeIfSelected('Debug escalation', ['qa-debug-prompt-logic', 'qa-debug-escalation'], () => {
  // Test A: Prompt-level deterministic check.
  // Describes a Phase 8g scenario (one issue, two reverted fix attempts) and asks
  // for the debug sub-agent prompt to be printed rather than spawned, then checks
  // that the printed brief carries every structured handoff field.
  testIfSelected('qa-debug-prompt-logic', async () => {
    const result = await runSkillTest({
      prompt: `You are in Phase 8g of the /qa workflow. The following has happened:
- ISSUE-007: "Submit button does nothing on checkout page"
- Severity: critical
- URL: http://localhost:3000/checkout
- Fix attempt 1: Added click handler to button → caused JS error on payment page (reverted)
- Fix attempt 2: Fixed form action attribute → broke form validation (reverted)
- Files investigated: src/components/Checkout.tsx, src/pages/checkout.ts
- Console errors: "TypeError: Cannot read property 'submit' of null"

Read qa/SKILL.md and follow Phase 8g exactly. Show the Agent prompt you would use to spawn the debug sub-agent. Do NOT actually spawn the agent — just output the prompt you would use.`,
      maxTurns: 10,
    });

    recordE2E('qa-debug-prompt-logic', 'Debug escalation', result);

    // A missing output is treated as an empty string so every check below fails
    // cleanly instead of throwing.
    const output = result.output ?? '';

    // One entry per structured handoff field: [log label, pattern that proves presence].
    const handoffChecks: ReadonlyArray<readonly [string, RegExp]> = [
      ['Issue ID', /ISSUE-007/i],
      ['Symptom', /submit.*button|does nothing|checkout/i],
      ['Repro', /localhost.*3000|checkout/i],
      ['Failed Attempts', /fix attempt|click handler|form action/i],
      ['Files', /Checkout\.tsx|checkout\.ts/i],
      ['Debug Skill Ref', /debug\/SKILL\.md/i],
    ];

    // Log every field before asserting so a failing run still shows the full picture.
    const missingFields: string[] = [];
    for (const [label, pattern] of handoffChecks) {
      const present = pattern.test(output);
      console.log(`Has ${label}: ${present}`);
      if (!present) missingFields.push(label);
    }

    // The output should contain all the structured handoff fields. Repro and Files
    // were previously computed and logged but never asserted; they are enforced here,
    // and the failure message now names whichever fields are absent.
    expect(missingFields).toEqual([]);
  }, 120_000);

  // Test B: Full E2E with planted regression
  // This test requires a fixture app with a deliberately hard-to-fix bug.
  // The bug should resist at least 2 fix attempts to trigger escalation.
  // TODO: Create fixture at browse/test/fixtures/qa-eval-debug-escalation/
  // NOTE(review): the body below only logs, so when selected this test is reported
  // as a pass without verifying anything. Switch to the runner's skip mechanism
  // once it is confirmed how testIfSelected exposes one.
  testIfSelected('qa-debug-escalation', async () => {
    // Skip until fixture is created — this is a placeholder for the full flow
    console.log('SKIP: qa-debug-escalation — fixture not yet created');
    console.log('TODO: Create browse/test/fixtures/qa-eval-debug-escalation/ with a deliberately hard-to-fix bug');
    // When implemented, this test should:
    // 1. Start test server serving the fixture
    // 2. Run /qa against it
    // 3. Verify fix attempts are made and reverted
    // 4. Verify Phase 8g triggers (Agent tool call appears in transcript)
    // 5. Verify debug report appears in QA output
  }, 300_000);
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
if (evalCollector) {
|
||||
|
||||
@@ -668,6 +668,23 @@ describeIfSelected('Other skill evals', [
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Block 5: Debug escalation
|
||||
describeIfSelected('Debug escalation evals', ['qa/SKILL.md debug escalation'], () => {
  // The same identifier is used as the selectable test name and as the judge's testName.
  const judgeTestName = 'qa/SKILL.md debug escalation';

  // Hands the Phase 8g section of qa/SKILL.md to the workflow judge.
  // NOTE(review): presumably startMarker/endMarker delimit the excerpt that gets
  // judged — confirm against runWorkflowJudge.
  testIfSelected(
    judgeTestName,
    async () => {
      await runWorkflowJudge({
        testName: judgeTestName,
        suite: 'Debug escalation evals',
        skillPath: 'qa/SKILL.md',
        startMarker: '### 8g. Debug Escalation',
        endMarker: '## Phase 9: Final QA',
        judgeContext: 'a debug sub-agent escalation workflow within a QA fix loop',
        judgeGoal: 'when and how to spawn a debug sub-agent for bugs that resisted multiple fix attempts, including structured bug brief format (issue ID, symptom, reproduction, failed fix attempts, files), result handling for all four statuses (DONE, DONE_WITH_CONCERNS, BLOCKED, agent failure), working tree cleanup, and sequential processing',
      });
    },
    30_000,
  );
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
if (evalCollector) {
|
||||
|
||||
@@ -620,6 +620,74 @@ describe('debug skill structure', () => {
|
||||
'DEBUG REPORT', '3-strike', 'BLOCKED']) {
|
||||
test(`contains ${section}`, () => expect(content).toContain(section));
|
||||
}
|
||||
|
||||
test('has browse setup for visual reproduction', () => {
|
||||
expect(content).toContain('Visual reproduction');
|
||||
expect(content).toContain('$B goto');
|
||||
expect(content).toContain('$B screenshot');
|
||||
});
|
||||
|
||||
test('has visual verification in Phase 5', () => {
|
||||
expect(content).toContain('Visual verification');
|
||||
expect(content).toContain('debug-ISSUE-NNN-fixed');
|
||||
});
|
||||
});
|
||||
|
||||
// --- Debug sub-agent escalation validation ---
|
||||
|
||||
describe('Debug sub-agent escalation', () => {
  /** Reads `<skill>/SKILL.md` under ROOT as UTF-8 text; called fresh inside each test. */
  const loadSkill = (skill: string): string =>
    fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8');

  test('qa/SKILL.md has Agent in allowed-tools', () => {
    // The segment between the first two '---' delimiters is treated as the
    // frontmatter; the check looks for 'Agent' anywhere inside that segment.
    const [, frontmatter] = loadSkill('qa').split('---');
    expect(frontmatter).toContain('Agent');
  });

  // Each case names a skill file and the literal phrases it must contain.
  // Phrases are matched as plain substrings, so 'revert.*fix(qa)' must appear
  // verbatim in ship/SKILL.md — it is not evaluated as a regular expression.
  const phraseCases: ReadonlyArray<{
    title: string;
    skill: string;
    phrases: readonly string[];
  }> = [
    {
      title: 'qa/SKILL.md has Phase 8g Debug Escalation',
      skill: 'qa',
      phrases: ['8g. Debug Escalation', 'reverted at least twice', 'Bug Brief', 'debug/SKILL.md'],
    },
    {
      title: 'qa/SKILL.md has structured handoff in debug prompt',
      skill: 'qa',
      phrases: ['Issue ID', 'Symptom', 'Reproduction', 'Failed fix attempts', 'Files investigated'],
    },
    {
      title: 'qa/SKILL.md has all four agent result handlers',
      skill: 'qa',
      phrases: ['DONE status', 'DONE_WITH_CONCERNS', 'git checkout .', 'deferred (debug unavailable)'],
    },
    {
      title: 'qa/SKILL.md has debug escalation summary in Phase 10',
      skill: 'qa',
      phrases: ['DEBUG ESCALATION', 'Issues escalated', 'Per-issue details'],
    },
    {
      title: 'review/SKILL.md has Step 5.7 pre-existing bug detection',
      skill: 'review',
      phrases: ['5.7: Pre-existing bug detection', '/debug', 'pre-existing issue'],
    },
    {
      title: 'ship/SKILL.md has reverted QA commit detection',
      skill: 'ship',
      phrases: ['Reverted QA fix detection', 'revert.*fix(qa)', '/debug'],
    },
  ];

  // Register one test per case, in declaration order; within a test the phrases
  // are checked in order and the first missing one fails the test.
  for (const { title, skill, phrases } of phraseCases) {
    test(title, () => {
      const content = loadSkill(skill);
      for (const phrase of phrases) {
        expect(content).toContain(phrase);
      }
    });
  }
});
|
||||
|
||||
// --- Contributor mode preamble structure validation ---
|
||||
|
||||
@@ -122,7 +122,8 @@ describe('selectTests', () => {
|
||||
const result = selectTests(['qa/SKILL.md'], LLM_JUDGE_TOUCHFILES);
|
||||
expect(result.selected).toContain('qa/SKILL.md workflow');
|
||||
expect(result.selected).toContain('qa/SKILL.md health rubric');
|
||||
expect(result.selected.length).toBe(2);
|
||||
expect(result.selected).toContain('qa/SKILL.md debug escalation');
|
||||
expect(result.selected.length).toBe(3);
|
||||
});
|
||||
|
||||
test('SKILL.md.tmpl root template only selects root-dependent tests', () => {
|
||||
|
||||
Reference in New Issue
Block a user