fix: /qa never refuses browser testing on backend-only changes (#202)

* feat: QA skill never refuses browser testing. Adds anti-refusal guardrails to the /qa and /qa-only skills. When the user invokes /qa, the skill must always use the browser — even if the diff shows only backend/config changes with no obvious UI surface. Falls back to Quick mode (homepage + top 5 nav targets) when no specific pages are identified from the diff. Adds an LLM-as-judge eval to verify the anti-refusal behavior. * chore: bump version and changelog (v0.8.1). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-01 19:25:10 +02:00 · 2026-03-19 00:31:26 -05:00
parent d85233017b
commit d961188276
8 changed files with 74 additions and 4 deletions
@@ -119,6 +119,7 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
  'regression vs baseline':           ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts', 'test/fixtures/eval-baselines.json'],
  'qa/SKILL.md workflow':             ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
  'qa/SKILL.md health rubric':        ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
+  'qa/SKILL.md anti-refusal':         ['qa/SKILL.md', 'qa/SKILL.md.tmpl', 'qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
  'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'],
  'baseline score pinning':           ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'],

@@ -256,7 +256,7 @@ Scores are 1-5 overall quality.`,

 // --- Part 7: QA skill quality evals (C6) ---

-describeIfSelected('QA skill quality evals', ['qa/SKILL.md workflow', 'qa/SKILL.md health rubric'], () => {
+describeIfSelected('QA skill quality evals', ['qa/SKILL.md workflow', 'qa/SKILL.md health rubric', 'qa/SKILL.md anti-refusal'], () => {
  const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');

  testIfSelected('qa/SKILL.md workflow', async () => {
@@ -342,6 +342,59 @@ ${section}`);
    expect(scores.completeness).toBeGreaterThanOrEqual(3);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);
+
+  testIfSelected('qa/SKILL.md anti-refusal', async () => {
+    // LLM-judge eval: shows the judge the diff-aware and Important Rules
+    // excerpts of qa/SKILL.md and requires it to conclude, with confidence >= 4,
+    // that the agent would still open the browser when the diff has no obvious
+    // UI surface.
+    // NOTE(review): the touchfile entry for this test also lists
+    // qa-only/SKILL.md, but only qa/SKILL.md content is judged here — confirm
+    // whether qa-only should get its own excerpt.
+    const t0 = Date.now();
+
+    // Returns the text from `startHeading` up to (not including) the next
+    // `endHeading`. indexOf() yields -1 for a missing heading and slice() treats
+    // -1 as an offset from the end, so an unguarded slice would silently hand the
+    // judge an empty or truncated excerpt. A missing start heading therefore
+    // throws; a missing end heading means the section runs to end of document.
+    const extractSection = (startHeading: string, endHeading: string): string => {
+      const start = qaContent.indexOf(startHeading);
+      if (start === -1) {
+        throw new Error(`qa/SKILL.md is missing the "${startHeading}" heading`);
+      }
+      // Search after the start heading so an earlier occurrence of the end
+      // marker cannot produce an empty slice.
+      const end = qaContent.indexOf(endHeading, start + startHeading.length);
+      return end === -1 ? qaContent.slice(start) : qaContent.slice(start, end);
+    };
+    const diffAwareSection = extractSection('### Diff-aware', '### Full');
+    const rulesSection = extractSection('## Important Rules', '## Framework-Specific');
+
+    const result = await callJudge<{ would_browse: boolean; fallback_behavior: string; confidence: number; reasoning: string }>(`You are evaluating whether a QA testing skill document would cause an AI agent to USE THE BROWSER or REFUSE to use the browser in a specific scenario.
+
+SCENARIO:
+A user runs /qa (a browser-based QA testing skill). The branch diff shows ONLY prompt template files and config file changes — no routes, views, controllers, components, or CSS were changed. The changes are "purely backend" with no obvious UI surface.
+
+QUESTION: Based on the document excerpts below, would the agent open the browser and test the application, or would it decline/refuse to browse and suggest running evals or unit tests instead?
+
+DOCUMENT EXCERPT 1 (Diff-aware mode instructions):
+${diffAwareSection}
+
+DOCUMENT EXCERPT 2 (Important Rules):
+${rulesSection}
+
+Respond with ONLY valid JSON:
+{
+  "would_browse": true or false,
+  "fallback_behavior": "description of what the agent would do when no UI pages are identified from the diff",
+  "confidence": N (1-5, how confident you are in your answer),
+  "reasoning": "brief explanation"
+}
+
+Rules:
+- would_browse should be true if the document instructs the agent to always use the browser regardless of diff content
+- would_browse should be false if the document allows the agent to skip browser testing for non-UI changes
+- confidence: 5 = document is unambiguous, 1 = document is unclear or contradictory`);
+
+    console.log('QA anti-refusal result:', JSON.stringify(result, null, 2));
+
+    // Record the outcome before asserting so the collector sees failures too.
+    evalCollector?.addTest({
+      name: 'qa/SKILL.md anti-refusal',
+      suite: 'QA skill quality evals',
+      tier: 'llm-judge',
+      passed: result.would_browse === true && result.confidence >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { would_browse: result.would_browse ? 1 : 0, confidence: result.confidence },
+      judge_reasoning: result.reasoning,
+    });
+
+    expect(result.would_browse).toBe(true);
+    expect(result.confidence).toBeGreaterThanOrEqual(4);
+  }, 30_000);
 });

 // --- Part 7: Cross-skill consistency judge (C7) ---
@@ -123,7 +123,8 @@ describe('selectTests', () => {
    const result = selectTests(['qa/SKILL.md'], LLM_JUDGE_TOUCHFILES);
    expect(result.selected).toContain('qa/SKILL.md workflow');
    expect(result.selected).toContain('qa/SKILL.md health rubric');
-    expect(result.selected.length).toBe(2);
+    expect(result.selected).toContain('qa/SKILL.md anti-refusal');
+    expect(result.selected.length).toBe(3);
  });

  test('SKILL.md.tmpl root template selects root-dependent tests and routing tests', () => {