From 723f9957f2caaf59f7d7fca3aed8cc5a0270ccb9 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Wed, 22 Apr 2026 00:27:28 -0700
Subject: [PATCH] test(opus-4.7): tighten ambiguous /qa routing prompt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

"does this feature work on mobile? can you check the deploy?" was too
vague — a reasonable agent asks "which feature?" via AskUserQuestion
instead of routing to /qa. That's not a routing miss, it's an
under-specified prompt.

Replaced with "I just pushed the login flow changes. Test the deployed
site and find any bugs." — concrete subject + clear QA verb.

Result: pos-does-it-work went from MISS to OK, routing TP rate 2/3 -> 3/3.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 test/skill-e2e-opus-47.test.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/skill-e2e-opus-47.test.ts b/test/skill-e2e-opus-47.test.ts
index a8fa4c4b..843ee430 100644
--- a/test/skill-e2e-opus-47.test.ts
+++ b/test/skill-e2e-opus-47.test.ts
@@ -99,7 +99,7 @@ const ROUTING_CASES: RoutingCase[] = [
   // Positive — should route
   { name: 'pos-wtf-bug', prompt: "wtf is this error coming from auth.ts:47 when the cookie expires?", shouldRoute: true, expectedSkill: 'investigate' },
   { name: 'pos-send-it', prompt: "ok this is good enough, let's send it.", shouldRoute: true, expectedSkill: 'ship' },
-  { name: 'pos-does-it-work', prompt: "does this feature work on mobile? can you check the deploy?", shouldRoute: true, expectedSkill: 'qa' },
+  { name: 'pos-does-it-work', prompt: "I just pushed the login flow changes. Test the deployed site and find any bugs.", shouldRoute: true, expectedSkill: 'qa' },
   // Negative — should NOT route
   { name: 'neg-syntax-q', prompt: "wtf does this Python list comprehension syntax even mean, [x for x in y if z]?", shouldRoute: false },
   { name: 'neg-algo-q', prompt: "does this bubble sort algorithm actually work in O(n log n)?", shouldRoute: false },