From e3384e325c2fb20861723bc37347d9f130f1f47f Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sun, 29 Mar 2026 15:51:41 -0700
Subject: [PATCH] fix: destructure callJudge return value in LLM eval tests

callJudge<T> returns { result: T, meta }, but three call sites were
reading the judge's fields directly off that wrapper object instead of
destructuring `result` first. Those fields were therefore undefined,
which caused "Expected and actual values must be numbers or bigints"
in all workflow judge tests (10 failures).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 test/skill-llm-eval.test.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index cc82ad2e..eb3de404 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -369,7 +369,7 @@ ${section}`);
     const diffAwareSection = qaContent.slice(diffAwareStart, diffAwareEnd);
     const rulesSection = qaContent.slice(rulesStart, rulesEnd);
 
-    const result = await callJudge<{ would_browse: boolean; fallback_behavior: string; confidence: number; reasoning: string }>(`You are evaluating whether a QA testing skill document would cause an AI agent to USE THE BROWSER or REFUSE to use the browser in a specific scenario.
+    const { result } = await callJudge<{ would_browse: boolean; fallback_behavior: string; confidence: number; reasoning: string }>(`You are evaluating whether a QA testing skill document would cause an AI agent to USE THE BROWSER or REFUSE to use the browser in a specific scenario.
 
 SCENARIO:
 A user runs /qa (a browser-based QA testing skill). The branch diff shows ONLY prompt template files and config file changes — no routes, views, controllers, components, or CSS were changed. The changes are "purely backend" with no obvious UI surface.
@@ -568,7 +568,7 @@ async function runWorkflowJudge(opts: {
     section = content.slice(startIdx);
   }
 
-  const scores = await callJudge<JudgeScore>(`You are evaluating the quality of ${opts.judgeContext} for an AI coding agent.
+  const { result: scores } = await callJudge<JudgeScore>(`You are evaluating the quality of ${opts.judgeContext} for an AI coding agent.
 
 The agent reads this document to learn ${opts.judgeGoal}. It references external tools and files
 that are documented separately — do NOT penalize for missing external definitions.
@@ -806,7 +806,7 @@ describeIfSelected('Voice directive eval', ['voice directive tone'], () => {
     const voiceEnd = content.indexOf('\n## ', voiceStart + 1);
     const voiceSection = content.slice(voiceStart, voiceEnd > 0 ? voiceEnd : voiceStart + 3000);
 
-    const result = await callJudge<{
+    const { result } = await callJudge<{
       directness: number;
       concreteness: number;
       avoids_corporate: number;