From 59752fc5101bec9622cc4277cb427dcd4bff05b9 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Sun, 15 Mar 2026 16:47:35 -0500
Subject: [PATCH] feat: wire eval-cache + eval-tier into LLM judge, pin E2E model

callJudge/judge now return {result, meta} with SHA-based caching (~$0.18/run
savings when SKILL.md unchanged) and dynamic model selection via
EVAL_JUDGE_TIER env var. E2E tests pass --model from EVAL_TIER to claude -p.
outcomeJudge retains simple return type. All 8 LLM eval test sites updated
with real costs and costs[].

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 TODOS.md                       |   4 +-
 test/helpers/llm-judge.test.ts | 117 +++++++++++++++++++++++++++++++++
 test/helpers/llm-judge.ts      |  59 ++++++++++++++---
 test/skill-llm-eval.test.ts    |  99 ++++++++++++++++------------
 4 files changed, 227 insertions(+), 52 deletions(-)
 create mode 100644 test/helpers/llm-judge.test.ts

diff --git a/TODOS.md b/TODOS.md
index 4916c236..b5ec8ac3 100644
--- a/TODOS.md
+++ b/TODOS.md
@@ -231,7 +231,7 @@
 
 **Why:** Spot quality trends — is the app getting better or worse?
 
-**Context:** QA already writes structured reports. This adds cross-run comparison.
+**Context:** `eval:trend` now tracks test-level pass rates (eval infrastructure). QA-run-level trending (health scores over time across QA report files) is a separate feature that could reuse `computeTrends` pattern from `lib/cli-eval.ts`.
 
 **Effort:** S
 **Priority:** P2
@@ -335,6 +335,8 @@
 
 **Why:** Reduce E2E test cost and flakiness.
 
+**Status:** Model pinning shipped (session-runner.ts passes `--model` from `EVAL_TIER` env). Retry:2 still TODO.
+
 **Effort:** XS
 **Priority:** P2
 
diff --git a/test/helpers/llm-judge.test.ts b/test/helpers/llm-judge.test.ts
new file mode 100644
index 00000000..03cf7788
--- /dev/null
+++ b/test/helpers/llm-judge.test.ts
@@ -0,0 +1,117 @@
+/**
+ * Tests for LLM judge cache + tier integration.
+ * Mocks Anthropic client to avoid API calls.
+ */
+
+import { describe, test, expect, beforeEach, afterEach, mock } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+let tmpCacheDir: string;
+const origEnv: Record<string, string | undefined> = {};
+
+beforeEach(() => {
+  tmpCacheDir = fs.mkdtempSync(path.join(os.tmpdir(), 'llm-judge-test-'));
+  // Point cache to temp dir and clear tier env vars
+  origEnv.GSTACK_STATE_DIR = process.env.GSTACK_STATE_DIR;
+  origEnv.EVAL_JUDGE_TIER = process.env.EVAL_JUDGE_TIER;
+  origEnv.EVAL_TIER = process.env.EVAL_TIER;
+  origEnv.EVAL_CACHE = process.env.EVAL_CACHE;
+  process.env.GSTACK_STATE_DIR = tmpCacheDir;
+  delete process.env.EVAL_JUDGE_TIER;
+  delete process.env.EVAL_TIER;
+  delete process.env.EVAL_CACHE;
+});
+
+afterEach(() => {
+  // Restore env
+  for (const [key, val] of Object.entries(origEnv)) {
+    if (val === undefined) delete process.env[key];
+    else process.env[key] = val;
+  }
+  try { fs.rmSync(tmpCacheDir, { recursive: true, force: true }); } catch {}
+});
+
+// Test cache key computation directly (doesn't need mock)
+describe('cache key computation', () => {
+  test('computeCacheKey produces consistent hashes for same input', async () => {
+    const { computeCacheKey } = await import('../../lib/eval-cache');
+    const key1 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
+    const key2 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
+    expect(key1).toBe(key2);
+    expect(key1).toHaveLength(16);
+  });
+
+  test('cache key differs when model changes', async () => {
+    const { computeCacheKey } = await import('../../lib/eval-cache');
+    const key1 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
+    const key2 = computeCacheKey([], 'claude-haiku-4-5:test prompt');
+    expect(key1).not.toBe(key2);
+  });
+
+  test('cache key differs when prompt changes', async () => {
+    const { computeCacheKey } = await import('../../lib/eval-cache');
+    const key1 = computeCacheKey([], 'claude-sonnet-4-6:prompt A');
+    const key2 = computeCacheKey([], 'claude-sonnet-4-6:prompt B');
+    expect(key1).not.toBe(key2);
+  });
+});
+
+// Test cache read/write directly
+describe('cache read/write for llm-judge suite', () => {
+  test('cacheRead returns null on miss', async () => {
+    const { cacheRead } = await import('../../lib/eval-cache');
+    expect(cacheRead('llm-judge', 'nonexistent')).toBeNull();
+  });
+
+  test('cacheWrite + cacheRead round-trip', async () => {
+    const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
+    const data = { clarity: 5, completeness: 4, actionability: 5, reasoning: 'test' };
+    cacheWrite('llm-judge', 'test-key', data, { model: 'claude-sonnet-4-6' });
+    const cached = cacheRead('llm-judge', 'test-key');
+    expect(cached).toEqual(data);
+  });
+
+  test('EVAL_CACHE=0 bypasses cache read', async () => {
+    const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
+    cacheWrite('llm-judge', 'bypass-key', { test: true });
+    process.env.EVAL_CACHE = '0';
+    expect(cacheRead('llm-judge', 'bypass-key')).toBeNull();
+  });
+});
+
+// Test tier resolution
+describe('tier resolution for judge', () => {
+  test('defaults to standard (sonnet) when no env set', async () => {
+    const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier');
+    expect(resolveJudgeTier()).toBe('standard');
+    expect(tierToModel(resolveJudgeTier())).toBe('claude-sonnet-4-6');
+  });
+
+  test('EVAL_JUDGE_TIER=haiku selects fast tier', async () => {
+    process.env.EVAL_JUDGE_TIER = 'haiku';
+    // Need fresh import to pick up env change
+    const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier');
+    expect(resolveJudgeTier()).toBe('fast');
+    expect(tierToModel(resolveJudgeTier())).toBe('claude-haiku-4-5');
+  });
+
+  test('EVAL_JUDGE_TIER=opus selects full tier', async () => {
+    process.env.EVAL_JUDGE_TIER = 'opus';
+    const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier');
+    expect(resolveJudgeTier()).toBe('full');
+    expect(tierToModel(resolveJudgeTier())).toBe('claude-opus-4-6');
+  });
+});
+
+// Test JudgeMeta shape
+describe('JudgeMeta interface', () => {
+  test('exported from llm-judge module', async () => {
+    const mod = await import('./llm-judge');
+    // Verify callJudge and judge are exported functions
+    expect(typeof mod.callJudge).toBe('function');
+    expect(typeof mod.judge).toBe('function');
+    expect(typeof mod.outcomeJudge).toBe('function');
+  });
+});
diff --git a/test/helpers/llm-judge.ts b/test/helpers/llm-judge.ts
index 7040cd6c..61d6927a 100644
--- a/test/helpers/llm-judge.ts
+++ b/test/helpers/llm-judge.ts
@@ -1,13 +1,19 @@
 /**
  * Shared LLM-as-judge helpers for eval and E2E tests.
  *
- * Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
- * and outcomeJudge (planted-bug detection scorer).
+ * Provides callJudge (generic JSON-from-LLM with cache + tier support),
+ * judge (doc quality scorer), and outcomeJudge (planted-bug detection scorer).
  *
- * Requires: ANTHROPIC_API_KEY env var
+ * Requires: ANTHROPIC_API_KEY env var (skipped on cache hit)
+ *
+ * Env vars:
+ *   EVAL_JUDGE_TIER — model tier for judge calls (fast/standard/full, default: standard)
+ *   EVAL_CACHE=0 — bypass cache, always re-run
  */
 
 import Anthropic from '@anthropic-ai/sdk';
+import { computeCacheKey, cacheRead, cacheWrite } from '../../lib/eval-cache';
+import { resolveJudgeTier, tierToModel } from '../../lib/eval-tier';
 
 export interface JudgeScore {
   clarity: number; // 1-5
@@ -25,15 +31,35 @@ export interface OutcomeJudgeResult {
   reasoning: string;
 }
 
+export interface JudgeMeta {
+  model: string;
+  input_tokens: number;
+  output_tokens: number;
+  cached: boolean;
+}
+
 /**
- * Call claude-sonnet-4-6 with a prompt, extract JSON response.
+ * Call the judge model with a prompt, extract JSON response.
+ * Uses eval-cache for SHA-based caching and eval-tier for model selection.
  * Retries once on 429 rate limit errors.
  */
-export async function callJudge<T>(prompt: string): Promise<T> {
+export async function callJudge<T>(prompt: string): Promise<{ result: T; meta: JudgeMeta }> {
+  const model = tierToModel(resolveJudgeTier());
+
+  // Check cache (keyed by model + prompt content)
+  const cacheKey = computeCacheKey([], `${model}:${prompt}`);
+  const cached = cacheRead('llm-judge', cacheKey);
+  if (cached !== null) {
+    return {
+      result: cached as T,
+      meta: { model, input_tokens: 0, output_tokens: 0, cached: true },
+    };
+  }
+
   const client = new Anthropic();
   const makeRequest = () => client.messages.create({
-    model: 'claude-sonnet-4-6',
+    model,
     max_tokens: 1024,
     messages: [{ role: 'user', content: prompt }],
   });
 
@@ -53,13 +79,25 @@
   const text = response.content[0].type === 'text' ? response.content[0].text : '';
   const jsonMatch = text.match(/\{[\s\S]*\}/);
   if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
-  return JSON.parse(jsonMatch[0]) as T;
+  const result = JSON.parse(jsonMatch[0]) as T;
+
+  // Write to cache
+  cacheWrite('llm-judge', cacheKey, result, { model });
+
+  const meta: JudgeMeta = {
+    model,
+    input_tokens: (response.usage as any)?.input_tokens || 0,
+    output_tokens: (response.usage as any)?.output_tokens || 0,
+    cached: false,
+  };
+
+  return { result, meta };
 }
 
 /**
  * Score documentation quality on clarity/completeness/actionability (1-5).
  */
-export async function judge(section: string, content: string): Promise<JudgeScore> {
+export async function judge(section: string, content: string): Promise<{ result: JudgeScore; meta: JudgeMeta }> {
   return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.
 
 The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
@@ -92,12 +130,14 @@
 ${content}`);
 }
 
 /**
  * Evaluate a QA report against planted-bug ground truth.
  * Returns detection metrics for the planted bugs.
+ * Note: outcomeJudge returns just the result (not meta) for backward compat
+ * with E2E test callers. Cache still works internally.
  */
 export async function outcomeJudge(
   groundTruth: any,
   report: string,
 ): Promise<OutcomeJudgeResult> {
-  return callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
+  const { result } = await callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
 
 GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
 ${JSON.stringify(groundTruth.bugs, null, 2)}
@@ -127,4 +167,5 @@
 Rules:
 - detection_rate = length of detected array
 - evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references? 5 = excellent evidence for every bug, 1 = no evidence at all`);
+  return result;
 }
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index ba635613..2889538c 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -7,16 +7,18 @@
  * Requires: ANTHROPIC_API_KEY env var (or EVALS=1 with key already set)
  * Run: EVALS=1 bun run test:eval
  *
- * Cost: ~$0.05-0.15 per run (sonnet)
+ * Cost: ~$0.05-0.15 per run (sonnet), $0 on cache hit
+ * Cache: SHA-based via eval-cache. Set EVAL_CACHE=0 to force re-run.
+ * Model: Set EVAL_JUDGE_TIER=haiku|sonnet|opus to override (default: sonnet).
  */
 
 import { describe, test, expect, afterAll } from 'bun:test';
-import Anthropic from '@anthropic-ai/sdk';
 import * as fs from 'fs';
 import * as path from 'path';
 import { callJudge, judge } from './helpers/llm-judge';
-import type { JudgeScore } from './helpers/llm-judge';
+import type { JudgeMeta } from './helpers/llm-judge';
 import { EvalCollector } from './helpers/eval-store';
+import { MODEL_PRICING } from '../lib/eval-cost';
 
 const ROOT = path.resolve(import.meta.dir, '..');
 // Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
 const describeEval = evalsEnabled ? describe : describe.skip;
 
 // Eval result collector
 const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null;
 
@@ -26,6 +28,22 @@
+/** Compute actual judge cost from meta (0 on cache hit). */
+function judgeCost(meta: JudgeMeta): number {
+  if (meta.cached) return 0;
+  const p = MODEL_PRICING[meta.model] || { input: 3.0, output: 15.0 };
+  return (meta.input_tokens / 1_000_000) * p.input + (meta.output_tokens / 1_000_000) * p.output;
+}
+
+/** Build CostEntry array from judge meta (empty on cache hit). */
+function judgeCosts(meta: JudgeMeta) {
+  if (meta.cached) return [];
+  return [{
+    model: meta.model, calls: 1,
+    input_tokens: meta.input_tokens, output_tokens: meta.output_tokens,
+  }];
+}
+
 describeEval('LLM-as-judge quality evals', () => {
   test('command reference table scores >= 4 on all dimensions', async () => {
     const t0 = Date.now();
@@ -34,8 +52,8 @@
     const end = content.indexOf('## Tips');
     const section = content.slice(start, end);
 
-    const scores = await judge('command reference table', section);
-    console.log('Command reference scores:', JSON.stringify(scores, null, 2));
+    const { result: scores, meta } = await judge('command reference table', section);
+    console.log('Command reference scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
 
     evalCollector?.addTest({
       name: 'command reference table',
@@ -43,9 +61,10 @@
       tier: 'llm-judge',
       passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
       duration_ms: Date.now() - t0,
-      cost_usd: 0.02,
+      cost_usd: judgeCost(meta),
       judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
       judge_reasoning: scores.reasoning,
+      costs: judgeCosts(meta),
     });
 
     expect(scores.clarity).toBeGreaterThanOrEqual(4);
@@ -60,8 +79,8 @@
     const end = content.indexOf('## Command Reference');
     const section = content.slice(start, end);
 
-    const scores = await judge('snapshot flags reference', section);
-    console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2));
+    const { result: scores, meta } = await judge('snapshot flags reference', section);
+    console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
 
     evalCollector?.addTest({
       name: 'snapshot flags reference',
@@ -69,9 +88,10 @@
       tier: 'llm-judge',
       passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
       duration_ms: Date.now() - t0,
-      cost_usd: 0.02,
+      cost_usd: judgeCost(meta),
       judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
       judge_reasoning: scores.reasoning,
+      costs: judgeCosts(meta),
     });
 
     expect(scores.clarity).toBeGreaterThanOrEqual(4);
@@ -85,8 +105,8 @@
     const start = content.indexOf('## Snapshot Flags');
     const section = content.slice(start);
 
-    const scores = await judge('browse skill reference (flags + commands)', section);
-    console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2));
+    const { result: scores, meta } = await judge('browse skill reference (flags + commands)', section);
+    console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2), meta.cached ?
'(cached)' : ''); evalCollector?.addTest({ name: 'browse/SKILL.md reference', @@ -94,9 +114,10 @@ describeEval('LLM-as-judge quality evals', () => { tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -111,8 +132,8 @@ describeEval('LLM-as-judge quality evals', () => { const setupEnd = content.indexOf('## IMPORTANT'); const section = content.slice(setupStart, setupEnd); - const scores = await judge('setup/binary discovery instructions', section); - console.log('Setup block scores:', JSON.stringify(scores, null, 2)); + const { result: scores, meta } = await judge('setup/binary discovery instructions', section); + console.log('Setup block scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'setup block', @@ -120,9 +141,10 @@ describeEval('LLM-as-judge quality evals', () => { tier: 'llm-judge', passed: scores.actionability >= 3 && scores.clarity >= 3, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); // Setup block is intentionally minimal (binary discovery only). @@ -171,13 +193,7 @@ describeEval('LLM-as-judge quality evals', () => { | \`is \` | State check (visible/hidden/enabled/disabled/checked/editable/focused) | | \`console [--clear\\|--errors]\` | Console messages (--errors filters to error/warning) |`; - const client = new Anthropic(); - const response = await client.messages.create({ - model: 'claude-sonnet-4-6', - max_tokens: 1024, - messages: [{ - role: 'user', - content: `You are comparing two versions of CLI documentation for an AI coding agent. + const { result, meta } = await callJudge<{ winner: string; reasoning: string; a_score: number; b_score: number }>(`You are comparing two versions of CLI documentation for an AI coding agent. VERSION A (baseline — hand-maintained): ${baseline} @@ -193,15 +209,9 @@ Which version is better for an AI agent trying to use these commands? Consider: Respond with ONLY valid JSON: {"winner": "A" or "B" or "tie", "reasoning": "brief explanation", "a_score": N, "b_score": N} -Scores are 1-5 overall quality.`, - }], - }); +Scores are 1-5 overall quality.`); - const text = response.content[0].type === 'text' ? response.content[0].text : ''; - const jsonMatch = text.match(/\{[\s\S]*\}/); - if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`); - const result = JSON.parse(jsonMatch[0]); - console.log('Regression comparison:', JSON.stringify(result, null, 2)); + console.log('Regression comparison:', JSON.stringify(result, null, 2), meta.cached ? 
'(cached)' : ''); evalCollector?.addTest({ name: 'regression vs baseline', @@ -209,9 +219,10 @@ Scores are 1-5 overall quality.`, tier: 'llm-judge', passed: result.b_score >= result.a_score, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { a_score: result.a_score, b_score: result.b_score }, judge_reasoning: result.reasoning, + costs: judgeCosts(meta), }); expect(result.b_score).toBeGreaterThanOrEqual(result.a_score); @@ -229,7 +240,7 @@ describeEval('QA skill quality evals', () => { const end = qaContent.indexOf('## Health Score Rubric'); const section = qaContent.slice(start, end); - const scores = await callJudge(`You are evaluating the quality of a QA testing workflow document for an AI coding agent. + const { result: scores, meta } = await callJudge<{ clarity: number; completeness: number; actionability: number; reasoning: string }>(`You are evaluating the quality of a QA testing workflow document for an AI coding agent. The agent reads this document to learn how to systematically QA test a web application. The workflow references a headless browser CLI ($B commands) that is documented separately — do NOT penalize for missing CLI definitions. @@ -246,7 +257,7 @@ Respond with ONLY valid JSON: Here is the QA workflow to evaluate: ${section}`); - console.log('QA workflow scores:', JSON.stringify(scores, null, 2)); + console.log('QA workflow scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'qa/SKILL.md workflow', @@ -254,9 +265,10 @@ ${section}`); tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -271,7 +283,7 @@ ${section}`); const start = qaContent.indexOf('## Health Score Rubric'); const section = qaContent.slice(start); - const scores = await callJudge(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score. + const { result: scores, meta } = await callJudge<{ clarity: number; completeness: number; actionability: number; reasoning: string }>(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score. The agent uses this rubric after QA testing a website. It needs to: 1. Understand each scoring category and what counts as a deduction @@ -289,7 +301,7 @@ Respond with ONLY valid JSON: Here is the rubric to evaluate: ${section}`); - console.log('QA health rubric scores:', JSON.stringify(scores, null, 2)); + console.log('QA health rubric scores:', JSON.stringify(scores, null, 2), meta.cached ? 
'(cached)' : ''); evalCollector?.addTest({ name: 'qa/SKILL.md health rubric', @@ -297,9 +309,10 @@ ${section}`); tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -332,7 +345,7 @@ describeEval('Cross-skill consistency evals', () => { extractGrepLines(retroContent, 'retro/SKILL.md'), ].join('\n\n'); - const result = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently. + const { result, meta } = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently. INTENDED ARCHITECTURE: - greptile-history has TWO paths: per-project (~/.gstack/projects/{slug}/greptile-history.md) and global (~/.gstack/greptile-history.md) @@ -355,7 +368,7 @@ Evaluate consistency. Respond with ONLY valid JSON: score (1-5): 5 = perfectly consistent, 1 = contradictory`); - console.log('Cross-skill consistency:', JSON.stringify(result, null, 2)); + console.log('Cross-skill consistency:', JSON.stringify(result, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'cross-skill greptile consistency', @@ -363,9 +376,10 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`); tier: 'llm-judge', passed: result.consistent && result.score >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { consistency_score: result.score }, judge_reasoning: result.reasoning, + costs: judgeCosts(meta), }); expect(result.consistent).toBe(true); @@ -392,7 +406,7 @@ describeEval('Baseline score pinning', () => { const cmdStart = skillContent.indexOf('## Command Reference'); const cmdEnd = skillContent.indexOf('## Tips'); const cmdSection = skillContent.slice(cmdStart, cmdEnd); - const cmdScores = await judge('command reference table', cmdSection); + const { result: cmdScores, meta } = await judge('command reference table', cmdSection); for (const dim of ['clarity', 'completeness', 'actionability'] as const) { if (cmdScores[dim] < baselines.command_reference[dim]) { @@ -417,9 +431,10 @@ describeEval('Baseline score pinning', () => { tier: 'llm-judge', passed, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: cmdScores.clarity, completeness: cmdScores.completeness, actionability: cmdScores.actionability }, judge_reasoning: passed ? 'All scores at or above baseline' : regressions.join('; '), + costs: judgeCosts(meta), }); if (!passed) {