From 59752fc5101bec9622cc4277cb427dcd4bff05b9 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Sun, 15 Mar 2026 16:47:35 -0500
Subject: [PATCH] feat: wire eval-cache + eval-tier into LLM judge, pin E2E model

callJudge/judge now return {result, meta} with SHA-based caching (~$0.18/run
savings when SKILL.md unchanged) and dynamic model selection via
EVAL_JUDGE_TIER env var. E2E tests pass --model from EVAL_TIER to claude -p.
outcomeJudge retains simple return type. All 8 LLM eval test sites updated
with real costs and costs[].

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 TODOS.md                       |   4 +-
 test/helpers/llm-judge.test.ts | 117 +++++++++++++++++++++++++++++++++
 test/helpers/llm-judge.ts      |  59 ++++++++++++++---
 test/skill-llm-eval.test.ts    |  99 ++++++++++++++++------------
 4 files changed, 227 insertions(+), 52 deletions(-)
 create mode 100644 test/helpers/llm-judge.test.ts

diff --git a/TODOS.md b/TODOS.md
index 4916c236..b5ec8ac3 100644
--- a/TODOS.md
+++ b/TODOS.md
@@ -231,7 +231,7 @@
 
 **Why:** Spot quality trends — is the app getting better or worse?
 
-**Context:** QA already writes structured reports. This adds cross-run comparison.
+**Context:** `eval:trend` now tracks test-level pass rates (eval infrastructure). QA-run-level trending (health scores over time across QA report files) is a separate feature that could reuse `computeTrends` pattern from `lib/cli-eval.ts`.
 
 **Effort:** S
 **Priority:** P2
@@ -335,6 +335,8 @@
 
 **Why:** Reduce E2E test cost and flakiness.
 
+**Status:** Model pinning shipped (session-runner.ts passes `--model` from `EVAL_TIER` env). Retry:2 still TODO.
+
 **Effort:** XS
 **Priority:** P2
 
diff --git a/test/helpers/llm-judge.test.ts b/test/helpers/llm-judge.test.ts
new file mode 100644
index 00000000..03cf7788
--- /dev/null
+++ b/test/helpers/llm-judge.test.ts
@@ -0,0 +1,117 @@
+/**
+ * Tests for LLM judge cache + tier integration.
+ * Mocks Anthropic client to avoid API calls.
+ */
+
+import { describe, test, expect, beforeEach, afterEach, mock } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+let tmpCacheDir: string;
+const origEnv: Record<string, string | undefined> = {};
+
+beforeEach(() => {
+  tmpCacheDir = fs.mkdtempSync(path.join(os.tmpdir(), 'llm-judge-test-'));
+  // Point cache to temp dir and clear tier env vars
+  origEnv.GSTACK_STATE_DIR = process.env.GSTACK_STATE_DIR;
+  origEnv.EVAL_JUDGE_TIER = process.env.EVAL_JUDGE_TIER;
+  origEnv.EVAL_TIER = process.env.EVAL_TIER;
+  origEnv.EVAL_CACHE = process.env.EVAL_CACHE;
+  process.env.GSTACK_STATE_DIR = tmpCacheDir;
+  delete process.env.EVAL_JUDGE_TIER;
+  delete process.env.EVAL_TIER;
+  delete process.env.EVAL_CACHE;
+});
+
+afterEach(() => {
+  // Restore env
+  for (const [key, val] of Object.entries(origEnv)) {
+    if (val === undefined) delete process.env[key];
+    else process.env[key] = val;
+  }
+  try { fs.rmSync(tmpCacheDir, { recursive: true, force: true }); } catch {}
+});
+
+// Test cache key computation directly (doesn't need mock)
+describe('cache key computation', () => {
+  test('computeCacheKey produces consistent hashes for same input', async () => {
+    const { computeCacheKey } = await import('../../lib/eval-cache');
+    const key1 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
+    const key2 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
+    expect(key1).toBe(key2);
+    expect(key1).toHaveLength(16);
+  });
+
+  test('cache key differs when model changes', async () => {
+    const { computeCacheKey } = await import('../../lib/eval-cache');
+    const key1 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
+    const key2 = computeCacheKey([], 'claude-haiku-4-5:test prompt');
+    expect(key1).not.toBe(key2);
+  });
+
+  test('cache key differs when prompt changes', async () => {
+    const { computeCacheKey } = await import('../../lib/eval-cache');
+    const key1 = computeCacheKey([], 'claude-sonnet-4-6:prompt A');
+    const key2 = computeCacheKey([], 'claude-sonnet-4-6:prompt B');
+    expect(key1).not.toBe(key2);
+  });
+});
+
+// Test cache read/write directly
+describe('cache read/write for llm-judge suite', () => {
+  test('cacheRead returns null on miss', async () => {
+    const { cacheRead } = await import('../../lib/eval-cache');
+    expect(cacheRead('llm-judge', 'nonexistent')).toBeNull();
+  });
+
+  test('cacheWrite + cacheRead round-trip', async () => {
+    const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
+    const data = { clarity: 5, completeness: 4, actionability: 5, reasoning: 'test' };
+    cacheWrite('llm-judge', 'test-key', data, { model: 'claude-sonnet-4-6' });
+    const cached = cacheRead('llm-judge', 'test-key');
+    expect(cached).toEqual(data);
+  });
+
+  test('EVAL_CACHE=0 bypasses cache read', async () => {
+    const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
+    cacheWrite('llm-judge', 'bypass-key', { test: true });
+    process.env.EVAL_CACHE = '0';
+    expect(cacheRead('llm-judge', 'bypass-key')).toBeNull();
+  });
+});
+
+// Test tier resolution
+describe('tier resolution for judge', () => {
+  test('defaults to standard (sonnet) when no env set', async () => {
+    const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier');
+    expect(resolveJudgeTier()).toBe('standard');
+    expect(tierToModel(resolveJudgeTier())).toBe('claude-sonnet-4-6');
+  });
+
+  test('EVAL_JUDGE_TIER=haiku selects fast tier', async () => {
+    process.env.EVAL_JUDGE_TIER = 'haiku';
+    // Need fresh import to pick up env change
+    const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier');
+    expect(resolveJudgeTier()).toBe('fast');
+    expect(tierToModel(resolveJudgeTier())).toBe('claude-haiku-4-5');
+  });
+
+  test('EVAL_JUDGE_TIER=opus selects full tier', async () => {
+    process.env.EVAL_JUDGE_TIER = 'opus';
+    const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier');
+    expect(resolveJudgeTier()).toBe('full');
+    expect(tierToModel(resolveJudgeTier())).toBe('claude-opus-4-6');
+  });
+});
+
+// Test JudgeMeta shape
+describe('JudgeMeta interface', () => {
+  test('exported from llm-judge module', async () => {
+    const mod = await import('./llm-judge');
+    // Verify callJudge and judge are exported functions
+    expect(typeof mod.callJudge).toBe('function');
+    expect(typeof mod.judge).toBe('function');
+    expect(typeof mod.outcomeJudge).toBe('function');
+  });
+});
diff --git a/test/helpers/llm-judge.ts b/test/helpers/llm-judge.ts
index 7040cd6c..61d6927a 100644
--- a/test/helpers/llm-judge.ts
+++ b/test/helpers/llm-judge.ts
@@ -1,13 +1,19 @@
 /**
  * Shared LLM-as-judge helpers for eval and E2E tests.
  *
- * Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
- * and outcomeJudge (planted-bug detection scorer).
+ * Provides callJudge (generic JSON-from-LLM with cache + tier support),
+ * judge (doc quality scorer), and outcomeJudge (planted-bug detection scorer).
  *
- * Requires: ANTHROPIC_API_KEY env var
+ * Requires: ANTHROPIC_API_KEY env var (skipped on cache hit)
+ *
+ * Env vars:
+ *   EVAL_JUDGE_TIER — model tier for judge calls (fast/standard/full, default: standard)
+ *   EVAL_CACHE=0 — bypass cache, always re-run
  */
 
 import Anthropic from '@anthropic-ai/sdk';
+import { computeCacheKey, cacheRead, cacheWrite } from '../../lib/eval-cache';
+import { resolveJudgeTier, tierToModel } from '../../lib/eval-tier';
 
 export interface JudgeScore {
   clarity: number; // 1-5
@@ -25,15 +31,35 @@ export interface OutcomeJudgeResult {
   reasoning: string;
 }
 
+export interface JudgeMeta {
+  model: string;
+  input_tokens: number;
+  output_tokens: number;
+  cached: boolean;
+}
+
 /**
- * Call claude-sonnet-4-6 with a prompt, extract JSON response.
+ * Call the judge model with a prompt, extract JSON response.
+ * Uses eval-cache for SHA-based caching and eval-tier for model selection.
  * Retries once on 429 rate limit errors.
  */
-export async function callJudge<T>(prompt: string): Promise<T> {
+export async function callJudge<T>(prompt: string): Promise<{ result: T; meta: JudgeMeta }> {
+  const model = tierToModel(resolveJudgeTier());
+
+  // Check cache (keyed by model + prompt content)
+  const cacheKey = computeCacheKey([], `${model}:${prompt}`);
+  const cached = cacheRead('llm-judge', cacheKey);
+  if (cached !== null) {
+    return {
+      result: cached as T,
+      meta: { model, input_tokens: 0, output_tokens: 0, cached: true },
+    };
+  }
+
   const client = new Anthropic();
   const makeRequest = () => client.messages.create({
-    model: 'claude-sonnet-4-6',
+    model,
     max_tokens: 1024,
     messages: [{ role: 'user', content: prompt }],
   });
 
@@ -53,13 +79,25 @@
   const text = response.content[0].type === 'text' ? response.content[0].text : '';
   const jsonMatch = text.match(/\{[\s\S]*\}/);
   if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
-  return JSON.parse(jsonMatch[0]) as T;
+  const result = JSON.parse(jsonMatch[0]) as T;
+
+  // Write to cache
+  cacheWrite('llm-judge', cacheKey, result, { model });
+
+  const meta: JudgeMeta = {
+    model,
+    input_tokens: (response.usage as any)?.input_tokens || 0,
+    output_tokens: (response.usage as any)?.output_tokens || 0,
+    cached: false,
+  };
+
+  return { result, meta };
 }
 
 /**
  * Score documentation quality on clarity/completeness/actionability (1-5).
  */
-export async function judge(section: string, content: string): Promise<JudgeScore> {
+export async function judge(section: string, content: string): Promise<{ result: JudgeScore; meta: JudgeMeta }> {
   return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.
 
 The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
@@ -92,12 +130,14 @@
 ${content}`);
 }
 
 /**
  * Evaluate a QA report against planted-bug ground truth.
  * Returns detection metrics for the planted bugs.
+ * Note: outcomeJudge returns just the result (not meta) for backward compat
+ * with E2E test callers. Cache still works internally.
  */
 export async function outcomeJudge(
   groundTruth: any,
   report: string,
 ): Promise<OutcomeJudgeResult> {
-  return callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
+  const { result } = await callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
 
 GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
 ${JSON.stringify(groundTruth.bugs, null, 2)}
@@ -127,4 +167,5 @@
 Rules:
 - detection_rate = length of detected array
 - evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references? 5 = excellent evidence for every bug, 1 = no evidence at all`);
+  return result;
 }
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index ba635613..2889538c 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -7,16 +7,18 @@
  * Requires: ANTHROPIC_API_KEY env var (or EVALS=1 with key already set)
  * Run: EVALS=1 bun run test:eval
  *
- * Cost: ~$0.05-0.15 per run (sonnet)
+ * Cost: ~$0.05-0.15 per run (sonnet), $0 on cache hit
+ * Cache: SHA-based via eval-cache. Set EVAL_CACHE=0 to force re-run.
+ * Model: Set EVAL_JUDGE_TIER=haiku|sonnet|opus to override (default: sonnet).
  */
 
 import { describe, test, expect, afterAll } from 'bun:test';
-import Anthropic from '@anthropic-ai/sdk';
 import * as fs from 'fs';
 import * as path from 'path';
 import { callJudge, judge } from './helpers/llm-judge';
-import type { JudgeScore } from './helpers/llm-judge';
+import type { JudgeMeta } from './helpers/llm-judge';
 import { EvalCollector } from './helpers/eval-store';
+import { MODEL_PRICING } from '../lib/eval-cost';
 
 const ROOT = path.resolve(import.meta.dir, '..');
 // Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
 const describeEval = evalsEnabled ? describe : describe.skip;
 
 // Eval result collector
 const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null;
 
@@ -26,6 +28,22 @@
+/** Compute actual judge cost from meta (0 on cache hit). */
+function judgeCost(meta: JudgeMeta): number {
+  if (meta.cached) return 0;
+  const p = MODEL_PRICING[meta.model] || { input: 3.0, output: 15.0 };
+  return (meta.input_tokens / 1_000_000) * p.input + (meta.output_tokens / 1_000_000) * p.output;
+}
+
+/** Build CostEntry array from judge meta (empty on cache hit). */
+function judgeCosts(meta: JudgeMeta) {
+  if (meta.cached) return [];
+  return [{
+    model: meta.model, calls: 1,
+    input_tokens: meta.input_tokens, output_tokens: meta.output_tokens,
+  }];
+}
+
 describeEval('LLM-as-judge quality evals', () => {
   test('command reference table scores >= 4 on all dimensions', async () => {
     const t0 = Date.now();
@@ -34,8 +52,8 @@
     const end = content.indexOf('## Tips');
     const section = content.slice(start, end);
 
-    const scores = await judge('command reference table', section);
-    console.log('Command reference scores:', JSON.stringify(scores, null, 2));
+    const { result: scores, meta } = await judge('command reference table', section);
+    console.log('Command reference scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
 
     evalCollector?.addTest({
       name: 'command reference table',
@@ -43,9 +61,10 @@
       tier: 'llm-judge',
       passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
       duration_ms: Date.now() - t0,
-      cost_usd: 0.02,
+      cost_usd: judgeCost(meta),
       judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
       judge_reasoning: scores.reasoning,
+      costs: judgeCosts(meta),
     });
 
     expect(scores.clarity).toBeGreaterThanOrEqual(4);
@@ -60,8 +79,8 @@
     const end = content.indexOf('## Command Reference');
     const section = content.slice(start, end);
 
-    const scores = await judge('snapshot flags reference', section);
-    console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2));
+    const { result: scores, meta } = await judge('snapshot flags reference', section);
+    console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
 
     evalCollector?.addTest({
       name: 'snapshot flags reference',
@@ -69,9 +88,10 @@
       tier: 'llm-judge',
       passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
       duration_ms: Date.now() - t0,
-      cost_usd: 0.02,
+      cost_usd: judgeCost(meta),
       judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
       judge_reasoning: scores.reasoning,
+      costs: judgeCosts(meta),
     });
 
     expect(scores.clarity).toBeGreaterThanOrEqual(4);
@@ -85,8 +105,8 @@
     const start = content.indexOf('## Snapshot Flags');
     const section = content.slice(start);
 
-    const scores = await judge('browse skill reference (flags + commands)', section);
-    console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2));
+    const { result: scores, meta } = await judge('browse skill reference (flags + commands)', section);
+    console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2), meta.cached ?
'(cached)' : ''); evalCollector?.addTest({ name: 'browse/SKILL.md reference', @@ -94,9 +114,10 @@ describeEval('LLM-as-judge quality evals', () => { tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -111,8 +132,8 @@ describeEval('LLM-as-judge quality evals', () => { const setupEnd = content.indexOf('## IMPORTANT'); const section = content.slice(setupStart, setupEnd); - const scores = await judge('setup/binary discovery instructions', section); - console.log('Setup block scores:', JSON.stringify(scores, null, 2)); + const { result: scores, meta } = await judge('setup/binary discovery instructions', section); + console.log('Setup block scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'setup block', @@ -120,9 +141,10 @@ describeEval('LLM-as-judge quality evals', () => { tier: 'llm-judge', passed: scores.actionability >= 3 && scores.clarity >= 3, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); // Setup block is intentionally minimal (binary discovery only). @@ -171,13 +193,7 @@ describeEval('LLM-as-judge quality evals', () => { | \`is \` | State check (visible/hidden/enabled/disabled/checked/editable/focused) | | \`console [--clear\\|--errors]\` | Console messages (--errors filters to error/warning) |`; - const client = new Anthropic(); - const response = await client.messages.create({ - model: 'claude-sonnet-4-6', - max_tokens: 1024, - messages: [{ - role: 'user', - content: `You are comparing two versions of CLI documentation for an AI coding agent. + const { result, meta } = await callJudge<{ winner: string; reasoning: string; a_score: number; b_score: number }>(`You are comparing two versions of CLI documentation for an AI coding agent. VERSION A (baseline — hand-maintained): ${baseline} @@ -193,15 +209,9 @@ Which version is better for an AI agent trying to use these commands? Consider: Respond with ONLY valid JSON: {"winner": "A" or "B" or "tie", "reasoning": "brief explanation", "a_score": N, "b_score": N} -Scores are 1-5 overall quality.`, - }], - }); +Scores are 1-5 overall quality.`); - const text = response.content[0].type === 'text' ? response.content[0].text : ''; - const jsonMatch = text.match(/\{[\s\S]*\}/); - if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`); - const result = JSON.parse(jsonMatch[0]); - console.log('Regression comparison:', JSON.stringify(result, null, 2)); + console.log('Regression comparison:', JSON.stringify(result, null, 2), meta.cached ? 
'(cached)' : ''); evalCollector?.addTest({ name: 'regression vs baseline', @@ -209,9 +219,10 @@ Scores are 1-5 overall quality.`, tier: 'llm-judge', passed: result.b_score >= result.a_score, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { a_score: result.a_score, b_score: result.b_score }, judge_reasoning: result.reasoning, + costs: judgeCosts(meta), }); expect(result.b_score).toBeGreaterThanOrEqual(result.a_score); @@ -229,7 +240,7 @@ describeEval('QA skill quality evals', () => { const end = qaContent.indexOf('## Health Score Rubric'); const section = qaContent.slice(start, end); - const scores = await callJudge(`You are evaluating the quality of a QA testing workflow document for an AI coding agent. + const { result: scores, meta } = await callJudge<{ clarity: number; completeness: number; actionability: number; reasoning: string }>(`You are evaluating the quality of a QA testing workflow document for an AI coding agent. The agent reads this document to learn how to systematically QA test a web application. The workflow references a headless browser CLI ($B commands) that is documented separately — do NOT penalize for missing CLI definitions. @@ -246,7 +257,7 @@ Respond with ONLY valid JSON: Here is the QA workflow to evaluate: ${section}`); - console.log('QA workflow scores:', JSON.stringify(scores, null, 2)); + console.log('QA workflow scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'qa/SKILL.md workflow', @@ -254,9 +265,10 @@ ${section}`); tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -271,7 +283,7 @@ ${section}`); const start = qaContent.indexOf('## Health Score Rubric'); const section = qaContent.slice(start); - const scores = await callJudge(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score. + const { result: scores, meta } = await callJudge<{ clarity: number; completeness: number; actionability: number; reasoning: string }>(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score. The agent uses this rubric after QA testing a website. It needs to: 1. Understand each scoring category and what counts as a deduction @@ -289,7 +301,7 @@ Respond with ONLY valid JSON: Here is the rubric to evaluate: ${section}`); - console.log('QA health rubric scores:', JSON.stringify(scores, null, 2)); + console.log('QA health rubric scores:', JSON.stringify(scores, null, 2), meta.cached ? 
'(cached)' : ''); evalCollector?.addTest({ name: 'qa/SKILL.md health rubric', @@ -297,9 +309,10 @@ ${section}`); tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -332,7 +345,7 @@ describeEval('Cross-skill consistency evals', () => { extractGrepLines(retroContent, 'retro/SKILL.md'), ].join('\n\n'); - const result = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently. + const { result, meta } = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently. INTENDED ARCHITECTURE: - greptile-history has TWO paths: per-project (~/.gstack/projects/{slug}/greptile-history.md) and global (~/.gstack/greptile-history.md) @@ -355,7 +368,7 @@ Evaluate consistency. Respond with ONLY valid JSON: score (1-5): 5 = perfectly consistent, 1 = contradictory`); - console.log('Cross-skill consistency:', JSON.stringify(result, null, 2)); + console.log('Cross-skill consistency:', JSON.stringify(result, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'cross-skill greptile consistency', @@ -363,9 +376,10 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`); tier: 'llm-judge', passed: result.consistent && result.score >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { consistency_score: result.score }, judge_reasoning: result.reasoning, + costs: judgeCosts(meta), }); expect(result.consistent).toBe(true); @@ -392,7 +406,7 @@ describeEval('Baseline score pinning', () => { const cmdStart = skillContent.indexOf('## Command Reference'); const cmdEnd = skillContent.indexOf('## Tips'); const cmdSection = skillContent.slice(cmdStart, cmdEnd); - const cmdScores = await judge('command reference table', cmdSection); + const { result: cmdScores, meta } = await judge('command reference table', cmdSection); for (const dim of ['clarity', 'completeness', 'actionability'] as const) { if (cmdScores[dim] < baselines.command_reference[dim]) { @@ -417,9 +431,10 @@ describeEval('Baseline score pinning', () => { tier: 'llm-judge', passed, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: cmdScores.clarity, completeness: cmdScores.completeness, actionability: cmdScores.actionability }, judge_reasoning: passed ? 'All scores at or above baseline' : regressions.join('; '), + costs: judgeCosts(meta), }); if (!passed) {