gstack/test/helpers/llm-judge.ts
Garry Tan 59752fc510 feat: wire eval-cache + eval-tier into LLM judge, pin E2E model
callJudge/judge now return {result, meta} with SHA-based caching
(~$0.18/run savings when SKILL.md unchanged) and dynamic model
selection via EVAL_JUDGE_TIER env var. E2E tests pass --model from
EVAL_TIER to claude -p. outcomeJudge retains simple return type.
All 8 LLM eval test sites updated with real costs and costs[].

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 16:47:35 -05:00

/**
 * Shared LLM-as-judge helpers for eval and E2E tests.
 *
 * Provides callJudge (generic JSON-from-LLM with cache + tier support),
 * judge (doc quality scorer), and outcomeJudge (planted-bug detection scorer).
 *
 * Requires: ANTHROPIC_API_KEY env var (not needed when the cache hits)
 *
 * Env vars:
 *   EVAL_JUDGE_TIER — model tier for judge calls (fast/standard/full, default: standard)
 *   EVAL_CACHE=0    — bypass the cache, always re-run judge calls
 */
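
/*
 * Example usage from a test (illustrative sketch only; `skillMd` and the
 * vitest-style `expect` assertion are assumptions, not part of this module):
 *
 *   const { result, meta } = await judge('SKILL.md CLI reference', skillMd);
 *   expect(result.actionability).toBeGreaterThanOrEqual(4);
 *   console.log(meta.cached ? 'cache hit (free)' : `judged with ${meta.model}`);
 */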
import Anthropic from '@anthropic-ai/sdk';
import { computeCacheKey, cacheRead, cacheWrite } from '../../lib/eval-cache';
import { resolveJudgeTier, tierToModel } from '../../lib/eval-tier';

export interface JudgeScore {
  clarity: number; // 1-5
  completeness: number; // 1-5
  actionability: number; // 1-5
  reasoning: string;
}

export interface OutcomeJudgeResult {
  detected: string[]; // ground-truth bug IDs the report identified
  missed: string[]; // ground-truth bug IDs the report did not identify
  false_positives: number;
  detection_rate: number; // = detected.length (see prompt rules)
  evidence_quality: number; // 1-5
  reasoning: string;
}

export interface JudgeMeta {
  model: string;
  input_tokens: number;
  output_tokens: number;
  cached: boolean; // true when the result came from eval-cache (no API call made)
}
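
/*
 * Sketch of how callers might fold JudgeMeta into a per-run costs[] entry,
 * skipping cache hits (the `CostEntry` shape and `costs` array are
 * illustrative assumptions; they are not exported by this module):
 *
 *   interface CostEntry { model: string; input_tokens: number; output_tokens: number }
 *   const costs: CostEntry[] = [];
 *   const track = (meta: JudgeMeta) => {
 *     if (!meta.cached) {
 *       costs.push({ model: meta.model, input_tokens: meta.input_tokens, output_tokens: meta.output_tokens });
 *     }
 *   };
 */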

/**
 * Call the judge model with a prompt, extract JSON response.
 * Uses eval-cache for SHA-based caching and eval-tier for model selection.
 * Retries once on 429 rate limit errors.
 */
export async function callJudge<T>(prompt: string): Promise<{ result: T; meta: JudgeMeta }> {
  const model = tierToModel(resolveJudgeTier());

  // Check cache (keyed by model + prompt content)
  const cacheKey = computeCacheKey([], `${model}:${prompt}`);
  const cached = cacheRead('llm-judge', cacheKey);
  if (cached !== null) {
    return {
      result: cached as T,
      meta: { model, input_tokens: 0, output_tokens: 0, cached: true },
    };
  }

  const client = new Anthropic();
  const makeRequest = () => client.messages.create({
    model,
    max_tokens: 1024,
    messages: [{ role: 'user', content: prompt }],
  });

  let response;
  try {
    response = await makeRequest();
  } catch (err: any) {
    if (err.status === 429) {
      await new Promise(r => setTimeout(r, 1000));
      response = await makeRequest();
    } else {
      throw err;
    }
  }

  const text = response.content[0].type === 'text' ? response.content[0].text : '';
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
  const result = JSON.parse(jsonMatch[0]) as T;

  // Write to cache
  cacheWrite('llm-judge', cacheKey, result, { model });

  const meta: JudgeMeta = {
    model,
    input_tokens: (response.usage as any)?.input_tokens || 0,
    output_tokens: (response.usage as any)?.output_tokens || 0,
    cached: false,
  };
  return { result, meta };
}
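
/*
 * Example of calling callJudge directly with a custom result shape
 * (the `PassFail` interface, the prompt text, and `output` are illustrative
 * assumptions, not used elsewhere in this repo):
 *
 *   interface PassFail { pass: boolean; reasoning: string }
 *   const { result, meta } = await callJudge<PassFail>(
 *     'Respond with ONLY valid JSON {"pass": true|false, "reasoning": "..."}. ' +
 *     'Does the following output mention a timeout? ' + output,
 *   );
 *   // result.pass is parsed from the first JSON object in the model's reply;
 *   // meta.cached is true when eval-cache already had an entry for this model+prompt.
 */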

/**
 * Score documentation quality on clarity/completeness/actionability (1-5).
 */
export async function judge(section: string, content: string): Promise<{ result: JudgeScore; meta: JudgeMeta }> {
  return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.

The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
1. Understand what each command does
2. Know what arguments to pass
3. Know valid values for enum-like parameters
4. Construct correct command invocations without guessing

Rate the following ${section} on three dimensions (1-5 scale):
- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?

Scoring guide:
- 5: Excellent — no ambiguity, all info present
- 4: Good — minor gaps an experienced agent could infer
- 3: Adequate — some guessing required
- 2: Poor — significant info missing
- 1: Unusable — agent would fail without external help

Respond with ONLY valid JSON in this exact format:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the ${section} to evaluate:
${content}`);
}

/**
 * Evaluate a QA report against planted-bug ground truth.
 * Returns detection metrics for the planted bugs.
 *
 * Note: outcomeJudge returns just the result (not meta) for backward compat
 * with E2E test callers. Cache still works internally.
 */
export async function outcomeJudge(
  groundTruth: any,
  report: string,
): Promise<OutcomeJudgeResult> {
  const { result } = await callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.

GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
${JSON.stringify(groundTruth.bugs, null, 2)}

QA REPORT (generated by an AI agent):
${report}

For each planted bug, determine if the report identified it. A bug counts as
"detected" if the report describes the same defect, even if the wording differs.
Use the detection_hint keywords as guidance.

Also count false positives: issues in the report that don't correspond to any
planted bug AND aren't legitimate issues with the page.

Respond with ONLY valid JSON:
{
  "detected": ["bug-id-1", "bug-id-2"],
  "missed": ["bug-id-3"],
  "false_positives": 0,
  "detection_rate": 2,
  "evidence_quality": 4,
  "reasoning": "brief explanation"
}

Rules:
- "detected" and "missed" arrays must only contain IDs from the ground truth: ${groundTruth.bugs.map((b: any) => b.id).join(', ')}
- detection_rate = length of detected array
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
  5 = excellent evidence for every bug, 1 = no evidence at all`);

  return result;
}
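
/*
 * Example usage (illustrative; the ground-truth object mirrors the fields the
 * prompt reads (total_bugs, bugs[].id, bugs[].detection_hint), but the literal
 * values and `reportMarkdown` are made-up assumptions):
 *
 *   const groundTruth = {
 *     total_bugs: 2,
 *     bugs: [
 *       { id: 'missing-alt-text', detection_hint: 'alt attribute' },
 *       { id: 'broken-submit', detection_hint: 'submit button' },
 *     ],
 *   };
 *   const outcome = await outcomeJudge(groundTruth, reportMarkdown);
 *   // outcome.detection_rate counts how many planted bugs the report caught.
 */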