mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-07 05:56:41 +02:00
59752fc510
callJudge/judge now return {result, meta} with SHA-based caching
(~$0.18/run savings when SKILL.md unchanged) and dynamic model
selection via EVAL_JUDGE_TIER env var. E2E tests pass --model from
EVAL_TIER to claude -p. outcomeJudge retains simple return type.
All 8 LLM eval test sites updated with real costs and costs[].
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
172 lines
5.6 KiB
TypeScript
172 lines
5.6 KiB
TypeScript
/**
 * Shared LLM-as-judge helpers for eval and E2E tests.
 *
 * Provides callJudge (generic JSON-from-LLM with cache + tier support),
 * judge (doc quality scorer), and outcomeJudge (planted-bug detection scorer).
 *
 * Requires: ANTHROPIC_API_KEY env var (not needed on cache hit)
 *
 * Env vars:
 *   EVAL_JUDGE_TIER — model tier for judge calls (fast/standard/full, default: standard)
 *   EVAL_CACHE=0 — bypass cache, always re-run
 */
|
|
|
|
import Anthropic from '@anthropic-ai/sdk';
|
|
import { computeCacheKey, cacheRead, cacheWrite } from '../../lib/eval-cache';
|
|
import { resolveJudgeTier, tierToModel } from '../../lib/eval-tier';
|
|
|
|
/** Rubric scores produced by judge() for one documentation section. */
export interface JudgeScore {
  clarity: number; // 1-5
  completeness: number; // 1-5
  actionability: number; // 1-5
  reasoning: string; // brief free-text justification from the judge model
}
|
|
|
|
/** Detection metrics produced by outcomeJudge() against planted-bug ground truth. */
export interface OutcomeJudgeResult {
  detected: string[]; // ground-truth bug IDs the report identified
  missed: string[]; // ground-truth bug IDs the report failed to identify
  false_positives: number; // reported issues matching no planted bug (and not otherwise legitimate)
  detection_rate: number; // length of the detected array (per the prompt's rules)
  evidence_quality: number; // 1-5: screenshots / repro steps / element references
  reasoning: string; // brief free-text justification from the judge model
}
|
|
|
|
/** Per-call metadata returned alongside every callJudge() result. */
export interface JudgeMeta {
  model: string; // resolved model ID for this call
  input_tokens: number; // 0 on cache hit
  output_tokens: number; // 0 on cache hit
  cached: boolean; // true when the result came from eval-cache (no API call made)
}
|
|
|
|
/**
|
|
* Call the judge model with a prompt, extract JSON response.
|
|
* Uses eval-cache for SHA-based caching and eval-tier for model selection.
|
|
* Retries once on 429 rate limit errors.
|
|
*/
|
|
export async function callJudge<T>(prompt: string): Promise<{ result: T; meta: JudgeMeta }> {
|
|
const model = tierToModel(resolveJudgeTier());
|
|
|
|
// Check cache (keyed by model + prompt content)
|
|
const cacheKey = computeCacheKey([], `${model}:${prompt}`);
|
|
const cached = cacheRead('llm-judge', cacheKey);
|
|
if (cached !== null) {
|
|
return {
|
|
result: cached as T,
|
|
meta: { model, input_tokens: 0, output_tokens: 0, cached: true },
|
|
};
|
|
}
|
|
|
|
const client = new Anthropic();
|
|
|
|
const makeRequest = () => client.messages.create({
|
|
model,
|
|
max_tokens: 1024,
|
|
messages: [{ role: 'user', content: prompt }],
|
|
});
|
|
|
|
let response;
|
|
try {
|
|
response = await makeRequest();
|
|
} catch (err: any) {
|
|
if (err.status === 429) {
|
|
await new Promise(r => setTimeout(r, 1000));
|
|
response = await makeRequest();
|
|
} else {
|
|
throw err;
|
|
}
|
|
}
|
|
|
|
const text = response.content[0].type === 'text' ? response.content[0].text : '';
|
|
const jsonMatch = text.match(/\{[\s\S]*\}/);
|
|
if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
|
|
const result = JSON.parse(jsonMatch[0]) as T;
|
|
|
|
// Write to cache
|
|
cacheWrite('llm-judge', cacheKey, result, { model });
|
|
|
|
const meta: JudgeMeta = {
|
|
model,
|
|
input_tokens: (response.usage as any)?.input_tokens || 0,
|
|
output_tokens: (response.usage as any)?.output_tokens || 0,
|
|
cached: false,
|
|
};
|
|
|
|
return { result, meta };
|
|
}
|
|
|
|
/**
|
|
* Score documentation quality on clarity/completeness/actionability (1-5).
|
|
*/
|
|
export async function judge(section: string, content: string): Promise<{ result: JudgeScore; meta: JudgeMeta }> {
|
|
return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.
|
|
|
|
The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
|
|
1. Understand what each command does
|
|
2. Know what arguments to pass
|
|
3. Know valid values for enum-like parameters
|
|
4. Construct correct command invocations without guessing
|
|
|
|
Rate the following ${section} on three dimensions (1-5 scale):
|
|
|
|
- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
|
|
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
|
|
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?
|
|
|
|
Scoring guide:
|
|
- 5: Excellent — no ambiguity, all info present
|
|
- 4: Good — minor gaps an experienced agent could infer
|
|
- 3: Adequate — some guessing required
|
|
- 2: Poor — significant info missing
|
|
- 1: Unusable — agent would fail without external help
|
|
|
|
Respond with ONLY valid JSON in this exact format:
|
|
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}
|
|
|
|
Here is the ${section} to evaluate:
|
|
|
|
${content}`);
|
|
}
|
|
|
|
/**
|
|
* Evaluate a QA report against planted-bug ground truth.
|
|
* Returns detection metrics for the planted bugs.
|
|
* Note: outcomeJudge returns just the result (not meta) for backward compat
|
|
* with E2E test callers. Cache still works internally.
|
|
*/
|
|
export async function outcomeJudge(
|
|
groundTruth: any,
|
|
report: string,
|
|
): Promise<OutcomeJudgeResult> {
|
|
const { result } = await callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
|
|
|
|
GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
|
|
${JSON.stringify(groundTruth.bugs, null, 2)}
|
|
|
|
QA REPORT (generated by an AI agent):
|
|
${report}
|
|
|
|
For each planted bug, determine if the report identified it. A bug counts as
|
|
"detected" if the report describes the same defect, even if the wording differs.
|
|
Use the detection_hint keywords as guidance.
|
|
|
|
Also count false positives: issues in the report that don't correspond to any
|
|
planted bug AND aren't legitimate issues with the page.
|
|
|
|
Respond with ONLY valid JSON:
|
|
{
|
|
"detected": ["bug-id-1", "bug-id-2"],
|
|
"missed": ["bug-id-3"],
|
|
"false_positives": 0,
|
|
"detection_rate": 2,
|
|
"evidence_quality": 4,
|
|
"reasoning": "brief explanation"
|
|
}
|
|
|
|
Rules:
|
|
- "detected" and "missed" arrays must only contain IDs from the ground truth: ${groundTruth.bugs.map((b: any) => b.id).join(', ')}
|
|
- detection_rate = length of detected array
|
|
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
|
|
5 = excellent evidence for every bug, 1 = no evidence at all`);
|
|
return result;
|
|
}
|