mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-07 05:56:41 +02:00
feat: wire eval-cache + eval-tier into LLM judge, pin E2E model
callJudge/judge now return {result, meta} with SHA-based caching
(~$0.18/run savings when SKILL.md unchanged) and dynamic model
selection via EVAL_JUDGE_TIER env var. E2E tests pass --model from
EVAL_TIER to claude -p. outcomeJudge retains simple return type.
All 8 LLM eval test sites updated with real costs and costs[].
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,117 @@
|
||||
/**
|
||||
* Tests for LLM judge cache + tier integration.
|
||||
* Mocks Anthropic client to avoid API calls.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, beforeEach, afterEach, mock } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
let tmpCacheDir: string;
|
||||
const origEnv: Record<string, string | undefined> = {};
|
||||
|
||||
beforeEach(() => {
|
||||
tmpCacheDir = fs.mkdtempSync(path.join(os.tmpdir(), 'llm-judge-test-'));
|
||||
// Point cache to temp dir and clear tier env vars
|
||||
origEnv.GSTACK_STATE_DIR = process.env.GSTACK_STATE_DIR;
|
||||
origEnv.EVAL_JUDGE_TIER = process.env.EVAL_JUDGE_TIER;
|
||||
origEnv.EVAL_TIER = process.env.EVAL_TIER;
|
||||
origEnv.EVAL_CACHE = process.env.EVAL_CACHE;
|
||||
process.env.GSTACK_STATE_DIR = tmpCacheDir;
|
||||
delete process.env.EVAL_JUDGE_TIER;
|
||||
delete process.env.EVAL_TIER;
|
||||
delete process.env.EVAL_CACHE;
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
// Restore env
|
||||
for (const [key, val] of Object.entries(origEnv)) {
|
||||
if (val === undefined) delete process.env[key];
|
||||
else process.env[key] = val;
|
||||
}
|
||||
try { fs.rmSync(tmpCacheDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
// Test cache key computation directly (doesn't need mock)
|
||||
describe('cache key computation', () => {
|
||||
test('computeCacheKey produces consistent hashes for same input', async () => {
|
||||
const { computeCacheKey } = await import('../../lib/eval-cache');
|
||||
const key1 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
|
||||
const key2 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
|
||||
expect(key1).toBe(key2);
|
||||
expect(key1).toHaveLength(16);
|
||||
});
|
||||
|
||||
test('cache key differs when model changes', async () => {
|
||||
const { computeCacheKey } = await import('../../lib/eval-cache');
|
||||
const key1 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
|
||||
const key2 = computeCacheKey([], 'claude-haiku-4-5:test prompt');
|
||||
expect(key1).not.toBe(key2);
|
||||
});
|
||||
|
||||
test('cache key differs when prompt changes', async () => {
|
||||
const { computeCacheKey } = await import('../../lib/eval-cache');
|
||||
const key1 = computeCacheKey([], 'claude-sonnet-4-6:prompt A');
|
||||
const key2 = computeCacheKey([], 'claude-sonnet-4-6:prompt B');
|
||||
expect(key1).not.toBe(key2);
|
||||
});
|
||||
});
|
||||
|
||||
// Test cache read/write directly
|
||||
describe('cache read/write for llm-judge suite', () => {
|
||||
test('cacheRead returns null on miss', async () => {
|
||||
const { cacheRead } = await import('../../lib/eval-cache');
|
||||
expect(cacheRead('llm-judge', 'nonexistent')).toBeNull();
|
||||
});
|
||||
|
||||
test('cacheWrite + cacheRead round-trip', async () => {
|
||||
const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
|
||||
const data = { clarity: 5, completeness: 4, actionability: 5, reasoning: 'test' };
|
||||
cacheWrite('llm-judge', 'test-key', data, { model: 'claude-sonnet-4-6' });
|
||||
const cached = cacheRead('llm-judge', 'test-key');
|
||||
expect(cached).toEqual(data);
|
||||
});
|
||||
|
||||
test('EVAL_CACHE=0 bypasses cache read', async () => {
|
||||
const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
|
||||
cacheWrite('llm-judge', 'bypass-key', { test: true });
|
||||
process.env.EVAL_CACHE = '0';
|
||||
expect(cacheRead('llm-judge', 'bypass-key')).toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
// Test tier resolution
|
||||
describe('tier resolution for judge', () => {
|
||||
test('defaults to standard (sonnet) when no env set', async () => {
|
||||
const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier');
|
||||
expect(resolveJudgeTier()).toBe('standard');
|
||||
expect(tierToModel(resolveJudgeTier())).toBe('claude-sonnet-4-6');
|
||||
});
|
||||
|
||||
test('EVAL_JUDGE_TIER=haiku selects fast tier', async () => {
|
||||
process.env.EVAL_JUDGE_TIER = 'haiku';
|
||||
// Need fresh import to pick up env change
|
||||
const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier');
|
||||
expect(resolveJudgeTier()).toBe('fast');
|
||||
expect(tierToModel(resolveJudgeTier())).toBe('claude-haiku-4-5');
|
||||
});
|
||||
|
||||
test('EVAL_JUDGE_TIER=opus selects full tier', async () => {
|
||||
process.env.EVAL_JUDGE_TIER = 'opus';
|
||||
const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier');
|
||||
expect(resolveJudgeTier()).toBe('full');
|
||||
expect(tierToModel(resolveJudgeTier())).toBe('claude-opus-4-6');
|
||||
});
|
||||
});
|
||||
|
||||
// Test JudgeMeta shape
|
||||
describe('JudgeMeta interface', () => {
|
||||
test('exported from llm-judge module', async () => {
|
||||
const mod = await import('./llm-judge');
|
||||
// Verify callJudge and judge are exported functions
|
||||
expect(typeof mod.callJudge).toBe('function');
|
||||
expect(typeof mod.judge).toBe('function');
|
||||
expect(typeof mod.outcomeJudge).toBe('function');
|
||||
});
|
||||
});
|
||||
@@ -1,13 +1,19 @@
|
||||
/**
|
||||
* Shared LLM-as-judge helpers for eval and E2E tests.
|
||||
*
|
||||
* Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
|
||||
* and outcomeJudge (planted-bug detection scorer).
|
||||
* Provides callJudge (generic JSON-from-LLM with cache + tier support),
|
||||
* judge (doc quality scorer), and outcomeJudge (planted-bug detection scorer).
|
||||
*
|
||||
* Requires: ANTHROPIC_API_KEY env var
|
||||
* Requires: ANTHROPIC_API_KEY env var (skipped on cache hit)
|
||||
*
|
||||
* Env vars:
|
||||
* EVAL_JUDGE_TIER — model tier for judge calls (fast/standard/full, default: standard)
|
||||
* EVAL_CACHE=0 — bypass cache, always re-run
|
||||
*/
|
||||
|
||||
import Anthropic from '@anthropic-ai/sdk';
|
||||
import { computeCacheKey, cacheRead, cacheWrite } from '../../lib/eval-cache';
|
||||
import { resolveJudgeTier, tierToModel } from '../../lib/eval-tier';
|
||||
|
||||
export interface JudgeScore {
|
||||
clarity: number; // 1-5
|
||||
@@ -25,15 +31,35 @@ export interface OutcomeJudgeResult {
|
||||
reasoning: string;
|
||||
}
|
||||
|
||||
/**
 * Metadata for a single judge invocation: which model handled it,
 * token usage as reported by the API, and whether the result was
 * served from the eval cache (in which case token counts are 0).
 */
export interface JudgeMeta {
  model: string; // resolved judge model id, e.g. 'claude-sonnet-4-6'
  input_tokens: number; // API-reported input tokens; 0 on cache hit
  output_tokens: number; // API-reported output tokens; 0 on cache hit
  cached: boolean; // true when the result was read from the eval cache
}
|
||||
|
||||
/**
|
||||
* Call claude-sonnet-4-6 with a prompt, extract JSON response.
|
||||
* Call the judge model with a prompt, extract JSON response.
|
||||
* Uses eval-cache for SHA-based caching and eval-tier for model selection.
|
||||
* Retries once on 429 rate limit errors.
|
||||
*/
|
||||
export async function callJudge<T>(prompt: string): Promise<T> {
|
||||
export async function callJudge<T>(prompt: string): Promise<{ result: T; meta: JudgeMeta }> {
|
||||
const model = tierToModel(resolveJudgeTier());
|
||||
|
||||
// Check cache (keyed by model + prompt content)
|
||||
const cacheKey = computeCacheKey([], `${model}:${prompt}`);
|
||||
const cached = cacheRead('llm-judge', cacheKey);
|
||||
if (cached !== null) {
|
||||
return {
|
||||
result: cached as T,
|
||||
meta: { model, input_tokens: 0, output_tokens: 0, cached: true },
|
||||
};
|
||||
}
|
||||
|
||||
const client = new Anthropic();
|
||||
|
||||
const makeRequest = () => client.messages.create({
|
||||
model: 'claude-sonnet-4-6',
|
||||
model,
|
||||
max_tokens: 1024,
|
||||
messages: [{ role: 'user', content: prompt }],
|
||||
});
|
||||
@@ -53,13 +79,25 @@ export async function callJudge<T>(prompt: string): Promise<T> {
|
||||
const text = response.content[0].type === 'text' ? response.content[0].text : '';
|
||||
const jsonMatch = text.match(/\{[\s\S]*\}/);
|
||||
if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
|
||||
return JSON.parse(jsonMatch[0]) as T;
|
||||
const result = JSON.parse(jsonMatch[0]) as T;
|
||||
|
||||
// Write to cache
|
||||
cacheWrite('llm-judge', cacheKey, result, { model });
|
||||
|
||||
const meta: JudgeMeta = {
|
||||
model,
|
||||
input_tokens: (response.usage as any)?.input_tokens || 0,
|
||||
output_tokens: (response.usage as any)?.output_tokens || 0,
|
||||
cached: false,
|
||||
};
|
||||
|
||||
return { result, meta };
|
||||
}
|
||||
|
||||
/**
|
||||
* Score documentation quality on clarity/completeness/actionability (1-5).
|
||||
*/
|
||||
export async function judge(section: string, content: string): Promise<JudgeScore> {
|
||||
export async function judge(section: string, content: string): Promise<{ result: JudgeScore; meta: JudgeMeta }> {
|
||||
return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.
|
||||
|
||||
The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
|
||||
@@ -92,12 +130,14 @@ ${content}`);
|
||||
/**
|
||||
* Evaluate a QA report against planted-bug ground truth.
|
||||
* Returns detection metrics for the planted bugs.
|
||||
* Note: outcomeJudge returns just the result (not meta) for backward compat
|
||||
* with E2E test callers. Cache still works internally.
|
||||
*/
|
||||
export async function outcomeJudge(
|
||||
groundTruth: any,
|
||||
report: string,
|
||||
): Promise<OutcomeJudgeResult> {
|
||||
return callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
|
||||
const { result } = await callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
|
||||
|
||||
GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
|
||||
${JSON.stringify(groundTruth.bugs, null, 2)}
|
||||
@@ -127,4 +167,5 @@ Rules:
|
||||
- detection_rate = length of detected array
|
||||
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
|
||||
5 = excellent evidence for every bug, 1 = no evidence at all`);
|
||||
return result;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user