Files
gstack/test/helpers/llm-judge.test.ts
Garry Tan 59752fc510 feat: wire eval-cache + eval-tier into LLM judge, pin E2E model
callJudge/judge now return {result, meta} with SHA-based caching
(~$0.18/run savings when SKILL.md unchanged) and dynamic model
selection via EVAL_JUDGE_TIER env var. E2E tests pass --model from
EVAL_TIER to claude -p. outcomeJudge retains simple return type.
All 8 LLM eval test sites updated with real costs and costs[].

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 16:47:35 -05:00

118 lines
4.5 KiB
TypeScript

/**
* Tests for LLM judge cache + tier integration.
* The judge functions are only checked for existence and never invoked, so no API calls are made.
*/
import { describe, test, expect, beforeEach, afterEach, mock } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// Per-test scratch directory that GSTACK_STATE_DIR is pointed at.
let tmpCacheDir: string;
// Snapshot of the env vars these tests touch, captured before every test.
const origEnv: Record<string, string | undefined> = {};

/**
 * Before each test: create a fresh temp directory, remember the current
 * values of the env vars the tests manipulate, clear them, and point
 * GSTACK_STATE_DIR at the temp directory.
 */
beforeEach(() => {
  tmpCacheDir = fs.mkdtempSync(path.join(os.tmpdir(), 'llm-judge-test-'));

  for (const name of ['GSTACK_STATE_DIR', 'EVAL_JUDGE_TIER', 'EVAL_TIER', 'EVAL_CACHE']) {
    origEnv[name] = process.env[name];
    delete process.env[name];
  }

  // Set after the loop so the state dir ends up pointing at the temp directory.
  process.env.GSTACK_STATE_DIR = tmpCacheDir;
});

/**
 * After each test: put every snapshotted env var back exactly as it was
 * (unset if it was unset) and remove the temp directory.
 */
afterEach(() => {
  Object.keys(origEnv).forEach((name) => {
    const saved = origEnv[name];
    if (saved === undefined) {
      delete process.env[name];
      return;
    }
    process.env[name] = saved;
  });

  try {
    fs.rmSync(tmpCacheDir, { recursive: true, force: true });
  } catch {
    // Cleanup is best-effort; a leftover temp directory must not fail the test.
  }
});
// Test cache key computation directly (doesn't need mock)
describe('cache key computation', () => {
  /**
   * Load eval-cache lazily (inside the running test) and hash the given
   * "model:prompt" payload with an empty first argument.
   * NOTE(review): the meaning of the first argument is not visible here —
   * confirm against eval-cache.
   */
  const hashOf = async (payload: string) => {
    const cacheLib = await import('../../lib/eval-cache');
    return cacheLib.computeCacheKey([], payload);
  };

  test('computeCacheKey produces consistent hashes for same input', async () => {
    const first = await hashOf('claude-sonnet-4-6:test prompt');
    const second = await hashOf('claude-sonnet-4-6:test prompt');

    expect(first).toBe(second);
    // The key is expected to be exactly 16 characters long.
    expect(first).toHaveLength(16);
  });

  test('cache key differs when model changes', async () => {
    const sonnetKey = await hashOf('claude-sonnet-4-6:test prompt');
    const haikuKey = await hashOf('claude-haiku-4-5:test prompt');

    expect(sonnetKey).not.toBe(haikuKey);
  });

  test('cache key differs when prompt changes', async () => {
    const keyForA = await hashOf('claude-sonnet-4-6:prompt A');
    const keyForB = await hashOf('claude-sonnet-4-6:prompt B');

    expect(keyForA).not.toBe(keyForB);
  });
});
// Test cache read/write directly
describe('cache read/write for llm-judge suite', () => {
  // A key that was never written must read back as null.
  test('cacheRead returns null on miss', async () => {
    const { cacheRead } = await import('../../lib/eval-cache');
    expect(cacheRead('llm-judge', 'nonexistent')).toBeNull();
  });

  // Whatever is written under a suite/key pair must be returned unchanged.
  test('cacheWrite + cacheRead round-trip', async () => {
    const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
    const data = { clarity: 5, completeness: 4, actionability: 5, reasoning: 'test' };
    cacheWrite('llm-judge', 'test-key', data, { model: 'claude-sonnet-4-6' });
    const cached = cacheRead('llm-judge', 'test-key');
    expect(cached).toEqual(data);
  });

  // With EVAL_CACHE=0 an entry that exists must still read as a miss.
  test('EVAL_CACHE=0 bypasses cache read', async () => {
    const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
    const entry = { test: true };
    cacheWrite('llm-judge', 'bypass-key', entry);

    // Precondition: the entry is readable while the cache is enabled.
    // Without this check the test would also pass if the write had silently
    // failed, since a missing entry reads as null regardless of EVAL_CACHE.
    expect(cacheRead('llm-judge', 'bypass-key')).toEqual(entry);

    process.env.EVAL_CACHE = '0';
    expect(cacheRead('llm-judge', 'bypass-key')).toBeNull();
  });
});
// Test tier resolution
describe('tier resolution for judge', () => {
  // Load eval-tier from inside the running test, after the env has been prepared.
  const loadTierLib = () => import('../../lib/eval-tier');

  test('defaults to standard (sonnet) when no env set', async () => {
    const tierLib = await loadTierLib();

    expect(tierLib.resolveJudgeTier()).toBe('standard');
    expect(tierLib.tierToModel(tierLib.resolveJudgeTier())).toBe('claude-sonnet-4-6');
  });

  test('EVAL_JUDGE_TIER=haiku selects fast tier', async () => {
    process.env.EVAL_JUDGE_TIER = 'haiku';
    // NOTE(review): import() returns the already-loaded module after the first
    // load, so this is not a fresh import. The test presumably relies on
    // resolveJudgeTier() reading process.env on every call — confirm against eval-tier.
    const tierLib = await loadTierLib();

    expect(tierLib.resolveJudgeTier()).toBe('fast');
    expect(tierLib.tierToModel(tierLib.resolveJudgeTier())).toBe('claude-haiku-4-5');
  });

  test('EVAL_JUDGE_TIER=opus selects full tier', async () => {
    process.env.EVAL_JUDGE_TIER = 'opus';
    const tierLib = await loadTierLib();

    expect(tierLib.resolveJudgeTier()).toBe('full');
    expect(tierLib.tierToModel(tierLib.resolveJudgeTier())).toBe('claude-opus-4-6');
  });
});
// Test llm-judge runtime exports (the JudgeMeta interface itself has no runtime presence to check)
describe('JudgeMeta interface', () => {
  test('exported from llm-judge module', async () => {
    const judgeModule = await import('./llm-judge');

    // Each judge entry point must be present on the module as a callable export.
    for (const exportName of ['callJudge', 'judge', 'outcomeJudge'] as const) {
      expect(typeof judgeModule[exportName]).toBe('function');
    }
  });
});