mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-07 05:56:41 +02:00
59752fc510
callJudge/judge now return {result, meta} with SHA-based caching
(~$0.18/run savings when SKILL.md unchanged) and dynamic model
selection via EVAL_JUDGE_TIER env var. E2E tests pass --model from
EVAL_TIER to claude -p. outcomeJudge retains simple return type.
All 8 LLM eval test sites updated with real costs and costs[].
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
118 lines
4.5 KiB
TypeScript
118 lines
4.5 KiB
TypeScript
/**
 * Tests for LLM judge cache + tier integration.
 * Mocks Anthropic client to avoid API calls.
 */
|
|
|
|
import { describe, test, expect, beforeEach, afterEach, mock } from 'bun:test';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
|
|
let tmpCacheDir: string;
|
|
const origEnv: Record<string, string | undefined> = {};
|
|
|
|
/**
 * Per-test setup: creates a fresh temp directory, snapshots the env vars
 * this suite touches into `origEnv`, then points GSTACK_STATE_DIR at the
 * temp directory and unsets the tier/cache variables so each test starts
 * from a clean environment.
 */
beforeEach(() => {
  const tmpPrefix = path.join(os.tmpdir(), 'llm-judge-test-');
  tmpCacheDir = fs.mkdtempSync(tmpPrefix);

  // Remember the current values first so afterEach can put them back.
  for (const name of ['GSTACK_STATE_DIR', 'EVAL_JUDGE_TIER', 'EVAL_TIER', 'EVAL_CACHE']) {
    origEnv[name] = process.env[name];
  }

  // Redirect state (and with it the cache) into the throwaway directory.
  process.env.GSTACK_STATE_DIR = tmpCacheDir;

  // Clear tier/cache overrides inherited from the outer environment.
  for (const name of ['EVAL_JUDGE_TIER', 'EVAL_TIER', 'EVAL_CACHE']) {
    delete process.env[name];
  }
});
|
|
|
|
/**
 * Per-test teardown: restores every env var recorded in `origEnv`
 * (unsetting those that were originally undefined) and removes the
 * temp directory created for the test.
 */
afterEach(() => {
  Object.keys(origEnv).forEach((name) => {
    const saved = origEnv[name];
    if (saved !== undefined) {
      process.env[name] = saved;
    } else {
      // The variable did not exist before the test; make sure it is gone again.
      delete process.env[name];
    }
  });

  try {
    fs.rmSync(tmpCacheDir, { recursive: true, force: true });
  } catch {
    // Best-effort cleanup: a failure to remove the temp dir must not fail the test.
  }
});
|
|
|
|
// Test cache key computation directly (doesn't need mock)
|
|
describe('cache key computation', () => {
  // Loads computeCacheKey lazily, as each test did with its own dynamic import.
  const loadComputeCacheKey = async () => {
    const cacheModule = await import('../../lib/eval-cache');
    return cacheModule.computeCacheKey;
  };

  // Same input twice must hash to the same 16-character key.
  test('computeCacheKey produces consistent hashes for same input', async () => {
    const computeCacheKey = await loadComputeCacheKey();
    const first = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
    const second = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
    expect(first).toBe(second);
    expect(first).toHaveLength(16);
  });

  // Only the model prefix of the input differs between the two keys.
  test('cache key differs when model changes', async () => {
    const computeCacheKey = await loadComputeCacheKey();
    const sonnetKey = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
    const haikuKey = computeCacheKey([], 'claude-haiku-4-5:test prompt');
    expect(sonnetKey).not.toBe(haikuKey);
  });

  // Only the prompt suffix of the input differs between the two keys.
  test('cache key differs when prompt changes', async () => {
    const computeCacheKey = await loadComputeCacheKey();
    const keyForA = computeCacheKey([], 'claude-sonnet-4-6:prompt A');
    const keyForB = computeCacheKey([], 'claude-sonnet-4-6:prompt B');
    expect(keyForA).not.toBe(keyForB);
  });
});
|
|
|
|
// Test cache read/write directly
|
|
describe('cache read/write for llm-judge suite', () => {
  // Lazily loads the cache module, mirroring the per-test dynamic import.
  const loadCache = () => import('../../lib/eval-cache');

  // Reading a key that was never written yields null.
  test('cacheRead returns null on miss', async () => {
    const cache = await loadCache();
    const missing = cache.cacheRead('llm-judge', 'nonexistent');
    expect(missing).toBeNull();
  });

  // A written payload is returned unchanged by a subsequent read.
  test('cacheWrite + cacheRead round-trip', async () => {
    const cache = await loadCache();
    const payload = { clarity: 5, completeness: 4, actionability: 5, reasoning: 'test' };
    cache.cacheWrite('llm-judge', 'test-key', payload, { model: 'claude-sonnet-4-6' });
    expect(cache.cacheRead('llm-judge', 'test-key')).toEqual(payload);
  });

  // The entry is written while caching is enabled; the read happens after
  // EVAL_CACHE is set to '0' and is expected to come back null.
  test('EVAL_CACHE=0 bypasses cache read', async () => {
    const cache = await loadCache();
    cache.cacheWrite('llm-judge', 'bypass-key', { test: true });
    process.env.EVAL_CACHE = '0';
    const afterDisable = cache.cacheRead('llm-judge', 'bypass-key');
    expect(afterDisable).toBeNull();
  });
});
|
|
|
|
// Test tier resolution
|
|
describe('tier resolution for judge', () => {
  // Lazily loads the tier module, mirroring the per-test dynamic import.
  const loadTierApi = () => import('../../lib/eval-tier');

  // beforeEach has removed EVAL_JUDGE_TIER and EVAL_TIER, so this exercises the default.
  test('defaults to standard (sonnet) when no env set', async () => {
    const tierApi = await loadTierApi();
    expect(tierApi.resolveJudgeTier()).toBe('standard');
    expect(tierApi.tierToModel(tierApi.resolveJudgeTier())).toBe('claude-sonnet-4-6');
  });

  test('EVAL_JUDGE_TIER=haiku selects fast tier', async () => {
    process.env.EVAL_JUDGE_TIER = 'haiku';
    // The env var is set before the import. NOTE(review): dynamic import()
    // results are normally cached per module, so this presumably relies on
    // resolveJudgeTier reading process.env at call time — confirm.
    const tierApi = await loadTierApi();
    expect(tierApi.resolveJudgeTier()).toBe('fast');
    expect(tierApi.tierToModel(tierApi.resolveJudgeTier())).toBe('claude-haiku-4-5');
  });

  test('EVAL_JUDGE_TIER=opus selects full tier', async () => {
    process.env.EVAL_JUDGE_TIER = 'opus';
    const tierApi = await loadTierApi();
    expect(tierApi.resolveJudgeTier()).toBe('full');
    expect(tierApi.tierToModel(tierApi.resolveJudgeTier())).toBe('claude-opus-4-6');
  });
});
|
|
|
|
// Test JudgeMeta shape
|
|
describe('JudgeMeta interface', () => {
  // Smoke check on the module's runtime surface: the three judge entry
  // points must be exported as functions. (An interface has no runtime
  // presence, so only the functions can be inspected here.)
  test('exported from llm-judge module', async () => {
    const llmJudge = await import('./llm-judge');
    for (const exportName of ['callJudge', 'judge', 'outcomeJudge'] as const) {
      expect(typeof llmJudge[exportName]).toBe('function');
    }
  });
});
|