Merge remote-tracking branch 'origin/garrytan/team-supabase-store' into garrytan/dev-mode

Garry Tan
2026-03-15 17:29:37 -05:00
20 changed files with 818 additions and 62 deletions
+68
@@ -128,6 +128,74 @@ describe('EvalCollector', () => {
expect(data.tests).toHaveLength(0);
expect(data.tier).toBe('llm-judge');
});
test('finalize aggregates per-test costs into result-level costs[]', async () => {
const collector = new EvalCollector('e2e', tmpDir);
collector.addTest(makeEntry({
name: 'test-a',
costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 100, output_tokens: 50, cost_usd: 0.01 }],
}));
collector.addTest(makeEntry({
name: 'test-b',
costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 200, output_tokens: 100, cost_usd: 0.02 }],
}));
collector.addTest(makeEntry({
name: 'test-c',
costs: [{ model: 'claude-haiku-4-5', calls: 1, input_tokens: 50, output_tokens: 25, cost_usd: 0.005 }],
}));
const filepath = await collector.finalize();
const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
expect(data.costs).toBeDefined();
expect(data.costs).toHaveLength(2); // two models
const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6');
const haiku = data.costs!.find(c => c.model === 'claude-haiku-4-5');
expect(sonnet).toBeDefined();
expect(sonnet!.calls).toBe(2);
expect(sonnet!.input_tokens).toBe(300);
expect(sonnet!.output_tokens).toBe(150);
expect(sonnet!.cost_usd).toBeCloseTo(0.03);
expect(haiku).toBeDefined();
expect(haiku!.calls).toBe(1);
expect(haiku!.cost_usd).toBeCloseTo(0.005);
});
test('finalize omits costs when no tests have cost data', async () => {
const collector = new EvalCollector('e2e', tmpDir);
collector.addTest(makeEntry({ name: 'no-costs' }));
const filepath = await collector.finalize();
const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
expect(data.costs).toBeUndefined();
});
test('finalize aggregates cache token fields', async () => {
const collector = new EvalCollector('e2e', tmpDir);
collector.addTest(makeEntry({
name: 'test-a',
costs: [{
model: 'claude-sonnet-4-6', calls: 1,
input_tokens: 10, output_tokens: 50,
cache_read_input_tokens: 5000, cache_creation_input_tokens: 1000,
cost_usd: 0.01,
}],
}));
collector.addTest(makeEntry({
name: 'test-b',
costs: [{
model: 'claude-sonnet-4-6', calls: 1,
input_tokens: 20, output_tokens: 100,
cache_read_input_tokens: 8000, cache_creation_input_tokens: 500,
cost_usd: 0.02,
}],
}));
const filepath = await collector.finalize();
const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6')!;
expect(sonnet.cache_read_input_tokens).toBe(13000);
expect(sonnet.cache_creation_input_tokens).toBe(1500);
});
});
// --- extractToolSummary tests ---
+25
@@ -13,6 +13,7 @@ import * as path from 'path';
import * as os from 'os';
import { spawnSync } from 'child_process';
import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '../../lib/util';
import type { CostEntry } from '../../lib/eval-format';
const SCHEMA_VERSION = 1;
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
@@ -50,6 +51,9 @@ export interface EvalTestEntry {
detected_bugs?: string[];
missed_bugs?: string[];
// Per-model cost breakdown
costs?: CostEntry[];
error?: string;
}
@@ -67,6 +71,7 @@ export interface EvalResult {
total_cost_usd: number;
total_duration_ms: number;
tests: EvalTestEntry[];
costs?: CostEntry[]; // aggregate per-model cost breakdown
_partial?: boolean; // true for incremental saves, absent in final
}
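For illustration, the aggregate block that the collector test earlier in this commit expects would serialize in the finalized eval JSON roughly as (values mirror that test's fixtures):
costs: [
  { model: 'claude-sonnet-4-6', calls: 2, input_tokens: 300, output_tokens: 150, cost_usd: 0.03 },
  { model: 'claude-haiku-4-5', calls: 1, input_tokens: 50, output_tokens: 25, cost_usd: 0.005 },
]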
@@ -414,6 +419,25 @@ export class EvalCollector {
const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
const passed = this.tests.filter(t => t.passed).length;
// Aggregate per-model costs across all tests
const costMap = new Map<string, CostEntry>();
for (const t of this.tests) {
for (const c of t.costs || []) {
const existing = costMap.get(c.model);
if (existing) {
existing.calls += c.calls;
existing.input_tokens += c.input_tokens;
existing.output_tokens += c.output_tokens;
existing.cache_read_input_tokens = (existing.cache_read_input_tokens || 0) + (c.cache_read_input_tokens || 0);
existing.cache_creation_input_tokens = (existing.cache_creation_input_tokens || 0) + (c.cache_creation_input_tokens || 0);
if (c.cost_usd !== undefined) existing.cost_usd = (existing.cost_usd || 0) + c.cost_usd;
} else {
costMap.set(c.model, { ...c });
}
}
}
const costs = costMap.size > 0 ? [...costMap.values()] : undefined;
const result: EvalResult = {
schema_version: SCHEMA_VERSION,
version,
@@ -428,6 +452,7 @@ export class EvalCollector {
total_cost_usd: Math.round(totalCost * 100) / 100,
total_duration_ms: totalDuration,
tests: this.tests,
costs,
};
// Write eval file
+117
@@ -0,0 +1,117 @@
/**
* Tests for LLM judge cache + tier integration.
* Mocks Anthropic client to avoid API calls.
*/
import { describe, test, expect, beforeEach, afterEach, mock } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
let tmpCacheDir: string;
const origEnv: Record<string, string | undefined> = {};
beforeEach(() => {
tmpCacheDir = fs.mkdtempSync(path.join(os.tmpdir(), 'llm-judge-test-'));
// Point cache to temp dir and clear tier env vars
origEnv.GSTACK_STATE_DIR = process.env.GSTACK_STATE_DIR;
origEnv.EVAL_JUDGE_TIER = process.env.EVAL_JUDGE_TIER;
origEnv.EVAL_TIER = process.env.EVAL_TIER;
origEnv.EVAL_CACHE = process.env.EVAL_CACHE;
process.env.GSTACK_STATE_DIR = tmpCacheDir;
delete process.env.EVAL_JUDGE_TIER;
delete process.env.EVAL_TIER;
delete process.env.EVAL_CACHE;
});
afterEach(() => {
// Restore env
for (const [key, val] of Object.entries(origEnv)) {
if (val === undefined) delete process.env[key];
else process.env[key] = val;
}
try { fs.rmSync(tmpCacheDir, { recursive: true, force: true }); } catch {}
});
// Test cache key computation directly (doesn't need mock)
describe('cache key computation', () => {
test('computeCacheKey produces consistent hashes for same input', async () => {
const { computeCacheKey } = await import('../../lib/eval-cache');
const key1 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
const key2 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
expect(key1).toBe(key2);
expect(key1).toHaveLength(16);
});
test('cache key differs when model changes', async () => {
const { computeCacheKey } = await import('../../lib/eval-cache');
const key1 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
const key2 = computeCacheKey([], 'claude-haiku-4-5:test prompt');
expect(key1).not.toBe(key2);
});
test('cache key differs when prompt changes', async () => {
const { computeCacheKey } = await import('../../lib/eval-cache');
const key1 = computeCacheKey([], 'claude-sonnet-4-6:prompt A');
const key2 = computeCacheKey([], 'claude-sonnet-4-6:prompt B');
expect(key1).not.toBe(key2);
});
});
// Test cache read/write directly
describe('cache read/write for llm-judge suite', () => {
test('cacheRead returns null on miss', async () => {
const { cacheRead } = await import('../../lib/eval-cache');
expect(cacheRead('llm-judge', 'nonexistent')).toBeNull();
});
test('cacheWrite + cacheRead round-trip', async () => {
const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
const data = { clarity: 5, completeness: 4, actionability: 5, reasoning: 'test' };
cacheWrite('llm-judge', 'test-key', data, { model: 'claude-sonnet-4-6' });
const cached = cacheRead('llm-judge', 'test-key');
expect(cached).toEqual(data);
});
test('EVAL_CACHE=0 bypasses cache read', async () => {
const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
cacheWrite('llm-judge', 'bypass-key', { test: true });
process.env.EVAL_CACHE = '0';
expect(cacheRead('llm-judge', 'bypass-key')).toBeNull();
});
});
// Test tier resolution
describe('tier resolution for judge', () => {
test('defaults to standard (sonnet) when no env set', async () => {
const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier');
expect(resolveJudgeTier()).toBe('standard');
expect(tierToModel(resolveJudgeTier())).toBe('claude-sonnet-4-6');
});
test('EVAL_JUDGE_TIER=haiku selects fast tier', async () => {
process.env.EVAL_JUDGE_TIER = 'haiku';
// Need fresh import to pick up env change
const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier');
expect(resolveJudgeTier()).toBe('fast');
expect(tierToModel(resolveJudgeTier())).toBe('claude-haiku-4-5');
});
test('EVAL_JUDGE_TIER=opus selects full tier', async () => {
process.env.EVAL_JUDGE_TIER = 'opus';
const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier');
expect(resolveJudgeTier()).toBe('full');
expect(tierToModel(resolveJudgeTier())).toBe('claude-opus-4-6');
});
});
// Test JudgeMeta shape
describe('JudgeMeta interface', () => {
test('exported from llm-judge module', async () => {
const mod = await import('./llm-judge');
// Verify callJudge and judge are exported functions
expect(typeof mod.callJudge).toBe('function');
expect(typeof mod.judge).toBe('function');
expect(typeof mod.outcomeJudge).toBe('function');
});
});
+50 -9
@@ -1,13 +1,19 @@
/**
* Shared LLM-as-judge helpers for eval and E2E tests.
*
* Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
* and outcomeJudge (planted-bug detection scorer).
* Provides callJudge (generic JSON-from-LLM with cache + tier support),
* judge (doc quality scorer), and outcomeJudge (planted-bug detection scorer).
*
* Requires: ANTHROPIC_API_KEY env var
* Requires: ANTHROPIC_API_KEY env var (only needed on a cache miss)
*
* Env vars:
* EVAL_JUDGE_TIER — model tier for judge calls (fast/standard/full, default: standard)
* EVAL_CACHE=0 — bypass cache, always re-run
*/
import Anthropic from '@anthropic-ai/sdk';
import { computeCacheKey, cacheRead, cacheWrite } from '../../lib/eval-cache';
import { resolveJudgeTier, tierToModel } from '../../lib/eval-tier';
export interface JudgeScore {
clarity: number; // 1-5
@@ -25,15 +31,35 @@ export interface OutcomeJudgeResult {
reasoning: string;
}
export interface JudgeMeta {
model: string;
input_tokens: number;
output_tokens: number;
cached: boolean;
}
/**
* Call claude-sonnet-4-6 with a prompt, extract JSON response.
* Call the judge model with a prompt, extract JSON response.
* Uses eval-cache for SHA-based caching and eval-tier for model selection.
* Retries once on 429 rate limit errors.
*/
export async function callJudge<T>(prompt: string): Promise<T> {
export async function callJudge<T>(prompt: string): Promise<{ result: T; meta: JudgeMeta }> {
const model = tierToModel(resolveJudgeTier());
// Check cache (keyed by model + prompt content)
const cacheKey = computeCacheKey([], `${model}:${prompt}`);
const cached = cacheRead('llm-judge', cacheKey);
if (cached !== null) {
return {
result: cached as T,
meta: { model, input_tokens: 0, output_tokens: 0, cached: true },
};
}
const client = new Anthropic();
const makeRequest = () => client.messages.create({
model: 'claude-sonnet-4-6',
model,
max_tokens: 1024,
messages: [{ role: 'user', content: prompt }],
});
@@ -53,13 +79,25 @@ export async function callJudge<T>(prompt: string): Promise<T> {
const text = response.content[0].type === 'text' ? response.content[0].text : '';
const jsonMatch = text.match(/\{[\s\S]*\}/);
if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
return JSON.parse(jsonMatch[0]) as T;
const result = JSON.parse(jsonMatch[0]) as T;
// Write to cache
cacheWrite('llm-judge', cacheKey, result, { model });
const meta: JudgeMeta = {
model,
input_tokens: (response.usage as any)?.input_tokens || 0,
output_tokens: (response.usage as any)?.output_tokens || 0,
cached: false,
};
return { result, meta };
}
/**
* Score documentation quality on clarity/completeness/actionability (1-5).
*/
export async function judge(section: string, content: string): Promise<JudgeScore> {
export async function judge(section: string, content: string): Promise<{ result: JudgeScore; meta: JudgeMeta }> {
return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.
The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
@@ -92,12 +130,14 @@ ${content}`);
/**
* Evaluate a QA report against planted-bug ground truth.
* Returns detection metrics for the planted bugs.
* Note: outcomeJudge returns just the result (not meta) for backward compat
* with E2E test callers. Cache still works internally.
*/
export async function outcomeJudge(
groundTruth: any,
report: string,
): Promise<OutcomeJudgeResult> {
return callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
const { result } = await callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
${JSON.stringify(groundTruth.bugs, null, 2)}
@@ -127,4 +167,5 @@ Rules:
- detection_rate = length of detected array
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
5 = excellent evidence for every bug, 1 = no evidence at all`);
return result;
}
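A minimal caller sketch against the new callJudge/judge signature (the import path is an assumption; the >= 4 threshold mirrors the eval tests later in this commit, and the cache/tier behaviour follows the doc comment above):
import { judge } from './llm-judge';
async function scoreSection(name: string, content: string): Promise<boolean> {
  const { result: scores, meta } = await judge(name, content);
  // meta.cached === true means the answer came from eval-cache: no API call, zero tokens, $0 cost.
  if (!meta.cached) {
    console.log(`judge ${meta.model}: ${meta.input_tokens} in / ${meta.output_tokens} out`);
  }
  return scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4;
}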
+32
@@ -93,4 +93,36 @@ describe('parseNDJSON', () => {
expect(parsed.turnCount).toBe(2);
expect(parsed.toolCalls).toHaveLength(0);
});
test('resultLine preserves modelUsage for cost extraction', () => {
const lines = [
'{"type":"assistant","message":{"model":"claude-sonnet-4-6","content":[{"type":"text","text":"ok"}]}}',
JSON.stringify({
type: 'result', subtype: 'success', total_cost_usd: 0.07,
num_turns: 1, result: 'Done.',
usage: { input_tokens: 8, output_tokens: 802 },
modelUsage: {
'claude-sonnet-4-6': {
inputTokens: 8, outputTokens: 802,
cacheReadInputTokens: 88133, cacheCreationInputTokens: 9223,
costUSD: 0.07308,
},
},
}),
];
const parsed = parseNDJSON(lines);
expect(parsed.resultLine).not.toBeNull();
expect(parsed.resultLine.modelUsage).toBeDefined();
const usage = parsed.resultLine.modelUsage['claude-sonnet-4-6'];
expect(usage.inputTokens).toBe(8);
expect(usage.outputTokens).toBe(802);
expect(usage.cacheReadInputTokens).toBe(88133);
expect(usage.costUSD).toBeCloseTo(0.07308);
});
test('resultLine without modelUsage has undefined modelUsage', () => {
const parsed = parseNDJSON(FIXTURE_LINES);
// Original fixture has no modelUsage on result line
expect(parsed.resultLine?.modelUsage).toBeUndefined();
});
});
+23 -1
@@ -10,6 +10,8 @@ import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util';
import type { CostEntry } from '../../lib/eval-format';
import { resolveTier, tierToModel } from '../../lib/eval-tier';
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
@@ -34,6 +36,7 @@ export interface SkillTestResult {
output: string;
costEstimate: CostEstimate;
transcript: any[];
costs: CostEntry[];
}
const BROWSE_ERROR_PATTERNS = [
@@ -135,8 +138,11 @@ export async function runSkillTest(options: {
// Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
// avoid shell escaping issues. --verbose is required for stream-json mode.
// Model pinned via EVAL_TIER env var (default: sonnet).
const evalModel = tierToModel(resolveTier());
const args = [
'-p',
'--model', evalModel,
'--output-format', 'stream-json',
'--verbose',
'--dangerously-skip-permissions',
@@ -323,5 +329,21 @@ export async function runSkillTest(options: {
turnsUsed,
};
return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript };
// Extract per-model costs from resultLine.modelUsage (camelCase → snake_case)
const costs: CostEntry[] = [];
if (resultLine?.modelUsage) {
for (const [model, usage] of Object.entries(resultLine.modelUsage as Record<string, any>)) {
costs.push({
model,
calls: 1,
input_tokens: usage.inputTokens || 0,
output_tokens: usage.outputTokens || 0,
cache_read_input_tokens: usage.cacheReadInputTokens || 0,
cache_creation_input_tokens: usage.cacheCreationInputTokens || 0,
cost_usd: usage.costUSD,
});
}
}
return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, costs };
}
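With the modelUsage fixture from the parseNDJSON test earlier, the camelCase → snake_case mapping above would produce a single entry roughly like:
{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 8, output_tokens: 802,
  cache_read_input_tokens: 88133, cache_creation_input_tokens: 9223, cost_usd: 0.07308 }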
+193
@@ -0,0 +1,193 @@
/**
* Tests for computeTrends() — per-test pass rate trend tracking.
*/
import { describe, test, expect } from 'bun:test';
import { computeTrends } from '../lib/cli-eval';
import type { EvalResult } from './helpers/eval-store';
/** Build a minimal EvalResult with given tests. */
function makeRun(opts: {
timestamp: string;
tier?: 'e2e' | 'llm-judge';
tests: Array<{ name: string; passed: boolean }>;
}): EvalResult {
return {
schema_version: 1,
version: '0.3.3',
branch: 'main',
git_sha: 'abc',
timestamp: opts.timestamp,
hostname: 'test',
tier: opts.tier || 'e2e',
total_tests: opts.tests.length,
passed: opts.tests.filter(t => t.passed).length,
failed: opts.tests.filter(t => !t.passed).length,
total_cost_usd: 0,
total_duration_ms: 0,
tests: opts.tests.map(t => ({
name: t.name, suite: 'test', tier: opts.tier || 'e2e' as const,
passed: t.passed, duration_ms: 0, cost_usd: 0,
})),
};
}
describe('computeTrends', () => {
test('classifies stable-pass test correctly', () => {
// 10 runs all passing — results are newest-first (loadEvalResults order)
const results = Array.from({ length: 10 }, (_, i) => makeRun({
timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
tests: [{ name: 'always-pass', passed: true }],
})).reverse(); // newest first
const trends = computeTrends(results);
expect(trends).toHaveLength(1);
expect(trends[0].status).toBe('stable-pass');
expect(trends[0].passRate).toBe(1);
expect(trends[0].streak).toEqual({ type: 'pass', count: 10 });
expect(trends[0].flipCount).toBe(0);
});
test('classifies stable-fail test correctly', () => {
const results = Array.from({ length: 10 }, (_, i) => makeRun({
timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
tests: [{ name: 'always-fail', passed: false }],
})).reverse();
const trends = computeTrends(results);
expect(trends[0].status).toBe('stable-fail');
expect(trends[0].passRate).toBe(0);
expect(trends[0].streak).toEqual({ type: 'fail', count: 10 });
});
test('classifies flaky test correctly — alternating pass/fail', () => {
const results = Array.from({ length: 10 }, (_, i) => makeRun({
timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
tests: [{ name: 'flaky', passed: i % 2 === 0 }],
})).reverse();
const trends = computeTrends(results);
expect(trends[0].status).toBe('flaky');
expect(trends[0].flipCount).toBe(9);
expect(trends[0].passRate).toBe(0.5);
});
test('classifies improving test correctly', () => {
// First 5 fail, last 5 pass
const results = Array.from({ length: 10 }, (_, i) => makeRun({
timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
tests: [{ name: 'improving', passed: i >= 5 }],
})).reverse();
const trends = computeTrends(results);
expect(trends[0].status).toBe('improving');
expect(trends[0].streak).toEqual({ type: 'pass', count: 5 });
});
test('classifies degrading test correctly', () => {
// First 7 pass, last 3 fail
const results = Array.from({ length: 10 }, (_, i) => makeRun({
timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
tests: [{ name: 'degrading', passed: i < 7 }],
})).reverse();
const trends = computeTrends(results);
expect(trends[0].status).toBe('degrading');
expect(trends[0].streak).toEqual({ type: 'fail', count: 3 });
});
test('computes streak correctly with mixed ending', () => {
// pass, pass, fail, pass, pass, pass (newest)
const passed = [true, true, false, true, true, true];
const results = passed.map((p, i) => makeRun({
timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
tests: [{ name: 'test', passed: p }],
})).reverse();
const trends = computeTrends(results);
expect(trends[0].streak).toEqual({ type: 'pass', count: 3 });
});
test('computes flipCount correctly', () => {
// pass, fail, pass, pass, fail, pass → 4 flips
const passed = [true, false, true, true, false, true];
const results = passed.map((p, i) => makeRun({
timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
tests: [{ name: 'test', passed: p }],
})).reverse();
const trends = computeTrends(results);
expect(trends[0].flipCount).toBe(4);
});
test('handles single run', () => {
const results = [makeRun({
timestamp: '2026-03-15T00:00:00Z',
tests: [{ name: 'single', passed: true }],
})];
const trends = computeTrends(results);
expect(trends).toHaveLength(1);
expect(trends[0].passRate).toBe(1);
expect(trends[0].streak).toEqual({ type: 'pass', count: 1 });
expect(trends[0].flipCount).toBe(0);
expect(trends[0].status).toBe('stable-pass');
});
test('handles single failing run', () => {
const results = [makeRun({
timestamp: '2026-03-15T00:00:00Z',
tests: [{ name: 'single-fail', passed: false }],
})];
const trends = computeTrends(results);
expect(trends[0].status).toBe('stable-fail');
});
test('filters by tier', () => {
const results = [
makeRun({ timestamp: '2026-03-15T00:00:00Z', tier: 'e2e', tests: [{ name: 'e2e-test', passed: true }] }),
makeRun({ timestamp: '2026-03-15T00:00:00Z', tier: 'llm-judge', tests: [{ name: 'judge-test', passed: true }] }),
];
const e2eOnly = computeTrends(results, 'e2e');
expect(e2eOnly).toHaveLength(1);
expect(e2eOnly[0].name).toBe('e2e-test');
const judgeOnly = computeTrends(results, 'llm-judge');
expect(judgeOnly).toHaveLength(1);
expect(judgeOnly[0].name).toBe('judge-test');
});
test('filters by test name', () => {
const results = Array.from({ length: 3 }, (_, i) => makeRun({
timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
tests: [
{ name: 'test-a', passed: true },
{ name: 'test-b', passed: false },
],
})).reverse();
const filtered = computeTrends(results, undefined, 'test-a');
expect(filtered).toHaveLength(1);
expect(filtered[0].name).toBe('test-a');
expect(filtered[0].passRate).toBe(1);
});
test('sorts flaky tests first', () => {
// Create runs where test-a is flaky and test-b is stable
const results = Array.from({ length: 6 }, (_, i) => makeRun({
timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
tests: [
{ name: 'test-a', passed: i % 2 === 0 }, // flaky: alternating
{ name: 'test-b', passed: true }, // stable-pass
],
})).reverse();
const trends = computeTrends(results);
expect(trends[0].name).toBe('test-a');
expect(trends[0].status).toBe('flaky');
expect(trends[1].name).toBe('test-b');
expect(trends[1].status).toBe('stable-pass');
});
});
+1
@@ -41,6 +41,7 @@ function recordE2E(name: string, suite: string, result: SkillTestResult, extra?:
exit_reason: result.exitReason,
timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
last_tool_call: lastTool,
costs: result.costs,
...extra,
});
}
+57 -42
@@ -7,16 +7,18 @@
* Requires: ANTHROPIC_API_KEY env var (or EVALS=1 with key already set)
* Run: EVALS=1 bun run test:eval
*
* Cost: ~$0.05-0.15 per run (sonnet)
* Cost: ~$0.05-0.15 per run (sonnet), $0 on cache hit
* Cache: SHA-based via eval-cache. Set EVAL_CACHE=0 to force re-run.
* Model: Set EVAL_JUDGE_TIER=haiku|sonnet|opus to override (default: sonnet).
*/
import { describe, test, expect, afterAll } from 'bun:test';
import Anthropic from '@anthropic-ai/sdk';
import * as fs from 'fs';
import * as path from 'path';
import { callJudge, judge } from './helpers/llm-judge';
import type { JudgeScore } from './helpers/llm-judge';
import type { JudgeMeta } from './helpers/llm-judge';
import { EvalCollector } from './helpers/eval-store';
import { MODEL_PRICING } from '../lib/eval-cost';
const ROOT = path.resolve(import.meta.dir, '..');
// Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
@@ -26,6 +28,22 @@ const describeEval = evalsEnabled ? describe : describe.skip;
// Eval result collector
const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null;
/** Compute actual judge cost from meta (0 on cache hit). */
function judgeCost(meta: JudgeMeta): number {
if (meta.cached) return 0;
const p = MODEL_PRICING[meta.model] || { input: 3.0, output: 15.0 };
return (meta.input_tokens / 1_000_000) * p.input + (meta.output_tokens / 1_000_000) * p.output;
}
/** Build CostEntry array from judge meta (empty on cache hit). */
function judgeCosts(meta: JudgeMeta) {
if (meta.cached) return [];
return [{
model: meta.model, calls: 1,
input_tokens: meta.input_tokens, output_tokens: meta.output_tokens,
}];
}
describeEval('LLM-as-judge quality evals', () => {
test('command reference table scores >= 4 on all dimensions', async () => {
const t0 = Date.now();
@@ -34,8 +52,8 @@ describeEval('LLM-as-judge quality evals', () => {
const end = content.indexOf('## Tips');
const section = content.slice(start, end);
const scores = await judge('command reference table', section);
console.log('Command reference scores:', JSON.stringify(scores, null, 2));
const { result: scores, meta } = await judge('command reference table', section);
console.log('Command reference scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
evalCollector?.addTest({
name: 'command reference table',
@@ -43,9 +61,10 @@ describeEval('LLM-as-judge quality evals', () => {
tier: 'llm-judge',
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
cost_usd: judgeCost(meta),
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
judge_reasoning: scores.reasoning,
costs: judgeCosts(meta),
});
expect(scores.clarity).toBeGreaterThanOrEqual(4);
@@ -60,8 +79,8 @@ describeEval('LLM-as-judge quality evals', () => {
const end = content.indexOf('## Command Reference');
const section = content.slice(start, end);
const scores = await judge('snapshot flags reference', section);
console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2));
const { result: scores, meta } = await judge('snapshot flags reference', section);
console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
evalCollector?.addTest({
name: 'snapshot flags reference',
@@ -69,9 +88,10 @@ describeEval('LLM-as-judge quality evals', () => {
tier: 'llm-judge',
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
cost_usd: judgeCost(meta),
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
judge_reasoning: scores.reasoning,
costs: judgeCosts(meta),
});
expect(scores.clarity).toBeGreaterThanOrEqual(4);
@@ -85,8 +105,8 @@ describeEval('LLM-as-judge quality evals', () => {
const start = content.indexOf('## Snapshot Flags');
const section = content.slice(start);
const scores = await judge('browse skill reference (flags + commands)', section);
console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2));
const { result: scores, meta } = await judge('browse skill reference (flags + commands)', section);
console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
evalCollector?.addTest({
name: 'browse/SKILL.md reference',
@@ -94,9 +114,10 @@ describeEval('LLM-as-judge quality evals', () => {
tier: 'llm-judge',
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
cost_usd: judgeCost(meta),
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
judge_reasoning: scores.reasoning,
costs: judgeCosts(meta),
});
expect(scores.clarity).toBeGreaterThanOrEqual(4);
@@ -111,8 +132,8 @@ describeEval('LLM-as-judge quality evals', () => {
const setupEnd = content.indexOf('## IMPORTANT');
const section = content.slice(setupStart, setupEnd);
const scores = await judge('setup/binary discovery instructions', section);
console.log('Setup block scores:', JSON.stringify(scores, null, 2));
const { result: scores, meta } = await judge('setup/binary discovery instructions', section);
console.log('Setup block scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
evalCollector?.addTest({
name: 'setup block',
@@ -120,9 +141,10 @@ describeEval('LLM-as-judge quality evals', () => {
tier: 'llm-judge',
passed: scores.actionability >= 3 && scores.clarity >= 3,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
cost_usd: judgeCost(meta),
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
judge_reasoning: scores.reasoning,
costs: judgeCosts(meta),
});
// Setup block is intentionally minimal (binary discovery only).
@@ -171,13 +193,7 @@ describeEval('LLM-as-judge quality evals', () => {
| \`is <prop> <sel>\` | State check (visible/hidden/enabled/disabled/checked/editable/focused) |
| \`console [--clear\\|--errors]\` | Console messages (--errors filters to error/warning) |`;
const client = new Anthropic();
const response = await client.messages.create({
model: 'claude-sonnet-4-6',
max_tokens: 1024,
messages: [{
role: 'user',
content: `You are comparing two versions of CLI documentation for an AI coding agent.
const { result, meta } = await callJudge<{ winner: string; reasoning: string; a_score: number; b_score: number }>(`You are comparing two versions of CLI documentation for an AI coding agent.
VERSION A (baseline — hand-maintained):
${baseline}
@@ -193,15 +209,9 @@ Which version is better for an AI agent trying to use these commands? Consider:
Respond with ONLY valid JSON:
{"winner": "A" or "B" or "tie", "reasoning": "brief explanation", "a_score": N, "b_score": N}
Scores are 1-5 overall quality.`,
}],
});
Scores are 1-5 overall quality.`);
const text = response.content[0].type === 'text' ? response.content[0].text : '';
const jsonMatch = text.match(/\{[\s\S]*\}/);
if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
const result = JSON.parse(jsonMatch[0]);
console.log('Regression comparison:', JSON.stringify(result, null, 2));
console.log('Regression comparison:', JSON.stringify(result, null, 2), meta.cached ? '(cached)' : '');
evalCollector?.addTest({
name: 'regression vs baseline',
@@ -209,9 +219,10 @@ Scores are 1-5 overall quality.`,
tier: 'llm-judge',
passed: result.b_score >= result.a_score,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
cost_usd: judgeCost(meta),
judge_scores: { a_score: result.a_score, b_score: result.b_score },
judge_reasoning: result.reasoning,
costs: judgeCosts(meta),
});
expect(result.b_score).toBeGreaterThanOrEqual(result.a_score);
@@ -229,7 +240,7 @@ describeEval('QA skill quality evals', () => {
const end = qaContent.indexOf('## Health Score Rubric');
const section = qaContent.slice(start, end);
const scores = await callJudge<JudgeScore>(`You are evaluating the quality of a QA testing workflow document for an AI coding agent.
const { result: scores, meta } = await callJudge<{ clarity: number; completeness: number; actionability: number; reasoning: string }>(`You are evaluating the quality of a QA testing workflow document for an AI coding agent.
The agent reads this document to learn how to systematically QA test a web application. The workflow references
a headless browser CLI ($B commands) that is documented separately — do NOT penalize for missing CLI definitions.
@@ -246,7 +257,7 @@ Respond with ONLY valid JSON:
Here is the QA workflow to evaluate:
${section}`);
console.log('QA workflow scores:', JSON.stringify(scores, null, 2));
console.log('QA workflow scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
evalCollector?.addTest({
name: 'qa/SKILL.md workflow',
@@ -254,9 +265,10 @@ ${section}`);
tier: 'llm-judge',
passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
cost_usd: judgeCost(meta),
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
judge_reasoning: scores.reasoning,
costs: judgeCosts(meta),
});
expect(scores.clarity).toBeGreaterThanOrEqual(4);
@@ -271,7 +283,7 @@ ${section}`);
const start = qaContent.indexOf('## Health Score Rubric');
const section = qaContent.slice(start);
const scores = await callJudge<JudgeScore>(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score.
const { result: scores, meta } = await callJudge<{ clarity: number; completeness: number; actionability: number; reasoning: string }>(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score.
The agent uses this rubric after QA testing a website. It needs to:
1. Understand each scoring category and what counts as a deduction
@@ -289,7 +301,7 @@ Respond with ONLY valid JSON:
Here is the rubric to evaluate:
${section}`);
console.log('QA health rubric scores:', JSON.stringify(scores, null, 2));
console.log('QA health rubric scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
evalCollector?.addTest({
name: 'qa/SKILL.md health rubric',
@@ -297,9 +309,10 @@ ${section}`);
tier: 'llm-judge',
passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
cost_usd: judgeCost(meta),
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
judge_reasoning: scores.reasoning,
costs: judgeCosts(meta),
});
expect(scores.clarity).toBeGreaterThanOrEqual(4);
@@ -332,7 +345,7 @@ describeEval('Cross-skill consistency evals', () => {
extractGrepLines(retroContent, 'retro/SKILL.md'),
].join('\n\n');
const result = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently.
const { result, meta } = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently.
INTENDED ARCHITECTURE:
- greptile-history has TWO paths: per-project (~/.gstack/projects/{slug}/greptile-history.md) and global (~/.gstack/greptile-history.md)
@@ -355,7 +368,7 @@ Evaluate consistency. Respond with ONLY valid JSON:
score (1-5): 5 = perfectly consistent, 1 = contradictory`);
console.log('Cross-skill consistency:', JSON.stringify(result, null, 2));
console.log('Cross-skill consistency:', JSON.stringify(result, null, 2), meta.cached ? '(cached)' : '');
evalCollector?.addTest({
name: 'cross-skill greptile consistency',
@@ -363,9 +376,10 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`);
tier: 'llm-judge',
passed: result.consistent && result.score >= 4,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
cost_usd: judgeCost(meta),
judge_scores: { consistency_score: result.score },
judge_reasoning: result.reasoning,
costs: judgeCosts(meta),
});
expect(result.consistent).toBe(true);
@@ -392,7 +406,7 @@ describeEval('Baseline score pinning', () => {
const cmdStart = skillContent.indexOf('## Command Reference');
const cmdEnd = skillContent.indexOf('## Tips');
const cmdSection = skillContent.slice(cmdStart, cmdEnd);
const cmdScores = await judge('command reference table', cmdSection);
const { result: cmdScores, meta } = await judge('command reference table', cmdSection);
for (const dim of ['clarity', 'completeness', 'actionability'] as const) {
if (cmdScores[dim] < baselines.command_reference[dim]) {
@@ -417,9 +431,10 @@ describeEval('Baseline score pinning', () => {
tier: 'llm-judge',
passed,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
cost_usd: judgeCost(meta),
judge_scores: { clarity: cmdScores.clarity, completeness: cmdScores.completeness, actionability: cmdScores.actionability },
judge_reasoning: passed ? 'All scores at or above baseline' : regressions.join('; '),
costs: judgeCosts(meta),
});
if (!passed) {