From 02925cfc7a479b1adb397e0c8d811fed24966b2c Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 16:47:27 -0500 Subject: [PATCH 1/8] feat: wire costs[] from modelUsage into eval results Extract per-model token usage from resultLine.modelUsage (including cache tokens and exact API cost), flow CostEntry[] through EvalCollector, aggregate in finalize(). Extend CostEntry with cache_read_input_tokens, cache_creation_input_tokens, cost_usd. computeCosts() prefers exact cost_usd over MODEL_PRICING when available (~4x more accurate with prompt caching). Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/eval-cost.ts | 23 +++++++--- lib/eval-format.ts | 4 ++ test/helpers/eval-store.test.ts | 68 +++++++++++++++++++++++++++++ test/helpers/eval-store.ts | 25 +++++++++++ test/helpers/session-runner.test.ts | 32 ++++++++++++++ test/helpers/session-runner.ts | 24 +++++++++- test/skill-e2e.test.ts | 1 + 7 files changed, 170 insertions(+), 7 deletions(-) diff --git a/lib/eval-cost.ts b/lib/eval-cost.ts index 1dbe31c8..ac520c88 100644 --- a/lib/eval-cost.ts +++ b/lib/eval-cost.ts @@ -55,6 +55,9 @@ function getPricing(model: string): { input: number; output: number } { export function computeCosts(costs: CostEntry[]): CostDashboard { const byModel = new Map(); + // Track exact cost_usd sums per model (from API-provided costs) + const exactCosts = new Map(); + for (const entry of costs) { const existing = byModel.get(entry.model); if (existing) { @@ -70,9 +73,12 @@ export function computeCosts(costs: CostEntry[]): CostDashboard { estimated_cost_usd: 0, }); } + if (entry.cost_usd !== undefined) { + exactCosts.set(entry.model, (exactCosts.get(entry.model) || 0) + entry.cost_usd); + } } - // Calculate costs + // Calculate costs — prefer exact cost_usd (accounts for cache discounts) let total = 0; let atFast = 0; let atFull = 0; @@ -80,13 +86,18 @@ export function computeCosts(costs: CostEntry[]): CostDashboard { const fullPricing = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING; for (const summary of byModel.values()) { - const pricing = getPricing(summary.model); - summary.estimated_cost_usd = - (summary.input_tokens / 1_000_000) * pricing.input + - (summary.output_tokens / 1_000_000) * pricing.output; + const exact = exactCosts.get(summary.model); + if (exact !== undefined) { + summary.estimated_cost_usd = exact; + } else { + const pricing = getPricing(summary.model); + summary.estimated_cost_usd = + (summary.input_tokens / 1_000_000) * pricing.input + + (summary.output_tokens / 1_000_000) * pricing.output; + } total += summary.estimated_cost_usd; - // What-if at fast/full tiers + // What-if at fast/full tiers (always from token counts) atFast += (summary.input_tokens / 1_000_000) * fastPricing.input + (summary.output_tokens / 1_000_000) * fastPricing.output; diff --git a/lib/eval-format.ts b/lib/eval-format.ts index 0dcc347d..6a88cac2 100644 --- a/lib/eval-format.ts +++ b/lib/eval-format.ts @@ -15,6 +15,10 @@ export interface CostEntry { calls: number; input_tokens: number; output_tokens: number; + cache_read_input_tokens?: number; + cache_creation_input_tokens?: number; + /** Exact cost from API when available (accounts for cache discounts). 
*/ + cost_usd?: number; } export interface FailureEntry { diff --git a/test/helpers/eval-store.test.ts b/test/helpers/eval-store.test.ts index a0539a0e..b0c5e742 100644 --- a/test/helpers/eval-store.test.ts +++ b/test/helpers/eval-store.test.ts @@ -128,6 +128,74 @@ describe('EvalCollector', () => { expect(data.tests).toHaveLength(0); expect(data.tier).toBe('llm-judge'); }); + + test('finalize aggregates per-test costs into result-level costs[]', async () => { + const collector = new EvalCollector('e2e', tmpDir); + collector.addTest(makeEntry({ + name: 'test-a', + costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 100, output_tokens: 50, cost_usd: 0.01 }], + })); + collector.addTest(makeEntry({ + name: 'test-b', + costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 200, output_tokens: 100, cost_usd: 0.02 }], + })); + collector.addTest(makeEntry({ + name: 'test-c', + costs: [{ model: 'claude-haiku-4-5', calls: 1, input_tokens: 50, output_tokens: 25, cost_usd: 0.005 }], + })); + + const filepath = await collector.finalize(); + const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8')); + + expect(data.costs).toBeDefined(); + expect(data.costs).toHaveLength(2); // two models + const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6'); + const haiku = data.costs!.find(c => c.model === 'claude-haiku-4-5'); + expect(sonnet).toBeDefined(); + expect(sonnet!.calls).toBe(2); + expect(sonnet!.input_tokens).toBe(300); + expect(sonnet!.output_tokens).toBe(150); + expect(sonnet!.cost_usd).toBeCloseTo(0.03); + expect(haiku).toBeDefined(); + expect(haiku!.calls).toBe(1); + expect(haiku!.cost_usd).toBeCloseTo(0.005); + }); + + test('finalize omits costs when no tests have cost data', async () => { + const collector = new EvalCollector('e2e', tmpDir); + collector.addTest(makeEntry({ name: 'no-costs' })); + const filepath = await collector.finalize(); + const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8')); + expect(data.costs).toBeUndefined(); + }); + + test('finalize aggregates cache token fields', async () => { + const collector = new EvalCollector('e2e', tmpDir); + collector.addTest(makeEntry({ + name: 'test-a', + costs: [{ + model: 'claude-sonnet-4-6', calls: 1, + input_tokens: 10, output_tokens: 50, + cache_read_input_tokens: 5000, cache_creation_input_tokens: 1000, + cost_usd: 0.01, + }], + })); + collector.addTest(makeEntry({ + name: 'test-b', + costs: [{ + model: 'claude-sonnet-4-6', calls: 1, + input_tokens: 20, output_tokens: 100, + cache_read_input_tokens: 8000, cache_creation_input_tokens: 500, + cost_usd: 0.02, + }], + })); + + const filepath = await collector.finalize(); + const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8')); + const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6')!; + expect(sonnet.cache_read_input_tokens).toBe(13000); + expect(sonnet.cache_creation_input_tokens).toBe(1500); + }); }); // --- extractToolSummary tests --- diff --git a/test/helpers/eval-store.ts b/test/helpers/eval-store.ts index 63534322..46f1ce88 100644 --- a/test/helpers/eval-store.ts +++ b/test/helpers/eval-store.ts @@ -13,6 +13,7 @@ import * as path from 'path'; import * as os from 'os'; import { spawnSync } from 'child_process'; import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '../../lib/util'; +import type { CostEntry } from '../../lib/eval-format'; const SCHEMA_VERSION = 1; const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals'); @@ -50,6 +51,9 @@ export 
interface EvalTestEntry { detected_bugs?: string[]; missed_bugs?: string[]; + // Per-model cost breakdown + costs?: CostEntry[]; + error?: string; } @@ -67,6 +71,7 @@ export interface EvalResult { total_cost_usd: number; total_duration_ms: number; tests: EvalTestEntry[]; + costs?: CostEntry[]; // aggregate per-model cost breakdown _partial?: boolean; // true for incremental saves, absent in final } @@ -414,6 +419,25 @@ export class EvalCollector { const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0); const passed = this.tests.filter(t => t.passed).length; + // Aggregate per-model costs across all tests + const costMap = new Map(); + for (const t of this.tests) { + for (const c of t.costs || []) { + const existing = costMap.get(c.model); + if (existing) { + existing.calls += c.calls; + existing.input_tokens += c.input_tokens; + existing.output_tokens += c.output_tokens; + existing.cache_read_input_tokens = (existing.cache_read_input_tokens || 0) + (c.cache_read_input_tokens || 0); + existing.cache_creation_input_tokens = (existing.cache_creation_input_tokens || 0) + (c.cache_creation_input_tokens || 0); + if (c.cost_usd !== undefined) existing.cost_usd = (existing.cost_usd || 0) + c.cost_usd; + } else { + costMap.set(c.model, { ...c }); + } + } + } + const costs = costMap.size > 0 ? [...costMap.values()] : undefined; + const result: EvalResult = { schema_version: SCHEMA_VERSION, version, @@ -428,6 +452,7 @@ export class EvalCollector { total_cost_usd: Math.round(totalCost * 100) / 100, total_duration_ms: totalDuration, tests: this.tests, + costs, }; // Write eval file diff --git a/test/helpers/session-runner.test.ts b/test/helpers/session-runner.test.ts index 812d4f8a..9a06dd66 100644 --- a/test/helpers/session-runner.test.ts +++ b/test/helpers/session-runner.test.ts @@ -93,4 +93,36 @@ describe('parseNDJSON', () => { expect(parsed.turnCount).toBe(2); expect(parsed.toolCalls).toHaveLength(0); }); + + test('resultLine preserves modelUsage for cost extraction', () => { + const lines = [ + '{"type":"assistant","message":{"model":"claude-sonnet-4-6","content":[{"type":"text","text":"ok"}]}}', + JSON.stringify({ + type: 'result', subtype: 'success', total_cost_usd: 0.07, + num_turns: 1, result: 'Done.', + usage: { input_tokens: 8, output_tokens: 802 }, + modelUsage: { + 'claude-sonnet-4-6': { + inputTokens: 8, outputTokens: 802, + cacheReadInputTokens: 88133, cacheCreationInputTokens: 9223, + costUSD: 0.07308, + }, + }, + }), + ]; + const parsed = parseNDJSON(lines); + expect(parsed.resultLine).not.toBeNull(); + expect(parsed.resultLine.modelUsage).toBeDefined(); + const usage = parsed.resultLine.modelUsage['claude-sonnet-4-6']; + expect(usage.inputTokens).toBe(8); + expect(usage.outputTokens).toBe(802); + expect(usage.cacheReadInputTokens).toBe(88133); + expect(usage.costUSD).toBeCloseTo(0.07308); + }); + + test('resultLine without modelUsage has undefined modelUsage', () => { + const parsed = parseNDJSON(FIXTURE_LINES); + // Original fixture has no modelUsage on result line + expect(parsed.resultLine?.modelUsage).toBeUndefined(); + }); }); diff --git a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts index 33c4cf14..b04465fa 100644 --- a/test/helpers/session-runner.ts +++ b/test/helpers/session-runner.ts @@ -10,6 +10,8 @@ import * as fs from 'fs'; import * as path from 'path'; import * as os from 'os'; import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util'; +import type { CostEntry } from '../../lib/eval-format'; +import { 
resolveTier, tierToModel } from '../../lib/eval-tier';

 const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');

@@ -34,6 +36,7 @@ export interface SkillTestResult {
   output: string;
   costEstimate: CostEstimate;
   transcript: any[];
+  costs: CostEntry[];
 }

 const BROWSE_ERROR_PATTERNS = [
@@ -135,8 +138,11 @@ export async function runSkillTest(options: {

   // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
   // avoid shell escaping issues. --verbose is required for stream-json mode.
+  // Model pinned via EVAL_TIER env var (default: sonnet).
+  const evalModel = tierToModel(resolveTier());
   const args = [
     '-p',
+    '--model', evalModel,
     '--output-format', 'stream-json',
     '--verbose',
     '--dangerously-skip-permissions',
@@ -323,5 +329,21 @@ export async function runSkillTest(options: {
     turnsUsed,
   };

-  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript };
+  // Extract per-model costs from resultLine.modelUsage (camelCase → snake_case)
+  const costs: CostEntry[] = [];
+  if (resultLine?.modelUsage) {
+    for (const [model, usage] of Object.entries(resultLine.modelUsage as Record<string, any>)) {
+      costs.push({
+        model,
+        calls: 1,
+        input_tokens: usage.inputTokens || 0,
+        output_tokens: usage.outputTokens || 0,
+        cache_read_input_tokens: usage.cacheReadInputTokens || 0,
+        cache_creation_input_tokens: usage.cacheCreationInputTokens || 0,
+        cost_usd: usage.costUSD,
+      });
+    }
+  }
+
+  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, costs };
 }
diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts
index 758f0d3f..19da2de4 100644
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@@ -41,6 +41,7 @@ function recordE2E(name: string, suite: string, result: SkillTestResult, extra?:
     exit_reason: result.exitReason,
     timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
     last_tool_call: lastTool,
+    costs: result.costs,
     ...extra,
   });
 }

From 59752fc5101bec9622cc4277cb427dcd4bff05b9 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Sun, 15 Mar 2026 16:47:35 -0500
Subject: [PATCH 2/8] feat: wire eval-cache + eval-tier into LLM judge, pin E2E model

callJudge/judge now return {result, meta} with SHA-based caching
(~$0.18/run savings when SKILL.md unchanged) and dynamic model selection
via EVAL_JUDGE_TIER env var. E2E tests pass --model from EVAL_TIER to
claude -p. outcomeJudge retains simple return type. All 8 LLM eval test
sites updated with real costs and costs[].

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 TODOS.md                       |   4 +-
 test/helpers/llm-judge.test.ts | 117 +++++++++++++++++++++++++++++
 test/helpers/llm-judge.ts      |  59 ++++++++++++++---
 test/skill-llm-eval.test.ts    |  99 ++++++++++++++-----------
 4 files changed, 227 insertions(+), 52 deletions(-)
 create mode 100644 test/helpers/llm-judge.test.ts

diff --git a/TODOS.md b/TODOS.md
index 4916c236..b5ec8ac3 100644
--- a/TODOS.md
+++ b/TODOS.md
@@ -231,7 +231,7 @@

 **Why:** Spot quality trends — is the app getting better or worse?

-**Context:** QA already writes structured reports. This adds cross-run comparison.
+**Context:** `eval:trend` now tracks test-level pass rates (eval infrastructure). QA-run-level trending (health scores over time across QA report files) is a separate feature that could reuse the `computeTrends` pattern from `lib/cli-eval.ts`.

 **Effort:** S
 **Priority:** P2
@@ -335,6 +335,8 @@

 **Why:** Reduce E2E test cost and flakiness.
+
+**Status:** Model pinning shipped (session-runner.ts passes `--model` from `EVAL_TIER` env). Retry:2 still TODO.
+
 **Effort:** XS
 **Priority:** P2

diff --git a/test/helpers/llm-judge.test.ts b/test/helpers/llm-judge.test.ts
new file mode 100644
index 00000000..03cf7788
--- /dev/null
+++ b/test/helpers/llm-judge.test.ts
@@ -0,0 +1,117 @@
+/**
+ * Tests for LLM judge cache + tier integration.
+ * Exercises the cache and tier modules directly; no Anthropic API calls.
+ */
+
+import { describe, test, expect, beforeEach, afterEach, mock } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+let tmpCacheDir: string;
+const origEnv: Record<string, string | undefined> = {};
+
+beforeEach(() => {
+  tmpCacheDir = fs.mkdtempSync(path.join(os.tmpdir(), 'llm-judge-test-'));
+  // Point cache to temp dir and clear tier env vars
+  origEnv.GSTACK_STATE_DIR = process.env.GSTACK_STATE_DIR;
+  origEnv.EVAL_JUDGE_TIER = process.env.EVAL_JUDGE_TIER;
+  origEnv.EVAL_TIER = process.env.EVAL_TIER;
+  origEnv.EVAL_CACHE = process.env.EVAL_CACHE;
+  process.env.GSTACK_STATE_DIR = tmpCacheDir;
+  delete process.env.EVAL_JUDGE_TIER;
+  delete process.env.EVAL_TIER;
+  delete process.env.EVAL_CACHE;
+});
+
+afterEach(() => {
+  // Restore env
+  for (const [key, val] of Object.entries(origEnv)) {
+    if (val === undefined) delete process.env[key];
+    else process.env[key] = val;
+  }
+  try { fs.rmSync(tmpCacheDir, { recursive: true, force: true }); } catch {}
+});
+
+// Test cache key computation directly (doesn't need mock)
+describe('cache key computation', () => {
+  test('computeCacheKey produces consistent hashes for same input', async () => {
+    const { computeCacheKey } = await import('../../lib/eval-cache');
+    const key1 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
+    const key2 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
+    expect(key1).toBe(key2);
+    expect(key1).toHaveLength(16);
+  });
+
+  test('cache key differs when model changes', async () => {
+    const { computeCacheKey } = await import('../../lib/eval-cache');
+    const key1 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
+    const key2 = computeCacheKey([], 'claude-haiku-4-5:test prompt');
+    expect(key1).not.toBe(key2);
+  });
+
+  test('cache key differs when prompt changes', async () => {
+    const { computeCacheKey } = await import('../../lib/eval-cache');
+    const key1 = computeCacheKey([], 'claude-sonnet-4-6:prompt A');
+    const key2 = computeCacheKey([], 'claude-sonnet-4-6:prompt B');
+    expect(key1).not.toBe(key2);
+  });
+});
+
+// Test cache read/write directly
+describe('cache read/write for llm-judge suite', () => {
+  test('cacheRead returns null on miss', async () => {
+    const { cacheRead } = await import('../../lib/eval-cache');
+    expect(cacheRead('llm-judge', 'nonexistent')).toBeNull();
+  });
+
+  test('cacheWrite + cacheRead round-trip', async () => {
+    const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
+    const data = { clarity: 5, completeness: 4, actionability: 5, reasoning: 'test' };
+    cacheWrite('llm-judge', 'test-key', data, { model: 'claude-sonnet-4-6' });
+    const cached = cacheRead('llm-judge', 'test-key');
+    expect(cached).toEqual(data);
+  });
+
+  test('EVAL_CACHE=0 bypasses cache read', async () => {
+    const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
+    cacheWrite('llm-judge', 'bypass-key', { test: true });
+    process.env.EVAL_CACHE = '0';
+    expect(cacheRead('llm-judge', 'bypass-key')).toBeNull();
+  });
+});
+
+// Test tier resolution
+describe('tier resolution for judge', () => {
test('defaults to standard (sonnet) when no env set', async () => { + const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier'); + expect(resolveJudgeTier()).toBe('standard'); + expect(tierToModel(resolveJudgeTier())).toBe('claude-sonnet-4-6'); + }); + + test('EVAL_JUDGE_TIER=haiku selects fast tier', async () => { + process.env.EVAL_JUDGE_TIER = 'haiku'; + // Need fresh import to pick up env change + const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier'); + expect(resolveJudgeTier()).toBe('fast'); + expect(tierToModel(resolveJudgeTier())).toBe('claude-haiku-4-5'); + }); + + test('EVAL_JUDGE_TIER=opus selects full tier', async () => { + process.env.EVAL_JUDGE_TIER = 'opus'; + const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier'); + expect(resolveJudgeTier()).toBe('full'); + expect(tierToModel(resolveJudgeTier())).toBe('claude-opus-4-6'); + }); +}); + +// Test JudgeMeta shape +describe('JudgeMeta interface', () => { + test('exported from llm-judge module', async () => { + const mod = await import('./llm-judge'); + // Verify callJudge and judge are exported functions + expect(typeof mod.callJudge).toBe('function'); + expect(typeof mod.judge).toBe('function'); + expect(typeof mod.outcomeJudge).toBe('function'); + }); +}); diff --git a/test/helpers/llm-judge.ts b/test/helpers/llm-judge.ts index 7040cd6c..61d6927a 100644 --- a/test/helpers/llm-judge.ts +++ b/test/helpers/llm-judge.ts @@ -1,13 +1,19 @@ /** * Shared LLM-as-judge helpers for eval and E2E tests. * - * Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer), - * and outcomeJudge (planted-bug detection scorer). + * Provides callJudge (generic JSON-from-LLM with cache + tier support), + * judge (doc quality scorer), and outcomeJudge (planted-bug detection scorer). * - * Requires: ANTHROPIC_API_KEY env var + * Requires: ANTHROPIC_API_KEY env var (skipped on cache hit) + * + * Env vars: + * EVAL_JUDGE_TIER — model tier for judge calls (fast/standard/full, default: standard) + * EVAL_CACHE=0 — bypass cache, always re-run */ import Anthropic from '@anthropic-ai/sdk'; +import { computeCacheKey, cacheRead, cacheWrite } from '../../lib/eval-cache'; +import { resolveJudgeTier, tierToModel } from '../../lib/eval-tier'; export interface JudgeScore { clarity: number; // 1-5 @@ -25,15 +31,35 @@ export interface OutcomeJudgeResult { reasoning: string; } +export interface JudgeMeta { + model: string; + input_tokens: number; + output_tokens: number; + cached: boolean; +} + /** - * Call claude-sonnet-4-6 with a prompt, extract JSON response. + * Call the judge model with a prompt, extract JSON response. + * Uses eval-cache for SHA-based caching and eval-tier for model selection. * Retries once on 429 rate limit errors. 
 */
-export async function callJudge<T>(prompt: string): Promise<T> {
+export async function callJudge<T>(prompt: string): Promise<{ result: T; meta: JudgeMeta }> {
+  const model = tierToModel(resolveJudgeTier());
+
+  // Check cache (keyed by model + prompt content)
+  const cacheKey = computeCacheKey([], `${model}:${prompt}`);
+  const cached = cacheRead('llm-judge', cacheKey);
+  if (cached !== null) {
+    return {
+      result: cached as T,
+      meta: { model, input_tokens: 0, output_tokens: 0, cached: true },
+    };
+  }
+
   const client = new Anthropic();

   const makeRequest = () => client.messages.create({
-    model: 'claude-sonnet-4-6',
+    model,
     max_tokens: 1024,
     messages: [{ role: 'user', content: prompt }],
   });
@@ -53,13 +79,25 @@
   const text = response.content[0].type === 'text' ? response.content[0].text : '';
   const jsonMatch = text.match(/\{[\s\S]*\}/);
   if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
-  return JSON.parse(jsonMatch[0]) as T;
+  const result = JSON.parse(jsonMatch[0]) as T;
+
+  // Write to cache
+  cacheWrite('llm-judge', cacheKey, result, { model });
+
+  const meta: JudgeMeta = {
+    model,
+    input_tokens: (response.usage as any)?.input_tokens || 0,
+    output_tokens: (response.usage as any)?.output_tokens || 0,
+    cached: false,
+  };
+
+  return { result, meta };
 }

 /**
  * Score documentation quality on clarity/completeness/actionability (1-5).
  */
-export async function judge(section: string, content: string): Promise<JudgeScore> {
+export async function judge(section: string, content: string): Promise<{ result: JudgeScore; meta: JudgeMeta }> {
   return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.

 The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
@@ -92,12 +130,14 @@ ${content}`);

 /**
  * Evaluate a QA report against planted-bug ground truth.
  * Returns detection metrics for the planted bugs.
+ * Note: outcomeJudge returns just the result (not meta) for backward compat
+ * with E2E test callers. Cache still works internally.
  */
 export async function outcomeJudge(
   groundTruth: any,
   report: string,
 ): Promise<OutcomeJudgeResult> {
-  return callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
+  const { result } = await callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.

 GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
 ${JSON.stringify(groundTruth.bugs, null, 2)}
@@ -127,4 +167,5 @@ Rules:
 - detection_rate = length of detected array
 - evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references? 5 = excellent evidence for every bug, 1 = no evidence at all`);
+  return result;
 }
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index ba635613..2889538c 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -7,16 +7,18 @@
  * Requires: ANTHROPIC_API_KEY env var (or EVALS=1 with key already set)
  * Run: EVALS=1 bun run test:eval
  *
- * Cost: ~$0.05-0.15 per run (sonnet)
+ * Cost: ~$0.05-0.15 per run (sonnet), $0 on cache hit
+ * Cache: SHA-based via eval-cache. Set EVAL_CACHE=0 to force re-run.
+ * Model: Set EVAL_JUDGE_TIER=haiku|sonnet|opus to override (default: sonnet).
*/ import { describe, test, expect, afterAll } from 'bun:test'; -import Anthropic from '@anthropic-ai/sdk'; import * as fs from 'fs'; import * as path from 'path'; import { callJudge, judge } from './helpers/llm-judge'; -import type { JudgeScore } from './helpers/llm-judge'; +import type { JudgeMeta } from './helpers/llm-judge'; import { EvalCollector } from './helpers/eval-store'; +import { MODEL_PRICING } from '../lib/eval-cost'; const ROOT = path.resolve(import.meta.dir, '..'); // Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env) @@ -26,6 +28,22 @@ const describeEval = evalsEnabled ? describe : describe.skip; // Eval result collector const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null; +/** Compute actual judge cost from meta (0 on cache hit). */ +function judgeCost(meta: JudgeMeta): number { + if (meta.cached) return 0; + const p = MODEL_PRICING[meta.model] || { input: 3.0, output: 15.0 }; + return (meta.input_tokens / 1_000_000) * p.input + (meta.output_tokens / 1_000_000) * p.output; +} + +/** Build CostEntry array from judge meta (empty on cache hit). */ +function judgeCosts(meta: JudgeMeta) { + if (meta.cached) return []; + return [{ + model: meta.model, calls: 1, + input_tokens: meta.input_tokens, output_tokens: meta.output_tokens, + }]; +} + describeEval('LLM-as-judge quality evals', () => { test('command reference table scores >= 4 on all dimensions', async () => { const t0 = Date.now(); @@ -34,8 +52,8 @@ describeEval('LLM-as-judge quality evals', () => { const end = content.indexOf('## Tips'); const section = content.slice(start, end); - const scores = await judge('command reference table', section); - console.log('Command reference scores:', JSON.stringify(scores, null, 2)); + const { result: scores, meta } = await judge('command reference table', section); + console.log('Command reference scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'command reference table', @@ -43,9 +61,10 @@ describeEval('LLM-as-judge quality evals', () => { tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -60,8 +79,8 @@ describeEval('LLM-as-judge quality evals', () => { const end = content.indexOf('## Command Reference'); const section = content.slice(start, end); - const scores = await judge('snapshot flags reference', section); - console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2)); + const { result: scores, meta } = await judge('snapshot flags reference', section); + console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2), meta.cached ? 
'(cached)' : ''); evalCollector?.addTest({ name: 'snapshot flags reference', @@ -69,9 +88,10 @@ describeEval('LLM-as-judge quality evals', () => { tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -85,8 +105,8 @@ describeEval('LLM-as-judge quality evals', () => { const start = content.indexOf('## Snapshot Flags'); const section = content.slice(start); - const scores = await judge('browse skill reference (flags + commands)', section); - console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2)); + const { result: scores, meta } = await judge('browse skill reference (flags + commands)', section); + console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'browse/SKILL.md reference', @@ -94,9 +114,10 @@ describeEval('LLM-as-judge quality evals', () => { tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -111,8 +132,8 @@ describeEval('LLM-as-judge quality evals', () => { const setupEnd = content.indexOf('## IMPORTANT'); const section = content.slice(setupStart, setupEnd); - const scores = await judge('setup/binary discovery instructions', section); - console.log('Setup block scores:', JSON.stringify(scores, null, 2)); + const { result: scores, meta } = await judge('setup/binary discovery instructions', section); + console.log('Setup block scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'setup block', @@ -120,9 +141,10 @@ describeEval('LLM-as-judge quality evals', () => { tier: 'llm-judge', passed: scores.actionability >= 3 && scores.clarity >= 3, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); // Setup block is intentionally minimal (binary discovery only). @@ -171,13 +193,7 @@ describeEval('LLM-as-judge quality evals', () => { | \`is \` | State check (visible/hidden/enabled/disabled/checked/editable/focused) | | \`console [--clear\\|--errors]\` | Console messages (--errors filters to error/warning) |`; - const client = new Anthropic(); - const response = await client.messages.create({ - model: 'claude-sonnet-4-6', - max_tokens: 1024, - messages: [{ - role: 'user', - content: `You are comparing two versions of CLI documentation for an AI coding agent. + const { result, meta } = await callJudge<{ winner: string; reasoning: string; a_score: number; b_score: number }>(`You are comparing two versions of CLI documentation for an AI coding agent. VERSION A (baseline — hand-maintained): ${baseline} @@ -193,15 +209,9 @@ Which version is better for an AI agent trying to use these commands? 
Consider: Respond with ONLY valid JSON: {"winner": "A" or "B" or "tie", "reasoning": "brief explanation", "a_score": N, "b_score": N} -Scores are 1-5 overall quality.`, - }], - }); +Scores are 1-5 overall quality.`); - const text = response.content[0].type === 'text' ? response.content[0].text : ''; - const jsonMatch = text.match(/\{[\s\S]*\}/); - if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`); - const result = JSON.parse(jsonMatch[0]); - console.log('Regression comparison:', JSON.stringify(result, null, 2)); + console.log('Regression comparison:', JSON.stringify(result, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'regression vs baseline', @@ -209,9 +219,10 @@ Scores are 1-5 overall quality.`, tier: 'llm-judge', passed: result.b_score >= result.a_score, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { a_score: result.a_score, b_score: result.b_score }, judge_reasoning: result.reasoning, + costs: judgeCosts(meta), }); expect(result.b_score).toBeGreaterThanOrEqual(result.a_score); @@ -229,7 +240,7 @@ describeEval('QA skill quality evals', () => { const end = qaContent.indexOf('## Health Score Rubric'); const section = qaContent.slice(start, end); - const scores = await callJudge(`You are evaluating the quality of a QA testing workflow document for an AI coding agent. + const { result: scores, meta } = await callJudge<{ clarity: number; completeness: number; actionability: number; reasoning: string }>(`You are evaluating the quality of a QA testing workflow document for an AI coding agent. The agent reads this document to learn how to systematically QA test a web application. The workflow references a headless browser CLI ($B commands) that is documented separately — do NOT penalize for missing CLI definitions. @@ -246,7 +257,7 @@ Respond with ONLY valid JSON: Here is the QA workflow to evaluate: ${section}`); - console.log('QA workflow scores:', JSON.stringify(scores, null, 2)); + console.log('QA workflow scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'qa/SKILL.md workflow', @@ -254,9 +265,10 @@ ${section}`); tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -271,7 +283,7 @@ ${section}`); const start = qaContent.indexOf('## Health Score Rubric'); const section = qaContent.slice(start); - const scores = await callJudge(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score. + const { result: scores, meta } = await callJudge<{ clarity: number; completeness: number; actionability: number; reasoning: string }>(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score. The agent uses this rubric after QA testing a website. It needs to: 1. Understand each scoring category and what counts as a deduction @@ -289,7 +301,7 @@ Respond with ONLY valid JSON: Here is the rubric to evaluate: ${section}`); - console.log('QA health rubric scores:', JSON.stringify(scores, null, 2)); + console.log('QA health rubric scores:', JSON.stringify(scores, null, 2), meta.cached ? 
'(cached)' : ''); evalCollector?.addTest({ name: 'qa/SKILL.md health rubric', @@ -297,9 +309,10 @@ ${section}`); tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -332,7 +345,7 @@ describeEval('Cross-skill consistency evals', () => { extractGrepLines(retroContent, 'retro/SKILL.md'), ].join('\n\n'); - const result = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently. + const { result, meta } = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently. INTENDED ARCHITECTURE: - greptile-history has TWO paths: per-project (~/.gstack/projects/{slug}/greptile-history.md) and global (~/.gstack/greptile-history.md) @@ -355,7 +368,7 @@ Evaluate consistency. Respond with ONLY valid JSON: score (1-5): 5 = perfectly consistent, 1 = contradictory`); - console.log('Cross-skill consistency:', JSON.stringify(result, null, 2)); + console.log('Cross-skill consistency:', JSON.stringify(result, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'cross-skill greptile consistency', @@ -363,9 +376,10 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`); tier: 'llm-judge', passed: result.consistent && result.score >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { consistency_score: result.score }, judge_reasoning: result.reasoning, + costs: judgeCosts(meta), }); expect(result.consistent).toBe(true); @@ -392,7 +406,7 @@ describeEval('Baseline score pinning', () => { const cmdStart = skillContent.indexOf('## Command Reference'); const cmdEnd = skillContent.indexOf('## Tips'); const cmdSection = skillContent.slice(cmdStart, cmdEnd); - const cmdScores = await judge('command reference table', cmdSection); + const { result: cmdScores, meta } = await judge('command reference table', cmdSection); for (const dim of ['clarity', 'completeness', 'actionability'] as const) { if (cmdScores[dim] < baselines.command_reference[dim]) { @@ -417,9 +431,10 @@ describeEval('Baseline score pinning', () => { tier: 'llm-judge', passed, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: cmdScores.clarity, completeness: cmdScores.completeness, actionability: cmdScores.actionability }, judge_reasoning: passed ? 'All scores at or above baseline' : regressions.join('; '), + costs: judgeCosts(meta), }); if (!passed) { From daea165333311848afcfd58aebaf711a71aff0b5 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 16:47:41 -0500 Subject: [PATCH 3/8] feat: add eval:trend CLI for per-test pass rate tracking computeTrends() classifies tests as stable-pass/stable-fail/flaky/ improving/degrading based on pass rate, flip count, and recent streak. gstack eval trend shows sparkline table with --limit, --tier, --test filters. Guard CLI main block with import.meta.main to prevent execution on import. 
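
Example usage (flags as parsed by cmdTrend; the test/tier names below are
real entries from this branch's suites, output shape illustrative):

    bun run eval:trend                        # last 10 runs, all tests
    bun run eval:trend --limit 20 --tier e2e
    bun run eval:trend --test "command reference table"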
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 lib/cli-eval.ts             | 192 ++++++++++++++++++++++++++++++++++-
 package.json                |   1 +
 test/lib-eval-trend.test.ts | 193 ++++++++++++++++++++++++++++++++++++
 3 files changed, 385 insertions(+), 1 deletion(-)
 create mode 100644 test/lib-eval-trend.test.ts

diff --git a/lib/cli-eval.ts b/lib/cli-eval.ts
index df16d033..bee75ae0 100644
--- a/lib/cli-eval.ts
+++ b/lib/cli-eval.ts
@@ -258,6 +258,7 @@ async function cmdSummary(args: string[]): Promise<void> {
   if (flakyTests.length > 0) {
     console.log(`  Flaky tests (${flakyTests.length}):`);
     for (const name of flakyTests) console.log(`    - ${name}`);
+    console.log(`  Run 'bun run eval:trend' for detailed time series.`);
     console.log('─'.repeat(60));
   }

@@ -429,6 +430,191 @@ async function cmdWatch(): Promise<void> {
   process.exit(exitCode);
 }

+// --- Trend tracking ---
+
+export interface TestTrend {
+  name: string;
+  tier: string;
+  results: Array<{ timestamp: string; passed: boolean }>;
+  passRate: number;
+  streak: { type: 'pass' | 'fail'; count: number };
+  flipCount: number;
+  status: 'stable-pass' | 'stable-fail' | 'flaky' | 'improving' | 'degrading';
+}
+
+/**
+ * Compute per-test pass rate trends from eval results.
+ * Pure function — no I/O. Takes runs newest-first (loadEvalResults order); returns each test's series oldest-first.
+ */
+export function computeTrends(
+  results: EvalResult[],
+  filterTier?: string,
+  filterTest?: string,
+): TestTrend[] {
+  // Build time series per test (chronological — oldest first)
+  const byTest = new Map<string, Array<{ timestamp: string; passed: boolean }>>();
+
+  // Results from loadEvalResults are newest-first, so reverse for chronological
+  const chronological = [...results].reverse();
+
+  for (const r of chronological) {
+    if (filterTier && r.tier !== filterTier) continue;
+    for (const t of r.tests) {
+      if (filterTest && t.name !== filterTest) continue;
+      const key = `${r.tier}:${t.name}`;
+      if (!byTest.has(key)) byTest.set(key, []);
+      byTest.get(key)!.push({ timestamp: r.timestamp, passed: t.passed });
+    }
+  }
+
+  const trends: TestTrend[] = [];
+
+  for (const [key, results] of byTest) {
+    const [tier, ...nameParts] = key.split(':');
+    const name = nameParts.join(':');
+    const total = results.length;
+    const passCount = results.filter(r => r.passed).length;
+    const passRate = total > 0 ? passCount / total : 0;
+
+    // Streak: walk from newest (end of array) backward
+    let streakType: 'pass' | 'fail' = results[results.length - 1].passed ? 'pass' : 'fail';
+    let streakCount = 0;
+    for (let i = results.length - 1; i >= 0; i--) {
+      const r = results[i].passed ? 'pass' : 'fail';
+      if (r === streakType) streakCount++;
+      else break;
+    }
+
+    // Flip count: transitions between pass and fail
+    let flipCount = 0;
+    for (let i = 1; i < results.length; i++) {
+      if (results[i].passed !== results[i - 1].passed) flipCount++;
+    }
+
+    // Classify status
+    let status: TestTrend['status'];
+    const last3 = results.slice(-3);
+    const earlier = results.slice(0, -3);
+    const last3AllPass = last3.length >= 3 && last3.every(r => r.passed);
+    const last3HasFail = last3.some(r => !r.passed);
+    const earlierHadFailures = earlier.some(r => !r.passed);
+    const earlierWasPassing = earlier.length > 0 && earlier.every(r => r.passed);
+
+    // Check improving/degrading first — a clear recent trend outranks raw pass rate
+    if (last3AllPass && earlierHadFailures) {
+      status = 'improving';
+    } else if (last3HasFail && earlierWasPassing) {
+      status = 'degrading';
+    } else if (flipCount >= 3 || (passRate > 0.3 && passRate < 0.7)) {
+      status = 'flaky';
+    } else if (passRate >= 0.9 && flipCount <= 1) {
+      status = 'stable-pass';
+    } else if (passRate <= 0.1 && flipCount <= 1) {
+      status = 'stable-fail';
+    } else if (passRate >= 0.5) {
+      status = 'stable-pass';
+    } else {
+      status = 'stable-fail';
+    }
+
+    trends.push({
+      name, tier, results, passRate,
+      streak: { type: streakType, count: streakCount },
+      flipCount, status,
+    });
+  }
+
+  // Sort: flaky first, then flipCount desc, then name
+  trends.sort((a, b) => {
+    const statusOrder = { flaky: 0, degrading: 1, improving: 2, 'stable-fail': 3, 'stable-pass': 4 };
+    const sa = statusOrder[a.status] ?? 5;
+    const sb = statusOrder[b.status] ?? 5;
+    if (sa !== sb) return sa - sb;
+    if (a.flipCount !== b.flipCount) return b.flipCount - a.flipCount;
+    return a.name.localeCompare(b.name);
+  });
+
+  return trends;
+}
+
+async function cmdTrend(args: string[]): Promise<void> {
+  let limit = 10;
+  let filterTier: string | undefined;
+  let filterTest: string | undefined;
+
+  for (let i = 0; i < args.length; i++) {
+    if (args[i] === '--limit' && args[i + 1]) { limit = parseInt(args[++i], 10); }
+    else if (args[i] === '--tier' && args[i + 1]) { filterTier = args[++i]; }
+    else if (args[i] === '--test' && args[i + 1]) { filterTest = args[++i]; }
+  }
+
+  const results = loadEvalResults(undefined, limit);
+  if (results.length === 0) {
+    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+    return;
+  }
+
+  const trends = computeTrends(results, filterTier, filterTest);
+
+  if (trends.length === 0) {
+    console.log('No test data matching filters.');
+    return;
+  }
+
+  // Determine how many result columns to show
+  const maxResults = Math.min(limit, Math.max(...trends.map(t => t.results.length)));
+
+  console.log('');
+  console.log(`Test Trends (last ${results.length} runs)`);
+  console.log('═'.repeat(80));
+  console.log(
+    '  ' +
+    'Test Name'.padEnd(36) +
+    'Rate'.padEnd(7) +
+    `Last ${maxResults}`.padEnd(maxResults + 3) +
+    'Streak'.padEnd(8) +
+    'Status'
+  );
+  console.log('─'.repeat(80));
+
+  let flakyCount = 0;
+  let degradingCount = 0;
+
+  for (const t of trends) {
+    if (t.status === 'flaky') flakyCount++;
+    if (t.status === 'degrading') degradingCount++;
+
+    const fullName = `${t.tier}:${t.name}`;
+    const displayName = (fullName.length > 34 ? fullName.slice(0, 31) + '...' : fullName).padEnd(36);
+    const rate = `${Math.round(t.passRate * 100)}%`.padEnd(7);
+
+    // Build sparkline of last N results
+    const sparkline = t.results
+      .slice(-maxResults)
+      .map(r => r.passed ?
'\u2713' : '\u2717') + .join(''); + + const streak = `${t.streak.count}${t.streak.type === 'pass' ? '\u2713' : '\u2717'}`.padEnd(8); + + // Color status + let statusStr = t.status; + if (isTTY) { + if (t.status === 'flaky' || t.status === 'degrading') statusStr = red(t.status); + else if (t.status === 'stable-pass' || t.status === 'improving') statusStr = green(t.status); + else statusStr = dim(t.status); + } + + console.log(` ${displayName}${rate}${sparkline.padEnd(maxResults + 3)}${streak}${statusStr}`); + } + + console.log('─'.repeat(80)); + const parts: string[] = [`${trends.length} tests tracked`]; + if (flakyCount > 0) parts.push(`${flakyCount} flaky`); + if (degradingCount > 0) parts.push(`${degradingCount} degrading`); + console.log(` ${parts.join(' | ')}`); + console.log(''); +} + function printUsage(): void { console.log(` gstack eval — eval management CLI @@ -441,13 +627,15 @@ Commands: summary [--limit N] Aggregate stats across all runs push Validate + save + sync an eval result cost Show per-model cost breakdown + trend [--limit N] [--tier X] [--test X] Per-test pass rate trends cache read|write|stats|clear|verify Manage eval cache watch Live E2E test dashboard `); } -// --- Main --- +// --- Main (only when run directly, not imported) --- +if (import.meta.main) { const command = process.argv[2]; const cmdArgs = process.argv.slice(3); @@ -457,6 +645,7 @@ switch (command) { case 'summary': cmdSummary(cmdArgs); break; case 'push': cmdPush(cmdArgs); break; case 'cost': cmdCost(cmdArgs); break; + case 'trend': cmdTrend(cmdArgs); break; case 'cache': cmdCache(cmdArgs); break; case 'watch': cmdWatch(); break; case '--help': case '-h': case 'help': case undefined: @@ -467,3 +656,4 @@ switch (command) { printUsage(); process.exit(1); } +} diff --git a/package.json b/package.json index 18090e7d..da816815 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,7 @@ "eval:list": "bun run lib/cli-eval.ts list", "eval:compare": "bun run lib/cli-eval.ts compare", "eval:summary": "bun run lib/cli-eval.ts summary", + "eval:trend": "bun run lib/cli-eval.ts trend", "eval:watch": "bun run lib/cli-eval.ts watch" }, "dependencies": { diff --git a/test/lib-eval-trend.test.ts b/test/lib-eval-trend.test.ts new file mode 100644 index 00000000..c15aa149 --- /dev/null +++ b/test/lib-eval-trend.test.ts @@ -0,0 +1,193 @@ +/** + * Tests for computeTrends() — per-test pass rate trend tracking. + */ + +import { describe, test, expect } from 'bun:test'; +import { computeTrends } from '../lib/cli-eval'; +import type { EvalResult } from './helpers/eval-store'; + +/** Build a minimal EvalResult with given tests. 
*/ +function makeRun(opts: { + timestamp: string; + tier?: 'e2e' | 'llm-judge'; + tests: Array<{ name: string; passed: boolean }>; +}): EvalResult { + return { + schema_version: 1, + version: '0.3.3', + branch: 'main', + git_sha: 'abc', + timestamp: opts.timestamp, + hostname: 'test', + tier: opts.tier || 'e2e', + total_tests: opts.tests.length, + passed: opts.tests.filter(t => t.passed).length, + failed: opts.tests.filter(t => !t.passed).length, + total_cost_usd: 0, + total_duration_ms: 0, + tests: opts.tests.map(t => ({ + name: t.name, suite: 'test', tier: opts.tier || 'e2e' as const, + passed: t.passed, duration_ms: 0, cost_usd: 0, + })), + }; +} + +describe('computeTrends', () => { + test('classifies stable-pass test correctly', () => { + // 10 runs all passing — results are newest-first (loadEvalResults order) + const results = Array.from({ length: 10 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'always-pass', passed: true }], + })).reverse(); // newest first + + const trends = computeTrends(results); + expect(trends).toHaveLength(1); + expect(trends[0].status).toBe('stable-pass'); + expect(trends[0].passRate).toBe(1); + expect(trends[0].streak).toEqual({ type: 'pass', count: 10 }); + expect(trends[0].flipCount).toBe(0); + }); + + test('classifies stable-fail test correctly', () => { + const results = Array.from({ length: 10 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'always-fail', passed: false }], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].status).toBe('stable-fail'); + expect(trends[0].passRate).toBe(0); + expect(trends[0].streak).toEqual({ type: 'fail', count: 10 }); + }); + + test('classifies flaky test correctly — alternating pass/fail', () => { + const results = Array.from({ length: 10 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'flaky', passed: i % 2 === 0 }], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].status).toBe('flaky'); + expect(trends[0].flipCount).toBe(9); + expect(trends[0].passRate).toBe(0.5); + }); + + test('classifies improving test correctly', () => { + // First 5 fail, last 5 pass + const results = Array.from({ length: 10 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'improving', passed: i >= 5 }], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].status).toBe('improving'); + expect(trends[0].streak).toEqual({ type: 'pass', count: 5 }); + }); + + test('classifies degrading test correctly', () => { + // First 7 pass, last 3 fail + const results = Array.from({ length: 10 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'degrading', passed: i < 7 }], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].status).toBe('degrading'); + expect(trends[0].streak).toEqual({ type: 'fail', count: 3 }); + }); + + test('computes streak correctly with mixed ending', () => { + // pass, pass, fail, pass, pass, pass (newest) + const passed = [true, true, false, true, true, true]; + const results = passed.map((p, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'test', passed: p }], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].streak).toEqual({ type: 
'pass', count: 3 }); + }); + + test('computes flipCount correctly', () => { + // pass, fail, pass, pass, fail, pass → 4 flips + const passed = [true, false, true, true, false, true]; + const results = passed.map((p, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'test', passed: p }], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].flipCount).toBe(4); + }); + + test('handles single run', () => { + const results = [makeRun({ + timestamp: '2026-03-15T00:00:00Z', + tests: [{ name: 'single', passed: true }], + })]; + + const trends = computeTrends(results); + expect(trends).toHaveLength(1); + expect(trends[0].passRate).toBe(1); + expect(trends[0].streak).toEqual({ type: 'pass', count: 1 }); + expect(trends[0].flipCount).toBe(0); + expect(trends[0].status).toBe('stable-pass'); + }); + + test('handles single failing run', () => { + const results = [makeRun({ + timestamp: '2026-03-15T00:00:00Z', + tests: [{ name: 'single-fail', passed: false }], + })]; + + const trends = computeTrends(results); + expect(trends[0].status).toBe('stable-fail'); + }); + + test('filters by tier', () => { + const results = [ + makeRun({ timestamp: '2026-03-15T00:00:00Z', tier: 'e2e', tests: [{ name: 'e2e-test', passed: true }] }), + makeRun({ timestamp: '2026-03-15T00:00:00Z', tier: 'llm-judge', tests: [{ name: 'judge-test', passed: true }] }), + ]; + + const e2eOnly = computeTrends(results, 'e2e'); + expect(e2eOnly).toHaveLength(1); + expect(e2eOnly[0].name).toBe('e2e-test'); + + const judgeOnly = computeTrends(results, 'llm-judge'); + expect(judgeOnly).toHaveLength(1); + expect(judgeOnly[0].name).toBe('judge-test'); + }); + + test('filters by test name', () => { + const results = Array.from({ length: 3 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [ + { name: 'test-a', passed: true }, + { name: 'test-b', passed: false }, + ], + })).reverse(); + + const filtered = computeTrends(results, undefined, 'test-a'); + expect(filtered).toHaveLength(1); + expect(filtered[0].name).toBe('test-a'); + expect(filtered[0].passRate).toBe(1); + }); + + test('sorts flaky tests first', () => { + // Create runs where test-a is flaky and test-b is stable + const results = Array.from({ length: 6 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [ + { name: 'test-a', passed: i % 2 === 0 }, // flaky: alternating + { name: 'test-b', passed: true }, // stable-pass + ], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].name).toBe('test-a'); + expect(trends[0].status).toBe('flaky'); + expect(trends[1].name).toBe('test-b'); + expect(trends[1].status).toBe('stable-pass'); + }); +}); From 33c95528702bec20cce57f7c47b33bd252575402 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 16:47:46 -0500 Subject: [PATCH 4/8] chore: update gitignore Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index cc41a3e7..37f571b6 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ bun.lock .env.local .env.* !.env.example +.gstack-sync.json From e28033353dd64cc7958f8c95cac83114559b03f0 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 16:55:34 -0500 Subject: [PATCH 5/8] chore: bump v0.3.10, update CHANGELOG and docs Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 18 ++++++++++++++++++ CLAUDE.md | 1 + CONTRIBUTING.md | 5 
++++- VERSION | 2 +- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c571e6e..b040306b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,23 @@ # Changelog +## 0.3.10 — 2026-03-15 + +### Added +- **Per-model cost tracking** — eval results now include `costs[]` with exact per-model token usage (input, output, cache read, cache creation) and API-reported cost. Extracted from `resultLine.modelUsage` in `claude -p` NDJSON stream. `computeCosts()` prefers exact `cost_usd` over MODEL_PRICING estimates (~4x more accurate with prompt caching). +- **LLM judge caching** — SHA-based caching for LLM-as-judge eval calls via `eval-cache.ts`. Cache keyed by `model:prompt`, so unchanged SKILL.md content skips API calls entirely. ~$0.18/run savings. Set `EVAL_CACHE=0` to force re-run. +- **Dynamic model selection** — `EVAL_JUDGE_TIER` env var controls which Claude model runs judge evals (haiku/sonnet/opus, default: sonnet). `EVAL_TIER` pins the E2E test model via `--model` flag to `claude -p`. +- **`bun run eval:trend`** — per-test pass rate tracking over last N runs. Classifies tests as stable-pass, stable-fail, flaky, improving, or degrading. Sparkline table with `--limit`, `--tier`, `--test` filters. Answers "is /retro getting more reliable?" instantly. +- **CostEntry extended** — `cache_read_input_tokens`, `cache_creation_input_tokens`, `cost_usd` optional fields for accurate cache-aware cost reporting. +- 22 new tests: 10 cache/tier integration (llm-judge.test.ts), 12 trend classification (lib-eval-trend.test.ts). + +### Changed +- `callJudge()` and `judge()` now return `{ result, meta }` with `JudgeMeta` (model, tokens, cached flag). `outcomeJudge()` retains simple return type for E2E callers. +- `EvalCollector.finalize()` aggregates per-test `costs[]` into result-level cost breakdown. +- `cli-eval.ts` main block guarded with `import.meta.main` to prevent execution on import. +- `eval:summary` now hints to run `eval:trend` when flaky tests are detected. +- All 8 LLM eval test sites updated from hard-coded `cost_usd: 0.02` to real API-reported costs. +- Regression test refactored from direct `Anthropic()` client to `callJudge()` (benefits from cache + tier). + ## 0.3.9 — 2026-03-15 ### Added diff --git a/CLAUDE.md b/CLAUDE.md index c6909357..681566b3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -15,6 +15,7 @@ bun run dev:skill # watch mode: auto-regen + validate on change bun run eval:list # list all eval runs from ~/.gstack-dev/evals/ bun run eval:compare # compare two eval runs (auto-picks most recent) bun run eval:summary # aggregate stats across all eval runs +bun run eval:trend # per-test pass rate trends (flaky detection) ``` `test:evals` requires `ANTHROPIC_API_KEY`. E2E tests stream progress in real-time diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 34e502ea..0116be43 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -134,6 +134,8 @@ When E2E tests run, they produce machine-readable artifacts in `~/.gstack-dev/`: bun run eval:list # list all eval runs bun run eval:compare # compare two runs (auto-picks most recent) bun run eval:summary # aggregate stats across all runs +bun run eval:trend # per-test pass rate over last N runs (flaky detection) +bun run eval:cache stats # check LLM judge cache hit rate ``` Artifacts are never cleaned up — they accumulate in `~/.gstack-dev/` for post-mortem debugging and trend analysis. @@ -152,7 +154,8 @@ Each dimension is scored 1-5. Threshold: every dimension must score **≥ 4**. 
T # Needs ANTHROPIC_API_KEY in .env — included in bun run test:evals ``` -- Uses `claude-sonnet-4-6` for scoring stability +- Model defaults to `claude-sonnet-4-6`; override with `EVAL_JUDGE_TIER=haiku|opus` +- Results are SHA-cached — unchanged SKILL.md content skips API calls ($0 on repeat runs). Set `EVAL_CACHE=0` to force re-run. - Tests live in `test/skill-llm-eval.test.ts` - Calls the Anthropic API directly (not `claude -p`), so it works from anywhere including inside Claude Code diff --git a/VERSION b/VERSION index 940ac09a..5503126d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.3.9 +0.3.10 From eb7ef2153b8b299b942c17ffc1f26e8996471d9e Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 17:04:49 -0500 Subject: [PATCH 6/8] docs: add setup comments to .gstack-sync.json.example Explain what team sync gives you, that it's optional, and how to set it up. Points to TEAM_COORDINATION_STORE.md for full guide. Co-Authored-By: Claude Opus 4.6 (1M context) --- .gstack-sync.json.example | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gstack-sync.json.example b/.gstack-sync.json.example index 4803eb42..6dc6dce7 100644 --- a/.gstack-sync.json.example +++ b/.gstack-sync.json.example @@ -1,4 +1,9 @@ { + "_comment": "OPTIONAL: Team sync configuration for shared eval/retro/QA data via Supabase.", + "_docs": "See docs/designs/TEAM_COORDINATION_STORE.md for full setup guide.", + "_what_you_get": "Shared eval dashboards, cross-team trend tracking, retro aggregation, QA report history. Without this file, everything works locally — sync is purely additive.", + "_setup": "1. Create a Supabase project. 2. Run supabase/migrations/*.sql in order. 3. Copy this file to .gstack-sync.json and fill in your values. 4. Set GSTACK_SUPABASE_ACCESS_TOKEN or run gstack sync login.", + "supabase_url": "https://YOUR_PROJECT.supabase.co", "supabase_anon_key": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.YOUR_ANON_KEY_HERE", "team_slug": "your-team-name" From 14320469b012830fcc046ba86eb32b95a4f064c0 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 17:05:45 -0500 Subject: [PATCH 7/8] docs: CHANGELOG covers full branch scope including team sync Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b040306b..b4151b1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,20 +3,25 @@ ## 0.3.10 — 2026-03-15 ### Added -- **Per-model cost tracking** — eval results now include `costs[]` with exact per-model token usage (input, output, cache read, cache creation) and API-reported cost. Extracted from `resultLine.modelUsage` in `claude -p` NDJSON stream. `computeCosts()` prefers exact `cost_usd` over MODEL_PRICING estimates (~4x more accurate with prompt caching). +- **Team sync via Supabase (optional)** — shared data store for eval results, retro snapshots, QA reports, ship logs, and Greptile triage across team members. All sync operations are non-fatal and non-blocking — skills never wait on network. Offline queue with automatic retry (up to 5 attempts). Zero impact when not configured: without `.gstack-sync.json`, everything works locally as before. See `docs/designs/TEAM_COORDINATION_STORE.md` for architecture and setup. +- **Supabase migration SQL** — 4 migration files in `supabase/migrations/` for teams, eval_runs, data tables (retros, QA, ships, Greptile), and eval costs. Row-level security policies ensure team members can only access their own team's data. 
From 14320469b012830fcc046ba86eb32b95a4f064c0 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Sun, 15 Mar 2026 17:05:45 -0500
Subject: [PATCH 7/8] docs: CHANGELOG covers full branch scope including team sync

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 CHANGELOG.md | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b040306b..b4151b1b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,20 +3,25 @@
 ## 0.3.10 — 2026-03-15
 
 ### Added
-- **Per-model cost tracking** — eval results now include `costs[]` with exact per-model token usage (input, output, cache read, cache creation) and API-reported cost. Extracted from `resultLine.modelUsage` in the `claude -p` NDJSON stream. `computeCosts()` prefers exact `cost_usd` over MODEL_PRICING estimates (~4x more accurate with prompt caching).
+- **Team sync via Supabase (optional)** — shared data store for eval results, retro snapshots, QA reports, ship logs, and Greptile triage across team members. All sync operations are non-fatal and non-blocking — skills never wait on the network. Offline queue with automatic retry (up to 5 attempts). Zero impact when not configured: without `.gstack-sync.json`, everything works locally as before. See `docs/designs/TEAM_COORDINATION_STORE.md` for architecture and setup.
+- **Supabase migration SQL** — 4 migration files in `supabase/migrations/` for teams, eval_runs, data tables (retros, QA, ships, Greptile), and eval costs. Row-level security policies ensure team members can only access their own team's data.
+- **Sync config + auth** — `.gstack-sync.json` for project-level config (Supabase URL, anon key, team slug). `~/.gstack/auth.json` for user-level tokens (keyed by Supabase URL for multi-team support). `GSTACK_SUPABASE_ACCESS_TOKEN` env var for CI/automation. Token refresh built in.
+- **`gstack sync` CLI** — `status`, `push`, `pull`, `drain`, `login`, `logout` subcommands for managing team sync.
+- **Universal eval format** — `StandardEvalResult` schema with validation, normalization, and bidirectional legacy conversion. Any language can produce JSON matching this format and push via `gstack eval push`.
+- **Unified eval CLI** — `gstack eval list|compare|summary|trend|push|cost|cache|watch` consolidating all eval tools into one entry point.
+- **Per-model cost tracking** — eval results now include `costs[]` with exact per-model token usage (input, output, cache read, cache creation) and API-reported cost. Extracted from `resultLine.modelUsage` in the `claude -p` NDJSON stream. `computeCosts()` prefers exact `cost_usd` over MODEL_PRICING estimates (~4x more accurate with prompt caching).
 - **LLM judge caching** — SHA-based caching for LLM-as-judge eval calls via `eval-cache.ts`. Cache keyed by `model:prompt`, so unchanged SKILL.md content skips API calls entirely. ~$0.18/run savings. Set `EVAL_CACHE=0` to force re-run.
 - **Dynamic model selection** — `EVAL_JUDGE_TIER` env var controls which Claude model runs judge evals (haiku/sonnet/opus, default: sonnet). `EVAL_TIER` pins the E2E test model via `--model` flag to `claude -p`.
-- **`bun run eval:trend`** — per-test pass rate tracking over last N runs. Classifies tests as stable-pass, stable-fail, flaky, improving, or degrading. Sparkline table with `--limit`, `--tier`, `--test` filters. Answers "is /retro getting more reliable?" instantly.
-- **CostEntry extended** — `cache_read_input_tokens`, `cache_creation_input_tokens`, `cost_usd` optional fields for accurate cache-aware cost reporting.
-- 22 new tests: 10 cache/tier integration (llm-judge.test.ts), 12 trend classification (lib-eval-trend.test.ts).
+- **`bun run eval:trend`** — per-test pass rate tracking over last N runs. Classifies tests as stable-pass, stable-fail, flaky, improving, or degrading. Sparkline table with `--limit`, `--tier`, `--test` filters.
+- **Shared utilities** — `lib/util.ts` extracted with `atomicWriteJSON`, `readJSON`, `getGitInfo`, `getRemoteSlug`, `listEvalFiles`, `loadEvalResults`, `formatTimestamp`, and path constants.
+- 52+ new tests across eval cache, cost, format, tier, trend, sync config, sync client, and LLM judge integration.
 
 ### Changed
 - `callJudge()` and `judge()` now return `{ result, meta }` with `JudgeMeta` (model, tokens, cached flag). `outcomeJudge()` retains simple return type for E2E callers.
-- `EvalCollector.finalize()` aggregates per-test `costs[]` into result-level cost breakdown.
+- `EvalCollector.finalize()` aggregates per-test `costs[]` into result-level cost breakdown and attempts team sync (non-blocking).
 - `cli-eval.ts` main block guarded with `import.meta.main` to prevent execution on import.
 - `eval:summary` now hints to run `eval:trend` when flaky tests are detected.
 - All 8 LLM eval test sites updated from hard-coded `cost_usd: 0.02` to real API-reported costs.
-- Regression test refactored from direct `Anthropic()` client to `callJudge()` (benefits from cache + tier).
 
 ## 0.3.9 — 2026-03-15
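The changelog entry above describes sync as non-fatal and non-blocking, with an offline queue retrying up to 5 attempts. A sketch of a drain loop under exactly those stated constraints; `QueueItem` and `drainQueue` are invented names, not the repo's actual API:

```ts
interface QueueItem {
  payload: unknown;
  attempts: number;
}

// Illustrative drain loop: failed pushes are re-queued until they have been
// tried 5 times, then dropped; errors never propagate to the caller.
async function drainQueue(
  queue: QueueItem[],
  push: (payload: unknown) => Promise<void>,
): Promise<QueueItem[]> {
  const remaining: QueueItem[] = [];
  for (const item of queue) {
    try {
      await push(item.payload);
    } catch {
      if (item.attempts + 1 < 5) {
        remaining.push({ payload: item.payload, attempts: item.attempts + 1 });
      }
      // at 5 attempts the item is dropped; sync is best-effort by design
    }
  }
  return remaining; // callers persist this as the new offline queue
}
```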
From 704fe34e98ea006008e79f89dd471ba90c0aa2b8 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Sun, 15 Mar 2026 17:06:51 -0500
Subject: [PATCH 8/8] docs: clean up sync example, add team sync section to README

Remove _comment hacks from JSON example file. Add short team sync
section to README explaining what it is, that it's optional, and how
to set it up.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .gstack-sync.json.example | 5 -----
 README.md                 | 6 ++++++
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.gstack-sync.json.example b/.gstack-sync.json.example
index 6dc6dce7..4803eb42 100644
--- a/.gstack-sync.json.example
+++ b/.gstack-sync.json.example
@@ -1,9 +1,4 @@
 {
-  "_comment": "OPTIONAL: Team sync configuration for shared eval/retro/QA data via Supabase.",
-  "_docs": "See docs/designs/TEAM_COORDINATION_STORE.md for full setup guide.",
-  "_what_you_get": "Shared eval dashboards, cross-team trend tracking, retro aggregation, QA report history. Without this file, everything works locally — sync is purely additive.",
-  "_setup": "1. Create a Supabase project. 2. Run supabase/migrations/*.sql in order. 3. Copy this file to .gstack-sync.json and fill in your values. 4. Set GSTACK_SUPABASE_ACCESS_TOKEN or run gstack sync login.",
-
   "supabase_url": "https://YOUR_PROJECT.supabase.co",
   "supabase_anon_key": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.YOUR_ANON_KEY_HERE",
   "team_slug": "your-team-name"
diff --git a/README.md b/README.md
index 27548066..9e23d11d 100644
--- a/README.md
+++ b/README.md
@@ -629,6 +629,12 @@ bun run eval:watch   # live dashboard during E2E runs
 ```
 
 E2E tests stream real-time progress, write machine-readable diagnostics, and persist partial results that survive kills. See CONTRIBUTING.md for the full eval infrastructure.
 
+### Team sync (optional)
+
+For teams, gstack can sync eval results, retro snapshots, QA reports, and ship logs to a shared Supabase store. Without this, everything works locally as before — sync is purely additive.
+
+To set up: copy `.gstack-sync.json.example` to `.gstack-sync.json`, create a Supabase project, run the migrations in `supabase/migrations/`, and fill in your credentials. See `docs/designs/TEAM_COORDINATION_STORE.md` for the full guide.
+
 ## License
 
 MIT