diff --git a/lib/eval-cost.ts b/lib/eval-cost.ts index 1dbe31c8..ac520c88 100644 --- a/lib/eval-cost.ts +++ b/lib/eval-cost.ts @@ -55,6 +55,9 @@ function getPricing(model: string): { input: number; output: number } { export function computeCosts(costs: CostEntry[]): CostDashboard { const byModel = new Map(); + // Track exact cost_usd sums per model (from API-provided costs) + const exactCosts = new Map(); + for (const entry of costs) { const existing = byModel.get(entry.model); if (existing) { @@ -70,9 +73,12 @@ export function computeCosts(costs: CostEntry[]): CostDashboard { estimated_cost_usd: 0, }); } + if (entry.cost_usd !== undefined) { + exactCosts.set(entry.model, (exactCosts.get(entry.model) || 0) + entry.cost_usd); + } } - // Calculate costs + // Calculate costs — prefer exact cost_usd (accounts for cache discounts) let total = 0; let atFast = 0; let atFull = 0; @@ -80,13 +86,18 @@ export function computeCosts(costs: CostEntry[]): CostDashboard { const fullPricing = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING; for (const summary of byModel.values()) { - const pricing = getPricing(summary.model); - summary.estimated_cost_usd = - (summary.input_tokens / 1_000_000) * pricing.input + - (summary.output_tokens / 1_000_000) * pricing.output; + const exact = exactCosts.get(summary.model); + if (exact !== undefined) { + summary.estimated_cost_usd = exact; + } else { + const pricing = getPricing(summary.model); + summary.estimated_cost_usd = + (summary.input_tokens / 1_000_000) * pricing.input + + (summary.output_tokens / 1_000_000) * pricing.output; + } total += summary.estimated_cost_usd; - // What-if at fast/full tiers + // What-if at fast/full tiers (always from token counts) atFast += (summary.input_tokens / 1_000_000) * fastPricing.input + (summary.output_tokens / 1_000_000) * fastPricing.output; diff --git a/lib/eval-format.ts b/lib/eval-format.ts index 0dcc347d..6a88cac2 100644 --- a/lib/eval-format.ts +++ b/lib/eval-format.ts @@ -15,6 +15,10 @@ export interface CostEntry { calls: number; input_tokens: number; output_tokens: number; + cache_read_input_tokens?: number; + cache_creation_input_tokens?: number; + /** Exact cost from API when available (accounts for cache discounts). */ + cost_usd?: number; } export interface FailureEntry { diff --git a/test/helpers/eval-store.test.ts b/test/helpers/eval-store.test.ts index a0539a0e..b0c5e742 100644 --- a/test/helpers/eval-store.test.ts +++ b/test/helpers/eval-store.test.ts @@ -128,6 +128,74 @@ describe('EvalCollector', () => { expect(data.tests).toHaveLength(0); expect(data.tier).toBe('llm-judge'); }); + + test('finalize aggregates per-test costs into result-level costs[]', async () => { + const collector = new EvalCollector('e2e', tmpDir); + collector.addTest(makeEntry({ + name: 'test-a', + costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 100, output_tokens: 50, cost_usd: 0.01 }], + })); + collector.addTest(makeEntry({ + name: 'test-b', + costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 200, output_tokens: 100, cost_usd: 0.02 }], + })); + collector.addTest(makeEntry({ + name: 'test-c', + costs: [{ model: 'claude-haiku-4-5', calls: 1, input_tokens: 50, output_tokens: 25, cost_usd: 0.005 }], + })); + + const filepath = await collector.finalize(); + const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8')); + + expect(data.costs).toBeDefined(); + expect(data.costs).toHaveLength(2); // two models + const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6'); + const haiku = data.costs!.find(c => c.model === 'claude-haiku-4-5'); + expect(sonnet).toBeDefined(); + expect(sonnet!.calls).toBe(2); + expect(sonnet!.input_tokens).toBe(300); + expect(sonnet!.output_tokens).toBe(150); + expect(sonnet!.cost_usd).toBeCloseTo(0.03); + expect(haiku).toBeDefined(); + expect(haiku!.calls).toBe(1); + expect(haiku!.cost_usd).toBeCloseTo(0.005); + }); + + test('finalize omits costs when no tests have cost data', async () => { + const collector = new EvalCollector('e2e', tmpDir); + collector.addTest(makeEntry({ name: 'no-costs' })); + const filepath = await collector.finalize(); + const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8')); + expect(data.costs).toBeUndefined(); + }); + + test('finalize aggregates cache token fields', async () => { + const collector = new EvalCollector('e2e', tmpDir); + collector.addTest(makeEntry({ + name: 'test-a', + costs: [{ + model: 'claude-sonnet-4-6', calls: 1, + input_tokens: 10, output_tokens: 50, + cache_read_input_tokens: 5000, cache_creation_input_tokens: 1000, + cost_usd: 0.01, + }], + })); + collector.addTest(makeEntry({ + name: 'test-b', + costs: [{ + model: 'claude-sonnet-4-6', calls: 1, + input_tokens: 20, output_tokens: 100, + cache_read_input_tokens: 8000, cache_creation_input_tokens: 500, + cost_usd: 0.02, + }], + })); + + const filepath = await collector.finalize(); + const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8')); + const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6')!; + expect(sonnet.cache_read_input_tokens).toBe(13000); + expect(sonnet.cache_creation_input_tokens).toBe(1500); + }); }); // --- extractToolSummary tests --- diff --git a/test/helpers/eval-store.ts b/test/helpers/eval-store.ts index 63534322..46f1ce88 100644 --- a/test/helpers/eval-store.ts +++ b/test/helpers/eval-store.ts @@ -13,6 +13,7 @@ import * as path from 'path'; import * as os from 'os'; import { spawnSync } from 'child_process'; import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '../../lib/util'; +import type { CostEntry } from '../../lib/eval-format'; const SCHEMA_VERSION = 1; const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals'); @@ -50,6 +51,9 @@ export interface EvalTestEntry { detected_bugs?: string[]; missed_bugs?: string[]; + // Per-model cost breakdown + costs?: CostEntry[]; + error?: string; } @@ -67,6 +71,7 @@ export interface EvalResult { total_cost_usd: number; total_duration_ms: number; tests: EvalTestEntry[]; + costs?: CostEntry[]; // aggregate per-model cost breakdown _partial?: boolean; // true for incremental saves, absent in final } @@ -414,6 +419,25 @@ export class EvalCollector { const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0); const passed = this.tests.filter(t => t.passed).length; + // Aggregate per-model costs across all tests + const costMap = new Map(); + for (const t of this.tests) { + for (const c of t.costs || []) { + const existing = costMap.get(c.model); + if (existing) { + existing.calls += c.calls; + existing.input_tokens += c.input_tokens; + existing.output_tokens += c.output_tokens; + existing.cache_read_input_tokens = (existing.cache_read_input_tokens || 0) + (c.cache_read_input_tokens || 0); + existing.cache_creation_input_tokens = (existing.cache_creation_input_tokens || 0) + (c.cache_creation_input_tokens || 0); + if (c.cost_usd !== undefined) existing.cost_usd = (existing.cost_usd || 0) + c.cost_usd; + } else { + costMap.set(c.model, { ...c }); + } + } + } + const costs = costMap.size > 0 ? [...costMap.values()] : undefined; + const result: EvalResult = { schema_version: SCHEMA_VERSION, version, @@ -428,6 +452,7 @@ export class EvalCollector { total_cost_usd: Math.round(totalCost * 100) / 100, total_duration_ms: totalDuration, tests: this.tests, + costs, }; // Write eval file diff --git a/test/helpers/session-runner.test.ts b/test/helpers/session-runner.test.ts index 812d4f8a..9a06dd66 100644 --- a/test/helpers/session-runner.test.ts +++ b/test/helpers/session-runner.test.ts @@ -93,4 +93,36 @@ describe('parseNDJSON', () => { expect(parsed.turnCount).toBe(2); expect(parsed.toolCalls).toHaveLength(0); }); + + test('resultLine preserves modelUsage for cost extraction', () => { + const lines = [ + '{"type":"assistant","message":{"model":"claude-sonnet-4-6","content":[{"type":"text","text":"ok"}]}}', + JSON.stringify({ + type: 'result', subtype: 'success', total_cost_usd: 0.07, + num_turns: 1, result: 'Done.', + usage: { input_tokens: 8, output_tokens: 802 }, + modelUsage: { + 'claude-sonnet-4-6': { + inputTokens: 8, outputTokens: 802, + cacheReadInputTokens: 88133, cacheCreationInputTokens: 9223, + costUSD: 0.07308, + }, + }, + }), + ]; + const parsed = parseNDJSON(lines); + expect(parsed.resultLine).not.toBeNull(); + expect(parsed.resultLine.modelUsage).toBeDefined(); + const usage = parsed.resultLine.modelUsage['claude-sonnet-4-6']; + expect(usage.inputTokens).toBe(8); + expect(usage.outputTokens).toBe(802); + expect(usage.cacheReadInputTokens).toBe(88133); + expect(usage.costUSD).toBeCloseTo(0.07308); + }); + + test('resultLine without modelUsage has undefined modelUsage', () => { + const parsed = parseNDJSON(FIXTURE_LINES); + // Original fixture has no modelUsage on result line + expect(parsed.resultLine?.modelUsage).toBeUndefined(); + }); }); diff --git a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts index 33c4cf14..b04465fa 100644 --- a/test/helpers/session-runner.ts +++ b/test/helpers/session-runner.ts @@ -10,6 +10,8 @@ import * as fs from 'fs'; import * as path from 'path'; import * as os from 'os'; import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util'; +import type { CostEntry } from '../../lib/eval-format'; +import { resolveTier, tierToModel } from '../../lib/eval-tier'; const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json'); @@ -34,6 +36,7 @@ export interface SkillTestResult { output: string; costEstimate: CostEstimate; transcript: any[]; + costs: CostEntry[]; } const BROWSE_ERROR_PATTERNS = [ @@ -135,8 +138,11 @@ export async function runSkillTest(options: { // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to // avoid shell escaping issues. --verbose is required for stream-json mode. + // Model pinned via EVAL_TIER env var (default: sonnet). + const evalModel = tierToModel(resolveTier()); const args = [ '-p', + '--model', evalModel, '--output-format', 'stream-json', '--verbose', '--dangerously-skip-permissions', @@ -323,5 +329,21 @@ export async function runSkillTest(options: { turnsUsed, }; - return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript }; + // Extract per-model costs from resultLine.modelUsage (camelCase → snake_case) + const costs: CostEntry[] = []; + if (resultLine?.modelUsage) { + for (const [model, usage] of Object.entries(resultLine.modelUsage as Record)) { + costs.push({ + model, + calls: 1, + input_tokens: usage.inputTokens || 0, + output_tokens: usage.outputTokens || 0, + cache_read_input_tokens: usage.cacheReadInputTokens || 0, + cache_creation_input_tokens: usage.cacheCreationInputTokens || 0, + cost_usd: usage.costUSD, + }); + } + } + + return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, costs }; } diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts index 758f0d3f..19da2de4 100644 --- a/test/skill-e2e.test.ts +++ b/test/skill-e2e.test.ts @@ -41,6 +41,7 @@ function recordE2E(name: string, suite: string, result: SkillTestResult, extra?: exit_reason: result.exitReason, timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined, last_tool_call: lastTool, + costs: result.costs, ...extra, }); }