feat: wire costs[] from modelUsage into eval results

Extract per-model token usage from resultLine.modelUsage (including cache tokens and the exact API cost), pass the resulting CostEntry[] through EvalCollector, and aggregate it per model in finalize(). Extend CostEntry with the optional fields cache_read_input_tokens, cache_creation_input_tokens, and cost_usd. computeCosts() now prefers the exact cost_usd over the MODEL_PRICING estimate when it is available (~4x more accurate with prompt caching).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-05 21:25:27 +02:00 · 2026-03-15 16:47:27 -05:00
parent 4ad73f7362
commit 02925cfc7a
7 changed files with 170 additions and 7 deletions
@@ -55,6 +55,9 @@ function getPricing(model: string): { input: number; output: number } {
 export function computeCosts(costs: CostEntry[]): CostDashboard {
  const byModel = new Map<string, CostSummary>();

+  // Track exact cost_usd sums per model (from API-provided costs)
+  const exactCosts = new Map<string, number>();
+
  for (const entry of costs) {
    const existing = byModel.get(entry.model);
    if (existing) {
@@ -70,9 +73,12 @@ export function computeCosts(costs: CostEntry[]): CostDashboard {
        estimated_cost_usd: 0,
      });
    }
+    if (entry.cost_usd !== undefined) {
+      exactCosts.set(entry.model, (exactCosts.get(entry.model) || 0) + entry.cost_usd);
+    }
  }

-  // Calculate costs
+  // Calculate costs — prefer exact cost_usd (accounts for cache discounts)
  let total = 0;
  let atFast = 0;
  let atFull = 0;
@@ -80,13 +86,18 @@ export function computeCosts(costs: CostEntry[]): CostDashboard {
  const fullPricing = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING;

  for (const summary of byModel.values()) {
-    const pricing = getPricing(summary.model);
-    summary.estimated_cost_usd =
-      (summary.input_tokens / 1_000_000) * pricing.input +
-      (summary.output_tokens / 1_000_000) * pricing.output;
+    const exact = exactCosts.get(summary.model);
+    if (exact !== undefined) {
+      summary.estimated_cost_usd = exact;
+    } else {
+      const pricing = getPricing(summary.model);
+      summary.estimated_cost_usd =
+        (summary.input_tokens / 1_000_000) * pricing.input +
+        (summary.output_tokens / 1_000_000) * pricing.output;
+    }
    total += summary.estimated_cost_usd;

-    // What-if at fast/full tiers
+    // What-if at fast/full tiers (always from token counts)
    atFast +=
      (summary.input_tokens / 1_000_000) * fastPricing.input +
      (summary.output_tokens / 1_000_000) * fastPricing.output;
@@ -15,6 +15,10 @@ export interface CostEntry {
  calls: number;
  input_tokens: number;
  output_tokens: number;
+  cache_read_input_tokens?: number;
+  cache_creation_input_tokens?: number;
+  /** Exact cost from API when available (accounts for cache discounts). */
+  cost_usd?: number;
 }

 export interface FailureEntry {
@@ -128,6 +128,74 @@ describe('EvalCollector', () => {
    expect(data.tests).toHaveLength(0);
    expect(data.tier).toBe('llm-judge');
  });
+
+  test('finalize aggregates per-test costs into result-level costs[]', async () => {
+    const collector = new EvalCollector('e2e', tmpDir);
+    collector.addTest(makeEntry({
+      name: 'test-a',
+      costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 100, output_tokens: 50, cost_usd: 0.01 }],
+    }));
+    collector.addTest(makeEntry({
+      name: 'test-b',
+      costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 200, output_tokens: 100, cost_usd: 0.02 }],
+    }));
+    collector.addTest(makeEntry({
+      name: 'test-c',
+      costs: [{ model: 'claude-haiku-4-5', calls: 1, input_tokens: 50, output_tokens: 25, cost_usd: 0.005 }],
+    }));
+
+    const filepath = await collector.finalize();
+    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
+
+    expect(data.costs).toBeDefined();
+    expect(data.costs).toHaveLength(2); // two models
+    const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6');
+    const haiku = data.costs!.find(c => c.model === 'claude-haiku-4-5');
+    expect(sonnet).toBeDefined();
+    expect(sonnet!.calls).toBe(2);
+    expect(sonnet!.input_tokens).toBe(300);
+    expect(sonnet!.output_tokens).toBe(150);
+    expect(sonnet!.cost_usd).toBeCloseTo(0.03);
+    expect(haiku).toBeDefined();
+    expect(haiku!.calls).toBe(1);
+    expect(haiku!.cost_usd).toBeCloseTo(0.005);
+  });
+
+  test('finalize omits costs when no tests have cost data', async () => {
+    const collector = new EvalCollector('e2e', tmpDir);
+    collector.addTest(makeEntry({ name: 'no-costs' }));
+    const filepath = await collector.finalize();
+    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
+    expect(data.costs).toBeUndefined();
+  });
+
+  test('finalize aggregates cache token fields', async () => {
+    const collector = new EvalCollector('e2e', tmpDir);
+    collector.addTest(makeEntry({
+      name: 'test-a',
+      costs: [{
+        model: 'claude-sonnet-4-6', calls: 1,
+        input_tokens: 10, output_tokens: 50,
+        cache_read_input_tokens: 5000, cache_creation_input_tokens: 1000,
+        cost_usd: 0.01,
+      }],
+    }));
+    collector.addTest(makeEntry({
+      name: 'test-b',
+      costs: [{
+        model: 'claude-sonnet-4-6', calls: 1,
+        input_tokens: 20, output_tokens: 100,
+        cache_read_input_tokens: 8000, cache_creation_input_tokens: 500,
+        cost_usd: 0.02,
+      }],
+    }));
+
+    const filepath = await collector.finalize();
+    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
+    const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6')!;
+    expect(sonnet.cache_read_input_tokens).toBe(13000);
+    expect(sonnet.cache_creation_input_tokens).toBe(1500);
+  });
 });

 // --- extractToolSummary tests ---
@@ -13,6 +13,7 @@ import * as path from 'path';
 import * as os from 'os';
 import { spawnSync } from 'child_process';
 import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '../../lib/util';
+import type { CostEntry } from '../../lib/eval-format';

 const SCHEMA_VERSION = 1;
 const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
@@ -50,6 +51,9 @@ export interface EvalTestEntry {
  detected_bugs?: string[];
  missed_bugs?: string[];

+  // Per-model cost breakdown
+  costs?: CostEntry[];
+
  error?: string;
 }

@@ -67,6 +71,7 @@ export interface EvalResult {
  total_cost_usd: number;
  total_duration_ms: number;
  tests: EvalTestEntry[];
+  costs?: CostEntry[];  // aggregate per-model cost breakdown
  _partial?: boolean;  // true for incremental saves, absent in final
 }

@@ -414,6 +419,25 @@ export class EvalCollector {
    const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
    const passed = this.tests.filter(t => t.passed).length;

+    // Aggregate per-model costs across all tests
+    const costMap = new Map<string, CostEntry>();
+    for (const t of this.tests) {
+      for (const c of t.costs || []) {
+        const existing = costMap.get(c.model);
+        if (existing) {
+          existing.calls += c.calls;
+          existing.input_tokens += c.input_tokens;
+          existing.output_tokens += c.output_tokens;
+          existing.cache_read_input_tokens = (existing.cache_read_input_tokens || 0) + (c.cache_read_input_tokens || 0);
+          existing.cache_creation_input_tokens = (existing.cache_creation_input_tokens || 0) + (c.cache_creation_input_tokens || 0);
+          if (c.cost_usd !== undefined) existing.cost_usd = (existing.cost_usd || 0) + c.cost_usd;
+        } else {
+          costMap.set(c.model, { ...c });
+        }
+      }
+    }
+    const costs = costMap.size > 0 ? [...costMap.values()] : undefined;
+
    const result: EvalResult = {
      schema_version: SCHEMA_VERSION,
      version,
@@ -428,6 +452,7 @@ export class EvalCollector {
      total_cost_usd: Math.round(totalCost * 100) / 100,
      total_duration_ms: totalDuration,
      tests: this.tests,
+      costs,
    };

    // Write eval file
@@ -93,4 +93,36 @@ describe('parseNDJSON', () => {
    expect(parsed.turnCount).toBe(2);
    expect(parsed.toolCalls).toHaveLength(0);
  });
+
+  test('resultLine preserves modelUsage for cost extraction', () => {
+    const lines = [
+      '{"type":"assistant","message":{"model":"claude-sonnet-4-6","content":[{"type":"text","text":"ok"}]}}',
+      JSON.stringify({
+        type: 'result', subtype: 'success', total_cost_usd: 0.07,
+        num_turns: 1, result: 'Done.',
+        usage: { input_tokens: 8, output_tokens: 802 },
+        modelUsage: {
+          'claude-sonnet-4-6': {
+            inputTokens: 8, outputTokens: 802,
+            cacheReadInputTokens: 88133, cacheCreationInputTokens: 9223,
+            costUSD: 0.07308,
+          },
+        },
+      }),
+    ];
+    const parsed = parseNDJSON(lines);
+    expect(parsed.resultLine).not.toBeNull();
+    expect(parsed.resultLine.modelUsage).toBeDefined();
+    const usage = parsed.resultLine.modelUsage['claude-sonnet-4-6'];
+    expect(usage.inputTokens).toBe(8);
+    expect(usage.outputTokens).toBe(802);
+    expect(usage.cacheReadInputTokens).toBe(88133);
+    expect(usage.costUSD).toBeCloseTo(0.07308);
+  });
+
+  test('resultLine without modelUsage has undefined modelUsage', () => {
+    const parsed = parseNDJSON(FIXTURE_LINES);
+    // Original fixture has no modelUsage on result line
+    expect(parsed.resultLine?.modelUsage).toBeUndefined();
+  });
 });
@@ -10,6 +10,8 @@ import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
 import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util';
+import type { CostEntry } from '../../lib/eval-format';
+import { resolveTier, tierToModel } from '../../lib/eval-tier';

 const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');

@@ -34,6 +36,7 @@ export interface SkillTestResult {
  output: string;
  costEstimate: CostEstimate;
  transcript: any[];
+  costs: CostEntry[];
 }

 const BROWSE_ERROR_PATTERNS = [
@@ -135,8 +138,11 @@ export async function runSkillTest(options: {

  // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
  // avoid shell escaping issues. --verbose is required for stream-json mode.
+  // Model pinned via EVAL_TIER env var (default: sonnet).
+  const evalModel = tierToModel(resolveTier());
  const args = [
    '-p',
+    '--model', evalModel,
    '--output-format', 'stream-json',
    '--verbose',
    '--dangerously-skip-permissions',
@@ -323,5 +329,21 @@ export async function runSkillTest(options: {
    turnsUsed,
  };

-  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript };
+  // Extract per-model costs from resultLine.modelUsage (camelCase → snake_case)
+  const costs: CostEntry[] = [];
+  if (resultLine?.modelUsage) {
+    for (const [model, usage] of Object.entries(resultLine.modelUsage as Record<string, any>)) {
+      costs.push({
+        model,
+        calls: 1,
+        input_tokens: usage.inputTokens || 0,
+        output_tokens: usage.outputTokens || 0,
+        cache_read_input_tokens: usage.cacheReadInputTokens || 0,
+        cache_creation_input_tokens: usage.cacheCreationInputTokens || 0,
+        cost_usd: usage.costUSD,
+      });
+    }
+  }
+
+  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, costs };
 }
@@ -41,6 +41,7 @@ function recordE2E(name: string, suite: string, result: SkillTestResult, extra?:
    exit_reason: result.exitReason,
    timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
    last_tool_call: lastTool,
+    costs: result.costs,
    ...extra,
  });
 }