feat: wire costs[] from modelUsage into eval results

Extract per-model token usage from resultLine.modelUsage (including cache tokens and the exact API cost), flow the resulting CostEntry[] through EvalCollector, and aggregate it in finalize(). Extend CostEntry with the optional fields cache_read_input_tokens, cache_creation_input_tokens, and cost_usd. computeCosts() now prefers the exact cost_usd over the MODEL_PRICING estimate when it is available (~4x more accurate with prompt caching).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-07 14:06:42 +02:00 · 2026-03-15 16:47:27 -05:00
parent 4ad73f7362
commit 02925cfc7a
7 changed files with 170 additions and 7 deletions
@@ -55,6 +55,9 @@ function getPricing(model: string): { input: number; output: number } {
 export function computeCosts(costs: CostEntry[]): CostDashboard {
  const byModel = new Map<string, CostSummary>();

+  // Track exact cost_usd sums per model (from API-provided costs)
+  const exactCosts = new Map<string, number>();
+
  for (const entry of costs) {
    const existing = byModel.get(entry.model);
    if (existing) {
@@ -70,9 +73,12 @@ export function computeCosts(costs: CostEntry[]): CostDashboard {
        estimated_cost_usd: 0,
      });
    }
+    if (entry.cost_usd !== undefined) {
+      exactCosts.set(entry.model, (exactCosts.get(entry.model) || 0) + entry.cost_usd);
+    }
  }

-  // Calculate costs
+  // Calculate costs — prefer exact cost_usd (accounts for cache discounts)
  let total = 0;
  let atFast = 0;
  let atFull = 0;
@@ -80,13 +86,18 @@ export function computeCosts(costs: CostEntry[]): CostDashboard {
  const fullPricing = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING;

  for (const summary of byModel.values()) {
-    const pricing = getPricing(summary.model);
-    summary.estimated_cost_usd =
-      (summary.input_tokens / 1_000_000) * pricing.input +
-      (summary.output_tokens / 1_000_000) * pricing.output;
+    const exact = exactCosts.get(summary.model);
+    if (exact !== undefined) {
+      summary.estimated_cost_usd = exact;
+    } else {
+      const pricing = getPricing(summary.model);
+      summary.estimated_cost_usd =
+        (summary.input_tokens / 1_000_000) * pricing.input +
+        (summary.output_tokens / 1_000_000) * pricing.output;
+    }
    total += summary.estimated_cost_usd;

-    // What-if at fast/full tiers
+    // What-if at fast/full tiers (always from token counts)
    atFast +=
      (summary.input_tokens / 1_000_000) * fastPricing.input +
      (summary.output_tokens / 1_000_000) * fastPricing.output;
@@ -15,6 +15,10 @@ export interface CostEntry {
  calls: number;
  input_tokens: number;
  output_tokens: number;
+  cache_read_input_tokens?: number;
+  cache_creation_input_tokens?: number;
+  /** Exact cost from API when available (accounts for cache discounts). */
+  cost_usd?: number;
 }

 export interface FailureEntry {