diff --git a/lib/eval-cost.ts b/lib/eval-cost.ts
index 1dbe31c8..ac520c88 100644
--- a/lib/eval-cost.ts
+++ b/lib/eval-cost.ts
@@ -55,6 +55,9 @@ function getPricing(model: string): { input: number; output: number } {
 export function computeCosts(costs: CostEntry[]): CostDashboard {
   const byModel = new Map<string, CostSummary>();
 
+  // Best-known cost per model: API-reported cost_usd where present, token estimate otherwise
+  const costByModel = new Map<string, number>();
+
   for (const entry of costs) {
     const existing = byModel.get(entry.model);
     if (existing) {
@@ -70,9 +73,17 @@ export function computeCosts(costs: CostEntry[]): CostDashboard {
         estimated_cost_usd: 0,
       });
     }
+    // An entry without cost_usd is priced from its tokens so that a model with
+    // mixed entries is not under-counted by summing only the exact ones.
+    const entryPricing = getPricing(entry.model);
+    const entryCost =
+      entry.cost_usd ??
+      (entry.input_tokens / 1_000_000) * entryPricing.input +
+        (entry.output_tokens / 1_000_000) * entryPricing.output;
+    costByModel.set(entry.model, (costByModel.get(entry.model) ?? 0) + entryCost);
   }
 
-  // Calculate costs
+  // Calculate costs — prefer exact cost_usd (accounts for cache discounts)
   let total = 0;
   let atFast = 0;
   let atFull = 0;
@@ -80,13 +91,11 @@ export function computeCosts(costs: CostEntry[]): CostDashboard {
   const fullPricing = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING;
 
   for (const summary of byModel.values()) {
-    const pricing = getPricing(summary.model);
-    summary.estimated_cost_usd =
-      (summary.input_tokens / 1_000_000) * pricing.input +
-      (summary.output_tokens / 1_000_000) * pricing.output;
+    // The loop above records a cost for every entry, so each summarized model has one.
+    summary.estimated_cost_usd = costByModel.get(summary.model) ?? 0;
     total += summary.estimated_cost_usd;
 
-    // What-if at fast/full tiers
+    // What-if at fast/full tiers (always from token counts)
     atFast +=
       (summary.input_tokens / 1_000_000) * fastPricing.input +
       (summary.output_tokens / 1_000_000) * fastPricing.output;
diff --git a/lib/eval-format.ts b/lib/eval-format.ts
index 0dcc347d..6a88cac2 100644
--- a/lib/eval-format.ts
+++ b/lib/eval-format.ts
@@ -15,6 +15,10 @@ export interface CostEntry {
   calls: number;
   input_tokens: number;
   output_tokens: number;
+  cache_read_input_tokens?: number;
+  cache_creation_input_tokens?: number;
+  /** Exact cost from API when available (accounts for cache discounts). */
+  cost_usd?: number;
 }
 
 export interface FailureEntry {
diff --git a/test/helpers/eval-store.test.ts b/test/helpers/eval-store.test.ts
index a0539a0e..b0c5e742 100644
--- a/test/helpers/eval-store.test.ts
+++ b/test/helpers/eval-store.test.ts
@@ -128,6 +128,74 @@ describe('EvalCollector', () => {
     expect(data.tests).toHaveLength(0);
     expect(data.tier).toBe('llm-judge');
   });
+
+  test('finalize aggregates per-test costs into result-level costs[]', async () => {
+    const collector = new EvalCollector('e2e', tmpDir);
+    collector.addTest(makeEntry({
+      name: 'test-a',
+      costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 100, output_tokens: 50, cost_usd: 0.01 }],
+    }));
+    collector.addTest(makeEntry({
+      name: 'test-b',
+      costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 200, output_tokens: 100, cost_usd: 0.02 }],
+    }));
+    collector.addTest(makeEntry({
+      name: 'test-c',
+      costs: [{ model: 'claude-haiku-4-5', calls: 1, input_tokens: 50, output_tokens: 25, cost_usd: 0.005 }],
+    }));
+
+    const filepath = await collector.finalize();
+    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
+
+    expect(data.costs).toBeDefined();
+    expect(data.costs).toHaveLength(2); // two models
+    const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6');
+    const haiku = data.costs!.find(c => c.model === 'claude-haiku-4-5');
+    expect(sonnet).toBeDefined();
+    expect(sonnet!.calls).toBe(2);
+    expect(sonnet!.input_tokens).toBe(300);
+    expect(sonnet!.output_tokens).toBe(150);
+    expect(sonnet!.cost_usd).toBeCloseTo(0.03);
+    expect(haiku).toBeDefined();
+    expect(haiku!.calls).toBe(1);
+    expect(haiku!.cost_usd).toBeCloseTo(0.005);
+  });
+
+  test('finalize omits costs when no tests have cost data', async () => {
+    const collector = new EvalCollector('e2e', tmpDir);
+    collector.addTest(makeEntry({ name: 'no-costs' }));
+    const filepath = await collector.finalize();
+    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
+    expect(data.costs).toBeUndefined();
+  });
+
+  test('finalize aggregates cache token fields', async () => {
+    const collector = new EvalCollector('e2e', tmpDir);
+    collector.addTest(makeEntry({
+      name: 'test-a',
+      costs: [{
+        model: 'claude-sonnet-4-6', calls: 1,
+        input_tokens: 10, output_tokens: 50,
+        cache_read_input_tokens: 5000, cache_creation_input_tokens: 1000,
+        cost_usd: 0.01,
+      }],
+    }));
+    collector.addTest(makeEntry({
+      name: 'test-b',
+      costs: [{
+        model: 'claude-sonnet-4-6', calls: 1,
+        input_tokens: 20, output_tokens: 100,
+        cache_read_input_tokens: 8000, cache_creation_input_tokens: 500,
+        cost_usd: 0.02,
+      }],
+    }));
+
+    const filepath = await collector.finalize();
+    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
+    const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6')!;
+    expect(sonnet.cache_read_input_tokens).toBe(13000);
+    expect(sonnet.cache_creation_input_tokens).toBe(1500);
+  });
 });
 
 // --- extractToolSummary tests ---
diff --git a/test/helpers/eval-store.ts b/test/helpers/eval-store.ts
index 63534322..46f1ce88 100644
--- a/test/helpers/eval-store.ts
+++ b/test/helpers/eval-store.ts
@@ -13,6 +13,7 @@ import * as path from 'path';
 import * as os from 'os';
 import { spawnSync } from 'child_process';
 import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '../../lib/util';
+import type { CostEntry } from '../../lib/eval-format';
 
 const SCHEMA_VERSION = 1;
 const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
@@ -50,6 +51,9 @@ export interface EvalTestEntry {
   detected_bugs?: string[];
   missed_bugs?: string[];
 
+  // Per-model cost breakdown
+  costs?: CostEntry[];
+
   error?: string;
 }
 
@@ -67,6 +71,7 @@ export interface EvalResult {
   total_cost_usd: number;
   total_duration_ms: number;
   tests: EvalTestEntry[];
+  costs?: CostEntry[];  // aggregate per-model cost breakdown
   _partial?: boolean;  // true for incremental saves, absent in final
 }
 
@@ -414,6 +419,25 @@ export class EvalCollector {
     const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
     const passed = this.tests.filter(t => t.passed).length;
 
+    // Aggregate per-model costs across all tests
+    const costMap = new Map<string, CostEntry>();
+    for (const t of this.tests) {
+      for (const c of t.costs || []) {
+        const existing = costMap.get(c.model);
+        if (existing) {
+          existing.calls += c.calls;
+          existing.input_tokens += c.input_tokens;
+          existing.output_tokens += c.output_tokens;
+          existing.cache_read_input_tokens = (existing.cache_read_input_tokens || 0) + (c.cache_read_input_tokens || 0);
+          existing.cache_creation_input_tokens = (existing.cache_creation_input_tokens || 0) + (c.cache_creation_input_tokens || 0);
+          if (c.cost_usd !== undefined) existing.cost_usd = (existing.cost_usd || 0) + c.cost_usd;
+        } else {
+          costMap.set(c.model, { ...c });
+        }
+      }
+    }
+    const costs = costMap.size > 0 ? [...costMap.values()] : undefined;
+
     const result: EvalResult = {
       schema_version: SCHEMA_VERSION,
       version,
@@ -428,6 +452,7 @@ export class EvalCollector {
       total_cost_usd: Math.round(totalCost * 100) / 100,
       total_duration_ms: totalDuration,
       tests: this.tests,
+      costs,
     };
 
     // Write eval file
diff --git a/test/helpers/session-runner.test.ts b/test/helpers/session-runner.test.ts
index 812d4f8a..9a06dd66 100644
--- a/test/helpers/session-runner.test.ts
+++ b/test/helpers/session-runner.test.ts
@@ -93,4 +93,36 @@ describe('parseNDJSON', () => {
     expect(parsed.turnCount).toBe(2);
     expect(parsed.toolCalls).toHaveLength(0);
   });
+
+  test('resultLine preserves modelUsage for cost extraction', () => {
+    const lines = [
+      '{"type":"assistant","message":{"model":"claude-sonnet-4-6","content":[{"type":"text","text":"ok"}]}}',
+      JSON.stringify({
+        type: 'result', subtype: 'success', total_cost_usd: 0.07,
+        num_turns: 1, result: 'Done.',
+        usage: { input_tokens: 8, output_tokens: 802 },
+        modelUsage: {
+          'claude-sonnet-4-6': {
+            inputTokens: 8, outputTokens: 802,
+            cacheReadInputTokens: 88133, cacheCreationInputTokens: 9223,
+            costUSD: 0.07308,
+          },
+        },
+      }),
+    ];
+    const parsed = parseNDJSON(lines);
+    expect(parsed.resultLine).not.toBeNull();
+    expect(parsed.resultLine.modelUsage).toBeDefined();
+    const usage = parsed.resultLine.modelUsage['claude-sonnet-4-6'];
+    expect(usage.inputTokens).toBe(8);
+    expect(usage.outputTokens).toBe(802);
+    expect(usage.cacheReadInputTokens).toBe(88133);
+    expect(usage.costUSD).toBeCloseTo(0.07308);
+  });
+
+  test('resultLine without modelUsage has undefined modelUsage', () => {
+    const parsed = parseNDJSON(FIXTURE_LINES);
+    // Original fixture has no modelUsage on result line
+    expect(parsed.resultLine?.modelUsage).toBeUndefined();
+  });
 });
diff --git a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts
index 33c4cf14..b04465fa 100644
--- a/test/helpers/session-runner.ts
+++ b/test/helpers/session-runner.ts
@@ -10,6 +10,8 @@ import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
 import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util';
+import type { CostEntry } from '../../lib/eval-format';
+import { resolveTier, tierToModel } from '../../lib/eval-tier';
 
 const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
 
@@ -34,6 +36,7 @@ export interface SkillTestResult {
   output: string;
   costEstimate: CostEstimate;
   transcript: any[];
+  costs: CostEntry[];
 }
 
 const BROWSE_ERROR_PATTERNS = [
@@ -135,8 +138,11 @@ export async function runSkillTest(options: {
 
   // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
   // avoid shell escaping issues. --verbose is required for stream-json mode.
+  // Model pinned via EVAL_TIER env var (default: sonnet).
+  const evalModel = tierToModel(resolveTier());
   const args = [
     '-p',
+    '--model', evalModel,
     '--output-format', 'stream-json',
     '--verbose',
     '--dangerously-skip-permissions',
@@ -323,5 +329,21 @@ export async function runSkillTest(options: {
     turnsUsed,
   };
 
-  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript };
+  // Extract per-model costs from resultLine.modelUsage (camelCase → snake_case)
+  const costs: CostEntry[] = [];
+  if (resultLine?.modelUsage) {
+    for (const [model, usage] of Object.entries(resultLine.modelUsage as Record<string, any>)) {
+      costs.push({
+        model,
+        calls: 1,
+        input_tokens: usage.inputTokens || 0,
+        output_tokens: usage.outputTokens || 0,
+        cache_read_input_tokens: usage.cacheReadInputTokens || 0,
+        cache_creation_input_tokens: usage.cacheCreationInputTokens || 0,
+        cost_usd: usage.costUSD,
+      });
+    }
+  }
+
+  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, costs };
 }
diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts
index 758f0d3f..19da2de4 100644
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@@ -41,6 +41,7 @@ function recordE2E(name: string, suite: string, result: SkillTestResult, extra?:
     exit_reason: result.exitReason,
     timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
     last_tool_call: lastTool,
+    costs: result.costs,
     ...extra,
   });
 }