From 02925cfc7a479b1adb397e0c8d811fed24966b2c Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 16:47:27 -0500 Subject: [PATCH 1/8] feat: wire costs[] from modelUsage into eval results Extract per-model token usage from resultLine.modelUsage (including cache tokens and exact API cost), flow CostEntry[] through EvalCollector, aggregate in finalize(). Extend CostEntry with cache_read_input_tokens, cache_creation_input_tokens, cost_usd. computeCosts() prefers exact cost_usd over MODEL_PRICING when available (~4x more accurate with prompt caching). Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/eval-cost.ts | 23 +++++++--- lib/eval-format.ts | 4 ++ test/helpers/eval-store.test.ts | 68 +++++++++++++++++++++++++++++ test/helpers/eval-store.ts | 25 +++++++++++ test/helpers/session-runner.test.ts | 32 ++++++++++++++ test/helpers/session-runner.ts | 24 +++++++++- test/skill-e2e.test.ts | 1 + 7 files changed, 170 insertions(+), 7 deletions(-) diff --git a/lib/eval-cost.ts b/lib/eval-cost.ts index 1dbe31c8..ac520c88 100644 --- a/lib/eval-cost.ts +++ b/lib/eval-cost.ts @@ -55,6 +55,9 @@ function getPricing(model: string): { input: number; output: number } { export function computeCosts(costs: CostEntry[]): CostDashboard { const byModel = new Map(); + // Track exact cost_usd sums per model (from API-provided costs) + const exactCosts = new Map(); + for (const entry of costs) { const existing = byModel.get(entry.model); if (existing) { @@ -70,9 +73,12 @@ export function computeCosts(costs: CostEntry[]): CostDashboard { estimated_cost_usd: 0, }); } + if (entry.cost_usd !== undefined) { + exactCosts.set(entry.model, (exactCosts.get(entry.model) || 0) + entry.cost_usd); + } } - // Calculate costs + // Calculate costs — prefer exact cost_usd (accounts for cache discounts) let total = 0; let atFast = 0; let atFull = 0; @@ -80,13 +86,18 @@ export function computeCosts(costs: CostEntry[]): CostDashboard { const fullPricing = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING; for (const summary of byModel.values()) { - const pricing = getPricing(summary.model); - summary.estimated_cost_usd = - (summary.input_tokens / 1_000_000) * pricing.input + - (summary.output_tokens / 1_000_000) * pricing.output; + const exact = exactCosts.get(summary.model); + if (exact !== undefined) { + summary.estimated_cost_usd = exact; + } else { + const pricing = getPricing(summary.model); + summary.estimated_cost_usd = + (summary.input_tokens / 1_000_000) * pricing.input + + (summary.output_tokens / 1_000_000) * pricing.output; + } total += summary.estimated_cost_usd; - // What-if at fast/full tiers + // What-if at fast/full tiers (always from token counts) atFast += (summary.input_tokens / 1_000_000) * fastPricing.input + (summary.output_tokens / 1_000_000) * fastPricing.output; diff --git a/lib/eval-format.ts b/lib/eval-format.ts index 0dcc347d..6a88cac2 100644 --- a/lib/eval-format.ts +++ b/lib/eval-format.ts @@ -15,6 +15,10 @@ export interface CostEntry { calls: number; input_tokens: number; output_tokens: number; + cache_read_input_tokens?: number; + cache_creation_input_tokens?: number; + /** Exact cost from API when available (accounts for cache discounts). 
*/ + cost_usd?: number; } export interface FailureEntry { diff --git a/test/helpers/eval-store.test.ts b/test/helpers/eval-store.test.ts index a0539a0e..b0c5e742 100644 --- a/test/helpers/eval-store.test.ts +++ b/test/helpers/eval-store.test.ts @@ -128,6 +128,74 @@ describe('EvalCollector', () => { expect(data.tests).toHaveLength(0); expect(data.tier).toBe('llm-judge'); }); + + test('finalize aggregates per-test costs into result-level costs[]', async () => { + const collector = new EvalCollector('e2e', tmpDir); + collector.addTest(makeEntry({ + name: 'test-a', + costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 100, output_tokens: 50, cost_usd: 0.01 }], + })); + collector.addTest(makeEntry({ + name: 'test-b', + costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 200, output_tokens: 100, cost_usd: 0.02 }], + })); + collector.addTest(makeEntry({ + name: 'test-c', + costs: [{ model: 'claude-haiku-4-5', calls: 1, input_tokens: 50, output_tokens: 25, cost_usd: 0.005 }], + })); + + const filepath = await collector.finalize(); + const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8')); + + expect(data.costs).toBeDefined(); + expect(data.costs).toHaveLength(2); // two models + const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6'); + const haiku = data.costs!.find(c => c.model === 'claude-haiku-4-5'); + expect(sonnet).toBeDefined(); + expect(sonnet!.calls).toBe(2); + expect(sonnet!.input_tokens).toBe(300); + expect(sonnet!.output_tokens).toBe(150); + expect(sonnet!.cost_usd).toBeCloseTo(0.03); + expect(haiku).toBeDefined(); + expect(haiku!.calls).toBe(1); + expect(haiku!.cost_usd).toBeCloseTo(0.005); + }); + + test('finalize omits costs when no tests have cost data', async () => { + const collector = new EvalCollector('e2e', tmpDir); + collector.addTest(makeEntry({ name: 'no-costs' })); + const filepath = await collector.finalize(); + const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8')); + expect(data.costs).toBeUndefined(); + }); + + test('finalize aggregates cache token fields', async () => { + const collector = new EvalCollector('e2e', tmpDir); + collector.addTest(makeEntry({ + name: 'test-a', + costs: [{ + model: 'claude-sonnet-4-6', calls: 1, + input_tokens: 10, output_tokens: 50, + cache_read_input_tokens: 5000, cache_creation_input_tokens: 1000, + cost_usd: 0.01, + }], + })); + collector.addTest(makeEntry({ + name: 'test-b', + costs: [{ + model: 'claude-sonnet-4-6', calls: 1, + input_tokens: 20, output_tokens: 100, + cache_read_input_tokens: 8000, cache_creation_input_tokens: 500, + cost_usd: 0.02, + }], + })); + + const filepath = await collector.finalize(); + const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8')); + const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6')!; + expect(sonnet.cache_read_input_tokens).toBe(13000); + expect(sonnet.cache_creation_input_tokens).toBe(1500); + }); }); // --- extractToolSummary tests --- diff --git a/test/helpers/eval-store.ts b/test/helpers/eval-store.ts index 63534322..46f1ce88 100644 --- a/test/helpers/eval-store.ts +++ b/test/helpers/eval-store.ts @@ -13,6 +13,7 @@ import * as path from 'path'; import * as os from 'os'; import { spawnSync } from 'child_process'; import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '../../lib/util'; +import type { CostEntry } from '../../lib/eval-format'; const SCHEMA_VERSION = 1; const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals'); @@ -50,6 +51,9 @@ export 
interface EvalTestEntry { detected_bugs?: string[]; missed_bugs?: string[]; + // Per-model cost breakdown + costs?: CostEntry[]; + error?: string; } @@ -67,6 +71,7 @@ export interface EvalResult { total_cost_usd: number; total_duration_ms: number; tests: EvalTestEntry[]; + costs?: CostEntry[]; // aggregate per-model cost breakdown _partial?: boolean; // true for incremental saves, absent in final } @@ -414,6 +419,25 @@ export class EvalCollector { const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0); const passed = this.tests.filter(t => t.passed).length; + // Aggregate per-model costs across all tests + const costMap = new Map(); + for (const t of this.tests) { + for (const c of t.costs || []) { + const existing = costMap.get(c.model); + if (existing) { + existing.calls += c.calls; + existing.input_tokens += c.input_tokens; + existing.output_tokens += c.output_tokens; + existing.cache_read_input_tokens = (existing.cache_read_input_tokens || 0) + (c.cache_read_input_tokens || 0); + existing.cache_creation_input_tokens = (existing.cache_creation_input_tokens || 0) + (c.cache_creation_input_tokens || 0); + if (c.cost_usd !== undefined) existing.cost_usd = (existing.cost_usd || 0) + c.cost_usd; + } else { + costMap.set(c.model, { ...c }); + } + } + } + const costs = costMap.size > 0 ? [...costMap.values()] : undefined; + const result: EvalResult = { schema_version: SCHEMA_VERSION, version, @@ -428,6 +452,7 @@ export class EvalCollector { total_cost_usd: Math.round(totalCost * 100) / 100, total_duration_ms: totalDuration, tests: this.tests, + costs, }; // Write eval file diff --git a/test/helpers/session-runner.test.ts b/test/helpers/session-runner.test.ts index 812d4f8a..9a06dd66 100644 --- a/test/helpers/session-runner.test.ts +++ b/test/helpers/session-runner.test.ts @@ -93,4 +93,36 @@ describe('parseNDJSON', () => { expect(parsed.turnCount).toBe(2); expect(parsed.toolCalls).toHaveLength(0); }); + + test('resultLine preserves modelUsage for cost extraction', () => { + const lines = [ + '{"type":"assistant","message":{"model":"claude-sonnet-4-6","content":[{"type":"text","text":"ok"}]}}', + JSON.stringify({ + type: 'result', subtype: 'success', total_cost_usd: 0.07, + num_turns: 1, result: 'Done.', + usage: { input_tokens: 8, output_tokens: 802 }, + modelUsage: { + 'claude-sonnet-4-6': { + inputTokens: 8, outputTokens: 802, + cacheReadInputTokens: 88133, cacheCreationInputTokens: 9223, + costUSD: 0.07308, + }, + }, + }), + ]; + const parsed = parseNDJSON(lines); + expect(parsed.resultLine).not.toBeNull(); + expect(parsed.resultLine.modelUsage).toBeDefined(); + const usage = parsed.resultLine.modelUsage['claude-sonnet-4-6']; + expect(usage.inputTokens).toBe(8); + expect(usage.outputTokens).toBe(802); + expect(usage.cacheReadInputTokens).toBe(88133); + expect(usage.costUSD).toBeCloseTo(0.07308); + }); + + test('resultLine without modelUsage has undefined modelUsage', () => { + const parsed = parseNDJSON(FIXTURE_LINES); + // Original fixture has no modelUsage on result line + expect(parsed.resultLine?.modelUsage).toBeUndefined(); + }); }); diff --git a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts index 33c4cf14..b04465fa 100644 --- a/test/helpers/session-runner.ts +++ b/test/helpers/session-runner.ts @@ -10,6 +10,8 @@ import * as fs from 'fs'; import * as path from 'path'; import * as os from 'os'; import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util'; +import type { CostEntry } from '../../lib/eval-format'; +import { 
resolveTier, tierToModel } from '../../lib/eval-tier';

 const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');

@@ -34,6 +36,7 @@ export interface SkillTestResult {
   output: string;
   costEstimate: CostEstimate;
   transcript: any[];
+  costs: CostEntry[];
 }

 const BROWSE_ERROR_PATTERNS = [
@@ -135,8 +138,11 @@ export async function runSkillTest(options: {

   // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
   // avoid shell escaping issues. --verbose is required for stream-json mode.
+  // Model pinned via EVAL_TIER env var (default: sonnet).
+  const evalModel = tierToModel(resolveTier());
   const args = [
     '-p',
+    '--model', evalModel,
     '--output-format', 'stream-json',
     '--verbose',
     '--dangerously-skip-permissions',
@@ -323,5 +329,21 @@ export async function runSkillTest(options: {
     turnsUsed,
   };

-  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript };
+  // Extract per-model costs from resultLine.modelUsage (camelCase → snake_case)
+  const costs: CostEntry[] = [];
+  if (resultLine?.modelUsage) {
+    for (const [model, usage] of Object.entries(resultLine.modelUsage as Record<string, any>)) {
+      costs.push({
+        model,
+        calls: 1,
+        input_tokens: usage.inputTokens || 0,
+        output_tokens: usage.outputTokens || 0,
+        cache_read_input_tokens: usage.cacheReadInputTokens || 0,
+        cache_creation_input_tokens: usage.cacheCreationInputTokens || 0,
+        cost_usd: usage.costUSD,
+      });
+    }
+  }
+
+  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, costs };
 }
diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts
index 758f0d3f..19da2de4 100644
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@@ -41,6 +41,7 @@ function recordE2E(name: string, suite: string, result: SkillTestResult, extra?:
     exit_reason: result.exitReason,
     timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
     last_tool_call: lastTool,
+    costs: result.costs,
     ...extra,
   });
 }

From 59752fc5101bec9622cc4277cb427dcd4bff05b9 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Sun, 15 Mar 2026 16:47:35 -0500
Subject: [PATCH 2/8] feat: wire eval-cache + eval-tier into LLM judge, pin E2E model

callJudge/judge now return {result, meta} with SHA-based caching
(~$0.18/run savings when SKILL.md unchanged) and dynamic model selection
via EVAL_JUDGE_TIER env var. E2E tests pass --model from EVAL_TIER to
claude -p. outcomeJudge retains simple return type. All 8 LLM eval test
sites updated with real costs and costs[].

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 TODOS.md                       |   4 +-
 test/helpers/llm-judge.test.ts | 117 +++++++++++++++++++++++++++++
 test/helpers/llm-judge.ts      |  59 ++++++++++++++---
 test/skill-llm-eval.test.ts    |  99 ++++++++++++++-----------
 4 files changed, 227 insertions(+), 52 deletions(-)
 create mode 100644 test/helpers/llm-judge.test.ts

diff --git a/TODOS.md b/TODOS.md
index 4916c236..b5ec8ac3 100644
--- a/TODOS.md
+++ b/TODOS.md
@@ -231,7 +231,7 @@

 **Why:** Spot quality trends — is the app getting better or worse?

-**Context:** QA already writes structured reports. This adds cross-run comparison.
+**Context:** `eval:trend` now tracks test-level pass rates (eval infrastructure). QA-run-level trending (health scores over time across QA report files) is a separate feature that could reuse the `computeTrends` pattern from `lib/cli-eval.ts`.

 **Effort:** S
 **Priority:** P2
@@ -335,6 +335,8 @@

 **Why:** Reduce E2E test cost and flakiness.
+
+**Status:** Model pinning shipped (session-runner.ts passes `--model` from `EVAL_TIER` env). Retry:2 still TODO.
+
 **Effort:** XS
 **Priority:** P2

diff --git a/test/helpers/llm-judge.test.ts b/test/helpers/llm-judge.test.ts
new file mode 100644
index 00000000..03cf7788
--- /dev/null
+++ b/test/helpers/llm-judge.test.ts
@@ -0,0 +1,117 @@
+/**
+ * Tests for LLM judge cache + tier integration.
+ * Exercises the cache and tier modules directly; no Anthropic API calls.
+ */
+
+import { describe, test, expect, beforeEach, afterEach, mock } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+let tmpCacheDir: string;
+const origEnv: Record<string, string | undefined> = {};
+
+beforeEach(() => {
+  tmpCacheDir = fs.mkdtempSync(path.join(os.tmpdir(), 'llm-judge-test-'));
+  // Point cache to temp dir and clear tier env vars
+  origEnv.GSTACK_STATE_DIR = process.env.GSTACK_STATE_DIR;
+  origEnv.EVAL_JUDGE_TIER = process.env.EVAL_JUDGE_TIER;
+  origEnv.EVAL_TIER = process.env.EVAL_TIER;
+  origEnv.EVAL_CACHE = process.env.EVAL_CACHE;
+  process.env.GSTACK_STATE_DIR = tmpCacheDir;
+  delete process.env.EVAL_JUDGE_TIER;
+  delete process.env.EVAL_TIER;
+  delete process.env.EVAL_CACHE;
+});
+
+afterEach(() => {
+  // Restore env
+  for (const [key, val] of Object.entries(origEnv)) {
+    if (val === undefined) delete process.env[key];
+    else process.env[key] = val;
+  }
+  try { fs.rmSync(tmpCacheDir, { recursive: true, force: true }); } catch {}
+});
+
+// Test cache key computation directly (doesn't need mock)
+describe('cache key computation', () => {
+  test('computeCacheKey produces consistent hashes for same input', async () => {
+    const { computeCacheKey } = await import('../../lib/eval-cache');
+    const key1 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
+    const key2 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
+    expect(key1).toBe(key2);
+    expect(key1).toHaveLength(16);
+  });
+
+  test('cache key differs when model changes', async () => {
+    const { computeCacheKey } = await import('../../lib/eval-cache');
+    const key1 = computeCacheKey([], 'claude-sonnet-4-6:test prompt');
+    const key2 = computeCacheKey([], 'claude-haiku-4-5:test prompt');
+    expect(key1).not.toBe(key2);
+  });
+
+  test('cache key differs when prompt changes', async () => {
+    const { computeCacheKey } = await import('../../lib/eval-cache');
+    const key1 = computeCacheKey([], 'claude-sonnet-4-6:prompt A');
+    const key2 = computeCacheKey([], 'claude-sonnet-4-6:prompt B');
+    expect(key1).not.toBe(key2);
+  });
+});
+
+// Test cache read/write directly
+describe('cache read/write for llm-judge suite', () => {
+  test('cacheRead returns null on miss', async () => {
+    const { cacheRead } = await import('../../lib/eval-cache');
+    expect(cacheRead('llm-judge', 'nonexistent')).toBeNull();
+  });
+
+  test('cacheWrite + cacheRead round-trip', async () => {
+    const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
+    const data = { clarity: 5, completeness: 4, actionability: 5, reasoning: 'test' };
+    cacheWrite('llm-judge', 'test-key', data, { model: 'claude-sonnet-4-6' });
+    const cached = cacheRead('llm-judge', 'test-key');
+    expect(cached).toEqual(data);
+  });
+
+  test('EVAL_CACHE=0 bypasses cache read', async () => {
+    const { cacheRead, cacheWrite } = await import('../../lib/eval-cache');
+    cacheWrite('llm-judge', 'bypass-key', { test: true });
+    process.env.EVAL_CACHE = '0';
+    expect(cacheRead('llm-judge', 'bypass-key')).toBeNull();
+  });
+});
+
+// Test tier resolution
+describe('tier resolution for judge', () => {
test('defaults to standard (sonnet) when no env set', async () => { + const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier'); + expect(resolveJudgeTier()).toBe('standard'); + expect(tierToModel(resolveJudgeTier())).toBe('claude-sonnet-4-6'); + }); + + test('EVAL_JUDGE_TIER=haiku selects fast tier', async () => { + process.env.EVAL_JUDGE_TIER = 'haiku'; + // Need fresh import to pick up env change + const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier'); + expect(resolveJudgeTier()).toBe('fast'); + expect(tierToModel(resolveJudgeTier())).toBe('claude-haiku-4-5'); + }); + + test('EVAL_JUDGE_TIER=opus selects full tier', async () => { + process.env.EVAL_JUDGE_TIER = 'opus'; + const { resolveJudgeTier, tierToModel } = await import('../../lib/eval-tier'); + expect(resolveJudgeTier()).toBe('full'); + expect(tierToModel(resolveJudgeTier())).toBe('claude-opus-4-6'); + }); +}); + +// Test JudgeMeta shape +describe('JudgeMeta interface', () => { + test('exported from llm-judge module', async () => { + const mod = await import('./llm-judge'); + // Verify callJudge and judge are exported functions + expect(typeof mod.callJudge).toBe('function'); + expect(typeof mod.judge).toBe('function'); + expect(typeof mod.outcomeJudge).toBe('function'); + }); +}); diff --git a/test/helpers/llm-judge.ts b/test/helpers/llm-judge.ts index 7040cd6c..61d6927a 100644 --- a/test/helpers/llm-judge.ts +++ b/test/helpers/llm-judge.ts @@ -1,13 +1,19 @@ /** * Shared LLM-as-judge helpers for eval and E2E tests. * - * Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer), - * and outcomeJudge (planted-bug detection scorer). + * Provides callJudge (generic JSON-from-LLM with cache + tier support), + * judge (doc quality scorer), and outcomeJudge (planted-bug detection scorer). * - * Requires: ANTHROPIC_API_KEY env var + * Requires: ANTHROPIC_API_KEY env var (skipped on cache hit) + * + * Env vars: + * EVAL_JUDGE_TIER — model tier for judge calls (fast/standard/full, default: standard) + * EVAL_CACHE=0 — bypass cache, always re-run */ import Anthropic from '@anthropic-ai/sdk'; +import { computeCacheKey, cacheRead, cacheWrite } from '../../lib/eval-cache'; +import { resolveJudgeTier, tierToModel } from '../../lib/eval-tier'; export interface JudgeScore { clarity: number; // 1-5 @@ -25,15 +31,35 @@ export interface OutcomeJudgeResult { reasoning: string; } +export interface JudgeMeta { + model: string; + input_tokens: number; + output_tokens: number; + cached: boolean; +} + /** - * Call claude-sonnet-4-6 with a prompt, extract JSON response. + * Call the judge model with a prompt, extract JSON response. + * Uses eval-cache for SHA-based caching and eval-tier for model selection. * Retries once on 429 rate limit errors. 
 */
-export async function callJudge<T>(prompt: string): Promise<T> {
+export async function callJudge<T>(prompt: string): Promise<{ result: T; meta: JudgeMeta }> {
+  const model = tierToModel(resolveJudgeTier());
+
+  // Check cache (keyed by model + prompt content)
+  const cacheKey = computeCacheKey([], `${model}:${prompt}`);
+  const cached = cacheRead('llm-judge', cacheKey);
+  if (cached !== null) {
+    return {
+      result: cached as T,
+      meta: { model, input_tokens: 0, output_tokens: 0, cached: true },
+    };
+  }
+
   const client = new Anthropic();

   const makeRequest = () => client.messages.create({
-    model: 'claude-sonnet-4-6',
+    model,
     max_tokens: 1024,
     messages: [{ role: 'user', content: prompt }],
   });
@@ -53,13 +79,25 @@
   const text = response.content[0].type === 'text' ? response.content[0].text : '';
   const jsonMatch = text.match(/\{[\s\S]*\}/);
   if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
-  return JSON.parse(jsonMatch[0]) as T;
+  const result = JSON.parse(jsonMatch[0]) as T;
+
+  // Write to cache
+  cacheWrite('llm-judge', cacheKey, result, { model });
+
+  const meta: JudgeMeta = {
+    model,
+    input_tokens: (response.usage as any)?.input_tokens || 0,
+    output_tokens: (response.usage as any)?.output_tokens || 0,
+    cached: false,
+  };
+
+  return { result, meta };
 }

 /**
  * Score documentation quality on clarity/completeness/actionability (1-5).
  */
-export async function judge(section: string, content: string): Promise<JudgeScore> {
+export async function judge(section: string, content: string): Promise<{ result: JudgeScore; meta: JudgeMeta }> {
   return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.

 The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
@@ -92,12 +130,14 @@ ${content}`);

 /**
  * Evaluate a QA report against planted-bug ground truth.
  * Returns detection metrics for the planted bugs.
+ * Note: outcomeJudge returns just the result (not meta) for backward compat
+ * with E2E test callers. Cache still works internally.
  */
 export async function outcomeJudge(
   groundTruth: any,
   report: string,
 ): Promise<OutcomeJudgeResult> {
-  return callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
+  const { result } = await callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.

 GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
 ${JSON.stringify(groundTruth.bugs, null, 2)}
@@ -127,4 +167,5 @@ Rules:
 - detection_rate = length of detected array
 - evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references? 5 = excellent evidence for every bug, 1 = no evidence at all`);
+  return result;
 }
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index ba635613..2889538c 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -7,16 +7,18 @@
  * Requires: ANTHROPIC_API_KEY env var (or EVALS=1 with key already set)
  * Run: EVALS=1 bun run test:eval
  *
- * Cost: ~$0.05-0.15 per run (sonnet)
+ * Cost: ~$0.05-0.15 per run (sonnet), $0 on cache hit
+ * Cache: SHA-based via eval-cache. Set EVAL_CACHE=0 to force re-run.
+ * Model: Set EVAL_JUDGE_TIER=haiku|sonnet|opus to override (default: sonnet).
*/ import { describe, test, expect, afterAll } from 'bun:test'; -import Anthropic from '@anthropic-ai/sdk'; import * as fs from 'fs'; import * as path from 'path'; import { callJudge, judge } from './helpers/llm-judge'; -import type { JudgeScore } from './helpers/llm-judge'; +import type { JudgeMeta } from './helpers/llm-judge'; import { EvalCollector } from './helpers/eval-store'; +import { MODEL_PRICING } from '../lib/eval-cost'; const ROOT = path.resolve(import.meta.dir, '..'); // Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env) @@ -26,6 +28,22 @@ const describeEval = evalsEnabled ? describe : describe.skip; // Eval result collector const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null; +/** Compute actual judge cost from meta (0 on cache hit). */ +function judgeCost(meta: JudgeMeta): number { + if (meta.cached) return 0; + const p = MODEL_PRICING[meta.model] || { input: 3.0, output: 15.0 }; + return (meta.input_tokens / 1_000_000) * p.input + (meta.output_tokens / 1_000_000) * p.output; +} + +/** Build CostEntry array from judge meta (empty on cache hit). */ +function judgeCosts(meta: JudgeMeta) { + if (meta.cached) return []; + return [{ + model: meta.model, calls: 1, + input_tokens: meta.input_tokens, output_tokens: meta.output_tokens, + }]; +} + describeEval('LLM-as-judge quality evals', () => { test('command reference table scores >= 4 on all dimensions', async () => { const t0 = Date.now(); @@ -34,8 +52,8 @@ describeEval('LLM-as-judge quality evals', () => { const end = content.indexOf('## Tips'); const section = content.slice(start, end); - const scores = await judge('command reference table', section); - console.log('Command reference scores:', JSON.stringify(scores, null, 2)); + const { result: scores, meta } = await judge('command reference table', section); + console.log('Command reference scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'command reference table', @@ -43,9 +61,10 @@ describeEval('LLM-as-judge quality evals', () => { tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -60,8 +79,8 @@ describeEval('LLM-as-judge quality evals', () => { const end = content.indexOf('## Command Reference'); const section = content.slice(start, end); - const scores = await judge('snapshot flags reference', section); - console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2)); + const { result: scores, meta } = await judge('snapshot flags reference', section); + console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2), meta.cached ? 
'(cached)' : ''); evalCollector?.addTest({ name: 'snapshot flags reference', @@ -69,9 +88,10 @@ describeEval('LLM-as-judge quality evals', () => { tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -85,8 +105,8 @@ describeEval('LLM-as-judge quality evals', () => { const start = content.indexOf('## Snapshot Flags'); const section = content.slice(start); - const scores = await judge('browse skill reference (flags + commands)', section); - console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2)); + const { result: scores, meta } = await judge('browse skill reference (flags + commands)', section); + console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'browse/SKILL.md reference', @@ -94,9 +114,10 @@ describeEval('LLM-as-judge quality evals', () => { tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -111,8 +132,8 @@ describeEval('LLM-as-judge quality evals', () => { const setupEnd = content.indexOf('## IMPORTANT'); const section = content.slice(setupStart, setupEnd); - const scores = await judge('setup/binary discovery instructions', section); - console.log('Setup block scores:', JSON.stringify(scores, null, 2)); + const { result: scores, meta } = await judge('setup/binary discovery instructions', section); + console.log('Setup block scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'setup block', @@ -120,9 +141,10 @@ describeEval('LLM-as-judge quality evals', () => { tier: 'llm-judge', passed: scores.actionability >= 3 && scores.clarity >= 3, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); // Setup block is intentionally minimal (binary discovery only). @@ -171,13 +193,7 @@ describeEval('LLM-as-judge quality evals', () => { | \`is \` | State check (visible/hidden/enabled/disabled/checked/editable/focused) | | \`console [--clear\\|--errors]\` | Console messages (--errors filters to error/warning) |`; - const client = new Anthropic(); - const response = await client.messages.create({ - model: 'claude-sonnet-4-6', - max_tokens: 1024, - messages: [{ - role: 'user', - content: `You are comparing two versions of CLI documentation for an AI coding agent. + const { result, meta } = await callJudge<{ winner: string; reasoning: string; a_score: number; b_score: number }>(`You are comparing two versions of CLI documentation for an AI coding agent. VERSION A (baseline — hand-maintained): ${baseline} @@ -193,15 +209,9 @@ Which version is better for an AI agent trying to use these commands? 
Consider: Respond with ONLY valid JSON: {"winner": "A" or "B" or "tie", "reasoning": "brief explanation", "a_score": N, "b_score": N} -Scores are 1-5 overall quality.`, - }], - }); +Scores are 1-5 overall quality.`); - const text = response.content[0].type === 'text' ? response.content[0].text : ''; - const jsonMatch = text.match(/\{[\s\S]*\}/); - if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`); - const result = JSON.parse(jsonMatch[0]); - console.log('Regression comparison:', JSON.stringify(result, null, 2)); + console.log('Regression comparison:', JSON.stringify(result, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'regression vs baseline', @@ -209,9 +219,10 @@ Scores are 1-5 overall quality.`, tier: 'llm-judge', passed: result.b_score >= result.a_score, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { a_score: result.a_score, b_score: result.b_score }, judge_reasoning: result.reasoning, + costs: judgeCosts(meta), }); expect(result.b_score).toBeGreaterThanOrEqual(result.a_score); @@ -229,7 +240,7 @@ describeEval('QA skill quality evals', () => { const end = qaContent.indexOf('## Health Score Rubric'); const section = qaContent.slice(start, end); - const scores = await callJudge(`You are evaluating the quality of a QA testing workflow document for an AI coding agent. + const { result: scores, meta } = await callJudge<{ clarity: number; completeness: number; actionability: number; reasoning: string }>(`You are evaluating the quality of a QA testing workflow document for an AI coding agent. The agent reads this document to learn how to systematically QA test a web application. The workflow references a headless browser CLI ($B commands) that is documented separately — do NOT penalize for missing CLI definitions. @@ -246,7 +257,7 @@ Respond with ONLY valid JSON: Here is the QA workflow to evaluate: ${section}`); - console.log('QA workflow scores:', JSON.stringify(scores, null, 2)); + console.log('QA workflow scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'qa/SKILL.md workflow', @@ -254,9 +265,10 @@ ${section}`); tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -271,7 +283,7 @@ ${section}`); const start = qaContent.indexOf('## Health Score Rubric'); const section = qaContent.slice(start); - const scores = await callJudge(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score. + const { result: scores, meta } = await callJudge<{ clarity: number; completeness: number; actionability: number; reasoning: string }>(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score. The agent uses this rubric after QA testing a website. It needs to: 1. Understand each scoring category and what counts as a deduction @@ -289,7 +301,7 @@ Respond with ONLY valid JSON: Here is the rubric to evaluate: ${section}`); - console.log('QA health rubric scores:', JSON.stringify(scores, null, 2)); + console.log('QA health rubric scores:', JSON.stringify(scores, null, 2), meta.cached ? 
'(cached)' : ''); evalCollector?.addTest({ name: 'qa/SKILL.md health rubric', @@ -297,9 +309,10 @@ ${section}`); tier: 'llm-judge', passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, judge_reasoning: scores.reasoning, + costs: judgeCosts(meta), }); expect(scores.clarity).toBeGreaterThanOrEqual(4); @@ -332,7 +345,7 @@ describeEval('Cross-skill consistency evals', () => { extractGrepLines(retroContent, 'retro/SKILL.md'), ].join('\n\n'); - const result = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently. + const { result, meta } = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently. INTENDED ARCHITECTURE: - greptile-history has TWO paths: per-project (~/.gstack/projects/{slug}/greptile-history.md) and global (~/.gstack/greptile-history.md) @@ -355,7 +368,7 @@ Evaluate consistency. Respond with ONLY valid JSON: score (1-5): 5 = perfectly consistent, 1 = contradictory`); - console.log('Cross-skill consistency:', JSON.stringify(result, null, 2)); + console.log('Cross-skill consistency:', JSON.stringify(result, null, 2), meta.cached ? '(cached)' : ''); evalCollector?.addTest({ name: 'cross-skill greptile consistency', @@ -363,9 +376,10 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`); tier: 'llm-judge', passed: result.consistent && result.score >= 4, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { consistency_score: result.score }, judge_reasoning: result.reasoning, + costs: judgeCosts(meta), }); expect(result.consistent).toBe(true); @@ -392,7 +406,7 @@ describeEval('Baseline score pinning', () => { const cmdStart = skillContent.indexOf('## Command Reference'); const cmdEnd = skillContent.indexOf('## Tips'); const cmdSection = skillContent.slice(cmdStart, cmdEnd); - const cmdScores = await judge('command reference table', cmdSection); + const { result: cmdScores, meta } = await judge('command reference table', cmdSection); for (const dim of ['clarity', 'completeness', 'actionability'] as const) { if (cmdScores[dim] < baselines.command_reference[dim]) { @@ -417,9 +431,10 @@ describeEval('Baseline score pinning', () => { tier: 'llm-judge', passed, duration_ms: Date.now() - t0, - cost_usd: 0.02, + cost_usd: judgeCost(meta), judge_scores: { clarity: cmdScores.clarity, completeness: cmdScores.completeness, actionability: cmdScores.actionability }, judge_reasoning: passed ? 'All scores at or above baseline' : regressions.join('; '), + costs: judgeCosts(meta), }); if (!passed) { From daea165333311848afcfd58aebaf711a71aff0b5 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 16:47:41 -0500 Subject: [PATCH 3/8] feat: add eval:trend CLI for per-test pass rate tracking computeTrends() classifies tests as stable-pass/stable-fail/flaky/ improving/degrading based on pass rate, flip count, and recent streak. gstack eval trend shows sparkline table with --limit, --tier, --test filters. Guard CLI main block with import.meta.main to prevent execution on import. 
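
Example usage (flags as parsed by cmdTrend; the test/tier names below are
real entries from this branch's suites, output shape illustrative):

    bun run eval:trend                        # last 10 runs, all tests
    bun run eval:trend --limit 20 --tier e2e
    bun run eval:trend --test "command reference table"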
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 lib/cli-eval.ts             | 192 ++++++++++++++++++++++++++++++++++-
 package.json                |   1 +
 test/lib-eval-trend.test.ts | 193 ++++++++++++++++++++++++++++++++++++
 3 files changed, 385 insertions(+), 1 deletion(-)
 create mode 100644 test/lib-eval-trend.test.ts

diff --git a/lib/cli-eval.ts b/lib/cli-eval.ts
index df16d033..bee75ae0 100644
--- a/lib/cli-eval.ts
+++ b/lib/cli-eval.ts
@@ -258,6 +258,7 @@ async function cmdSummary(args: string[]): Promise<void> {
   if (flakyTests.length > 0) {
     console.log(`  Flaky tests (${flakyTests.length}):`);
     for (const name of flakyTests) console.log(`    - ${name}`);
+    console.log(`  Run 'bun run eval:trend' for detailed time series.`);
     console.log('─'.repeat(60));
   }

@@ -429,6 +430,191 @@ async function cmdWatch(): Promise<void> {
   process.exit(exitCode);
 }

+// --- Trend tracking ---
+
+export interface TestTrend {
+  name: string;
+  tier: string;
+  results: Array<{ timestamp: string; passed: boolean }>;
+  passRate: number;
+  streak: { type: 'pass' | 'fail'; count: number };
+  flipCount: number;
+  status: 'stable-pass' | 'stable-fail' | 'flaky' | 'improving' | 'degrading';
+}
+
+/**
+ * Compute per-test pass rate trends from eval results.
+ * Pure function — no I/O. Takes runs newest-first (loadEvalResults order); returns each test's series oldest-first.
+ */
+export function computeTrends(
+  results: EvalResult[],
+  filterTier?: string,
+  filterTest?: string,
+): TestTrend[] {
+  // Build time series per test (chronological — oldest first)
+  const byTest = new Map<string, Array<{ timestamp: string; passed: boolean }>>();
+
+  // Results from loadEvalResults are newest-first, so reverse for chronological
+  const chronological = [...results].reverse();
+
+  for (const r of chronological) {
+    if (filterTier && r.tier !== filterTier) continue;
+    for (const t of r.tests) {
+      if (filterTest && t.name !== filterTest) continue;
+      const key = `${r.tier}:${t.name}`;
+      if (!byTest.has(key)) byTest.set(key, []);
+      byTest.get(key)!.push({ timestamp: r.timestamp, passed: t.passed });
+    }
+  }
+
+  const trends: TestTrend[] = [];
+
+  for (const [key, results] of byTest) {
+    const [tier, ...nameParts] = key.split(':');
+    const name = nameParts.join(':');
+    const total = results.length;
+    const passCount = results.filter(r => r.passed).length;
+    const passRate = total > 0 ? passCount / total : 0;
+
+    // Streak: walk from newest (end of array) backward
+    let streakType: 'pass' | 'fail' = results[results.length - 1].passed ? 'pass' : 'fail';
+    let streakCount = 0;
+    for (let i = results.length - 1; i >= 0; i--) {
+      const r = results[i].passed ? 'pass' : 'fail';
+      if (r === streakType) streakCount++;
+      else break;
+    }
+
+    // Flip count: transitions between pass and fail
+    let flipCount = 0;
+    for (let i = 1; i < results.length; i++) {
+      if (results[i].passed !== results[i - 1].passed) flipCount++;
+    }
+
+    // Classify status
+    let status: TestTrend['status'];
+    const last3 = results.slice(-3);
+    const earlier = results.slice(0, -3);
+    const last3AllPass = last3.length >= 3 && last3.every(r => r.passed);
+    const last3HasFail = last3.some(r => !r.passed);
+    const earlierHadFailures = earlier.some(r => !r.passed);
+    const earlierWasPassing = earlier.length > 0 && earlier.every(r => r.passed);
+
+    // Check improving/degrading first — a clear recent trend outranks raw pass rate
+    if (last3AllPass && earlierHadFailures) {
+      status = 'improving';
+    } else if (last3HasFail && earlierWasPassing) {
+      status = 'degrading';
+    } else if (flipCount >= 3 || (passRate > 0.3 && passRate < 0.7)) {
+      status = 'flaky';
+    } else if (passRate >= 0.9 && flipCount <= 1) {
+      status = 'stable-pass';
+    } else if (passRate <= 0.1 && flipCount <= 1) {
+      status = 'stable-fail';
+    } else if (passRate >= 0.5) {
+      status = 'stable-pass';
+    } else {
+      status = 'stable-fail';
+    }
+
+    trends.push({
+      name, tier, results, passRate,
+      streak: { type: streakType, count: streakCount },
+      flipCount, status,
+    });
+  }
+
+  // Sort: flaky first, then flipCount desc, then name
+  trends.sort((a, b) => {
+    const statusOrder = { flaky: 0, degrading: 1, improving: 2, 'stable-fail': 3, 'stable-pass': 4 };
+    const sa = statusOrder[a.status] ?? 5;
+    const sb = statusOrder[b.status] ?? 5;
+    if (sa !== sb) return sa - sb;
+    if (a.flipCount !== b.flipCount) return b.flipCount - a.flipCount;
+    return a.name.localeCompare(b.name);
+  });
+
+  return trends;
+}
+
+async function cmdTrend(args: string[]): Promise<void> {
+  let limit = 10;
+  let filterTier: string | undefined;
+  let filterTest: string | undefined;
+
+  for (let i = 0; i < args.length; i++) {
+    if (args[i] === '--limit' && args[i + 1]) { limit = parseInt(args[++i], 10); }
+    else if (args[i] === '--tier' && args[i + 1]) { filterTier = args[++i]; }
+    else if (args[i] === '--test' && args[i + 1]) { filterTest = args[++i]; }
+  }
+
+  const results = loadEvalResults(undefined, limit);
+  if (results.length === 0) {
+    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+    return;
+  }
+
+  const trends = computeTrends(results, filterTier, filterTest);
+
+  if (trends.length === 0) {
+    console.log('No test data matching filters.');
+    return;
+  }
+
+  // Determine how many result columns to show
+  const maxResults = Math.min(limit, Math.max(...trends.map(t => t.results.length)));
+
+  console.log('');
+  console.log(`Test Trends (last ${results.length} runs)`);
+  console.log('═'.repeat(80));
+  console.log(
+    '  ' +
+    'Test Name'.padEnd(36) +
+    'Rate'.padEnd(7) +
+    `Last ${maxResults}`.padEnd(maxResults + 3) +
+    'Streak'.padEnd(8) +
+    'Status'
+  );
+  console.log('─'.repeat(80));
+
+  let flakyCount = 0;
+  let degradingCount = 0;
+
+  for (const t of trends) {
+    if (t.status === 'flaky') flakyCount++;
+    if (t.status === 'degrading') degradingCount++;
+
+    const fullName = `${t.tier}:${t.name}`;
+    const displayName = (fullName.length > 34 ? fullName.slice(0, 31) + '...' : fullName).padEnd(36);
+    const rate = `${Math.round(t.passRate * 100)}%`.padEnd(7);
+
+    // Build sparkline of last N results
+    const sparkline = t.results
+      .slice(-maxResults)
+      .map(r => r.passed ?
'\u2713' : '\u2717') + .join(''); + + const streak = `${t.streak.count}${t.streak.type === 'pass' ? '\u2713' : '\u2717'}`.padEnd(8); + + // Color status + let statusStr = t.status; + if (isTTY) { + if (t.status === 'flaky' || t.status === 'degrading') statusStr = red(t.status); + else if (t.status === 'stable-pass' || t.status === 'improving') statusStr = green(t.status); + else statusStr = dim(t.status); + } + + console.log(` ${displayName}${rate}${sparkline.padEnd(maxResults + 3)}${streak}${statusStr}`); + } + + console.log('─'.repeat(80)); + const parts: string[] = [`${trends.length} tests tracked`]; + if (flakyCount > 0) parts.push(`${flakyCount} flaky`); + if (degradingCount > 0) parts.push(`${degradingCount} degrading`); + console.log(` ${parts.join(' | ')}`); + console.log(''); +} + function printUsage(): void { console.log(` gstack eval — eval management CLI @@ -441,13 +627,15 @@ Commands: summary [--limit N] Aggregate stats across all runs push Validate + save + sync an eval result cost Show per-model cost breakdown + trend [--limit N] [--tier X] [--test X] Per-test pass rate trends cache read|write|stats|clear|verify Manage eval cache watch Live E2E test dashboard `); } -// --- Main --- +// --- Main (only when run directly, not imported) --- +if (import.meta.main) { const command = process.argv[2]; const cmdArgs = process.argv.slice(3); @@ -457,6 +645,7 @@ switch (command) { case 'summary': cmdSummary(cmdArgs); break; case 'push': cmdPush(cmdArgs); break; case 'cost': cmdCost(cmdArgs); break; + case 'trend': cmdTrend(cmdArgs); break; case 'cache': cmdCache(cmdArgs); break; case 'watch': cmdWatch(); break; case '--help': case '-h': case 'help': case undefined: @@ -467,3 +656,4 @@ switch (command) { printUsage(); process.exit(1); } +} diff --git a/package.json b/package.json index 18090e7d..da816815 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,7 @@ "eval:list": "bun run lib/cli-eval.ts list", "eval:compare": "bun run lib/cli-eval.ts compare", "eval:summary": "bun run lib/cli-eval.ts summary", + "eval:trend": "bun run lib/cli-eval.ts trend", "eval:watch": "bun run lib/cli-eval.ts watch" }, "dependencies": { diff --git a/test/lib-eval-trend.test.ts b/test/lib-eval-trend.test.ts new file mode 100644 index 00000000..c15aa149 --- /dev/null +++ b/test/lib-eval-trend.test.ts @@ -0,0 +1,193 @@ +/** + * Tests for computeTrends() — per-test pass rate trend tracking. + */ + +import { describe, test, expect } from 'bun:test'; +import { computeTrends } from '../lib/cli-eval'; +import type { EvalResult } from './helpers/eval-store'; + +/** Build a minimal EvalResult with given tests. 
*/ +function makeRun(opts: { + timestamp: string; + tier?: 'e2e' | 'llm-judge'; + tests: Array<{ name: string; passed: boolean }>; +}): EvalResult { + return { + schema_version: 1, + version: '0.3.3', + branch: 'main', + git_sha: 'abc', + timestamp: opts.timestamp, + hostname: 'test', + tier: opts.tier || 'e2e', + total_tests: opts.tests.length, + passed: opts.tests.filter(t => t.passed).length, + failed: opts.tests.filter(t => !t.passed).length, + total_cost_usd: 0, + total_duration_ms: 0, + tests: opts.tests.map(t => ({ + name: t.name, suite: 'test', tier: opts.tier || 'e2e' as const, + passed: t.passed, duration_ms: 0, cost_usd: 0, + })), + }; +} + +describe('computeTrends', () => { + test('classifies stable-pass test correctly', () => { + // 10 runs all passing — results are newest-first (loadEvalResults order) + const results = Array.from({ length: 10 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'always-pass', passed: true }], + })).reverse(); // newest first + + const trends = computeTrends(results); + expect(trends).toHaveLength(1); + expect(trends[0].status).toBe('stable-pass'); + expect(trends[0].passRate).toBe(1); + expect(trends[0].streak).toEqual({ type: 'pass', count: 10 }); + expect(trends[0].flipCount).toBe(0); + }); + + test('classifies stable-fail test correctly', () => { + const results = Array.from({ length: 10 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'always-fail', passed: false }], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].status).toBe('stable-fail'); + expect(trends[0].passRate).toBe(0); + expect(trends[0].streak).toEqual({ type: 'fail', count: 10 }); + }); + + test('classifies flaky test correctly — alternating pass/fail', () => { + const results = Array.from({ length: 10 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'flaky', passed: i % 2 === 0 }], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].status).toBe('flaky'); + expect(trends[0].flipCount).toBe(9); + expect(trends[0].passRate).toBe(0.5); + }); + + test('classifies improving test correctly', () => { + // First 5 fail, last 5 pass + const results = Array.from({ length: 10 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'improving', passed: i >= 5 }], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].status).toBe('improving'); + expect(trends[0].streak).toEqual({ type: 'pass', count: 5 }); + }); + + test('classifies degrading test correctly', () => { + // First 7 pass, last 3 fail + const results = Array.from({ length: 10 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'degrading', passed: i < 7 }], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].status).toBe('degrading'); + expect(trends[0].streak).toEqual({ type: 'fail', count: 3 }); + }); + + test('computes streak correctly with mixed ending', () => { + // pass, pass, fail, pass, pass, pass (newest) + const passed = [true, true, false, true, true, true]; + const results = passed.map((p, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'test', passed: p }], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].streak).toEqual({ type: 
'pass', count: 3 }); + }); + + test('computes flipCount correctly', () => { + // pass, fail, pass, pass, fail, pass → 4 flips + const passed = [true, false, true, true, false, true]; + const results = passed.map((p, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [{ name: 'test', passed: p }], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].flipCount).toBe(4); + }); + + test('handles single run', () => { + const results = [makeRun({ + timestamp: '2026-03-15T00:00:00Z', + tests: [{ name: 'single', passed: true }], + })]; + + const trends = computeTrends(results); + expect(trends).toHaveLength(1); + expect(trends[0].passRate).toBe(1); + expect(trends[0].streak).toEqual({ type: 'pass', count: 1 }); + expect(trends[0].flipCount).toBe(0); + expect(trends[0].status).toBe('stable-pass'); + }); + + test('handles single failing run', () => { + const results = [makeRun({ + timestamp: '2026-03-15T00:00:00Z', + tests: [{ name: 'single-fail', passed: false }], + })]; + + const trends = computeTrends(results); + expect(trends[0].status).toBe('stable-fail'); + }); + + test('filters by tier', () => { + const results = [ + makeRun({ timestamp: '2026-03-15T00:00:00Z', tier: 'e2e', tests: [{ name: 'e2e-test', passed: true }] }), + makeRun({ timestamp: '2026-03-15T00:00:00Z', tier: 'llm-judge', tests: [{ name: 'judge-test', passed: true }] }), + ]; + + const e2eOnly = computeTrends(results, 'e2e'); + expect(e2eOnly).toHaveLength(1); + expect(e2eOnly[0].name).toBe('e2e-test'); + + const judgeOnly = computeTrends(results, 'llm-judge'); + expect(judgeOnly).toHaveLength(1); + expect(judgeOnly[0].name).toBe('judge-test'); + }); + + test('filters by test name', () => { + const results = Array.from({ length: 3 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [ + { name: 'test-a', passed: true }, + { name: 'test-b', passed: false }, + ], + })).reverse(); + + const filtered = computeTrends(results, undefined, 'test-a'); + expect(filtered).toHaveLength(1); + expect(filtered[0].name).toBe('test-a'); + expect(filtered[0].passRate).toBe(1); + }); + + test('sorts flaky tests first', () => { + // Create runs where test-a is flaky and test-b is stable + const results = Array.from({ length: 6 }, (_, i) => makeRun({ + timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`, + tests: [ + { name: 'test-a', passed: i % 2 === 0 }, // flaky: alternating + { name: 'test-b', passed: true }, // stable-pass + ], + })).reverse(); + + const trends = computeTrends(results); + expect(trends[0].name).toBe('test-a'); + expect(trends[0].status).toBe('flaky'); + expect(trends[1].name).toBe('test-b'); + expect(trends[1].status).toBe('stable-pass'); + }); +}); From 33c95528702bec20cce57f7c47b33bd252575402 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 16:47:46 -0500 Subject: [PATCH 4/8] chore: update gitignore Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index cc41a3e7..37f571b6 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ bun.lock .env.local .env.* !.env.example +.gstack-sync.json From e28033353dd64cc7958f8c95cac83114559b03f0 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 16:55:34 -0500 Subject: [PATCH 5/8] chore: bump v0.3.10, update CHANGELOG and docs Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 18 ++++++++++++++++++ CLAUDE.md | 1 + CONTRIBUTING.md | 5 
++++- VERSION | 2 +- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c571e6e..b040306b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,23 @@ # Changelog +## 0.3.10 — 2026-03-15 + +### Added +- **Per-model cost tracking** — eval results now include `costs[]` with exact per-model token usage (input, output, cache read, cache creation) and API-reported cost. Extracted from `resultLine.modelUsage` in `claude -p` NDJSON stream. `computeCosts()` prefers exact `cost_usd` over MODEL_PRICING estimates (~4x more accurate with prompt caching). +- **LLM judge caching** — SHA-based caching for LLM-as-judge eval calls via `eval-cache.ts`. Cache keyed by `model:prompt`, so unchanged SKILL.md content skips API calls entirely. ~$0.18/run savings. Set `EVAL_CACHE=0` to force re-run. +- **Dynamic model selection** — `EVAL_JUDGE_TIER` env var controls which Claude model runs judge evals (haiku/sonnet/opus, default: sonnet). `EVAL_TIER` pins the E2E test model via `--model` flag to `claude -p`. +- **`bun run eval:trend`** — per-test pass rate tracking over last N runs. Classifies tests as stable-pass, stable-fail, flaky, improving, or degrading. Sparkline table with `--limit`, `--tier`, `--test` filters. Answers "is /retro getting more reliable?" instantly. +- **CostEntry extended** — `cache_read_input_tokens`, `cache_creation_input_tokens`, `cost_usd` optional fields for accurate cache-aware cost reporting. +- 22 new tests: 10 cache/tier integration (llm-judge.test.ts), 12 trend classification (lib-eval-trend.test.ts). + +### Changed +- `callJudge()` and `judge()` now return `{ result, meta }` with `JudgeMeta` (model, tokens, cached flag). `outcomeJudge()` retains simple return type for E2E callers. +- `EvalCollector.finalize()` aggregates per-test `costs[]` into result-level cost breakdown. +- `cli-eval.ts` main block guarded with `import.meta.main` to prevent execution on import. +- `eval:summary` now hints to run `eval:trend` when flaky tests are detected. +- All 8 LLM eval test sites updated from hard-coded `cost_usd: 0.02` to real API-reported costs. +- Regression test refactored from direct `Anthropic()` client to `callJudge()` (benefits from cache + tier). + ## 0.3.9 — 2026-03-15 ### Added diff --git a/CLAUDE.md b/CLAUDE.md index c6909357..681566b3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -15,6 +15,7 @@ bun run dev:skill # watch mode: auto-regen + validate on change bun run eval:list # list all eval runs from ~/.gstack-dev/evals/ bun run eval:compare # compare two eval runs (auto-picks most recent) bun run eval:summary # aggregate stats across all eval runs +bun run eval:trend # per-test pass rate trends (flaky detection) ``` `test:evals` requires `ANTHROPIC_API_KEY`. E2E tests stream progress in real-time diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 34e502ea..0116be43 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -134,6 +134,8 @@ When E2E tests run, they produce machine-readable artifacts in `~/.gstack-dev/`: bun run eval:list # list all eval runs bun run eval:compare # compare two runs (auto-picks most recent) bun run eval:summary # aggregate stats across all runs +bun run eval:trend # per-test pass rate over last N runs (flaky detection) +bun run eval:cache stats # check LLM judge cache hit rate ``` Artifacts are never cleaned up — they accumulate in `~/.gstack-dev/` for post-mortem debugging and trend analysis. @@ -152,7 +154,8 @@ Each dimension is scored 1-5. Threshold: every dimension must score **≥ 4**. 
T # Needs ANTHROPIC_API_KEY in .env — included in bun run test:evals ``` -- Uses `claude-sonnet-4-6` for scoring stability +- Model defaults to `claude-sonnet-4-6`; override with `EVAL_JUDGE_TIER=haiku|opus` +- Results are SHA-cached — unchanged SKILL.md content skips API calls ($0 on repeat runs). Set `EVAL_CACHE=0` to force re-run. - Tests live in `test/skill-llm-eval.test.ts` - Calls the Anthropic API directly (not `claude -p`), so it works from anywhere including inside Claude Code diff --git a/VERSION b/VERSION index 940ac09a..5503126d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.3.9 +0.3.10 From eb7ef2153b8b299b942c17ffc1f26e8996471d9e Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 17:04:49 -0500 Subject: [PATCH 6/8] docs: add setup comments to .gstack-sync.json.example Explain what team sync gives you, that it's optional, and how to set it up. Points to TEAM_COORDINATION_STORE.md for full guide. Co-Authored-By: Claude Opus 4.6 (1M context) --- .gstack-sync.json.example | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gstack-sync.json.example b/.gstack-sync.json.example index 4803eb42..6dc6dce7 100644 --- a/.gstack-sync.json.example +++ b/.gstack-sync.json.example @@ -1,4 +1,9 @@ { + "_comment": "OPTIONAL: Team sync configuration for shared eval/retro/QA data via Supabase.", + "_docs": "See docs/designs/TEAM_COORDINATION_STORE.md for full setup guide.", + "_what_you_get": "Shared eval dashboards, cross-team trend tracking, retro aggregation, QA report history. Without this file, everything works locally — sync is purely additive.", + "_setup": "1. Create a Supabase project. 2. Run supabase/migrations/*.sql in order. 3. Copy this file to .gstack-sync.json and fill in your values. 4. Set GSTACK_SUPABASE_ACCESS_TOKEN or run gstack sync login.", + "supabase_url": "https://YOUR_PROJECT.supabase.co", "supabase_anon_key": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.YOUR_ANON_KEY_HERE", "team_slug": "your-team-name" From 14320469b012830fcc046ba86eb32b95a4f064c0 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 17:05:45 -0500 Subject: [PATCH 7/8] docs: CHANGELOG covers full branch scope including team sync Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b040306b..b4151b1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,20 +3,25 @@ ## 0.3.10 — 2026-03-15 ### Added -- **Per-model cost tracking** — eval results now include `costs[]` with exact per-model token usage (input, output, cache read, cache creation) and API-reported cost. Extracted from `resultLine.modelUsage` in `claude -p` NDJSON stream. `computeCosts()` prefers exact `cost_usd` over MODEL_PRICING estimates (~4x more accurate with prompt caching). +- **Team sync via Supabase (optional)** — shared data store for eval results, retro snapshots, QA reports, ship logs, and Greptile triage across team members. All sync operations are non-fatal and non-blocking — skills never wait on network. Offline queue with automatic retry (up to 5 attempts). Zero impact when not configured: without `.gstack-sync.json`, everything works locally as before. See `docs/designs/TEAM_COORDINATION_STORE.md` for architecture and setup. +- **Supabase migration SQL** — 4 migration files in `supabase/migrations/` for teams, eval_runs, data tables (retros, QA, ships, Greptile), and eval costs. Row-level security policies ensure team members can only access their own team's data. 
From 14320469b012830fcc046ba86eb32b95a4f064c0 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Sun, 15 Mar 2026 17:05:45 -0500
Subject: [PATCH 7/8] docs: CHANGELOG covers full branch scope including team sync

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 CHANGELOG.md | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b040306b..b4151b1b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,20 +3,25 @@
 ## 0.3.10 — 2026-03-15
 
 ### Added
-- **Per-model cost tracking** — eval results now include `costs[]` with exact per-model token usage (input, output, cache read, cache creation) and API-reported cost. Extracted from `resultLine.modelUsage` in the `claude -p` NDJSON stream. `computeCosts()` prefers exact `cost_usd` over MODEL_PRICING estimates (~4x more accurate with prompt caching).
+- **Team sync via Supabase (optional)** — shared data store for eval results, retro snapshots, QA reports, ship logs, and Greptile triage across team members. All sync operations are non-fatal and non-blocking — skills never wait on the network. Offline queue with automatic retry (up to 5 attempts). Zero impact when not configured: without `.gstack-sync.json`, everything works locally as before. See `docs/designs/TEAM_COORDINATION_STORE.md` for architecture and setup.
+- **Supabase migration SQL** — 4 migration files in `supabase/migrations/` for teams, eval_runs, data tables (retros, QA, ships, Greptile), and eval costs. Row-level security policies ensure team members can only access their own team's data.
+- **Sync config + auth** — `.gstack-sync.json` for project-level config (Supabase URL, anon key, team slug). `~/.gstack/auth.json` for user-level tokens (keyed by Supabase URL for multi-team support). `GSTACK_SUPABASE_ACCESS_TOKEN` env var for CI/automation. Token refresh built in.
+- **`gstack sync` CLI** — `status`, `push`, `pull`, `drain`, `login`, `logout` subcommands for managing team sync.
+- **Universal eval format** — `StandardEvalResult` schema with validation, normalization, and bidirectional legacy conversion. Any language can produce JSON matching this format and push via `gstack eval push`.
+- **Unified eval CLI** — `gstack eval list|compare|summary|trend|push|cost|cache|watch` consolidating all eval tools into one entry point.
+- **Per-model cost tracking** — eval results now include `costs[]` with exact per-model token usage (input, output, cache read, cache creation) and API-reported cost. Extracted from `resultLine.modelUsage` in the `claude -p` NDJSON stream. `computeCosts()` prefers exact `cost_usd` over MODEL_PRICING estimates (~4x more accurate with prompt caching).
 - **LLM judge caching** — SHA-based caching for LLM-as-judge eval calls via `eval-cache.ts`. Cache keyed by `model:prompt`, so unchanged SKILL.md content skips API calls entirely. ~$0.18/run savings. Set `EVAL_CACHE=0` to force re-run.
 - **Dynamic model selection** — `EVAL_JUDGE_TIER` env var controls which Claude model runs judge evals (haiku/sonnet/opus, default: sonnet). `EVAL_TIER` pins the E2E test model via `--model` flag to `claude -p`.
-- **`bun run eval:trend`** — per-test pass rate tracking over last N runs. Classifies tests as stable-pass, stable-fail, flaky, improving, or degrading. Sparkline table with `--limit`, `--tier`, `--test` filters. Answers "is /retro getting more reliable?" instantly.
-- **CostEntry extended** — `cache_read_input_tokens`, `cache_creation_input_tokens`, `cost_usd` optional fields for accurate cache-aware cost reporting.
-- 22 new tests: 10 cache/tier integration (llm-judge.test.ts), 12 trend classification (lib-eval-trend.test.ts).
+- **`bun run eval:trend`** — per-test pass rate tracking over last N runs. Classifies tests as stable-pass, stable-fail, flaky, improving, or degrading. Sparkline table with `--limit`, `--tier`, `--test` filters.
+- **Shared utilities** — `lib/util.ts` extracted with `atomicWriteJSON`, `readJSON`, `getGitInfo`, `getRemoteSlug`, `listEvalFiles`, `loadEvalResults`, `formatTimestamp`, and path constants.
+- 52+ new tests across eval cache, cost, format, tier, trend, sync config, sync client, and LLM judge integration.
 
 ### Changed
 - `callJudge()` and `judge()` now return `{ result, meta }` with `JudgeMeta` (model, tokens, cached flag). `outcomeJudge()` retains simple return type for E2E callers.
-- `EvalCollector.finalize()` aggregates per-test `costs[]` into result-level cost breakdown.
+- `EvalCollector.finalize()` aggregates per-test `costs[]` into result-level cost breakdown and attempts team sync (non-blocking).
 - `cli-eval.ts` main block guarded with `import.meta.main` to prevent execution on import.
 - `eval:summary` now hints to run `eval:trend` when flaky tests are detected.
 - All 8 LLM eval test sites updated from hard-coded `cost_usd: 0.02` to real API-reported costs.
-- Regression test refactored from direct `Anthropic()` client to `callJudge()` (benefits from cache + tier).
 
 ## 0.3.9 — 2026-03-15
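The changelog entry above describes sync as non-fatal and non-blocking, with an offline queue retrying up to 5 attempts. A sketch of a drain loop under exactly those stated constraints; `QueueItem` and `drainQueue` are invented names, not the repo's actual API:

```ts
interface QueueItem {
  payload: unknown;
  attempts: number;
}

// Illustrative drain loop: failed pushes are re-queued until they have been
// tried 5 times, then dropped; errors never propagate to the caller.
async function drainQueue(
  queue: QueueItem[],
  push: (payload: unknown) => Promise<void>,
): Promise<QueueItem[]> {
  const remaining: QueueItem[] = [];
  for (const item of queue) {
    try {
      await push(item.payload);
    } catch {
      if (item.attempts + 1 < 5) {
        remaining.push({ payload: item.payload, attempts: item.attempts + 1 });
      }
      // at 5 attempts the item is dropped; sync is best-effort by design
    }
  }
  return remaining; // callers persist this as the new offline queue
}
```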
From 704fe34e98ea006008e79f89dd471ba90c0aa2b8 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Sun, 15 Mar 2026 17:06:51 -0500
Subject: [PATCH 8/8] docs: clean up sync example, add team sync section to README

Remove _comment hacks from JSON example file. Add short team sync
section to README explaining what it is, that it's optional, and how
to set it up.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .gstack-sync.json.example | 5 -----
 README.md                 | 6 ++++++
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.gstack-sync.json.example b/.gstack-sync.json.example
index 6dc6dce7..4803eb42 100644
--- a/.gstack-sync.json.example
+++ b/.gstack-sync.json.example
@@ -1,9 +1,4 @@
 {
-  "_comment": "OPTIONAL: Team sync configuration for shared eval/retro/QA data via Supabase.",
-  "_docs": "See docs/designs/TEAM_COORDINATION_STORE.md for full setup guide.",
-  "_what_you_get": "Shared eval dashboards, cross-team trend tracking, retro aggregation, QA report history. Without this file, everything works locally — sync is purely additive.",
-  "_setup": "1. Create a Supabase project. 2. Run supabase/migrations/*.sql in order. 3. Copy this file to .gstack-sync.json and fill in your values. 4. Set GSTACK_SUPABASE_ACCESS_TOKEN or run gstack sync login.",
-
   "supabase_url": "https://YOUR_PROJECT.supabase.co",
   "supabase_anon_key": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.YOUR_ANON_KEY_HERE",
   "team_slug": "your-team-name"
diff --git a/README.md b/README.md
index 27548066..9e23d11d 100644
--- a/README.md
+++ b/README.md
@@ -629,6 +629,12 @@ bun run eval:watch   # live dashboard during E2E runs
 ```
 
 E2E tests stream real-time progress, write machine-readable diagnostics, and persist partial results that survive kills. See CONTRIBUTING.md for the full eval infrastructure.
 
+### Team sync (optional)
+
+For teams, gstack can sync eval results, retro snapshots, QA reports, and ship logs to a shared Supabase store. Without this, everything works locally as before — sync is purely additive.
+
+To set up: copy `.gstack-sync.json.example` to `.gstack-sync.json`, create a Supabase project, run the migrations in `supabase/migrations/`, and fill in your credentials. See `docs/designs/TEAM_COORDINATION_STORE.md` for the full guide.
+
 ## License
 
 MIT