feat: add eval format validation, tier selection, cost tracking

- lib/eval-format.ts: StandardEvalResult interfaces, validateEvalResult(),
  normalizeFromLegacy/normalizeToLegacy round-trip converters
- lib/eval-tier.ts: EvalTier type, resolveTier/resolveJudgeTier from env,
  tierToModel mapping, TIER_ALIASES (haiku→fast, sonnet→standard, opus→full)
- lib/eval-cost.ts: MODEL_PRICING (last verified 2025-05-01), computeCosts(),
  formatCostDashboard(), aggregateCosts(), fallback for unknown models
- 42 tests across 3 test files

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-07 14:06:42 +02:00 · 2026-03-15 09:39:18 -05:00
parent 7f7035f55a
commit 9bc6c9416f
6 changed files with 846 additions and 0 deletions
@@ -0,0 +1,158 @@
+/**
+ * Per-model cost tracking for eval runs.
+ *
+ * Computes cost breakdowns from CostEntry arrays and formats
+ * them as terminal tables. Supports aggregation across multiple runs.
+ */
+
+import type { CostEntry, StandardEvalResult } from './eval-format';
+
+// --- Interfaces ---
+
+/**
+ * Usage and estimated spend for one model, as produced by computeCosts().
+ */
+export interface CostSummary {
+  /** Model ID exactly as it appeared on the source CostEntry records. */
+  model: string;
+  /** Sum of `calls` over every entry recorded for this model. */
+  calls: number;
+  /** Sum of `input_tokens` over every entry recorded for this model. */
+  input_tokens: number;
+  /** Sum of `output_tokens` over every entry recorded for this model. */
+  output_tokens: number;
+  /** Token totals priced at the model's per-million rates; not rounded. */
+  estimated_cost_usd: number;
+}
+
+/**
+ * Result of computeCosts(): per-model rows plus overall totals.
+ */
+export interface CostDashboard {
+  /** One row per distinct model, sorted by estimated cost, highest first. */
+  entries: CostSummary[];
+  /** Sum of every row's estimated cost, rounded to 6 decimal places. */
+  total: number;
+  /** The same token volume priced at 'claude-haiku-4-5' rates, rounded to 6 decimal places. */
+  at_fast_tier: number;
+  /** The same token volume priced at 'claude-opus-4-6' rates, rounded to 6 decimal places. */
+  at_full_tier: number;
+}
+
+// --- Pricing ---
+
+/**
+ * Per-million-token pricing for Claude models, keyed by model ID.
+ * `input` is the rate applied to input tokens and `output` the rate applied
+ * to output tokens. Model IDs missing from this table are priced with
+ * FALLBACK_PRICING.
+ *
+ * Last verified: 2025-05-01
+ *
+ * NOTE(review): the three 4.x entries carry exactly the same rates as the
+ * legacy 3.x IDs below, and the verification date above looks older than
+ * those model IDs. Re-check them against the current published price list.
+ */
+export const MODEL_PRICING: Record<string, { input: number; output: number }> = {
+  'claude-opus-4-6':       { input: 15.00, output: 75.00 },
+  'claude-sonnet-4-6':     { input: 3.00,  output: 15.00 },
+  'claude-haiku-4-5':      { input: 0.80,  output: 4.00  },
+  // Legacy model IDs
+  'claude-3-5-sonnet-20241022': { input: 3.00, output: 15.00 },
+  'claude-3-5-haiku-20241022':  { input: 0.80, output: 4.00  },
+  'claude-3-opus-20240229':     { input: 15.00, output: 75.00 },
+};
+
+/** Fallback pricing for unknown models (use sonnet pricing as a safe middle ground). */
+const FALLBACK_PRICING = { input: 3.00, output: 15.00 };
+
+// --- Computation ---
+
+/**
+ * Look up the per-million-token rates for a model ID.
+ *
+ * Only the table's own keys count as known models. A plain index lookup would
+ * also find members inherited from Object.prototype (for example a model
+ * named "constructor" or "toString"); those are truthy but carry no rates,
+ * so every cost computed from them would come out as NaN.
+ *
+ * @param model Model ID as recorded on a CostEntry.
+ * @returns The matching MODEL_PRICING row, or FALLBACK_PRICING if there is none.
+ */
+function getPricing(model: string): { input: number; output: number } {
+  const known = Object.prototype.hasOwnProperty.call(MODEL_PRICING, model)
+    ? MODEL_PRICING[model]
+    : undefined;
+  return known ?? FALLBACK_PRICING;
+}
+
+/**
+ * Build a cost dashboard from raw usage records.
+ *
+ * Records are grouped by model ID (calls and token counts are summed), each
+ * group is priced at its own per-million rates, and the same token volume is
+ * also priced at the haiku and opus rates for a what-if comparison.
+ *
+ * @param costs Usage records; several records may name the same model.
+ * @returns Rows sorted by estimated cost (highest first) plus totals rounded
+ *          to 6 decimal places. Per-row costs are left unrounded.
+ */
+export function computeCosts(costs: CostEntry[]): CostDashboard {
+  // Price a token volume at the given per-million rates.
+  const priceOf = (
+    usage: { input_tokens: number; output_tokens: number },
+    rates: { input: number; output: number },
+  ): number =>
+    (usage.input_tokens / 1_000_000) * rates.input +
+    (usage.output_tokens / 1_000_000) * rates.output;
+
+  // Round a dollar amount to 6 decimal places.
+  const roundToMicros = (usd: number): number => Math.round(usd * 1_000_000) / 1_000_000;
+
+  // Pass 1: fold the records into one running summary per model.
+  const summaries = new Map<string, CostSummary>();
+  for (const { model, calls, input_tokens, output_tokens } of costs) {
+    const bucket = summaries.get(model);
+    if (!bucket) {
+      summaries.set(model, { model, calls, input_tokens, output_tokens, estimated_cost_usd: 0 });
+      continue;
+    }
+    bucket.calls += calls;
+    bucket.input_tokens += input_tokens;
+    bucket.output_tokens += output_tokens;
+  }
+
+  // Pass 2: price every summary at its own rates and at the two reference tiers.
+  const haikuRates = MODEL_PRICING['claude-haiku-4-5'] || FALLBACK_PRICING;
+  const opusRates = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING;
+  let actualTotal = 0;
+  let fastTotal = 0;
+  let fullTotal = 0;
+
+  for (const row of summaries.values()) {
+    row.estimated_cost_usd = priceOf(row, getPricing(row.model));
+    actualTotal += row.estimated_cost_usd;
+    fastTotal += priceOf(row, haikuRates);
+    fullTotal += priceOf(row, opusRates);
+  }
+
+  // Most expensive model first.
+  const entries = Array.from(summaries.values());
+  entries.sort((left, right) => right.estimated_cost_usd - left.estimated_cost_usd);
+
+  return {
+    entries,
+    total: roundToMicros(actualTotal),
+    at_fast_tier: roundToMicros(fastTotal),
+    at_full_tier: roundToMicros(fullTotal),
+  };
+}
+
+/**
+ * Format a CostDashboard as a terminal table.
+ *
+ * Layout: a two-space indent, then a 32-character model column, an
+ * 8-character calls column, two 12-character token columns and the cost.
+ * Model names longer than 30 characters are cut to 27 characters plus "...".
+ *
+ * @param dashboard Dashboard as returned by computeCosts() / aggregateCosts().
+ * @returns The table as a single newline-joined string, with a leading and a
+ *          trailing blank line.
+ */
+export function formatCostDashboard(dashboard: CostDashboard): string {
+  const lines: string[] = [];
+  lines.push('');
+  lines.push('Cost Breakdown');
+  lines.push('═'.repeat(75));
+  lines.push(
+    '  ' +
+    'Model'.padEnd(32) +
+    'Calls'.padEnd(8) +
+    'In Tokens'.padEnd(12) +
+    'Out Tokens'.padEnd(12) +
+    'Cost'
+  );
+  lines.push('─'.repeat(75));
+
+  for (const entry of dashboard.entries) {
+    // Truncate first, then pad. The truncated form is only 30 characters, so
+    // it still has to be padded out to the 32-character column; otherwise the
+    // rest of the row shifts two columns left of the header.
+    const label = entry.model.length > 30 ? entry.model.slice(0, 27) + '...' : entry.model;
+    lines.push(
+      `  ${label.padEnd(32)}` +
+      `${entry.calls}`.padEnd(8) +
+      `${entry.input_tokens.toLocaleString()}`.padEnd(12) +
+      `${entry.output_tokens.toLocaleString()}`.padEnd(12) +
+      `$${entry.estimated_cost_usd.toFixed(4)}`
+    );
+  }
+
+  lines.push('─'.repeat(75));
+  lines.push(`  Total: $${dashboard.total.toFixed(4)}`);
+  lines.push(`  At fast tier (Haiku):  $${dashboard.at_fast_tier.toFixed(4)}`);
+  lines.push(`  At full tier (Opus):   $${dashboard.at_full_tier.toFixed(4)}`);
+  lines.push('');
+
+  return lines.join('\n');
+}
+
+/**
+ * Combine the cost records of several eval runs into one dashboard.
+ *
+ * Every run's `costs` array is concatenated in run order (runs without one
+ * are skipped) and the merged list is handed to computeCosts().
+ *
+ * @param results Eval runs whose costs should be summed.
+ * @returns A single dashboard covering all runs.
+ */
+export function aggregateCosts(results: StandardEvalResult[]): CostDashboard {
+  const merged: CostEntry[] = [];
+  for (const { costs } of results) {
+    if (!costs) continue;
+    for (const record of costs) {
+      merged.push(record);
+    }
+  }
+  return computeCosts(merged);
+}
@@ -0,0 +1,229 @@
+/**
+ * Standard eval result format — validation and normalization.
+ *
+ * Superset of the legacy EvalResult from test/helpers/eval-store.ts.
+ * Any language can produce a JSON file matching StandardEvalResult and
+ * push it through `gstack eval push`.
+ */
+
+import type { EvalResult, EvalTestEntry } from '../test/helpers/eval-store';
+
+// --- Interfaces ---
+
+/**
+ * One usage record for a model: how many calls were made and how many
+ * tokens went in and out. Carried in StandardEvalResult.costs.
+ */
+export interface CostEntry {
+  /** Model ID the usage is attributed to. */
+  model: string;
+  /** Number of calls covered by this record. */
+  calls: number;
+  /** Input tokens covered by this record. */
+  input_tokens: number;
+  /** Output tokens covered by this record. */
+  output_tokens: number;
+}
+
+/**
+ * One element of StandardEvalResult.failures.
+ * NOTE(review): nothing in this file reads or validates these fields; the
+ * meaning of `category` is not defined here — confirm with the producers.
+ */
+export interface FailureEntry {
+  test_name: string;
+  error: string;
+  category?: string;
+}
+
+/**
+ * One element of StandardEvalResult.comparison.
+ * NOTE(review): nothing in this file reads or validates these fields; the
+ * scale of `score` is not defined here — confirm with the producers.
+ */
+export interface ComparisonEntry {
+  label: string;
+  model: string;
+  score: number;
+  cost_usd: number;
+}
+
+/**
+ * Result of a single test inside a StandardEvalResult.
+ *
+ * validateEvalResult() only checks `name` (string) and `passed` (boolean);
+ * the other fields declared as required here are not verified at runtime.
+ */
+export interface StandardTestEntry {
+  name: string;
+  suite: string;
+  tier: string;
+  passed: boolean;
+  duration_ms: number;
+  cost_usd: number;
+  // Not part of the legacy entry conversion: the legacy converters in this
+  // file neither read nor write `output`.
+  output?: Record<string, unknown>;
+
+  // Optional fields from legacy format
+  turns_used?: number;
+  exit_reason?: string;
+  detection_rate?: number;
+  false_positives?: number;
+  evidence_quality?: number;
+  detected_bugs?: string[];
+  missed_bugs?: string[];
+  judge_scores?: Record<string, number>;
+  judge_reasoning?: string;
+  error?: string;
+}
+
+/**
+ * Standard eval run format: a superset of the legacy EvalResult.
+ *
+ * Fields from `schema_version` through `all_results` are required by the
+ * type. Note that validateEvalResult() checks all of them except `hostname`.
+ */
+export interface StandardEvalResult {
+  schema_version: number;
+  version: string;
+  label?: string;
+  /** Corresponds to the legacy `branch` field. */
+  git_branch: string;
+  git_sha: string;
+  timestamp: string;
+  hostname: string;
+  tier: string;
+  /** Corresponds to the legacy `total_tests` field. */
+  total: number;
+  passed: number;
+  failed: number;
+  total_cost_usd: number;
+  /** Legacy `total_duration_ms` divided by 1000 (rounded when converted from legacy). */
+  duration_seconds: number;
+  /** Corresponds to the legacy `tests` field. */
+  all_results: StandardTestEntry[];
+  // The optional fields below (except `_partial`) have no legacy counterpart
+  // and are dropped by normalizeToLegacy().
+  prompt_sha?: string;
+  by_category?: Record<string, { passed: number; failed: number }>;
+  costs?: CostEntry[];
+  comparison?: ComparisonEntry[];
+  failures?: FailureEntry[];
+  _partial?: boolean;
+}
+
+// --- Validation ---
+
+/**
+ * Top-level fields that must be present, paired with the `typeof` result
+ * each one must produce.
+ * NOTE(review): `hostname` is declared as required on StandardEvalResult but
+ * is not listed here, so it is never validated — confirm that is intended.
+ */
+const REQUIRED_FIELDS: Array<[string, string]> = [
+  ['schema_version', 'number'],
+  ['version', 'string'],
+  ['git_branch', 'string'],
+  ['git_sha', 'string'],
+  ['timestamp', 'string'],
+  ['tier', 'string'],
+  ['total', 'number'],
+  ['passed', 'number'],
+  ['failed', 'number'],
+  ['total_cost_usd', 'number'],
+  ['duration_seconds', 'number'],
+  ['all_results', 'object'], // arrays report typeof 'object'; the array check follows separately
+];
+
+/**
+ * Check whether an arbitrary value has the shape of a StandardEvalResult.
+ *
+ * The check is deliberately shallow: required top-level fields must exist
+ * with the right primitive type, `all_results` must be an array, and each of
+ * its elements must be an object with a string `name` and a boolean `passed`.
+ *
+ * @param data Any value, typically freshly parsed JSON.
+ * @returns `{ valid: true, errors: [] }` when every check passes, otherwise
+ *          `{ valid: false, errors }` with one message per failed check.
+ */
+export function validateEvalResult(data: unknown): { valid: boolean; errors: string[] } {
+  if (typeof data !== 'object' || data === null) {
+    return { valid: false, errors: ['Input must be a non-null object'] };
+  }
+
+  const record = data as Record<string, unknown>;
+  const problems: string[] = [];
+
+  // Presence and primitive type of every required top-level field.
+  for (const [key, wanted] of REQUIRED_FIELDS) {
+    if (!(key in record)) {
+      problems.push(`Missing required field: ${key}`);
+      continue;
+    }
+    const actual = typeof record[key];
+    if (actual !== wanted) {
+      problems.push(`Field "${key}" must be ${wanted}, got ${actual}`);
+    }
+  }
+
+  const results = record.all_results;
+  if (!Array.isArray(results)) {
+    // Only complain about the array shape when the field exists at all;
+    // a missing field was already reported above.
+    if ('all_results' in record) {
+      problems.push('Field "all_results" must be an array');
+    }
+  } else {
+    // Minimal per-entry validation: just `name` and `passed`.
+    for (const [index, item] of results.entries()) {
+      if (item === null || typeof item !== 'object') {
+        problems.push(`all_results[${index}] must be an object`);
+        continue;
+      }
+      const fields = item as Record<string, unknown>;
+      if (typeof fields.name !== 'string') {
+        problems.push(`all_results[${index}].name must be a string`);
+      }
+      if (typeof fields.passed !== 'boolean') {
+        problems.push(`all_results[${index}].passed must be a boolean`);
+      }
+    }
+  }
+
+  return { valid: problems.length === 0, errors: problems };
+}
+
+// --- Normalization ---
+
+/**
+ * Convert a legacy EvalResult into the StandardEvalResult shape.
+ *
+ * Renames `branch` → `git_branch`, `total_tests` → `total`, `tests` →
+ * `all_results`, and turns `total_duration_ms` into whole seconds (rounded,
+ * so sub-second precision is lost). All other listed fields are copied as-is.
+ *
+ * @param legacy Result in the legacy format.
+ * @returns The equivalent standard-format result.
+ */
+export function normalizeFromLegacy(legacy: EvalResult): StandardEvalResult {
+  const {
+    schema_version,
+    version,
+    git_sha,
+    timestamp,
+    hostname,
+    tier,
+    passed,
+    failed,
+    total_cost_usd,
+    _partial,
+  } = legacy;
+  const wholeSeconds = Math.round(legacy.total_duration_ms / 1000);
+  const convertedTests = legacy.tests.map(legacyTestToStandard);
+
+  return {
+    schema_version,
+    version,
+    git_branch: legacy.branch,
+    git_sha,
+    timestamp,
+    hostname,
+    tier,
+    total: legacy.total_tests,
+    passed,
+    failed,
+    total_cost_usd,
+    duration_seconds: wholeSeconds,
+    all_results: convertedTests,
+    _partial,
+  };
+}
+
+/**
+ * Convert one legacy test entry into a StandardTestEntry.
+ *
+ * The six core fields are always copied. Optional fields appear on the
+ * result only when the source has them: scalar fields when they are not
+ * `undefined`, and `detected_bugs` / `missed_bugs` / `judge_scores` when
+ * they are truthy.
+ */
+function legacyTestToStandard(t: EvalTestEntry): StandardTestEntry {
+  return {
+    name: t.name,
+    suite: t.suite,
+    tier: t.tier,
+    passed: t.passed,
+    duration_ms: t.duration_ms,
+    cost_usd: t.cost_usd,
+    ...(t.turns_used !== undefined ? { turns_used: t.turns_used } : {}),
+    ...(t.exit_reason !== undefined ? { exit_reason: t.exit_reason } : {}),
+    ...(t.detection_rate !== undefined ? { detection_rate: t.detection_rate } : {}),
+    ...(t.false_positives !== undefined ? { false_positives: t.false_positives } : {}),
+    ...(t.evidence_quality !== undefined ? { evidence_quality: t.evidence_quality } : {}),
+    ...(t.detected_bugs ? { detected_bugs: t.detected_bugs } : {}),
+    ...(t.missed_bugs ? { missed_bugs: t.missed_bugs } : {}),
+    ...(t.judge_scores ? { judge_scores: t.judge_scores } : {}),
+    ...(t.judge_reasoning !== undefined ? { judge_reasoning: t.judge_reasoning } : {}),
+    ...(t.error !== undefined ? { error: t.error } : {}),
+  };
+}
+
+/**
+ * Convert a StandardEvalResult back into the legacy EvalResult shape so the
+ * existing compare/list tooling can consume it.
+ *
+ * Renames `git_branch` → `branch`, `total` → `total_tests`, `all_results` →
+ * `tests`, and expresses the duration in milliseconds. Standard-only fields
+ * (label, prompt_sha, by_category, costs, comparison, failures) are dropped.
+ * The tier string is cast to the legacy union without being checked.
+ *
+ * @param standard Result in the standard format.
+ * @returns The equivalent legacy-format result.
+ */
+export function normalizeToLegacy(standard: StandardEvalResult): EvalResult {
+  const {
+    schema_version,
+    version,
+    git_sha,
+    timestamp,
+    hostname,
+    passed,
+    failed,
+    total_cost_usd,
+    _partial,
+  } = standard;
+  const legacyTier = standard.tier as 'e2e' | 'llm-judge';
+  const durationMs = standard.duration_seconds * 1000;
+  const legacyTests = standard.all_results.map(standardTestToLegacy);
+
+  return {
+    schema_version,
+    version,
+    branch: standard.git_branch,
+    git_sha,
+    timestamp,
+    hostname,
+    tier: legacyTier,
+    total_tests: standard.total,
+    passed,
+    failed,
+    total_cost_usd,
+    total_duration_ms: durationMs,
+    tests: legacyTests,
+    _partial,
+  };
+}
+
+/**
+ * Convert one StandardTestEntry into a legacy test entry.
+ *
+ * The six core fields are always copied (the tier string is cast to the
+ * legacy union without being checked). Optional fields appear on the result
+ * only when the source has them: scalar fields when they are not
+ * `undefined`, and `detected_bugs` / `missed_bugs` / `judge_scores` when
+ * they are truthy. `output` is not carried over.
+ */
+function standardTestToLegacy(t: StandardTestEntry): EvalTestEntry {
+  return {
+    name: t.name,
+    suite: t.suite,
+    tier: t.tier as 'e2e' | 'llm-judge',
+    passed: t.passed,
+    duration_ms: t.duration_ms,
+    cost_usd: t.cost_usd,
+    ...(t.turns_used !== undefined ? { turns_used: t.turns_used } : {}),
+    ...(t.exit_reason !== undefined ? { exit_reason: t.exit_reason } : {}),
+    ...(t.detection_rate !== undefined ? { detection_rate: t.detection_rate } : {}),
+    ...(t.false_positives !== undefined ? { false_positives: t.false_positives } : {}),
+    ...(t.evidence_quality !== undefined ? { evidence_quality: t.evidence_quality } : {}),
+    ...(t.detected_bugs ? { detected_bugs: t.detected_bugs } : {}),
+    ...(t.missed_bugs ? { missed_bugs: t.missed_bugs } : {}),
+    ...(t.judge_scores ? { judge_scores: t.judge_scores } : {}),
+    ...(t.judge_reasoning !== undefined ? { judge_reasoning: t.judge_reasoning } : {}),
+    ...(t.error !== undefined ? { error: t.error } : {}),
+  };
+}
@@ -0,0 +1,51 @@
+/**
+ * Model tier selection for evals.
+ *
+ * Maps tier names to Claude models. Supports env var overrides
+ * for EVAL_TIER and EVAL_JUDGE_TIER.
+ */
+
+/** Names of the model tiers an eval can run at. */
+export type EvalTier = 'fast' | 'standard' | 'full';
+
+/** Model-family names accepted as alternative spellings of a tier. */
+export const TIER_ALIASES: Record<string, EvalTier> = {
+  haiku: 'fast',
+  sonnet: 'standard',
+  opus: 'full',
+};
+
+/** Claude model ID used for each tier. */
+const TIER_TO_MODEL: Record<EvalTier, string> = {
+  fast: 'claude-haiku-4-5',
+  standard: 'claude-sonnet-4-6',
+  full: 'claude-opus-4-6',
+};
+
+/**
+ * Parse a raw env value into a tier, or `undefined` if it is unset, blank or
+ * unrecognised. Matching is case-insensitive and ignores surrounding spaces.
+ *
+ * Aliases are matched against TIER_ALIASES' own keys only: the `in` operator
+ * also sees inherited members, so values such as "constructor" or
+ * "__proto__" would otherwise be accepted and yield a non-tier object.
+ */
+function parseTier(value: string | undefined): EvalTier | undefined {
+  const key = value?.toLowerCase().trim();
+  if (!key) return undefined;
+  if (Object.prototype.hasOwnProperty.call(TIER_ALIASES, key)) return TIER_ALIASES[key];
+  if (key === 'fast' || key === 'standard' || key === 'full') return key;
+  return undefined;
+}
+
+/**
+ * Resolve the eval tier from EVAL_TIER env var.
+ * Supports both tier names ('fast', 'standard', 'full') and
+ * model aliases ('haiku', 'sonnet', 'opus').
+ * Defaults to 'standard' when the variable is unset, blank or unrecognised.
+ */
+export function resolveTier(): EvalTier {
+  return parseTier(process.env.EVAL_TIER) ?? 'standard';
+}
+
+/**
+ * Resolve the judge tier from EVAL_JUDGE_TIER env var.
+ * Accepts the same values as EVAL_TIER. Falls back to resolveTier() when the
+ * variable is unset, blank or unrecognised.
+ */
+export function resolveJudgeTier(): EvalTier {
+  return parseTier(process.env.EVAL_JUDGE_TIER) ?? resolveTier();
+}
+
+/** Map a tier to its Claude model ID. */
+export function tierToModel(tier: EvalTier): string {
+  return TIER_TO_MODEL[tier];
+}