test(eval): paid periodic overlay-efficacy harness

`test/skill-e2e-overlay-harness.test.ts` iterates OVERLAY_FIXTURES, runs two arms per fixture (overlay-ON, overlay-OFF) at N=10 trials with bounded concurrency. Arms use SDK preset `claude_code` so both include the real Claude Code system prompt; overlay-ON appends the resolved overlay text. Saves per-trial raw event streams to `~/.gstack/projects/<slug>/transcripts/` for forensic recovery. Gated on `EVALS=1 && EVALS_TIER=periodic`. ~$3/run (40 trials).
2026-05-02 03:35:09 +02:00 · 2026-04-23 09:13:58 -07:00
parent 06a862faab
commit e432f4bd94
1 changed files with 320 additions and 0 deletions
@@ -0,0 +1,320 @@
+/**
+ * Overlay-efficacy harness (periodic tier, paid).
+ *
+ * Measures whether a model-specific overlay nudge actually changes model
+ * behavior when run through the real Claude Agent SDK — the harness
+ * Claude Code itself is built on. This complements test/skill-e2e-opus-47.test.ts
+ * which measures the same thing via `claude -p` subprocess (a different
+ * harness with different prompt composition).
+ *
+ * For each fixture in test/fixtures/overlay-nudges.ts, runs two arms at
+ * `fixture.trials` trials per arm with bounded concurrency:
+ *   - overlay-on:  SDK systemPrompt = `claude_code` preset + appended overlay
+ *   - overlay-off: SDK systemPrompt = `claude_code` preset alone
+ *
+ * Both arms have no CLAUDE.md, no skills directory, no setting-source
+ * inheritance (settingSources: []). Both carry the real Claude Code system
+ * prompt, so the only variable is the appended overlay text.
+ *
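+ * Budget roughly $3 per run at 40 trials (2 fixtures × 2 arms × 10 trials),
+ * per the introducing commit; the `total_cost_usd` line logged for each
+ * fixture is the authoritative figure.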
+ * Gated by EVALS=1 AND EVALS_TIER=periodic. Never runs under test:gate.
+ */
+
+import { describe, test, expect, afterAll } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  runAgentSdkTest,
+  resolveClaudeBinary,
+  type AgentSdkResult,
+  type SystemPromptOption,
+} from './helpers/agent-sdk-runner';
+import { EvalCollector, getProjectEvalDir } from './helpers/eval-store';
+import {
+  OVERLAY_FIXTURES,
+  type OverlayFixture,
+} from './fixtures/overlay-nudges';
+import { readOverlay } from '../scripts/resolvers/model-overlay';
+
+// Gate: the EVALS check is a truthiness test, so any non-empty value enables
+// it (including EVALS=0); EVALS_TIER must be exactly 'periodic'.
+const evalsEnabled = !!process.env.EVALS;
+const periodicTier = process.env.EVALS_TIER === 'periodic';
+const shouldRun = evalsEnabled && periodicTier;
+
+// When the gate is closed the suite is registered through describe.skip.
+const describeE2E = shouldRun ? describe : describe.skip;
+// EvalCollector's tier must be 'e2e' | 'llm-judge' per its type signature.
+// The existing paid evals violate this by passing descriptive names like
+// 'e2e-opus-47' — a pre-existing pattern that only works because bun-test
+// runs without strict typechecking. We stay conforming here.
+// The collector is only constructed when the suite will actually run.
+const evalCollector = shouldRun ? new EvalCollector('e2e') : null;
+
+// Parent of the directory containing this file.
+const REPO_ROOT = path.resolve(import.meta.dir, '..');
+// Run identifier built from the current UTC time. After stripping ':' and '.'
+// and swapping 'T' for '-', the first 15 characters are `YYYY-MM-DD-HHMM`,
+// i.e. minute precision: two runs started in the same minute share a runId.
+const runId = new Date()
+  .toISOString()
+  .replace(/[:.]/g, '')
+  .replace('T', '-')
+  .slice(0, 15);
+// Transcripts go to `transcripts/overlay-harness-<runId>/`, where
+// `transcripts` is a sibling of the directory returned by getProjectEvalDir().
+const TRANSCRIPTS_DIR = path.join(
+  path.dirname(getProjectEvalDir()),
+  'transcripts',
+  `overlay-harness-${runId}`,
+);
+
+// ---------------------------------------------------------------------------
+// Per-arm helpers
+// ---------------------------------------------------------------------------
+
+type Arm = 'overlay-on' | 'overlay-off';
+
+/**
+ * Creates a fresh, uniquely suffixed scratch directory under the OS temp dir
+ * for one trial and returns its path. The fixture id, arm and trial index are
+ * embedded in the directory name prefix.
+ */
+function mkTrialDir(fixtureId: string, arm: Arm, n: number): string {
+  const prefix = `overlay-harness-${fixtureId}-${arm}-${n}-`;
+  return fs.mkdtempSync(path.join(os.tmpdir(), prefix));
+}
+
+/**
+ * Writes one trial's raw SDK events to
+ * `<TRANSCRIPTS_DIR>/<fixtureId>-<arm>-<n>.jsonl`, one JSON document per line
+ * with a trailing newline. The transcripts directory is created on demand.
+ */
+function saveRawTranscript(
+  fixtureId: string,
+  arm: Arm,
+  n: number,
+  result: AgentSdkResult,
+): void {
+  // Ensure the destination exists before anything else, as the first trial to
+  // finish is the one that creates it.
+  fs.mkdirSync(TRANSCRIPTS_DIR, { recursive: true });
+  const target = path.join(TRANSCRIPTS_DIR, `${fixtureId}-${arm}-${n}.jsonl`);
+  const serialized = result.events
+    .map((event) => JSON.stringify(event))
+    .join('\n');
+  fs.writeFileSync(target, serialized + '\n');
+}
+
+/**
+ * Resolves the overlay text for a fixture. The overlay family is the
+ * fixture's overlay file name with its `.md` extension removed, and that
+ * family is handed to `readOverlay`.
+ *
+ * @throws Error when the resolver returns a falsy (empty) result.
+ */
+function overlayContentFor(fixture: OverlayFixture): string {
+  const family = path.basename(fixture.overlayPath, '.md');
+  const overlay = readOverlay(family);
+  if (overlay) {
+    return overlay;
+  }
+  throw new Error(
+    `fixture ${fixture.id}: resolver returned empty content for ${family}`,
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Per-fixture runner
+// ---------------------------------------------------------------------------
+
+/** Aggregated outcome of all trials of one arm, as filled in by runArm. */
+interface ArmResult {
+  // One `fixture.metric(...)` value per trial that completed.
+  metrics: number[];
+  // `costUsd` reported by the SDK runner for each completed trial.
+  costs: number[];
+  // `durationMs` reported by the SDK runner for each completed trial.
+  durations: number[];
+  // Number of trials that ended in a RateLimitExhaustedError.
+  rateLimitExhausted: number;
+  // Distinct `sdkClaudeCodeVersion` values seen across completed trials.
+  sdkClaudeCodeVersions: Set<string>;
+}
+
+/**
+ * Runs all trials of one arm for a fixture and aggregates the outcomes.
+ *
+ * Up to `fixture.concurrency` (default 3) workers pull trial indices from a
+ * shared counter until `fixture.trials` indices have been claimed. Each trial
+ * gets its own temp workspace, which is removed afterwards whether or not the
+ * trial succeeded.
+ *
+ * @param fixture       Fixture supplying the prompt, model, workspace setup and metric.
+ * @param arm           Arm label; used in temp-dir, transcript and test names.
+ * @param systemPrompt  System prompt option handed to the SDK runner unchanged.
+ * @param claudeBinary  Path to the `claude` executable; null leaves the SDK option undefined.
+ * @returns Metrics, costs and durations of completed trials (in completion
+ *          order, not trial order), the number of trials lost to rate-limit
+ *          exhaustion, and the set of reported Claude Code versions.
+ * @throws Rethrows any trial error that is not a `RateLimitExhaustedError`.
+ */
+async function runArm(
+  fixture: OverlayFixture,
+  arm: Arm,
+  systemPrompt: SystemPromptOption,
+  claudeBinary: string | null,
+): Promise<ArmResult> {
+  const result: ArmResult = {
+    metrics: [],
+    costs: [],
+    durations: [],
+    rateLimitExhausted: 0,
+    sdkClaudeCodeVersions: new Set(),
+  };
+
+  const trials = fixture.trials;
+  const concurrency = fixture.concurrency ?? 3;
+
+  // Simple bounded executor: `concurrency` workers each claim the next
+  // unclaimed trial index from a shared counter until none remain.
+  // The process-level semaphore in agent-sdk-runner.ts enforces the true cap.
+  let nextTrial = 0;
+  // Set when a trial hits a fatal (non-rate-limit) error. Promise.all rejects
+  // on the first failure but does not stop the other workers, so without this
+  // flag they would keep launching paid trials for a run that already failed.
+  let aborted = false;
+  const workers = Array.from({ length: concurrency }, async () => {
+    while (!aborted) {
+      const n = nextTrial++;
+      if (n >= trials) return;
+
+      const dir = mkTrialDir(fixture.id, arm, n);
+      try {
+        // Inside the try so the temp dir is still removed if setup throws.
+        fixture.setupWorkspace(dir);
+        const sdkResult = await runAgentSdkTest({
+          systemPrompt,
+          userPrompt: fixture.userPrompt,
+          workingDirectory: dir,
+          model: fixture.model,
+          maxTurns: 5,
+          allowedTools: ['Read', 'Glob', 'Grep', 'Bash'],
+          permissionMode: 'bypassPermissions',
+          settingSources: [],
+          env: { ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY ?? '' },
+          pathToClaudeCodeExecutable: claudeBinary ?? undefined,
+          testName: `${fixture.id}-${arm}-${n}`,
+          runId,
+          fixtureId: fixture.id,
+          onRetry: (_) => {
+            // Reset the workspace before the retry so partial Bash side effects
+            // from the failed attempt don't contaminate.
+            fs.rmSync(dir, { recursive: true, force: true });
+            fs.mkdirSync(dir, { recursive: true });
+            fixture.setupWorkspace(dir);
+          },
+        });
+
+        // Persist the raw event stream before scoring, so the transcript
+        // survives even if the metric function throws.
+        saveRawTranscript(fixture.id, arm, n, sdkResult);
+
+        const metric = fixture.metric(sdkResult);
+        result.metrics.push(metric);
+        result.costs.push(sdkResult.costUsd);
+        result.durations.push(sdkResult.durationMs);
+        result.sdkClaudeCodeVersions.add(sdkResult.sdkClaudeCodeVersion);
+
+        // `passed` records that the trial completed; the behavioral verdict
+        // is decided per fixture from the aggregated metrics.
+        evalCollector?.addTest({
+          name: `${fixture.id}-${arm}-${n}`,
+          suite: 'overlay-harness',
+          tier: 'e2e',
+          passed: true,
+          duration_ms: sdkResult.durationMs,
+          cost_usd: sdkResult.costUsd,
+          transcript: sdkResult.events,
+          prompt: fixture.userPrompt,
+          output: sdkResult.output,
+          turns_used: sdkResult.turnsUsed,
+          browse_errors: sdkResult.browseErrors,
+          exit_reason: sdkResult.exitReason,
+          model: sdkResult.model,
+          first_response_ms: sdkResult.firstResponseMs,
+          max_inter_turn_ms: sdkResult.maxInterTurnMs,
+        });
+      } catch (err) {
+        if (err instanceof Error && err.name === 'RateLimitExhaustedError') {
+          result.rateLimitExhausted++;
+          // Record a failed trial so the collector captures the attempt.
+          evalCollector?.addTest({
+            name: `${fixture.id}-${arm}-${n}`,
+            suite: 'overlay-harness',
+            tier: 'e2e',
+            passed: false,
+            duration_ms: 0,
+            cost_usd: 0,
+            exit_reason: 'rate_limit_exhausted',
+            error: err.message,
+          });
+        } else {
+          // Fatal: tell sibling workers to stop claiming new trials, then
+          // surface the error through Promise.all.
+          aborted = true;
+          throw err;
+        }
+      } finally {
+        try {
+          fs.rmSync(dir, { recursive: true, force: true });
+        } catch {
+          // best-effort cleanup
+        }
+      }
+    }
+  });
+
+  await Promise.all(workers);
+  return result;
+}
+
+/** Arithmetic mean of `xs`; an empty array yields 0 rather than NaN. */
+function mean(xs: number[]): number {
+  const count = xs.length;
+  return count === 0
+    ? 0
+    : xs.reduce((total, x) => total + x, 0) / count;
+}
+
+/** Sum of `xs`, accumulated left to right from 0; an empty array yields 0. */
+function sum(xs: number[]): number {
+  let total = 0;
+  xs.forEach((x) => {
+    total += x;
+  });
+  return total;
+}
+
+// ---------------------------------------------------------------------------
+// Test bodies
+// ---------------------------------------------------------------------------
+
+// One test per fixture: run both arms concurrently, log a summary, then
+// assert that enough trials completed and that the fixture's own `pass`
+// predicate accepts the two metric arrays.
+describeE2E('overlay efficacy harness (SDK)', () => {
+  // Resolve binary once
+  const claudeBinary = resolveClaudeBinary();
+
+  // Without a resolvable binary, register a single skipped placeholder so the
+  // reason shows up in the test report, and define no fixture tests.
+  if (!claudeBinary) {
+    test.skip(
+      'no local `claude` binary on PATH — cannot pin for harness parity',
+      () => {},
+    );
+    return;
+  }
+
+  for (const fixture of OVERLAY_FIXTURES) {
+    test(
+      `${fixture.id}: overlay-ON vs overlay-OFF, N=${fixture.trials} per arm`,
+      async () => {
+        // Fail before spending on trials if the overlay resolved to something
+        // implausibly short.
+        const overlayText = overlayContentFor(fixture);
+        expect(overlayText.length).toBeGreaterThan(100);
+
+        // Arm composition: both arms use the real Claude Code default system
+        // prompt (preset). Overlay-ON APPENDS the overlay text; overlay-OFF
+        // uses the default alone. This measures the overlay's marginal effect
+        // ON TOP of Claude Code's normal behavioral scaffolding — which is
+        // the only measurement that matches how real Claude Code composes
+        // overlays into its system prompt stack.
+        const [onArm, offArm] = await Promise.all([
+          runArm(
+            fixture,
+            'overlay-on',
+            { type: 'preset', preset: 'claude_code', append: overlayText },
+            claudeBinary,
+          ),
+          runArm(
+            fixture,
+            'overlay-off',
+            { type: 'preset', preset: 'claude_code' },
+            claudeBinary,
+          ),
+        ]);
+
+        const arms = {
+          overlay: onArm.metrics,
+          off: offArm.metrics,
+        };
+
+        const meanOn = mean(arms.overlay);
+        const meanOff = mean(arms.off);
+        const lift = meanOn - meanOff;
+        const floorHits = arms.overlay.filter((n) => n >= 2).length;
+        const totalCost = sum(onArm.costs) + sum(offArm.costs);
+        const versionSet = new Set([
+          ...onArm.sdkClaudeCodeVersions,
+          ...offArm.sdkClaudeCodeVersions,
+        ]);
+
+        // Loud output for the next person reading the eval JSON.
+        // floor_hits is reported against the number of overlay-ON trials that
+        // actually completed: trials lost to rate-limit exhaustion produce no
+        // metric and must not count in the denominator.
+        // eslint-disable-next-line no-console
+        console.log(
+          `\n[${fixture.id}]\n` +
+            `  binary: ${claudeBinary}\n` +
+            `  claude_code_version(s): ${[...versionSet].join(', ')}\n` +
+            `  overlay-ON  metrics: [${arms.overlay.join(', ')}]  mean=${meanOn.toFixed(2)}\n` +
+            `  overlay-OFF metrics: [${arms.off.join(', ')}]  mean=${meanOff.toFixed(2)}\n` +
+            `  lift: ${lift.toFixed(2)}  floor_hits(>=2): ${floorHits}/${arms.overlay.length}\n` +
+            `  rate_limit_exhausted: on=${onArm.rateLimitExhausted} off=${offArm.rateLimitExhausted}\n` +
+            `  total_cost_usd: $${totalCost.toFixed(4)}\n` +
+            `  transcripts: ${TRANSCRIPTS_DIR}`,
+        );
+
+        // Demand enough trials actually completed to make the assertion
+        // meaningful. If rate-limit exhaustion took out more than half of an
+        // arm, fail loudly rather than pass/fail on a fragment.
+        const minTrials = Math.ceil(fixture.trials / 2);
+        expect(arms.overlay.length).toBeGreaterThanOrEqual(minTrials);
+        expect(arms.off.length).toBeGreaterThanOrEqual(minTrials);
+
+        expect(fixture.pass(arms)).toBe(true);
+      },
+      30 * 60 * 1000, // 30 minute timeout per fixture
+    );
+  }
+});
+
+// Once every test has finished, finalize the collector (only present when
+// the suite actually ran) and print the path it returns.
+afterAll(async () => {
+  if (!evalCollector) return;
+
+  const resultsPath = await evalCollector.finalize();
+  // eslint-disable-next-line no-console
+  console.log(`\n[overlay-harness] eval results: ${resultsPath}`);
+});