gstack/test/skill-e2e-overlay-harness.test.ts
Garry Tan 76829b76dc feat(eval): extend OverlayFixture with allowedTools, maxTurns, direction
A per-fixture tool allowlist unblocks measuring nudges that need Edit/Write
(e.g. the literal-interpretation fixture's 'fix the failing tests' prompt
needs write access). A per-fixture maxTurns lets harder prompts run longer
without changing the default. `direction` is cosmetic metadata used to label
test output.

Also adds reusable predicates and metrics:
- lowerIsBetter20Pct / higherIsBetter20Pct — 20% lift threshold vs baseline
- bashToolCallCount — count of Bash tool_use across the session
- turnsToCompletion — SDK-reported num_turns at result
- uniqueFilesEdited — Edit/Write/MultiEdit file_path set size

test/skill-e2e-overlay-harness.test.ts now threads fixture.allowedTools
and fixture.maxTurns through runArm.
2026-04-23 11:07:37 -07:00
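
For orientation, a fixture exercising the new fields might look roughly like the sketch below. The field names mirror how the harness consumes them (id, overlayPath, model, userPrompt, trials, maxTurns, allowedTools, direction, setupWorkspace, metric, pass), but the concrete values are placeholders, and the assumption that the shared helpers are exported from the fixtures module is mine, not taken from the diff.

import {
  bashToolCallCount,
  lowerIsBetter20Pct,
  type OverlayFixture,
} from './fixtures/overlay-nudges'; // assumed export location for the new helpers

// Illustrative sketch only; real fixtures live in test/fixtures/overlay-nudges.ts.
const exampleFixture: OverlayFixture = {
  id: 'literal-interpretation',
  overlayPath: 'overlays/example-family.md', // basename (minus .md) is passed to readOverlay()
  model: 'model-under-test',                 // placeholder model id
  userPrompt: 'fix the failing tests',
  trials: 10,
  maxTurns: 12,                              // override the harness default of 5
  allowedTools: ['Read', 'Glob', 'Grep', 'Bash', 'Edit', 'Write'],
  direction: 'lower-is-better',              // cosmetic label for test output
  setupWorkspace: (dir) => {
    // seed `dir` with a small repo containing a deliberately failing test
  },
  metric: bashToolCallCount,                 // Bash tool_use count across the session
  pass: lowerIsBetter20Pct,                  // overlay-arm mean must sit >=20% below the baseline mean
};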

/**
* Overlay-efficacy harness (periodic tier, paid).
*
* Measures whether a model-specific overlay nudge actually changes model
* behavior when run through the real Claude Agent SDK — the harness
* Claude Code itself is built on. This complements test/skill-e2e-opus-47.test.ts
* which measures the same thing via `claude -p` subprocess (a different
* harness with different prompt composition).
*
* For each fixture in test/fixtures/overlay-nudges.ts, runs two arms at
* `fixture.trials` trials per arm with bounded concurrency:
* - overlay-on: SDK systemPrompt = Claude Code preset with the resolved
*   overlay content appended
* - overlay-off: SDK systemPrompt = Claude Code preset alone
*
* Both arms run with no CLAUDE.md, no skills directory, and no setting-source
* inheritance (settingSources: []), so the only variable between the arms is
* the appended overlay text.
*
* Budget ~$20 per run at 40 trials (2 fixtures × 2 arms × 10 trials).
* Gated by EVALS=1 AND EVALS_TIER=periodic. Never runs under test:gate.
*/
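// Typical invocation (assumed; adjust to this repo's actual scripts):
//   ANTHROPIC_API_KEY=... EVALS=1 EVALS_TIER=periodic \
//     bun test test/skill-e2e-overlay-harness.test.ts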
import { describe, test, expect, afterAll } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import {
runAgentSdkTest,
resolveClaudeBinary,
type AgentSdkResult,
type SystemPromptOption,
} from './helpers/agent-sdk-runner';
import { EvalCollector, getProjectEvalDir } from './helpers/eval-store';
import {
OVERLAY_FIXTURES,
type OverlayFixture,
} from './fixtures/overlay-nudges';
import { readOverlay } from '../scripts/resolvers/model-overlay';
const evalsEnabled = !!process.env.EVALS;
const periodicTier = process.env.EVALS_TIER === 'periodic';
const shouldRun = evalsEnabled && periodicTier;
const describeE2E = shouldRun ? describe : describe.skip;
// EvalCollector's tier must be 'e2e' | 'llm-judge' per its type signature.
// The existing paid evals violate this by passing descriptive names like
// 'e2e-opus-47' — a pre-existing pattern that only works because bun-test
// runs without strict typechecking. We stay conforming here.
const evalCollector = shouldRun ? new EvalCollector('e2e') : null;
const REPO_ROOT = path.resolve(import.meta.dir, '..');
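// runId: UTC timestamp trimmed to the minute, e.g. '2026-04-23-1807'; it
// groups the transcripts and eval records from a single harness run.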
const runId = new Date()
.toISOString()
.replace(/[:.]/g, '')
.replace('T', '-')
.slice(0, 15);
const TRANSCRIPTS_DIR = path.join(
path.dirname(getProjectEvalDir()),
'transcripts',
`overlay-harness-${runId}`,
);
// ---------------------------------------------------------------------------
// Per-arm helpers
// ---------------------------------------------------------------------------
type Arm = 'overlay-on' | 'overlay-off';
function mkTrialDir(fixtureId: string, arm: Arm, n: number): string {
const dir = fs.mkdtempSync(
path.join(os.tmpdir(), `overlay-harness-${fixtureId}-${arm}-${n}-`),
);
return dir;
}
function saveRawTranscript(
fixtureId: string,
arm: Arm,
n: number,
result: AgentSdkResult,
): void {
fs.mkdirSync(TRANSCRIPTS_DIR, { recursive: true });
const out = path.join(TRANSCRIPTS_DIR, `${fixtureId}-${arm}-${n}.jsonl`);
const lines = result.events.map((e) => JSON.stringify(e));
fs.writeFileSync(out, lines.join('\n') + '\n');
}
function overlayContentFor(fixture: OverlayFixture): string {
const family = path.basename(fixture.overlayPath, '.md');
const resolved = readOverlay(family);
if (!resolved) {
throw new Error(
`fixture ${fixture.id}: resolver returned empty content for ${family}`,
);
}
return resolved;
}
// ---------------------------------------------------------------------------
// Per-fixture runner
// ---------------------------------------------------------------------------
interface ArmResult {
metrics: number[];
costs: number[];
durations: number[];
rateLimitExhausted: number;
sdkClaudeCodeVersions: Set<string>;
}
async function runArm(
fixture: OverlayFixture,
arm: Arm,
systemPrompt: SystemPromptOption,
claudeBinary: string | null,
): Promise<ArmResult> {
const result: ArmResult = {
metrics: [],
costs: [],
durations: [],
rateLimitExhausted: 0,
sdkClaudeCodeVersions: new Set(),
};
const trials = fixture.trials;
const concurrency = fixture.concurrency ?? 3;
// Simple bounded executor: `concurrency` workers each pull the next trial
// index until all trials are consumed. The process-level semaphore in
// agent-sdk-runner.ts enforces the true cap.
let nextTrial = 0;
const workers = Array.from({ length: concurrency }, async () => {
while (true) {
const n = nextTrial++;
if (n >= trials) return;
const dir = mkTrialDir(fixture.id, arm, n);
fixture.setupWorkspace(dir);
try {
const sdkResult = await runAgentSdkTest({
systemPrompt,
userPrompt: fixture.userPrompt,
workingDirectory: dir,
model: fixture.model,
maxTurns: fixture.maxTurns ?? 5,
allowedTools: fixture.allowedTools ?? ['Read', 'Glob', 'Grep', 'Bash'],
permissionMode: 'bypassPermissions',
settingSources: [],
env: { ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY ?? '' },
pathToClaudeCodeExecutable: claudeBinary ?? undefined,
testName: `${fixture.id}-${arm}-${n}`,
runId,
fixtureId: fixture.id,
onRetry: (_) => {
// Reset the workspace before the retry so partial Bash side effects
// from the failed attempt don't contaminate the next attempt.
fs.rmSync(dir, { recursive: true, force: true });
fs.mkdirSync(dir, { recursive: true });
fixture.setupWorkspace(dir);
},
});
saveRawTranscript(fixture.id, arm, n, sdkResult);
const metric = fixture.metric(sdkResult);
result.metrics.push(metric);
result.costs.push(sdkResult.costUsd);
result.durations.push(sdkResult.durationMs);
result.sdkClaudeCodeVersions.add(sdkResult.sdkClaudeCodeVersion);
evalCollector?.addTest({
name: `${fixture.id}-${arm}-${n}`,
suite: 'overlay-harness',
tier: 'e2e',
passed: true,
duration_ms: sdkResult.durationMs,
cost_usd: sdkResult.costUsd,
transcript: sdkResult.events,
prompt: fixture.userPrompt,
output: sdkResult.output,
turns_used: sdkResult.turnsUsed,
browse_errors: sdkResult.browseErrors,
exit_reason: sdkResult.exitReason,
model: sdkResult.model,
first_response_ms: sdkResult.firstResponseMs,
max_inter_turn_ms: sdkResult.maxInterTurnMs,
});
} catch (err) {
if (err instanceof Error && err.name === 'RateLimitExhaustedError') {
result.rateLimitExhausted++;
// Record a failed trial so the collector captures the attempt.
evalCollector?.addTest({
name: `${fixture.id}-${arm}-${n}`,
suite: 'overlay-harness',
tier: 'e2e',
passed: false,
duration_ms: 0,
cost_usd: 0,
exit_reason: 'rate_limit_exhausted',
error: err.message,
});
} else {
throw err;
}
} finally {
try {
fs.rmSync(dir, { recursive: true, force: true });
} catch {
// best-effort cleanup
}
}
}
});
await Promise.all(workers);
return result;
}
function mean(xs: number[]): number {
if (xs.length === 0) return 0;
return xs.reduce((a, b) => a + b, 0) / xs.length;
}
function sum(xs: number[]): number {
return xs.reduce((a, b) => a + b, 0);
}
// ---------------------------------------------------------------------------
// Test bodies
// ---------------------------------------------------------------------------
describeE2E('overlay efficacy harness (SDK)', () => {
// Resolve binary once
const claudeBinary = resolveClaudeBinary();
if (!claudeBinary) {
test.skip(
'no local `claude` binary on PATH — cannot pin for harness parity',
() => {},
);
return;
}
for (const fixture of OVERLAY_FIXTURES) {
test(
`${fixture.id}: overlay-ON vs overlay-OFF, N=${fixture.trials} per arm`,
async () => {
const overlayText = overlayContentFor(fixture);
expect(overlayText.length).toBeGreaterThan(100);
// Arm composition: both arms use the real Claude Code default system
// prompt (preset). Overlay-ON APPENDS the overlay text; overlay-OFF
// uses the default alone. This measures the overlay's marginal effect
// ON TOP of Claude Code's normal behavioral scaffolding — which is
// the only measurement that matches how real Claude Code composes
// overlays into its system prompt stack.
const [onArm, offArm] = await Promise.all([
runArm(
fixture,
'overlay-on',
{ type: 'preset', preset: 'claude_code', append: overlayText },
claudeBinary,
),
runArm(
fixture,
'overlay-off',
{ type: 'preset', preset: 'claude_code' },
claudeBinary,
),
]);
const arms = {
overlay: onArm.metrics,
off: offArm.metrics,
};
const meanOn = mean(arms.overlay);
const meanOff = mean(arms.off);
const lift = meanOn - meanOff;
const floorHits = arms.overlay.filter((n) => n >= 2).length;
const totalCost = sum(onArm.costs) + sum(offArm.costs);
const versionSet = new Set([
...onArm.sdkClaudeCodeVersions,
...offArm.sdkClaudeCodeVersions,
]);
// Loud output for the next person reading the eval JSON:
// eslint-disable-next-line no-console
console.log(
`\n[${fixture.id}]\n` +
` binary: ${claudeBinary}\n` +
` claude_code_version(s): ${[...versionSet].join(', ')}\n` +
` overlay-ON metrics: [${arms.overlay.join(', ')}] mean=${meanOn.toFixed(2)}\n` +
` overlay-OFF metrics: [${arms.off.join(', ')}] mean=${meanOff.toFixed(2)}\n` +
` lift: ${lift.toFixed(2)} floor_hits(>=2): ${floorHits}/${fixture.trials}\n` +
` rate_limit_exhausted: on=${onArm.rateLimitExhausted} off=${offArm.rateLimitExhausted}\n` +
` total_cost_usd: $${totalCost.toFixed(4)}\n` +
` transcripts: ${TRANSCRIPTS_DIR}`,
);
// Demand enough trials actually completed to make the assertion
// meaningful. If rate-limit exhaustion took out more than half of an
// arm, fail loudly rather than pass/fail on a fragment.
const minTrials = Math.ceil(fixture.trials / 2);
expect(arms.overlay.length).toBeGreaterThanOrEqual(minTrials);
expect(arms.off.length).toBeGreaterThanOrEqual(minTrials);
expect(fixture.pass(arms)).toBe(true);
},
30 * 60 * 1000, // 30 minute timeout per fixture
);
}
});
afterAll(async () => {
if (evalCollector) {
const filepath = await evalCollector.finalize();
// eslint-disable-next-line no-console
console.log(`\n[overlay-harness] eval results: ${filepath}`);
}
});