diff --git a/test/helpers/e2e-helpers.ts b/test/helpers/e2e-helpers.ts index 6b5694f0..b65e0a79 100644 --- a/test/helpers/e2e-helpers.ts +++ b/test/helpers/e2e-helpers.ts @@ -30,6 +30,13 @@ export const evalsEnabled = !!process.env.EVALS; // Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch. export let selectedTests: string[] | null = null; // null = run all +// EVALS_FAST: skip the 8 slowest tests (all Opus quality tests) for quick feedback +const FAST_EXCLUDED_TESTS = [ + 'plan-ceo-review-selective', 'plan-ceo-review', 'retro', 'retro-base-branch', + 'design-consultation-core', 'design-consultation-existing', + 'qa-fix-loop', 'design-review-fix', +]; + if (evalsEnabled && !process.env.EVALS_ALL) { const baseBranch = process.env.EVALS_BASE || detectBaseBranch(ROOT) @@ -48,6 +55,17 @@ if (evalsEnabled && !process.env.EVALS_ALL) { // If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all } +// Apply EVALS_FAST filter after diff-based selection +if (evalsEnabled && process.env.EVALS_FAST) { + if (selectedTests === null) { + // No diff-based selection (null = run all): start from every E2E_TOUCHFILES key, minus the excluded tests + selectedTests = Object.keys(E2E_TOUCHFILES).filter(t => !FAST_EXCLUDED_TESTS.includes(t)); + } else { + selectedTests = selectedTests.filter(t => !FAST_EXCLUDED_TESTS.includes(t)); + } + process.stderr.write(`EVALS_FAST: excluded ${FAST_EXCLUDED_TESTS.length} slow tests, running ${selectedTests.length}\n\n`); +} + export const describeE2E = evalsEnabled ? describe : describe.skip; /** Wrap a describe block to skip entirely if none of its tests are selected. */ @@ -164,6 +182,9 @@ export function recordE2E( exit_reason: result.exitReason, timeout_at_turn: result.exitReason === 'timeout' ? 
result.costEstimate.turnsUsed : undefined, last_tool_call: lastTool, + model: result.model, + first_response_ms: result.firstResponseMs, + max_inter_turn_ms: result.maxInterTurnMs, ...extra, }); } diff --git a/test/helpers/eval-store.ts b/test/helpers/eval-store.ts index 9dd64109..f2f13fce 100644 --- a/test/helpers/eval-store.ts +++ b/test/helpers/eval-store.ts @@ -42,6 +42,11 @@ export interface EvalTestEntry { timeout_at_turn?: number; // which turn was active when timeout hit last_tool_call?: string; // e.g. "Write(review-output.md)" + // Model + timing diagnostics (added for Sonnet/Opus split) + model?: string; // e.g. 'claude-sonnet-4-6' or 'claude-opus-4-6' + first_response_ms?: number; // ms from the start of runSkillTest to the first tool_use item (0 if no tool call was seen) + max_inter_turn_ms?: number; // peak gap in ms between consecutive tool calls (0 with fewer than two tool calls) + // Outcome eval detection_rate?: number; false_positives?: number; @@ -65,6 +70,7 @@ export interface EvalResult { failed: number; total_cost_usd: number; total_duration_ms: number; + wall_clock_ms?: number; // wall-clock from collector creation to finalization (shows parallelism) tests: EvalTestEntry[]; _partial?: boolean; // true for incremental saves, absent in final } @@ -546,6 +552,7 @@ export class EvalCollector { private tests: EvalTestEntry[] = []; private finalized = false; private evalDir: string; + private createdAt = Date.now(); constructor(tier: 'e2e' | 'llm-judge', evalDir?: string) { this.tier = tier; @@ -615,6 +622,7 @@ export class EvalCollector { failed: this.tests.length - passed, total_cost_usd: Math.round(totalCost * 100) / 100, total_duration_ms: totalDuration, + wall_clock_ms: Date.now() - this.createdAt, tests: this.tests, }; diff --git a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts index 6654df5f..c9b85de0 100644 --- a/test/helpers/session-runner.ts +++ b/test/helpers/session-runner.ts @@ -41,6 +41,12 @@ export interface SkillTestResult { output: string; costEstimate: CostEstimate; transcript: any[]; + /** Which 
model was used for this test (added for Sonnet/Opus split diagnostics) */ + model: string; + /** Time from the start of runSkillTest to the first tool_use item in the stream, in ms; 0 if no tool call was observed (added for rate-limit diagnostics) */ + firstResponseMs: number; + /** Peak gap between consecutive tool_use items, in ms; 0 when fewer than two tool calls were observed */ + maxInterTurnMs: number; } const BROWSE_ERROR_PATTERNS = [ @@ -116,6 +122,8 @@ export async function runSkillTest(options: { timeout?: number; testName?: string; runId?: string; + /** Model to use. When omitted, falls back to the EVALS_MODEL env var, then to claude-sonnet-4-6. */ + model?: string; }): Promise { const { prompt, @@ -126,6 +134,7 @@ export async function runSkillTest(options: { testName, runId, } = options; + const model = options.model ?? process.env.EVALS_MODEL ?? 'claude-sonnet-4-6'; const startTime = Date.now(); const startedAt = new Date().toISOString(); @@ -144,6 +153,7 @@ export async function runSkillTest(options: { // avoid shell escaping issues. --verbose is required for stream-json mode. const args = [ '-p', + '--model', model, '--output-format', 'stream-json', '--verbose', '--dangerously-skip-permissions', @@ -175,6 +185,9 @@ export async function runSkillTest(options: { const collectedLines: string[] = []; let liveTurnCount = 0; let liveToolCount = 0; + let firstResponseMs = 0; + let lastToolTime = 0; + let maxInterTurnMs = 0; const stderrPromise = new Response(proc.stderr).text(); const reader = proc.stdout.getReader(); @@ -201,7 +214,15 @@ export async function runSkillTest(options: { for (const item of content) { if (item.type === 'tool_use') { liveToolCount++; - const elapsed = Math.round((Date.now() - startTime) / 1000); + const now = Date.now(); + const elapsed = Math.round((now - startTime) / 1000); + // Track timing telemetry (stamped on tool_use items only; 0 doubles as the "not yet set" sentinel) + if (firstResponseMs === 0) firstResponseMs = now - startTime; + if (lastToolTime > 0) { + const interTurn = now - lastToolTime; + if (interTurn > maxInterTurnMs) maxInterTurnMs = interTurn; + } + lastToolTime = now; const progressLine = ` [${elapsed}s] turn 
${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`; process.stderr.write(progressLine); @@ -330,5 +351,5 @@ export async function runSkillTest(options: { turnsUsed, }; - return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript }; + return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, model, firstResponseMs, maxInterTurnMs }; }