diff --git a/package.json b/package.json index b1866837..fd4fda65 100644 --- a/package.json +++ b/package.json @@ -15,8 +15,9 @@ "test": "bun test browse/test/ test/ --ignore 'test/skill-e2e-*.test.ts' --ignore test/skill-llm-eval.test.ts --ignore test/skill-routing-e2e.test.ts --ignore test/codex-e2e.test.ts", "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", - "test:e2e": "EVALS=1 bun test --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", - "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", + "test:e2e": "EVALS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", + "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", + "test:e2e:fast": "EVALS=1 EVALS_FAST=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts", "test:codex": "EVALS=1 bun test test/codex-e2e.test.ts", "test:codex:all": "EVALS=1 EVALS_ALL=1 bun test test/codex-e2e.test.ts", "skill:check": "bun run scripts/skill-check.ts", diff --git a/test/skill-e2e-plan.test.ts b/test/skill-e2e-plan.test.ts index d1ac2f99..1fc5b968 100644 --- a/test/skill-e2e-plan.test.ts +++ b/test/skill-e2e-plan.test.ts @@ -81,6 +81,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and timeout: 360_000, testName: 'plan-ceo-review', runId, + model: 'claude-opus-4-6', }); logCost('/plan-ceo-review', result); @@ -165,6 +166,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and timeout: 360_000, testName: 'plan-ceo-review-selective', runId, + model: 'claude-opus-4-6', }); logCost('/plan-ceo-review (SELECTIVE)', result); @@ -257,6 +259,7 @@ Focus on architecture, code quality, tests, and performance sections.`, timeout: 360_000, testName: 'plan-eng-review', runId, + model: 'claude-opus-4-6', }); logCost('/plan-eng-review', result); @@ -382,6 +385,7 @@ Write your review to ${planDir}/review-output.md`, timeout: 360_000, testName: 'plan-eng-review-artifact', runId, + model: 'claude-opus-4-6', }); logCost('/plan-eng-review artifact', result); diff --git a/test/skill-e2e-qa-bugs.test.ts b/test/skill-e2e-qa-bugs.test.ts index 45d32ba7..b93e97c0 100644 --- a/test/skill-e2e-qa-bugs.test.ts +++ b/test/skill-e2e-qa-bugs.test.ts @@ -100,6 +100,7 @@ CRITICAL RULES: timeout: 300_000, testName: `qa-${label}`, runId, + model: 'claude-opus-4-6', }); logCost(`/qa ${label}`, result); diff --git a/test/skill-e2e-review.test.ts b/test/skill-e2e-review.test.ts index f492e1f4..2d1b8550 100644 --- a/test/skill-e2e-review.test.ts +++ b/test/skill-e2e-review.test.ts @@ -510,6 +510,7 @@ Analyze the git history and produce the narrative report as described in the SKI timeout: 300_000, testName: 'retro', runId, + model: 'claude-opus-4-6', }); logCost('/retro', result);