From c18d9fa308663fbd3f2e29af198f3c671e96c490 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Fri, 8 May 2026 23:30:56 -0700
Subject: [PATCH] test(plan-ceo): bump --disallowedTools test timeout to 10 min

The last 5 runs showed the model under --disallowedTools spending the
full 5-min budget in 'high effort thinking' before surfacing options.
The LLM judge correctly reports state=working at every 30s tick, so the
high-water-mark fallback never fires. A 10-min budget gives the model 20
judge windows to eventually surface the question.

Outer bun timeout bumped accordingly to 660s (inner + 60s grace).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 test/skill-e2e-plan-ceo-plan-mode.test.ts | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/test/skill-e2e-plan-ceo-plan-mode.test.ts b/test/skill-e2e-plan-ceo-plan-mode.test.ts
index f22f6baf8..bcb87a559 100644
--- a/test/skill-e2e-plan-ceo-plan-mode.test.ts
+++ b/test/skill-e2e-plan-ceo-plan-mode.test.ts
@@ -107,7 +107,11 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
       skillName: 'plan-ceo-review',
       inPlanMode: true,
       extraArgs: ['--disallowedTools', 'AskUserQuestion'],
-      timeoutMs: 300_000,
+      // 10-min budget: post-v1.28 the model under --disallowedTools sometimes
+      // spends 5+ min in "high effort thinking" before surfacing options. The
+      // judge fires every 30s and high-water-marks the first prose-AUQ tick;
+      // 10 min gives the model 20 surfacing windows.
+      timeoutMs: 600_000,
     });
 
     // The user must SEE the question one way or another. Three valid surfaces:
@@ -159,5 +163,5 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
     // to enforce the at-bottom contract against. The contract is
     // exercised by the periodic finding-count tests, which DO run the
     // full review.
-  }, 360_000);
+  }, 660_000); // outer = inner timeoutMs (600_000) + 60s grace
 });