From 351bbb842649e8770306b3e9d8598c58575eb7bd Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sun, 10 May 2026 11:08:26 -0700
Subject: [PATCH] =?UTF-8?q?test:=20bump=20remaining=20claude-opus-4-6=20?=
 =?UTF-8?q?=E2=86=92=204-7=20references?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

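Mechanical model ID bump (claude-opus-4-6 → claude-opus-4-7) across the
E2E eval suite. All six in-repo files that referenced the older Opus
identifier are updated to match the model gstack now defaults to: the
five skill-e2e-*.test.ts files change the `model` option they set, and
test/helpers/eval-store.ts only updates the example in a comment. No
behavior change beyond the model ID the test harness asks for.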

Contributed by @johnnysoftware7 (#1392).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/helpers/eval-store.ts      |  2 +-
 test/skill-e2e-design.test.ts   |  4 ++--
 test/skill-e2e-plan.test.ts     | 12 ++++++------
 test/skill-e2e-qa-bugs.test.ts  |  2 +-
 test/skill-e2e-review.test.ts   |  2 +-
 test/skill-e2e-workflow.test.ts |  2 +-
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/test/helpers/eval-store.ts b/test/helpers/eval-store.ts
index 9942f1e37..9a801ae1c 100644
--- a/test/helpers/eval-store.ts
+++ b/test/helpers/eval-store.ts
@@ -68,7 +68,7 @@ export interface EvalTestEntry {
   last_tool_call?: string;    // e.g. "Write(review-output.md)"
 
   // Model + timing diagnostics (added for Sonnet/Opus split)
-  model?: string;                // e.g. 'claude-sonnet-4-6' or 'claude-opus-4-6'
+  model?: string;                // e.g. 'claude-sonnet-4-6' or 'claude-opus-4-7'
   first_response_ms?: number;    // time from spawn to first NDJSON line
   max_inter_turn_ms?: number;    // peak latency between consecutive tool calls
 
diff --git a/test/skill-e2e-design.test.ts b/test/skill-e2e-design.test.ts
index a207965f5..123d522b5 100644
--- a/test/skill-e2e-design.test.ts
+++ b/test/skill-e2e-design.test.ts
@@ -103,7 +103,7 @@ Write DESIGN.md and CLAUDE.md (or update it) in the working directory.`,
       timeout: 360_000,
       testName: 'design-consultation-core',
       runId,
-      model: 'claude-opus-4-6',
+      model: 'claude-opus-4-7',
     });
 
     logCost('/design-consultation core', result);
@@ -227,7 +227,7 @@ Skip research. Skip font preview. Skip any AskUserQuestion calls — this is non
       timeout: 360_000,
       testName: 'design-consultation-existing',
       runId,
-      model: 'claude-opus-4-6',
+      model: 'claude-opus-4-7',
     });
 
     logCost('/design-consultation existing', result);
diff --git a/test/skill-e2e-plan.test.ts b/test/skill-e2e-plan.test.ts
index 269c889c3..cb630ca97 100644
--- a/test/skill-e2e-plan.test.ts
+++ b/test/skill-e2e-plan.test.ts
@@ -82,7 +82,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
       timeout: 360_000,
       testName: 'plan-ceo-review',
       runId,
-      model: 'claude-opus-4-6',
+      model: 'claude-opus-4-7',
     });
 
     logCost('/plan-ceo-review', result);
@@ -167,7 +167,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
       timeout: 360_000,
       testName: 'plan-ceo-review-selective',
       runId,
-      model: 'claude-opus-4-6',
+      model: 'claude-opus-4-7',
     });
 
     logCost('/plan-ceo-review (SELECTIVE)', result);
@@ -233,7 +233,7 @@ Write your expansion proposals to ${planDir}/proposals.md with ONLY the proposal
       timeout: 360_000,
       testName: 'plan-ceo-review-expansion-energy',
       runId,
-      model: 'claude-opus-4-6',
+      model: 'claude-opus-4-7',
     });
 
     logCost('/plan-ceo-review (EXPANSION ENERGY)', result);
@@ -333,7 +333,7 @@ Focus on architecture, code quality, tests, and performance sections.`,
       timeout: 360_000,
       testName: 'plan-eng-review',
       runId,
-      model: 'claude-opus-4-6',
+      model: 'claude-opus-4-7',
     });
 
     logCost('/plan-eng-review', result);
@@ -459,7 +459,7 @@ Write your review to ${planDir}/review-output.md`,
       timeout: 360_000,
       testName: 'plan-eng-review-artifact',
       runId,
-      model: 'claude-opus-4-6',
+      model: 'claude-opus-4-7',
     });
 
     logCost('/plan-eng-review artifact', result);
@@ -679,7 +679,7 @@ This review report at the bottom of the plan is the MOST IMPORTANT deliverable o
       timeout: 360_000,
       testName: 'plan-review-report',
       runId,
-      model: 'claude-opus-4-6',
+      model: 'claude-opus-4-7',
     });
 
     logCost('/plan-eng-review report', result);
diff --git a/test/skill-e2e-qa-bugs.test.ts b/test/skill-e2e-qa-bugs.test.ts
index f9fa8a679..93514295f 100644
--- a/test/skill-e2e-qa-bugs.test.ts
+++ b/test/skill-e2e-qa-bugs.test.ts
@@ -100,7 +100,7 @@ CRITICAL RULES:
       timeout: 300_000,
       testName: `qa-${label}`,
       runId,
-      model: 'claude-opus-4-6',
+      model: 'claude-opus-4-7',
     });
 
     logCost(`/qa ${label}`, result);
diff --git a/test/skill-e2e-review.test.ts b/test/skill-e2e-review.test.ts
index 0e0bca025..1adbe25c7 100644
--- a/test/skill-e2e-review.test.ts
+++ b/test/skill-e2e-review.test.ts
@@ -514,7 +514,7 @@ Analyze the git history and produce the narrative report as described in the SKI
       timeout: 300_000,
       testName: 'retro',
       runId,
-      model: 'claude-opus-4-6',
+      model: 'claude-opus-4-7',
     });
 
     logCost('/retro', result);
diff --git a/test/skill-e2e-workflow.test.ts b/test/skill-e2e-workflow.test.ts
index ee08290e8..52892a50d 100644
--- a/test/skill-e2e-workflow.test.ts
+++ b/test/skill-e2e-workflow.test.ts
@@ -503,7 +503,7 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
       timeout: 300_000,
       testName: 'codex-review',
       runId,
-      model: 'claude-opus-4-6',
+      model: 'claude-opus-4-7',
     });
 
     logCost('/codex review', result);