test: bump remaining claude-opus-4-6 → 4-7 references

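Mechanical model ID bump across the E2E eval suite. All twelve
references to claude-opus-4-6 in the six in-repo files that still used
the older opus identifier now read claude-opus-4-7, matching the model
gstack now defaults to: eleven are `model:` arguments passed to the
test harness and one is an example in a doc comment. No behavior change
beyond the model ID the test harness asks for.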

Contributed by @johnnysoftware7 (#1392).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-05-10 11:08:26 -07:00
parent 599d2e22a7
commit 351bbb8426
6 changed files with 12 additions and 12 deletions
+1 -1
View File
@@ -68,7 +68,7 @@ export interface EvalTestEntry {
last_tool_call?: string; // e.g. "Write(review-output.md)"
// Model + timing diagnostics (added for Sonnet/Opus split)
model?: string; // e.g. 'claude-sonnet-4-6' or 'claude-opus-4-6'
model?: string; // e.g. 'claude-sonnet-4-6' or 'claude-opus-4-7'
first_response_ms?: number; // time from spawn to first NDJSON line
max_inter_turn_ms?: number; // peak latency between consecutive tool calls
+2 -2
View File
@@ -103,7 +103,7 @@ Write DESIGN.md and CLAUDE.md (or update it) in the working directory.`,
timeout: 360_000,
testName: 'design-consultation-core',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});
logCost('/design-consultation core', result);
@@ -227,7 +227,7 @@ Skip research. Skip font preview. Skip any AskUserQuestion calls — this is non
timeout: 360_000,
testName: 'design-consultation-existing',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});
logCost('/design-consultation existing', result);
+6 -6
View File
@@ -82,7 +82,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
timeout: 360_000,
testName: 'plan-ceo-review',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});
logCost('/plan-ceo-review', result);
@@ -167,7 +167,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
timeout: 360_000,
testName: 'plan-ceo-review-selective',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});
logCost('/plan-ceo-review (SELECTIVE)', result);
@@ -233,7 +233,7 @@ Write your expansion proposals to ${planDir}/proposals.md with ONLY the proposal
timeout: 360_000,
testName: 'plan-ceo-review-expansion-energy',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});
logCost('/plan-ceo-review (EXPANSION ENERGY)', result);
@@ -333,7 +333,7 @@ Focus on architecture, code quality, tests, and performance sections.`,
timeout: 360_000,
testName: 'plan-eng-review',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});
logCost('/plan-eng-review', result);
@@ -459,7 +459,7 @@ Write your review to ${planDir}/review-output.md`,
timeout: 360_000,
testName: 'plan-eng-review-artifact',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});
logCost('/plan-eng-review artifact', result);
@@ -679,7 +679,7 @@ This review report at the bottom of the plan is the MOST IMPORTANT deliverable o
timeout: 360_000,
testName: 'plan-review-report',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});
logCost('/plan-eng-review report', result);
+1 -1
View File
@@ -100,7 +100,7 @@ CRITICAL RULES:
timeout: 300_000,
testName: `qa-${label}`,
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});
logCost(`/qa ${label}`, result);
+1 -1
View File
@@ -514,7 +514,7 @@ Analyze the git history and produce the narrative report as described in the SKI
timeout: 300_000,
testName: 'retro',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});
logCost('/retro', result);
+1 -1
View File
@@ -503,7 +503,7 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
timeout: 300_000,
testName: 'codex-review',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});
logCost('/codex review', result);