mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 15:20:11 +02:00
test: bump remaining claude-opus-4-6 → 4-7 references
Mechanical model ID bump across the E2E eval suite. All six in-repo files that referenced the older `claude-opus-4-6` identifier are updated to match the model gstack now defaults to. No behavior change beyond the model ID the test harness asks for.

Contributed by @johnnysoftware7 (#1392).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -68,7 +68,7 @@ export interface EvalTestEntry {
|
||||
last_tool_call?: string; // e.g. "Write(review-output.md)"
|
||||
|
||||
// Model + timing diagnostics (added for Sonnet/Opus split)
|
||||
- model?: string; // e.g. 'claude-sonnet-4-6' or 'claude-opus-4-6'
|
||||
+ model?: string; // e.g. 'claude-sonnet-4-6' or 'claude-opus-4-7'
|
||||
first_response_ms?: number; // time from spawn to first NDJSON line
|
||||
max_inter_turn_ms?: number; // peak latency between consecutive tool calls
|
||||
|
||||
|
||||
@@ -103,7 +103,7 @@ Write DESIGN.md and CLAUDE.md (or update it) in the working directory.`,
|
||||
timeout: 360_000,
|
||||
testName: 'design-consultation-core',
|
||||
runId,
|
||||
- model: 'claude-opus-4-6',
|
||||
+ model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/design-consultation core', result);
|
||||
@@ -227,7 +227,7 @@ Skip research. Skip font preview. Skip any AskUserQuestion calls — this is non
|
||||
timeout: 360_000,
|
||||
testName: 'design-consultation-existing',
|
||||
runId,
|
||||
- model: 'claude-opus-4-6',
|
||||
+ model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/design-consultation existing', result);
|
||||
|
||||
@@ -82,7 +82,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
|
||||
timeout: 360_000,
|
||||
testName: 'plan-ceo-review',
|
||||
runId,
|
||||
- model: 'claude-opus-4-6',
|
||||
+ model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/plan-ceo-review', result);
|
||||
@@ -167,7 +167,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
|
||||
timeout: 360_000,
|
||||
testName: 'plan-ceo-review-selective',
|
||||
runId,
|
||||
- model: 'claude-opus-4-6',
|
||||
+ model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/plan-ceo-review (SELECTIVE)', result);
|
||||
@@ -233,7 +233,7 @@ Write your expansion proposals to ${planDir}/proposals.md with ONLY the proposal
|
||||
timeout: 360_000,
|
||||
testName: 'plan-ceo-review-expansion-energy',
|
||||
runId,
|
||||
- model: 'claude-opus-4-6',
|
||||
+ model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/plan-ceo-review (EXPANSION ENERGY)', result);
|
||||
@@ -333,7 +333,7 @@ Focus on architecture, code quality, tests, and performance sections.`,
|
||||
timeout: 360_000,
|
||||
testName: 'plan-eng-review',
|
||||
runId,
|
||||
- model: 'claude-opus-4-6',
|
||||
+ model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/plan-eng-review', result);
|
||||
@@ -459,7 +459,7 @@ Write your review to ${planDir}/review-output.md`,
|
||||
timeout: 360_000,
|
||||
testName: 'plan-eng-review-artifact',
|
||||
runId,
|
||||
- model: 'claude-opus-4-6',
|
||||
+ model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/plan-eng-review artifact', result);
|
||||
@@ -679,7 +679,7 @@ This review report at the bottom of the plan is the MOST IMPORTANT deliverable o
|
||||
timeout: 360_000,
|
||||
testName: 'plan-review-report',
|
||||
runId,
|
||||
- model: 'claude-opus-4-6',
|
||||
+ model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/plan-eng-review report', result);
|
||||
|
||||
@@ -100,7 +100,7 @@ CRITICAL RULES:
|
||||
timeout: 300_000,
|
||||
testName: `qa-${label}`,
|
||||
runId,
|
||||
- model: 'claude-opus-4-6',
|
||||
+ model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost(`/qa ${label}`, result);
|
||||
|
||||
@@ -514,7 +514,7 @@ Analyze the git history and produce the narrative report as described in the SKI
|
||||
timeout: 300_000,
|
||||
testName: 'retro',
|
||||
runId,
|
||||
- model: 'claude-opus-4-6',
|
||||
+ model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/retro', result);
|
||||
|
||||
@@ -503,7 +503,7 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
|
||||
timeout: 300_000,
|
||||
testName: 'codex-review',
|
||||
runId,
|
||||
- model: 'claude-opus-4-6',
|
||||
+ model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/codex review', result);
|
||||
|
||||
Reference in New Issue
Block a user