mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
test(eval): 3 more overlay fixtures to measure remaining Claude nudges
Measures three overlay bullets that haven't been tested yet: - claude-dedicated-tools-vs-bash — claude.md says 'prefer Read/Edit/Write/ Glob/Grep over cat/sed/find/grep'. Fixture prompts 'list every TypeScript file under src/ and tell me what each exports' and counts Bash tool_use across the session. Overlay-ON should drop it by >=20%. - opus-4-7-effort-match-trivial — opus-4-7.md says 'simple file reads don't need deep reasoning.' Fixture uses a trivial one-file prompt (config.json lookup) and measures turns_used. Overlay-ON should be <=80% of baseline turns. - opus-4-7-literal-interpretation — opus-4-7.md says 'fix ALL failing tests, not just the obvious one.' Fixture seeds three failing test files with deliberately distinct failure modes and counts unique files edited. Overlay-ON should touch >=20% more files. Adding a fourth fixture for any remaining overlay nudge is a single entry. The harness is now proven on: fanout (deleted after measurement), dedicated tools, effort-match, and literal-interpretation.
This commit is contained in:
Vendored
+99
@@ -242,6 +242,105 @@ export const OVERLAY_FIXTURES: OverlayFixture[] = [
|
||||
metric: (r) => firstTurnParallelism(r.assistantTurns[0]),
|
||||
pass: fanoutPass,
|
||||
},
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// claude.md / "Dedicated tools over Bash"
|
||||
// -------------------------------------------------------------------------
|
||||
{
|
||||
id: 'claude-dedicated-tools-vs-bash',
|
||||
overlayPath: 'model-overlays/claude.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
direction: 'lower_is_better',
|
||||
setupWorkspace: (dir) => {
|
||||
fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
|
||||
fs.writeFileSync(path.join(dir, 'src', 'index.ts'), "export const x = 1;\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'util.ts'), "export function util() { return 42; }\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'types.ts'), "export type Foo = { a: number };\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'config.ts'), "export const c = { n: 'demo' };\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'api.ts'), "export async function fetchFoo() { return null; }\n");
|
||||
},
|
||||
userPrompt:
|
||||
"List every TypeScript file under src/ and tell me what each exports. " +
|
||||
"You may use any tools available.",
|
||||
// Metric: total Bash tool_use count across the whole session.
|
||||
// The overlay says "prefer Read/Glob/Grep over cat/find/grep shell."
|
||||
// A model following that should emit Glob + Read, not Bash ls/find/cat.
|
||||
metric: bashToolCallCount,
|
||||
pass: lowerIsBetter20Pct,
|
||||
},
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// opus-4-7.md / "Effort-match the step"
|
||||
// -------------------------------------------------------------------------
|
||||
{
|
||||
id: 'opus-4-7-effort-match-trivial',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
direction: 'lower_is_better',
|
||||
maxTurns: 8,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'config.json'),
|
||||
'{"name": "demo", "version": "1.0.0"}\n',
|
||||
);
|
||||
},
|
||||
userPrompt: "What's the version in config.json?",
|
||||
// Metric: turns used to reach a result.
|
||||
// The overlay says "simple lookups don't need deep reasoning."
|
||||
// A trivial prompt like this should complete in 2 turns
|
||||
// (Read, respond). Extended deliberation bumps this up.
|
||||
metric: turnsToCompletion,
|
||||
pass: lowerIsBetter20Pct,
|
||||
},
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// opus-4-7.md / "Literal interpretation awareness"
|
||||
// -------------------------------------------------------------------------
|
||||
{
|
||||
id: 'opus-4-7-literal-interpretation',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
direction: 'higher_is_better',
|
||||
// Allow Edit and Write so the model can fix the tests.
|
||||
allowedTools: ['Read', 'Glob', 'Grep', 'Bash', 'Edit', 'Write'],
|
||||
maxTurns: 15,
|
||||
setupWorkspace: (dir) => {
|
||||
// Three failing test files. The "obvious" one is named first; the
|
||||
// other two require the model to enumerate to notice them.
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'auth.test.ts'),
|
||||
"import { test, expect } from 'bun:test';\n" +
|
||||
"test('auth returns true', () => { expect(1).toBe(2); });\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'billing.test.ts'),
|
||||
"import { test, expect } from 'bun:test';\n" +
|
||||
"test('billing computes total', () => { expect('ten').toBe('10'); });\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'notifications.test.ts'),
|
||||
"import { test, expect } from 'bun:test';\n" +
|
||||
"test('notifications fire on save', () => { expect([1,2]).toEqual([1,2,3]); });\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'README.md'),
|
||||
"# demo\n\nSmall project with three failing test files.\n",
|
||||
);
|
||||
},
|
||||
userPrompt: "Fix the failing tests.",
|
||||
// Metric: number of unique test files the model edited/wrote.
|
||||
// The overlay says "fix ALL failing tests, not just the obvious one."
|
||||
// Overlay-ON should touch all 3 test files. Overlay-OFF might stop
|
||||
// at the first one after making it pass.
|
||||
metric: uniqueFilesEdited,
|
||||
pass: higherIsBetter20Pct,
|
||||
},
|
||||
];
|
||||
|
||||
// Validate at module load so a broken fixture fails fast at test startup,
|
||||
|
||||
Reference in New Issue
Block a user