feat(benchmark): add --dry-run flag to gstack-model-benchmark

Matches gstack-publish --dry-run semantics. Validates the provider list,
resolves per-adapter auth, echoes the resolved flag values, and exits
without invoking any provider CLI. Zero-cost pre-flight for CI pipelines
and for catching auth drift before starting a paid benchmark run.

Output shape:
  == gstack-model-benchmark --dry-run ==
    prompt:     <truncated>
    providers:  claude, gpt, gemini
    workdir:    /tmp/...
    timeout_ms: 300000
    output:     table
    judge:      off

  Adapter availability:
    claude: OK
    gpt:    NOT READY — <reason>
    gemini: NOT READY — <reason>

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-18 06:44:48 +08:00
parent 5260987d86
commit 42715188c2
+58
View File
@@ -16,15 +16,27 @@
* (default: include them with unavailable marker)
* --judge Run Anthropic SDK judge on outputs for quality score
* (requires ANTHROPIC_API_KEY; adds ~$0.05 per call)
* --dry-run Validate flags + resolve auth, don't invoke providers
* (matches gstack-publish --dry-run semantics)
*
* Examples:
* gstack-model-benchmark --prompt "Write a haiku about databases" --models claude,gpt
* gstack-model-benchmark ./test-prompt.txt --models claude,gpt,gemini --judge
* gstack-model-benchmark --prompt "hi" --models claude,gpt,gemini --dry-run
*/
import * as fs from 'fs';
import * as path from 'path';
import { runBenchmark, formatTable, formatJson, formatMarkdown, type BenchmarkInput } from '../test/helpers/benchmark-runner';
import { ClaudeAdapter } from '../test/helpers/providers/claude';
import { GptAdapter } from '../test/helpers/providers/gpt';
import { GeminiAdapter } from '../test/helpers/providers/gemini';
/**
 * Maps a provider's CLI name to a factory for its adapter. Construction is
 * deferred behind a factory so an adapter is only instantiated for providers
 * the user actually requested.
 */
const ADAPTER_FACTORIES = {
  claude: (): ClaudeAdapter => new ClaudeAdapter(),
  gpt: (): GptAdapter => new GptAdapter(),
  gemini: (): GeminiAdapter => new GeminiAdapter(),
};
/** Formats accepted by the --output flag (default: 'table'). */
type OutputFormat = 'table' | 'json' | 'markdown';
@@ -75,6 +87,12 @@ async function main(): Promise<void> {
const output = (arg('--output', 'table') as OutputFormat);
const skipUnavailable = flag('--skip-unavailable');
const doJudge = flag('--judge');
const dryRun = flag('--dry-run');
if (dryRun) {
await dryRunReport({ prompt, providers, workdir, timeoutMs, output, doJudge });
return;
}
const input: BenchmarkInput = {
prompt,
@@ -105,6 +123,46 @@ async function main(): Promise<void> {
process.stdout.write(out + '\n');
}
/**
 * Prints the --dry-run report: echoes the resolved flag values, then probes
 * each requested provider adapter for availability. No prompts are sent and
 * no provider CLI is invoked — this is a zero-cost pre-flight check.
 *
 * Fix vs. previous version: the per-adapter status labels are now padded so
 * the OK / NOT READY column lines up, matching the documented output shape
 * ("gpt:    NOT READY — <reason>").
 *
 * @param opts Resolved CLI options: prompt text, provider list, workdir,
 *             timeout (ms), output format, and the judge toggle.
 */
async function dryRunReport(opts: {
  prompt: string;
  providers: Array<'claude' | 'gpt' | 'gemini'>;
  workdir: string;
  timeoutMs: number;
  output: OutputFormat;
  doJudge: boolean;
}): Promise<void> {
  const lines: string[] = [];
  lines.push('== gstack-model-benchmark --dry-run ==');
  // Truncate long prompts so the report stays a readable one-liner per field.
  lines.push(`  prompt:     ${opts.prompt.length > 80 ? opts.prompt.slice(0, 80) + '…' : opts.prompt}`);
  lines.push(`  providers:  ${opts.providers.join(', ')}`);
  lines.push(`  workdir:    ${opts.workdir}`);
  lines.push(`  timeout_ms: ${opts.timeoutMs}`);
  lines.push(`  output:     ${opts.output}`);
  lines.push(`  judge:      ${opts.doJudge ? 'on (Anthropic SDK)' : 'off'}`);
  lines.push('');
  lines.push('Adapter availability:');
  // Width of the "name:" label column: longest requested provider name plus
  // the trailing colon. Keeps status values vertically aligned.
  const labelWidth = Math.max(0, ...opts.providers.map(p => p.length)) + 1;
  let authFailures = 0;
  for (const name of opts.providers) {
    const factory = ADAPTER_FACTORIES[name];
    if (!factory) {
      // Defensive: providers is typed, but guard against unvalidated input.
      lines.push(`  ${(name + ':').padEnd(labelWidth)} UNKNOWN PROVIDER`);
      authFailures += 1;
      continue;
    }
    const adapter = factory();
    const check = await adapter.available();
    if (check.ok) {
      lines.push(`  ${(adapter.name + ':').padEnd(labelWidth)} OK`);
    } else {
      lines.push(`  ${(adapter.name + ':').padEnd(labelWidth)} NOT READY — ${check.reason}`);
      authFailures += 1;
    }
  }
  lines.push('');
  lines.push(`(--dry-run — no prompts sent. ${authFailures} provider(s) unavailable.)`);
  process.stdout.write(lines.join('\n') + '\n');
}
main().catch(err => {
console.error('FATAL:', err);
process.exit(1);