mirror of https://github.com/garrytan/gstack.git (synced 2026-05-02 11:45:20 +02:00)
commit 8af68207f5
User feedback: "i don't think i would use gstack-publish, i think we
should remove it." Agreed. The CLI + marketplace wiring was an
ambitious but speculative primitive. Zero users, zero validated demand,
and the existing manual `clawhub publish` workflow already covers the
real case (OpenClaw methodology skill publishing).
Deleted:
- bin/gstack-publish (the CLI)
- skills.json (the marketplace manifest)
- test/publish-dry-run.test.ts (13 tests)
- ship/SKILL.md.tmpl Step 19.5 — the methodology-skill publish-on-ship
check. No target to dispatch to anymore.
- README.md Power tools row for gstack-publish
Updated:
- bin/gstack-model-benchmark doc comment: dropped "matches gstack-publish
--dry-run semantics" reference (self-describing flag now)
- CHANGELOG 1.3.0.0 entry:
* Release summary: "three new binaries" → "two new binaries".
Dropped the /ship publish-check narrative.
* Numbers table: "1 of 3 → 3 of 3 wired" → "1 of 2 → 2 of 2 wired".
Deterministic test count: 45 → 32 (removed publish-dry-run's 13).
* Added section: removed gstack-publish CLI bullet + /ship Step 19.5
bullet.
* "What this means for users" closer: replaced the /ship publish
paragraph with the design-taste-engine learning loop, which IS
real, wired, and something users hit every week via /design-shotgun.
* Contributors section: "Four new test files" → "Three new test files".
Retained:
- openclaw/skills/gstack-openclaw-* skill dirs (pre-existed this PR,
still publishable manually via `clawhub publish` as sketched below,
useful standalone for ClawHub installs)
- CLAUDE.md publishing-native-skills section (same rationale)
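Manual publish stays a one-liner. A hypothetical invocation, assuming
`clawhub publish` takes the skill directory as its argument (the real
CLI syntax may differ):

    clawhub publish openclaw/skills/gstack-openclaw-<name>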
Regenerated SKILL.md across all hosts. Ship golden fixtures refreshed
for claude/codex/factory. 455 tests pass.
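The count can be re-verified with a full suite run, assuming the default
Bun runner (the repo may wrap this in a package script):

    bun test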
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
bin/gstack-model-benchmark · 169 lines · 5.8 KiB · TypeScript · Executable File
#!/usr/bin/env bun
/**
 * gstack-model-benchmark — run the same prompt across multiple providers
 * and compare latency, tokens, cost, quality, and tool-call count.
 *
 * Usage:
 *   gstack-model-benchmark <skill-or-prompt-file> [options]
 *
 * Options:
 *   --models claude,gpt,gemini    Comma-separated provider list (default: claude)
 *   --prompt "<text>"             Inline prompt instead of a file
 *   --workdir <path>              Working dir passed to each CLI (default: cwd)
 *   --timeout-ms <n>              Per-provider timeout (default: 300000)
 *   --output table|json|markdown  Output format (default: table)
 *   --skip-unavailable            Skip providers that fail available() check
 *                                 (default: include them with unavailable marker)
 *   --judge                       Run Anthropic SDK judge on outputs for quality score
 *                                 (requires ANTHROPIC_API_KEY; adds ~$0.05 per call)
 *   --dry-run                     Validate flags + resolve auth, don't invoke providers
 *
 * Examples:
 *   gstack-model-benchmark --prompt "Write a haiku about databases" --models claude,gpt
 *   gstack-model-benchmark ./test-prompt.txt --models claude,gpt,gemini --judge
 *   gstack-model-benchmark --prompt "hi" --models claude,gpt,gemini --dry-run
 */

import * as fs from 'fs';
import { runBenchmark, formatTable, formatJson, formatMarkdown, type BenchmarkInput } from '../test/helpers/benchmark-runner';
import { ClaudeAdapter } from '../test/helpers/providers/claude';
import { GptAdapter } from '../test/helpers/providers/gpt';
import { GeminiAdapter } from '../test/helpers/providers/gemini';

const ADAPTER_FACTORIES = {
  claude: () => new ClaudeAdapter(),
  gpt: () => new GptAdapter(),
  gemini: () => new GeminiAdapter(),
};

type OutputFormat = 'table' | 'json' | 'markdown';

/** Value of a `--name value` or `--name=value` argument, or `def` when absent. */
function arg(name: string, def?: string): string | undefined {
  const idx = process.argv.findIndex(a => a === name || a.startsWith(name + '='));
  if (idx < 0) return def;
  const eqIdx = process.argv[idx].indexOf('=');
  if (eqIdx >= 0) return process.argv[idx].slice(eqIdx + 1);
  return process.argv[idx + 1];
}

/** True when a bare flag like `--judge` is present. */
function flag(name: string): boolean {
  return process.argv.includes(name);
}

// Flags that consume the next argv entry when written in `--name value` form.
const VALUE_FLAGS = new Set(['--models', '--prompt', '--workdir', '--timeout-ms', '--output']);

/** First real positional argument, skipping flags and their space-separated values. */
function firstPositional(): string | undefined {
  const argv = process.argv.slice(2);
  for (let i = 0; i < argv.length; i++) {
    if (argv[i].startsWith('--')) {
      if (VALUE_FLAGS.has(argv[i])) i++; // skip this flag's value
      continue;
    }
    return argv[i];
  }
  return undefined;
}

function parseProviders(s: string | undefined): Array<'claude' | 'gpt' | 'gemini'> {
  if (!s) return ['claude'];
  const seen = new Set<'claude' | 'gpt' | 'gemini'>();
  for (const p of s.split(',').map(x => x.trim()).filter(Boolean)) {
    if (p === 'claude' || p === 'gpt' || p === 'gemini') seen.add(p);
    else {
      console.error(`WARN: unknown provider '${p}' — skipping. Valid: claude, gpt, gemini.`);
    }
  }
  return seen.size ? Array.from(seen) : ['claude'];
}

function resolvePrompt(positional: string | undefined): string {
  const inline = arg('--prompt');
  if (inline) return inline;
  if (!positional) {
    console.error('ERROR: specify a prompt via positional path or --prompt "<text>"');
    process.exit(1);
  }
  if (fs.existsSync(positional)) {
    return fs.readFileSync(positional, 'utf-8');
  }
  // Not a file — treat as inline prompt
  return positional;
}

async function main(): Promise<void> {
  // Avoid mistaking a flag's value (e.g. `--models claude,gpt`) for the prompt path.
  const positional = firstPositional();
  const prompt = resolvePrompt(positional);
  const providers = parseProviders(arg('--models'));
  const workdir = arg('--workdir', process.cwd())!;
  const timeoutMs = parseInt(arg('--timeout-ms', '300000')!, 10);
  const output = (arg('--output', 'table') as OutputFormat);
  const skipUnavailable = flag('--skip-unavailable');
  const doJudge = flag('--judge');
  const dryRun = flag('--dry-run');

  if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) {
    console.error('ERROR: --timeout-ms must be a positive integer');
    process.exit(1);
  }

  if (dryRun) {
    await dryRunReport({ prompt, providers, workdir, timeoutMs, output, doJudge });
    return;
  }

  const input: BenchmarkInput = {
    prompt,
    workdir,
    providers,
    timeoutMs,
    skipUnavailable,
  };

  const report = await runBenchmark(input);

  if (doJudge) {
    try {
      // Lazy import: a missing judge helper or API key shouldn't break plain runs.
      const { judgeEntries } = await import('../test/helpers/benchmark-judge');
      await judgeEntries(report);
    } catch (err) {
      console.error(`WARN: judge unavailable: ${(err as Error).message}`);
    }
  }

  let out: string;
  switch (output) {
    case 'json': out = formatJson(report); break;
    case 'markdown': out = formatMarkdown(report); break;
    case 'table':
    default: out = formatTable(report); break;
  }
  process.stdout.write(out + '\n');
}

async function dryRunReport(opts: {
  prompt: string;
  providers: Array<'claude' | 'gpt' | 'gemini'>;
  workdir: string;
  timeoutMs: number;
  output: OutputFormat;
  doJudge: boolean;
}): Promise<void> {
  const lines: string[] = [];
  lines.push('== gstack-model-benchmark --dry-run ==');
  lines.push(`  prompt:     ${opts.prompt.length > 80 ? opts.prompt.slice(0, 80) + '…' : opts.prompt}`);
  lines.push(`  providers:  ${opts.providers.join(', ')}`);
  lines.push(`  workdir:    ${opts.workdir}`);
  lines.push(`  timeout_ms: ${opts.timeoutMs}`);
  lines.push(`  output:     ${opts.output}`);
  lines.push(`  judge:      ${opts.doJudge ? 'on (Anthropic SDK)' : 'off'}`);
  lines.push('');
  lines.push('Adapter availability:');
  let authFailures = 0;
  for (const name of opts.providers) {
    const factory = ADAPTER_FACTORIES[name];
    if (!factory) {
      lines.push(`  ${name}: UNKNOWN PROVIDER`);
      authFailures += 1;
      continue;
    }
    const adapter = factory();
    const check = await adapter.available();
    if (check.ok) {
      lines.push(`  ${adapter.name}: OK`);
    } else {
      lines.push(`  ${adapter.name}: NOT READY — ${check.reason}`);
      authFailures += 1;
    }
  }
  lines.push('');
  lines.push(`(--dry-run — no prompts sent. ${authFailures} provider(s) unavailable.)`);
  process.stdout.write(lines.join('\n') + '\n');
}

main().catch(err => {
  console.error('FATAL:', err);
  process.exit(1);
});
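
For reference, the provider-adapter shape this script relies on, reconstructed
only from the call sites above (adapter.name, adapter.available()); the
canonical definitions live under test/helpers/providers/ and may differ:

// Hypothetical reconstruction of the adapter contract, inferred from usage
// in this file; see test/helpers/providers/ for the real types.
interface ProviderAdapter {
  /** Report label, e.g. 'claude'. */
  name: string;
  /** Resolve auth / binary presence without sending a prompt. */
  available(): Promise<{ ok: boolean; reason?: string }>;
}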