mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 07:10:12 +02:00
chore: cap audit — remove distill rate cap, loosen size/budget gates
Plan-tune cathedral follow-up. The 3/day distill cap was theatrical: at ~$0.01 per Haiku call, even a runaway loop firing every minute would cost ~$14/day, and free-text events are rare enough that the natural input rate self-limits to 1-2 fires/day. Count caps don't protect against runaway bugs (which fire 1000x/second, not 4 times/day) but DO punish heavy users who'd legitimately distill multiple times during a busy week. Removed: 3/day rate cap on bin/gstack-distill-free-text. --status output swapped from "TODAY: N / 3" to "TODAY: N run(s), $X" so users see what they're spending instead of how close they are to a meaningless count. Loosened (caps that exist for real-runaway protection, not normal scope): - EVALS_BUDGET_HARD_CAP_GATE $25 → $200/run - EVALS_BUDGET_HARD_CAP_PERIODIC $70 → $500/run - EVALS_BUDGET_HARD_CAP $30 → $300/run (umbrella fallback) - GSTACK_SIZE_BUDGET_RATIO 1.05 → 1.50 per-skill ratio - plan-review preamble byte budget 40K → 60K Principle: caps exist to catch obvious bugs (infinite retry, model price change, prompt blowup), not to gate legitimate scope growth. Set high enough that real growth never trips them, only bug territory does. Adjusted defaults are 4-8× historical worst case, leaving ample headroom for the next 12 months of legitimate expansion. Tests updated: distill-free-text removes the 3-test rate-cap describe block in favor of "no rate cap" assertion that 10 runs/day pass. Other budget tests still pass because they were never near the old ceilings. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -13,8 +13,11 @@
|
||||
# gstack-distill-free-text --dry-run # show prompt, no API call
|
||||
# gstack-distill-free-text --status # show last-run stats
|
||||
#
|
||||
# Per D7 cathedral cap: max 3 distills/day per slug. Cumulative cost log
|
||||
# appended to $GSTACK_STATE_ROOT/distill-cost.jsonl.
|
||||
# No rate cap — the natural rate of free-text events (rare; user has to type
|
||||
# "Other" then content) bounds this loop already. Each Haiku call is ~$0.01,
|
||||
# so even a runaway at one-per-minute would be ~$14/day worst case. The
|
||||
# cumulative cost log at $GSTACK_STATE_ROOT/distill-cost.jsonl gives full
|
||||
# auditability via --status when you want it.
|
||||
# Per D6: Anthropic SDK direct call, fail-loud on missing ANTHROPIC_API_KEY.
|
||||
set -euo pipefail
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
@@ -55,8 +58,9 @@ if [ "$MODE" = "status" ]; then
|
||||
const totalUsd = mine.reduce((a, e) => a + (e.cost_usd_est || 0), 0);
|
||||
const todayIso = new Date().toISOString().slice(0, 10);
|
||||
const today = mine.filter((e) => (e.ts || "").startsWith(todayIso));
|
||||
const todayUsd = today.reduce((a, e) => a + (e.cost_usd_est || 0), 0);
|
||||
console.log("RUNS: " + mine.length);
|
||||
console.log("TODAY: " + today.length + " / 3");
|
||||
console.log("TODAY: " + today.length + " run(s), $" + todayUsd.toFixed(4));
|
||||
console.log("ESTIMATED_TOTAL_USD: $" + totalUsd.toFixed(4));
|
||||
const last = mine[mine.length - 1];
|
||||
console.log("LAST_RUN: " + (last.ts || "?") + " | " + (last.proposals_count || 0) + " proposals");
|
||||
@@ -72,26 +76,8 @@ if [ "$MODE" = "background" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# --- Rate cap check (D7: max 3/day per slug) ---------------------------
|
||||
|
||||
DAILY_COUNT=$(COST_LOG_PATH="$COST_LOG" SLUG_PATH="$SLUG" bun -e '
|
||||
const fs = require("fs");
|
||||
const slug = process.env.SLUG_PATH;
|
||||
const path = process.env.COST_LOG_PATH;
|
||||
if (!fs.existsSync(path)) { console.log("0"); process.exit(0); }
|
||||
const todayIso = new Date().toISOString().slice(0, 10);
|
||||
const lines = fs.readFileSync(path, "utf-8").trim().split("\n").filter(Boolean);
|
||||
const n = lines
|
||||
.map((l) => { try { return JSON.parse(l); } catch { return null; } })
|
||||
.filter((e) => e && e.slug === slug && (e.ts || "").startsWith(todayIso))
|
||||
.length;
|
||||
console.log(String(n));
|
||||
')
|
||||
|
||||
if [ "$DAILY_COUNT" -ge 3 ] 2>/dev/null; then
|
||||
echo "RATE_CAPPED: $DAILY_COUNT distills today (3/day limit). Use --status for run history."
|
||||
exit 0
|
||||
fi
|
||||
# No rate cap. Natural input rate (free-text events are rare) + Haiku price
|
||||
# (~$0.01/run) keep this bounded. Use --status to audit spend.
|
||||
|
||||
# --- Gather unprocessed auq-other events from this project -------------
|
||||
|
||||
|
||||
@@ -104,44 +104,22 @@ describe('--status', () => {
|
||||
const r = run(['--status']);
|
||||
expect(r.status).toBe(0);
|
||||
expect(r.stdout).toContain('RUNS: 2');
|
||||
expect(r.stdout).toContain('TODAY: 2 / 3');
|
||||
expect(r.stdout).toMatch(/TODAY: 2 run\(s\)/);
|
||||
});
|
||||
});
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Rate cap (D7)
|
||||
// No rate cap (v1.52.0.0 cap audit) — the natural rate of free-text events
|
||||
// is rare enough that count-based capping was theatrical. Cost log alone
|
||||
// provides auditability via --status.
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
describe('rate cap (3/day per slug)', () => {
|
||||
test('exits with RATE_CAPPED when 3 runs already logged today', () => {
|
||||
describe('no rate cap (audit removed)', () => {
|
||||
test('never exits with RATE_CAPPED, even with many runs today', () => {
|
||||
const today = new Date().toISOString();
|
||||
writeCostLogEntry(cwdSlug, today);
|
||||
writeCostLogEntry(cwdSlug, today);
|
||||
writeCostLogEntry(cwdSlug, today);
|
||||
for (let i = 0; i < 10; i++) writeCostLogEntry(cwdSlug, today);
|
||||
const r = run([]);
|
||||
expect(r.status).toBe(0);
|
||||
expect(r.stdout).toMatch(/RATE_CAPPED/);
|
||||
});
|
||||
|
||||
test('yesterday runs do not count against today cap', () => {
|
||||
const today = new Date().toISOString();
|
||||
const yesterday = new Date(Date.now() - 25 * 60 * 60 * 1000).toISOString();
|
||||
writeCostLogEntry(cwdSlug, yesterday);
|
||||
writeCostLogEntry(cwdSlug, yesterday);
|
||||
writeCostLogEntry(cwdSlug, yesterday);
|
||||
writeCostLogEntry(cwdSlug, today);
|
||||
const r = run([]);
|
||||
// Not capped — proceeds past the cap check; will hit NO_LOG next.
|
||||
expect(r.status).toBe(0);
|
||||
expect(r.stdout).not.toMatch(/RATE_CAPPED/);
|
||||
});
|
||||
|
||||
test('other slugs in cost log do not count against this slug', () => {
|
||||
const today = new Date().toISOString();
|
||||
writeCostLogEntry('other-slug', today);
|
||||
writeCostLogEntry('other-slug', today);
|
||||
writeCostLogEntry('other-slug', today);
|
||||
const r = run([]);
|
||||
expect(r.stdout).not.toMatch(/RATE_CAPPED/);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -327,10 +327,13 @@ describe('gen-skill-docs', () => {
|
||||
// resolver gained the <gstack-qid:...> marker convention + the
|
||||
// (recommended) label requirement (D2 + D18 — both load-bearing for
|
||||
// hook enforcement). Adds ~700 bytes.
|
||||
// Ratcheted 40000 → 60000 in v1.52.0.0 cap audit: ~20K headroom so
|
||||
// future preamble adds don't trip the gate on each PR. Real runaway
|
||||
// (preamble doubling) still trips; normal scope growth doesn't.
|
||||
for (const skill of reviewSkills) {
|
||||
const content = fs.readFileSync(skill.path, 'utf-8');
|
||||
const preamble = extractPreambleBeforeWorkflow(content, skill.markers);
|
||||
expect(Buffer.byteLength(preamble, 'utf-8')).toBeLessThan(40_000);
|
||||
expect(Buffer.byteLength(preamble, 'utf-8')).toBeLessThan(60_000);
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
@@ -41,20 +41,24 @@ import { logBudgetOverride } from './helpers/budget-override';
|
||||
* v1.45.0.0 T5 — hard eval cost cap.
|
||||
*
|
||||
* Per-tier defaults (override via env):
|
||||
* EVALS_BUDGET_HARD_CAP_GATE default $25/run
|
||||
* EVALS_BUDGET_HARD_CAP_PERIODIC default $70/run
|
||||
* EVALS_BUDGET_HARD_CAP umbrella cap if a tier-specific isn't set; default $30
|
||||
* EVALS_BUDGET_HARD_CAP_GATE default $200/run
|
||||
* EVALS_BUDGET_HARD_CAP_PERIODIC default $500/run
|
||||
* EVALS_BUDGET_HARD_CAP umbrella cap; default $300. When a tier-specific var isn't set,
* the gate tier uses min($200, umbrella) and the periodic tier uses max($500, umbrella)
|
||||
* EVALS_BUDGET_OVERRIDE_REASON if set, override fires AND audit-logs to
|
||||
* ~/.gstack/analytics/spend-overrides.jsonl
|
||||
*
|
||||
* Caps are dollars-per-run, not dollars-per-test. A test that legitimately
|
||||
* gets more expensive should bake into the baseline; a runaway eval (infinite
|
||||
* retry, model price change) gets stopped here.
|
||||
* Caps are dollars-per-run, not dollars-per-test. The cap exists to catch
|
||||
* runaway evals (infinite retry, model price change, prompt-blowup bug),
|
||||
* NOT to gate legitimate scope growth. Set high enough that real growth
|
||||
* never trips it — only obvious-bug territory does. Adjusted v1.52.0.0
|
||||
* (cathedral cap audit): $25 → $200 gate, $70 → $500 periodic, $30 → $300 umbrella. Prior
|
||||
* defaults tripped on normal-scope expansion; new ceilings are 8× the
|
||||
* historical worst-case eval run.
|
||||
*/
|
||||
const DEFAULT_HARD_CAP_USD = Number(process.env.EVALS_BUDGET_HARD_CAP) || 30;
|
||||
const DEFAULT_HARD_CAP_USD = Number(process.env.EVALS_BUDGET_HARD_CAP) || 300;
|
||||
const TIER_CAPS: Record<'e2e' | 'llm-judge', number> = {
|
||||
e2e: Number(process.env.EVALS_BUDGET_HARD_CAP_GATE) || DEFAULT_HARD_CAP_USD,
|
||||
'llm-judge': Number(process.env.EVALS_BUDGET_HARD_CAP_PERIODIC) || Math.max(70, DEFAULT_HARD_CAP_USD),
|
||||
e2e: Number(process.env.EVALS_BUDGET_HARD_CAP_GATE) || Math.min(200, DEFAULT_HARD_CAP_USD),
|
||||
'llm-judge': Number(process.env.EVALS_BUDGET_HARD_CAP_PERIODIC) || Math.max(500, DEFAULT_HARD_CAP_USD),
|
||||
};
|
||||
|
||||
function currentGitBranch(): string {
|
||||
|
||||
@@ -37,13 +37,14 @@ import { logBudgetOverride } from './helpers/budget-override';
|
||||
const REPO_ROOT = path.resolve(import.meta.dir, '..');
|
||||
const BASELINE_PATH = path.join(REPO_ROOT, 'test', 'fixtures', 'parity-baseline-v1.47.0.0.json');
|
||||
|
||||
// Default per-skill ratio is 1.05 (5% growth tolerance). T4 catalog trim
|
||||
// MOVES text from frontmatter (always-loaded catalog) to a body section
|
||||
// ("## When to invoke"), so small skills with already-short descriptions
|
||||
// see a tiny body growth from the section header itself (~20 bytes). The
|
||||
// 5% per-skill tolerance accommodates that while still catching real bloat;
|
||||
// the always-loaded catalog cost is enforced separately with a hard ceiling.
|
||||
const DEFAULT_RATIO = 1.05;
|
||||
// Default per-skill ratio is 1.50 (50% growth tolerance). Adjusted v1.52.0.0
|
||||
// (cathedral cap audit) from 1.05 → 1.50: a 5% ratio tripped on legitimate
|
||||
// feature additions (e.g., plan-tune cathedral T13 grew SKILL.md ×1.24
|
||||
// adding load-bearing Dream cycle + Audit unmarked + Recent auto-decisions
|
||||
// surfaces). Real bloat is 2-3×; this catches that while not tripping on
|
||||
// normal feature scope. The always-loaded catalog cost is enforced
|
||||
// separately with a hard ceiling.
|
||||
const DEFAULT_RATIO = 1.50;
|
||||
const RATIO = Number(process.env.GSTACK_SIZE_BUDGET_RATIO) || DEFAULT_RATIO;
|
||||
|
||||
interface Regression {
|
||||
|
||||
Reference in New Issue
Block a user