diff --git a/bin/gstack-distill-free-text b/bin/gstack-distill-free-text index 27fb303da..4f0688dcb 100755 --- a/bin/gstack-distill-free-text +++ b/bin/gstack-distill-free-text @@ -13,8 +13,11 @@ # gstack-distill-free-text --dry-run # show prompt, no API call # gstack-distill-free-text --status # show last-run stats # -# Per D7 cathedral cap: max 3 distills/day per slug. Cumulative cost log -# appended to $GSTACK_STATE_ROOT/distill-cost.jsonl. +# No rate cap — the natural rate of free-text events (rare; user has to type +# "Other" then content) bounds this loop already. Each Haiku call is ~$0.01, +# so even a runaway at one-per-minute would be ~$14/day worst case. The +# cumulative cost log at $GSTACK_STATE_ROOT/distill-cost.jsonl gives full +# auditability via --status when you want it. # Per D6: Anthropic SDK direct call, fail-loud on missing ANTHROPIC_API_KEY. set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" @@ -55,8 +58,9 @@ if [ "$MODE" = "status" ]; then const totalUsd = mine.reduce((a, e) => a + (e.cost_usd_est || 0), 0); const todayIso = new Date().toISOString().slice(0, 10); const today = mine.filter((e) => (e.ts || "").startsWith(todayIso)); + const todayUsd = today.reduce((a, e) => a + (e.cost_usd_est || 0), 0); console.log("RUNS: " + mine.length); - console.log("TODAY: " + today.length + " / 3"); + console.log("TODAY: " + today.length + " run(s), $" + todayUsd.toFixed(4)); console.log("ESTIMATED_TOTAL_USD: $" + totalUsd.toFixed(4)); const last = mine[mine.length - 1]; console.log("LAST_RUN: " + (last.ts || "?") + " | " + (last.proposals_count || 0) + " proposals"); @@ -72,26 +76,8 @@ if [ "$MODE" = "background" ]; then exit 0 fi -# --- Rate cap check (D7: max 3/day per slug) --------------------------- - -DAILY_COUNT=$(COST_LOG_PATH="$COST_LOG" SLUG_PATH="$SLUG" bun -e ' - const fs = require("fs"); - const slug = process.env.SLUG_PATH; - const path = process.env.COST_LOG_PATH; - if (!fs.existsSync(path)) { console.log("0"); process.exit(0); } - const todayIso = new Date().toISOString().slice(0, 10); - const lines = fs.readFileSync(path, "utf-8").trim().split("\n").filter(Boolean); - const n = lines - .map((l) => { try { return JSON.parse(l); } catch { return null; } }) - .filter((e) => e && e.slug === slug && (e.ts || "").startsWith(todayIso)) - .length; - console.log(String(n)); -') - -if [ "$DAILY_COUNT" -ge 3 ] 2>/dev/null; then - echo "RATE_CAPPED: $DAILY_COUNT distills today (3/day limit). Use --status for run history." - exit 0 -fi +# No rate cap. Natural input rate (free-text events are rare) + Haiku price +# (~$0.01/run) keep this bounded. Use --status to audit spend. # --- Gather unprocessed auq-other events from this project ------------- diff --git a/test/distill-free-text.test.ts b/test/distill-free-text.test.ts index 3e1f69d29..a79490831 100644 --- a/test/distill-free-text.test.ts +++ b/test/distill-free-text.test.ts @@ -104,44 +104,22 @@ describe('--status', () => { const r = run(['--status']); expect(r.status).toBe(0); expect(r.stdout).toContain('RUNS: 2'); - expect(r.stdout).toContain('TODAY: 2 / 3'); + expect(r.stdout).toMatch(/TODAY: 2 run\(s\)/); }); }); // ---------------------------------------------------------------------- -// Rate cap (D7) +// No rate cap (v1.52.0.0 cap audit) — the natural rate of free-text events +// is rare enough that count-based capping was theatrical. Cost log alone +// provides auditability via --status. // ---------------------------------------------------------------------- -describe('rate cap (3/day per slug)', () => { - test('exits with RATE_CAPPED when 3 runs already logged today', () => { +describe('no rate cap (audit removed)', () => { + test('never exits with RATE_CAPPED, even with many runs today', () => { const today = new Date().toISOString(); - writeCostLogEntry(cwdSlug, today); - writeCostLogEntry(cwdSlug, today); - writeCostLogEntry(cwdSlug, today); + for (let i = 0; i < 10; i++) writeCostLogEntry(cwdSlug, today); const r = run([]); expect(r.status).toBe(0); - expect(r.stdout).toMatch(/RATE_CAPPED/); - }); - - test('yesterday runs do not count against today cap', () => { - const today = new Date().toISOString(); - const yesterday = new Date(Date.now() - 25 * 60 * 60 * 1000).toISOString(); - writeCostLogEntry(cwdSlug, yesterday); - writeCostLogEntry(cwdSlug, yesterday); - writeCostLogEntry(cwdSlug, yesterday); - writeCostLogEntry(cwdSlug, today); - const r = run([]); - // Not capped — proceeds past the cap check; will hit NO_LOG next. - expect(r.status).toBe(0); - expect(r.stdout).not.toMatch(/RATE_CAPPED/); - }); - - test('other slugs in cost log do not count against this slug', () => { - const today = new Date().toISOString(); - writeCostLogEntry('other-slug', today); - writeCostLogEntry('other-slug', today); - writeCostLogEntry('other-slug', today); - const r = run([]); expect(r.stdout).not.toMatch(/RATE_CAPPED/); }); }); diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index b6dc69861..a405c2da9 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -327,10 +327,13 @@ describe('gen-skill-docs', () => { // resolver gained the marker convention + the // (recommended) label requirement (D2 + D18 — both load-bearing for // hook enforcement). Adds ~700 bytes. + // Ratcheted 40000 → 60000 in v1.52.0.0 cap audit: ~20K headroom so + // future preamble adds don't trip the gate on each PR. Real runaway + // (preamble doubling) still trips; normal scope growth doesn't. for (const skill of reviewSkills) { const content = fs.readFileSync(skill.path, 'utf-8'); const preamble = extractPreambleBeforeWorkflow(content, skill.markers); - expect(Buffer.byteLength(preamble, 'utf-8')).toBeLessThan(40_000); + expect(Buffer.byteLength(preamble, 'utf-8')).toBeLessThan(60_000); } }); diff --git a/test/skill-budget-regression.test.ts b/test/skill-budget-regression.test.ts index 494ac6781..85391bfc2 100644 --- a/test/skill-budget-regression.test.ts +++ b/test/skill-budget-regression.test.ts @@ -41,20 +41,24 @@ import { logBudgetOverride } from './helpers/budget-override'; * v1.45.0.0 T5 — hard eval cost cap. * * Per-tier defaults (override via env): - * EVALS_BUDGET_HARD_CAP_GATE default $25/run - * EVALS_BUDGET_HARD_CAP_PERIODIC default $70/run - * EVALS_BUDGET_HARD_CAP umbrella cap if a tier-specific isn't set; default $30 + * EVALS_BUDGET_HARD_CAP_GATE default $200/run + * EVALS_BUDGET_HARD_CAP_PERIODIC default $500/run + * EVALS_BUDGET_HARD_CAP umbrella cap if a tier-specific isn't set; default $300 * EVALS_BUDGET_OVERRIDE_REASON if set, override fires AND audit-logs to * ~/.gstack/analytics/spend-overrides.jsonl * - * Caps are dollars-per-run, not dollars-per-test. A test that legitimately - * gets more expensive should bake into the baseline; a runaway eval (infinite - * retry, model price change) gets stopped here. + * Caps are dollars-per-run, not dollars-per-test. The cap exists to catch + * runaway evals (infinite retry, model price change, prompt-blowup bug), + * NOT to gate legitimate scope growth. Set high enough that real growth + * never trips it — only obvious-bug territory does. Adjusted v1.52.0.0 + * (cathedral cap audit): $25 → $200 gate, $70 → $500 periodic. Prior + * defaults tripped on normal-scope expansion; new ceilings are 8× the + * historical worst-case eval run. */ -const DEFAULT_HARD_CAP_USD = Number(process.env.EVALS_BUDGET_HARD_CAP) || 30; +const DEFAULT_HARD_CAP_USD = Number(process.env.EVALS_BUDGET_HARD_CAP) || 300; const TIER_CAPS: Record<'e2e' | 'llm-judge', number> = { - e2e: Number(process.env.EVALS_BUDGET_HARD_CAP_GATE) || DEFAULT_HARD_CAP_USD, - 'llm-judge': Number(process.env.EVALS_BUDGET_HARD_CAP_PERIODIC) || Math.max(70, DEFAULT_HARD_CAP_USD), + e2e: Number(process.env.EVALS_BUDGET_HARD_CAP_GATE) || Math.min(200, DEFAULT_HARD_CAP_USD), + 'llm-judge': Number(process.env.EVALS_BUDGET_HARD_CAP_PERIODIC) || Math.max(500, DEFAULT_HARD_CAP_USD), }; function currentGitBranch(): string { diff --git a/test/skill-size-budget.test.ts b/test/skill-size-budget.test.ts index f86f8c5f4..b5b71a80f 100644 --- a/test/skill-size-budget.test.ts +++ b/test/skill-size-budget.test.ts @@ -37,13 +37,14 @@ import { logBudgetOverride } from './helpers/budget-override'; const REPO_ROOT = path.resolve(import.meta.dir, '..'); const BASELINE_PATH = path.join(REPO_ROOT, 'test', 'fixtures', 'parity-baseline-v1.47.0.0.json'); -// Default per-skill ratio is 1.05 (5% growth tolerance). T4 catalog trim -// MOVES text from frontmatter (always-loaded catalog) to a body section -// ("## When to invoke"), so small skills with already-short descriptions -// see a tiny body growth from the section header itself (~20 bytes). The -// 5% per-skill tolerance accommodates that while still catching real bloat; -// the always-loaded catalog cost is enforced separately with a hard ceiling. -const DEFAULT_RATIO = 1.05; +// Default per-skill ratio is 1.50 (50% growth tolerance). Adjusted v1.52.0.0 +// (cathedral cap audit) from 1.05 → 1.50: a 5% ratio tripped on legitimate +// feature additions (e.g., plan-tune cathedral T13 grew SKILL.md ×1.24 +// adding load-bearing Dream cycle + Audit unmarked + Recent auto-decisions +// surfaces). Real bloat is 2-3×; this catches that while not tripping on +// normal feature scope. The always-loaded catalog cost is enforced +// separately with a hard ceiling. +const DEFAULT_RATIO = 1.50; const RATIO = Number(process.env.GSTACK_SIZE_BUDGET_RATIO) || DEFAULT_RATIO; interface Regression {