From 620f5dbaea09ef854a4bb6705700904e10f1a73c Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 18 Apr 2026 06:54:09 +0800 Subject: [PATCH] =?UTF-8?q?test:=20/review=20hardening=20=E2=80=94=20NOT-R?= =?UTF-8?q?EADY=20env=20isolation,=20workdir=20cleanup,=20perf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Applied from the adversarial subagent pass during /review on this branch: - test/benchmark-cli.test.ts — new "NOT READY path fires when auth env vars are stripped" test. The default dry-run test always showed OK on dev machines with auth, hiding regressions in the remediation-hint branch. Stripped env (no auth vars, HOME→empty tmpdir) now force- exercises gpt + gemini NOT READY paths and asserts every NOT READY line includes a concrete remediation hint (install/login/export). (claude adapter's os.homedir() call is Bun-cached; the 2-of-3 adapter coverage is sufficient to exercise the branch.) - test/taste-engine.test.ts — session-cap test rewritten to seed the profile with 50 entries + one real CLI call, instead of 55 sequential subprocess spawns. Same coverage (FIFO eviction at the boundary), ~5s faster CI time. Also pins first-casing-wins on the Geist/GEIST merge assertion — bumpPref() keeps the first-arrival casing, so the test documents that policy. - test/skill-e2e-benchmark-providers.test.ts — workdir creation moved from module-load into beforeAll, cleanup added in afterAll. Previous shape leaked a /tmp/bench-e2e-* dir every CI run. - test/publish-dry-run.test.ts — removed unused empty test/helpers mkdirSync from the sandbox setup. The bin doesn't import from there, so the empty dir was a footgun for future maintainers. - test/helpers/providers/gpt.ts — expanded the inline comment on `--skip-git-repo-check` to explicitly note that `-s read-only` is now load-bearing safety (the trust prompt was the secondary boundary; removing read-only while keeping skip-git-repo-check would be unsafe). Net: 45 passing tests (was 44), session-cap test 5s faster, one real regression surface covered that didn't exist before. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/benchmark-cli.test.ts | 43 ++++++++++++++++++++++ test/helpers/providers/gpt.ts | 8 ++-- test/publish-dry-run.test.ts | 1 - test/skill-e2e-benchmark-providers.test.ts | 15 +++++++- test/taste-engine.test.ts | 36 ++++++++++++++---- 5 files changed, 90 insertions(+), 13 deletions(-) diff --git a/test/benchmark-cli.test.ts b/test/benchmark-cli.test.ts index 3b798abf..2932ec0c 100644 --- a/test/benchmark-cli.test.ts +++ b/test/benchmark-cli.test.ts @@ -97,6 +97,49 @@ describe('gstack-model-benchmark --dry-run', () => { } }); + test('NOT READY path fires when auth env vars are stripped', () => { + // On a dev machine with full auth configured, the default --dry-run output + // shows OK for every provider with credentials. Strip auth env vars AND + // point HOME at an empty temp dir so adapters can't find file-based creds. + // This test exists to catch regressions where the NOT READY branch itself + // breaks (crash, missing remediation hint, wrong message format). + // + // Note: claude adapter's `os.homedir()` call is sometimes cached by Bun and + // doesn't always pick up the HOME override, so this test asserts only on + // gpt + gemini adapters where HOME redirection reliably makes the adapter's + // credentials-path check fail. Two adapters hitting NOT READY with full + // remediation messages is sufficient coverage for the branch. + const emptyHome = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-noauth-home-')); + try { + const minimalEnv: Record = { + PATH: process.env.PATH ?? '', + TERM: process.env.TERM ?? 'xterm', + HOME: emptyHome, + }; + const result = spawnSync('bun', ['run', BIN, '--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run'], { + cwd: ROOT, + env: minimalEnv, + encoding: 'utf-8', + timeout: 15000, + }); + expect(result.status).toBe(0); + const out = result.stdout?.toString() ?? ''; + // gpt + gemini must report NOT READY in this clean env (their auth check + // reads paths under the overridden HOME). + expect(out).toMatch(/gpt:\s+NOT READY/); + expect(out).toMatch(/gemini:\s+NOT READY/); + // Every NOT READY line must include a concrete remediation hint so users + // can resolve the missing auth. This is the regression we care about. + const notReadyLines = out.split('\n').filter(l => l.includes('NOT READY')); + expect(notReadyLines.length).toBeGreaterThanOrEqual(2); + for (const line of notReadyLines) { + expect(line).toMatch(/(install|Install|login|export|Run|Log in)/); + } + } finally { + fs.rmSync(emptyHome, { recursive: true, force: true }); + } + }); + test('long prompt is truncated in dry-run display', () => { const longPrompt = 'x'.repeat(200); const r = run(['--prompt', longPrompt, '--dry-run']); diff --git a/test/helpers/providers/gpt.ts b/test/helpers/providers/gpt.ts index 846b8be0..07757dc2 100644 --- a/test/helpers/providers/gpt.ts +++ b/test/helpers/providers/gpt.ts @@ -31,9 +31,11 @@ export class GptAdapter implements ProviderAdapter { async run(opts: RunOpts): Promise { const start = Date.now(); - // `--skip-git-repo-check` lets codex run in arbitrary working directories - // (temp dirs, non-git paths) without the interactive trust prompt. Benchmarks - // often don't care about the workdir — they're just running a prompt. + // `-s read-only` is load-bearing safety. With `--skip-git-repo-check` we + // bypass codex's interactive trust prompt for unknown directories (benchmarks + // often run in temp dirs / non-git paths), so the read-only sandbox is now + // the only boundary preventing codex from mutating the workdir. If you ever + // remove `-s read-only`, drop `--skip-git-repo-check` too. const args = ['exec', opts.prompt, '-C', opts.workdir, '-s', 'read-only', '--skip-git-repo-check', '--json']; if (opts.model) args.push('-m', opts.model); if (opts.extraArgs) args.push(...opts.extraArgs); diff --git a/test/publish-dry-run.test.ts b/test/publish-dry-run.test.ts index fe040f0b..4088f447 100644 --- a/test/publish-dry-run.test.ts +++ b/test/publish-dry-run.test.ts @@ -27,7 +27,6 @@ beforeEach(() => { // structure: copy the bin into sandbox/bin/, write a controlled skills.json at the root. sandbox = fs.mkdtempSync(path.join(os.tmpdir(), 'publish-sandbox-')); fs.mkdirSync(path.join(sandbox, 'bin')); - fs.mkdirSync(path.join(sandbox, 'test', 'helpers'), { recursive: true }); binCopy = path.join(sandbox, 'bin', 'gstack-publish'); fs.copyFileSync(BIN, binCopy); fs.chmodSync(binCopy, 0o755); diff --git a/test/skill-e2e-benchmark-providers.test.ts b/test/skill-e2e-benchmark-providers.test.ts index a9368a6b..8220f11a 100644 --- a/test/skill-e2e-benchmark-providers.test.ts +++ b/test/skill-e2e-benchmark-providers.test.ts @@ -18,7 +18,7 @@ * - Multi-turn tool-using prompts — our single-turn smoke skips `toolCalls > 0` */ -import { describe, test, expect } from 'bun:test'; +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; import { ClaudeAdapter } from './helpers/providers/claude'; import { GptAdapter } from './helpers/providers/gpt'; import { GeminiAdapter } from './helpers/providers/gemini'; @@ -41,9 +41,20 @@ const gpt = new GptAdapter(); const gemini = new GeminiAdapter(); // Use a temp working directory so provider CLIs can't accidentally touch the repo. -const workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-e2e-')); +// Created in beforeAll / cleaned in afterAll so concurrent CI runs don't leak. +let workdir: string; describeIfEvals('multi-provider benchmark adapters (live)', () => { + beforeAll(() => { + workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-e2e-')); + }); + + afterAll(() => { + if (workdir && fs.existsSync(workdir)) { + fs.rmSync(workdir, { recursive: true, force: true }); + } + }); + test('claude: available() returns structured ok/reason', async () => { const check = await claude.available(); expect(check).toHaveProperty('ok'); diff --git a/test/taste-engine.test.ts b/test/taste-engine.test.ts index 52a39f89..e92a69da 100644 --- a/test/taste-engine.test.ts +++ b/test/taste-engine.test.ts @@ -208,13 +208,16 @@ describe('taste-engine: dimension extraction', () => { expect(p.dimensions.aesthetics.approved[0].value).toBe('brutalist'); }); - test('value matching is case-insensitive', () => { + test('value matching is case-insensitive (first casing wins)', () => { run(['approved', 'v1', '--reason', 'fonts: Geist']); run(['approved', 'v2', '--reason', 'fonts: GEIST']); const p = readProfile(); // Should merge into a single entry expect(p.dimensions.fonts.approved).toHaveLength(1); expect(p.dimensions.fonts.approved[0].approved_count).toBe(2); + // Canonical value is the first-arrival casing. bumpPref() stores value on + // insert and never overwrites on subsequent bumps. + expect(p.dimensions.fonts.approved[0].value).toBe('Geist'); }); test('unknown dimension labels are silently ignored', () => { @@ -231,14 +234,33 @@ describe('taste-engine: dimension extraction', () => { describe('taste-engine: session cap', () => { test('sessions truncate to last 50 entries (FIFO)', () => { - for (let i = 0; i < 55; i++) { - run(['approved', `v${i}`, '--reason', 'fonts: Geist']); - } + // Seed the profile with 50 existing sessions, then one real CLI call writes + // the 51st → the oldest must drop. Avoids 55 sequential subprocess spawns. + const seededSessions = Array.from({ length: 50 }, (_, i) => ({ + ts: new Date(Date.now() - (50 - i) * 1000).toISOString(), + action: 'approved' as const, + variant: `seed-${i}`, + })); + writeProfile({ + version: 1, + updated_at: new Date().toISOString(), + dimensions: { + fonts: { approved: [], rejected: [] }, + colors: { approved: [], rejected: [] }, + layouts: { approved: [], rejected: [] }, + aesthetics: { approved: [], rejected: [] }, + }, + sessions: seededSessions, + }); + const r = run(['approved', 'new-one', '--reason', 'fonts: Geist']); + expect(r.status).toBe(0); const p = readProfile(); expect(p.sessions).toHaveLength(50); - // Last 5 should be preserved, first 5 dropped - expect(p.sessions[0].variant).toBe('v5'); - expect(p.sessions[49].variant).toBe('v54'); + // The oldest seed (seed-0) must have been evicted FIFO; seed-1 is now first; + // the new entry is last. + expect(p.sessions[0].variant).toBe('seed-1'); + expect(p.sessions[48].variant).toBe('seed-49'); + expect(p.sessions[49].variant).toBe('new-one'); }); });