From 620f5dbaea09ef854a4bb6705700904e10f1a73c Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sat, 18 Apr 2026 06:54:09 +0800
Subject: [PATCH] =?UTF-8?q?test:=20/review=20hardening=20=E2=80=94=20NOT-R?=
 =?UTF-8?q?EADY=20env=20isolation,=20workdir=20cleanup,=20perf?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Applied from the adversarial subagent pass during /review on this branch:

- test/benchmark-cli.test.ts — new "NOT READY path fires when auth env
  vars are stripped" test. The default dry-run test always showed OK on
  dev machines with auth, hiding regressions in the remediation-hint
  branch. Stripped env (no auth vars, HOME→empty tmpdir) now force-
  exercises gpt + gemini NOT READY paths and asserts every NOT READY
  line includes a concrete remediation hint (install/login/export).
  (claude adapter's os.homedir() call is Bun-cached; the 2-of-3 adapter
  coverage is sufficient to exercise the branch.)

- test/taste-engine.test.ts — session-cap test rewritten to seed the
  profile with 50 entries + one real CLI call, instead of 55 sequential
  subprocess spawns. Same coverage (FIFO eviction at the boundary), ~5s
  faster CI time. Also pins first-casing-wins on the Geist/GEIST merge
  assertion — bumpPref() keeps the first-arrival casing, so the test
  documents that policy.

- test/skill-e2e-benchmark-providers.test.ts — workdir creation moved
  from module-load into beforeAll, cleanup added in afterAll. Previous
  shape leaked a /tmp/bench-e2e-* dir every CI run.

- test/publish-dry-run.test.ts — removed unused empty test/helpers
  mkdirSync from the sandbox setup. The bin doesn't import from there,
  so the empty dir was a footgun for future maintainers.

- test/helpers/providers/gpt.ts — expanded the inline comment on
  `--skip-git-repo-check` to explicitly note that `-s read-only` is now
  load-bearing safety (the trust prompt was the secondary boundary;
  removing read-only while keeping skip-git-repo-check would be unsafe).

Net: 45 passing tests (was 44), session-cap test 5s faster, one real
regression surface covered that didn't exist before.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/benchmark-cli.test.ts                 | 43 ++++++++++++++++++++++
 test/helpers/providers/gpt.ts              |  8 ++--
 test/publish-dry-run.test.ts               |  1 -
 test/skill-e2e-benchmark-providers.test.ts | 15 +++++++-
 test/taste-engine.test.ts                  | 36 ++++++++++++++----
 5 files changed, 90 insertions(+), 13 deletions(-)

diff --git a/test/benchmark-cli.test.ts b/test/benchmark-cli.test.ts
index 3b798abf..2932ec0c 100644
--- a/test/benchmark-cli.test.ts
+++ b/test/benchmark-cli.test.ts
@@ -97,6 +97,49 @@ describe('gstack-model-benchmark --dry-run', () => {
     }
   });
 
+  test('NOT READY path fires when auth env vars are stripped', () => {
+    // On a dev machine with full auth configured, the default --dry-run output
+    // shows OK for every provider with credentials. Strip auth env vars AND
+    // point HOME at an empty temp dir so adapters can't find file-based creds.
+    // This test exists to catch regressions where the NOT READY branch itself
+    // breaks (crash, missing remediation hint, wrong message format).
+    //
+    // Note: claude adapter's `os.homedir()` call is sometimes cached by Bun and
+    // doesn't always pick up the HOME override, so this test asserts only on
+    // gpt + gemini adapters where HOME redirection reliably makes the adapter's
+    // credentials-path check fail. Two adapters hitting NOT READY with full
+    // remediation messages is sufficient coverage for the branch.
+    const emptyHome = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-noauth-home-'));
+    try {
+      const minimalEnv: Record<string, string> = {
+        PATH: process.env.PATH ?? '',
+        TERM: process.env.TERM ?? 'xterm',
+        HOME: emptyHome,
+      };
+      const result = spawnSync('bun', ['run', BIN, '--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run'], {
+        cwd: ROOT,
+        env: minimalEnv,
+        encoding: 'utf-8',
+        timeout: 15000,
+      });
+      expect(result.status).toBe(0);
+      const out = result.stdout?.toString() ?? '';
+      // gpt + gemini must report NOT READY in this clean env (their auth check
+      // reads paths under the overridden HOME).
+      expect(out).toMatch(/gpt:\s+NOT READY/);
+      expect(out).toMatch(/gemini:\s+NOT READY/);
+      // Every NOT READY line must include a concrete remediation hint so users
+      // can resolve the missing auth. This is the regression we care about.
+      const notReadyLines = out.split('\n').filter(l => l.includes('NOT READY'));
+      expect(notReadyLines.length).toBeGreaterThanOrEqual(2);
+      for (const line of notReadyLines) {
+        expect(line).toMatch(/(install|Install|login|export|Run|Log in)/);
+      }
+    } finally {
+      fs.rmSync(emptyHome, { recursive: true, force: true });
+    }
+  });
+
   test('long prompt is truncated in dry-run display', () => {
     const longPrompt = 'x'.repeat(200);
     const r = run(['--prompt', longPrompt, '--dry-run']);
diff --git a/test/helpers/providers/gpt.ts b/test/helpers/providers/gpt.ts
index 846b8be0..07757dc2 100644
--- a/test/helpers/providers/gpt.ts
+++ b/test/helpers/providers/gpt.ts
@@ -31,9 +31,11 @@ export class GptAdapter implements ProviderAdapter {
 
   async run(opts: RunOpts): Promise<RunResult> {
     const start = Date.now();
-    // `--skip-git-repo-check` lets codex run in arbitrary working directories
-    // (temp dirs, non-git paths) without the interactive trust prompt. Benchmarks
-    // often don't care about the workdir — they're just running a prompt.
+    // `-s read-only` is load-bearing safety. With `--skip-git-repo-check` we
+    // bypass codex's interactive trust prompt for unknown directories (benchmarks
+    // often run in temp dirs / non-git paths), so the read-only sandbox is now
+    // the only boundary preventing codex from mutating the workdir. If you ever
+    // remove `-s read-only`, drop `--skip-git-repo-check` too.
     const args = ['exec', opts.prompt, '-C', opts.workdir, '-s', 'read-only', '--skip-git-repo-check', '--json'];
     if (opts.model) args.push('-m', opts.model);
     if (opts.extraArgs) args.push(...opts.extraArgs);
diff --git a/test/publish-dry-run.test.ts b/test/publish-dry-run.test.ts
index fe040f0b..4088f447 100644
--- a/test/publish-dry-run.test.ts
+++ b/test/publish-dry-run.test.ts
@@ -27,7 +27,6 @@ beforeEach(() => {
   // structure: copy the bin into sandbox/bin/, write a controlled skills.json at the root.
   sandbox = fs.mkdtempSync(path.join(os.tmpdir(), 'publish-sandbox-'));
   fs.mkdirSync(path.join(sandbox, 'bin'));
-  fs.mkdirSync(path.join(sandbox, 'test', 'helpers'), { recursive: true });
   binCopy = path.join(sandbox, 'bin', 'gstack-publish');
   fs.copyFileSync(BIN, binCopy);
   fs.chmodSync(binCopy, 0o755);
diff --git a/test/skill-e2e-benchmark-providers.test.ts b/test/skill-e2e-benchmark-providers.test.ts
index a9368a6b..8220f11a 100644
--- a/test/skill-e2e-benchmark-providers.test.ts
+++ b/test/skill-e2e-benchmark-providers.test.ts
@@ -18,7 +18,7 @@
  *   - Multi-turn tool-using prompts — our single-turn smoke skips `toolCalls > 0`
  */
 
-import { describe, test, expect } from 'bun:test';
+import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
 import { ClaudeAdapter } from './helpers/providers/claude';
 import { GptAdapter } from './helpers/providers/gpt';
 import { GeminiAdapter } from './helpers/providers/gemini';
@@ -41,9 +41,20 @@ const gpt = new GptAdapter();
 const gemini = new GeminiAdapter();
 
 // Use a temp working directory so provider CLIs can't accidentally touch the repo.
-const workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-e2e-'));
+// Created in beforeAll / cleaned in afterAll so concurrent CI runs don't leak.
+let workdir: string;
 
 describeIfEvals('multi-provider benchmark adapters (live)', () => {
+  beforeAll(() => {
+    workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-e2e-'));
+  });
+
+  afterAll(() => {
+    if (workdir && fs.existsSync(workdir)) {
+      fs.rmSync(workdir, { recursive: true, force: true });
+    }
+  });
+
   test('claude: available() returns structured ok/reason', async () => {
     const check = await claude.available();
     expect(check).toHaveProperty('ok');
diff --git a/test/taste-engine.test.ts b/test/taste-engine.test.ts
index 52a39f89..e92a69da 100644
--- a/test/taste-engine.test.ts
+++ b/test/taste-engine.test.ts
@@ -208,13 +208,16 @@ describe('taste-engine: dimension extraction', () => {
     expect(p.dimensions.aesthetics.approved[0].value).toBe('brutalist');
   });
 
-  test('value matching is case-insensitive', () => {
+  test('value matching is case-insensitive (first casing wins)', () => {
     run(['approved', 'v1', '--reason', 'fonts: Geist']);
     run(['approved', 'v2', '--reason', 'fonts: GEIST']);
     const p = readProfile();
     // Should merge into a single entry
     expect(p.dimensions.fonts.approved).toHaveLength(1);
     expect(p.dimensions.fonts.approved[0].approved_count).toBe(2);
+    // Canonical value is the first-arrival casing. bumpPref() stores value on
+    // insert and never overwrites on subsequent bumps.
+    expect(p.dimensions.fonts.approved[0].value).toBe('Geist');
   });
 
   test('unknown dimension labels are silently ignored', () => {
@@ -231,14 +234,33 @@ describe('taste-engine: dimension extraction', () => {
 
 describe('taste-engine: session cap', () => {
   test('sessions truncate to last 50 entries (FIFO)', () => {
-    for (let i = 0; i < 55; i++) {
-      run(['approved', `v${i}`, '--reason', 'fonts: Geist']);
-    }
+    // Seed the profile with 50 existing sessions, then one real CLI call writes
+    // the 51st → the oldest must drop. Avoids 55 sequential subprocess spawns.
+    const seededSessions = Array.from({ length: 50 }, (_, i) => ({
+      ts: new Date(Date.now() - (50 - i) * 1000).toISOString(),
+      action: 'approved' as const,
+      variant: `seed-${i}`,
+    }));
+    writeProfile({
+      version: 1,
+      updated_at: new Date().toISOString(),
+      dimensions: {
+        fonts: { approved: [], rejected: [] },
+        colors: { approved: [], rejected: [] },
+        layouts: { approved: [], rejected: [] },
+        aesthetics: { approved: [], rejected: [] },
+      },
+      sessions: seededSessions,
+    });
+    const r = run(['approved', 'new-one', '--reason', 'fonts: Geist']);
+    expect(r.status).toBe(0);
     const p = readProfile();
     expect(p.sessions).toHaveLength(50);
-    // Last 5 should be preserved, first 5 dropped
-    expect(p.sessions[0].variant).toBe('v5');
-    expect(p.sessions[49].variant).toBe('v54');
+    // The oldest seed (seed-0) must have been evicted FIFO; seed-1 is now first;
+    // the new entry is last.
+    expect(p.sessions[0].variant).toBe('seed-1');
+    expect(p.sessions[48].variant).toBe('seed-49');
+    expect(p.sessions[49].variant).toBe('new-one');
   });
 });