Files
gstack/test/codex-hardening.test.ts
Garry Tan 4bf5ae12c0 feat: codex/autoplan hardening + Apple Silicon coreutils auto-install
Hardens /codex and /autoplan against silent failures surfaced by the #972
stdin fix and #1003 Apple Silicon codesign. Six-layer defense:

1. **Multi-signal auth probe** (new Step 0.5 / Phase 0.5): env-based auth
   ($CODEX_API_KEY, $OPENAI_API_KEY) OR file-based auth
   (${CODEX_HOME:-~/.codex}/auth.json). Rejects false negatives that the
   old file-only check produced for CI / platform-engineer users.

2. **Timeout wrapper** around every codex exec / codex review invocation:
   gtimeout → timeout → unwrapped fallback chain. On exit 124, surfaces
   common causes + actionable next step. Guards against model-API stalls
   not covered by the #972 stdin fix.

3. **Stderr capture in Challenge mode** (codex/SKILL.md.tmpl:208):
   2>/dev/null → 2>$TMPERR. Post-invocation grep for auth/login/unauthorized
   surfaces errors that were previously dropped silently.

4. **Completeness check** in the Python JSON parser: tracks turn.completed
   events and warns on zero (possible mid-stream disconnect).

5. **Version warning** for known-bad Codex CLI (0.120.0-0.120.2, the range
   that introduced the stdin deadlock #972 fixes). Anchored regex
   `(^|[^0-9.])0\.120\.(0|1|2)([^0-9.]|$)` prevents 0.120.10 / 0.120.20
   false positives.

6. **Failure telemetry + operational learnings**: codex_timeout,
   codex_auth_failed, codex_cli_missing, codex_version_warning events
   land in ~/.gstack/analytics/skill-usage.jsonl behind the existing
   telemetry opt-in. On timeout (exit 124), auto-logs an operational
   learning via gstack-learnings-log so future /investigate sessions
   surface prior hang patterns automatically.

**Shared helper** (bin/gstack-codex-probe): consolidates all four pieces
(auth probe, version check, timeout wrapper, telemetry logger) into one
bash file that /codex and /autoplan source. Namespace-prefixed
(_gstack_codex_*) with a unit test that verifies sourcing does not leak
shell options into the caller. pathRewrites in host configs rewrite
~/.claude/skills/gstack → $GSTACK_ROOT for Codex, $GSTACK_BIN for
Factory/Cursor/etc.

**Apple Silicon coreutils auto-install** (setup:264): macOS lacks GNU
timeout by default; Homebrew's coreutils installs it as gtimeout to
avoid shadowing BSD utilities. ./setup now auto-installs coreutils on
Darwin (arch-agnostic — applies to Intel + Apple Silicon) when neither
gtimeout nor timeout is present. Opt-out via GSTACK_SKIP_COREUTILS=1
for CI, managed machines, or offline envs.

**25 deterministic unit tests** (test/codex-hardening.test.ts):
- 8 auth probe combinations (env precedence, whitespace, alternate
  $CODEX_HOME, corrupt file paths)
- 10 version regex cases including 0.120.10 false-positive guards
  and v-prefixed / multiline output
- 4 timeout wrapper + namespace hygiene (bash -n, gtimeout
  preference, set-option leak check)
- 3 telemetry payload schema checks (confirms env values + auth
  tokens never leak into emitted events)

**1 periodic-tier E2E** (test/skill-e2e-autoplan-dual-voice.test.ts):
gates the /autoplan dual-voice path — asserts both Claude subagent
and Codex voices produce output in Phase 1, OR that [codex-unavailable]
is logged when Codex is absent. ~\$1/run, not a CI gate.

Golden baseline + gen-skill-docs exclusion list updated for the new
codex path references and the 16 < /dev/null redirects from #972.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 12:10:21 +08:00

367 lines
12 KiB
TypeScript

import { describe, test, expect } from 'bun:test';
import { spawnSync } from 'child_process';
import * as path from 'path';
import * as fs from 'fs';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
const PROBE = path.join(ROOT, 'bin/gstack-codex-probe');
// Run a bash snippet that sources the probe and evaluates one of its functions.
// Controlled env + optional tempdir for HOME isolation.
function runProbe(opts: {
snippet: string;
env?: Record<string, string | undefined>;
home?: string;
}): { stdout: string; stderr: string; status: number } {
const env: Record<string, string> = {
// Start from a clean env so test-env vars from the parent don't leak in.
PATH: process.env.PATH ?? '',
_TEL: 'off',
};
if (opts.home) env.HOME = opts.home;
// Apply overrides; undefined means "remove".
if (opts.env) {
for (const [k, v] of Object.entries(opts.env)) {
if (v === undefined) {
delete env[k];
} else {
env[k] = v;
}
}
}
const script = `set +e\nsource "${PROBE}"\n${opts.snippet}\n`;
const result = spawnSync('bash', ['-c', script], {
env,
stdio: ['pipe', 'pipe', 'pipe'],
timeout: 5000,
});
return {
stdout: (result.stdout ?? '').toString(),
stderr: (result.stderr ?? '').toString(),
status: result.status ?? -1,
};
}
function tempHome(): string {
return fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-codex-probe-home-'));
}
describe('gstack-codex-probe: auth probe', () => {
test('CODEX_API_KEY set → AUTH_OK', () => {
const home = tempHome();
try {
const r = runProbe({
snippet: '_gstack_codex_auth_probe',
env: { CODEX_API_KEY: 'sk-test' },
home,
});
expect(r.stdout.trim()).toBe('AUTH_OK');
expect(r.status).toBe(0);
} finally {
fs.rmSync(home, { recursive: true, force: true });
}
});
test('OPENAI_API_KEY set → AUTH_OK', () => {
const home = tempHome();
try {
const r = runProbe({
snippet: '_gstack_codex_auth_probe',
env: { OPENAI_API_KEY: 'sk-openai' },
home,
});
expect(r.stdout.trim()).toBe('AUTH_OK');
expect(r.status).toBe(0);
} finally {
fs.rmSync(home, { recursive: true, force: true });
}
});
test('${CODEX_HOME:-~/.codex}/auth.json exists → AUTH_OK', () => {
const home = tempHome();
try {
fs.mkdirSync(path.join(home, '.codex'), { recursive: true });
fs.writeFileSync(path.join(home, '.codex', 'auth.json'), '{}');
const r = runProbe({ snippet: '_gstack_codex_auth_probe', home });
expect(r.stdout.trim()).toBe('AUTH_OK');
expect(r.status).toBe(0);
} finally {
fs.rmSync(home, { recursive: true, force: true });
}
});
test('no env + no file → AUTH_FAILED with exit 1', () => {
const home = tempHome();
try {
const r = runProbe({ snippet: '_gstack_codex_auth_probe', home });
expect(r.stdout.trim()).toBe('AUTH_FAILED');
expect(r.status).toBe(1);
} finally {
fs.rmSync(home, { recursive: true, force: true });
}
});
test('both CODEX_API_KEY and OPENAI_API_KEY set → AUTH_OK', () => {
const home = tempHome();
try {
const r = runProbe({
snippet: '_gstack_codex_auth_probe',
env: { CODEX_API_KEY: 'k1', OPENAI_API_KEY: 'k2' },
home,
});
expect(r.stdout.trim()).toBe('AUTH_OK');
expect(r.status).toBe(0);
} finally {
fs.rmSync(home, { recursive: true, force: true });
}
});
test('empty-string env vars + no file → AUTH_FAILED', () => {
const home = tempHome();
try {
const r = runProbe({
snippet: '_gstack_codex_auth_probe',
env: { CODEX_API_KEY: '', OPENAI_API_KEY: '' },
home,
});
expect(r.stdout.trim()).toBe('AUTH_FAILED');
expect(r.status).toBe(1);
} finally {
fs.rmSync(home, { recursive: true, force: true });
}
});
test('whitespace-only env vars + no file → AUTH_FAILED', () => {
const home = tempHome();
try {
const r = runProbe({
snippet: '_gstack_codex_auth_probe',
env: { CODEX_API_KEY: ' ', OPENAI_API_KEY: '\t\n' },
home,
});
expect(r.stdout.trim()).toBe('AUTH_FAILED');
expect(r.status).toBe(1);
} finally {
fs.rmSync(home, { recursive: true, force: true });
}
});
test('alternate $CODEX_HOME → checks the alternate path', () => {
const home = tempHome();
const altCodex = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-alt-codex-'));
try {
fs.writeFileSync(path.join(altCodex, 'auth.json'), '{}');
const r = runProbe({
snippet: '_gstack_codex_auth_probe',
env: { CODEX_HOME: altCodex },
home,
});
expect(r.stdout.trim()).toBe('AUTH_OK');
expect(r.status).toBe(0);
} finally {
fs.rmSync(home, { recursive: true, force: true });
fs.rmSync(altCodex, { recursive: true, force: true });
}
});
});
// --- Group 2: Version check -------------------------------------------------
// Stub `codex --version` by putting a fake `codex` executable on PATH.
function tempStubCodex(versionOutput: string, bool_command_fails = false): {
dir: string;
pathEntry: string;
} {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-codex-stub-'));
const bin = path.join(dir, 'codex');
const script = bool_command_fails
? '#!/bin/bash\nexit 1\n'
: `#!/bin/bash\nif [ "$1" = "--version" ]; then printf '%s' ${JSON.stringify(versionOutput)}; fi\n`;
fs.writeFileSync(bin, script);
fs.chmodSync(bin, 0o755);
return { dir, pathEntry: dir };
}
function runVersionCheck(versionOutput: string): string {
const stub = tempStubCodex(versionOutput);
try {
const r = runProbe({
snippet: '_gstack_codex_version_check',
env: { PATH: `${stub.pathEntry}:${process.env.PATH}` },
});
return r.stdout + r.stderr;
} finally {
fs.rmSync(stub.dir, { recursive: true, force: true });
}
}
describe('gstack-codex-probe: version check (anchored regex per Tension I)', () => {
// Matches (should WARN)
test('codex-cli 0.120.0 → WARN', () => {
const out = runVersionCheck('codex-cli 0.120.0\n');
expect(out).toContain('WARN:');
expect(out).toContain('0.120.0');
});
test('codex-cli 0.120.1 → WARN', () => {
const out = runVersionCheck('codex-cli 0.120.1\n');
expect(out).toContain('WARN:');
});
test('codex-cli 0.120.2 → WARN', () => {
const out = runVersionCheck('codex-cli 0.120.2\n');
expect(out).toContain('WARN:');
});
// Does NOT match (should be silent)
test('codex-cli 0.116.0 → OK (no warn)', () => {
const out = runVersionCheck('codex-cli 0.116.0\n');
expect(out).not.toContain('WARN:');
});
test('codex-cli 0.121.0 → OK (no warn)', () => {
const out = runVersionCheck('codex-cli 0.121.0\n');
expect(out).not.toContain('WARN:');
});
test('codex-cli 0.120.10 → OK (anchored regex prevents substring match)', () => {
const out = runVersionCheck('codex-cli 0.120.10\n');
expect(out).not.toContain('WARN:');
});
test('codex-cli 0.120.20 → OK (anchored regex prevents substring match)', () => {
const out = runVersionCheck('codex-cli 0.120.20\n');
expect(out).not.toContain('WARN:');
});
test('codex-cli 0.120.2-beta → WARN (still a bad release family)', () => {
// 0.120.2-beta: regex (^|[^0-9.])0\.120\.(0|1|2)([^0-9.]|$) treats '-' as a
// non-digit/non-dot boundary → matches.
const out = runVersionCheck('codex-cli 0.120.2-beta\n');
expect(out).toContain('WARN:');
});
test('empty output → OK (silent, no crash)', () => {
const out = runVersionCheck('');
expect(out).not.toContain('WARN:');
});
test('v-prefixed and multiline handled', () => {
const out = runVersionCheck('codex-cli v0.116.0\nsome debug line\n');
expect(out).not.toContain('WARN:');
});
});
// --- Group 3: Timeout wrapper + namespace hygiene ---------------------------
describe('gstack-codex-probe: timeout wrapper + namespace hygiene', () => {
test('bin/gstack-codex-probe is syntactically valid bash (bash -n)', () => {
const result = spawnSync('bash', ['-n', PROBE], { timeout: 5000 });
expect(result.status).toBe(0);
});
test('timeout wrapper executes command directly when neither binary present', () => {
// Clear PATH to simulate no timeout/gtimeout. Use only /bin for `echo`.
const r = runProbe({
snippet: `_gstack_codex_timeout_wrapper 5 echo hello_world`,
env: { PATH: '/bin:/usr/bin' }, // these usually lack gtimeout; timeout may exist on linux
});
// Regardless of whether timeout is on this PATH, echo hello_world should succeed.
expect(r.stdout.trim()).toBe('hello_world');
});
test('timeout wrapper resolves gtimeout preferentially when on PATH', () => {
// Create a stub gtimeout that prints a sentinel so we can verify it was chosen.
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-gto-stub-'));
try {
const stub = path.join(dir, 'gtimeout');
fs.writeFileSync(stub, '#!/bin/bash\necho gtimeout_chosen_$1\n');
fs.chmodSync(stub, 0o755);
const r = runProbe({
snippet: `_gstack_codex_timeout_wrapper 5 echo nope`,
env: { PATH: `${dir}:/bin:/usr/bin` },
});
expect(r.stdout.trim()).toBe('gtimeout_chosen_5');
} finally {
fs.rmSync(dir, { recursive: true, force: true });
}
});
test('sourcing probe does NOT set errexit/trap/IFS in caller shell (namespace hygiene)', () => {
// Capture `set -o` output before and after sourcing. Any drift means the
// probe polluted the caller.
const r = runProbe({
snippet: `
BEFORE=$(set -o | sort)
source "${PROBE}" # source again to catch accumulation
AFTER=$(set -o | sort)
if [ "$BEFORE" = "$AFTER" ]; then
echo "CLEAN"
else
echo "POLLUTED"
diff <(echo "$BEFORE") <(echo "$AFTER")
fi
`,
});
expect(r.stdout).toContain('CLEAN');
});
});
// --- Group 4: Telemetry event emission --------------------------------------
describe('gstack-codex-probe: telemetry event emission', () => {
test('_gstack_codex_log_event writes jsonl when _TEL != off', () => {
const home = tempHome();
try {
const r = runProbe({
snippet: `_gstack_codex_log_event "codex_test_event" "42"; cat "$HOME/.gstack/analytics/skill-usage.jsonl"`,
env: { _TEL: 'community' },
home,
});
expect(r.stdout).toContain('"event":"codex_test_event"');
expect(r.stdout).toContain('"duration_s":"42"');
} finally {
fs.rmSync(home, { recursive: true, force: true });
}
});
test('_gstack_codex_log_event skips write when _TEL = off', () => {
const home = tempHome();
try {
runProbe({
snippet: `_gstack_codex_log_event "codex_test_event" "99"`,
env: { _TEL: 'off' },
home,
});
const jsonl = path.join(home, '.gstack/analytics/skill-usage.jsonl');
expect(fs.existsSync(jsonl)).toBe(false);
} finally {
fs.rmSync(home, { recursive: true, force: true });
}
});
test('payload never contains prompt content, env values, or auth tokens (schema check)', () => {
const home = tempHome();
try {
const r = runProbe({
snippet: `_gstack_codex_log_event "codex_test_event" "1"; cat "$HOME/.gstack/analytics/skill-usage.jsonl"`,
env: {
_TEL: 'community',
CODEX_API_KEY: 'SECRET_TOKEN_SHOULD_NOT_LEAK',
OPENAI_API_KEY: 'ANOTHER_SECRET',
},
home,
});
// The emitted JSON payload should ONLY have {skill, event, duration_s, ts}.
// Specifically, it must not contain any env values or auth material.
expect(r.stdout).not.toContain('SECRET_TOKEN_SHOULD_NOT_LEAK');
expect(r.stdout).not.toContain('ANOTHER_SECRET');
// Schema: exactly these keys, in any order.
const parsed = JSON.parse(r.stdout.trim().split('\n').pop() ?? '{}');
expect(Object.keys(parsed).sort()).toEqual(['duration_s', 'event', 'skill', 'ts']);
} finally {
fs.rmSync(home, { recursive: true, force: true });
}
});
});