mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-18 07:40:09 +02:00
Merge remote-tracking branch 'origin/main' into garrytan/trunk-land-skill
# Conflicts: # CHANGELOG.md # VERSION # package.json
This commit is contained in:
@@ -347,7 +347,13 @@ describe('runAgentSdkTest — options propagation', () => {
|
||||
expect(opts.permissionMode).toBe('bypassPermissions');
|
||||
expect(opts.allowDangerouslySkipPermissions).toBe(true);
|
||||
expect(opts.settingSources).toEqual([]);
|
||||
expect(opts.env).toEqual({ ANTHROPIC_API_KEY: 'fake' });
|
||||
// env is the COMPLETE hermetic env with the per-test override merged
|
||||
// last — partial pass-through was the documented SDK auth-breaker
|
||||
// (Options.env replaces the child's entire environment).
|
||||
expect(opts.env?.ANTHROPIC_API_KEY).toBe('fake');
|
||||
expect(opts.env?.PATH).toBeTruthy();
|
||||
expect(opts.env?.CLAUDE_CONFIG_DIR).toMatch(/\/\.claude$/);
|
||||
expect(opts.env?.GSTACK_HOME).toContain('gstack-home');
|
||||
expect(opts.pathToClaudeCodeExecutable).toBe('/fake/path/claude');
|
||||
});
|
||||
|
||||
|
||||
@@ -0,0 +1,91 @@
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as path from 'path';
|
||||
import * as fs from 'fs';
|
||||
|
||||
// Static tripwires for the B2 render-isolation wiring. These fail CI if a
|
||||
// refactor drops a load-bearing line, re-introducing the "dev-setup dirties
|
||||
// tracked SKILL.md" drift (or worse, leaks the skip-guard into real installs).
|
||||
// Parent of the directory that contains this test file.
const ROOT = path.resolve(import.meta.dir, '..');

/** Reads a file addressed relative to ROOT and returns its contents as UTF-8 text. */
function read(rel: string): string {
  const absolute = path.join(ROOT, rel);
  return fs.readFileSync(absolute, 'utf-8');
}
|
||||
|
||||
describe('dev-setup: worktree stays canonical', () => {
  // Loaded once when the suite is collected. Every test below is a static
  // text probe over the script; nothing is executed.
  const script = read('bin/dev-setup');

  test('passes GSTACK_SKIP_GBRAIN_REGEN inline on the nested setup call', () => {
    // The guard must appear as a per-command prefix on the nested setup call.
    const inlineGuardedCall = 'GSTACK_SKIP_GBRAIN_REGEN=1 "$GSTACK_LINK/setup"';
    expect(script).toContain(inlineGuardedCall);
  });

  test('never exports GSTACK_SKIP_GBRAIN_REGEN (would leak into other setup paths)', () => {
    const exportedGuard = /export\s+GSTACK_SKIP_GBRAIN_REGEN/;
    expect(script).not.toMatch(exportedGuard);
  });

  test('renders the :user variant into an out-dir, not in place', () => {
    // Both the flag and the destination directory have to be present.
    for (const needle of ['--out-dir', '.claude/gstack-rendered']) {
      expect(script).toContain(needle);
    }
  });

  test('gates the render on gstack-gbrain-detect --is-ok', () => {
    const gateFlag = '--is-ok';
    expect(script).toContain(gateFlag);
  });
});
|
||||
|
||||
describe('setup: honors GSTACK_SKIP_GBRAIN_REGEN', () => {
  // Text of the top-level setup script, read once at collection time.
  const setupScript = read('setup');

  test('skips the in-place :user regen when the guard is set', () => {
    const guardExpansion = '${GSTACK_SKIP_GBRAIN_REGEN:-}';
    expect(setupScript).toContain(guardExpansion);

    // The guard must wrap the in-place render, not the detection persist:
    // look for the skip message within 600 characters of the first mention
    // of the variable name.
    const guardAt = setupScript.indexOf('GSTACK_SKIP_GBRAIN_REGEN');
    const vicinity = setupScript.slice(guardAt, guardAt + 600);
    expect(vicinity).toContain('leaving tracked SKILL.md canonical');
  });

  test('uses a PID-unique detection tmp (no concurrent clobber)', () => {
    const pidScopedTmp = '$DETECTION_FILE.$$.tmp';
    expect(setupScript).toContain(pidScopedTmp);
  });

  test('gates detection on the shared --is-ok check', () => {
    const sharedGate = '"$DETECT_BIN" --is-ok';
    expect(setupScript).toContain(sharedGate);
  });
});
|
||||
|
||||
describe('gen-skill-docs: section rewrite is gated on --out-dir', () => {
  // Source text of the generator script; inspected statically, never imported.
  const generator = read('scripts/gen-skill-docs.ts');

  test('rewriteSectionBase is a no-op without --out-dir', () => {
    const declaration = 'function rewriteSectionBase';
    expect(generator).toContain(declaration);

    // Only the first 400 characters from the declaration onward are examined.
    const start = generator.indexOf(declaration);
    const head = generator.slice(start, start + 400);

    const expectedFragments = [
      'if (!OUT_DIR) return content',
      'sections', // surgical: regex targets only /sections/ paths
    ];
    for (const fragment of expectedFragments) {
      expect(head).toContain(fragment);
    }
  });
});
|
||||
|
||||
describe('dev-teardown: removes the untracked render', () => {
  // Teardown script text, read once when the suite is collected.
  const script = read('bin/dev-teardown');

  test('rm -rf the gstack-rendered dir', () => {
    expect(script).toContain('gstack-rendered');

    // The removal must go through the RENDER_DIR variable on a single line.
    const removesRenderDir = /rm -rf .*RENDER_DIR/;
    expect(script).toMatch(removesRenderDir);
  });
});
|
||||
|
||||
describe('.gitignore: render dir is declared untracked', () => {
  test('.claude/gstack-rendered/ is ignored', () => {
    // Substring probe over the ignore file; the entry includes its trailing slash.
    const ignoreRules = read('.gitignore');
    expect(ignoreRules).toContain('.claude/gstack-rendered/');
  });
});
|
||||
|
||||
describe('dev-skill: refreshes the render on template change', () => {
  // Source text of the dev-skill watcher script; inspected statically.
  const watcher = read('scripts/dev-skill.ts');

  test('re-renders the :user variant into the workspace render dir', () => {
    // The render dir name and both generator flags must all be present.
    const requiredFragments = ['gstack-rendered', '--out-dir', '--respect-detection'];
    for (const fragment of requiredFragments) {
      expect(watcher).toContain(fragment);
    }
  });

  test('only refreshes when the render dir already exists (never creates it during plain dev)', () => {
    const existenceGuard = 'fs.existsSync(RENDER_DIR)';
    expect(watcher).toContain(existenceGuard);
  });
});
|
||||
@@ -0,0 +1,96 @@
|
||||
/**
|
||||
* Drift guards for the committed diagram-render bundle (eng-review D2).
|
||||
*
|
||||
* Tier 1 (always, free, <50ms): dist/diagram-render.html must hash to exactly
|
||||
* what dist/BUILD_INFO.json records, and the BUILD_INFO dependency pins must
|
||||
* match package.json. Catches hand-edited dist files and "bumped the pin,
|
||||
* forgot to rebuild" commits.
|
||||
*
|
||||
* Tier 2 (deep, CI / post-install only): rebuild from source and compare
|
||||
* hashes. Skipped when lib/diagram-render/node_modules is absent (fresh
|
||||
* clone without `bun install` in that dir) or when the local bun version
|
||||
* differs from the one recorded at build time (minifier output is only
|
||||
* guaranteed deterministic within a bun version).
|
||||
*/
|
||||
import { describe, expect, test } from "bun:test";
|
||||
import { createHash } from "node:crypto";
|
||||
import { existsSync } from "node:fs";
|
||||
import path from "node:path";
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, "..", "lib", "diagram-render");
|
||||
const DIST_HTML = path.join(ROOT, "dist", "diagram-render.html");
|
||||
const BUILD_INFO = path.join(ROOT, "dist", "BUILD_INFO.json");
|
||||
|
||||
describe("diagram-render bundle drift", () => {
|
||||
test("dist hash matches BUILD_INFO (tamper check)", async () => {
  // Tier 1: the committed bundle must match the digest and size recorded
  // next to it in BUILD_INFO.json.
  const bundle = await Bun.file(DIST_HTML).text();
  const recorded = await Bun.file(BUILD_INFO).json();

  const hasher = createHash("sha256");
  hasher.update(bundle);
  const digest = hasher.digest("hex");

  expect(digest).toBe(recorded.sha256);

  const sizeInBytes = Buffer.byteLength(bundle);
  expect(sizeInBytes).toBe(recorded.bytes);
});
|
||||
|
||||
test("BUILD_INFO dependency pins match package.json", async () => {
  // The pins recorded at build time must deep-equal the package's declared
  // dependencies.
  const recorded = await Bun.file(BUILD_INFO).json();
  const manifestPath = path.join(ROOT, "package.json");
  const manifest = await Bun.file(manifestPath).json();

  expect(recorded.deps).toEqual(manifest.dependencies);
});
|
||||
|
||||
test("BUILD_INFO srcSha256 matches src on disk (edited-src-forgot-rebuild guard)", async () => {
  // The deep rebuild check below needs node_modules, which CI doesn't
  // install for this nested package — this tier-1.5 fingerprint catches a
  // src edit committed without a rebuild using nothing but file hashes.
  const recorded = await Bun.file(BUILD_INFO).json();

  // Order matters: the fingerprint is one running hash over entry.ts
  // followed by build.ts.
  const fingerprintInputs = [
    path.join(ROOT, "src", "entry.ts"),
    path.join(ROOT, "scripts", "build.ts"),
  ];

  const hasher = createHash("sha256");
  for (const file of fingerprintInputs) {
    hasher.update(await Bun.file(file).text());
  }

  expect(hasher.digest("hex")).toBe(recorded.srcSha256);
});
|
||||
|
||||
test("bundle font stack matches print-css (text-measurement drift guard)", async () => {
  // Every family print-css composes into the body stack must appear in the
  // bundle's PRINT_SANS literal — mermaid measures text with these fonts and
  // the print document lays it out with print-css's; drift = overflowing
  // labels (eng-review D3).
  const latinFamilies = ["Helvetica", "Liberation Sans", "Arial"];
  const cjkFamilies = ["Hiragino Kaku Gothic ProN", "Noto Sans CJK JP", "Microsoft YaHei"];
  const emojiFamilies = ["Apple Color Emoji", "Segoe UI Emoji", "Noto Color Emoji"];
  const requiredFamilies = [...latinFamilies, ...cjkFamilies, ...emojiFamilies];

  const entryPath = path.join(ROOT, "src", "entry.ts");
  const entrySource = await Bun.file(entryPath).text();

  for (const family of requiredFamilies) {
    expect(entrySource).toContain(family);
  }
});
|
||||
|
||||
test("page invariants: module script, base href, escaped terminators, error trap", async () => {
  // Static checks on the committed bundle: it must keep its module script
  // tag, its fixed base href, and the head error trap.
  const html = await Bun.file(DIST_HTML).text();
  expect(html).toContain('<script type="module">');
  expect(html).toContain('<base href="https://gstack-render.localhost/">');
  expect(html).toContain("window.__errors = []");

  // The inline module must contain no live script end tag other than the
  // page's own closers: head error-trap closer + module closer.
  //
  // An HTML parser ends a script element at `</script` followed by
  // whitespace, `/` or `>`, matched case-insensitively. Counting only the
  // exact lowercase `</script>` would let `</SCRIPT>` or `</script >` inside
  // the bundled code slip past this tripwire, so every terminating form is
  // counted.
  const closers = html.match(/<\/script[\t\n\f\r />]/gi) ?? [];
  expect(closers.length).toBe(2);
});
|
||||
|
||||
// Gate for the deep rebuild test: it needs the nested package's
// node_modules and a BUILD_INFO whose bunVersion equals the running Bun.
const nodeModules = path.join(ROOT, "node_modules");

// Best effort: if BUILD_INFO cannot be loaded the flag stays false and the
// deep test is skipped rather than failing here.
const builtWithSameBun: boolean = (() => {
  try {
    const recorded = require(BUILD_INFO);
    return recorded.bunVersion === Bun.version;
  } catch {
    return false;
  }
})();

const canDeepCheck = existsSync(nodeModules) && builtWithSameBun;
|
||||
|
||||
test.skipIf(!canDeepCheck)(
  "deep: fresh build reproduces committed dist",
  async () => {
    // Tier 2: rebuild from source and require the recorded bundle hash to be
    // unchanged. Note the build script runs in place under ROOT.
    const before = await Bun.file(BUILD_INFO).json();

    // Spawn the bun binary that is running this test rather than whichever
    // `bun` is first on PATH. canDeepCheck compared BUILD_INFO's bunVersion
    // against Bun.version of THIS process, so a different bun on PATH would
    // defeat that gate and could produce a spurious hash mismatch.
    const proc = Bun.spawnSync([process.execPath, "run", "scripts/build.ts"], { cwd: ROOT });
    expect(proc.exitCode).toBe(0);

    const after = await Bun.file(BUILD_INFO).json();
    expect(after.sha256).toBe(before.sha256);
  },
  // Per-test timeout in milliseconds; a full bundle build is slow.
  60000,
);
|
||||
});
|
||||
+16
-3
@@ -53,6 +53,13 @@ echo "REPO_MODE: $REPO_MODE"
|
||||
_SESSION_KIND=$(~/.claude/skills/gstack/bin/gstack-session-kind 2>/dev/null || echo "interactive")
|
||||
case "$_SESSION_KIND" in spawned|headless|interactive) ;; *) _SESSION_KIND="interactive" ;; esac
|
||||
echo "SESSION_KIND: $_SESSION_KIND"
|
||||
# Conductor host: AskUserQuestion is unreliable here (native disabled, MCP
|
||||
# variant flaky), so skills render decisions as prose instead of calling the
|
||||
# tool. Gated on !headless so an eval/CI run INSIDE Conductor (GSTACK_HEADLESS)
|
||||
# still BLOCKs rather than rendering prose to nobody.
|
||||
if [ "$_SESSION_KIND" != "headless" ] && { [ -n "${CONDUCTOR_WORKSPACE_PATH:-}" ] || [ -n "${CONDUCTOR_PORT:-}" ]; }; then
|
||||
echo "CONDUCTOR_SESSION: true"
|
||||
fi
|
||||
_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
|
||||
echo "LAKE_INTRO: $_LAKE_SEEN"
|
||||
_TEL=$(~/.claude/skills/gstack/bin/gstack-config get telemetry 2>/dev/null || true)
|
||||
@@ -302,7 +309,9 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions:
|
||||
|
||||
"AskUserQuestion" can resolve to two tools at runtime: the **host MCP variant** (e.g. `mcp__conductor__AskUserQuestion` — appears in your tool list when the host registers it) or the **native** Claude Code tool.
|
||||
|
||||
**Rule:** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
|
||||
**Conductor rule (read before the MCP rule):** if `CONDUCTOR_SESSION: true` was echoed by the preamble, do NOT call AskUserQuestion at all — neither native nor any `mcp__*__AskUserQuestion` variant. Render EVERY decision brief as the **prose form** below and STOP. This is proactive, not a reaction to a failure: Conductor disables native AUQ and its MCP variant is flaky (it returns `[Tool result missing due to internal error]`), so prose is the reliable path. **Auto-decide preferences still apply first:** if a `[plan-tune auto-decide] <id> → <option>` result has already surfaced for a question, proceed with that option (no prose). Because in Conductor you go straight to prose without ever calling the tool, this auto-decide-first ordering is enforced HERE, not only by the PreToolUse hook. When you render a Conductor prose brief, also capture it with `bin/gstack-question-log` (the PostToolUse capture hook never fires on a prose path, so `/plan-tune` history/learning depends on this call).
|
||||
|
||||
**Rule (non-Conductor):** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
|
||||
|
||||
If AskUserQuestion is unavailable (no variant in your tool list) OR a call to it fails, do NOT silently auto-decide or write the decision to the plan file as a substitute. Follow the **failure fallback** below.
|
||||
|
||||
@@ -324,7 +333,11 @@ Tell three outcomes apart:
|
||||
2. **Completeness scores per choice** — explicit `Completeness: X/10` on EACH choice (10 complete, 7 happy-path, 3 shortcut); use the kind-note when options differ in kind not coverage, but never silently drop the score.
|
||||
3. **The recommendation and why** — a `Recommendation: <choice> because <reason>` line plus the `(recommended)` marker on that choice.
|
||||
|
||||
Layout: a `D<N>` title + a one-line note that AskUserQuestion failed and to reply with a letter; the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
|
||||
Layout: a `D<N>` title + a one-line note to reply with a letter (in Conductor this is the normal path; elsewhere it means AskUserQuestion was unavailable or errored); the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
|
||||
|
||||
**Continuation — mapping a typed reply back to a brief.** Each brief carries a stable label (`D<N>`, or `D<N>.k` in a split chain). The user references it (e.g. "3.2: B"). A bare letter maps to the single most-recent UNANSWERED brief; if more than one is open (a split chain), do NOT guess — ask which `D<N>.k` it answers. Never apply a bare letter ambiguously across a chain.
|
||||
|
||||
**One-way / destructive confirmations in prose.** When the decision is a one-way door (irreversible or destructive — delete, force-push, drop, overwrite), prose is a WEAKER gate than the tool, so make it stronger: require an explicit typed confirmation (the exact option letter or word), state plainly what is irreversible, and NEVER proceed on a vague, partial, or ambiguous reply — re-ask instead. Treat silence or "ok"/"sure" without the explicit choice as not-yet-confirmed.
|
||||
|
||||
### Format
|
||||
|
||||
@@ -408,7 +421,7 @@ Before calling AskUserQuestion, verify:
|
||||
- [ ] (recommended) label on one option (even for neutral-posture)
|
||||
- [ ] Dual-scale effort labels on effort-bearing options (human / CC)
|
||||
- [ ] Net line closes the decision
|
||||
- [ ] You are calling the tool, not writing prose — unless the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
|
||||
- [ ] You are calling the tool, not writing prose — unless `CONDUCTOR_SESSION: true` (then prose is the DEFAULT, not the tool) OR the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
|
||||
- [ ] Non-ASCII characters (CJK / accents) written directly, NOT \u-escaped
|
||||
- [ ] If you had 5+ options, you split (or batched into ≤4-groups) — did NOT drop any
|
||||
- [ ] If you split, you checked dependencies between options before firing the chain
|
||||
|
||||
+32
-3
@@ -39,6 +39,13 @@ echo "REPO_MODE: $REPO_MODE"
|
||||
_SESSION_KIND=$($GSTACK_BIN/gstack-session-kind 2>/dev/null || echo "interactive")
|
||||
case "$_SESSION_KIND" in spawned|headless|interactive) ;; *) _SESSION_KIND="interactive" ;; esac
|
||||
echo "SESSION_KIND: $_SESSION_KIND"
|
||||
# Conductor host: AskUserQuestion is unreliable here (native disabled, MCP
|
||||
# variant flaky), so skills render decisions as prose instead of calling the
|
||||
# tool. Gated on !headless so an eval/CI run INSIDE Conductor (GSTACK_HEADLESS)
|
||||
# still BLOCKs rather than rendering prose to nobody.
|
||||
if [ "$_SESSION_KIND" != "headless" ] && { [ -n "${CONDUCTOR_WORKSPACE_PATH:-}" ] || [ -n "${CONDUCTOR_PORT:-}" ]; }; then
|
||||
echo "CONDUCTOR_SESSION: true"
|
||||
fi
|
||||
_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
|
||||
echo "LAKE_INTRO: $_LAKE_SEEN"
|
||||
_TEL=$($GSTACK_BIN/gstack-config get telemetry 2>/dev/null || true)
|
||||
@@ -288,7 +295,9 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions:
|
||||
|
||||
"AskUserQuestion" can resolve to two tools at runtime: the **host MCP variant** (e.g. `mcp__conductor__AskUserQuestion` — appears in your tool list when the host registers it) or the **native** Claude Code tool.
|
||||
|
||||
**Rule:** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
|
||||
**Conductor rule (read before the MCP rule):** if `CONDUCTOR_SESSION: true` was echoed by the preamble, do NOT call AskUserQuestion at all — neither native nor any `mcp__*__AskUserQuestion` variant. Render EVERY decision brief as the **prose form** below and STOP. This is proactive, not a reaction to a failure: Conductor disables native AUQ and its MCP variant is flaky (it returns `[Tool result missing due to internal error]`), so prose is the reliable path. **Auto-decide preferences still apply first:** if a `[plan-tune auto-decide] <id> → <option>` result has already surfaced for a question, proceed with that option (no prose). Because in Conductor you go straight to prose without ever calling the tool, this auto-decide-first ordering is enforced HERE, not only by the PreToolUse hook. When you render a Conductor prose brief, also capture it with `bin/gstack-question-log` (the PostToolUse capture hook never fires on a prose path, so `/plan-tune` history/learning depends on this call).
|
||||
|
||||
**Rule (non-Conductor):** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
|
||||
|
||||
If AskUserQuestion is unavailable (no variant in your tool list) OR a call to it fails, do NOT silently auto-decide or write the decision to the plan file as a substitute. Follow the **failure fallback** below.
|
||||
|
||||
@@ -310,7 +319,11 @@ Tell three outcomes apart:
|
||||
2. **Completeness scores per choice** — explicit `Completeness: X/10` on EACH choice (10 complete, 7 happy-path, 3 shortcut); use the kind-note when options differ in kind not coverage, but never silently drop the score.
|
||||
3. **The recommendation and why** — a `Recommendation: <choice> because <reason>` line plus the `(recommended)` marker on that choice.
|
||||
|
||||
Layout: a `D<N>` title + a one-line note that AskUserQuestion failed and to reply with a letter; the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
|
||||
Layout: a `D<N>` title + a one-line note to reply with a letter (in Conductor this is the normal path; elsewhere it means AskUserQuestion was unavailable or errored); the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
|
||||
|
||||
**Continuation — mapping a typed reply back to a brief.** Each brief carries a stable label (`D<N>`, or `D<N>.k` in a split chain). The user references it (e.g. "3.2: B"). A bare letter maps to the single most-recent UNANSWERED brief; if more than one is open (a split chain), do NOT guess — ask which `D<N>.k` it answers. Never apply a bare letter ambiguously across a chain.
|
||||
|
||||
**One-way / destructive confirmations in prose.** When the decision is a one-way door (irreversible or destructive — delete, force-push, drop, overwrite), prose is a WEAKER gate than the tool, so make it stronger: require an explicit typed confirmation (the exact option letter or word), state plainly what is irreversible, and NEVER proceed on a vague, partial, or ambiguous reply — re-ask instead. Treat silence or "ok"/"sure" without the explicit choice as not-yet-confirmed.
|
||||
|
||||
### Format
|
||||
|
||||
@@ -394,7 +407,7 @@ Before calling AskUserQuestion, verify:
|
||||
- [ ] (recommended) label on one option (even for neutral-posture)
|
||||
- [ ] Dual-scale effort labels on effort-bearing options (human / CC)
|
||||
- [ ] Net line closes the decision
|
||||
- [ ] You are calling the tool, not writing prose — unless the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
|
||||
- [ ] You are calling the tool, not writing prose — unless `CONDUCTOR_SESSION: true` (then prose is the DEFAULT, not the tool) OR the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
|
||||
- [ ] Non-ASCII characters (CJK / accents) written directly, NOT \u-escaped
|
||||
- [ ] If you had 5+ options, you split (or batched into ≤4-groups) — did NOT drop any
|
||||
- [ ] If you split, you checked dependencies between options before firing the chain
|
||||
@@ -1277,6 +1290,22 @@ EVAL_JUDGE_TIER=full EVAL_VERBOSE=1 bin/test-lane --eval test/evals/<suite>_eval
|
||||
|
||||
If multiple suites need to run, run them sequentially (each needs a test lane). If the first suite fails, stop immediately — don't burn API cost on remaining suites.
|
||||
|
||||
**Long eval suites (30+ min): launch detached so a turn boundary can't kill them.**
|
||||
A plain backgrounded eval lives in the harness's process group and dies to a
|
||||
SIGTERM ("polite quit") on a turn boundary, a stopped monitor, or an interruption
|
||||
(observed mid-`/ship`: `script terminated by signal SIGTERM`). Run it through
|
||||
`$GSTACK_ROOT/bin/gstack-detach` instead — it survives in its own
|
||||
session, serializes against other worktrees via a machine lock (no API
|
||||
saturation), and writes a guaranteed `### gstack-detach EXIT=<code> ###` sentinel:
|
||||
|
||||
```bash
|
||||
$GSTACK_ROOT/bin/gstack-detach --label ship-evals --lock gstack-evals --timeout 5400 -- <project eval command>
|
||||
```
|
||||
|
||||
Then poll the printed log path; break on the `EXIT=` sentinel (covers both pass
|
||||
and crash — silence is never success). The detached run survives even if your
|
||||
poller is reaped.
|
||||
|
||||
**4. Check results:**
|
||||
|
||||
- **If any eval fails:** Show the failures, the cost dashboard, and **STOP**. Do not proceed.
|
||||
|
||||
+67
-14
@@ -41,6 +41,13 @@ echo "REPO_MODE: $REPO_MODE"
|
||||
_SESSION_KIND=$($GSTACK_BIN/gstack-session-kind 2>/dev/null || echo "interactive")
|
||||
case "$_SESSION_KIND" in spawned|headless|interactive) ;; *) _SESSION_KIND="interactive" ;; esac
|
||||
echo "SESSION_KIND: $_SESSION_KIND"
|
||||
# Conductor host: AskUserQuestion is unreliable here (native disabled, MCP
|
||||
# variant flaky), so skills render decisions as prose instead of calling the
|
||||
# tool. Gated on !headless so an eval/CI run INSIDE Conductor (GSTACK_HEADLESS)
|
||||
# still BLOCKs rather than rendering prose to nobody.
|
||||
if [ "$_SESSION_KIND" != "headless" ] && { [ -n "${CONDUCTOR_WORKSPACE_PATH:-}" ] || [ -n "${CONDUCTOR_PORT:-}" ]; }; then
|
||||
echo "CONDUCTOR_SESSION: true"
|
||||
fi
|
||||
_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
|
||||
echo "LAKE_INTRO: $_LAKE_SEEN"
|
||||
_TEL=$($GSTACK_BIN/gstack-config get telemetry 2>/dev/null || true)
|
||||
@@ -290,7 +297,9 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions:
|
||||
|
||||
"AskUserQuestion" can resolve to two tools at runtime: the **host MCP variant** (e.g. `mcp__conductor__AskUserQuestion` — appears in your tool list when the host registers it) or the **native** Claude Code tool.
|
||||
|
||||
**Rule:** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
|
||||
**Conductor rule (read before the MCP rule):** if `CONDUCTOR_SESSION: true` was echoed by the preamble, do NOT call AskUserQuestion at all — neither native nor any `mcp__*__AskUserQuestion` variant. Render EVERY decision brief as the **prose form** below and STOP. This is proactive, not a reaction to a failure: Conductor disables native AUQ and its MCP variant is flaky (it returns `[Tool result missing due to internal error]`), so prose is the reliable path. **Auto-decide preferences still apply first:** if a `[plan-tune auto-decide] <id> → <option>` result has already surfaced for a question, proceed with that option (no prose). Because in Conductor you go straight to prose without ever calling the tool, this auto-decide-first ordering is enforced HERE, not only by the PreToolUse hook. When you render a Conductor prose brief, also capture it with `bin/gstack-question-log` (the PostToolUse capture hook never fires on a prose path, so `/plan-tune` history/learning depends on this call).
|
||||
|
||||
**Rule (non-Conductor):** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
|
||||
|
||||
If AskUserQuestion is unavailable (no variant in your tool list) OR a call to it fails, do NOT silently auto-decide or write the decision to the plan file as a substitute. Follow the **failure fallback** below.
|
||||
|
||||
@@ -312,7 +321,11 @@ Tell three outcomes apart:
|
||||
2. **Completeness scores per choice** — explicit `Completeness: X/10` on EACH choice (10 complete, 7 happy-path, 3 shortcut); use the kind-note when options differ in kind not coverage, but never silently drop the score.
|
||||
3. **The recommendation and why** — a `Recommendation: <choice> because <reason>` line plus the `(recommended)` marker on that choice.
|
||||
|
||||
Layout: a `D<N>` title + a one-line note that AskUserQuestion failed and to reply with a letter; the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
|
||||
Layout: a `D<N>` title + a one-line note to reply with a letter (in Conductor this is the normal path; elsewhere it means AskUserQuestion was unavailable or errored); the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
|
||||
|
||||
**Continuation — mapping a typed reply back to a brief.** Each brief carries a stable label (`D<N>`, or `D<N>.k` in a split chain). The user references it (e.g. "3.2: B"). A bare letter maps to the single most-recent UNANSWERED brief; if more than one is open (a split chain), do NOT guess — ask which `D<N>.k` it answers. Never apply a bare letter ambiguously across a chain.
|
||||
|
||||
**One-way / destructive confirmations in prose.** When the decision is a one-way door (irreversible or destructive — delete, force-push, drop, overwrite), prose is a WEAKER gate than the tool, so make it stronger: require an explicit typed confirmation (the exact option letter or word), state plainly what is irreversible, and NEVER proceed on a vague, partial, or ambiguous reply — re-ask instead. Treat silence or "ok"/"sure" without the explicit choice as not-yet-confirmed.
|
||||
|
||||
### Format
|
||||
|
||||
@@ -396,7 +409,7 @@ Before calling AskUserQuestion, verify:
|
||||
- [ ] (recommended) label on one option (even for neutral-posture)
|
||||
- [ ] Dual-scale effort labels on effort-bearing options (human / CC)
|
||||
- [ ] Net line closes the decision
|
||||
- [ ] You are calling the tool, not writing prose — unless the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
|
||||
- [ ] You are calling the tool, not writing prose — unless `CONDUCTOR_SESSION: true` (then prose is the DEFAULT, not the tool) OR the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
|
||||
- [ ] Non-ASCII characters (CJK / accents) written directly, NOT \u-escaped
|
||||
- [ ] If you had 5+ options, you split (or batched into ≤4-groups) — did NOT drop any
|
||||
- [ ] If you split, you checked dependencies between options before firing the chain
|
||||
@@ -1279,6 +1292,22 @@ EVAL_JUDGE_TIER=full EVAL_VERBOSE=1 bin/test-lane --eval test/evals/<suite>_eval
|
||||
|
||||
If multiple suites need to run, run them sequentially (each needs a test lane). If the first suite fails, stop immediately — don't burn API cost on remaining suites.
|
||||
|
||||
**Long eval suites (30+ min): launch detached so a turn boundary can't kill them.**
|
||||
A plain backgrounded eval lives in the harness's process group and dies to a
|
||||
SIGTERM ("polite quit") on a turn boundary, a stopped monitor, or an interruption
|
||||
(observed mid-`/ship`: `script terminated by signal SIGTERM`). Run it through
|
||||
`$GSTACK_ROOT/bin/gstack-detach` instead — it survives in its own
|
||||
session, serializes against other worktrees via a machine lock (no API
|
||||
saturation), and writes a guaranteed `### gstack-detach EXIT=<code> ###` sentinel:
|
||||
|
||||
```bash
|
||||
$GSTACK_ROOT/bin/gstack-detach --label ship-evals --lock gstack-evals --timeout 5400 -- <project eval command>
|
||||
```
|
||||
|
||||
Then poll the printed log path; break on the `EXIT=` sentinel (covers both pass
|
||||
and crash — silence is never success). The detached run survives even if your
|
||||
poller is reaped.
|
||||
|
||||
**4. Check results:**
|
||||
|
||||
- **If any eval fails:** Show the failures, the cost dashboard, and **STOP**. Do not proceed.
|
||||
@@ -2332,23 +2361,47 @@ For each comment in `comments`:
|
||||
|
||||
Every diff gets adversarial review from both Claude and Codex. LOC is not a proxy for risk — a 5-line auth change can be critical.
|
||||
|
||||
**Detect diff size and tool availability:**
|
||||
**Detect diff size:**
|
||||
|
||||
```bash
|
||||
DIFF_BASE=$(git merge-base origin/<base> HEAD)
|
||||
DIFF_INS=$(git diff "$DIFF_BASE" --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0")
|
||||
DIFF_DEL=$(git diff "$DIFF_BASE" --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0")
|
||||
DIFF_TOTAL=$((DIFF_INS + DIFF_DEL))
|
||||
command -v codex >/dev/null 2>&1 && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
|
||||
# Legacy opt-out — only gates Codex passes, Claude always runs
|
||||
OLD_CFG=$($GSTACK_ROOT/bin/gstack-config get codex_reviews 2>/dev/null || true)
|
||||
echo "DIFF_SIZE: $DIFF_TOTAL"
|
||||
echo "OLD_CFG: ${OLD_CFG:-not_set}"
|
||||
```
|
||||
|
||||
If `OLD_CFG` is `disabled`: skip Codex passes only. Claude adversarial subagent still runs (it's free and fast). Jump to the "Claude adversarial subagent" section.
|
||||
**Detect the Codex master switch + tool availability:**
|
||||
|
||||
**User override:** If the user explicitly requested "full review", "structured review", or "P1 gate", also run the Codex structured review regardless of diff size.
|
||||
```bash
|
||||
# Codex preflight: one block (functions sourced here don't persist to later blocks).
|
||||
_TEL=$($GSTACK_ROOT/bin/gstack-config get telemetry 2>/dev/null || echo off)
|
||||
_CODEX_CFG=$($GSTACK_ROOT/bin/gstack-config get codex_reviews 2>/dev/null || echo enabled)
|
||||
source $GSTACK_ROOT/bin/gstack-codex-probe 2>/dev/null || true
|
||||
if [ "$_CODEX_CFG" = "disabled" ]; then
|
||||
_CODEX_MODE="disabled"
|
||||
elif ! command -v codex >/dev/null 2>&1; then
|
||||
_CODEX_MODE="not_installed"; _gstack_codex_log_event "codex_cli_missing" 2>/dev/null || true
|
||||
elif ! _gstack_codex_auth_probe >/dev/null 2>&1; then
|
||||
_CODEX_MODE="not_authed"; _gstack_codex_log_event "codex_auth_failed" 2>/dev/null || true
|
||||
else
|
||||
_CODEX_MODE="ready"; _gstack_codex_version_check 2>/dev/null || true
|
||||
fi
|
||||
echo "CODEX_MODE: $_CODEX_MODE"
|
||||
```
|
||||
|
||||
Branch on the echoed `CODEX_MODE`:
|
||||
- **`disabled`** — the user turned Codex reviews off (`codex_reviews=disabled`). Skip the Codex passes only; the Claude adversarial subagent below STILL runs (it is free and fast). Print: "Codex passes skipped (codex_reviews disabled) — running Claude adversarial only."
|
||||
- **`not_installed`** — Codex CLI absent. Print: "Codex not installed — using Claude subagent. Install for cross-model coverage: `npm install -g @openai/codex`." Fall back to the Claude subagent path.
|
||||
- **`not_authed`** — installed but no credentials. Print: "Codex installed but not authenticated — using Claude subagent. Run `codex login` or set `$CODEX_API_KEY`." Fall back to the Claude subagent path.
|
||||
- **`ready`** — run the Codex pass below.
|
||||
|
||||
For this diff-review path, `CODEX_MODE: disabled` means skip the Codex passes ONLY — the
|
||||
Claude adversarial subagent below still runs (it's free and fast). `ready` runs the Codex
|
||||
passes; `not_installed` / `not_authed` skip them with the printed note and continue with
|
||||
Claude only.
|
||||
|
||||
**User override:** If the user explicitly requested "full review", "structured review", or "P1 gate", also run the Codex structured review regardless of diff size (still requires `CODEX_MODE: ready`).
|
||||
|
||||
---
|
||||
|
||||
@@ -2369,9 +2422,9 @@ If the subagent fails or times out: "Claude adversarial subagent unavailable. Co
|
||||
|
||||
---
|
||||
|
||||
### Codex adversarial challenge (always runs when available)
|
||||
### Codex adversarial challenge (runs whenever `CODEX_MODE: ready`)
|
||||
|
||||
If Codex is available AND `OLD_CFG` is NOT `disabled`:
|
||||
If `CODEX_MODE` is `ready`:
|
||||
|
||||
```bash
|
||||
TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX)
|
||||
@@ -2393,13 +2446,13 @@ Present the full output verbatim. This is informational — it never blocks ship
|
||||
|
||||
**Cleanup:** Run `rm -f "$TMPERR_ADV"` after processing.
|
||||
|
||||
If Codex is NOT available: "Codex CLI not found — running Claude adversarial only. Install Codex for cross-model coverage: `npm install -g @openai/codex`"
|
||||
If `CODEX_MODE` is `not_installed` / `not_authed` / `disabled`: the preflight already printed the reason; run Claude adversarial only.
|
||||
|
||||
---
|
||||
|
||||
### Codex structured review (large diffs only, 200+ lines)
|
||||
|
||||
If `DIFF_TOTAL >= 200` AND Codex is available AND `OLD_CFG` is NOT `disabled`:
|
||||
If `DIFF_TOTAL >= 200` AND `CODEX_MODE` is `ready`:
|
||||
|
||||
```bash
|
||||
TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX)
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from "bun:test";
|
||||
import { execFileSync } from "child_process";
|
||||
import { execFileSync, spawnSync } from "child_process";
|
||||
import {
|
||||
mkdtempSync,
|
||||
mkdirSync,
|
||||
@@ -47,6 +47,16 @@ function runDetect(env: Partial<NodeJS.ProcessEnv>): string {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Invoke the detect script in `--is-ok` gate mode and report how it exited.
 *
 * @param env - Variables layered on top of the current `process.env` for the child.
 * @returns The child's exit status, or `1` when `spawnSync` reports no status
 *   (`status` is null/undefined), so callers always get a number and this never throws.
 */
function runIsOk(env: Partial<NodeJS.ProcessEnv>): number {
  const childEnv = { ...process.env, ...env };
  const { status } = spawnSync(BUN_BIN, ["run", DETECT_BIN, "--is-ok"], {
    env: childEnv,
    stdio: ["ignore", "pipe", "pipe"],
    timeout: 15_000,
  });
  // A missing status is folded into the generic failure code 1.
  const exitCode = status ?? 1;
  return exitCode;
}
|
||||
|
||||
interface DetectShape {
|
||||
gbrain_on_path: boolean;
|
||||
gbrain_version: string | null;
|
||||
@@ -244,3 +254,66 @@ exit 0
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe("bin/gstack-gbrain-detect --is-ok — live gate", () => {
  /** Hand `body` a fresh scratch directory and remove it afterwards, pass or fail. */
  const withScratchDir = (body: (scratch: string) => void): void => {
    const scratch = mkdtempSync(join(tmpdir(), "detect-isok-"));
    try {
      body(scratch);
    } finally {
      rmSync(scratch, { recursive: true, force: true });
    }
  };

  it("exits non-zero when gbrain is not on PATH (no-cli)", () => {
    withScratchDir((scratch) => {
      // PATH lists only the system dirs, so no `gbrain` binary is reachable.
      const exit = runIsOk({
        HOME: scratch,
        PATH: "/usr/bin:/bin",
        GSTACK_HOME: scratch,
        GSTACK_DETECT_NO_CACHE: "1",
      });
      expect(exit).not.toBe(0);
    });
  });

  it("exits 0 when a fake gbrain reports a healthy engine (ok)", () => {
    withScratchDir((scratch) => {
      const bindir = join(scratch, "bin");
      const home = join(scratch, "home");
      const configDir = join(home, ".gbrain");

      mkdirSync(bindir, { recursive: true });
      mkdirSync(configDir, { recursive: true });
      writeFileSync(join(configDir, "config.json"), JSON.stringify({ engine: "pglite" }));

      // Stub `gbrain`: answers --version, `sources list` and `doctor ...`
      // with canned output and always exits 0.
      const stubScript = `#!/bin/sh
case "$1 $2" in
"--version ") echo "gbrain 0.33.1.0"; exit 0 ;;
"sources list") echo '{"sources":[]}'; exit 0 ;;
"doctor "*) echo '{"status":"ok","checks":[]}'; exit 0 ;;
esac
exit 0
`;
      const stubPath = join(bindir, "gbrain");
      writeFileSync(stubPath, stubScript);
      chmodSync(stubPath, 0o755);

      // The stub's directory goes first on PATH so it is the gbrain that gets found.
      const exit = runIsOk({
        HOME: home,
        PATH: `${bindir}:/usr/bin:/bin`,
        GSTACK_HOME: scratch,
        GSTACK_DETECT_NO_CACHE: "1",
      });
      expect(exit).toBe(0);
    });
  });

  it("exit code agrees with the JSON gbrain_local_status (no skew)", () => {
    // Same environment for both surfaces: the JSON report first, then the
    // --is-ok exit code; "exit 0" must hold exactly when the status is "ok".
    withScratchDir((scratch) => {
      const sharedEnv = {
        HOME: scratch,
        PATH: "/usr/bin:/bin",
        GSTACK_HOME: scratch,
        GSTACK_DETECT_NO_CACHE: "1",
      };
      const report = JSON.parse(runDetect(sharedEnv)) as DetectShape;
      const exit = runIsOk(sharedEnv);
      expect(exit === 0).toBe(report.gbrain_local_status === "ok");
    });
  });
});
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as path from 'path';
|
||||
import * as fs from 'fs';
|
||||
|
||||
// Static tripwires for the C (machine-wide) render in `gstack-config
|
||||
// gbrain-refresh`. The render mutates the shared global install, so the guards
|
||||
// that stop it from touching the wrong directory are load-bearing — these fail
|
||||
// CI if any guard is dropped.
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const SRC = fs.readFileSync(path.join(ROOT, 'bin', 'gstack-config'), 'utf-8');
|
||||
|
||||
/**
 * Extract the text of the `ok)` branch that follows the `gbrain-refresh)` case
 * label in the gstack-config source, so assertions are scoped to that branch
 * instead of matching unrelated text elsewhere in the file.
 *
 * @returns The slice of SRC from the first `ok)` after `gbrain-refresh)` up to
 *   (not including) the next `;;`.
 * @throws Error when any of the three markers cannot be found.
 */
function okBranch(): string {
  const caseAt = SRC.indexOf('gbrain-refresh)');
  const okAt = SRC.indexOf('ok)', caseAt);
  const terminatorAt = SRC.indexOf(';;', okAt);

  const allFound = [caseAt, okAt, terminatorAt].every((pos) => pos >= 0);
  if (!allFound) {
    throw new Error('Could not locate gbrain-refresh ok) branch');
  }
  return SRC.slice(okAt, terminatorAt);
}
|
||||
|
||||
describe('gstack-config gbrain-refresh: machine-wide render guards', () => {
  const branch = okBranch();

  // One row per tripwire: every `needles` entry must appear verbatim in the
  // ok) branch, and `pattern` (when present) must match it.
  type Guard = { title: string; needles?: readonly string[]; pattern?: RegExp };
  const guards: readonly Guard[] = [
    { title: 'targets the global install', needles: ['$HOME/.claude/skills/gstack'] },
    {
      title: 'refuses a symlinked install (would dirty a dev worktree)',
      pattern: /\[ -L "\$INSTALL_DIR" \]/,
    },
    {
      title: 'verifies it is a real gstack clone before mutating it',
      needles: ['$INSTALL_DIR/VERSION', '$INSTALL_DIR/package.json'],
    },
    { title: 'requires bun on PATH', needles: ['command -v bun'] },
    {
      title: 'renders the :user variant in place into the install',
      needles: ['gen:skill-docs:user --host claude'],
    },
    {
      title: 'is self-documenting about the reset --hard / re-run cycle',
      needles: ['reset --hard', 'gbrain-refresh'],
    },
  ];

  for (const { title, needles = [], pattern } of guards) {
    test(title, () => {
      for (const needle of needles) {
        expect(branch).toContain(needle);
      }
      if (pattern) {
        expect(branch).toMatch(pattern);
      }
    });
  }
});
|
||||
|
||||
describe('CLAUDE.md: deploy section documents the re-run', () => {
  test('notes re-running gbrain-refresh after reset --hard', () => {
    const claudeMdPath = path.join(ROOT, 'CLAUDE.md');
    const doc = fs.readFileSync(claudeMdPath, 'utf-8');

    // The deploy heading must exist before its section can be inspected.
    const headingAt = doc.indexOf('## Deploying to the active skill');
    expect(headingAt).toBeGreaterThan(-1);

    // Only the 1200 characters starting at the heading are searched.
    const sectionEnd = headingAt + 1200;
    const deploySection = doc.slice(headingAt, sectionEnd);
    expect(deploySection).toContain('gbrain-refresh');
  });
});
|
||||
@@ -0,0 +1,84 @@
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { spawnSync } from 'child_process';
|
||||
import { createHash } from 'crypto';
|
||||
import * as path from 'path';
|
||||
import * as fs from 'fs';
|
||||
import * as os from 'os';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
// Contract for `gen-skill-docs --out-dir` (B2 render isolation). Detection is
// forced ON through a crafted GSTACK_HOME so the outcome does not depend on
// whether gbrain is installed on the machine running the test. Checked:
//   (a) the worktree SKILL.md bytes are untouched (source stays canonical),
//   (b) the rendered SKILL.md contains the inline Brain Context Load block,
//   (c) its section refs point into the out-dir, not ~/.claude/skills/gstack,
//   (d) bin/ refs still point at the global install,
//   (e) the rendered section file contains the Save Results to Brain block.
describe('gen-skill-docs --out-dir (B2 render isolation)', () => {
  /** SHA-256 hex digest of a file's raw bytes. */
  const hashFile = (p: string): string => {
    const hasher = createHash('sha256');
    hasher.update(fs.readFileSync(p));
    return hasher.digest('hex');
  };

  /** Create a fresh directory under the OS temp dir with the given name prefix. */
  const makeScratch = (prefix: string): string => fs.mkdtempSync(path.join(os.tmpdir(), prefix));

  test('renders :user to out-dir, rewrites section paths, leaves worktree canonical', () => {
    const tmpHome = makeScratch('gstack-home-');
    const outDir = makeScratch('gstack-out-');
    const worktreeSkill = path.join(ROOT, 'ship', 'SKILL.md');
    const digestBefore = hashFile(worktreeSkill);
    try {
      // Seed the detection file that --respect-detection consults, marking gbrain as present.
      const detection = { gbrain_local_status: 'ok', gbrain_version: '9.9.9' };
      fs.writeFileSync(path.join(tmpHome, 'gbrain-detection.json'), JSON.stringify(detection));

      const cliArgs = [
        'run',
        'scripts/gen-skill-docs.ts',
        '--respect-detection',
        '--host',
        'claude',
        '--out-dir',
        outDir,
      ];
      const childEnv = { ...process.env, GSTACK_HOME: tmpHome };
      const { status } = spawnSync('bun', cliArgs, {
        cwd: ROOT,
        encoding: 'utf-8',
        timeout: 120_000,
        env: childEnv,
      });
      expect(status).toBe(0);

      const renderedSkill = path.join(outDir, 'ship', 'SKILL.md');
      const renderedSection = path.join(outDir, 'ship', 'sections', 'adversarial.md');
      expect(fs.existsSync(renderedSkill)).toBe(true);
      const rendered = fs.readFileSync(renderedSkill, 'utf-8');

      // (a) the worktree copy has the same digest as before the render
      expect(hashFile(worktreeSkill)).toBe(digestBefore);

      // (b) the inline block made it into the rendered SKILL.md
      expect(rendered).toContain('Brain Context Load');

      // (c) section refs target the out-dir, and none still target the install
      expect(rendered).toContain(`${outDir}/ship/sections/`);
      expect(rendered).not.toContain('~/.claude/skills/gstack/ship/sections/');

      // (d) bin refs are left alone and keep pointing at the global install
      expect(rendered).toContain('~/.claude/skills/gstack/bin/');

      // (e) the SAVE block is present in the rendered section file
      expect(fs.existsSync(renderedSection)).toBe(true);
      expect(fs.readFileSync(renderedSection, 'utf-8')).toContain('Save Results to Brain');
    } finally {
      fs.rmSync(tmpHome, { recursive: true, force: true });
      fs.rmSync(outDir, { recursive: true, force: true });
    }
  });

  test('global extras (proactive-suggestions.json) are NOT written in out-dir mode', () => {
    const outDir = makeScratch('gstack-out-');
    try {
      const cliArgs = ['run', 'scripts/gen-skill-docs.ts', '--host', 'claude', '--out-dir', outDir];
      const { status } = spawnSync('bun', cliArgs, { cwd: ROOT, encoding: 'utf-8', timeout: 120_000 });
      expect(status).toBe(0);

      // proactive-suggestions.json belongs to a repo path, so nothing by that
      // name may show up under the out-dir.
      const strayExtras = path.join(outDir, 'scripts', 'proactive-suggestions.json');
      expect(fs.existsSync(strayExtras)).toBe(false);
    } finally {
      fs.rmSync(outDir, { recursive: true, force: true });
    }
  });
});
|
||||
@@ -0,0 +1,96 @@
|
||||
/**
|
||||
* gstack-detach — the eval-infra robustness guard. Pins the four killer fixes:
|
||||
* 1. SIGTERM-proof detachment (runs in a different process group, outlives the launcher)
|
||||
* 2. run-scoped default log path (no shared-/tmp collision between worktrees)
|
||||
* 3. watchdog --timeout (no silent hang) + guaranteed EXIT sentinel
|
||||
* 4. machine-wide --lock serialization (no cross-worktree API saturation)
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { spawnSync, spawn } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const DETACH = path.join(ROOT, 'bin', 'gstack-detach');
|
||||
|
||||
/**
 * Ask `ps` for the process-group id of the current process.
 *
 * @returns The trimmed `ps -o pgid= -p <pid>` output, or `''` when `ps`
 *   produced no stdout.
 */
function ownPgid(): string {
  const pid = String(process.pid);
  const { stdout } = spawnSync('ps', ['-o', 'pgid=', '-p', pid], { encoding: 'utf-8' });
  return stdout ? stdout.trim() : '';
}
|
||||
/**
 * Synchronously poll `pred` until it returns true or `ms` milliseconds elapse.
 *
 * Between polls the thread is parked by running the external `sleep 0.2`
 * command. After the deadline `pred` is evaluated one final time, so a
 * condition that became true during the last sleep is still observed.
 *
 * @param pred - Condition to poll.
 * @param ms - Time budget in milliseconds.
 * @returns true as soon as `pred` holds; otherwise the result of the final check.
 */
function waitFor(pred: () => boolean, ms: number): boolean {
  for (const deadline = Date.now() + ms; Date.now() < deadline; spawnSync('sleep', ['0.2'])) {
    if (pred()) return true;
  }
  return pred();
}
|
||||
/**
 * Report whether the file at `p` currently contains `needle`.
 *
 * Any error while reading (for example the file does not exist yet) is
 * treated as "not found" rather than propagated.
 *
 * @param p - Path of the log file to read as UTF-8.
 * @param needle - Substring to look for.
 * @returns true when the file was read and contains `needle`, false otherwise.
 */
function logHas(p: string, needle: string): boolean {
  try {
    const contents = fs.readFileSync(p, 'utf-8');
    return contents.indexOf(needle) !== -1;
  } catch {
    return false;
  }
}
|
||||
|
||||
// End-to-end checks for bin/gstack-detach. Each test launches the real script
// and then inspects the log file it writes.
describe('gstack-detach', () => {
  test('detaches (different pgid), returns immediately, completes, writes EXIT sentinel', () => {
    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'gd-'));
    const log = path.join(dir, 'run.log');
    try {
      const t0 = Date.now();
      const r = spawnSync(DETACH, ['--log', log, '--', 'bash', '-c', 'sleep 2; echo body-ran'], { encoding: 'utf-8', timeout: 10000 });
      const elapsed = Date.now() - t0;
      expect(r.status).toBe(0);
      expect(r.stdout).toContain(`gstack-detach LOG ${log}`);
      // The launcher must return well before the 2s body finishes (non-blocking).
      expect(elapsed).toBeLessThan(1500);
      expect(waitFor(() => logHas(log, '### gstack-detach EXIT=0 ###'), 8000)).toBe(true);
      // The body ran to completion after the launcher had already returned.
      expect(logHas(log, 'body-ran')).toBe(true);

      const m = fs.readFileSync(log, 'utf-8').match(/pgid=(\d+)/);
      expect(m).not.toBeNull();
      // ownPgid() returns '' when `ps` yields nothing; without this check the
      // "different group" assertion below would pass vacuously against ''.
      const selfPgid = ownPgid();
      expect(selfPgid).toMatch(/^\d+$/);
      // Detached into its own process group.
      expect(m?.[1]).not.toBe(selfPgid);
    } finally {
      fs.rmSync(dir, { recursive: true, force: true });
    }
  }, 15000);

  test('default log is run-scoped under ~/.gstack-dev/eval-runs (no shared /tmp)', () => {
    const r = spawnSync(DETACH, ['--label', 'unittest', '--', 'true'], { encoding: 'utf-8', timeout: 10000 });
    const log = (r.stdout.match(/gstack-detach LOG (\S+)/) || [])[1];
    try {
      expect(log).toContain('/.gstack-dev/eval-runs/');
      expect(path.basename(log)).toContain('unittest-');
      // File name ends in -<digits>.log (pid-unique).
      expect(path.basename(log)).toMatch(/-\d+\.log$/);
      // Result deliberately not asserted here; presumably this only gives the
      // detached run time to finish before the log is removed — TODO confirm.
      waitFor(() => logHas(log, '### gstack-detach EXIT=0 ###'), 6000);
    } finally {
      if (log) fs.rmSync(log, { force: true });
    }
  }, 12000);

  test('watchdog kills a stalled run and records EXIT=timeout (no silent hang)', () => {
    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'gd-'));
    const log = path.join(dir, 'run.log');
    try {
      // 1s watchdog against a 60s sleep: the watchdog has to fire.
      spawnSync(DETACH, ['--log', log, '--timeout', '1', '--', 'sleep', '60'], { encoding: 'utf-8', timeout: 10000 });
      expect(waitFor(() => logHas(log, '### gstack-detach EXIT=timeout ###'), 12000)).toBe(true);
      expect(logHas(log, 'WATCHDOG fired')).toBe(true);
    } finally {
      fs.rmSync(dir, { recursive: true, force: true });
    }
  }, 16000);

  test('machine --lock serializes concurrent runs (second WAITS for the first)', () => {
    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'gd-'));
    // Lock name is unique per test process so parallel test runs do not collide.
    const lock = `gstack-detach-test-${process.pid}`;
    const logA = path.join(dir, 'a.log');
    const logB = path.join(dir, 'b.log');
    try {
      // First holds the lock for ~3s; second must wait then acquire.
      spawnSync(DETACH, ['--log', logA, '--lock', lock, '--', 'sleep', '3'], { encoding: 'utf-8', timeout: 10000 });
      // Precondition: the first run really holds the lock before the second
      // starts. Asserted so a broken setup fails here, not as a confusing
      // missing-"WAITING" failure further down.
      expect(waitFor(() => logHas(logA, 'ACQUIRED'), 4000)).toBe(true);
      spawnSync(DETACH, ['--log', logB, '--lock', lock, '--', 'echo', 'second-ran'], { encoding: 'utf-8', timeout: 10000 });
      // Second should report WAITING (first still holds it) then ACQUIRE after release.
      expect(waitFor(() => logHas(logB, 'WAITING for lock'), 4000)).toBe(true);
      expect(waitFor(() => logHas(logB, '### gstack-detach EXIT=0 ###'), 12000)).toBe(true);
      expect(logHas(logB, 'second-ran')).toBe(true);
    } finally {
      fs.rmSync(dir, { recursive: true, force: true });
      fs.rmSync(path.join(os.homedir(), '.gstack', 'locks', `${lock}.lock`), { force: true });
    }
  }, 20000);

  test('rejects missing command (exit 2)', () => {
    // No `-- <command>` is supplied after the options.
    const r = spawnSync(DETACH, ['--label', 'x'], { encoding: 'utf-8' });
    expect(r.status).toBe(2);
  });
});
|
||||
@@ -36,6 +36,7 @@ import {
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { resolveClaudeBinary as resolveClaudeBinaryShared } from '../../browse/src/claude-bin';
|
||||
import { hermeticChildEnv } from './hermetic-env';
|
||||
import type { SkillTestResult } from './session-runner';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -300,12 +301,17 @@ export async function runAgentSdkTest(
|
||||
const queryImpl: QueryProvider = opts.queryProvider ?? query;
|
||||
const model = opts.model ?? 'claude-opus-4-7';
|
||||
|
||||
// NOTE on GSTACK_HEADLESS: the SDK child inherits process.env, so headless
|
||||
// classification for eval/E2E runs is set by the `test:gate` / `test:evals`
|
||||
// package.json scripts (scoped to that invocation), NOT mutated here. We must not
|
||||
// pass sdkOpts.env (it breaks the SDK auth pipeline — see CLAUDE.md) and must not
|
||||
// mutate process.env ambiently (it would leak headless into later interactive-path
|
||||
// tests in the same Bun process — Codex review finding).
|
||||
// NOTE on env: the SDK child gets the COMPLETE hermetic env (allowlist
|
||||
// scrub + ANTHROPIC_API_KEY + hermetic CLAUDE_CONFIG_DIR/GSTACK_HOME), with
|
||||
// per-test opts.env merging last. The historical "passing env: breaks SDK
|
||||
// auth" failure (old CLAUDE.md warning) was partial-env replacement —
|
||||
// Options.env REPLACES the child's entire environment, so an object without
|
||||
// the key killed auth. A complete env is safe (validated 2026-06-12 via
|
||||
// query() with hermeticChildEnv(): success, real cost, Bash tool working).
|
||||
// Do not mutate process.env ambiently here (it would leak into later
|
||||
// interactive-path tests in the same Bun process — Codex review finding);
|
||||
// ambient ANTHROPIC_API_KEY mutation by tests still works because the
|
||||
// builder reads process.env at call time.
|
||||
|
||||
let attempt = 0;
|
||||
let lastErr: unknown = null;
|
||||
@@ -356,7 +362,7 @@ export async function runAgentSdkTest(
|
||||
permissionMode: resolvedPermissionMode,
|
||||
allowDangerouslySkipPermissions: resolvedPermissionMode === 'bypassPermissions',
|
||||
settingSources: opts.settingSources ?? [],
|
||||
env: opts.env,
|
||||
env: hermeticChildEnv(opts.env),
|
||||
pathToClaudeCodeExecutable: opts.pathToClaudeCodeExecutable,
|
||||
...(hasCanUseTool ? { canUseTool: opts.canUseTool } : {}),
|
||||
};
|
||||
|
||||
@@ -145,6 +145,9 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
maxSkeletonBytes: 90_000,
|
||||
minUnionBytes: 80_000,
|
||||
mustContain: ['SCOPE EXPANSION', 'SELECTIVE EXPANSION', 'HOLD SCOPE', 'SCOPE REDUCTION'],
|
||||
// Default-on Codex outside-voice (codexPreflight block + CODEX_MODE branch
|
||||
// prose replacing the smaller opt-in question) lands this ~5.2% over baseline.
|
||||
maxSizeRatio: 1.08,
|
||||
},
|
||||
'plan-eng-review': {
|
||||
skill: 'plan-eng-review',
|
||||
@@ -162,9 +165,11 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
minUnionBytes: 70_000,
|
||||
mustContain: ['Architecture', 'Code Quality', 'Test', 'Performance'],
|
||||
// Cross-cutting preamble growth (v1.57.2.0 AUQ-failure prose fallback + the
|
||||
// decision-memory nudge + the v1.57.4.0 Boil-the-Ocean rename) lands this just
|
||||
// over the strict 1.05; small headroom for the shared preamble additions.
|
||||
maxSizeRatio: 1.06,
|
||||
// decision-memory nudge + the v1.57.4.0 Boil-the-Ocean rename) plus the
|
||||
// default-on Codex outside-voice (codexPreflight block + CODEX_MODE branch
|
||||
// prose, replacing the smaller opt-in question) land this at ~6.6% over the
|
||||
// v1.53.0.0 baseline. Headroom for those intentional additions.
|
||||
maxSizeRatio: 1.08,
|
||||
},
|
||||
'plan-design-review': {
|
||||
skill: 'plan-design-review',
|
||||
@@ -178,7 +183,9 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
gateAfterStop: 'EXIT PLAN MODE GATE',
|
||||
},
|
||||
behavioral: 'plan',
|
||||
maxSkeletonBytes: 82_000,
|
||||
// +Conductor AUQ-default-prose rule + one-way/continuation safety in the
|
||||
// always-loaded AskUserQuestion Format section.
|
||||
maxSkeletonBytes: 84_000,
|
||||
minUnionBytes: 70_000,
|
||||
mustContain: ['design', 'visual'],
|
||||
},
|
||||
@@ -194,9 +201,14 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
gateAfterStop: 'EXIT PLAN MODE GATE',
|
||||
},
|
||||
behavioral: 'plan',
|
||||
maxSkeletonBytes: 76_000,
|
||||
// +Conductor AUQ-default-prose rule + one-way/destructive prose safety +
|
||||
// continuation protocol in the always-loaded AskUserQuestion Format section.
|
||||
maxSkeletonBytes: 78_000,
|
||||
minUnionBytes: 70_000,
|
||||
mustContain: ['developer experience', 'Getting Started'],
|
||||
// Default-on Codex outside-voice (codexPreflight block + CODEX_MODE branch
|
||||
// prose replacing the smaller opt-in question) lands this ~5.7% over baseline.
|
||||
maxSizeRatio: 1.08,
|
||||
},
|
||||
'office-hours': {
|
||||
skill: 'office-hours',
|
||||
@@ -229,14 +241,20 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
gateAfterStop: undefined,
|
||||
},
|
||||
behavioral: 'prompt',
|
||||
maxSkeletonBytes: 50_000,
|
||||
// +Conductor AUQ-default-prose rule + one-way/continuation safety in the
|
||||
// always-loaded AskUserQuestion Format section.
|
||||
maxSkeletonBytes: 53_000,
|
||||
minUnionBytes: 55_000,
|
||||
mustContain: ['CHANGELOG', 'Diataxis', 'coverage'],
|
||||
// The AUQ-failure prose fallback (v1.57.2.0) adds ~2KB to every skill's
|
||||
// always-loaded preamble; on this small carved skeleton that lands at ~5.9%
|
||||
// over the pre-carve/pre-AUQ v1.53.0.0 baseline. Headroom for the
|
||||
// cross-cutting addition; all other skills keep the strict 1.05 ceiling.
|
||||
maxSizeRatio: 1.08,
|
||||
// Two intentional additions stack on this small skill: the AUQ-failure prose
|
||||
// fallback (v1.57.2.0, ~2KB to every preamble) AND the new default-on Codex
|
||||
// documentation-review section (codexPreflight + prompt + apply-gate, carved
|
||||
// into release-body so the SKELETON stays under maxSkeletonBytes). On a ~55KB
|
||||
// baseline that whole new capability is ~18.6% of union bytes. The doc review
|
||||
// is a deliberate new feature, not preamble creep; the union ceiling is raised
|
||||
// to match while the skeleton budget (53_000) still holds the always-loaded
|
||||
// cost flat.
|
||||
maxSizeRatio: 1.20,
|
||||
},
|
||||
'design-consultation': {
|
||||
skill: 'design-consultation',
|
||||
@@ -250,7 +268,9 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
gateAfterStop: undefined,
|
||||
},
|
||||
behavioral: 'prompt',
|
||||
maxSkeletonBytes: 64_000,
|
||||
// +Conductor AUQ-default-prose rule + one-way/continuation safety in the
|
||||
// always-loaded AskUserQuestion Format section.
|
||||
maxSkeletonBytes: 67_000,
|
||||
minUnionBytes: 72_000,
|
||||
mustContain: ['Typography', 'Color', 'Aesthetic Direction'],
|
||||
// Cross-cutting preamble growth (v1.57.2.0 AUQ-failure prose fallback ~2KB +
|
||||
@@ -286,7 +306,9 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
|
||||
gateAfterStop: undefined,
|
||||
},
|
||||
behavioral: 'prompt',
|
||||
maxSkeletonBytes: 70_000,
|
||||
// +Conductor AUQ-default-prose rule + one-way/continuation safety in the
|
||||
// always-loaded AskUserQuestion Format section.
|
||||
maxSkeletonBytes: 73_000,
|
||||
minUnionBytes: 72_000,
|
||||
mustContain: ['OWASP', 'STRIDE', 'daily', 'comprehensive', 'verif'],
|
||||
// cso keeps its mode-dispatch + FP-filtering phases always-loaded, so the
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
import * as fs from 'fs';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
import { hermeticChildEnv, isHermeticEnabled } from './hermetic-env';
|
||||
|
||||
/** Strip ANSI escapes for pattern-matching against visible text. */
|
||||
export function stripAnsi(s: string): string {
|
||||
@@ -120,6 +121,13 @@ export interface ClaudePtySession {
|
||||
exited(): boolean;
|
||||
/** Exit code, if known. */
|
||||
exitCode(): number | null;
|
||||
/**
|
||||
* The hermetic CLAUDE_CONFIG_DIR this session's claude was pointed at, or
|
||||
* null when EVALS_HERMETIC=0. Forensics: hermetic plan files live under
|
||||
* `<hermeticConfigDir>/plans/` (extractPlanFilePath still matches them —
|
||||
* the dir name ends in `/.claude` by contract).
|
||||
*/
|
||||
hermeticConfigDir: string | null;
|
||||
/**
|
||||
* Send SIGINT, then SIGKILL after 1s. Always safe to call multiple times.
|
||||
* Awaits process exit before resolving.
|
||||
@@ -1143,8 +1151,17 @@ export async function launchClaudePty(
|
||||
if (permissionMode !== null) {
|
||||
args.push('--permission-mode', permissionMode);
|
||||
}
|
||||
// Hermetic children get zero MCP servers; gated on the same call-time
|
||||
// check as the env scrub so EVALS_HERMETIC=0 restores operator MCP too.
|
||||
// Before opts.extraArgs so a test could theoretically supply --mcp-config.
|
||||
const hermetic = isHermeticEnabled();
|
||||
if (hermetic) args.push('--strict-mcp-config');
|
||||
if (opts.extraArgs) args.push(...opts.extraArgs);
|
||||
|
||||
// Hermetic by default (test/helpers/hermetic-env.ts): operator session
|
||||
// context never reaches the child; per-test opts.env merges last.
|
||||
const childEnv = hermeticChildEnv(opts.env);
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const proc = (Bun as any).spawn([claudePath, ...args], {
|
||||
terminal: {
|
||||
@@ -1155,7 +1172,7 @@ export async function launchClaudePty(
|
||||
},
|
||||
},
|
||||
cwd,
|
||||
env: { ...process.env, ...(opts.env ?? {}) },
|
||||
env: childEnv,
|
||||
});
|
||||
|
||||
// Track exit so waitForAny can fail fast if claude crashes.
|
||||
@@ -1307,6 +1324,7 @@ export async function launchClaudePty(
|
||||
pid: () => proc.pid as number | undefined,
|
||||
exited: () => exited,
|
||||
exitCode: () => exitCodeCaptured,
|
||||
hermeticConfigDir: hermetic ? childEnv.CLAUDE_CONFIG_DIR ?? null : null,
|
||||
close,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import { hermeticChildEnv } from './hermetic-env';
|
||||
|
||||
// --- Interfaces ---
|
||||
|
||||
@@ -201,15 +202,18 @@ export async function runCodexSkill(opts: {
|
||||
// Build codex exec command
|
||||
const args = ['exec', prompt, '--json', '-s', sandbox];
|
||||
|
||||
// Spawn codex with temp HOME so it discovers our installed skill
|
||||
// Spawn codex with temp HOME so it discovers our installed skill.
|
||||
// Hermetic scrub (test/helpers/hermetic-env.ts) with codex's auth surface
|
||||
// re-admitted: codex auths from $HOME/.codex (copied into tempHome above)
|
||||
// plus OPENAI_API_KEY/CODEX_* when present. HOME override merges last.
|
||||
const proc = Bun.spawn(['codex', ...args], {
|
||||
cwd: cwd || skillDir,
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
env: {
|
||||
...process.env,
|
||||
HOME: tempHome,
|
||||
},
|
||||
env: hermeticChildEnv(
|
||||
{ HOME: tempHome },
|
||||
{ extraAllow: ['OPENAI_API_KEY', 'CODEX_*'] },
|
||||
),
|
||||
});
|
||||
|
||||
// Race against timeout
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
*/
|
||||
|
||||
import * as path from 'path';
|
||||
import { hermeticChildEnv } from './hermetic-env';
|
||||
|
||||
// --- Interfaces ---
|
||||
|
||||
@@ -122,11 +123,16 @@ export async function runGeminiSkill(opts: {
|
||||
// Build gemini command
|
||||
const args = ['-p', prompt, '--output-format', 'stream-json', '--yolo'];
|
||||
|
||||
// Spawn gemini — uses real HOME for auth, cwd for skill discovery
|
||||
// Spawn gemini — uses real HOME for auth (~/.gemini; HOME is allowlisted),
|
||||
// cwd for skill discovery. Hermetic scrub with gemini's auth surface
|
||||
// re-admitted (previously this spawn inherited the full operator env).
|
||||
const proc = Bun.spawn(['gemini', ...args], {
|
||||
cwd: cwd || process.cwd(),
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
env: hermeticChildEnv(undefined, {
|
||||
extraAllow: ['GEMINI_API_KEY', 'GOOGLE_API_KEY', 'GOOGLE_APPLICATION_CREDENTIALS', 'GOOGLE_CLOUD_*', 'GEMINI_*'],
|
||||
}),
|
||||
});
|
||||
|
||||
// Race against timeout
|
||||
|
||||
@@ -0,0 +1,269 @@
|
||||
/**
|
||||
* Unit tests for the hermetic child-env builder. Free tier — no API calls.
|
||||
*
|
||||
* Pins three contracts:
|
||||
* 1. Allowlist semantics: contamination vars dropped, basics/auth/network
|
||||
* kept, overrides merge last, EVALS_HERMETIC=0 is byte-identical legacy.
|
||||
* 2. Seed-config shape: 20-char key suffix, trusted dirs, undefined-key safe.
|
||||
* 3. Dir lifecycle: /.claude suffix (extractPlanFilePath contract —
|
||||
* claude-pty-runner.ts:191), sync singleton reuse, pid-aware GC.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, afterAll } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import {
|
||||
buildHermeticEnv,
|
||||
buildSeedConfig,
|
||||
isHermeticEnabled,
|
||||
getHermeticDirs,
|
||||
gcStaleHermeticDirs,
|
||||
hermeticChildEnv,
|
||||
} from './hermetic-env';
|
||||
|
||||
const CONTAMINATED: NodeJS.ProcessEnv = {
|
||||
PATH: '/usr/bin', HOME: '/Users/op', TMPDIR: '/tmp', TERM: 'xterm',
|
||||
ANTHROPIC_API_KEY: 'sk-ant-0123456789abcdefghijklmn',
|
||||
ANTHROPIC_BASE_URL: 'https://proxy.example/api',
|
||||
ANTHROPIC_MODEL: 'sneaky-model-override',
|
||||
EVALS_MODEL: 'claude-sonnet-4-6',
|
||||
GITHUB_ACTIONS: 'true',
|
||||
HTTPS_PROXY: 'http://corp:3128',
|
||||
NODE_EXTRA_CA_CERTS: '/etc/corp.pem',
|
||||
CONDUCTOR_WORKSPACE_PATH: '/Users/op/conductor/ws',
|
||||
CONDUCTOR_SESSION: '1',
|
||||
CLAUDECODE: '1',
|
||||
CLAUDE_CODE_ENTRYPOINT: 'cli',
|
||||
CLAUDE_CONFIG_DIR: '/Users/op/.claude',
|
||||
GSTACK_HOME: '/Users/op/.gstack',
|
||||
GSTACK_HEADLESS_DEFAULT: 'x',
|
||||
MCP_TIMEOUT: '5000',
|
||||
GBRAIN_ENDPOINT: 'http://localhost:1234',
|
||||
OPENAI_API_KEY: 'sk-openai-secret',
|
||||
VOYAGE_API_KEY: 'vg-secret',
|
||||
GH_TOKEN: 'gho_secret',
|
||||
SSH_AUTH_SOCK: '/tmp/ssh.sock',
|
||||
GIT_AUTHOR_NAME: 'Op',
|
||||
};
|
||||
|
||||
const HERMETIC_VARS = { CLAUDE_CONFIG_DIR: '/x/.claude', GSTACK_HOME: '/x/gstack-home' };
|
||||
|
||||
describe('buildHermeticEnv allowlist', () => {
|
||||
const env = buildHermeticEnv(CONTAMINATED, HERMETIC_VARS);
|
||||
|
||||
test('keeps process basics, network, CI, and eval knobs', () => {
|
||||
expect(env.PATH).toBe('/usr/bin');
|
||||
expect(env.HOME).toBe('/Users/op');
|
||||
expect(env.EVALS_MODEL).toBe('claude-sonnet-4-6');
|
||||
expect(env.GITHUB_ACTIONS).toBe('true');
|
||||
expect(env.HTTPS_PROXY).toBe('http://corp:3128');
|
||||
expect(env.NODE_EXTRA_CA_CERTS).toBe('/etc/corp.pem');
|
||||
});
|
||||
|
||||
test('keeps named auth vars but not the broad ANTHROPIC_ prefix', () => {
|
||||
expect(env.ANTHROPIC_API_KEY).toBe(CONTAMINATED.ANTHROPIC_API_KEY);
|
||||
expect(env.ANTHROPIC_BASE_URL).toBe(CONTAMINATED.ANTHROPIC_BASE_URL);
|
||||
expect(env.ANTHROPIC_MODEL).toBeUndefined(); // behavior knob, not auth
|
||||
});
|
||||
|
||||
test('drops session-context and operator-credential vars', () => {
|
||||
for (const k of [
|
||||
'CONDUCTOR_WORKSPACE_PATH', 'CONDUCTOR_SESSION', 'CLAUDECODE',
|
||||
'CLAUDE_CODE_ENTRYPOINT', 'GSTACK_HEADLESS_DEFAULT', 'MCP_TIMEOUT',
|
||||
'GBRAIN_ENDPOINT', 'OPENAI_API_KEY', 'VOYAGE_API_KEY', 'GH_TOKEN',
|
||||
'SSH_AUTH_SOCK', 'GIT_AUTHOR_NAME',
|
||||
]) {
|
||||
expect(env[k]).toBeUndefined();
|
||||
}
|
||||
});
|
||||
|
||||
test('redirects CLAUDE_CONFIG_DIR and GSTACK_HOME to hermetic values', () => {
|
||||
expect(env.CLAUDE_CONFIG_DIR).toBe('/x/.claude');
|
||||
expect(env.GSTACK_HOME).toBe('/x/gstack-home');
|
||||
});
|
||||
|
||||
test('overrides merge last — per-test re-contamination is deliberate', () => {
|
||||
const e = buildHermeticEnv(CONTAMINATED, HERMETIC_VARS, {
|
||||
CONDUCTOR_WORKSPACE_PATH: '/tmp/test-ws',
|
||||
GSTACK_HOME: '/tmp/test-home',
|
||||
GSTACK_HEADLESS: '',
|
||||
});
|
||||
expect(e.CONDUCTOR_WORKSPACE_PATH).toBe('/tmp/test-ws');
|
||||
expect(e.GSTACK_HOME).toBe('/tmp/test-home');
|
||||
expect(e.GSTACK_HEADLESS).toBe('');
|
||||
});
|
||||
|
||||
test('promotes GSTACK_ANTHROPIC_API_KEY when canonical absent (shared shim fn)', () => {
|
||||
const base = { ...CONTAMINATED } as NodeJS.ProcessEnv;
|
||||
delete base.ANTHROPIC_API_KEY;
|
||||
base.GSTACK_ANTHROPIC_API_KEY = 'sk-ant-promoted-9876543210';
|
||||
const e = buildHermeticEnv(base, HERMETIC_VARS);
|
||||
expect(e.ANTHROPIC_API_KEY).toBe('sk-ant-promoted-9876543210');
|
||||
expect(e.GSTACK_ANTHROPIC_API_KEY).toBeUndefined(); // GSTACK_* still dropped
|
||||
});
|
||||
|
||||
test('extraAllow re-admits exact names and prefixes per runner', () => {
|
||||
const e = buildHermeticEnv(CONTAMINATED, HERMETIC_VARS, undefined, {
|
||||
extraAllow: ['OPENAI_API_KEY', 'GIT_*'],
|
||||
});
|
||||
expect(e.OPENAI_API_KEY).toBe('sk-openai-secret');
|
||||
expect(e.GIT_AUTHOR_NAME).toBe('Op');
|
||||
expect(e.GH_TOKEN).toBeUndefined(); // not in extraAllow
|
||||
});
|
||||
|
||||
test('TERM falls back when base omits it', () => {
|
||||
const base = { ...CONTAMINATED } as NodeJS.ProcessEnv;
|
||||
delete base.TERM;
|
||||
expect(buildHermeticEnv(base, HERMETIC_VARS).TERM).toBe('xterm-256color');
|
||||
});
|
||||
});
|
||||
|
||||
describe('EVALS_HERMETIC=0 escape hatch', () => {
|
||||
test('returns byte-identical legacy env, overrides still last', () => {
|
||||
const base = { ...CONTAMINATED, EVALS_HERMETIC: '0' } as NodeJS.ProcessEnv;
|
||||
const e = buildHermeticEnv(base, HERMETIC_VARS, { GSTACK_HEADLESS: '1' });
|
||||
// Legacy spread: every base var survives, hermeticVars NOT applied.
|
||||
expect(e.CONDUCTOR_WORKSPACE_PATH).toBe(CONTAMINATED.CONDUCTOR_WORKSPACE_PATH);
|
||||
expect(e.CLAUDE_CONFIG_DIR).toBe('/Users/op/.claude');
|
||||
expect(e.GSTACK_HOME).toBe('/Users/op/.gstack');
|
||||
expect(e.GSTACK_HEADLESS).toBe('1');
|
||||
expect(e).toEqual({ ...(base as Record<string, string>), GSTACK_HEADLESS: '1' });
|
||||
});
|
||||
|
||||
test('isHermeticEnabled reads at call time (ESM-hoist safety)', () => {
|
||||
const prev = process.env.EVALS_HERMETIC;
|
||||
try {
|
||||
process.env.EVALS_HERMETIC = '0';
|
||||
expect(isHermeticEnabled()).toBe(false);
|
||||
process.env.EVALS_HERMETIC = '1';
|
||||
expect(isHermeticEnabled()).toBe(true);
|
||||
delete process.env.EVALS_HERMETIC;
|
||||
expect(isHermeticEnabled()).toBe(true);
|
||||
} finally {
|
||||
if (prev === undefined) delete process.env.EVALS_HERMETIC;
|
||||
else process.env.EVALS_HERMETIC = prev;
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe('buildSeedConfig', () => {
|
||||
test('stores only the 20-char key suffix and trusts the given dirs', () => {
|
||||
const seed = buildSeedConfig({
|
||||
apiKey: 'sk-ant-0123456789abcdefghijklmn',
|
||||
trustedDirs: ['/repo/root'],
|
||||
}) as any;
|
||||
expect(seed.hasCompletedOnboarding).toBe(true);
|
||||
const approved = seed.customApiKeyResponses.approved;
|
||||
expect(approved).toHaveLength(1);
|
||||
expect(approved[0]).toHaveLength(20);
|
||||
expect('sk-ant-0123456789abcdefghijklmn'.endsWith(approved[0])).toBe(true);
|
||||
expect(seed.projects['/repo/root'].hasTrustDialogAccepted).toBe(true);
|
||||
expect(seed.projects['/repo/root'].hasCompletedProjectOnboarding).toBe(true);
|
||||
});
|
||||
|
||||
test('apiKey undefined → omits customApiKeyResponses, does not throw', () => {
|
||||
const seed = buildSeedConfig({ apiKey: undefined, trustedDirs: [] }) as any;
|
||||
expect(seed.customApiKeyResponses).toBeUndefined();
|
||||
expect(seed.hasCompletedOnboarding).toBe(true);
|
||||
});
|
||||
|
||||
test('no full key material anywhere in the seed', () => {
|
||||
const key = 'sk-ant-0123456789abcdefghijklmn';
|
||||
const json = JSON.stringify(buildSeedConfig({ apiKey: key, trustedDirs: [] }));
|
||||
expect(json.includes(key)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('getHermeticDirs lifecycle', () => {
|
||||
test('configDir ends in /.claude — extractPlanFilePath contract', () => {
|
||||
// claude-pty-runner.ts:191 anchors plan paths on `.claude/plans/` under
|
||||
// /var|/tmp prefixes; the dir-name suffix is what keeps PTY plan-mode
|
||||
// tests extracting hermetic plan files with zero extractor changes.
|
||||
const dirs = getHermeticDirs();
|
||||
expect(dirs.configDir.endsWith(`${path.sep}.claude`)).toBe(true);
|
||||
expect(dirs.configDir.startsWith(os.tmpdir())).toBe(true);
|
||||
});
|
||||
|
||||
test('sync singleton: repeat calls return the same dirs', () => {
|
||||
expect(getHermeticDirs()).toBe(getHermeticDirs());
|
||||
});
|
||||
|
||||
test('seeds .claude.json in the config dir', () => {
|
||||
const dirs = getHermeticDirs();
|
||||
const seed = JSON.parse(fs.readFileSync(path.join(dirs.configDir, '.claude.json'), 'utf-8'));
|
||||
expect(seed.hasCompletedOnboarding).toBe(true);
|
||||
const root = path.resolve(__dirname, '..', '..');
|
||||
expect(seed.projects[root].hasTrustDialogAccepted).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe('gcStaleHermeticDirs', () => {
  // Each test builds its own scratch tmp root and removes it in `finally`, so
  // a failing expectation cannot leak the directory into os.tmpdir().

  test('removes dead-pid dirs, keeps live-pid and foreign dirs', () => {
    const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-gc-test-'));
    try {
      // Find a pid that is definitely dead: spawn-and-reap is overkill; use a
      // huge pid beyond pid_max on macOS/Linux defaults.
      const deadPid = 99999999;
      const dead = path.join(tmp, `gstack-hermetic-${deadPid}-abc`);
      const live = path.join(tmp, `gstack-hermetic-${process.pid}-abc`);
      const foreign = path.join(tmp, 'unrelated-dir');
      const malformed = path.join(tmp, 'gstack-hermetic-notapid-abc');
      for (const d of [dead, live, foreign, malformed]) fs.mkdirSync(d);
      // GC only reclaims dirs older than its 1h age floor (PID-reuse guard);
      // backdate the dead-pid dir's mtime so it qualifies.
      const old = new Date(Date.now() - 2 * 60 * 60 * 1000);
      fs.utimesSync(dead, old, old);

      gcStaleHermeticDirs(tmp);

      expect(fs.existsSync(dead)).toBe(false);
      expect(fs.existsSync(live)).toBe(true);
      expect(fs.existsSync(foreign)).toBe(true);
      expect(fs.existsSync(malformed)).toBe(true); // never guess on malformed names
    } finally {
      fs.rmSync(tmp, { recursive: true, force: true });
    }
  });

  test('keeps a fresh dead-pid dir (PID-reuse grace window)', () => {
    const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-gc-fresh-'));
    try {
      // Dead pid but just created — must survive GC, else PID reuse could delete
      // a dir whose original pid exited and got recycled to a live process.
      const freshDead = path.join(tmp, 'gstack-hermetic-99999999-xyz');
      fs.mkdirSync(freshDead);
      gcStaleHermeticDirs(tmp);
      expect(fs.existsSync(freshDead)).toBe(true);
    } finally {
      fs.rmSync(tmp, { recursive: true, force: true });
    }
  });
});
|
||||
|
||||
describe('hermeticChildEnv composition', () => {
|
||||
test('hermetic by default: redirects config dirs, drops contamination', () => {
|
||||
// process.env in a real test run may carry CONDUCTOR_*/CLAUDECODE — the
|
||||
// composition must scrub them and point at the singleton dirs.
|
||||
const e = hermeticChildEnv({ GSTACK_HEADLESS: '1' });
|
||||
const dirs = getHermeticDirs();
|
||||
expect(e.CLAUDE_CONFIG_DIR).toBe(dirs.configDir);
|
||||
expect(e.GSTACK_HOME).toBe(dirs.gstackHome);
|
||||
expect(e.GSTACK_HEADLESS).toBe('1');
|
||||
expect(e.CLAUDECODE).toBeUndefined();
|
||||
expect(e.CONDUCTOR_WORKSPACE_PATH).toBeUndefined();
|
||||
});
|
||||
|
||||
test('EVALS_HERMETIC=0: legacy passthrough of live process.env', () => {
|
||||
const prev = process.env.EVALS_HERMETIC;
|
||||
try {
|
||||
process.env.EVALS_HERMETIC = '0';
|
||||
const e = hermeticChildEnv({ EXTRA: 'x' });
|
||||
expect(e.PATH).toBe(process.env.PATH as string);
|
||||
expect(e.EXTRA).toBe('x');
|
||||
// No hermetic redirection in legacy mode.
|
||||
expect(e.CLAUDE_CONFIG_DIR).toBe(process.env.CLAUDE_CONFIG_DIR as any);
|
||||
} finally {
|
||||
if (prev === undefined) delete process.env.EVALS_HERMETIC;
|
||||
else process.env.EVALS_HERMETIC = prev;
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
// The singleton's own exit hook handles runRoot; nothing else to clean.
|
||||
});
|
||||
@@ -0,0 +1,276 @@
|
||||
/**
|
||||
* Hermetic child environment for E2E test runners.
|
||||
*
|
||||
* Local E2E runs spawn `claude` (and codex/gemini/SDK) children that, until
|
||||
* this module, inherited the operator's full session context: ~/.claude
|
||||
* (user CLAUDE.md, .claude.json MCP servers incl. gbrain + Conductor,
|
||||
* skills), ~/.gstack decision logs, and CONDUCTOR_-/CLAUDECODE-style env vars.
|
||||
* CI was hermetic only by accident (fresh Docker /home/runner). This module
|
||||
* makes local children see a CI-equivalent clean room by default.
|
||||
*
|
||||
* operator shell (contaminated) hermetic child env
|
||||
* ┌─────────────────────────────┐ buildHermeticEnv()
|
||||
* │ PATH, HOME, TMPDIR, ... │── allowlist ─────────► kept
|
||||
* │ HTTP(S)_PROXY, SSL_CERT_* │── allowlist ─────────► kept (network)
|
||||
* │ ANTHROPIC_API_KEY/BASE_URL/ │── named list ────────► kept (auth)
|
||||
* │ AUTH_TOKEN │
|
||||
* │ GSTACK_ANTHROPIC_API_KEY │── promotedEnv() ─────► ANTHROPIC_API_KEY
|
||||
* │ CONDUCTOR_*, CLAUDECODE, │
|
||||
* │ CLAUDE_*, GSTACK_*, MCP_*, │── dropped ───────────► ∅
|
||||
* │ GBRAIN_*, GH_TOKEN, ... │
|
||||
* └─────────────────────────────┘
|
||||
* + per-runner extraAllow (codex: OpenAI vars; gemini: Google vars)
|
||||
* + CLAUDE_CONFIG_DIR=<runRoot>/.claude GSTACK_HOME=<runRoot>/gstack-home
|
||||
* + per-test overrides spread LAST
|
||||
*
|
||||
* Escape hatch: EVALS_HERMETIC=0 restores the legacy contaminated env
|
||||
* byte-identically (runners must also gate --strict-mcp-config on
|
||||
* isHermeticEnabled() so the escape hatch restores args too).
|
||||
*
|
||||
* isHermeticEnabled() is evaluated at CALL time, never at module load —
|
||||
* ESM hoists imports above any in-file `process.env.EVALS_HERMETIC = '0'`
|
||||
* assignment, so a module-load-time read would silently ignore test pins.
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import { promotedEnv } from '../../lib/conductor-env-shim';
|
||||
import { isProcessAlive } from '../../browse/src/error-handling';
|
||||
|
||||
/**
 * Environment variable names that pass the hermetic scrub by exact match.
 * A name that is neither in this set nor covered by a prefix rule is dropped
 * from the child environment.
 */
const ALLOW_EXACT: Set<string> = new Set(
  [
    // Process basics.
    [
      'PATH', 'HOME', 'TMPDIR', 'TERM', 'COLORTERM', 'LANG', 'LC_ALL', 'SHELL',
      'USER', 'LOGNAME', 'TZ', 'NODE_ENV', 'CI',
    ],
    // Browser/runtime caches the child legitimately shares with the operator.
    ['PLAYWRIGHT_BROWSERS_PATH'],
    // Network reachability: on proxied networks the child cannot reach the
    // Anthropic API without these (both upper- and lower-case spellings).
    [
      'HTTP_PROXY', 'HTTPS_PROXY', 'NO_PROXY',
      'http_proxy', 'https_proxy', 'no_proxy',
      'SSL_CERT_FILE', 'SSL_CERT_DIR', 'NODE_EXTRA_CA_CERTS',
    ],
    // Auth, listed by name on purpose. A broad ANTHROPIC_* prefix rule would
    // also admit model/beta/debug knobs that change eval behavior.
    [
      'ANTHROPIC_API_KEY',    // the auth credential evals require
      'ANTHROPIC_BASE_URL',   // API endpoint override (corp proxies)
      'ANTHROPIC_AUTH_TOKEN', // bearer-token auth variant
    ],
  ].flat(),
);
|
||||
|
||||
/** Prefix rules: eval-harness knobs + CI metadata. Deliberately NOT here:
|
||||
* CONDUCTOR_* / CLAUDE_* (incl. CLAUDECODE, CLAUDE_CODE_ENTRYPOINT) /
|
||||
* GSTACK_* / MCP_* / GBRAIN_* — session-context contamination; and operator
|
||||
* credentials (GH_TOKEN, SSH_AUTH_SOCK, GIT_*, OPENAI_API_KEY,
|
||||
* VOYAGE_API_KEY) — CI doesn't have them and eval children have no business
|
||||
* using them. A test that legitimately needs one opts in via its own env
|
||||
* override; a provider runner (codex/gemini) re-admits its auth vars via
|
||||
* opts.extraAllow. */
|
||||
const ALLOW_PREFIXES = ['EVALS_', 'GITHUB_'];
|
||||
|
||||
export interface HermeticEnvOpts {
|
||||
/** Per-runner additional allowed names (exact match) or prefixes (entries
|
||||
* ending in '*'). Example: codex runner passes ['OPENAI_API_KEY', 'CODEX_*']. */
|
||||
extraAllow?: string[];
|
||||
}
|
||||
|
||||
/**
 * Reports whether hermetic isolation is on for the given environment.
 *
 * Only the literal string '0' in EVALS_HERMETIC switches it off; an unset
 * variable or any other value leaves it enabled. The default argument is
 * resolved on every call, so a test that assigns process.env.EVALS_HERMETIC
 * after this module was imported is still honored.
 *
 * @param env Environment to inspect; defaults to the live process.env.
 * @returns false when EVALS_HERMETIC is exactly '0', true otherwise.
 */
export function isHermeticEnabled(env: NodeJS.ProcessEnv = process.env): boolean {
  const { EVALS_HERMETIC: flag } = env;
  if (flag === '0') return false;
  return true;
}
|
||||
|
||||
/**
 * Builds a child environment from `base` with a pure allowlist filter (no I/O).
 *
 * Hermetic mode (EVALS_HERMETIC in `base` is not '0') layers, in this order:
 *   1. entries of promotedEnv(base) whose name is in ALLOW_EXACT, starts with
 *      an ALLOW_PREFIXES entry, or is re-admitted through opts.extraAllow
 *      (exact name, or a prefix when the entry ends in '*');
 *   2. TERM, defaulted to 'xterm-256color' when missing or empty;
 *   3. hermeticVars;
 *   4. every defined override — last, so per-test env (GSTACK_HOME,
 *      CONDUCTOR_WORKSPACE_PATH, a GSTACK_HEADLESS opt-out) always beats the
 *      scrub. That is the documented re-contamination escape; per the original
 *      note, a wiring tripwire forbids passing raw process.env through it.
 *
 * Escape hatch (EVALS_HERMETIC === '0'): every defined `base` entry is copied,
 * then every defined override. hermeticVars and opts are ignored.
 *
 * @param base Source environment, typically process.env.
 * @param hermeticVars Redirect variables applied after the scrub.
 * @param overrides Per-test values; undefined entries are skipped.
 * @param opts Optional per-runner allowlist extensions.
 * @returns A fresh string-only environment map.
 */
export function buildHermeticEnv(
  base: NodeJS.ProcessEnv,
  hermeticVars: Record<string, string>,
  overrides?: Record<string, string | undefined>,
  opts?: HermeticEnvOpts,
): Record<string, string> {
  const result: Record<string, string> = {};

  // Copies every entry with a defined value into the result, later calls winning.
  const copyDefined = (from: Record<string, string | undefined>): void => {
    Object.entries(from).forEach(([name, value]) => {
      if (value !== undefined) result[name] = value;
    });
  };

  if (!isHermeticEnabled(base)) {
    // Escape hatch: same outcome as the legacy spread of base plus overrides.
    copyDefined(base);
    copyDefined(overrides ?? {});
    return result;
  }

  const promoted = promotedEnv(base);

  // Split extraAllow into exact names and prefixes (a trailing '*' marks a prefix).
  const exactExtras = new Set<string>();
  const prefixExtras: string[] = [];
  (opts?.extraAllow ?? []).forEach((entry) => {
    if (entry.endsWith('*')) {
      prefixExtras.push(entry.slice(0, -1));
    } else {
      exactExtras.add(entry);
    }
  });

  const allowedPrefixes = [...ALLOW_PREFIXES, ...prefixExtras];
  const isAllowed = (name: string): boolean => {
    if (ALLOW_EXACT.has(name) || exactExtras.has(name)) return true;
    return allowedPrefixes.some((prefix) => name.startsWith(prefix));
  };

  for (const [name, value] of Object.entries(promoted)) {
    if (value === undefined || !isAllowed(name)) continue;
    result[name] = value;
  }

  if (!result.TERM) result.TERM = 'xterm-256color';
  Object.assign(result, hermeticVars);
  copyDefined(overrides ?? {});
  return result;
}
|
||||
|
||||
export interface SeedConfigOpts {
  /** API key whose suffix gets pre-approved. When undefined (the operator has
   *  no key exported) customApiKeyResponses is left out and nothing throws —
   *  the child then fails auth exactly as it would without a seed. */
  apiKey: string | undefined;
  /** Directories recorded as already trusted and project-onboarded. */
  trustedDirs: string[];
}

/**
 * Produces the minimal `.claude.json` content for a fresh $CLAUDE_CONFIG_DIR.
 *
 * Background recorded by the original author (verified 2026-06-12 on claude
 * 2.1.175): print mode (`claude -p`) with ANTHROPIC_API_KEY needs no seed at
 * all — an empty config dir ran non-interactively. The seed is for the PTY
 * path, where first-run TUI prompts do show up:
 *   - hasCompletedOnboarding suppresses the onboarding flow;
 *   - customApiKeyResponses.approved suppresses the "use this API key?"
 *     prompt; each entry is the key's LAST 20 CHARACTERS (shape checked
 *     against a real ~/.claude.json);
 *   - projects[dir].hasTrustDialogAccepted pre-trusts repo-cwd PTY sessions
 *     (the pty-runner's 15s trust-watcher stays as the fallback for temp cwds).
 * bypassPermissionsModeAccepted was considered and dropped: a real config does
 * not carry it even with --dangerously-skip-permissions in daily use.
 *
 * @param opts API key (optional) and the directories to pre-trust.
 * @returns A plain JSON-serializable object; never contains the full key when
 *          the key is longer than 20 characters.
 */
export function buildSeedConfig(opts: SeedConfigOpts): Record<string, unknown> {
  const { apiKey, trustedDirs } = opts;

  const projectEntries: Array<[string, Record<string, boolean>]> = [];
  for (const dir of trustedDirs) {
    projectEntries.push([
      dir,
      { hasTrustDialogAccepted: true, hasCompletedProjectOnboarding: true },
    ]);
  }

  const baseSeed: Record<string, unknown> = {
    hasCompletedOnboarding: true,
    projects: Object.fromEntries(projectEntries),
  };

  // No key (undefined or empty): leave the approval list out entirely.
  if (!apiKey) return baseSeed;

  return {
    ...baseSeed,
    customApiKeyResponses: { approved: [apiKey.slice(-20)] },
  };
}
|
||||
|
||||
export interface HermeticDirs {
|
||||
/** Ends in `/.claude` — load-bearing: extractPlanFilePath in
|
||||
* claude-pty-runner.ts:191 anchors plan-file paths on `.claude/plans/`
|
||||
* under a /var|/tmp prefix. Renaming this segment breaks PTY plan tests. */
|
||||
configDir: string;
|
||||
gstackHome: string;
|
||||
runRoot: string;
|
||||
}
|
||||
|
||||
const DIR_PREFIX = 'gstack-hermetic-';
|
||||
|
||||
let cachedDirs: HermeticDirs | null = null;
|
||||
|
||||
/**
 * Absolute path two levels above this module's directory. Because the helpers
 * live in <root>/test/helpers, that is the repository root used for the
 * trusted-dir seed.
 */
function repoRoot(): string {
  const twoLevelsUp = '../..';
  return path.resolve(__dirname, twoLevelsUp);
}
|
||||
|
||||
/**
 * Returns the per-process hermetic directories, creating and seeding them on
 * the first call and handing back the cached object afterwards.
 *
 * Everything between the cache check and the cache assignment is synchronous
 * on purpose: with no await in between, concurrent first callers (e.g. under
 * `bun test --concurrent`) can neither create the dirs twice nor observe a
 * half-seeded config dir. The dirs are shared by every test in the process,
 * mirroring CI's within-job shared home (operator isolation, not per-test
 * isolation).
 *
 * @returns The cached { configDir, gstackHome, runRoot } triple.
 * @throws Whatever the filesystem call that failed threw; the partially
 *         created run root is removed first.
 */
export function getHermeticDirs(): HermeticDirs {
  if (cachedDirs !== null) return cachedDirs;

  gcStaleHermeticDirs();

  // The pid is part of the dir name so later processes' GC can test liveness.
  const runRoot = fs.mkdtempSync(
    path.join(os.tmpdir(), DIR_PREFIX + String(process.pid) + '-'),
  );
  const dirs: HermeticDirs = {
    configDir: path.join(runRoot, '.claude'),
    gstackHome: path.join(runRoot, 'gstack-home'),
    runRoot,
  };

  // Sync, best-effort teardown shared by the failure path and the exit hook.
  const removeRunRoot = (): void => {
    try {
      fs.rmSync(runRoot, { recursive: true, force: true });
    } catch {
      /* best-effort — a later process's pid-aware GC reclaims leftovers */
    }
  };

  // A half-seeded config dir makes children hang on first-run prompts until
  // the test times out, so fail loudly instead. Tear the partial dir down
  // before rethrowing: it carries our own (live) pid, so this process's GC
  // would skip it and it would linger until exit.
  try {
    for (const dir of [dirs.configDir, dirs.gstackHome]) {
      fs.mkdirSync(dir, { recursive: true });
    }
    const apiKey = process.env.ANTHROPIC_API_KEY ?? process.env.GSTACK_ANTHROPIC_API_KEY;
    const seedJson = JSON.stringify(
      buildSeedConfig({ apiKey, trustedDirs: [repoRoot()] }),
      null,
      2,
    );
    fs.writeFileSync(path.join(dirs.configDir, '.claude.json'), seedJson);
  } catch (err) {
    removeRunRoot();
    throw err;
  }

  // Exit handlers cannot await, hence the synchronous removal.
  process.on('exit', removeRunRoot);

  cachedDirs = dirs;
  return cachedDirs;
}
|
||||
|
||||
/** A dir younger than this is never GC'd even if its pid looks dead — guards
|
||||
* against PID reuse deleting a freshly-created dir whose original pid exited
|
||||
* and was recycled to an unrelated live process between create and GC. */
|
||||
const GC_MIN_AGE_MS = 60 * 60 * 1000; // 1h
|
||||
|
||||
/**
 * Reclaims hermetic run roots left behind by crashed runs. Exported for tests.
 *
 * An entry of `tmpDir` is removed only when ALL of these hold:
 *   - its name starts with DIR_PREFIX and the segment up to the next '-' is a
 *     canonical decimal pid (digits only, no leading zero, safe integer);
 *   - that pid is neither this process nor any live process;
 *   - the entry's mtime is at least GC_MIN_AGE_MS old.
 * Pid alone would risk PID-reuse false-deletes of live dirs; age alone would
 * delete a long-running live eval's config out from under it.
 *
 * The pid segment is validated textually before conversion because Number()
 * also accepts spellings such as '0x10', '1e3' or whitespace-padded digits;
 * a malformed name must never be interpreted as some real pid and deleted.
 *
 * Every filesystem failure is swallowed deliberately: GC is best-effort and
 * must never break the run that triggered it.
 *
 * @param tmpDir Directory to scan; defaults to os.tmpdir().
 */
export function gcStaleHermeticDirs(tmpDir: string = os.tmpdir()): void {
  let entries: string[];
  try {
    entries = fs.readdirSync(tmpDir);
  } catch {
    return; // unreadable tmp dir — nothing to reclaim
  }

  const canonicalPid = /^[1-9]\d*$/;
  const now = Date.now();

  for (const name of entries) {
    if (!name.startsWith(DIR_PREFIX)) continue;

    const pidStr = name.slice(DIR_PREFIX.length).split('-')[0] ?? '';
    if (!canonicalPid.test(pidStr)) continue; // malformed — never guess
    const pid = Number(pidStr);
    if (!Number.isSafeInteger(pid)) continue; // absurdly long digit run

    if (pid === process.pid || isProcessAlive(pid)) continue;

    const full = path.join(tmpDir, name);
    try {
      if (now - fs.statSync(full).mtimeMs < GC_MIN_AGE_MS) continue; // too fresh
    } catch {
      continue; // vanished or unreadable — leave it
    }

    try {
      fs.rmSync(full, { recursive: true, force: true });
    } catch {
      /* best-effort */
    }
  }
}
|
||||
|
||||
/**
 * Environment builder the runners call: filters the live process.env, points
 * the child at the singleton hermetic dirs, and applies per-test overrides
 * last.
 *
 * With EVALS_HERMETIC=0 no hermetic dirs are created; buildHermeticEnv is
 * called with empty redirect vars and returns the legacy env plus overrides.
 *
 * @param overrides Per-test values merged last; undefined entries are skipped.
 * @param opts Optional per-runner allowlist extensions.
 * @returns The environment map to hand to the spawned child.
 */
export function hermeticChildEnv(
  overrides?: Record<string, string | undefined>,
  opts?: HermeticEnvOpts,
): Record<string, string> {
  let redirectVars: Record<string, string> = {};
  if (isHermeticEnabled()) {
    // Only touch the filesystem when isolation is actually on.
    const { configDir, gstackHome } = getHermeticDirs();
    redirectVars = { CLAUDE_CONFIG_DIR: configDir, GSTACK_HOME: gstackHome };
  }
  return buildHermeticEnv(process.env, redirectVars, overrides, opts);
}
|
||||
@@ -210,7 +210,11 @@ const MONOLITH_INVARIANTS: ParityInvariant[] = [
|
||||
skill: 'review',
|
||||
mustContain: ['confidence', 'P1', 'P2'],
|
||||
mustHaveHeadings: ['## Preamble', '## When to invoke'],
|
||||
maxSizeRatio: 1.05,
|
||||
// The adversarial step swapped its bare `command -v codex` check for the shared
|
||||
// codexPreflight() block (install + auth tri-state + CODEX_MODE branch prose),
|
||||
// landing ~6.3% over the v1.53.0.0 baseline. Intentional: it adds proper
|
||||
// not-installed vs not-authed handling, not slop.
|
||||
maxSizeRatio: 1.08,
|
||||
minBytes: 70_000,
|
||||
},
|
||||
{
|
||||
|
||||
@@ -10,6 +10,7 @@ import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import { getProjectEvalDir } from './eval-store';
|
||||
import { hermeticChildEnv, isHermeticEnabled } from './hermetic-env';
|
||||
|
||||
const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev');
|
||||
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json'); // heartbeat stays global
|
||||
@@ -167,6 +168,10 @@ export async function runSkillTest(options: {
|
||||
'--max-turns', String(maxTurns),
|
||||
'--allowed-tools', ...allowedTools,
|
||||
];
|
||||
// Hermetic children get zero MCP servers (no --mcp-config is passed).
|
||||
// Gated on the same call-time check as the env scrub so EVALS_HERMETIC=0
|
||||
// restores operator MCP along with the operator env.
|
||||
if (isHermeticEnabled()) args.push('--strict-mcp-config');
|
||||
|
||||
// Write prompt to a temp file OUTSIDE workingDirectory to avoid race conditions
|
||||
// where afterAll cleanup deletes the dir before cat reads the file (especially
|
||||
@@ -176,11 +181,14 @@ export async function runSkillTest(options: {
|
||||
|
||||
const proc = Bun.spawn(['sh', '-c', `cat "${promptFile}" | claude ${args.map(a => `"${a}"`).join(' ')}`], {
|
||||
cwd: workingDirectory,
|
||||
// Hermetic by default (see test/helpers/hermetic-env.ts): operator
|
||||
// session context (CONDUCTOR_*, CLAUDECODE, ~/.claude config, ~/.gstack)
|
||||
// never reaches the child; EVALS_HERMETIC=0 restores the legacy env.
|
||||
// Default GSTACK_HEADLESS=1 so eval/E2E runs classify as headless (BLOCK on an
|
||||
// AskUserQuestion failure rather than emit a prose question no human reads). A
|
||||
// suite exercising the INTERACTIVE prose-fallback path opts out by passing
|
||||
// `env: { GSTACK_HEADLESS: '' }` — extraEnv wins because it spreads last.
|
||||
env: { ...process.env, GSTACK_HEADLESS: '1', ...extraEnv },
|
||||
env: hermeticChildEnv({ GSTACK_HEADLESS: '1', ...extraEnv }),
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
});
|
||||
|
||||
@@ -36,6 +36,11 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'browse-basic': ['browse/src/**', 'browse/test/test-server.ts'],
|
||||
'browse-snapshot': ['browse/src/**', 'browse/test/test-server.ts'],
|
||||
|
||||
// Hermetic isolation canaries (hermetic-env.ts is also a GLOBAL touchfile;
|
||||
// these entries exist so the canaries themselves stay tier-classified)
|
||||
'hermetic-canary': ['test/helpers/hermetic-env.ts', 'test/helpers/session-runner.ts', 'test/skill-e2e-hermetic-canary.test.ts', 'lib/conductor-env-shim.ts'],
|
||||
'hermetic-sentinel': ['test/helpers/hermetic-env.ts', 'test/helpers/session-runner.ts', 'test/skill-e2e-hermetic-canary.test.ts', 'lib/conductor-env-shim.ts'],
|
||||
|
||||
// SKILL.md setup + preamble (depend on ROOT SKILL.md + gen-skill-docs)
|
||||
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
@@ -111,7 +116,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// written a never-ask preference, AUQ should still auto-decide rather than
|
||||
// surfacing the question. Touches the question-tuning + preference
|
||||
// infrastructure plus the resolvers that own the AUTO_DECIDE preamble.
|
||||
'auto-decide-preserved': ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'test/helpers/claude-pty-runner.ts'],
|
||||
'auto-decide-preserved': ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-preamble-bash.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'hosts/claude/hooks/question-preference-hook.ts', 'lib/is-conductor.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
|
||||
// Conductor → prose decision brief (Conductor signal makes prose the default;
|
||||
// the PreToolUse hook denies the flaky tool). Touches the resolver that owns
|
||||
// the Conductor rule, the preamble signal, the hook, and the detection helper.
|
||||
'conductor-prose': ['scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-preamble-bash.ts', 'scripts/resolvers/preamble.ts', 'plan-eng-review/**', 'hosts/claude/hooks/question-preference-hook.ts', 'lib/is-conductor.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-conductor-prose.test.ts'],
|
||||
|
||||
// Real-PTY E2E batch (#6 new tests on the harness).
|
||||
// Each one tests behavior the SDK harness can't observe (rendered TTY,
|
||||
@@ -291,6 +301,11 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'design-shotgun-session': ['design-shotgun/**', 'scripts/resolvers/design.ts'],
|
||||
'design-shotgun-full': ['design-shotgun/**', 'design/src/**', 'browse/src/**'],
|
||||
|
||||
// /diagram (diagram-render bundle consumers). Triplet = deterministic
|
||||
// functional (gate); authoring quality = LLM-judged benchmark (periodic).
|
||||
'diagram-triplet': ['diagram/**', 'lib/diagram-render/**', 'browse/src/write-commands.ts', 'browse/src/read-commands.ts'],
|
||||
'diagram-authoring-quality': ['diagram/**', 'lib/diagram-render/**', 'test/helpers/llm-judge.ts'],
|
||||
|
||||
// gstack-upgrade
|
||||
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
|
||||
|
||||
@@ -435,6 +450,11 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'browse-basic': 'gate',
|
||||
'browse-snapshot': 'gate',
|
||||
|
||||
// Hermetic isolation — gate (deterministic env/config assertions; if the
|
||||
// clean room breaks, every other eval's signal is contaminated)
|
||||
'hermetic-canary': 'gate',
|
||||
'hermetic-sentinel': 'gate',
|
||||
|
||||
// SKILL.md setup — gate (if setup breaks, no skill works)
|
||||
'skillmd-setup-discovery': 'gate',
|
||||
'skillmd-no-local-binary': 'gate',
|
||||
@@ -508,6 +528,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
// v1.21+ auto-mode regression tests
|
||||
'office-hours-auto-mode': 'gate',
|
||||
'auto-decide-preserved': 'periodic',
|
||||
'conductor-prose': 'periodic',
|
||||
'e2e-harness-audit': 'gate',
|
||||
|
||||
// Real-PTY E2E batch — tier classification:
|
||||
@@ -659,6 +680,10 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'design-shotgun-session': 'gate',
|
||||
'design-shotgun-full': 'periodic',
|
||||
|
||||
// /diagram — triplet is deterministic functional, judge is a quality benchmark
|
||||
'diagram-triplet': 'gate',
|
||||
'diagram-authoring-quality': 'periodic',
|
||||
|
||||
// gstack-upgrade
|
||||
'gstack-upgrade-happy-path': 'gate',
|
||||
|
||||
@@ -779,6 +804,7 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
|
||||
*/
|
||||
export const GLOBAL_TOUCHFILES = [
|
||||
'test/helpers/session-runner.ts', // All E2E tests use this runner
|
||||
'test/helpers/hermetic-env.ts', // Changes every E2E child's environment
|
||||
'test/helpers/eval-store.ts', // All E2E tests store results here
|
||||
'test/helpers/touchfiles.ts', // Self-referential — reclassifying wrong is dangerous
|
||||
];
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
/**
|
||||
* Static-grep tripwire for the hermetic E2E wiring. Free tier — no API.
|
||||
*
|
||||
* Every E2E runner spawns its child through hermeticChildEnv(); if a refactor
|
||||
* reverts any spawn site to a raw `...process.env` spread (or a callsite
|
||||
* smuggles the operator env back in through the overrides parameter), local
|
||||
* evals silently re-contaminate and nothing fails until a human notices
|
||||
* weird results again — which took three burned suites last time.
|
||||
*
|
||||
* Pattern mirrors browse/test/terminal-agent-pid-identity.test.ts and
|
||||
* browse/test/server-embedder-terminal-port.test.ts: read source files as
|
||||
* text, assert invariants on their contents. Brittle by design — renaming
|
||||
* the helper must force the author to look here.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import { fileURLToPath } from 'url';
|
||||
|
||||
// Repo root, resolved two levels up from this file's own path.
// fileURLToPath() is used instead of URL#pathname because pathname keeps
// percent-escapes (a space stays "%20"), which would make every read() below
// fail with ENOENT for a checkout whose path contains spaces or non-ASCII
// characters.
const ROOT = path.resolve(fileURLToPath(import.meta.url), '..', '..');
|
||||
|
||||
const RUNNERS = [
|
||||
'test/helpers/session-runner.ts',
|
||||
'test/helpers/claude-pty-runner.ts',
|
||||
'test/helpers/codex-session-runner.ts',
|
||||
'test/helpers/gemini-session-runner.ts',
|
||||
'test/helpers/agent-sdk-runner.ts',
|
||||
];
|
||||
|
||||
/**
 * Load a file as UTF-8 text.
 *
 * @param rel Path relative to ROOT.
 * @returns The file contents; throws whatever fs.readFileSync throws when the
 *          file cannot be read.
 */
function read(rel: string): string {
  const absolute = path.join(ROOT, rel);
  return fs.readFileSync(absolute, 'utf-8');
}
|
||||
|
||||
describe('hermetic wiring tripwire', () => {
|
||||
test('every runner builds its child env via hermeticChildEnv()', () => {
  // Each runner must both import the helper and call it. The per-file message
  // makes a failure name the offending runner instead of reporting a bare
  // "expected false to be true" from somewhere inside the loop.
  for (const rel of RUNNERS) {
    const src = read(rel);
    expect(
      src.includes('hermeticChildEnv('),
      `${rel}: must build its child env via hermeticChildEnv()`,
    ).toBe(true);
    expect(
      src.includes("from './hermetic-env'"),
      `${rel}: must import from './hermetic-env'`,
    ).toBe(true);
  }
});
|
||||
|
||||
test('no runner spawns a child with a raw process.env spread', () => {
  // A literal `...process.env` in a runner is the exact pre-hermetic leak.
  // hermetic-env.ts itself legitimately READS process.env (call-time
  // snapshot); the runners must not SPREAD it into a child env.
  for (const rel of RUNNERS) {
    const offenders: { line: string; n: number }[] = [];
    read(rel)
      .split('\n')
      .forEach((line, idx) => {
        // Record 1-based line numbers so the failure message is directly usable.
        if (line.includes('...process.env')) offenders.push({ line, n: idx + 1 });
      });
    const where = offenders.map((o) => o.n).join(', ');
    expect(
      offenders,
      `${rel} spreads raw process.env into a child env at line(s) ${where} — route through hermeticChildEnv() instead`,
    ).toEqual([]);
  }
});
|
||||
|
||||
test('claude runners gate --strict-mcp-config on isHermeticEnabled()', () => {
|
||||
// Zero MCP servers for hermetic children; EVALS_HERMETIC=0 must restore
|
||||
// operator MCP along with the operator env (the flag may not be
|
||||
// unconditional, or the escape hatch lies).
|
||||
for (const rel of ['test/helpers/session-runner.ts', 'test/helpers/claude-pty-runner.ts']) {
|
||||
const src = read(rel);
|
||||
expect(src.includes('--strict-mcp-config')).toBe(true);
|
||||
const gated =
|
||||
/if\s*\(\s*isHermeticEnabled\(\)\s*\)\s*(args\.push\(\s*)?['"]--strict-mcp-config['"]/.test(src) ||
|
||||
/const hermetic = isHermeticEnabled\(\);[\s\S]{0,200}if\s*\(hermetic\)\s*args\.push\(\s*['"]--strict-mcp-config['"]/.test(src);
|
||||
expect(gated, `${rel}: --strict-mcp-config must be gated on isHermeticEnabled()`).toBe(true);
|
||||
}
|
||||
});
|
||||
|
||||
test('no test callsite passes the whole operator env as a RUNNER override', () => {
|
||||
// Overrides merge last by design (per-test GSTACK_HOME etc.) — passing
|
||||
// process.env itself through that hole defeats the entire scrub. Scoped
|
||||
// to OUR runner calls: unit tests that spawnSync gstack bin scripts with
|
||||
// `...process.env` are test-process spawns, not eval children, and are
|
||||
// legitimately the test's own business.
|
||||
const RUNNER_CALL =
|
||||
/\b(runSkillTest|launchClaudePty|runPlanSkillObservation|runPlanSkillCounting|runPlanSkillFloorCheck|runAgentSdkTest|runCodexSkillTest|runGeminiSkillTest)\s*\(/;
|
||||
const DIRECT_SPAWN = /\b(spawnSync|spawn|execSync|exec|Bun\.spawn|Bun\.spawnSync)\s*\(/;
|
||||
const testDir = path.join(ROOT, 'test');
|
||||
const offenders: string[] = [];
|
||||
const walk = (dir: string) => {
|
||||
for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
|
||||
const full = path.join(dir, entry.name);
|
||||
if (entry.isDirectory()) { walk(full); continue; }
|
||||
if (!entry.name.endsWith('.test.ts')) continue;
|
||||
if (entry.name === 'hermetic-wiring.test.ts') continue;
|
||||
const lines = fs.readFileSync(full, 'utf-8').split('\n');
|
||||
for (let i = 0; i < lines.length; i++) {
|
||||
if (!/env:\s*(\{\s*\.\.\.\s*process\.env|process\.env\b(?!\.))/.test(lines[i])) continue;
|
||||
// Walk backwards to the nearest enclosing call: runner vs direct spawn.
|
||||
for (let j = i; j >= Math.max(0, i - 25); j--) {
|
||||
if (DIRECT_SPAWN.test(lines[j])) break; // test's own spawn — fine
|
||||
if (RUNNER_CALL.test(lines[j])) {
|
||||
offenders.push(`${path.relative(ROOT, full)}:${i + 1}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
walk(testDir);
|
||||
expect(
|
||||
offenders,
|
||||
'These callsites pass the operator env into an eval child, defeating the hermetic scrub: ' +
|
||||
offenders.join(', '),
|
||||
).toEqual([]);
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,50 @@
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { isConductor } from '../lib/is-conductor';
|
||||
|
||||
describe('is-conductor', () => {
|
||||
test('true when CONDUCTOR_WORKSPACE_PATH is set', () => {
|
||||
expect(isConductor({ CONDUCTOR_WORKSPACE_PATH: '/Users/x/conductor/ws' })).toBe(true);
|
||||
});
|
||||
|
||||
test('true when CONDUCTOR_PORT is set', () => {
|
||||
expect(isConductor({ CONDUCTOR_PORT: '55070' })).toBe(true);
|
||||
});
|
||||
|
||||
test('true when both are set', () => {
|
||||
expect(isConductor({ CONDUCTOR_WORKSPACE_PATH: '/ws', CONDUCTOR_PORT: '55070' })).toBe(true);
|
||||
});
|
||||
|
||||
test('false when neither is set', () => {
|
||||
expect(isConductor({ HOME: '/Users/x', PATH: '/usr/bin' })).toBe(false);
|
||||
});
|
||||
|
||||
test('false on an empty env', () => {
|
||||
expect(isConductor({})).toBe(false);
|
||||
});
|
||||
|
||||
test('false when the vars are present but empty (Codex #1 hardening — empty != set)', () => {
|
||||
expect(isConductor({ CONDUCTOR_WORKSPACE_PATH: '', CONDUCTOR_PORT: '' })).toBe(false);
|
||||
});
|
||||
|
||||
test('reads the passed env at call time, not a module-load snapshot', () => {
|
||||
const env: NodeJS.ProcessEnv = {};
|
||||
expect(isConductor(env)).toBe(false);
|
||||
// mutate AFTER the first call — a call-time read must see the new value
|
||||
env.CONDUCTOR_PORT = '55070';
|
||||
expect(isConductor(env)).toBe(true);
|
||||
});
|
||||
|
||||
test('defaults to process.env when no arg is passed', () => {
  // Snapshot BOTH Conductor markers and clear them up front. Otherwise, when
  // the suite itself runs inside a Conductor session, an ambient
  // CONDUCTOR_WORKSPACE_PATH makes the "true" assertion pass without proving
  // anything and forces the "false" assertion to be skipped.
  const savedPort = process.env.CONDUCTOR_PORT;
  const savedWorkspace = process.env.CONDUCTOR_WORKSPACE_PATH;
  try {
    delete process.env.CONDUCTOR_PORT;
    delete process.env.CONDUCTOR_WORKSPACE_PATH;
    expect(isConductor()).toBe(false);

    process.env.CONDUCTOR_PORT = '12345';
    expect(isConductor()).toBe(true);

    delete process.env.CONDUCTOR_PORT;
    expect(isConductor()).toBe(false);
  } finally {
    // Restore the ambient values exactly (unset stays unset).
    if (savedPort === undefined) delete process.env.CONDUCTOR_PORT;
    else process.env.CONDUCTOR_PORT = savedPort;
    if (savedWorkspace === undefined) delete process.env.CONDUCTOR_WORKSPACE_PATH;
    else process.env.CONDUCTOR_WORKSPACE_PATH = savedWorkspace;
  }
});
|
||||
});
|
||||
@@ -43,6 +43,11 @@ function runHook(stdin: object): { stdout: string; stderr: string; status: numbe
|
||||
env.GSTACK_STATE_ROOT = stateRoot;
|
||||
env.GSTACK_QUESTION_LOG_NO_DERIVE = '1';
|
||||
delete env.GSTACK_HOME;
|
||||
// These cases assert the defer-path memoryContext injection. Strip ambient
|
||||
// Conductor markers so running inside Conductor (CONDUCTOR_WORKSPACE_PATH/PORT
|
||||
// set) doesn't flip the hook into the [conductor] prose deny instead of defer.
|
||||
delete env.CONDUCTOR_WORKSPACE_PATH;
|
||||
delete env.CONDUCTOR_PORT;
|
||||
const res = spawnSync(HOOK, [], {
|
||||
env,
|
||||
input: JSON.stringify({ ...stdin, cwd: fixtureCwd }),
|
||||
|
||||
@@ -70,3 +70,13 @@ describe('Preamble composition order', () => {
|
||||
expect(out).not.toContain('## AskUserQuestion Format');
|
||||
});
|
||||
});
|
||||
|
||||
describe('Conductor signal (preamble bash)', () => {
|
||||
test('claude preamble emits CONDUCTOR_SESSION, gated on != headless (Issue 8)', () => {
|
||||
const out = generatePreamble(makeCtx('claude', 2, 'claude'));
|
||||
expect(out).toContain('echo "CONDUCTOR_SESSION: true"');
|
||||
// The emission must be suppressed when the session is headless (eval/CI
|
||||
// inside Conductor must BLOCK, not render prose to nobody).
|
||||
expect(out).toMatch(/"\$_SESSION_KIND" != "headless"[\s\S]*CONDUCTOR_WORKSPACE_PATH[\s\S]*CONDUCTOR_PORT[\s\S]*CONDUCTOR_SESSION: true/);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -60,7 +60,7 @@ function writeGlobalPref(questionId: string, preference: string): void {
|
||||
fs.writeFileSync(f, JSON.stringify(prefs, null, 2));
|
||||
}
|
||||
|
||||
function runHook(stdin: object, cwd?: string): {
|
||||
function runHook(stdin: object, cwd?: string, extraEnv?: Record<string, string>): {
|
||||
stdout: string;
|
||||
stderr: string;
|
||||
status: number;
|
||||
@@ -72,7 +72,15 @@ function runHook(stdin: object, cwd?: string): {
|
||||
}
|
||||
env.GSTACK_STATE_ROOT = stateRoot;
|
||||
delete env.GSTACK_HOME;
|
||||
// Strip ambient Conductor markers so these cases characterize NON-Conductor
|
||||
// behavior deterministically — otherwise running the suite inside Conductor
|
||||
// (CONDUCTOR_WORKSPACE_PATH/PORT set) would flip every defer into the
|
||||
// [conductor] prose deny. The Conductor cases below opt back in explicitly
|
||||
// via extraEnv.
|
||||
delete env.CONDUCTOR_WORKSPACE_PATH;
|
||||
delete env.CONDUCTOR_PORT;
|
||||
env.GSTACK_QUESTION_LOG_NO_DERIVE = '1';
|
||||
if (extraEnv) Object.assign(env, extraEnv);
|
||||
const res = spawnSync(HOOK, [], {
|
||||
env,
|
||||
input: JSON.stringify({ ...stdin, cwd: cwd || fixtureCwd }),
|
||||
@@ -337,6 +345,108 @@ describe('MCP variant', () => {
|
||||
});
|
||||
});
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Conductor: deny + prose redirect (transport avoidance, not preference)
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
describe('Conductor prose redirect', () => {
|
||||
const CONDUCTOR = { CONDUCTOR_PORT: '55070' };
|
||||
|
||||
test('two-way, no preference → deny with [conductor] prose directive', () => {
|
||||
const r = runHook({
|
||||
session_id: 'c1',
|
||||
tool_name: 'AskUserQuestion',
|
||||
tool_use_id: 'tu-c1',
|
||||
tool_input: {
|
||||
questions: [
|
||||
{ question: '<gstack-qid:test-q> Need approval?', options: ['A) Yes (recommended)', 'B) No'] },
|
||||
],
|
||||
},
|
||||
}, undefined, CONDUCTOR);
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecision).toBe('deny');
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toContain('[conductor]');
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toMatch(/do not call askuserquestion/i);
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toMatch(/reply with a letter/i);
|
||||
});
|
||||
|
||||
test('UNMARKED question (modal path) → deny with prose directive', () => {
|
||||
const r = runHook({
|
||||
session_id: 'c2',
|
||||
tool_name: 'AskUserQuestion',
|
||||
tool_use_id: 'tu-c2',
|
||||
tool_input: {
|
||||
questions: [
|
||||
{ question: 'No marker — an ad-hoc question', options: ['A) Yes (recommended)', 'B) No'] },
|
||||
],
|
||||
},
|
||||
}, undefined, CONDUCTOR);
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecision).toBe('deny');
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toContain('[conductor]');
|
||||
});
|
||||
|
||||
test('one-way door → deny with prose directive (NOT defer — destructive must reach human via prose)', () => {
|
||||
const r = runHook({
|
||||
session_id: 'c3',
|
||||
tool_name: 'AskUserQuestion',
|
||||
tool_use_id: 'tu-c3',
|
||||
tool_input: {
|
||||
questions: [
|
||||
{
|
||||
question: '<gstack-qid:ship-test-failure-triage> Tests failed.',
|
||||
options: ['A) Fix now (recommended)', 'B) Investigate', 'C) Ack and ship'],
|
||||
},
|
||||
],
|
||||
},
|
||||
}, undefined, CONDUCTOR);
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecision).toBe('deny');
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toContain('[conductor]');
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toMatch(/typed confirmation/i);
|
||||
});
|
||||
|
||||
test('CONDUCTOR_WORKSPACE_PATH alone also triggers the redirect', () => {
|
||||
const r = runHook({
|
||||
session_id: 'c4',
|
||||
tool_name: 'mcp__conductor__AskUserQuestion',
|
||||
tool_use_id: 'tu-c4',
|
||||
tool_input: {
|
||||
questions: [{ question: '<gstack-qid:test-q> Pick?', options: ['A) X (recommended)', 'B) Y'] }],
|
||||
},
|
||||
}, undefined, { CONDUCTOR_WORKSPACE_PATH: '/Users/x/conductor/ws' });
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecision).toBe('deny');
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toContain('[conductor]');
|
||||
});
|
||||
|
||||
test('PRECEDENCE: full never-ask auto-decide still wins over Conductor prose', () => {
|
||||
writeProjectPref('ship-pre-landing-review-fix', 'never-ask');
|
||||
const r = runHook({
|
||||
session_id: 'c5',
|
||||
tool_name: 'AskUserQuestion',
|
||||
tool_use_id: 'tu-c5',
|
||||
tool_input: {
|
||||
questions: [
|
||||
{
|
||||
question: '<gstack-qid:ship-pre-landing-review-fix> Pre-landing review flagged issue.',
|
||||
options: ['A) Fix now (recommended)', 'B) Skip'],
|
||||
},
|
||||
],
|
||||
},
|
||||
}, undefined, CONDUCTOR);
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecision).toBe('deny');
|
||||
// auto-decide reason, NOT the conductor prose reason
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toContain('plan-tune auto-decide');
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).not.toContain('[conductor]');
|
||||
});
|
||||
|
||||
test('non-AUQ tool in Conductor → still defer (no redirect on unrelated tools)', () => {
|
||||
const r = runHook(
|
||||
{ session_id: 'c6', tool_name: 'Bash', tool_use_id: 'tu-c6', tool_input: {} },
|
||||
undefined,
|
||||
CONDUCTOR,
|
||||
);
|
||||
expect(r.parsed?.hookSpecificOutput?.permissionDecision).toBe('defer');
|
||||
});
|
||||
});
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Auto-decided event logging (since PostToolUse never fires on deny)
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
@@ -225,8 +225,25 @@ describe('generateAskUserFormat — runtime-failure prose fallback', () => {
|
||||
expect(out).toMatch(/must be sent as tool_use, not prose — unless the documented failure fallback/);
|
||||
});
|
||||
|
||||
test('OV2: the self-check "not writing prose" line carries the fallback qualifier', () => {
|
||||
expect(out).toMatch(/not writing prose — unless the documented failure fallback applies/);
|
||||
test('OV2: the self-check "not writing prose" line carries the Conductor + fallback qualifiers', () => {
|
||||
// After the Conductor-default-prose change, the exception is two-pronged:
|
||||
// CONDUCTOR_SESSION makes prose the default, OR the documented failure fallback.
|
||||
expect(out).toMatch(/not writing prose — unless `CONDUCTOR_SESSION: true`[\s\S]*OR the documented failure fallback applies/);
|
||||
});
|
||||
|
||||
// Conductor-default-prose contract (the proactive path, distinct from the
|
||||
// failure fallback). Guards the Tool-resolution rule + self-check wording.
|
||||
test('Conductor: do-not-call rule present in Tool resolution', () => {
|
||||
expect(out).toMatch(/CONDUCTOR_SESSION: true/);
|
||||
expect(out).toMatch(/do NOT call AskUserQuestion at all/);
|
||||
expect(out).toMatch(/Auto-decide preferences still apply first/);
|
||||
expect(out).toMatch(/gstack-question-log/);
|
||||
});
|
||||
|
||||
test('Conductor: one-way prose rule + continuation protocol present', () => {
|
||||
expect(out).toMatch(/one-way\b[\s\S]*typed confirmation/i);
|
||||
expect(out).toMatch(/never proceed on a vague/i);
|
||||
expect(out).toMatch(/Continuation — mapping a typed reply/);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -131,6 +131,11 @@ export const SKILL_COVERAGE: Record<string, SkillCoverage> = {
|
||||
'design-consultation': { gate: ['test/skill-coverage-floor.test.ts'], periodic: [] },
|
||||
'design-shotgun': { gate: ['test/skill-coverage-floor.test.ts'], periodic: [] },
|
||||
'design-html': { gate: ['test/skill-coverage-floor.test.ts'], periodic: [] },
|
||||
diagram: {
|
||||
gate: ['test/skill-e2e-diagram.test.ts', 'test/skill-coverage-floor.test.ts'],
|
||||
periodic: ['test/skill-e2e-diagram.test.ts'],
|
||||
rationale: 'Triplet contract is gate-tier deterministic; authoring-quality judge is periodic (E2E_TIERS: diagram-triplet/diagram-authoring-quality).',
|
||||
},
|
||||
cso: {
|
||||
gate: ['test/skill-e2e-cso.test.ts', 'test/cso-preserved.test.ts', 'test/skill-coverage-floor.test.ts'],
|
||||
periodic: [],
|
||||
|
||||
@@ -100,11 +100,19 @@ describeE2E('AUTO_DECIDE opt-in preserved under Conductor flags (periodic)', ()
|
||||
}
|
||||
|
||||
// 4. Run /plan-ceo-review with the Conductor flag set + isolated state.
|
||||
// GSTACK_HOME=tmpHome is REQUIRED: the preference + question_tuning were
|
||||
// seeded there. Without it the spawned claude reads the real ~/.gstack,
|
||||
// never sees the never-ask preference, and the test silently exercises
|
||||
// the wrong state root (pre-existing bug, Codex #9 / Issue 13).
|
||||
// CONDUCTOR_WORKSPACE_PATH additionally proves auto-decide still WINS
|
||||
// over the Conductor prose redirect (precedence: settled preference
|
||||
// beats transport-avoidance).
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'plan-ceo-review',
|
||||
inPlanMode: true,
|
||||
extraArgs: ['--disallowedTools', 'AskUserQuestion'],
|
||||
timeoutMs: 300_000,
|
||||
env: { GSTACK_HOME: tmpHome, CONDUCTOR_WORKSPACE_PATH: tmpHome },
|
||||
});
|
||||
|
||||
// 5. Pass: 'auto_decided' (the strongest signal) or 'plan_ready' with
|
||||
|
||||
@@ -192,13 +192,21 @@ Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial']);
|
||||
|
||||
// Copy bin scripts
|
||||
// Copy bin scripts + the lib module they import. gstack-learnings-log
|
||||
// does `import ... from '$SCRIPT_DIR/../lib/jsonl-store.ts'` (v1.57.5.0
|
||||
// injection sanitization) — without lib/ alongside bin/, the script exits
|
||||
// 1 before writing anything, failing this test for a fixture reason, not
|
||||
// a model-behavior reason (root-caused during the v1.58.0.0 ship; fails
|
||||
// identically on main).
|
||||
const binDir = path.join(opDir, 'bin');
|
||||
fs.mkdirSync(binDir, { recursive: true });
|
||||
for (const script of ['gstack-learnings-log', 'gstack-slug']) {
|
||||
fs.copyFileSync(path.join(ROOT, 'bin', script), path.join(binDir, script));
|
||||
fs.chmodSync(path.join(binDir, script), 0o755);
|
||||
}
|
||||
const libDir = path.join(opDir, 'lib');
|
||||
fs.mkdirSync(libDir, { recursive: true });
|
||||
fs.copyFileSync(path.join(ROOT, 'lib', 'jsonl-store.ts'), path.join(libDir, 'jsonl-store.ts'));
|
||||
|
||||
// gstack-learnings-log will create the project dir automatically via gstack-slug
|
||||
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
/**
|
||||
* Conductor → prose decision brief (periodic-tier, paid, real-PTY).
|
||||
*
|
||||
* Proves the end-to-end behavior: when CONDUCTOR_SESSION is signalled, a skill
|
||||
* that hits a decision renders a PROSE decision brief and waits, instead of
|
||||
* silently skipping the user.
|
||||
*
|
||||
* SCOPE — read before trusting this as the Conductor guard. This is END-TO-END
|
||||
* BEHAVIOR coverage, NOT the discriminating Conductor guarantee:
|
||||
* - The deterministic guard is test/question-preference-hook.test.ts
|
||||
* ("Conductor prose redirect") — it sets process.env.CONDUCTOR_* and asserts
|
||||
* the PreToolUse hook denies + redirects. That test CAN fail on unfixed code.
|
||||
* - The PTY harness here cannot register `mcp__conductor__AskUserQuestion`, so
|
||||
* it tests "native AUQ unavailable + Conductor signal → prose," NOT "the MCP
|
||||
* variant exists and must not be called" (Codex #10). Under --disallowedTools
|
||||
* a present-human interactive session already prose-falls-back, so this test
|
||||
* is a smoke check that the Conductor path still produces a prose brief, not
|
||||
* a proof that the Conductor signal (vs the generic fallback) drove it.
|
||||
*
|
||||
* Periodic tier: model-behavior, non-deterministic.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
const FLAWED_PLAN = `# Plan: add a "developer-friendly" pricing tier
|
||||
|
||||
## Goal
|
||||
Increase developer adoption.
|
||||
|
||||
## Premise
|
||||
No tests mentioned, no rollout plan, no auth check on the upgrade endpoint.
|
||||
Adds a Stripe tier, a React pricing page, a Postgres entitlements table, and a
|
||||
Redis cache. The team "feels like" it should be cheaper; no developer was asked.
|
||||
`;
|
||||
|
||||
describeE2E('Conductor renders decisions as prose (periodic)', () => {
|
||||
test('plan-eng-review in a Conductor session surfaces a PROSE decision brief, not a silent skip', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'plan-eng-review',
|
||||
inPlanMode: true,
|
||||
// Mimic Conductor: native AUQ disabled + the Conductor env signal present.
|
||||
extraArgs: ['--disallowedTools', 'AskUserQuestion'],
|
||||
env: { CONDUCTOR_WORKSPACE_PATH: '/tmp/conductor-prose-e2e' },
|
||||
initialPlanContent: FLAWED_PLAN,
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
// The decision must reach the human as prose. 'silent_write' (wrote findings
|
||||
// to the plan without asking) is the precise failure we guard against.
|
||||
if (obs.outcome === 'silent_write') {
|
||||
throw new Error(
|
||||
`Conductor prose regression: skill wrote findings without surfacing a decision.\n` +
|
||||
`summary: ${obs.summary}\n--- evidence ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'exited' || obs.outcome === 'timeout') {
|
||||
throw new Error(
|
||||
`Conductor prose test inconclusive: outcome=${obs.outcome}\n` +
|
||||
`summary: ${obs.summary}\n--- evidence ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
// A prose-rendered decision brief was observed at some point in the run.
|
||||
expect(obs.proseAUQEverObserved).toBe(true);
|
||||
}, 360_000);
|
||||
});
|
||||
@@ -0,0 +1,153 @@
|
||||
/**
|
||||
* /diagram skill E2E (paid, claude -p).
|
||||
*
|
||||
* Two tests with deliberately different tiers (eng-review D5):
|
||||
*
|
||||
* diagram-triplet (gate) — deterministic functional contract: from an
|
||||
* English ask, the agent following the skill emits a parseable triplet —
|
||||
* .mmd source, .excalidraw scene with elements, SVG markup, PNG bytes.
|
||||
* No quality judgment; either the artifacts exist and parse or they don't.
|
||||
*
|
||||
* diagram-authoring-quality (periodic) — LLM-judged benchmark of the
|
||||
* authored mermaid itself (faithfulness to the ask, label quality,
|
||||
* readable size). Non-deterministic by nature → never blocks merge.
|
||||
*
|
||||
* Per the extract-don't-copy fixture rule, the prompt embeds only the skill's
|
||||
* working section (from "# /diagram" onward), not the full generated SKILL.md
|
||||
* with its preamble.
|
||||
*/
|
||||
import { describe, expect } from 'bun:test';
|
||||
import * as fs from 'node:fs';
|
||||
import * as path from 'node:path';
|
||||
import * as os from 'node:os';
|
||||
|
||||
import { runSkillTest } from './helpers/session-runner';
|
||||
import {
|
||||
ROOT, browseBin, runId,
|
||||
describeIfSelected, testConcurrentIfSelected,
|
||||
logCost,
|
||||
} from './helpers/e2e-helpers';
|
||||
import { callJudge } from './helpers/llm-judge';
|
||||
|
||||
const BUNDLE = path.join(ROOT, 'lib', 'diagram-render', 'dist', 'diagram-render.html');
|
||||
|
||||
/**
 * Return the working section of the generated diagram/SKILL.md: everything
 * from the first occurrence of "# /diagram" to the end of the file.
 *
 * @throws Error when the marker is not present in the generated doc.
 */
function skillExtract(): string {
  const skillPath = path.join(ROOT, 'diagram', 'SKILL.md');
  const doc = fs.readFileSync(skillPath, 'utf-8');
  const marker = doc.indexOf('# /diagram');
  if (marker === -1) {
    throw new Error('diagram/SKILL.md missing "# /diagram" section — regenerate skill docs');
  }
  return doc.slice(marker);
}
|
||||
|
||||
/**
 * Create a throwaway working directory for one /diagram E2E run and stage the
 * fixtures into it: the extracted skill text (diagram-skill.md), the render
 * bundle (diagram-render.html), and an empty out/ directory.
 *
 * @param prefix Prefix for the mkdtemp directory name under the OS temp dir.
 * @returns Absolute path of the staged directory; the caller removes it.
 * @throws Rethrows any staging error after removing the partial directory.
 */
function setupDir(prefix: string): string {
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
  try {
    fs.writeFileSync(path.join(dir, 'diagram-skill.md'), skillExtract());
    // Pre-stage the bundle so the test is hermetic (no global install needed in
    // CI); the prompt tells the agent discovery is already done.
    fs.copyFileSync(BUNDLE, path.join(dir, 'diagram-render.html'));
    fs.mkdirSync(path.join(dir, 'out'));
  } catch (err: unknown) {
    // Callers only enter their try/finally cleanup after this function
    // returns, so a staging failure would otherwise strand the temp dir.
    try { fs.rmSync(dir, { recursive: true, force: true }); } catch { /* best-effort */ }
    throw err;
  }
  return dir;
}
|
||||
|
||||
/**
 * Build the agent prompt for a /diagram E2E run: point the agent at the staged
 * skill text, tell it where the browse binary and render bundle are, fix the
 * output slug to "flow", and append the diagram request.
 *
 * @param dir Staged working directory (not interpolated into the prompt; the
 *            prompt uses paths relative to the working directory).
 * @param ask Plain-English description of the diagram to create.
 * @returns The complete prompt text.
 */
function basePrompt(dir: string, ask: string): string {
  const promptLines = [
    `You have the /diagram skill instructions at ./diagram-skill.md — read them and follow Steps 1-4.`,
    ``,
    `Environment notes (already set up — skip Step 2's bundle discovery):`,
    `- The browse binary is at ${browseBin} — use it wherever the skill says $B.`,
    `- The render bundle is ALREADY staged at ./diagram-render.html in this directory; load it with: ${browseBin} load-html ./diagram-render.html`,
    `- Write all four artifacts into ./out/ with the slug "flow" (out/flow.mmd, out/flow.excalidraw, out/flow.svg, out/flow.png).`,
    `- Do not open any other applications. Do not use the Read tool on the PNG (no inline display needed here).`,
    ``,
    `The diagram to create: ${ask}`,
  ];
  return promptLines.join('\n');
}
|
||||
|
||||
describeIfSelected('/diagram skill E2E', ['diagram-triplet', 'diagram-authoring-quality'], () => {
  // Every artifact the skill is asked to produce lives under <sandbox>/out/.
  const artifact = (sandbox: string, file: string) => path.join(sandbox, 'out', file);

  // Best-effort removal of the per-test sandbox; cleanup errors are ignored
  // so they never mask the test's own outcome.
  const discard = (sandbox: string) => {
    try { fs.rmSync(sandbox, { recursive: true, force: true }); } catch { /* ignore */ }
  };

  // Case 1: the deterministic contract — the four requested artifacts exist
  // on disk and each one parses / carries the expected signature.
  testConcurrentIfSelected('diagram-triplet', async () => {
    const sandbox = setupDir('diagram-triplet-');
    try {
      const prompt = basePrompt(
        sandbox,
        'a flowchart (graph LR) of a 4-stage pipeline: markdown → prepass → Chromium → PDF.',
      );
      const run = await runSkillTest({
        prompt,
        workingDirectory: sandbox,
        maxTurns: 25,
        allowedTools: ['Bash', 'Read', 'Write'],
        timeout: 240_000,
        testName: 'diagram-triplet',
        runId,
      });
      logCost('diagram triplet', run);
      expect(run.exitReason).toBe('success');

      // Mermaid source declares a graph in one of the accepted directions.
      const mermaidSource = fs.readFileSync(artifact(sandbox, 'flow.mmd'), 'utf-8');
      expect(mermaidSource).toMatch(/graph\s+(LR|TD)/);

      // Excalidraw scene is valid JSON with more than three elements.
      const sceneJson = fs.readFileSync(artifact(sandbox, 'flow.excalidraw'), 'utf-8');
      const scene = JSON.parse(sceneJson);
      expect(scene.type).toBe('excalidraw');
      expect(Array.isArray(scene.elements)).toBe(true);
      expect(scene.elements.length).toBeGreaterThan(3);

      const svgMarkup = fs.readFileSync(artifact(sandbox, 'flow.svg'), 'utf-8');
      expect(svgMarkup).toMatch(/<svg/i);

      // PNG: check the 4-byte file signature and a minimum byte size.
      const pngBytes = fs.readFileSync(artifact(sandbox, 'flow.png'));
      const pngSignature = Buffer.from([0x89, 0x50, 0x4e, 0x47]);
      expect(pngBytes.subarray(0, 4)).toEqual(pngSignature);
      expect(pngBytes.length).toBeGreaterThan(5_000);
    } finally {
      discard(sandbox);
    }
  }, 300_000);

  // Case 2: authoring quality — the generated mermaid is scored by callJudge
  // and the run fails when the score is below 6.
  testConcurrentIfSelected('diagram-authoring-quality', async () => {
    const sandbox = setupDir('diagram-quality-');
    try {
      const prompt = basePrompt(
        sandbox,
        'how gstack renders diagrams in PDFs: markdown containing mermaid fences goes through a pre-pass that extracts the fences, renders them in a browse daemon tab using an offline bundle, substitutes the SVG back in, inlines local images, and prints via Chromium. Failures become visible diagnostic blocks.',
      );
      const run = await runSkillTest({
        prompt,
        workingDirectory: sandbox,
        maxTurns: 25,
        allowedTools: ['Bash', 'Read', 'Write'],
        timeout: 240_000,
        testName: 'diagram-authoring-quality',
        runId,
      });
      logCost('diagram authoring quality', run);
      expect(run.exitReason).toBe('success');

      const mmd = fs.readFileSync(artifact(sandbox, 'flow.mmd'), 'utf-8');
      const svgMarkup = fs.readFileSync(artifact(sandbox, 'flow.svg'), 'utf-8');
      expect(svgMarkup).toMatch(/<svg/i);

      // The rubric caps a diagram without the failure path at 5, which is
      // below the pass threshold asserted at the end of this test.
      const judgePrompt = `You are judging the quality of an agent-authored mermaid diagram.

THE ASK: a diagram of gstack's PDF diagram-rendering flow — mermaid fences are
extracted by a pre-pass, rendered in a browse tab via an offline bundle,
substituted back as SVG, images inlined, printed by Chromium, with render
failures becoming visible diagnostic blocks.

THE AUTHORED MERMAID:
\`\`\`mermaid
${mmd}
\`\`\`

Score 1-10 on: faithfulness to the ask (are the named stages present and
correctly ordered?), label quality (short node labels, detail on edges),
and readable size (5-15 nodes, not a wall). A diagram that misses the
failure/diagnostic path entirely caps at 5 — that path is an explicitly
named requirement, so omitting it must fail the run.

Respond with JSON: {"score": N, "reasoning": "..."}`;
      const verdict = await callJudge<{ score: number; reasoning: string }>(judgePrompt);
      // eslint-disable-next-line no-console
      console.log(`[diagram-quality] score=${verdict.score} — ${verdict.reasoning}`);
      expect(verdict.score).toBeGreaterThanOrEqual(6);
    } finally {
      discard(sandbox);
    }
  }, 300_000);
});
|
||||
@@ -0,0 +1,190 @@
|
||||
/**
|
||||
* Hermetic-isolation canaries (gate tier, ~$0.02 each, deterministic).
|
||||
*
|
||||
* Two tests that make the hermeticity claim FALSIFIABLE instead of asserted:
|
||||
*
|
||||
* 1. `hermetic-canary` — env + auth isolation. Plants contamination vars in
|
||||
* the TEST process env, spawns a child through the real runner, and
|
||||
* asserts from the Bash tool_result in the stream-json transcript (never
|
||||
* the model's prose — prose can hallucinate) that the child saw a temp
|
||||
* `/.claude` config dir, a temp GSTACK_HOME, and none of the planted
|
||||
* contamination. Auth hermeticity: hard-fails when ANTHROPIC_API_KEY is
|
||||
* absent (a skip here would be a silent hole), and asserts
|
||||
* total_cost_usd > 0 — subscription/keychain OAuth reports cost 0, so
|
||||
* nonzero cost is the discriminator that the API key actually paid
|
||||
* (verified empirically 2026-06-12; the result record exposes no
|
||||
* auth-source field, so cost is the best available signal — residual
|
||||
* gap documented in the plan).
|
||||
*
|
||||
* 2. `hermetic-sentinel` — config isolation, the poisoned-operator probe.
|
||||
* Builds a FAKE operator config tree (user CLAUDE.md + an mcpServers
|
||||
* entry) and points the test process's CLAUDE_CONFIG_DIR at it. If the
|
||||
* hermetic redirect ever breaks, the child loads that poisoned tree and
|
||||
* the probes fire: init.mcp_servers would list the planted server
|
||||
* (semantic proof that --strict-mcp-config + the redirect yield ZERO MCP
|
||||
* servers, not an assumption), and the child's config dir would contain
|
||||
* the poisoned CLAUDE.md.
|
||||
*
|
||||
* Both canaries double as the seed-schema / CLI version-skew tripwire: a
|
||||
* claude release that changes first-run behavior or config discovery fails
|
||||
* here first, loudly, in the gate tier.
|
||||
*/
|
||||
|
||||
import { expect, afterAll } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import { runSkillTest } from './helpers/session-runner';
|
||||
import {
|
||||
describeIfSelected, testIfSelected, createEvalCollector, finalizeEvalCollector,
|
||||
recordE2E, runId, logCost,
|
||||
} from './helpers/e2e-helpers';
|
||||
|
||||
const evalCollector = createEvalCollector('e2e-hermetic');
|
||||
|
||||
// Cheap + deterministic: the canaries assert environment facts, not model
|
||||
// quality, so the smallest model is the right tool.
|
||||
const CANARY_MODEL = 'claude-haiku-4-5-20251001';
|
||||
|
||||
/**
 * Extract concatenated tool_result text from the stream-json transcript.
 *
 * Only `user` events are inspected. For each `tool_result` item in the
 * event's `message.content` array, a string `content` is taken verbatim and
 * an array `content` contributes the `text` of each `text` part. Anything
 * with an unexpected shape (null entries, non-array content, parts without a
 * string `text`) is skipped instead of throwing, so one odd event cannot
 * abort the assertions built on this output.
 *
 * @param transcript Parsed stream-json events, in emission order.
 * @returns The collected chunks joined with newlines ('' when none matched).
 */
function toolResultText(transcript: readonly unknown[]): string {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null;

  const chunks: string[] = [];
  for (const event of transcript) {
    if (!isRecord(event) || event.type !== 'user') continue;

    const items = isRecord(event.message) ? event.message.content : undefined;
    // A user message may carry plain-string content; only arrays hold tool results.
    if (!Array.isArray(items)) continue;

    for (const item of items) {
      if (!isRecord(item) || item.type !== 'tool_result') continue;

      const content = item.content;
      if (typeof content === 'string') {
        chunks.push(content);
      } else if (Array.isArray(content)) {
        for (const part of content) {
          if (isRecord(part) && part.type === 'text' && typeof part.text === 'string') {
            chunks.push(part.text);
          }
        }
      }
      // Any other content shape (missing, object, number) carries no text.
    }
  }
  return chunks.join('\n');
}
|
||||
|
||||
/**
 * Find the `system` / `init` event in a stream-json transcript.
 *
 * @param transcript Parsed stream-json events.
 * @returns The first event whose `type` is 'system' and `subtype` is 'init',
 *   or `undefined` when the transcript has none.
 */
function initEvent(transcript: any[]): any {
  // Optional chaining lets a null/undefined entry be skipped rather than
  // throwing before the init event is reached.
  return transcript.find((e) => e?.type === 'system' && e?.subtype === 'init');
}
|
||||
|
||||
describeIfSelected('hermetic isolation canaries', ['hermetic-canary', 'hermetic-sentinel'], () => {
  // Auth hermeticity is part of the contract: a missing key must FAIL the
  // gate, not skip it — a skipped canary is a silent hole.
  const requireApiKey = (testName: string): void => {
    if (process.env.ANTHROPIC_API_KEY) return;
    throw new Error(`${testName} requires ANTHROPIC_API_KEY (source ~/.zshrc); refusing to skip`);
  };

  testIfSelected('hermetic-canary', async () => {
    requireApiKey('hermetic-canary');

    const sandbox = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-canary-'));
    // The operator env may or may not carry these variables, so they are
    // planted here deterministically and the previous values restored after.
    const contamination: Record<string, string> = {
      CONDUCTOR_WORKSPACE_PATH: '/tmp/poison-conductor-ws',
      GBRAIN_POISON_PROBE: 'leaked',
    };
    const saved: Record<string, string | undefined> = {};
    for (const name of Object.keys(contamination)) {
      saved[name] = process.env[name];
      process.env[name] = contamination[name];
    }

    try {
      const probe = [
        'Run exactly this bash command and then stop: ',
        'echo "CFG=$CLAUDE_CONFIG_DIR"; echo "GH=$GSTACK_HOME"; ',
        'echo "CW=$CONDUCTOR_WORKSPACE_PATH"; echo "GP=$GBRAIN_POISON_PROBE"',
      ].join('');
      const result = await runSkillTest({
        prompt: probe,
        workingDirectory: sandbox,
        maxTurns: 3,
        allowedTools: ['Bash'],
        timeout: 120_000,
        testName: 'hermetic-canary',
        runId,
        model: CANARY_MODEL,
      });
      logCost('hermetic-canary', result);
      recordE2E(evalCollector, 'hermetic-canary', 'e2e-hermetic', result);

      expect(result.exitReason).toBe('success');

      // Deterministic: assert the Bash tool OUTPUT, not the model's prose.
      const bashOut = toolResultText(result.transcript);
      const cfgMatch = /CFG=(\S*)/.exec(bashOut);
      const cfg = cfgMatch?.[1] ?? '';
      expect(cfg).toMatch(/gstack-hermetic-.*\/\.claude$/);
      expect(bashOut).toMatch(/GH=\S*gstack-home/);
      // Planted contamination must not reach the child. CLAUDECODE is NOT
      // probed here: the child claude CLI sets CLAUDECODE=1 for its own tool
      // subprocesses (verified empirically — CI behaves identically), so the
      // Bash tool can't observe our scrub of it; the unit test pins that.
      expect(bashOut).toMatch(/(^|\n)CW=\s*($|\n)/); // planted Conductor var scrubbed
      expect(bashOut).toMatch(/(^|\n)GP=\s*($|\n)/); // GBRAIN_* scrubbed

      // Zero MCP servers — semantic, from the init event, not a flag grep.
      const init = initEvent(result.transcript);
      expect(init).toBeTruthy();
      expect(init.mcp_servers ?? []).toHaveLength(0);

      // Auth: nonzero cost = the API key paid (OAuth/keychain reports 0).
      const resultEvent = result.transcript.find((e) => e.type === 'result');
      expect(resultEvent?.total_cost_usd).toBeGreaterThan(0);
    } finally {
      // Put the test process env back exactly as it was found.
      for (const name of Object.keys(saved)) {
        const before = saved[name];
        if (before === undefined) delete process.env[name];
        else process.env[name] = before;
      }
      fs.rmSync(sandbox, { recursive: true, force: true });
    }
  }, 180_000);

  testIfSelected('hermetic-sentinel', async () => {
    requireApiKey('hermetic-sentinel');

    const sandbox = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-sentinel-'));
    // Poisoned operator config tree: if the hermetic redirect breaks, the
    // child discovers this dir and both probes below fire.
    const poisonRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-poison-'));
    const poisonCfg = path.join(poisonRoot, '.claude');
    fs.mkdirSync(poisonCfg, { recursive: true });
    fs.writeFileSync(path.join(poisonCfg, 'CLAUDE.md'), 'POISONED OPERATOR MEMORY — must never load\n');
    const poisonedSettings = {
      hasCompletedOnboarding: true,
      mcpServers: { 'sentinel-mcp': { command: '/usr/bin/true', args: [] } },
    };
    fs.writeFileSync(path.join(poisonCfg, '.claude.json'), JSON.stringify(poisonedSettings));
    const savedCfgDir = process.env.CLAUDE_CONFIG_DIR;
    process.env.CLAUDE_CONFIG_DIR = poisonCfg;

    try {
      const probe = [
        'Run exactly this bash command and then stop: ',
        'echo "CFG=$CLAUDE_CONFIG_DIR"; ',
        'if [ -f "$CLAUDE_CONFIG_DIR/CLAUDE.md" ]; then echo "USER_MD=present"; else echo "USER_MD=absent"; fi',
      ].join('');
      const result = await runSkillTest({
        prompt: probe,
        workingDirectory: sandbox,
        maxTurns: 3,
        allowedTools: ['Bash'],
        timeout: 120_000,
        testName: 'hermetic-sentinel',
        runId,
        model: CANARY_MODEL,
      });
      logCost('hermetic-sentinel', result);
      recordE2E(evalCollector, 'hermetic-sentinel', 'e2e-hermetic', result);

      expect(result.exitReason).toBe('success');

      const bashOut = toolResultText(result.transcript);
      const cfgMatch = /CFG=(\S*)/.exec(bashOut);
      const cfg = cfgMatch?.[1] ?? '';
      // The redirect must beat the poisoned operator value...
      expect(cfg).not.toBe(poisonCfg);
      expect(cfg).toMatch(/gstack-hermetic-.*\/\.claude$/);
      // ...and the active config dir must not carry the poisoned user memory.
      expect(bashOut).toContain('USER_MD=absent');

      // The planted MCP server must be invisible: zero servers in init.
      const init = initEvent(result.transcript);
      expect(init).toBeTruthy();
      const servers = (init.mcp_servers ?? []).map((s: any) => s?.name ?? s);
      expect(servers).toHaveLength(0);
      expect(JSON.stringify(servers)).not.toContain('sentinel-mcp');
    } finally {
      if (savedCfgDir === undefined) delete process.env.CLAUDE_CONFIG_DIR;
      else process.env.CLAUDE_CONFIG_DIR = savedCfgDir;
      fs.rmSync(sandbox, { recursive: true, force: true });
      fs.rmSync(poisonRoot, { recursive: true, force: true });
    }
  }, 180_000);
});
|
||||
|
||||
afterAll(() => finalizeEvalCollector(evalCollector));
|
||||
@@ -36,6 +36,16 @@ afterEach(() => {
|
||||
rmSync(workDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
// Under `bun test --concurrent`, overlapping tests read the SAME shared
|
||||
// `workDir` binding (beforeEach reassigns it mid-flight), so a fixed
|
||||
// 'daemon.pid' name collides: the first daemon claims it and every sibling
|
||||
// gets already_running against the test process's own (always-alive) pid —
|
||||
// the exact failure seen in full gate runs at 15-way concurrency. Unique
|
||||
// per-claim pidfiles keep the single-instance semantics under test while
|
||||
// removing the cross-test collision.
|
||||
// Monotonic counter behind uniquePidfile(); incremented once per call.
let pidfileSeq = 0;
// Builds a fresh pidfile path under the current workDir on every call
// (daemon-1.pid, daemon-2.pid, ...).
const uniquePidfile = () => {
  pidfileSeq += 1;
  return join(workDir, `daemon-${pidfileSeq}.pid`);
};
|
||||
|
||||
interface StubState {
|
||||
loggedIn: boolean;
|
||||
username: string;
|
||||
@@ -205,7 +215,7 @@ class AppState {
|
||||
const daemon = await startDaemon({
|
||||
loopbackPort: 0,
|
||||
tailnetEnabled: false,
|
||||
pidfilePath: join(workDir, 'daemon.pid'),
|
||||
pidfilePath: uniquePidfile(),
|
||||
tunnelProvider: async () => tunnel,
|
||||
});
|
||||
if ('error' in daemon) throw new Error(daemon.error);
|
||||
@@ -249,7 +259,7 @@ describe('ios-qa E2E (agent-flow simulation)', () => {
|
||||
const daemon = await startDaemon({
|
||||
loopbackPort: 0,
|
||||
tailnetEnabled: false,
|
||||
pidfilePath: join(workDir, 'daemon.pid'),
|
||||
pidfilePath: uniquePidfile(),
|
||||
tunnelProvider: async () => tunnel,
|
||||
});
|
||||
if ('error' in daemon) throw new Error(daemon.error);
|
||||
@@ -314,7 +324,7 @@ describe('ios-qa E2E (agent-flow simulation)', () => {
|
||||
const daemon = await startDaemon({
|
||||
loopbackPort: 0,
|
||||
tailnetEnabled: false,
|
||||
pidfilePath: join(workDir, 'daemon.pid'),
|
||||
pidfilePath: uniquePidfile(),
|
||||
tunnelProvider: async () => tunnel,
|
||||
});
|
||||
if ('error' in daemon) throw new Error(daemon.error);
|
||||
@@ -352,7 +362,7 @@ describe('ios-qa E2E (agent-flow simulation)', () => {
|
||||
const daemon = await startDaemon({
|
||||
loopbackPort: 0,
|
||||
tailnetEnabled: true,
|
||||
pidfilePath: join(workDir, 'daemon.pid'),
|
||||
pidfilePath: uniquePidfile(),
|
||||
tunnelProvider: async () => tunnel,
|
||||
probeImpl: async () => ({ ok: true, ownIdentity: 'mac@e2e' }),
|
||||
whoIsImpl: async () => ({ identity: 'agent@e2e', raw: {} }),
|
||||
@@ -430,7 +440,7 @@ describe('ios-qa E2E (agent-flow simulation)', () => {
|
||||
const daemon = await startDaemon({
|
||||
loopbackPort: 0,
|
||||
tailnetEnabled: true,
|
||||
pidfilePath: join(workDir, 'daemon.pid'),
|
||||
pidfilePath: uniquePidfile(),
|
||||
tunnelProvider: async () => tunnel,
|
||||
probeImpl: async () => ({ ok: true, ownIdentity: 'mac@e2e' }),
|
||||
whoIsImpl: async () => ({ identity: 'readonly@e2e', raw: {} }),
|
||||
|
||||
@@ -546,10 +546,13 @@ async function runWorkflowJudge(opts: {
|
||||
// slice markers vanish from the skeleton and the judge scores empty content.
|
||||
let content = fs.readFileSync(path.join(ROOT, opts.skillPath), 'utf-8');
|
||||
const secDir = path.join(ROOT, path.dirname(opts.skillPath), 'sections');
|
||||
const sectionBodies: string[] = [];
|
||||
if (fs.existsSync(secDir)) {
|
||||
for (const f of fs.readdirSync(secDir).sort()) {
|
||||
if (f.endsWith('.md') && !f.endsWith('.md.tmpl')) {
|
||||
content += '\n' + fs.readFileSync(path.join(secDir, f), 'utf-8');
|
||||
const body = fs.readFileSync(path.join(secDir, f), 'utf-8');
|
||||
sectionBodies.push(body);
|
||||
content += '\n' + body;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -565,6 +568,17 @@ async function runWorkflowJudge(opts: {
|
||||
section = content.slice(startIdx);
|
||||
}
|
||||
|
||||
// Two carve shapes exist. plan-eng/plan-design moved the MARKERS into the
|
||||
// section files, so the slice above already reaches the carved content.
|
||||
// document-release instead keeps its markers in the skeleton and carves the
|
||||
// workflow BODY (Steps 2-9 → sections/release-body.md) AFTER the endMarker,
|
||||
// so the marker slice drops it. Re-append any carved section the window
|
||||
// excluded, so the judge always sees the full workflow the agent executes.
|
||||
for (const body of sectionBodies) {
|
||||
const head = body.trim().slice(0, 120);
|
||||
if (head && !section.includes(head)) section += '\n' + body;
|
||||
}
|
||||
|
||||
const scores = await callJudge<JudgeScore>(`You are evaluating the quality of ${opts.judgeContext} for an AI coding agent.
|
||||
|
||||
The agent reads this document to learn ${opts.judgeGoal}. It references external tools and files
|
||||
|
||||
@@ -1386,15 +1386,16 @@ describe('Codex skill', () => {
|
||||
expect(content).toContain('Adversarial review (always-on)');
|
||||
// Always-on: both Claude and Codex adversarial
|
||||
expect(content).toContain('Claude adversarial subagent (always runs)');
|
||||
expect(content).toContain('Codex adversarial challenge (always runs when available)');
|
||||
expect(content).toContain('Codex adversarial challenge (runs whenever');
|
||||
// Claude adversarial subagent dispatch
|
||||
expect(content).toContain('Agent tool');
|
||||
expect(content).toContain('FIXABLE');
|
||||
expect(content).toContain('INVESTIGATE');
|
||||
// Codex availability check
|
||||
expect(content).toContain('CODEX_NOT_AVAILABLE');
|
||||
// OLD_CFG only gates Codex, not Claude
|
||||
expect(content).toContain('skip Codex passes only');
|
||||
// Probe-based availability via the shared codexPreflight() (install + auth)
|
||||
expect(content).toContain('CODEX_MODE');
|
||||
expect(content).toContain('command -v codex'); // install check kept literal
|
||||
// codex_reviews=disabled gates Codex passes only; Claude adversarial still runs
|
||||
expect(content).toContain('skip the Codex passes ONLY');
|
||||
// Review log
|
||||
expect(content).toContain('adversarial-review');
|
||||
expect(content).toContain('reasoning_effort="high"');
|
||||
@@ -1449,6 +1450,43 @@ describe('Codex skill', () => {
|
||||
expect(content).toContain('codex exec');
|
||||
});
|
||||
|
||||
// D5 regression guard: the Codex outside voice is default-on, not opt-in. A future
|
||||
// gen-skill-docs change must not silently reintroduce the "Want an outside voice?"
|
||||
// AskUserQuestion. The CODEX_PLAN_REVIEW content renders into each skill's
|
||||
// sections/review-sections.md (the skeleton points at it). plan-design-review uses
|
||||
// DESIGN_OUTSIDE_VOICES, not CODEX_PLAN_REVIEW, so it is excluded here.
|
||||
test('plan reviews run the Codex outside voice default-on (no opt-in question)', () => {
  // Each listed plan-review skill is checked against its own
  // sections/review-sections.md file.
  const planSkills = ['plan-eng-review', 'plan-ceo-review', 'plan-devex-review'];
  for (const skill of planSkills) {
    const sectionFile = path.join(ROOT, skill, 'sections', 'review-sections.md');
    const rendered = fs.readFileSync(sectionFile, 'utf-8');
    // The opt-in question must be gone...
    expect(rendered).not.toContain('Want an outside voice');
    // ...and the default-on heading plus preflight markers must be present.
    expect(rendered).toContain('Outside Voice — Independent Plan Challenge (default-on)');
    expect(rendered).toContain('CODEX_MODE');
    expect(rendered).toContain('command -v codex'); // preflight install check (e2e relies on it)
  }
});
|
||||
|
||||
test('/document-release includes the default-on Codex documentation review', () => {
  // The doc-review renders into the carved release-body section (kept out of the
  // always-loaded skeleton to respect the skeleton-byte budget).
  const releaseBodyPath = path.join(ROOT, 'document-release', 'sections', 'release-body.md');
  const releaseBody = fs.readFileSync(releaseBodyPath, 'utf-8');
  const requiredMarkers = [
    'Codex Documentation Review (default-on)',
    'CODEX_MODE',
    'codex-doc-review',
  ];
  for (const marker of requiredMarkers) {
    expect(releaseBody).toContain(marker);
  }
});
|
||||
|
||||
test('codex-host document-release does NOT contain the Codex doc review', () => {
  // .agents/ is gitignored — generate on demand (codex never invokes itself)
  const gen = Bun.spawnSync(['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', 'codex'], {
    cwd: ROOT, stdout: 'pipe', stderr: 'pipe',
  });
  // Fail on the generator itself: otherwise a broken run shows up as an
  // unrelated ENOENT below, or the negative assertions pass against a stale
  // SKILL.md left over from an earlier generation.
  if (gen.exitCode !== 0) {
    throw new Error(
      `gen-skill-docs --host codex exited with code ${gen.exitCode}: ${gen.stderr.toString()}`,
    );
  }
  const content = fs.readFileSync(
    path.join(ROOT, '.agents', 'skills', 'gstack-document-release', 'SKILL.md'), 'utf-8');
  expect(content).not.toContain('Codex Documentation Review');
  expect(content).not.toContain('codex-doc-review');
});
|
||||
|
||||
test('codex review invocations avoid the prompt plus --base argument shape', () => {
|
||||
for (const rel of ['codex/SKILL.md', 'review/SKILL.md', 'ship/SKILL.md']) {
|
||||
// ship's codex command moved into sections/adversarial.md (T9 carve).
|
||||
|
||||
Reference in New Issue
Block a user