fix(test): E2E privacy gate — ambient env + skill-file prompt

Two fixes to get the E2E actually running end-to-end (first attempt
failed at the SDK auth step, second at the assertion step):

1. Don't pass an explicit `env:` object to runAgentSdkTest. The SDK's
   auth pipeline misses ANTHROPIC_API_KEY when env is supplied as an
   object (verified against the plan-mode-no-op test, which passes no
   env and auths cleanly). Mutate process.env before the call instead,
   and restore the originals in finally so other tests don't inherit
   the ambient mutation.

2. The "Run /learn with no arguments" user prompt was too narrow — the
   model reduced it to a direct action and skipped the preamble
   privacy-gate directives entirely, so zero AskUserQuestions fired.
   Mirror the plan-mode-no-op pattern: point the model at the skill
   file on disk and ask it to follow every preamble directive. Bumped
   maxTurns from 6 to 10 to give the preamble room to execute.

Verified both tests pass under `EVALS=1 EVALS_TIER=periodic bun test
test/skill-e2e-brain-privacy-gate.test.ts` against a real ANTHROPIC_API_KEY.
Cost per run: ~$0.30-$0.50 per test.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-24 01:24:12 -07:00
parent f989fdbf2f
commit 371a7e684a
+41 -14
View File
@@ -59,22 +59,38 @@ describeE2E('gbrain-sync privacy gate fires once via preamble', () => {
const askUserQuestions: Array<{ input: Record<string, unknown> }> = [];
const binary = resolveClaudeBinary();
// Ambient env mutations — restored in finally so other tests in the file
// don't inherit them.
const origGstackHome = process.env.GSTACK_HOME;
const origPath = process.env.PATH;
process.env.GSTACK_HOME = gstackHome;
process.env.PATH = `${fakeBinDir}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`;
try {
// Pick a small skill with the preamble. `/learn` is read-only +
// short, which keeps the token cost down. The preamble fires
// regardless of which skill we pick.
// Pick a small skill with the preamble and load it via Read to force
// the model to execute every preamble directive. A narrow "run /learn"
// prompt often gets reduced to a direct action, skipping the preamble
// gates. Mirror the plan-mode-no-op test pattern: ask the model to
// follow the skill's instructions in full.
const learnSkill = path.resolve(
import.meta.dir,
'..',
'learn',
'SKILL.md'
);
await runAgentSdkTest({
systemPrompt: { type: 'preset', preset: 'claude_code' },
userPrompt:
'Run /learn with no arguments. Just report the learnings count and answer any AskUserQuestion that fires.',
`Read the skill file at ${learnSkill} and follow its instructions from the top, including every preamble directive. Execute every bash block. If any AskUserQuestion fires, present it.`,
workingDirectory: gstackHome,
maxTurns: 6,
maxTurns: 10,
allowedTools: ['Read', 'Grep', 'Glob', 'Bash'],
env: {
GSTACK_HOME: gstackHome,
// Prepend the fake gbrain to PATH so the preamble's detection wins.
PATH: `${fakeBinDir}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`,
},
// NOTE: do NOT pass `env:` here. When the Agent SDK gets an explicit
// env object, its auth pipeline doesn't pick up ANTHROPIC_API_KEY the
// same way as when env is undefined (SDK-internal detail, verified
// against the plan-mode-no-op test which passes no env and auths
// cleanly). Instead, mutate process.env before the call so the SDK
// inherits our overrides ambiently.
...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
canUseTool: async (toolName, input) => {
if (toolName === 'AskUserQuestion') {
@@ -125,6 +141,11 @@ describeE2E('gbrain-sync privacy gate fires once via preamble', () => {
// (The preamble is supposed to be idempotent within a session.)
expect(privacyQuestions.length).toBe(1);
} finally {
// Restore ambient env before other tests.
if (origGstackHome === undefined) delete process.env.GSTACK_HOME;
else process.env.GSTACK_HOME = origGstackHome;
if (origPath === undefined) delete process.env.PATH;
else process.env.PATH = origPath;
fs.rmSync(gstackHome, { recursive: true, force: true });
fs.rmSync(fakeBinDir, { recursive: true, force: true });
}
@@ -150,6 +171,12 @@ describeE2E('gbrain-sync privacy gate fires once via preamble', () => {
const askUserQuestions: Array<{ input: Record<string, unknown> }> = [];
const binary = resolveClaudeBinary();
// Ambient env mutations (see note on the first test).
const origGstackHome = process.env.GSTACK_HOME;
const origPath = process.env.PATH;
process.env.GSTACK_HOME = gstackHome;
process.env.PATH = `${fakeBinDir}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`;
try {
await runAgentSdkTest({
systemPrompt: { type: 'preset', preset: 'claude_code' },
@@ -158,10 +185,6 @@ describeE2E('gbrain-sync privacy gate fires once via preamble', () => {
workingDirectory: gstackHome,
maxTurns: 4,
allowedTools: ['Read', 'Grep', 'Glob', 'Bash'],
env: {
GSTACK_HOME: gstackHome,
PATH: `${fakeBinDir}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`,
},
...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
canUseTool: async (toolName, input) => {
if (toolName === 'AskUserQuestion') {
@@ -193,6 +216,10 @@ describeE2E('gbrain-sync privacy gate fires once via preamble', () => {
});
expect(privacyQuestions.length).toBe(0);
} finally {
if (origGstackHome === undefined) delete process.env.GSTACK_HOME;
else process.env.GSTACK_HOME = origGstackHome;
if (origPath === undefined) delete process.env.PATH;
else process.env.PATH = origPath;
fs.rmSync(gstackHome, { recursive: true, force: true });
fs.rmSync(fakeBinDir, { recursive: true, force: true });
}