test(e2e): privacy-gate AskUserQuestion fires from preamble (periodic tier)

Two periodic-tier E2E tests exercising the preamble's privacy gate
end-to-end via the Agent SDK + canUseTool. Previously uncovered:

- Positive: stages a fake gbrain on PATH + gbrain_sync_mode_prompted=false
  in config, runs a real skill, intercepts tool-use. Asserts the
  preamble fires a 3-option AskUserQuestion matching the canonical
  prose ("publish session memory" / "artifact" / "decline") and does
  NOT fire a second time in the same run (idempotency within session).

- Negative: same staging but prompted=true. Asserts the gate stays
  silent even with gbrain detected on the host.

Registered in test/helpers/touchfiles.ts as `brain-privacy-gate`
(periodic) with dependency tracking on generate-brain-sync-block.ts,
the three gstack-brain-* bins, gstack-config, and the Agent SDK runner.
Diff-based selection re-runs the E2E when any of those change.

Cost: ~$0.30-$0.50 per run. Only fires under EVALS=1 EVALS_TIER=periodic;
gate tier stays free.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-24 01:07:30 -07:00
parent 4a9d964b42
commit c13cee6939
2 changed files with 205 additions and 0 deletions
+5
View File
@@ -92,6 +92,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'],
'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts'],
'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'test/helpers/agent-sdk-runner.ts'],
'brain-privacy-gate': ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-brain-init', 'bin/gstack-config', 'test/helpers/agent-sdk-runner.ts'],
// AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10)
// Fires when either template OR the two preamble resolvers change.
@@ -336,6 +337,10 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'plan-mode-no-op': 'gate',
'e2e-harness-audit': 'gate',
// Privacy gate for gstack-brain-sync — periodic (non-deterministic LLM call,
// costs ~$0.30-$0.50 per run, not needed on every commit)
'brain-privacy-gate': 'periodic',
// AskUserQuestion format regression — periodic (Opus 4.7 non-deterministic benchmark)
'plan-ceo-review-format-mode': 'periodic',
'plan-ceo-review-format-approach': 'periodic',
+200
View File
@@ -0,0 +1,200 @@
/**
* Privacy-gate E2E (periodic tier, paid).
*
* The gbrain-sync preamble block instructs the model to fire a one-time
* AskUserQuestion when:
* - `BRAIN_SYNC: off` in the preamble echo (sync mode not on)
* - config `gbrain_sync_mode_prompted` is "false"
* - gbrain is detected on the host (binary on PATH or `gbrain doctor`
* --fast --json succeeds)
*
* This test stages all three conditions (via env + a fake `gbrain` binary
* on PATH), runs a cheap gstack skill through the Agent SDK, intercepts
* every tool use via canUseTool, and asserts: one of the AskUserQuestions
* fired by the preamble is the privacy gate with its distinctive prose
* and three options (full / artifacts-only / decline).
*
* Cost: ~$0.30-$0.50 per run. Periodic tier (EVALS=1 EVALS_TIER=periodic).
*
* See scripts/resolvers/preamble/generate-brain-sync-block.ts for the
* prose contract this test locks in.
*/
import { describe, test, expect } from 'bun:test';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import { runAgentSdkTest, passThroughNonAskUserQuestion, resolveClaudeBinary } from './helpers/agent-sdk-runner';
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
const describeE2E = shouldRun ? describe : describe.skip;
describeE2E('gbrain-sync privacy gate fires once via preamble', () => {
test('gstack skill preamble fires the 3-option AskUserQuestion when gbrain is detected', async () => {
// Stage a fresh GSTACK_HOME with gbrain_sync_mode_prompted=false.
const gstackHome = fs.mkdtempSync(path.join(os.tmpdir(), 'privacy-gate-gstack-'));
const fakeBinDir = fs.mkdtempSync(path.join(os.tmpdir(), 'privacy-gate-bin-'));
// Seed the config so the gate's condition passes.
fs.writeFileSync(
path.join(gstackHome, 'config.yaml'),
'gbrain_sync_mode: off\ngbrain_sync_mode_prompted: false\n',
{ mode: 0o600 }
);
// Fake `gbrain` binary that makes the host-detection probe succeed.
// The preamble checks `gbrain doctor --fast --json` OR `which gbrain`.
// Either branch counts as "gbrain detected."
fs.writeFileSync(
path.join(fakeBinDir, 'gbrain'),
'#!/bin/bash\n' +
'case "$1" in\n' +
' doctor) echo \'{"status":"ok","schema_version":2}\' ; exit 0 ;;\n' +
' --version) echo "0.18.2" ; exit 0 ;;\n' +
' *) exit 0 ;;\n' +
'esac\n',
{ mode: 0o755 }
);
const askUserQuestions: Array<{ input: Record<string, unknown> }> = [];
const binary = resolveClaudeBinary();
try {
// Pick a small skill with the preamble. `/learn` is read-only +
// short, which keeps the token cost down. The preamble fires
// regardless of which skill we pick.
await runAgentSdkTest({
systemPrompt: { type: 'preset', preset: 'claude_code' },
userPrompt:
'Run /learn with no arguments. Just report the learnings count and answer any AskUserQuestion that fires.',
workingDirectory: gstackHome,
maxTurns: 6,
allowedTools: ['Read', 'Grep', 'Glob', 'Bash'],
env: {
GSTACK_HOME: gstackHome,
// Prepend the fake gbrain to PATH so the preamble's detection wins.
PATH: `${fakeBinDir}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`,
},
...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
canUseTool: async (toolName, input) => {
if (toolName === 'AskUserQuestion') {
askUserQuestions.push({ input });
// Auto-answer "Decline — keep everything local" (option C)
// so the skill can continue without actually turning on sync.
const q = (input.questions as Array<{
question: string;
options: Array<{ label: string }>;
}>)[0];
const decline =
q.options.find((o) => /decline|keep everything local|no thanks/i.test(o.label)) ??
q.options[q.options.length - 1]!;
return {
behavior: 'allow',
updatedInput: {
questions: input.questions,
answers: { [q.question]: decline.label },
},
};
}
return passThroughNonAskUserQuestion(toolName, input);
},
});
// Assertion 1: the privacy gate fired.
const privacyQuestions = askUserQuestions.filter((aq) => {
const qs = aq.input.questions as Array<{ question: string }>;
return qs.some(
(q) =>
/publish.*session memory|private github repo|gbrain indexes/i.test(q.question)
);
});
expect(privacyQuestions.length).toBeGreaterThanOrEqual(1);
// Assertion 2: the question has the three expected options.
const gate = privacyQuestions[0]!.input.questions as Array<{
question: string;
options: Array<{ label: string }>;
}>;
const labels = gate[0]!.options.map((o) => o.label.toLowerCase()).join(' | ');
// Full / artifacts-only / decline are the three canonical options.
expect(labels).toMatch(/everything|allowlisted|full/);
expect(labels).toMatch(/artifact/);
expect(labels).toMatch(/decline|local|no thanks/);
// Assertion 3: the gate should NOT fire twice in one run.
// (The preamble is supposed to be idempotent within a session.)
expect(privacyQuestions.length).toBe(1);
} finally {
fs.rmSync(gstackHome, { recursive: true, force: true });
fs.rmSync(fakeBinDir, { recursive: true, force: true });
}
}, 180_000);
test('privacy gate does NOT fire when gbrain_sync_mode_prompted is already true', async () => {
// Same staging, but prompted=true this time. Gate should be silent.
const gstackHome = fs.mkdtempSync(path.join(os.tmpdir(), 'privacy-gate-off-'));
const fakeBinDir = fs.mkdtempSync(path.join(os.tmpdir(), 'privacy-gate-off-bin-'));
fs.writeFileSync(
path.join(gstackHome, 'config.yaml'),
'gbrain_sync_mode: off\ngbrain_sync_mode_prompted: true\n',
{ mode: 0o600 }
);
fs.writeFileSync(
path.join(fakeBinDir, 'gbrain'),
'#!/bin/bash\necho \'{"status":"ok"}\'\nexit 0\n',
{ mode: 0o755 }
);
const askUserQuestions: Array<{ input: Record<string, unknown> }> = [];
const binary = resolveClaudeBinary();
try {
await runAgentSdkTest({
systemPrompt: { type: 'preset', preset: 'claude_code' },
userPrompt:
'Run /learn with no arguments. Just report the learnings count.',
workingDirectory: gstackHome,
maxTurns: 4,
allowedTools: ['Read', 'Grep', 'Glob', 'Bash'],
env: {
GSTACK_HOME: gstackHome,
PATH: `${fakeBinDir}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`,
},
...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
canUseTool: async (toolName, input) => {
if (toolName === 'AskUserQuestion') {
askUserQuestions.push({ input });
// Pass through whatever the model asks; don't prefer anything.
const q = (input.questions as Array<{
question: string;
options: Array<{ label: string }>;
}>)[0];
return {
behavior: 'allow',
updatedInput: {
questions: input.questions,
answers: { [q.question]: q.options[0]!.label },
},
};
}
return passThroughNonAskUserQuestion(toolName, input);
},
});
// No AskUserQuestion should have matched the privacy gate's prose.
const privacyQuestions = askUserQuestions.filter((aq) => {
const qs = aq.input.questions as Array<{ question: string }>;
return qs.some(
(q) =>
/publish.*session memory|private github repo|gbrain indexes/i.test(q.question)
);
});
expect(privacyQuestions.length).toBe(0);
} finally {
fs.rmSync(gstackHome, { recursive: true, force: true });
fs.rmSync(fakeBinDir, { recursive: true, force: true });
}
}, 180_000);
});