From c13cee6939d0d3c0e8fc70eb853a30f069173f70 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 24 Apr 2026 01:07:30 -0700 Subject: [PATCH] test(e2e): privacy-gate AskUserQuestion fires from preamble (periodic tier) Two periodic-tier E2E tests exercising the preamble's privacy gate end-to-end via the Agent SDK + canUseTool. Previously uncovered: - Positive: stages a fake gbrain on PATH + gbrain_sync_mode_prompted=false in config, runs a real skill, intercepts tool-use. Asserts the preamble fires a 3-option AskUserQuestion matching the canonical prose ("publish session memory" / "artifact" / "decline") and does NOT fire a second time in the same run (idempotency within session). - Negative: same staging but prompted=true. Asserts the gate stays silent even with gbrain detected on the host. Registered in test/helpers/touchfiles.ts as `brain-privacy-gate` (periodic) with dependency tracking on generate-brain-sync-block.ts, the three gstack-brain-* bins, gstack-config, and the Agent SDK runner. Diff-based selection re-runs the E2E when any of those change. Cost: ~$0.30-$0.50 per run. Only fires under EVALS=1 EVALS_TIER=periodic; gate tier stays free. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/touchfiles.ts | 5 + test/skill-e2e-brain-privacy-gate.test.ts | 200 ++++++++++++++++++++++ 2 files changed, 205 insertions(+) create mode 100644 test/skill-e2e-brain-privacy-gate.test.ts diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index acde310d..d039e771 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -92,6 +92,7 @@ export const E2E_TOUCHFILES: Record = { 'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'], 'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts'], 'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'test/helpers/agent-sdk-runner.ts'], + 'brain-privacy-gate': ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-brain-init', 'bin/gstack-config', 'test/helpers/agent-sdk-runner.ts'], // AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10) // Fires when either template OR the two preamble resolvers change. @@ -336,6 +337,10 @@ export const E2E_TIERS: Record = { 'plan-mode-no-op': 'gate', 'e2e-harness-audit': 'gate', + // Privacy gate for gstack-brain-sync — periodic (non-deterministic LLM call, + // costs ~$0.30-$0.50 per run, not needed on every commit) + 'brain-privacy-gate': 'periodic', + // AskUserQuestion format regression — periodic (Opus 4.7 non-deterministic benchmark) 'plan-ceo-review-format-mode': 'periodic', 'plan-ceo-review-format-approach': 'periodic', diff --git a/test/skill-e2e-brain-privacy-gate.test.ts b/test/skill-e2e-brain-privacy-gate.test.ts new file mode 100644 index 00000000..e4dd5e26 --- /dev/null +++ b/test/skill-e2e-brain-privacy-gate.test.ts @@ -0,0 +1,200 @@ +/** + * Privacy-gate E2E (periodic tier, paid). + * + * The gbrain-sync preamble block instructs the model to fire a one-time + * AskUserQuestion when: + * - `BRAIN_SYNC: off` in the preamble echo (sync mode not on) + * - config `gbrain_sync_mode_prompted` is "false" + * - gbrain is detected on the host (binary on PATH or `gbrain doctor` + * --fast --json succeeds) + * + * This test stages all three conditions (via env + a fake `gbrain` binary + * on PATH), runs a cheap gstack skill through the Agent SDK, intercepts + * every tool use via canUseTool, and asserts: one of the AskUserQuestions + * fired by the preamble is the privacy gate with its distinctive prose + * and three options (full / artifacts-only / decline). + * + * Cost: ~$0.30-$0.50 per run. Periodic tier (EVALS=1 EVALS_TIER=periodic). + * + * See scripts/resolvers/preamble/generate-brain-sync-block.ts for the + * prose contract this test locks in. + */ + +import { describe, test, expect } from 'bun:test'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { runAgentSdkTest, passThroughNonAskUserQuestion, resolveClaudeBinary } from './helpers/agent-sdk-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; +const describeE2E = shouldRun ? describe : describe.skip; + +describeE2E('gbrain-sync privacy gate fires once via preamble', () => { + test('gstack skill preamble fires the 3-option AskUserQuestion when gbrain is detected', async () => { + // Stage a fresh GSTACK_HOME with gbrain_sync_mode_prompted=false. + const gstackHome = fs.mkdtempSync(path.join(os.tmpdir(), 'privacy-gate-gstack-')); + const fakeBinDir = fs.mkdtempSync(path.join(os.tmpdir(), 'privacy-gate-bin-')); + + // Seed the config so the gate's condition passes. + fs.writeFileSync( + path.join(gstackHome, 'config.yaml'), + 'gbrain_sync_mode: off\ngbrain_sync_mode_prompted: false\n', + { mode: 0o600 } + ); + + // Fake `gbrain` binary that makes the host-detection probe succeed. + // The preamble checks `gbrain doctor --fast --json` OR `which gbrain`. + // Either branch counts as "gbrain detected." + fs.writeFileSync( + path.join(fakeBinDir, 'gbrain'), + '#!/bin/bash\n' + + 'case "$1" in\n' + + ' doctor) echo \'{"status":"ok","schema_version":2}\' ; exit 0 ;;\n' + + ' --version) echo "0.18.2" ; exit 0 ;;\n' + + ' *) exit 0 ;;\n' + + 'esac\n', + { mode: 0o755 } + ); + + const askUserQuestions: Array<{ input: Record }> = []; + const binary = resolveClaudeBinary(); + + try { + // Pick a small skill with the preamble. `/learn` is read-only + + // short, which keeps the token cost down. The preamble fires + // regardless of which skill we pick. + await runAgentSdkTest({ + systemPrompt: { type: 'preset', preset: 'claude_code' }, + userPrompt: + 'Run /learn with no arguments. Just report the learnings count and answer any AskUserQuestion that fires.', + workingDirectory: gstackHome, + maxTurns: 6, + allowedTools: ['Read', 'Grep', 'Glob', 'Bash'], + env: { + GSTACK_HOME: gstackHome, + // Prepend the fake gbrain to PATH so the preamble's detection wins. + PATH: `${fakeBinDir}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`, + }, + ...(binary ? { pathToClaudeCodeExecutable: binary } : {}), + canUseTool: async (toolName, input) => { + if (toolName === 'AskUserQuestion') { + askUserQuestions.push({ input }); + // Auto-answer "Decline — keep everything local" (option C) + // so the skill can continue without actually turning on sync. + const q = (input.questions as Array<{ + question: string; + options: Array<{ label: string }>; + }>)[0]; + const decline = + q.options.find((o) => /decline|keep everything local|no thanks/i.test(o.label)) ?? + q.options[q.options.length - 1]!; + return { + behavior: 'allow', + updatedInput: { + questions: input.questions, + answers: { [q.question]: decline.label }, + }, + }; + } + return passThroughNonAskUserQuestion(toolName, input); + }, + }); + + // Assertion 1: the privacy gate fired. + const privacyQuestions = askUserQuestions.filter((aq) => { + const qs = aq.input.questions as Array<{ question: string }>; + return qs.some( + (q) => + /publish.*session memory|private github repo|gbrain indexes/i.test(q.question) + ); + }); + expect(privacyQuestions.length).toBeGreaterThanOrEqual(1); + + // Assertion 2: the question has the three expected options. + const gate = privacyQuestions[0]!.input.questions as Array<{ + question: string; + options: Array<{ label: string }>; + }>; + const labels = gate[0]!.options.map((o) => o.label.toLowerCase()).join(' | '); + // Full / artifacts-only / decline are the three canonical options. + expect(labels).toMatch(/everything|allowlisted|full/); + expect(labels).toMatch(/artifact/); + expect(labels).toMatch(/decline|local|no thanks/); + + // Assertion 3: the gate should NOT fire twice in one run. + // (The preamble is supposed to be idempotent within a session.) + expect(privacyQuestions.length).toBe(1); + } finally { + fs.rmSync(gstackHome, { recursive: true, force: true }); + fs.rmSync(fakeBinDir, { recursive: true, force: true }); + } + }, 180_000); + + test('privacy gate does NOT fire when gbrain_sync_mode_prompted is already true', async () => { + // Same staging, but prompted=true this time. Gate should be silent. + const gstackHome = fs.mkdtempSync(path.join(os.tmpdir(), 'privacy-gate-off-')); + const fakeBinDir = fs.mkdtempSync(path.join(os.tmpdir(), 'privacy-gate-off-bin-')); + + fs.writeFileSync( + path.join(gstackHome, 'config.yaml'), + 'gbrain_sync_mode: off\ngbrain_sync_mode_prompted: true\n', + { mode: 0o600 } + ); + + fs.writeFileSync( + path.join(fakeBinDir, 'gbrain'), + '#!/bin/bash\necho \'{"status":"ok"}\'\nexit 0\n', + { mode: 0o755 } + ); + + const askUserQuestions: Array<{ input: Record }> = []; + const binary = resolveClaudeBinary(); + + try { + await runAgentSdkTest({ + systemPrompt: { type: 'preset', preset: 'claude_code' }, + userPrompt: + 'Run /learn with no arguments. Just report the learnings count.', + workingDirectory: gstackHome, + maxTurns: 4, + allowedTools: ['Read', 'Grep', 'Glob', 'Bash'], + env: { + GSTACK_HOME: gstackHome, + PATH: `${fakeBinDir}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`, + }, + ...(binary ? { pathToClaudeCodeExecutable: binary } : {}), + canUseTool: async (toolName, input) => { + if (toolName === 'AskUserQuestion') { + askUserQuestions.push({ input }); + // Pass through whatever the model asks; don't prefer anything. + const q = (input.questions as Array<{ + question: string; + options: Array<{ label: string }>; + }>)[0]; + return { + behavior: 'allow', + updatedInput: { + questions: input.questions, + answers: { [q.question]: q.options[0]!.label }, + }, + }; + } + return passThroughNonAskUserQuestion(toolName, input); + }, + }); + + // No AskUserQuestion should have matched the privacy gate's prose. + const privacyQuestions = askUserQuestions.filter((aq) => { + const qs = aq.input.questions as Array<{ question: string }>; + return qs.some( + (q) => + /publish.*session memory|private github repo|gbrain indexes/i.test(q.question) + ); + }); + expect(privacyQuestions.length).toBe(0); + } finally { + fs.rmSync(gstackHome, { recursive: true, force: true }); + fs.rmSync(fakeBinDir, { recursive: true, force: true }); + } + }, 180_000); +});