From c98f360ad061f951d89f2bf9c12174b90cf0c77c Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Mon, 20 Apr 2026 05:40:54 +0800 Subject: [PATCH] =?UTF-8?q?test(security):=20full-stack=20E2E=20=E2=80=94?= =?UTF-8?q?=20the=20security-contract=20anchor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spins up a real browse server + real sidebar-agent subprocess + mock claude binary, POSTs an injection via /sidebar-command, and verifies the whole pipeline reacts end-to-end: 1. Server canary-injects into the system prompt (assert: queue entry .canary field, .prompt includes it + "NEVER include it") 2. Sidebar-agent spawns mock-claude with PATH-overriden claude binary 3. Mock emits tool_use with CANARY-XXX in a URL query arg 4. Sidebar-agent detectCanaryLeak fires on the stream event 5. onCanaryLeaked logs + SIGTERM's the mock + emits security_event 6. /sidebar-chat returns security_event { verdict: 'block', reason: 'canary_leaked', layer: 'canary', domain: 'attacker.example.com' } 7. /sidebar-chat returns agent_error with "Session terminated — prompt injection detected" 8. ~/.gstack/security/attempts.jsonl has an entry with salted sha256 payload_hash, verdict=block, layer=canary, urlDomain=attacker.example.com 9. The log entry does NOT contain the raw canary value (hash only) Caught a real bug on first run: processAgentEvent didn't relay security_event, so the banner would never render in prod. Fixed in a separate commit. This test prevents that whole class of regression. Zero LLM cost, <10s runtime, fully deterministic. Gate tier. Co-Authored-By: Claude Opus 4.7 (1M context) --- browse/test/security-e2e-fullstack.test.ts | 218 +++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 browse/test/security-e2e-fullstack.test.ts diff --git a/browse/test/security-e2e-fullstack.test.ts b/browse/test/security-e2e-fullstack.test.ts new file mode 100644 index 00000000..01d347a0 --- /dev/null +++ b/browse/test/security-e2e-fullstack.test.ts @@ -0,0 +1,218 @@ +/** + * Full-stack E2E — the security-contract anchor test. + * + * Spins up a real browse server + real sidebar-agent subprocess, points + * them at a MOCK claude binary (browse/test/fixtures/mock-claude/claude) + * that deterministically emits a canary-leaking tool_use event, then + * verifies the whole pipeline reacts: + * + * 1. Server canary-injects into the system prompt + * 2. Server queues the message + * 3. Sidebar-agent spawns mock-claude + * 4. Mock-claude emits tool_use with CANARY-XXX in a URL arg + * 5. Sidebar-agent's detectCanaryLeak fires on the stream event + * 6. onCanaryLeaked logs, SIGTERM's mock-claude, emits security_event + * 7. /sidebar-chat returns security_event + agent_error entries + * + * This test proves the end-to-end contract: when a canary leak happens, + * the session terminates AND the sidepanel receives the events that drive + * the approved banner render. No LLM cost, <10s total runtime. + * + * Fully deterministic — safe to run on every commit (gate tier). + */ + +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import { spawn, type Subprocess } from 'bun'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; + +let serverProc: Subprocess | null = null; +let agentProc: Subprocess | null = null; +let serverPort = 0; +let authToken = ''; +let tmpDir = ''; +let stateFile = ''; +let queueFile = ''; +const MOCK_CLAUDE_DIR = path.resolve(import.meta.dir, 'fixtures', 'mock-claude'); + +async function apiFetch(pathname: string, opts: RequestInit = {}): Promise { + const headers: Record = { + 'Content-Type': 'application/json', + Authorization: `Bearer ${authToken}`, + ...(opts.headers as Record | undefined), + }; + return fetch(`http://127.0.0.1:${serverPort}${pathname}`, { ...opts, headers }); +} + +beforeAll(async () => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'security-e2e-fullstack-')); + stateFile = path.join(tmpDir, 'browse.json'); + queueFile = path.join(tmpDir, 'sidebar-queue.jsonl'); + fs.mkdirSync(path.dirname(queueFile), { recursive: true }); + + const serverScript = path.resolve(import.meta.dir, '..', 'src', 'server.ts'); + const agentScript = path.resolve(import.meta.dir, '..', 'src', 'sidebar-agent.ts'); + + // 1) Start the browse server. + serverProc = spawn(['bun', 'run', serverScript], { + env: { + ...process.env, + BROWSE_STATE_FILE: stateFile, + BROWSE_HEADLESS_SKIP: '1', // no Chromium for this test + BROWSE_PORT: '0', + SIDEBAR_QUEUE_PATH: queueFile, + BROWSE_IDLE_TIMEOUT: '300', + }, + stdio: ['ignore', 'pipe', 'pipe'], + }); + + // Wait for state file with token + port + const deadline = Date.now() + 15000; + while (Date.now() < deadline) { + if (fs.existsSync(stateFile)) { + try { + const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8')); + if (state.port && state.token) { + serverPort = state.port; + authToken = state.token; + break; + } + } catch {} + } + await new Promise((r) => setTimeout(r, 100)); + } + if (!serverPort) throw new Error('Server did not start in time'); + + // 2) Start the sidebar-agent with PATH prepended by the mock-claude dir. + // sidebar-agent spawns `claude` via PATH lookup (spawn('claude', ...) — see + // browse/src/sidebar-agent.ts spawnClaude), so prepending works without any + // source change. + const shimmedPath = `${MOCK_CLAUDE_DIR}:${process.env.PATH ?? ''}`; + agentProc = spawn(['bun', 'run', agentScript], { + env: { + ...process.env, + PATH: shimmedPath, + BROWSE_STATE_FILE: stateFile, + SIDEBAR_QUEUE_PATH: queueFile, + BROWSE_SERVER_PORT: String(serverPort), + BROWSE_PORT: String(serverPort), + BROWSE_NO_AUTOSTART: '1', + // Scenario for mock-claude inherits through spawn env below — the agent + // itself doesn't read this, but the claude subprocess it spawns does. + MOCK_CLAUDE_SCENARIO: 'canary_leak_in_tool_arg', + // Force classifier off so pre-spawn ML scan doesn't fire on our + // benign synthetic test prompt. This test exercises the canary + // path specifically. + GSTACK_SECURITY_OFF: '1', + }, + stdio: ['ignore', 'pipe', 'pipe'], + }); + + // Give the agent a moment to establish its poll loop. + await new Promise((r) => setTimeout(r, 500)); +}, 30000); + +async function drainStderr(proc: Subprocess | null, label: string): Promise { + if (!proc?.stderr) return; + try { + const reader = (proc.stderr as ReadableStream).getReader(); + // Drain briefly — don't block shutdown + const result = await Promise.race([ + reader.read(), + new Promise>((resolve) => + setTimeout(() => resolve({ done: true, value: undefined }), 100) + ), + ]); + if (result?.value) { + const text = new TextDecoder().decode(result.value); + if (text.trim()) console.error(`[${label} stderr]`, text.slice(0, 2000)); + } + } catch {} +} + +afterAll(async () => { + // Dump agent stderr for diagnostic + await drainStderr(agentProc, 'agent'); + for (const proc of [serverProc, agentProc]) { + if (proc) { + try { proc.kill('SIGTERM'); } catch {} + try { setTimeout(() => { try { proc.kill('SIGKILL'); } catch {} }, 1500); } catch {} + } + } + try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {} +}); + +describe('security pipeline E2E (mock claude)', () => { + test('server injects canary, queues message, agent spawns mock claude', async () => { + const resp = await apiFetch('/sidebar-command', { + method: 'POST', + body: JSON.stringify({ + message: "What's on this page?", + activeTabUrl: 'https://attacker.example.com/', + }), + }); + expect(resp.status).toBe(200); + + // Wait for the sidebar-agent to pick up the entry and spawn mock-claude. + // Queue entry must contain `canary` field (added by server.ts spawnClaude). + await new Promise((r) => setTimeout(r, 250)); + const queueContent = fs.readFileSync(queueFile, 'utf-8').trim(); + const lines = queueContent.split('\n').filter(Boolean); + expect(lines.length).toBeGreaterThan(0); + const entry = JSON.parse(lines[lines.length - 1]); + expect(entry.canary).toMatch(/^CANARY-[0-9A-F]+$/); + expect(entry.prompt).toContain(entry.canary); + expect(entry.prompt).toContain('NEVER include it'); + }); + + test('canary leak triggers security_event + agent_error in /sidebar-chat', async () => { + // By now the mock-claude subprocess has emitted the tool_use with the + // leaked canary. Sidebar-agent's handleStreamEvent -> detectCanaryLeak + // -> onCanaryLeaked should have fired security_event + agent_error and + // SIGTERM'd the mock. Poll /sidebar-chat up to 10s for the events. + const deadline = Date.now() + 10000; + let securityEvent: any = null; + let agentError: any = null; + while (Date.now() < deadline && (!securityEvent || !agentError)) { + const resp = await apiFetch('/sidebar-chat'); + const data: any = await resp.json(); + for (const entry of data.entries ?? []) { + if (entry.type === 'security_event') securityEvent = entry; + if (entry.type === 'agent_error') agentError = entry; + } + if (securityEvent && agentError) break; + await new Promise((r) => setTimeout(r, 250)); + } + + expect(securityEvent).not.toBeNull(); + expect(securityEvent.verdict).toBe('block'); + expect(securityEvent.reason).toBe('canary_leaked'); + expect(securityEvent.layer).toBe('canary'); + // The leak is on a tool_use channel — onCanaryLeaked records "tool_use:Bash" + expect(String(securityEvent.channel)).toContain('tool_use'); + expect(securityEvent.domain).toBe('attacker.example.com'); + + expect(agentError).not.toBeNull(); + expect(agentError.error).toContain('Session terminated'); + expect(agentError.error).toContain('prompt injection detected'); + }, 15000); + + test('attempts.jsonl logged with salted payload_hash and verdict=block', async () => { + // onCanaryLeaked also calls logAttempt — check the log file exists + // and contains the event. The file lives at ~/.gstack/security/attempts.jsonl. + const logPath = path.join(os.homedir(), '.gstack', 'security', 'attempts.jsonl'); + expect(fs.existsSync(logPath)).toBe(true); + const content = fs.readFileSync(logPath, 'utf-8'); + const recent = content.split('\n').filter(Boolean).slice(-10); + // Find at least one entry with verdict=block and layer=canary from our run + const ourEntry = recent + .map((l) => { try { return JSON.parse(l); } catch { return null; } }) + .find((e) => e && e.layer === 'canary' && e.verdict === 'block' && e.urlDomain === 'attacker.example.com'); + expect(ourEntry).toBeTruthy(); + // payload_hash is a 64-char sha256 hex + expect(String(ourEntry.payloadHash)).toMatch(/^[0-9a-f]{64}$/); + // Never stored the payload itself — only the hash + expect(JSON.stringify(ourEntry)).not.toContain('CANARY-'); + }); +});