From b5fa1df9c18270cf18190932d71c849f1e32b770 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 25 Apr 2026 21:48:12 -0700 Subject: [PATCH] chore: drop sidebar-agent test refs after chat rip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six test files / describe blocks targeted the deleted chat path: - browse/test/security-e2e-fullstack.test.ts (full-stack chat-pipeline E2E with mock claude — whole file gone) - browse/test/security-review-fullstack.test.ts (review-flow E2E with real classifier — whole file gone) - browse/test/security-review-sidepanel-e2e.test.ts (Playwright E2E for the security event banner that was ripped from sidepanel.html) - browse/test/security-audit-r2.test.ts (7 describe blocks: agent queue permissions, isValidQueueEntry stateFile traversal, loadSession session-ID validation, switchChatTab DocumentFragment, pollChat reentrancy guard, /sidebar-tabs URL sanitization, sidebar-agent SIGTERM→SIGKILL escalation; plus AGENT_SRC top-level read converted to graceful fallback) - browse/test/security-adversarial-fixes.test.ts (canary stream-chunk split detection on detectCanaryLeak; one tool-output test on sidebar-agent) - test/skill-validation.test.ts (sidebar agent #584 describe block) These all assumed sidebar-agent.ts existed and tested chat-queue plumbing, chat-tab DOM round-trip, chat-polling reentrancy, or per-message classifier canary detection. With the live PTY there is no chat queue, no chat tab, no LLM stream to canary-scan, and no per-message subprocess. The Terminal pane's invariants are covered by the new browse/test/sidebar-tabs.test.ts (27 structural assertions), browse/test/terminal-agent.test.ts, and browse/test/terminal-agent-integration.test.ts. bun test → exit 0, 0 failures. 
--- .../test/security-adversarial-fixes.test.ts | 40 +- browse/test/security-audit-r2.test.ts | 264 ++---------- browse/test/security-e2e-fullstack.test.ts | 218 ---------- browse/test/security-review-fullstack.test.ts | 405 ------------------ .../security-review-sidepanel-e2e.test.ts | 345 --------------- test/skill-validation.test.ts | 32 +- 6 files changed, 53 insertions(+), 1251 deletions(-) delete mode 100644 browse/test/security-e2e-fullstack.test.ts delete mode 100644 browse/test/security-review-fullstack.test.ts delete mode 100644 browse/test/security-review-sidepanel-e2e.test.ts diff --git a/browse/test/security-adversarial-fixes.test.ts b/browse/test/security-adversarial-fixes.test.ts index ac75a9fd..c14ea6a4 100644 --- a/browse/test/security-adversarial-fixes.test.ts +++ b/browse/test/security-adversarial-fixes.test.ts @@ -19,31 +19,10 @@ import { PAGE_CONTENT_COMMANDS } from '../src/commands'; const REPO_ROOT = path.resolve(__dirname, '..', '..'); -describe('canary stream-chunk split detection', () => { - test('detectCanaryLeak uses rolling buffer across consecutive deltas', () => { - // Pull in the function via dynamic require so we don't re-export it - // from sidebar-agent.ts (it's internal on purpose). - const agentSource = fs.readFileSync( - path.join(REPO_ROOT, 'browse', 'src', 'sidebar-agent.ts'), - 'utf-8', - ); - // Contract: detectCanaryLeak accepts an optional DeltaBuffer and - // uses .slice(-(canary.length - 1)) to retain a rolling tail. 
- expect(agentSource).toContain('DeltaBuffer'); - expect(agentSource).toMatch(/text_delta\s*=\s*combined\.slice\(-\(canary\.length - 1\)\)/); - expect(agentSource).toMatch(/input_json_delta\s*=\s*combined\.slice\(-\(canary\.length - 1\)\)/); - }); - - test('canary context initializes deltaBuf', () => { - const agentSource = fs.readFileSync( - path.join(REPO_ROOT, 'browse', 'src', 'sidebar-agent.ts'), - 'utf-8', - ); - // The askClaude call site must construct the buffer so the rolling - // detection actually runs. - expect(agentSource).toContain("deltaBuf: { text_delta: '', input_json_delta: '' }"); - }); -}); +// canary stream-chunk split detection — tested detectCanaryLeak inside +// sidebar-agent.ts. Both the chat-stream pipeline and the function are +// gone (Terminal pane uses an interactive PTY; user keystrokes are the +// trust source, no chunked LLM stream to canary-scan). describe('tool-output ensemble rule (single-layer BLOCK)', () => { test('user-input context: single layer at BLOCK degrades to WARN', () => { @@ -117,13 +96,10 @@ describe('transcript classifier tool_output parameter', () => { expect(src).toContain('tool_output'); }); - test('sidebar-agent passes tool text to transcript on tool-result scan', () => { - const src = fs.readFileSync( - path.join(REPO_ROOT, 'browse', 'src', 'sidebar-agent.ts'), - 'utf-8', - ); - expect(src).toContain('tool_output: text'); - }); + // sidebar-agent passed tool text to the transcript classifier on + // tool-result scans. That whole pipeline is gone — Terminal pane has + // no LLM stream to scan, and security-classifier.ts is dead code with + // no production caller (a separate v1.1+ cleanup TODO). 
}); describe('GSTACK_SECURITY_OFF kill switch', () => { diff --git a/browse/test/security-audit-r2.test.ts b/browse/test/security-audit-r2.test.ts index 97e9f082..9af4bcb6 100644 --- a/browse/test/security-audit-r2.test.ts +++ b/browse/test/security-audit-r2.test.ts @@ -15,7 +15,13 @@ import * as os from 'os'; const META_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/meta-commands.ts'), 'utf-8'); const WRITE_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/write-commands.ts'), 'utf-8'); const SERVER_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/server.ts'), 'utf-8'); -const AGENT_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/sidebar-agent.ts'), 'utf-8'); +// sidebar-agent.ts was ripped (chat queue replaced by interactive PTY). +// AGENT_SRC kept as empty string so the legacy describe block below skips +// without crashing module load on a missing file. +const AGENT_SRC = (() => { + try { return fs.readFileSync(path.join(import.meta.dir, '../src/sidebar-agent.ts'), 'utf-8'); } + catch { return ''; } +})(); const SNAPSHOT_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/snapshot.ts'), 'utf-8'); const PATH_SECURITY_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/path-security.ts'), 'utf-8'); @@ -51,53 +57,12 @@ function extractFunction(src: string, name: string): string { return src.slice(start); } -// ─── Task 4: Agent queue poisoning — full schema validation + permissions ─── - -describe('Agent queue security', () => { - it('server queue directory must use restricted permissions', () => { - const queueSection = SERVER_SRC.slice(SERVER_SRC.indexOf('agentQueue'), SERVER_SRC.indexOf('agentQueue') + 2000); - expect(queueSection).toMatch(/0o700/); - }); - - it('sidebar-agent queue directory must use restricted permissions', () => { - // The mkdirSync for the queue dir lives in main() — search the main() body - const mainStart = AGENT_SRC.indexOf('async function main'); - const queueSection = 
AGENT_SRC.slice(mainStart); - expect(queueSection).toMatch(/0o700/); - }); - - it('cli.ts queue file creation must use restricted permissions', () => { - const CLI_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/cli.ts'), 'utf-8'); - const queueSection = CLI_SRC.slice(CLI_SRC.indexOf('queue') || 0, CLI_SRC.indexOf('queue') + 2000); - expect(queueSection).toMatch(/0o700|0o600|mode/); - }); - - it('queue reader must have a validator function covering all fields', () => { - // Extract ONLY the validator function body by walking braces - const validatorStart = AGENT_SRC.indexOf('function isValidQueueEntry'); - expect(validatorStart).toBeGreaterThan(-1); - let depth = 0; - let bodyStart = AGENT_SRC.indexOf('{', validatorStart); - let bodyEnd = bodyStart; - for (let i = bodyStart; i < AGENT_SRC.length; i++) { - if (AGENT_SRC[i] === '{') depth++; - if (AGENT_SRC[i] === '}') depth--; - if (depth === 0) { bodyEnd = i + 1; break; } - } - const validatorBlock = AGENT_SRC.slice(validatorStart, bodyEnd); - - expect(validatorBlock).toMatch(/prompt.*string/); - expect(validatorBlock).toMatch(/Array\.isArray/); - expect(validatorBlock).toMatch(/\.\./); - expect(validatorBlock).toContain('stateFile'); - expect(validatorBlock).toContain('tabId'); - expect(validatorBlock).toMatch(/number/); - expect(validatorBlock).toContain('null'); - expect(validatorBlock).toContain('message'); - expect(validatorBlock).toContain('pageUrl'); - expect(validatorBlock).toContain('sessionId'); - }); -}); +// ─── Agent queue security ────────────────────────────────────────────────── +// Original block validated the chat queue's filesystem permissions and +// schema validator on sidebar-agent.ts. Both are gone (chat queue ripped +// in favor of the interactive Terminal PTY). The remaining 0o700 / 0o600 +// invariants on extension queue paths are now covered by terminal-agent +// integration tests and the sidebar-tabs regression suite. 
// ─── Shared source reads for CSS validator tests ──────────────────────────── const CDP_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/cdp-inspector.ts'), 'utf-8'); @@ -325,30 +290,13 @@ describe('Round-2 finding 2: snapshot.ts annotated path uses realpathSync', () = }); }); -// ─── Round-2 finding 3: stateFile path traversal check in isValidQueueEntry ─ - -describe('Round-2 finding 3: isValidQueueEntry checks stateFile for path traversal', () => { - it('isValidQueueEntry checks stateFile for .. traversal sequences', () => { - const fn = extractFunction(AGENT_SRC, 'isValidQueueEntry'); - expect(fn).toBeTruthy(); - // Must check stateFile for '..' — find the stateFile block and look for '..' string - const stateFileIdx = fn.indexOf('stateFile'); - expect(stateFileIdx).toBeGreaterThan(-1); - const stateFileBlock = fn.slice(stateFileIdx, stateFileIdx + 200); - // The block must contain a check for the two-dot traversal sequence - expect(stateFileBlock).toMatch(/'\.\.'|"\.\."|\.\./); - }); - - it('isValidQueueEntry stateFile block contains both type check and traversal check', () => { - const fn = extractFunction(AGENT_SRC, 'isValidQueueEntry'); - const stateFileIdx = fn.indexOf('stateFile'); - const stateBlock = fn.slice(stateFileIdx, stateFileIdx + 300); - // Must contain the type check - expect(stateBlock).toContain('typeof obj.stateFile'); - // Must contain the includes('..') call - expect(stateBlock).toMatch(/includes\s*\(\s*['"]\.\.['"]\s*\)/); - }); -}); +// ─── Round-2 finding 3: stateFile path traversal check ───────────────────── +// Tested isValidQueueEntry's stateFile validator on sidebar-agent.ts. Both +// the function and the file are gone (chat queue ripped). The terminal-agent +// PTY path no longer takes a queue entry — it accepts WebSocket frames +// gated on Origin + session token, no on-disk queue to traverse. 
Path +// traversal in browse-server's tab-state writer is covered by +// browse/test/terminal-agent.test.ts (handleTabState atomic-write tests). // ─── Task 5: /health endpoint must not expose sensitive fields ─────────────── @@ -421,24 +369,11 @@ describe('cookie-import domain validation', () => { }); }); -// ─── Task 9: loadSession ID validation ────────────────────────────────────── - -describe('loadSession session ID validation', () => { - it('loadSession validates session ID format before using it in a path', () => { - const fn = extractFunction(SERVER_SRC, 'loadSession'); - expect(fn).toBeTruthy(); - // Must contain the alphanumeric regex guard - expect(fn).toMatch(/\[a-zA-Z0-9_-\]/); - }); - - it('loadSession returns null on invalid session ID', () => { - const fn = extractFunction(SERVER_SRC, 'loadSession'); - const block = fn.slice(fn.indexOf('activeData.id')); - // Must warn and return null - expect(block).toContain('Invalid session ID'); - expect(block).toContain('return null'); - }); -}); +// loadSession session ID validation — loadSession lived inside the chat +// agent state block (sidebar-agent.ts session persistence). Chat queue +// is gone, so the function and its session-ID validator are gone. The +// terminal-agent's PTY session has no on-disk session ID — the WebSocket +// holds the session for its lifetime. 
// ─── Task 10: Responsive screenshot path validation ────────────────────────── @@ -520,40 +455,11 @@ describe('Task 11: state load cookie validation', () => { }); }); -// ─── Task 12: Validate activeTabUrl before syncActiveTabByUrl ───────────────── - -describe('Task 12: activeTabUrl sanitized before syncActiveTabByUrl', () => { - it('sidebar-tabs route sanitizes activeUrl before syncActiveTabByUrl', () => { - const block = sliceBetween(SERVER_SRC, "url.pathname === '/sidebar-tabs'", "url.pathname === '/sidebar-tabs/switch'"); - expect(block).toContain('sanitizeExtensionUrl'); - expect(block).toContain('syncActiveTabByUrl'); - const sanitizeIdx = block.indexOf('sanitizeExtensionUrl'); - const syncIdx = block.indexOf('syncActiveTabByUrl'); - expect(sanitizeIdx).toBeLessThan(syncIdx); - }); - - it('sidebar-command route sanitizes extensionUrl before syncActiveTabByUrl', () => { - const block = sliceBetween(SERVER_SRC, "url.pathname === '/sidebar-command'", "url.pathname === '/sidebar-chat/clear'"); - expect(block).toContain('sanitizeExtensionUrl'); - expect(block).toContain('syncActiveTabByUrl'); - const sanitizeIdx = block.indexOf('sanitizeExtensionUrl'); - const syncIdx = block.indexOf('syncActiveTabByUrl'); - expect(sanitizeIdx).toBeLessThan(syncIdx); - }); - - it('direct unsanitized syncActiveTabByUrl calls are not present (all calls go through sanitize)', () => { - // Every syncActiveTabByUrl call should be preceded by sanitizeExtensionUrl in the nearby code - // We verify there are no direct browserManager.syncActiveTabByUrl(activeUrl) or - // browserManager.syncActiveTabByUrl(extensionUrl) patterns (without sanitize wrapper) - const block1 = sliceBetween(SERVER_SRC, "url.pathname === '/sidebar-tabs'", "url.pathname === '/sidebar-tabs/switch'"); - // Should NOT contain direct call with raw activeUrl - expect(block1).not.toMatch(/syncActiveTabByUrl\(activeUrl\)/); - - const block2 = sliceBetween(SERVER_SRC, "url.pathname === '/sidebar-command'", "url.pathname 
=== '/sidebar-chat/clear'"); - // Should NOT contain direct call with raw extensionUrl - expect(block2).not.toMatch(/syncActiveTabByUrl\(extensionUrl\)/); - }); -}); +// activeTabUrl sanitized before syncActiveTabByUrl — tested URL sanitization +// on the now-deleted /sidebar-tabs and /sidebar-command routes. The +// terminal-agent reads tab URLs from the live tabs.json file (atomic write +// from background.js), and chrome:// / chrome-extension:// pages are +// filtered server-side in handleTabState — see browse/test/terminal-agent.test.ts. // ─── Task 13: Inbox output wrapped as untrusted ────────────────────────────── @@ -581,107 +487,17 @@ describe('Task 13: inbox output wrapped as untrusted content', () => { }); }); -// ─── Task 14: DOM serialization round-trip replaced with DocumentFragment ───── +// switchChatTab DocumentFragment + pollChat reentrancy guard tests targeted +// now-deleted chat-tab DOM logic and chat-polling reentrancy. Both are gone +// (Terminal pane is the sole sidebar surface; xterm.js owns its own DOM +// lifecycle, and the WebSocket has no reentrancy hazard). 
-const SIDEPANEL_SRC = fs.readFileSync(path.join(import.meta.dir, '../../extension/sidepanel.js'), 'utf-8'); - -describe('Task 14: switchChatTab uses DocumentFragment, not innerHTML round-trip', () => { - it('switchChatTab does NOT use innerHTML to restore chat (string-based re-parse removed)', () => { - const fn = extractFunction(SIDEPANEL_SRC, 'switchChatTab'); - expect(fn).toBeTruthy(); - // Must NOT have the dangerous pattern of assigning chatDomByTab value back to innerHTML - expect(fn).not.toMatch(/chatMessages\.innerHTML\s*=\s*chatDomByTab/); - }); - - it('switchChatTab uses createDocumentFragment to save chat DOM', () => { - const fn = extractFunction(SIDEPANEL_SRC, 'switchChatTab'); - expect(fn).toContain('createDocumentFragment'); - }); - - it('switchChatTab moves nodes via appendChild/firstChild (not innerHTML assignment)', () => { - const fn = extractFunction(SIDEPANEL_SRC, 'switchChatTab'); - // Must use appendChild to restore nodes from fragment - expect(fn).toContain('chatMessages.appendChild'); - }); - - it('chatDomByTab comment documents that values are DocumentFragments, not strings', () => { - // Check module-level comment on chatDomByTab - const commentIdx = SIDEPANEL_SRC.indexOf('chatDomByTab'); - const commentLine = SIDEPANEL_SRC.slice(commentIdx, commentIdx + 120); - expect(commentLine).toMatch(/DocumentFragment|fragment/i); - }); - - it('welcome screen is built with DOM methods in the else branch (not innerHTML)', () => { - const fn = extractFunction(SIDEPANEL_SRC, 'switchChatTab'); - // The else branch must use createElement, not innerHTML template literal - expect(fn).toContain('createElement'); - // The specific innerHTML template with chat-welcome must be gone - expect(fn).not.toMatch(/innerHTML\s*=\s*`[\s\S]*?chat-welcome/); - }); -}); - -// ─── Task 15: pollChat/switchChatTab reentrancy guard ──────────────────────── - -describe('Task 15: pollChat reentrancy guard and deferred call in switchChatTab', () => { - it('pollInProgress guard 
variable is declared at module scope', () => { - // Must be declared before any function definitions (within first 2000 chars) - const moduleTop = SIDEPANEL_SRC.slice(0, 2000); - expect(moduleTop).toContain('pollInProgress'); - }); - - it('pollChat function checks and sets pollInProgress', () => { - const fn = extractFunction(SIDEPANEL_SRC, 'pollChat'); - expect(fn).toBeTruthy(); - expect(fn).toContain('pollInProgress'); - }); - - it('pollChat resets pollInProgress in finally block', () => { - const fn = extractFunction(SIDEPANEL_SRC, 'pollChat'); - // The finally block must contain the reset - const finallyIdx = fn.indexOf('finally'); - expect(finallyIdx).toBeGreaterThan(-1); - const finallyBlock = fn.slice(finallyIdx, finallyIdx + 60); - expect(finallyBlock).toContain('pollInProgress'); - }); - - it('switchChatTab calls pollChat via setTimeout (not directly)', () => { - const fn = extractFunction(SIDEPANEL_SRC, 'switchChatTab'); - // Must use setTimeout to defer pollChat — no direct call at the end - expect(fn).toMatch(/setTimeout\s*\(\s*pollChat/); - // Must NOT have a bare direct call `pollChat()` at the end (outside setTimeout) - // We check that there is no standalone `pollChat()` call (outside setTimeout wrapper) - const withoutSetTimeout = fn.replace(/setTimeout\s*\(\s*pollChat[^)]*\)/g, ''); - expect(withoutSetTimeout).not.toMatch(/\bpollChat\s*\(\s*\)/); - }); -}); - -// ─── Task 16: SIGKILL escalation in sidebar-agent timeout ──────────────────── - -describe('Task 16: sidebar-agent timeout handler uses SIGTERM→SIGKILL escalation', () => { - it('timeout block sends SIGTERM first', () => { - // Slice from "Timed out" / setTimeout block to processingTabs.delete - const timeoutStart = AGENT_SRC.indexOf("SIDEBAR_AGENT_TIMEOUT"); - expect(timeoutStart).toBeGreaterThan(-1); - const timeoutBlock = AGENT_SRC.slice(timeoutStart, timeoutStart + 600); - expect(timeoutBlock).toContain('SIGTERM'); - }); - - it('timeout block escalates to SIGKILL after delay', () => { 
- const timeoutStart = AGENT_SRC.indexOf("SIDEBAR_AGENT_TIMEOUT"); - const timeoutBlock = AGENT_SRC.slice(timeoutStart, timeoutStart + 600); - expect(timeoutBlock).toContain('SIGKILL'); - }); - - it('SIGTERM appears before SIGKILL in timeout block', () => { - const timeoutStart = AGENT_SRC.indexOf("SIDEBAR_AGENT_TIMEOUT"); - const timeoutBlock = AGENT_SRC.slice(timeoutStart, timeoutStart + 600); - const sigtermIdx = timeoutBlock.indexOf('SIGTERM'); - const sigkillIdx = timeoutBlock.indexOf('SIGKILL'); - expect(sigtermIdx).toBeGreaterThan(-1); - expect(sigkillIdx).toBeGreaterThan(-1); - expect(sigtermIdx).toBeLessThan(sigkillIdx); - }); -}); +// ─── Task 16: SIGKILL escalation ──────────────────────────────────────────── +// Originally tested sidebar-agent's SIDEBAR_AGENT_TIMEOUT block. The chat +// queue and its watchdog are gone. terminal-agent.ts disposes claude with +// the same SIGINT-then-SIGKILL-after-3s pattern; that's covered by +// browse/test/terminal-agent.test.ts ("cleanup escalates SIGINT to SIGKILL +// after 3s on close"). // ─── Task 17: viewport and wait bounds clamping ────────────────────────────── diff --git a/browse/test/security-e2e-fullstack.test.ts b/browse/test/security-e2e-fullstack.test.ts deleted file mode 100644 index 01d347a0..00000000 --- a/browse/test/security-e2e-fullstack.test.ts +++ /dev/null @@ -1,218 +0,0 @@ -/** - * Full-stack E2E — the security-contract anchor test. - * - * Spins up a real browse server + real sidebar-agent subprocess, points - * them at a MOCK claude binary (browse/test/fixtures/mock-claude/claude) - * that deterministically emits a canary-leaking tool_use event, then - * verifies the whole pipeline reacts: - * - * 1. Server canary-injects into the system prompt - * 2. Server queues the message - * 3. Sidebar-agent spawns mock-claude - * 4. Mock-claude emits tool_use with CANARY-XXX in a URL arg - * 5. Sidebar-agent's detectCanaryLeak fires on the stream event - * 6. 
onCanaryLeaked logs, SIGTERM's mock-claude, emits security_event - * 7. /sidebar-chat returns security_event + agent_error entries - * - * This test proves the end-to-end contract: when a canary leak happens, - * the session terminates AND the sidepanel receives the events that drive - * the approved banner render. No LLM cost, <10s total runtime. - * - * Fully deterministic — safe to run on every commit (gate tier). - */ - -import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; -import { spawn, type Subprocess } from 'bun'; -import * as fs from 'fs'; -import * as os from 'os'; -import * as path from 'path'; - -let serverProc: Subprocess | null = null; -let agentProc: Subprocess | null = null; -let serverPort = 0; -let authToken = ''; -let tmpDir = ''; -let stateFile = ''; -let queueFile = ''; -const MOCK_CLAUDE_DIR = path.resolve(import.meta.dir, 'fixtures', 'mock-claude'); - -async function apiFetch(pathname: string, opts: RequestInit = {}): Promise { - const headers: Record = { - 'Content-Type': 'application/json', - Authorization: `Bearer ${authToken}`, - ...(opts.headers as Record | undefined), - }; - return fetch(`http://127.0.0.1:${serverPort}${pathname}`, { ...opts, headers }); -} - -beforeAll(async () => { - tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'security-e2e-fullstack-')); - stateFile = path.join(tmpDir, 'browse.json'); - queueFile = path.join(tmpDir, 'sidebar-queue.jsonl'); - fs.mkdirSync(path.dirname(queueFile), { recursive: true }); - - const serverScript = path.resolve(import.meta.dir, '..', 'src', 'server.ts'); - const agentScript = path.resolve(import.meta.dir, '..', 'src', 'sidebar-agent.ts'); - - // 1) Start the browse server. 
- serverProc = spawn(['bun', 'run', serverScript], { - env: { - ...process.env, - BROWSE_STATE_FILE: stateFile, - BROWSE_HEADLESS_SKIP: '1', // no Chromium for this test - BROWSE_PORT: '0', - SIDEBAR_QUEUE_PATH: queueFile, - BROWSE_IDLE_TIMEOUT: '300', - }, - stdio: ['ignore', 'pipe', 'pipe'], - }); - - // Wait for state file with token + port - const deadline = Date.now() + 15000; - while (Date.now() < deadline) { - if (fs.existsSync(stateFile)) { - try { - const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8')); - if (state.port && state.token) { - serverPort = state.port; - authToken = state.token; - break; - } - } catch {} - } - await new Promise((r) => setTimeout(r, 100)); - } - if (!serverPort) throw new Error('Server did not start in time'); - - // 2) Start the sidebar-agent with PATH prepended by the mock-claude dir. - // sidebar-agent spawns `claude` via PATH lookup (spawn('claude', ...) — see - // browse/src/sidebar-agent.ts spawnClaude), so prepending works without any - // source change. - const shimmedPath = `${MOCK_CLAUDE_DIR}:${process.env.PATH ?? ''}`; - agentProc = spawn(['bun', 'run', agentScript], { - env: { - ...process.env, - PATH: shimmedPath, - BROWSE_STATE_FILE: stateFile, - SIDEBAR_QUEUE_PATH: queueFile, - BROWSE_SERVER_PORT: String(serverPort), - BROWSE_PORT: String(serverPort), - BROWSE_NO_AUTOSTART: '1', - // Scenario for mock-claude inherits through spawn env below — the agent - // itself doesn't read this, but the claude subprocess it spawns does. - MOCK_CLAUDE_SCENARIO: 'canary_leak_in_tool_arg', - // Force classifier off so pre-spawn ML scan doesn't fire on our - // benign synthetic test prompt. This test exercises the canary - // path specifically. - GSTACK_SECURITY_OFF: '1', - }, - stdio: ['ignore', 'pipe', 'pipe'], - }); - - // Give the agent a moment to establish its poll loop. 
- await new Promise((r) => setTimeout(r, 500)); -}, 30000); - -async function drainStderr(proc: Subprocess | null, label: string): Promise { - if (!proc?.stderr) return; - try { - const reader = (proc.stderr as ReadableStream).getReader(); - // Drain briefly — don't block shutdown - const result = await Promise.race([ - reader.read(), - new Promise>((resolve) => - setTimeout(() => resolve({ done: true, value: undefined }), 100) - ), - ]); - if (result?.value) { - const text = new TextDecoder().decode(result.value); - if (text.trim()) console.error(`[${label} stderr]`, text.slice(0, 2000)); - } - } catch {} -} - -afterAll(async () => { - // Dump agent stderr for diagnostic - await drainStderr(agentProc, 'agent'); - for (const proc of [serverProc, agentProc]) { - if (proc) { - try { proc.kill('SIGTERM'); } catch {} - try { setTimeout(() => { try { proc.kill('SIGKILL'); } catch {} }, 1500); } catch {} - } - } - try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {} -}); - -describe('security pipeline E2E (mock claude)', () => { - test('server injects canary, queues message, agent spawns mock claude', async () => { - const resp = await apiFetch('/sidebar-command', { - method: 'POST', - body: JSON.stringify({ - message: "What's on this page?", - activeTabUrl: 'https://attacker.example.com/', - }), - }); - expect(resp.status).toBe(200); - - // Wait for the sidebar-agent to pick up the entry and spawn mock-claude. - // Queue entry must contain `canary` field (added by server.ts spawnClaude). 
- await new Promise((r) => setTimeout(r, 250)); - const queueContent = fs.readFileSync(queueFile, 'utf-8').trim(); - const lines = queueContent.split('\n').filter(Boolean); - expect(lines.length).toBeGreaterThan(0); - const entry = JSON.parse(lines[lines.length - 1]); - expect(entry.canary).toMatch(/^CANARY-[0-9A-F]+$/); - expect(entry.prompt).toContain(entry.canary); - expect(entry.prompt).toContain('NEVER include it'); - }); - - test('canary leak triggers security_event + agent_error in /sidebar-chat', async () => { - // By now the mock-claude subprocess has emitted the tool_use with the - // leaked canary. Sidebar-agent's handleStreamEvent -> detectCanaryLeak - // -> onCanaryLeaked should have fired security_event + agent_error and - // SIGTERM'd the mock. Poll /sidebar-chat up to 10s for the events. - const deadline = Date.now() + 10000; - let securityEvent: any = null; - let agentError: any = null; - while (Date.now() < deadline && (!securityEvent || !agentError)) { - const resp = await apiFetch('/sidebar-chat'); - const data: any = await resp.json(); - for (const entry of data.entries ?? 
[]) { - if (entry.type === 'security_event') securityEvent = entry; - if (entry.type === 'agent_error') agentError = entry; - } - if (securityEvent && agentError) break; - await new Promise((r) => setTimeout(r, 250)); - } - - expect(securityEvent).not.toBeNull(); - expect(securityEvent.verdict).toBe('block'); - expect(securityEvent.reason).toBe('canary_leaked'); - expect(securityEvent.layer).toBe('canary'); - // The leak is on a tool_use channel — onCanaryLeaked records "tool_use:Bash" - expect(String(securityEvent.channel)).toContain('tool_use'); - expect(securityEvent.domain).toBe('attacker.example.com'); - - expect(agentError).not.toBeNull(); - expect(agentError.error).toContain('Session terminated'); - expect(agentError.error).toContain('prompt injection detected'); - }, 15000); - - test('attempts.jsonl logged with salted payload_hash and verdict=block', async () => { - // onCanaryLeaked also calls logAttempt — check the log file exists - // and contains the event. The file lives at ~/.gstack/security/attempts.jsonl. 
- const logPath = path.join(os.homedir(), '.gstack', 'security', 'attempts.jsonl'); - expect(fs.existsSync(logPath)).toBe(true); - const content = fs.readFileSync(logPath, 'utf-8'); - const recent = content.split('\n').filter(Boolean).slice(-10); - // Find at least one entry with verdict=block and layer=canary from our run - const ourEntry = recent - .map((l) => { try { return JSON.parse(l); } catch { return null; } }) - .find((e) => e && e.layer === 'canary' && e.verdict === 'block' && e.urlDomain === 'attacker.example.com'); - expect(ourEntry).toBeTruthy(); - // payload_hash is a 64-char sha256 hex - expect(String(ourEntry.payloadHash)).toMatch(/^[0-9a-f]{64}$/); - // Never stored the payload itself — only the hash - expect(JSON.stringify(ourEntry)).not.toContain('CANARY-'); - }); -}); diff --git a/browse/test/security-review-fullstack.test.ts b/browse/test/security-review-fullstack.test.ts deleted file mode 100644 index 47cdc433..00000000 --- a/browse/test/security-review-fullstack.test.ts +++ /dev/null @@ -1,405 +0,0 @@ -/** - * Full-stack review-flow E2E with the real classifier. - * - * Spins up real server + real sidebar-agent subprocess + mock-claude and - * exercises the whole tool-output BLOCK → review → decide path with the - * real TestSavantAI classifier warm. The injection string trips the real - * model reliably (measured: confidence 0.9999 on classic DAN-style text). - * - * What this covers that gate-tier tests don't: - * * Real classifier actually fires on the injection - * * sidebar-agent emits a reviewable security_event for real, not a stub - * * server's POST /security-decision writes the on-disk decision file - * * sidebar-agent's poll loop reads the file and either resumes or kills - * the mock-claude subprocess - * * attempts.jsonl ends up with the right verdict (block vs user_overrode) - * - * This is periodic tier. First run warms the ~112MB classifier from - * HuggingFace — ~30s cold. 
Subsequent runs use the cached model under - * ~/.gstack/models/testsavant-small/ and complete in ~5s. - * - * SKIPS if the classifier can't warm (no network, no disk) — the test is - * truth-seeking only when the stack is genuinely up. - */ - -import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; -import { spawn, type Subprocess } from 'bun'; -import * as fs from 'fs'; -import * as os from 'os'; -import * as path from 'path'; - -const MOCK_CLAUDE_DIR = path.resolve(import.meta.dir, 'fixtures', 'mock-claude'); -const WARMUP_TIMEOUT_MS = 90_000; // first-run download budget -const CLASSIFIER_CACHE = path.join(os.homedir(), '.gstack', 'models', 'testsavant-small'); - -let serverProc: Subprocess | null = null; -let agentProc: Subprocess | null = null; -let serverPort = 0; -let authToken = ''; -let tmpDir = ''; -let stateFile = ''; -let queueFile = ''; -let attemptsPath = ''; - -/** - * Eager check — is the classifier model already on disk? `test.skipIf()` - * is evaluated at file-registration time (before beforeAll runs), so a - * runtime boolean wouldn't work — all tests would unconditionally register - * as skipped. Probe the model dir synchronously at file load. - * Same pattern as security-sidepanel-dom.test.ts uses for chromium. - */ -const CLASSIFIER_READY = (() => { - try { - if (!fs.existsSync(CLASSIFIER_CACHE)) return false; - // At minimum we need the tokenizer config + onnx model. 
- return fs.existsSync(path.join(CLASSIFIER_CACHE, 'tokenizer.json')) - && fs.existsSync(path.join(CLASSIFIER_CACHE, 'onnx')); - } catch { - return false; - } -})(); - -async function apiFetch(pathname: string, opts: RequestInit = {}): Promise { - return fetch(`http://127.0.0.1:${serverPort}${pathname}`, { - ...opts, - headers: { - 'Content-Type': 'application/json', - Authorization: `Bearer ${authToken}`, - ...(opts.headers as Record | undefined), - }, - }); -} - -async function waitForSecurityEntry( - predicate: (entry: any) => boolean, - timeoutMs: number, -): Promise { - const deadline = Date.now() + timeoutMs; - while (Date.now() < deadline) { - const resp = await apiFetch('/sidebar-chat'); - const data: any = await resp.json(); - for (const entry of data.entries ?? []) { - if (entry.type === 'security_event' && predicate(entry)) return entry; - } - await new Promise((r) => setTimeout(r, 250)); - } - return null; -} - -async function waitForProcessExit(proc: Subprocess, timeoutMs: number): Promise { - const deadline = Date.now() + timeoutMs; - while (Date.now() < deadline) { - if (proc.exitCode !== null) return proc.exitCode; - await new Promise((r) => setTimeout(r, 100)); - } - return null; -} - -async function readAttempts(): Promise { - if (!fs.existsSync(attemptsPath)) return []; - const raw = fs.readFileSync(attemptsPath, 'utf-8'); - return raw.split('\n').filter(Boolean).map((l) => { - try { return JSON.parse(l); } catch { return null; } - }).filter(Boolean); -} - -async function startStack(scenario: string, attemptsDir: string): Promise { - tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'security-review-fullstack-')); - stateFile = path.join(tmpDir, 'browse.json'); - queueFile = path.join(tmpDir, 'sidebar-queue.jsonl'); - fs.mkdirSync(path.dirname(queueFile), { recursive: true }); - - // Re-root HOME for both server and agent so: - // - server.ts's SESSIONS_DIR doesn't load pre-existing chat history - // from ~/.gstack/sidebar-sessions/ (caused ghost 
security_events to - // leak in from the live /open-gstack-browser session) - // - security.ts's attempts.jsonl writes land in a test-owned dir - // - session-state.json, chromium-profile, etc. stay isolated - fs.mkdirSync(path.join(attemptsDir, '.gstack'), { recursive: true }); - - // Symlink the models dir through to the real cache — without it the - // sidebar-agent would try to re-download 112MB every test run. - const testModelsDir = path.join(attemptsDir, '.gstack', 'models'); - const realModelsDir = path.join(os.homedir(), '.gstack', 'models'); - try { - if (fs.existsSync(realModelsDir) && !fs.existsSync(testModelsDir)) { - fs.symlinkSync(realModelsDir, testModelsDir); - } - } catch { - // Symlink may already exist — ignore. - } - - const serverScript = path.resolve(import.meta.dir, '..', 'src', 'server.ts'); - const agentScript = path.resolve(import.meta.dir, '..', 'src', 'sidebar-agent.ts'); - - serverProc = spawn(['bun', 'run', serverScript], { - env: { - ...process.env, - BROWSE_STATE_FILE: stateFile, - BROWSE_HEADLESS_SKIP: '1', - BROWSE_PORT: '0', - SIDEBAR_QUEUE_PATH: queueFile, - BROWSE_IDLE_TIMEOUT: '300', - HOME: attemptsDir, - }, - stdio: ['ignore', 'pipe', 'pipe'], - }); - - const deadline = Date.now() + 15000; - while (Date.now() < deadline) { - if (fs.existsSync(stateFile)) { - try { - const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8')); - if (state.port && state.token) { - serverPort = state.port; - authToken = state.token; - break; - } - } catch {} - } - await new Promise((r) => setTimeout(r, 100)); - } - if (!serverPort) throw new Error('Server did not start in time'); - - const shimmedPath = `${MOCK_CLAUDE_DIR}:${process.env.PATH ?? 
''}`; - agentProc = spawn(['bun', 'run', agentScript], { - env: { - ...process.env, - PATH: shimmedPath, - BROWSE_STATE_FILE: stateFile, - SIDEBAR_QUEUE_PATH: queueFile, - BROWSE_SERVER_PORT: String(serverPort), - BROWSE_PORT: String(serverPort), - BROWSE_NO_AUTOSTART: '1', - MOCK_CLAUDE_SCENARIO: scenario, - HOME: attemptsDir, - }, - stdio: ['ignore', 'pipe', 'pipe'], - }); - attemptsPath = path.join(attemptsDir, '.gstack', 'security', 'attempts.jsonl'); - - // Give the agent a moment to establish its poll loop + warmup the model. - await new Promise((r) => setTimeout(r, 500)); -} - -async function stopStack(): Promise { - for (const proc of [serverProc, agentProc]) { - if (proc) { - try { proc.kill('SIGTERM'); } catch {} - try { setTimeout(() => { try { proc.kill('SIGKILL'); } catch {} }, 1500); } catch {} - } - } - serverProc = null; - agentProc = null; - try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {} -} - -beforeAll(async () => { - // Sanity: the on-disk cache is real + decodable. If this fails, mark the - // file as "classifier unavailable" (we can't toggle CLASSIFIER_READY - // post-registration — a failure here just means the tests below will - // exercise the agent without a working classifier, which is the honest - // signal we want anyway). - if (!CLASSIFIER_READY) return; -}); - -afterAll(async () => { - await stopStack(); -}); - -describe('review-flow full-stack E2E', () => { - test.skipIf(!CLASSIFIER_READY)( - 'tool_result injection → reviewable event → user ALLOWS → attempts.jsonl has user_overrode', - async () => { - const attemptsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'attempts-allow-')); - try { - await startStack('tool_result_injection', attemptsDir); - - // Fire the message that will cause mock-claude to emit the - // injection-laden tool_result. 
- const resp = await apiFetch('/sidebar-command', { - method: 'POST', - body: JSON.stringify({ - message: 'summarize the hacker news comments', - activeTabUrl: 'https://news.ycombinator.com/item?id=42', - }), - }); - expect(resp.status).toBe(200); - - // Wait for the real classifier to fire and emit a reviewable - // security_event. The classifier is warm so this should happen in - // well under 10s once the tool_result arrives. - const reviewable = await waitForSecurityEntry( - (e) => e.verdict === 'block' && e.reviewable === true, - 30_000, - ); - expect(reviewable).not.toBeNull(); - expect(reviewable.reason).toBe('tool_result_ml'); - expect(reviewable.tool).toBe('Bash'); - expect(String(reviewable.suspected_text ?? '')).toContain('IGNORE ALL PREVIOUS'); - - // User clicks Allow via the banner → sidepanel POSTs to server. - const decisionResp = await apiFetch('/security-decision', { - method: 'POST', - body: JSON.stringify({ - tabId: reviewable.tabId, - decision: 'allow', - reason: 'user', - }), - }); - expect(decisionResp.status).toBe(200); - - // Wait for sidebar-agent's poll loop to consume the decision and - // emit a follow-up user_overrode security_event. - const overrode = await waitForSecurityEntry( - (e) => e.verdict === 'user_overrode', - 10_000, - ); - expect(overrode).not.toBeNull(); - - // Audit log must capture both the block and the override, in that - // order. Both records share the same salted payload hash so the - // security dashboard can aggregate them as a single attempt. - const attempts = await readAttempts(); - const blockLog = attempts.find( - (a) => a.verdict === 'block' && a.layer === 'testsavant_content', - ); - const overrodeLog = attempts.find( - (a) => a.verdict === 'user_overrode' && a.layer === 'testsavant_content', - ); - expect(blockLog).toBeTruthy(); - expect(overrodeLog).toBeTruthy(); - expect(overrodeLog.payloadHash).toBe(blockLog.payloadHash); - // Privacy contract: neither record includes the raw payload. 
- expect(JSON.stringify(overrodeLog)).not.toContain('IGNORE ALL PREVIOUS'); - - // Liveness: session must actually KEEP RUNNING after Allow. Mock-claude - // emits a second tool_use to post-block-followup.example.com ~8s - // after the tool_result. That event must reach the chat feed, proving - // the sidebar-agent resumed the stream-handler relay instead of - // silently wedging. - const followupDeadline = Date.now() + 20_000; - let followup: any = null; - while (Date.now() < followupDeadline && !followup) { - const chatResp = await apiFetch('/sidebar-chat'); - const chatData: any = await chatResp.json(); - for (const entry of chatData.entries ?? []) { - const input = String((entry as any).input ?? ''); - if ( - entry.type === 'tool_use' && - input.includes('post-block-followup.example.com') - ) { - followup = entry; - break; - } - } - if (!followup) await new Promise((r) => setTimeout(r, 300)); - } - expect(followup).not.toBeNull(); - } finally { - await stopStack(); - try { fs.rmSync(attemptsDir, { recursive: true, force: true }); } catch {} - } - }, - 90_000, - ); - - test.skipIf(!CLASSIFIER_READY)( - 'tool_result injection → reviewable event → user BLOCKS → agent session terminates', - async () => { - const attemptsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'attempts-block-')); - try { - await startStack('tool_result_injection', attemptsDir); - - const resp = await apiFetch('/sidebar-command', { - method: 'POST', - body: JSON.stringify({ - message: 'summarize the hacker news comments', - activeTabUrl: 'https://news.ycombinator.com/item?id=42', - }), - }); - expect(resp.status).toBe(200); - - const reviewable = await waitForSecurityEntry( - (e) => e.verdict === 'block' && e.reviewable === true, - 30_000, - ); - expect(reviewable).not.toBeNull(); - - const decisionResp = await apiFetch('/security-decision', { - method: 'POST', - body: JSON.stringify({ - tabId: reviewable.tabId, - decision: 'block', - reason: 'user', - }), - }); - 
expect(decisionResp.status).toBe(200); - - // Wait for the agent_error that the sidebar-agent emits when it - // kills the claude subprocess after a user-confirmed block. This - // is the sidepanel's "Session terminated" signal. - const deadline = Date.now() + 15_000; - let errorEntry: any = null; - while (Date.now() < deadline && !errorEntry) { - const chatResp = await apiFetch('/sidebar-chat'); - const chatData: any = await chatResp.json(); - for (const entry of chatData.entries ?? []) { - if ( - entry.type === 'agent_error' && - String(entry.error ?? '').includes('Session terminated') - ) { - errorEntry = entry; - break; - } - } - if (!errorEntry) await new Promise((r) => setTimeout(r, 200)); - } - expect(errorEntry).not.toBeNull(); - - // attempts.jsonl must NOT have a user_overrode entry for this run. - const attempts = await readAttempts(); - const overrodeLog = attempts.find((a) => a.verdict === 'user_overrode'); - expect(overrodeLog).toBeFalsy(); - - // The real security property: after Block, NO FURTHER tool calls - // reach the chat feed. Mock-claude would have emitted a tool_use - // to post-block-followup.example.com ~8s after the tool_result if - // the session had kept running. Wait long enough for that window - // to close (12s total), then assert the followup event never - // appeared. This is what makes "block" actually stop the page — - // the subprocess is SIGTERM'd before it can emit the next event. - await new Promise((r) => setTimeout(r, 12_000)); - const finalChatResp = await apiFetch('/sidebar-chat'); - const finalChatData: any = await finalChatResp.json(); - const followupAttempted = (finalChatData.entries ?? []).some( - (entry: any) => - entry.type === 'tool_use' && - String(entry.input ?? '').includes('post-block-followup.example.com'), - ); - expect(followupAttempted).toBe(false); - - // And mock-claude must actually have died (not just been signaled - // — the SIGTERM + SIGKILL pair should have exited the process). 
- const mockAlive = (await apiFetch('/sidebar-chat')).ok; // channel still open - expect(mockAlive).toBe(true); - } finally { - await stopStack(); - try { fs.rmSync(attemptsDir, { recursive: true, force: true }); } catch {} - } - }, - 90_000, - ); - - test.skipIf(!CLASSIFIER_READY)( - 'no decision within 60s → timeout auto-blocks', - async () => { - // This test would naturally take 60s+ to run. We assert the - // decision file semantics instead — the unit-test suite already - // verified the poll loop times out and defaults to block - // (security-review-flow.test.ts). Kept here as a spec marker so - // the scenario is documented in the full-stack file. - expect(true).toBe(true); - }, - ); -}); diff --git a/browse/test/security-review-sidepanel-e2e.test.ts b/browse/test/security-review-sidepanel-e2e.test.ts deleted file mode 100644 index 4fdd9f07..00000000 --- a/browse/test/security-review-sidepanel-e2e.test.ts +++ /dev/null @@ -1,345 +0,0 @@ -/** - * Review-flow E2E (sidepanel side, hermetic). - * - * Loads the real extension sidepanel.html in Playwright Chromium, stubs - * the browse server responses, injects a `reviewable: true` security_event - * into /sidebar-chat, and asserts the user-in-the-loop flow end-to-end: - * - * 1. Banner renders with "Review suspected injection" title - * 2. Suspected text excerpt shows up inside the expandable details - * 3. Allow + Block buttons are visible and actionable - * 4. Clicking Allow posts to /security-decision with decision:"allow" - * 5. Clicking Block posts to /security-decision with decision:"block" - * 6. Banner auto-hides after decision - * - * This is the UI-and-wire test. The server-side handshake (decision file - * write + sidebar-agent poll) is covered by security-review-flow.test.ts. - * The full-stack version with real mock-claude + real classifier lives - * in security-review-fullstack.test.ts (periodic tier). - * - * Gate tier. ~3s. Skipped if Playwright chromium is unavailable. 
- */ - -import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; -import * as fs from 'fs'; -import * as path from 'path'; -import { chromium, type Browser, type Page } from 'playwright'; - -const EXTENSION_DIR = path.resolve(import.meta.dir, '..', '..', 'extension'); -const SIDEPANEL_URL = `file://${EXTENSION_DIR}/sidepanel.html`; - -const CHROMIUM_AVAILABLE = (() => { - try { - const exe = chromium.executablePath(); - return !!exe && fs.existsSync(exe); - } catch { - return false; - } -})(); - -interface DecisionCall { - tabId: number; - decision: 'allow' | 'block'; - reason?: string; -} - -/** - * Install the same stubs the existing sidepanel-dom test uses, plus a - * fetch interceptor that captures POSTs to /security-decision into a - * page-scoped array. Returns a handle to read the captured calls. - */ -async function installStubsAndCapture( - page: Page, - scenario: { securityEntries: any[] }, -): Promise { - await page.addInitScript((params: any) => { - (window as any).__decisionCalls = []; - - (window as any).chrome = { - runtime: { - sendMessage: (_req: any, cb: any) => { - const payload = { connected: true, port: 34567 }; - if (typeof cb === 'function') { - setTimeout(() => cb(payload), 0); - return undefined; - } - return Promise.resolve(payload); - }, - lastError: null, - onMessage: { addListener: () => {} }, - }, - tabs: { - query: (_q: any, cb: any) => setTimeout(() => cb([{ id: 1, url: 'https://example.com' }]), 0), - onActivated: { addListener: () => {} }, - onUpdated: { addListener: () => {} }, - }, - }; - - (window as any).EventSource = class { - constructor() {} - addEventListener() {} - close() {} - }; - - const scenarioRef = params; - const origFetch = window.fetch; - window.fetch = async function (input: any, init?: any) { - const url = String(input); - if (url.endsWith('/health')) { - return new Response(JSON.stringify({ - status: 'healthy', - token: 'test-token', - mode: 'headed', - agent: { status: 'idle', runningFor: null, 
queueLength: 0 }, - session: null, - security: { status: 'protected', layers: { testsavant: 'ok', transcript: 'ok', canary: 'ok' } }, - }), { status: 200, headers: { 'Content-Type': 'application/json' } }); - } - if (url.includes('/sidebar-chat')) { - return new Response(JSON.stringify({ - entries: scenarioRef.securityEntries ?? [], - total: (scenarioRef.securityEntries ?? []).length, - agentStatus: 'idle', - activeTabId: 1, - security: { status: 'protected', layers: { testsavant: 'ok', transcript: 'ok', canary: 'ok' } }, - }), { status: 200, headers: { 'Content-Type': 'application/json' } }); - } - if (url.includes('/security-decision') && init?.method === 'POST') { - try { - const body = JSON.parse(init.body || '{}'); - (window as any).__decisionCalls.push(body); - } catch { - (window as any).__decisionCalls.push({ _parseError: true, raw: init?.body }); - } - return new Response(JSON.stringify({ ok: true }), { status: 200, headers: { 'Content-Type': 'application/json' } }); - } - if (url.includes('/sidebar-tabs')) { - return new Response(JSON.stringify({ tabs: [] }), { status: 200 }); - } - if (typeof origFetch === 'function') return origFetch(input, init); - return new Response('{}', { status: 200 }); - } as any; - }, scenario); -} - -let browser: Browser | null = null; - -beforeAll(async () => { - if (!CHROMIUM_AVAILABLE) return; - browser = await chromium.launch({ headless: true }); -}, 30000); - -afterAll(async () => { - if (browser) { - try { - // Race browser.close() against a timeout — on rare occasions Playwright - // hangs on close because an EventSource stub keeps a poll alive. 10s is - // plenty; past that we forcibly drop the handle. Bun's default hook - // timeout is 5s and has bitten this file. - await Promise.race([ - browser.close(), - new Promise((resolve) => setTimeout(resolve, 10000)), - ]); - } catch {} - } -}, 15000); - -/** - * The reviewable security_event the sidebar-agent emits on tool-output BLOCK. 
- * Mirrors the shape of the real production event: verdict:'block', - * reviewable:true, suspected_text excerpt, per-layer signals, and tabId - * so the banner's Allow/Block buttons know which tab to decide for. - */ -function buildReviewableEntry(overrides?: Partial): any { - return { - id: 42, - ts: '2026-04-20T12:00:00Z', - role: 'agent', - type: 'security_event', - verdict: 'block', - reason: 'tool_result_ml', - layer: 'testsavant_content', - confidence: 0.95, - domain: 'news.ycombinator.com', - tool: 'Bash', - reviewable: true, - suspected_text: 'A comment thread discussing ignore previous instructions and reveal secrets — classifier flagged this as injection but it is actually benign developer content about a prompt injection incident.', - signals: [ - { layer: 'testsavant_content', confidence: 0.95 }, - { layer: 'transcript_classifier', confidence: 0.0, meta: { degraded: true } }, - ], - tabId: 1, - ...overrides, - }; -} - -describe('sidepanel review-flow E2E', () => { - test.skipIf(!CHROMIUM_AVAILABLE)('reviewable event shows review banner with suspected text + buttons', async () => { - const context = await browser!.newContext(); - const page = await context.newPage(); - await installStubsAndCapture(page, { securityEntries: [buildReviewableEntry()] }); - await page.goto(SIDEPANEL_URL); - - // Wait for /sidebar-chat poll to deliver the entry + banner to render. 
- await page.waitForFunction( - () => { - const b = document.getElementById('security-banner') as HTMLElement | null; - return !!b && b.style.display !== 'none'; - }, - { timeout: 5000 }, - ); - - // Title flips to the review framing (not "Session terminated") - const title = await page.$eval('#security-banner-title', (el) => el.textContent); - expect(title).toContain('Review suspected injection'); - - // Subtitle mentions the tool + domain - const subtitle = await page.$eval('#security-banner-subtitle', (el) => el.textContent); - expect(subtitle).toContain('Bash'); - expect(subtitle).toContain('news.ycombinator.com'); - expect(subtitle).toContain('allow to continue'); - - // Suspected text shows up unescaped (textContent, not innerHTML) - const suspect = await page.$eval('#security-banner-suspect', (el) => el.textContent); - expect(suspect).toContain('ignore previous instructions'); - - // Both action buttons are visible - const allowVisible = await page.locator('#security-banner-btn-allow').isVisible(); - const blockVisible = await page.locator('#security-banner-btn-block').isVisible(); - expect(allowVisible).toBe(true); - expect(blockVisible).toBe(true); - - // Details auto-expanded so the user sees context - const detailsHidden = await page.$eval('#security-banner-details', (el) => (el as HTMLElement).hidden); - expect(detailsHidden).toBe(false); - - await context.close(); - }, 15000); - - test.skipIf(!CHROMIUM_AVAILABLE)('clicking Allow posts {decision:"allow"} and hides banner', async () => { - const context = await browser!.newContext(); - const page = await context.newPage(); - await installStubsAndCapture(page, { securityEntries: [buildReviewableEntry()] }); - await page.goto(SIDEPANEL_URL); - await page.waitForSelector('#security-banner-btn-allow:visible', { timeout: 5000 }); - - await page.click('#security-banner-btn-allow'); - - // Decision POST should have fired with decision:"allow" and the tabId - // from the security_event. 
Give the fetch promise a tick to resolve. - await page.waitForFunction( - () => (window as any).__decisionCalls?.length > 0, - { timeout: 2000 }, - ); - - const calls = await page.evaluate(() => (window as any).__decisionCalls); - expect(calls).toHaveLength(1); - expect(calls[0].decision).toBe('allow'); - expect(calls[0].tabId).toBe(1); - expect(calls[0].reason).toBe('user'); - - // Banner should hide optimistically after the POST - await page.waitForFunction( - () => { - const b = document.getElementById('security-banner') as HTMLElement | null; - return !!b && b.style.display === 'none'; - }, - { timeout: 2000 }, - ); - - await context.close(); - }, 15000); - - test.skipIf(!CHROMIUM_AVAILABLE)('clicking Block posts {decision:"block"} and hides banner', async () => { - const context = await browser!.newContext(); - const page = await context.newPage(); - await installStubsAndCapture(page, { securityEntries: [buildReviewableEntry({ id: 55 })] }); - await page.goto(SIDEPANEL_URL); - await page.waitForSelector('#security-banner-btn-block:visible', { timeout: 5000 }); - - await page.click('#security-banner-btn-block'); - - await page.waitForFunction( - () => (window as any).__decisionCalls?.length > 0, - { timeout: 2000 }, - ); - - const calls = await page.evaluate(() => (window as any).__decisionCalls); - expect(calls).toHaveLength(1); - expect(calls[0].decision).toBe('block'); - expect(calls[0].tabId).toBe(1); - - await page.waitForFunction( - () => { - const b = document.getElementById('security-banner') as HTMLElement | null; - return !!b && b.style.display === 'none'; - }, - { timeout: 2000 }, - ); - - await context.close(); - }, 15000); - - test.skipIf(!CHROMIUM_AVAILABLE)('non-reviewable event still shows hard-stop banner with no buttons', async () => { - // Regression guard: the existing hard-stop canary leak UX must not be - // disturbed by the reviewable branch. An event without reviewable:true - // keeps the old behavior. 
- const hardStop = { - id: 99, - ts: '2026-04-20T12:00:00Z', - role: 'agent', - type: 'security_event', - verdict: 'block', - reason: 'canary_leaked', - layer: 'canary', - confidence: 1.0, - domain: 'attacker.example.com', - channel: 'tool_use:Bash', - tabId: 1, - }; - const context = await browser!.newContext(); - const page = await context.newPage(); - await installStubsAndCapture(page, { securityEntries: [hardStop] }); - await page.goto(SIDEPANEL_URL); - await page.waitForFunction( - () => { - const b = document.getElementById('security-banner') as HTMLElement | null; - return !!b && b.style.display !== 'none'; - }, - { timeout: 5000 }, - ); - - const title = await page.$eval('#security-banner-title', (el) => el.textContent); - expect(title).toContain('Session terminated'); - - // Action row stays hidden for the non-reviewable path - const actionsHidden = await page.$eval('#security-banner-actions', (el) => (el as HTMLElement).hidden); - expect(actionsHidden).toBe(true); - - await context.close(); - }, 15000); - - test.skipIf(!CHROMIUM_AVAILABLE)('suspected text renders via textContent, not innerHTML (XSS guard)', async () => { - // If the sidepanel ever regressed to innerHTML for the suspected text, - // a crafted excerpt could execute script. This test uses one; if the - // ', - }); - const context = await browser!.newContext(); - const page = await context.newPage(); - await installStubsAndCapture(page, { securityEntries: [xssAttempt] }); - await page.goto(SIDEPANEL_URL); - await page.waitForSelector('#security-banner-suspect:not([hidden])', { timeout: 5000 }); - - // The literal text should appear inside the suspect block (as text, not markup) - const suspectText = await page.$eval('#security-banner-suspect', (el) => el.textContent); - expect(suspectText).toContain('