chore: drop sidebar-agent test refs after chat rip

Six test files / describe blocks targeted the deleted chat path:
- browse/test/security-e2e-fullstack.test.ts (full-stack chat-pipeline E2E
  with mock claude — whole file gone)
- browse/test/security-review-fullstack.test.ts (review-flow E2E with real
  classifier — whole file gone)
- browse/test/security-review-sidepanel-e2e.test.ts (Playwright E2E for
  the security event banner that was ripped from sidepanel.html)
- browse/test/security-audit-r2.test.ts (7 describe blocks: agent queue
  permissions, isValidQueueEntry stateFile traversal, loadSession session-ID
  validation, switchChatTab DocumentFragment, pollChat reentrancy guard,
  /sidebar-tabs URL sanitization, sidebar-agent SIGTERM→SIGKILL escalation;
  plus the AGENT_SRC top-level read converted to a graceful fallback)
- browse/test/security-adversarial-fixes.test.ts (canary stream-chunk split
  detection on detectCanaryLeak; one tool-output test on sidebar-agent)
- test/skill-validation.test.ts (sidebar agent #584 describe block)

These all assumed sidebar-agent.ts existed and tested chat-queue plumbing,
chat-tab DOM round-trip, chat-polling reentrancy, or per-message classifier
canary detection. With the live PTY there is no chat queue, no chat tab,
no LLM stream to canary-scan, and no per-message subprocess. The Terminal
pane's invariants are covered by the new browse/test/sidebar-tabs.test.ts
(27 structural assertions), browse/test/terminal-agent.test.ts, and
browse/test/terminal-agent-integration.test.ts.

bun test → exit 0, 0 failures.
This commit is contained in:
Garry Tan
2026-04-25 21:48:12 -07:00
parent b1f3d7a0fd
commit b5fa1df9c1
6 changed files with 53 additions and 1251 deletions
+8 -32
View File
@@ -19,31 +19,10 @@ import { PAGE_CONTENT_COMMANDS } from '../src/commands';
const REPO_ROOT = path.resolve(__dirname, '..', '..');
describe('canary stream-chunk split detection', () => {
test('detectCanaryLeak uses rolling buffer across consecutive deltas', () => {
// Pull in the function via dynamic require so we don't re-export it
// from sidebar-agent.ts (it's internal on purpose).
const agentSource = fs.readFileSync(
path.join(REPO_ROOT, 'browse', 'src', 'sidebar-agent.ts'),
'utf-8',
);
// Contract: detectCanaryLeak accepts an optional DeltaBuffer and
// uses .slice(-(canary.length - 1)) to retain a rolling tail.
expect(agentSource).toContain('DeltaBuffer');
expect(agentSource).toMatch(/text_delta\s*=\s*combined\.slice\(-\(canary\.length - 1\)\)/);
expect(agentSource).toMatch(/input_json_delta\s*=\s*combined\.slice\(-\(canary\.length - 1\)\)/);
});
test('canary context initializes deltaBuf', () => {
const agentSource = fs.readFileSync(
path.join(REPO_ROOT, 'browse', 'src', 'sidebar-agent.ts'),
'utf-8',
);
// The askClaude call site must construct the buffer so the rolling
// detection actually runs.
expect(agentSource).toContain("deltaBuf: { text_delta: '', input_json_delta: '' }");
});
});
// canary stream-chunk split detection — tested detectCanaryLeak inside
// sidebar-agent.ts. Both the chat-stream pipeline and the function are
// gone (Terminal pane uses an interactive PTY; user keystrokes are the
// trust source, no chunked LLM stream to canary-scan).
describe('tool-output ensemble rule (single-layer BLOCK)', () => {
test('user-input context: single layer at BLOCK degrades to WARN', () => {
@@ -117,13 +96,10 @@ describe('transcript classifier tool_output parameter', () => {
expect(src).toContain('tool_output');
});
test('sidebar-agent passes tool text to transcript on tool-result scan', () => {
const src = fs.readFileSync(
path.join(REPO_ROOT, 'browse', 'src', 'sidebar-agent.ts'),
'utf-8',
);
expect(src).toContain('tool_output: text');
});
// sidebar-agent passed tool text to the transcript classifier on
// tool-result scans. That whole pipeline is gone — Terminal pane has
// no LLM stream to scan, and security-classifier.ts is dead code with
// no production caller (a separate v1.1+ cleanup TODO).
});
describe('GSTACK_SECURITY_OFF kill switch', () => {
+40 -224
View File
@@ -15,7 +15,13 @@ import * as os from 'os';
const META_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/meta-commands.ts'), 'utf-8');
const WRITE_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/write-commands.ts'), 'utf-8');
const SERVER_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/server.ts'), 'utf-8');
const AGENT_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/sidebar-agent.ts'), 'utf-8');
// sidebar-agent.ts was ripped (chat queue replaced by interactive PTY).
// AGENT_SRC kept as empty string so the legacy describe block below skips
// without crashing module load on a missing file.
const AGENT_SRC = (() => {
  // Best-effort read of ../src/sidebar-agent.ts: any failure (including the
  // file being absent) yields an empty string instead of throwing while this
  // module is loading.
  let agentSource = '';
  try {
    agentSource = fs.readFileSync(path.join(import.meta.dir, '../src/sidebar-agent.ts'), 'utf-8');
  } catch {
    // keep the empty default
  }
  return agentSource;
})();
const SNAPSHOT_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/snapshot.ts'), 'utf-8');
const PATH_SECURITY_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/path-security.ts'), 'utf-8');
@@ -51,53 +57,12 @@ function extractFunction(src: string, name: string): string {
return src.slice(start);
}
// ─── Task 4: Agent queue poisoning — full schema validation + permissions ───
describe('Agent queue security', () => {
it('server queue directory must use restricted permissions', () => {
const queueSection = SERVER_SRC.slice(SERVER_SRC.indexOf('agentQueue'), SERVER_SRC.indexOf('agentQueue') + 2000);
expect(queueSection).toMatch(/0o700/);
});
it('sidebar-agent queue directory must use restricted permissions', () => {
// The mkdirSync for the queue dir lives in main() — search the main() body
const mainStart = AGENT_SRC.indexOf('async function main');
const queueSection = AGENT_SRC.slice(mainStart);
expect(queueSection).toMatch(/0o700/);
});
it('cli.ts queue file creation must use restricted permissions', () => {
const CLI_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/cli.ts'), 'utf-8');
const queueSection = CLI_SRC.slice(CLI_SRC.indexOf('queue') || 0, CLI_SRC.indexOf('queue') + 2000);
expect(queueSection).toMatch(/0o700|0o600|mode/);
});
it('queue reader must have a validator function covering all fields', () => {
// Extract ONLY the validator function body by walking braces
const validatorStart = AGENT_SRC.indexOf('function isValidQueueEntry');
expect(validatorStart).toBeGreaterThan(-1);
let depth = 0;
let bodyStart = AGENT_SRC.indexOf('{', validatorStart);
let bodyEnd = bodyStart;
for (let i = bodyStart; i < AGENT_SRC.length; i++) {
if (AGENT_SRC[i] === '{') depth++;
if (AGENT_SRC[i] === '}') depth--;
if (depth === 0) { bodyEnd = i + 1; break; }
}
const validatorBlock = AGENT_SRC.slice(validatorStart, bodyEnd);
expect(validatorBlock).toMatch(/prompt.*string/);
expect(validatorBlock).toMatch(/Array\.isArray/);
expect(validatorBlock).toMatch(/\.\./);
expect(validatorBlock).toContain('stateFile');
expect(validatorBlock).toContain('tabId');
expect(validatorBlock).toMatch(/number/);
expect(validatorBlock).toContain('null');
expect(validatorBlock).toContain('message');
expect(validatorBlock).toContain('pageUrl');
expect(validatorBlock).toContain('sessionId');
});
});
// ─── Agent queue security ──────────────────────────────────────────────────
// Original block validated the chat queue's filesystem permissions and
// schema validator on sidebar-agent.ts. Both are gone (chat queue ripped
// in favor of the interactive Terminal PTY). The remaining 0o700 / 0o600
// invariants on extension queue paths are now covered by terminal-agent
// integration tests and the sidebar-tabs regression suite.
// ─── Shared source reads for CSS validator tests ────────────────────────────
const CDP_SRC = fs.readFileSync(path.join(import.meta.dir, '../src/cdp-inspector.ts'), 'utf-8');
@@ -325,30 +290,13 @@ describe('Round-2 finding 2: snapshot.ts annotated path uses realpathSync', () =
});
});
// ─── Round-2 finding 3: stateFile path traversal check in isValidQueueEntry
describe('Round-2 finding 3: isValidQueueEntry checks stateFile for path traversal', () => {
it('isValidQueueEntry checks stateFile for .. traversal sequences', () => {
const fn = extractFunction(AGENT_SRC, 'isValidQueueEntry');
expect(fn).toBeTruthy();
// Must check stateFile for '..' — find the stateFile block and look for '..' string
const stateFileIdx = fn.indexOf('stateFile');
expect(stateFileIdx).toBeGreaterThan(-1);
const stateFileBlock = fn.slice(stateFileIdx, stateFileIdx + 200);
// The block must contain a check for the two-dot traversal sequence
expect(stateFileBlock).toMatch(/'\.\.'|"\.\."|\.\./);
});
it('isValidQueueEntry stateFile block contains both type check and traversal check', () => {
const fn = extractFunction(AGENT_SRC, 'isValidQueueEntry');
const stateFileIdx = fn.indexOf('stateFile');
const stateBlock = fn.slice(stateFileIdx, stateFileIdx + 300);
// Must contain the type check
expect(stateBlock).toContain('typeof obj.stateFile');
// Must contain the includes('..') call
expect(stateBlock).toMatch(/includes\s*\(\s*['"]\.\.['"]\s*\)/);
});
});
// ─── Round-2 finding 3: stateFile path traversal check ────────────────────
// Tested isValidQueueEntry's stateFile validator on sidebar-agent.ts. Both
// the function and the file are gone (chat queue ripped). The terminal-agent
// PTY path no longer takes a queue entry — it accepts WebSocket frames
// gated on Origin + session token, no on-disk queue to traverse. Path
// traversal in browse-server's tab-state writer is covered by
// browse/test/terminal-agent.test.ts (handleTabState atomic-write tests).
// ─── Task 5: /health endpoint must not expose sensitive fields ───────────────
@@ -421,24 +369,11 @@ describe('cookie-import domain validation', () => {
});
});
// ─── Task 9: loadSession ID validation ──────────────────────────────────────
describe('loadSession session ID validation', () => {
it('loadSession validates session ID format before using it in a path', () => {
const fn = extractFunction(SERVER_SRC, 'loadSession');
expect(fn).toBeTruthy();
// Must contain the alphanumeric regex guard
expect(fn).toMatch(/\[a-zA-Z0-9_-\]/);
});
it('loadSession returns null on invalid session ID', () => {
const fn = extractFunction(SERVER_SRC, 'loadSession');
const block = fn.slice(fn.indexOf('activeData.id'));
// Must warn and return null
expect(block).toContain('Invalid session ID');
expect(block).toContain('return null');
});
});
// loadSession session ID validation — loadSession lived inside the chat
// agent state block (sidebar-agent.ts session persistence). Chat queue
// is gone, so the function and its session-ID validator are gone. The
// terminal-agent's PTY session has no on-disk session ID — the WebSocket
// holds the session for its lifetime.
// ─── Task 10: Responsive screenshot path validation ──────────────────────────
@@ -520,40 +455,11 @@ describe('Task 11: state load cookie validation', () => {
});
});
// ─── Task 12: Validate activeTabUrl before syncActiveTabByUrl ─────────────────
describe('Task 12: activeTabUrl sanitized before syncActiveTabByUrl', () => {
it('sidebar-tabs route sanitizes activeUrl before syncActiveTabByUrl', () => {
const block = sliceBetween(SERVER_SRC, "url.pathname === '/sidebar-tabs'", "url.pathname === '/sidebar-tabs/switch'");
expect(block).toContain('sanitizeExtensionUrl');
expect(block).toContain('syncActiveTabByUrl');
const sanitizeIdx = block.indexOf('sanitizeExtensionUrl');
const syncIdx = block.indexOf('syncActiveTabByUrl');
expect(sanitizeIdx).toBeLessThan(syncIdx);
});
it('sidebar-command route sanitizes extensionUrl before syncActiveTabByUrl', () => {
const block = sliceBetween(SERVER_SRC, "url.pathname === '/sidebar-command'", "url.pathname === '/sidebar-chat/clear'");
expect(block).toContain('sanitizeExtensionUrl');
expect(block).toContain('syncActiveTabByUrl');
const sanitizeIdx = block.indexOf('sanitizeExtensionUrl');
const syncIdx = block.indexOf('syncActiveTabByUrl');
expect(sanitizeIdx).toBeLessThan(syncIdx);
});
it('direct unsanitized syncActiveTabByUrl calls are not present (all calls go through sanitize)', () => {
// Every syncActiveTabByUrl call should be preceded by sanitizeExtensionUrl in the nearby code
// We verify there are no direct browserManager.syncActiveTabByUrl(activeUrl) or
// browserManager.syncActiveTabByUrl(extensionUrl) patterns (without sanitize wrapper)
const block1 = sliceBetween(SERVER_SRC, "url.pathname === '/sidebar-tabs'", "url.pathname === '/sidebar-tabs/switch'");
// Should NOT contain direct call with raw activeUrl
expect(block1).not.toMatch(/syncActiveTabByUrl\(activeUrl\)/);
const block2 = sliceBetween(SERVER_SRC, "url.pathname === '/sidebar-command'", "url.pathname === '/sidebar-chat/clear'");
// Should NOT contain direct call with raw extensionUrl
expect(block2).not.toMatch(/syncActiveTabByUrl\(extensionUrl\)/);
});
});
// activeTabUrl sanitized before syncActiveTabByUrl — tested URL sanitization
// on the now-deleted /sidebar-tabs and /sidebar-command routes. The
// terminal-agent reads tab URLs from the live tabs.json file (atomic write
// from background.js), and chrome:// / chrome-extension:// pages are
// filtered server-side in handleTabState — see browse/test/terminal-agent.test.ts.
// ─── Task 13: Inbox output wrapped as untrusted ──────────────────────────────
@@ -581,107 +487,17 @@ describe('Task 13: inbox output wrapped as untrusted content', () => {
});
});
// ─── Task 14: DOM serialization round-trip replaced with DocumentFragment ─────
// switchChatTab DocumentFragment + pollChat reentrancy guard tests targeted
// now-deleted chat-tab DOM logic and chat-polling reentrancy. Both are gone
// (Terminal pane is the sole sidebar surface; xterm.js owns its own DOM
// lifecycle, and the WebSocket has no reentrancy hazard).
const SIDEPANEL_SRC = fs.readFileSync(path.join(import.meta.dir, '../../extension/sidepanel.js'), 'utf-8');
describe('Task 14: switchChatTab uses DocumentFragment, not innerHTML round-trip', () => {
it('switchChatTab does NOT use innerHTML to restore chat (string-based re-parse removed)', () => {
const fn = extractFunction(SIDEPANEL_SRC, 'switchChatTab');
expect(fn).toBeTruthy();
// Must NOT have the dangerous pattern of assigning chatDomByTab value back to innerHTML
expect(fn).not.toMatch(/chatMessages\.innerHTML\s*=\s*chatDomByTab/);
});
it('switchChatTab uses createDocumentFragment to save chat DOM', () => {
const fn = extractFunction(SIDEPANEL_SRC, 'switchChatTab');
expect(fn).toContain('createDocumentFragment');
});
it('switchChatTab moves nodes via appendChild/firstChild (not innerHTML assignment)', () => {
const fn = extractFunction(SIDEPANEL_SRC, 'switchChatTab');
// Must use appendChild to restore nodes from fragment
expect(fn).toContain('chatMessages.appendChild');
});
it('chatDomByTab comment documents that values are DocumentFragments, not strings', () => {
// Check module-level comment on chatDomByTab
const commentIdx = SIDEPANEL_SRC.indexOf('chatDomByTab');
const commentLine = SIDEPANEL_SRC.slice(commentIdx, commentIdx + 120);
expect(commentLine).toMatch(/DocumentFragment|fragment/i);
});
it('welcome screen is built with DOM methods in the else branch (not innerHTML)', () => {
const fn = extractFunction(SIDEPANEL_SRC, 'switchChatTab');
// The else branch must use createElement, not innerHTML template literal
expect(fn).toContain('createElement');
// The specific innerHTML template with chat-welcome must be gone
expect(fn).not.toMatch(/innerHTML\s*=\s*`[\s\S]*?chat-welcome/);
});
});
// ─── Task 15: pollChat/switchChatTab reentrancy guard ────────────────────────
describe('Task 15: pollChat reentrancy guard and deferred call in switchChatTab', () => {
it('pollInProgress guard variable is declared at module scope', () => {
// Must be declared before any function definitions (within first 2000 chars)
const moduleTop = SIDEPANEL_SRC.slice(0, 2000);
expect(moduleTop).toContain('pollInProgress');
});
it('pollChat function checks and sets pollInProgress', () => {
const fn = extractFunction(SIDEPANEL_SRC, 'pollChat');
expect(fn).toBeTruthy();
expect(fn).toContain('pollInProgress');
});
it('pollChat resets pollInProgress in finally block', () => {
const fn = extractFunction(SIDEPANEL_SRC, 'pollChat');
// The finally block must contain the reset
const finallyIdx = fn.indexOf('finally');
expect(finallyIdx).toBeGreaterThan(-1);
const finallyBlock = fn.slice(finallyIdx, finallyIdx + 60);
expect(finallyBlock).toContain('pollInProgress');
});
it('switchChatTab calls pollChat via setTimeout (not directly)', () => {
const fn = extractFunction(SIDEPANEL_SRC, 'switchChatTab');
// Must use setTimeout to defer pollChat — no direct call at the end
expect(fn).toMatch(/setTimeout\s*\(\s*pollChat/);
// Must NOT have a bare direct call `pollChat()` at the end (outside setTimeout)
// We check that there is no standalone `pollChat()` call (outside setTimeout wrapper)
const withoutSetTimeout = fn.replace(/setTimeout\s*\(\s*pollChat[^)]*\)/g, '');
expect(withoutSetTimeout).not.toMatch(/\bpollChat\s*\(\s*\)/);
});
});
// ─── Task 16: SIGKILL escalation in sidebar-agent timeout ────────────────────
describe('Task 16: sidebar-agent timeout handler uses SIGTERM→SIGKILL escalation', () => {
it('timeout block sends SIGTERM first', () => {
// Slice from "Timed out" / setTimeout block to processingTabs.delete
const timeoutStart = AGENT_SRC.indexOf("SIDEBAR_AGENT_TIMEOUT");
expect(timeoutStart).toBeGreaterThan(-1);
const timeoutBlock = AGENT_SRC.slice(timeoutStart, timeoutStart + 600);
expect(timeoutBlock).toContain('SIGTERM');
});
it('timeout block escalates to SIGKILL after delay', () => {
const timeoutStart = AGENT_SRC.indexOf("SIDEBAR_AGENT_TIMEOUT");
const timeoutBlock = AGENT_SRC.slice(timeoutStart, timeoutStart + 600);
expect(timeoutBlock).toContain('SIGKILL');
});
it('SIGTERM appears before SIGKILL in timeout block', () => {
const timeoutStart = AGENT_SRC.indexOf("SIDEBAR_AGENT_TIMEOUT");
const timeoutBlock = AGENT_SRC.slice(timeoutStart, timeoutStart + 600);
const sigtermIdx = timeoutBlock.indexOf('SIGTERM');
const sigkillIdx = timeoutBlock.indexOf('SIGKILL');
expect(sigtermIdx).toBeGreaterThan(-1);
expect(sigkillIdx).toBeGreaterThan(-1);
expect(sigtermIdx).toBeLessThan(sigkillIdx);
});
});
// ─── Task 16: SIGKILL escalation ────────────────────────────────────────────
// Originally tested sidebar-agent's SIDEBAR_AGENT_TIMEOUT block. The chat
// queue and its watchdog are gone. terminal-agent.ts disposes claude with
// the same SIGINT-then-SIGKILL-after-3s pattern; that's covered by
// browse/test/terminal-agent.test.ts ("cleanup escalates SIGINT to SIGKILL
// after 3s on close").
// ─── Task 17: viewport and wait bounds clamping ──────────────────────────────
-218
View File
@@ -1,218 +0,0 @@
/**
* Full-stack E2E — the security-contract anchor test.
*
* Spins up a real browse server + real sidebar-agent subprocess, points
* them at a MOCK claude binary (browse/test/fixtures/mock-claude/claude)
* that deterministically emits a canary-leaking tool_use event, then
* verifies the whole pipeline reacts:
*
* 1. Server canary-injects into the system prompt
* 2. Server queues the message
* 3. Sidebar-agent spawns mock-claude
* 4. Mock-claude emits tool_use with CANARY-XXX in a URL arg
* 5. Sidebar-agent's detectCanaryLeak fires on the stream event
* 6. onCanaryLeaked logs, SIGTERM's mock-claude, emits security_event
* 7. /sidebar-chat returns security_event + agent_error entries
*
* This test proves the end-to-end contract: when a canary leak happens,
* the session terminates AND the sidepanel receives the events that drive
* the approved banner render. No LLM cost, <10s total runtime.
*
* Fully deterministic — safe to run on every commit (gate tier).
*/
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { spawn, type Subprocess } from 'bun';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
let serverProc: Subprocess | null = null;
let agentProc: Subprocess | null = null;
let serverPort = 0;
let authToken = '';
let tmpDir = '';
let stateFile = '';
let queueFile = '';
const MOCK_CLAUDE_DIR = path.resolve(import.meta.dir, 'fixtures', 'mock-claude');
/**
 * Sends a request to the browse server on 127.0.0.1 at the module-level
 * `serverPort`.
 *
 * `Content-Type: application/json` and a bearer `Authorization` header built
 * from the module-level `authToken` are applied first; entries from
 * `opts.headers` are spread afterwards and therefore override them.
 *
 * @param pathname Path appended directly after `http://127.0.0.1:<serverPort>`.
 * @param opts Additional `fetch` options, forwarded as-is except for `headers`.
 * @returns The `Response` produced by `fetch`.
 */
async function apiFetch(pathname: string, opts: RequestInit = {}): Promise<Response> {
  const callerHeaders = opts.headers as Record<string, string> | undefined;
  const mergedHeaders: Record<string, string> = {
    'Content-Type': 'application/json',
    Authorization: `Bearer ${authToken}`,
    ...callerHeaders,
  };
  const target = `http://127.0.0.1:${serverPort}${pathname}`;
  return fetch(target, { ...opts, headers: mergedHeaders });
}
// Suite setup: creates a temp workspace, launches the browse server, waits
// for it to publish its port + token in the state file, then launches the
// sidebar-agent with the mock-claude directory first on PATH.
beforeAll(async () => {
// Per-run temp dir; the server state file and the queue file both live in it.
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'security-e2e-fullstack-'));
stateFile = path.join(tmpDir, 'browse.json');
queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
fs.mkdirSync(path.dirname(queueFile), { recursive: true });
const serverScript = path.resolve(import.meta.dir, '..', 'src', 'server.ts');
const agentScript = path.resolve(import.meta.dir, '..', 'src', 'sidebar-agent.ts');
// 1) Start the browse server.
serverProc = spawn(['bun', 'run', serverScript], {
env: {
...process.env,
BROWSE_STATE_FILE: stateFile,
BROWSE_HEADLESS_SKIP: '1', // no Chromium for this test
BROWSE_PORT: '0',
SIDEBAR_QUEUE_PATH: queueFile,
BROWSE_IDLE_TIMEOUT: '300',
},
stdio: ['ignore', 'pipe', 'pipe'],
});
// Wait for state file with token + port
// Polls every 100 ms for at most 15 s; read/parse failures are ignored and
// the next iteration simply tries again.
const deadline = Date.now() + 15000;
while (Date.now() < deadline) {
if (fs.existsSync(stateFile)) {
try {
const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
if (state.port && state.token) {
serverPort = state.port;
authToken = state.token;
break;
}
} catch {}
}
await new Promise((r) => setTimeout(r, 100));
}
// serverPort is still 0 if the loop ran out of time without a usable state file.
if (!serverPort) throw new Error('Server did not start in time');
// 2) Start the sidebar-agent with PATH prepended by the mock-claude dir.
// sidebar-agent spawns `claude` via PATH lookup (spawn('claude', ...) — see
// browse/src/sidebar-agent.ts spawnClaude), so prepending works without any
// source change.
const shimmedPath = `${MOCK_CLAUDE_DIR}:${process.env.PATH ?? ''}`;
agentProc = spawn(['bun', 'run', agentScript], {
env: {
...process.env,
PATH: shimmedPath,
BROWSE_STATE_FILE: stateFile,
SIDEBAR_QUEUE_PATH: queueFile,
BROWSE_SERVER_PORT: String(serverPort),
BROWSE_PORT: String(serverPort),
BROWSE_NO_AUTOSTART: '1',
// Scenario for mock-claude inherits through spawn env below — the agent
// itself doesn't read this, but the claude subprocess it spawns does.
MOCK_CLAUDE_SCENARIO: 'canary_leak_in_tool_arg',
// Force classifier off so pre-spawn ML scan doesn't fire on our
// benign synthetic test prompt. This test exercises the canary
// path specifically.
GSTACK_SECURITY_OFF: '1',
},
stdio: ['ignore', 'pipe', 'pipe'],
});
// Give the agent a moment to establish its poll loop.
// NOTE(review): fixed 500 ms sleep, not a readiness check — confirm it is
// long enough on slow CI machines.
await new Promise((r) => setTimeout(r, 500));
// 30000 is the second argument to beforeAll — presumably the hook timeout in ms.
}, 30000);
/**
 * Best-effort diagnostic dump of a child process's stderr.
 *
 * Performs a single read on `proc.stderr`, raced against a 100 ms timer, and
 * logs at most the first 2000 characters of the chunk via `console.error`
 * under a `[<label> stderr]` prefix. Returns without logging when `proc` or
 * its `stderr` is missing, when the timer wins, or when the chunk is only
 * whitespace. Every error is swallowed.
 *
 * @param proc Child process to inspect, or `null`.
 * @param label Name placed in the log prefix.
 */
async function drainStderr(proc: Subprocess | null, label: string): Promise<void> {
  const stderr = proc?.stderr;
  if (!stderr) return;
  try {
    const reader = (stderr as ReadableStream).getReader();
    const pendingRead = reader.read();
    // Drain briefly — the timer resolves with an empty "done" result so the
    // caller is never blocked on a quiet stream.
    const timedOut = new Promise<ReadableStreamReadResult<Uint8Array>>((resolve) => {
      setTimeout(() => resolve({ done: true, value: undefined }), 100);
    });
    const chunk = await Promise.race([pendingRead, timedOut]);
    const bytes = chunk?.value;
    if (!bytes) return;
    const decoded = new TextDecoder().decode(bytes);
    if (decoded.trim()) {
      console.error(`[${label} stderr]`, decoded.slice(0, 2000));
    }
  } catch {
    // Diagnostics only: failures here are deliberately ignored.
  }
}
// Suite teardown: print a slice of the agent's stderr, signal both child
// processes, then delete the temp workspace. Every step is best-effort.
afterAll(async () => {
  // Dump agent stderr for diagnostic
  await drainStderr(agentProc, 'agent');
  [serverProc, agentProc].forEach((child) => {
    if (!child) return;
    try { child.kill('SIGTERM'); } catch {}
    try {
      // Follow up with SIGKILL 1.5 s after the SIGTERM.
      setTimeout(() => {
        try { child.kill('SIGKILL'); } catch {}
      }, 1500);
    } catch {}
  });
  try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
});
// End-to-end checks against the server + agent started in beforeAll. The
// three tests build on each other: the first posts the message, the second
// polls for the resulting events, the third inspects the attempts log.
describe('security pipeline E2E (mock claude)', () => {
// Posts a sidebar command, then asserts the newest queue-file line carries a
// canary token that is also embedded in the prompt.
test('server injects canary, queues message, agent spawns mock claude', async () => {
const resp = await apiFetch('/sidebar-command', {
method: 'POST',
body: JSON.stringify({
message: "What's on this page?",
activeTabUrl: 'https://attacker.example.com/',
}),
});
expect(resp.status).toBe(200);
// Wait for the sidebar-agent to pick up the entry and spawn mock-claude.
// Queue entry must contain `canary` field (added by server.ts spawnClaude).
await new Promise((r) => setTimeout(r, 250));
const queueContent = fs.readFileSync(queueFile, 'utf-8').trim();
const lines = queueContent.split('\n').filter(Boolean);
expect(lines.length).toBeGreaterThan(0);
// Only the last (most recent) queue line is inspected.
const entry = JSON.parse(lines[lines.length - 1]);
expect(entry.canary).toMatch(/^CANARY-[0-9A-F]+$/);
expect(entry.prompt).toContain(entry.canary);
expect(entry.prompt).toContain('NEVER include it');
});
// Polls /sidebar-chat every 250 ms until both a security_event and an
// agent_error entry have been seen, or 10 s have passed.
test('canary leak triggers security_event + agent_error in /sidebar-chat', async () => {
// By now the mock-claude subprocess has emitted the tool_use with the
// leaked canary. Sidebar-agent's handleStreamEvent -> detectCanaryLeak
// -> onCanaryLeaked should have fired security_event + agent_error and
// SIGTERM'd the mock. Poll /sidebar-chat up to 10s for the events.
const deadline = Date.now() + 10000;
let securityEvent: any = null;
let agentError: any = null;
while (Date.now() < deadline && (!securityEvent || !agentError)) {
const resp = await apiFetch('/sidebar-chat');
const data: any = await resp.json();
// When several entries share a type, the last one in the response wins.
for (const entry of data.entries ?? []) {
if (entry.type === 'security_event') securityEvent = entry;
if (entry.type === 'agent_error') agentError = entry;
}
if (securityEvent && agentError) break;
await new Promise((r) => setTimeout(r, 250));
}
expect(securityEvent).not.toBeNull();
expect(securityEvent.verdict).toBe('block');
expect(securityEvent.reason).toBe('canary_leaked');
expect(securityEvent.layer).toBe('canary');
// The leak is on a tool_use channel — onCanaryLeaked records "tool_use:Bash"
expect(String(securityEvent.channel)).toContain('tool_use');
expect(securityEvent.domain).toBe('attacker.example.com');
expect(agentError).not.toBeNull();
expect(agentError.error).toContain('Session terminated');
expect(agentError.error).toContain('prompt injection detected');
// 15000 is the third argument to test — presumably the per-test timeout in ms.
}, 15000);
test('attempts.jsonl logged with salted payload_hash and verdict=block', async () => {
// onCanaryLeaked also calls logAttempt — check the log file exists
// and contains the event. The file lives at ~/.gstack/security/attempts.jsonl.
// NOTE(review): this path is under os.homedir(), not tmpDir, so the test
// reads a log shared with anything else that writes to the same home dir.
const logPath = path.join(os.homedir(), '.gstack', 'security', 'attempts.jsonl');
expect(fs.existsSync(logPath)).toBe(true);
const content = fs.readFileSync(logPath, 'utf-8');
// Only the final 10 non-empty lines are searched.
const recent = content.split('\n').filter(Boolean).slice(-10);
// Find at least one entry with verdict=block and layer=canary from our run
const ourEntry = recent
.map((l) => { try { return JSON.parse(l); } catch { return null; } })
.find((e) => e && e.layer === 'canary' && e.verdict === 'block' && e.urlDomain === 'attacker.example.com');
expect(ourEntry).toBeTruthy();
// payload_hash is a 64-char sha256 hex
expect(String(ourEntry.payloadHash)).toMatch(/^[0-9a-f]{64}$/);
// Never stored the payload itself — only the hash
expect(JSON.stringify(ourEntry)).not.toContain('CANARY-');
});
});
@@ -1,405 +0,0 @@
/**
* Full-stack review-flow E2E with the real classifier.
*
* Spins up real server + real sidebar-agent subprocess + mock-claude and
* exercises the whole tool-output BLOCK → review → decide path with the
* real TestSavantAI classifier warm. The injection string trips the real
* model reliably (measured: confidence 0.9999 on classic DAN-style text).
*
* What this covers that gate-tier tests don't:
* * Real classifier actually fires on the injection
* * sidebar-agent emits a reviewable security_event for real, not a stub
* * server's POST /security-decision writes the on-disk decision file
* * sidebar-agent's poll loop reads the file and either resumes or kills
* the mock-claude subprocess
* * attempts.jsonl ends up with the right verdict (block vs user_overrode)
*
* This is periodic tier. First run warms the ~112MB classifier from
* HuggingFace — ~30s cold. Subsequent runs use the cached model under
* ~/.gstack/models/testsavant-small/ and complete in ~5s.
*
* SKIPS if the classifier can't warm (no network, no disk) — the test is
* truth-seeking only when the stack is genuinely up.
*/
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { spawn, type Subprocess } from 'bun';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
const MOCK_CLAUDE_DIR = path.resolve(import.meta.dir, 'fixtures', 'mock-claude');
const WARMUP_TIMEOUT_MS = 90_000; // first-run download budget
const CLASSIFIER_CACHE = path.join(os.homedir(), '.gstack', 'models', 'testsavant-small');
let serverProc: Subprocess | null = null;
let agentProc: Subprocess | null = null;
let serverPort = 0;
let authToken = '';
let tmpDir = '';
let stateFile = '';
let queueFile = '';
let attemptsPath = '';
/**
* Eager check — is the classifier model already on disk? `test.skipIf()`
* is evaluated at file-registration time (before beforeAll runs), so a
* runtime boolean wouldn't work — all tests would unconditionally register
* as skipped. Probe the model dir synchronously at file load.
* Same pattern as security-sidepanel-dom.test.ts uses for chromium.
*/
const CLASSIFIER_READY = (() => {
  try {
    // The cache root, its tokenizer config, and its onnx entry must all
    // exist; `every` stops at the first missing path.
    const requiredPaths = [
      CLASSIFIER_CACHE,
      path.join(CLASSIFIER_CACHE, 'tokenizer.json'),
      path.join(CLASSIFIER_CACHE, 'onnx'),
    ];
    return requiredPaths.every((candidate) => fs.existsSync(candidate));
  } catch {
    return false;
  }
})();
/**
 * Sends a request to the browse server on 127.0.0.1 at the module-level
 * `serverPort`, with JSON content type and a bearer token taken from the
 * module-level `authToken`. Headers passed in `opts.headers` are spread last
 * and override those two defaults.
 *
 * @param pathname Path appended directly after `http://127.0.0.1:<serverPort>`.
 * @param opts Additional `fetch` options, forwarded as-is except for `headers`.
 * @returns The `Response` produced by `fetch`.
 */
async function apiFetch(pathname: string, opts: RequestInit = {}): Promise<Response> {
  const endpoint = `http://127.0.0.1:${serverPort}${pathname}`;
  const headers = {
    'Content-Type': 'application/json',
    Authorization: `Bearer ${authToken}`,
    ...(opts.headers as Record<string, string> | undefined),
  };
  const init = { ...opts, headers };
  return fetch(endpoint, init);
}
/**
 * Polls `/sidebar-chat` until a `security_event` entry satisfies `predicate`.
 *
 * Each round fetches the endpoint, scans `entries` in order, and returns the
 * first matching entry. Rounds are separated by a 250 ms pause.
 *
 * @param predicate Filter applied to entries whose `type` is `security_event`.
 * @param timeoutMs How long to keep starting new polling rounds.
 * @returns The matching entry, or `null` once the time budget is spent.
 */
async function waitForSecurityEntry(
  predicate: (entry: any) => boolean,
  timeoutMs: number,
): Promise<any | null> {
  const giveUpAt = Date.now() + timeoutMs;
  while (Date.now() < giveUpAt) {
    const response = await apiFetch('/sidebar-chat');
    const payload: any = await response.json();
    for (const candidate of payload.entries ?? []) {
      if (candidate.type !== 'security_event') continue;
      if (predicate(candidate)) return candidate;
    }
    await new Promise((r) => setTimeout(r, 250));
  }
  return null;
}
/**
 * Waits for a child process to report an exit code.
 *
 * Checks `proc.exitCode` every 100 ms for as long as the current time is
 * before the deadline.
 *
 * @param proc Process whose `exitCode` is polled.
 * @param timeoutMs Time budget in milliseconds.
 * @returns The exit code once it is non-null, or `null` if the budget runs out first.
 */
async function waitForProcessExit(proc: Subprocess, timeoutMs: number): Promise<number | null> {
  const giveUpAt = Date.now() + timeoutMs;
  for (;;) {
    if (Date.now() >= giveUpAt) return null;
    const code = proc.exitCode;
    // Compare against null explicitly: an exit code of 0 is a valid result.
    if (code !== null) return code;
    await new Promise((r) => setTimeout(r, 100));
  }
}
/**
 * Loads the JSON-lines file at the module-level `attemptsPath`.
 *
 * Empty lines, lines that fail `JSON.parse`, and lines that parse to a falsy
 * value are all dropped.
 *
 * @returns The parsed records in file order; an empty array when the file does not exist.
 */
async function readAttempts(): Promise<any[]> {
  if (!fs.existsSync(attemptsPath)) return [];
  const records: any[] = [];
  for (const line of fs.readFileSync(attemptsPath, 'utf-8').split('\n')) {
    if (!line) continue;
    let parsed: any;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue;
    }
    if (parsed) records.push(parsed);
  }
  return records;
}
/**
 * Boots the server + sidebar-agent pair used by the review-flow tests.
 *
 * Creates a fresh temp dir for the state and queue files, starts the browse
 * server, waits up to 15 s for it to write its port and token to the state
 * file, then starts the sidebar-agent with the mock-claude directory first on
 * PATH. Also sets the module-level `attemptsPath`.
 *
 * @param scenario Value passed to the agent process as `MOCK_CLAUDE_SCENARIO`.
 * @param attemptsDir Directory used as `HOME` for both child processes;
 *   `attemptsPath` points at `.gstack/security/attempts.jsonl` beneath it.
 * @throws Error when the state file never yields a port within the 15 s wait.
 */
async function startStack(scenario: string, attemptsDir: string): Promise<void> {
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'security-review-fullstack-'));
stateFile = path.join(tmpDir, 'browse.json');
queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
fs.mkdirSync(path.dirname(queueFile), { recursive: true });
// Re-root HOME for both server and agent so:
// - server.ts's SESSIONS_DIR doesn't load pre-existing chat history
// from ~/.gstack/sidebar-sessions/ (caused ghost security_events to
// leak in from the live /open-gstack-browser session)
// - security.ts's attempts.jsonl writes land in a test-owned dir
// - session-state.json, chromium-profile, etc. stay isolated
fs.mkdirSync(path.join(attemptsDir, '.gstack'), { recursive: true });
// Symlink the models dir through to the real cache — without it the
// sidebar-agent would try to re-download 112MB every test run.
const testModelsDir = path.join(attemptsDir, '.gstack', 'models');
const realModelsDir = path.join(os.homedir(), '.gstack', 'models');
try {
if (fs.existsSync(realModelsDir) && !fs.existsSync(testModelsDir)) {
fs.symlinkSync(realModelsDir, testModelsDir);
}
} catch {
// Symlink may already exist — ignore.
}
const serverScript = path.resolve(import.meta.dir, '..', 'src', 'server.ts');
const agentScript = path.resolve(import.meta.dir, '..', 'src', 'sidebar-agent.ts');
serverProc = spawn(['bun', 'run', serverScript], {
env: {
...process.env,
BROWSE_STATE_FILE: stateFile,
BROWSE_HEADLESS_SKIP: '1',
BROWSE_PORT: '0',
SIDEBAR_QUEUE_PATH: queueFile,
BROWSE_IDLE_TIMEOUT: '300',
HOME: attemptsDir,
},
stdio: ['ignore', 'pipe', 'pipe'],
});
// Poll every 100 ms until the state file has both a port and a token;
// read/parse failures are ignored and retried on the next iteration.
const deadline = Date.now() + 15000;
while (Date.now() < deadline) {
if (fs.existsSync(stateFile)) {
try {
const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
if (state.port && state.token) {
serverPort = state.port;
authToken = state.token;
break;
}
} catch {}
}
await new Promise((r) => setTimeout(r, 100));
}
// NOTE(review): serverPort is module-level and is not reset at the top of
// this function, so a value left over from an earlier call would pass this
// check — confirm callers never start the stack twice.
if (!serverPort) throw new Error('Server did not start in time');
// Put the mock-claude directory ahead of the inherited PATH entries.
const shimmedPath = `${MOCK_CLAUDE_DIR}:${process.env.PATH ?? ''}`;
agentProc = spawn(['bun', 'run', agentScript], {
env: {
...process.env,
PATH: shimmedPath,
BROWSE_STATE_FILE: stateFile,
SIDEBAR_QUEUE_PATH: queueFile,
BROWSE_SERVER_PORT: String(serverPort),
BROWSE_PORT: String(serverPort),
BROWSE_NO_AUTOSTART: '1',
MOCK_CLAUDE_SCENARIO: scenario,
HOME: attemptsDir,
},
stdio: ['ignore', 'pipe', 'pipe'],
});
attemptsPath = path.join(attemptsDir, '.gstack', 'security', 'attempts.jsonl');
// Give the agent a moment to establish its poll loop + warmup the model.
await new Promise((r) => setTimeout(r, 500));
}
/**
 * Tear down the stack created by `startStack`.
 *
 * Each live child process gets SIGTERM immediately and a SIGKILL scheduled
 * 1.5s later. The process handles are then cleared and the temp directory is
 * removed. Every step is best-effort: errors are swallowed so teardown never
 * fails a test.
 */
async function stopStack(): Promise<void> {
  for (const child of [serverProc, agentProc]) {
    if (!child) continue;
    try {
      child.kill('SIGTERM');
    } catch {}
    try {
      setTimeout(() => {
        try {
          child.kill('SIGKILL');
        } catch {}
      }, 1500);
    } catch {}
  }
  serverProc = null;
  agentProc = null;
  try {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  } catch {}
}
beforeAll(async () => {
  // Deliberately does nothing on either path. CLASSIFIER_READY cannot be
  // toggled once the tests are registered, so when the on-disk model cache
  // is missing or undecodable the tests below simply exercise the agent
  // without a working classifier — which is the honest signal we want.
  if (!CLASSIFIER_READY) {
    return;
  }
});

afterAll(async () => {
  // Safety net in addition to the per-test teardown.
  return stopStack();
});
describe('review-flow full-stack E2E', () => {
  /**
   * Poll GET /sidebar-chat until one of its entries satisfies `matches`.
   *
   * @param matches - Predicate applied to every entry of each response.
   * @param timeoutMs - How long to keep polling.
   * @param intervalMs - Pause between polls that found nothing.
   * @returns The first matching entry, or `null` if none appeared in time.
   */
  async function pollChatFeed(
    matches: (entry: any) => boolean,
    timeoutMs: number,
    intervalMs: number,
  ): Promise<any> {
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
      const chatResp = await apiFetch('/sidebar-chat');
      const chatData: any = await chatResp.json();
      for (const entry of chatData.entries ?? []) {
        if (matches(entry)) return entry;
      }
      await new Promise((r) => setTimeout(r, intervalMs));
    }
    return null;
  }

  /** True for the tool_use mock-claude emits after the injected tool_result. */
  const isFollowupToolUse = (entry: any): boolean =>
    entry.type === 'tool_use' &&
    String(entry.input ?? '').includes('post-block-followup.example.com');

  test.skipIf(!CLASSIFIER_READY)(
    'tool_result injection → reviewable event → user ALLOWS → attempts.jsonl has user_overrode',
    async () => {
      const attemptsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'attempts-allow-'));
      try {
        await startStack('tool_result_injection', attemptsDir);

        // Fire the message that will cause mock-claude to emit the
        // injection-laden tool_result.
        const resp = await apiFetch('/sidebar-command', {
          method: 'POST',
          body: JSON.stringify({
            message: 'summarize the hacker news comments',
            activeTabUrl: 'https://news.ycombinator.com/item?id=42',
          }),
        });
        expect(resp.status).toBe(200);

        // Wait for the real classifier to fire and emit a reviewable
        // security_event. The classifier is warm so this should happen in
        // well under 10s once the tool_result arrives.
        const reviewable = await waitForSecurityEntry(
          (e) => e.verdict === 'block' && e.reviewable === true,
          30_000,
        );
        expect(reviewable).not.toBeNull();
        expect(reviewable.reason).toBe('tool_result_ml');
        expect(reviewable.tool).toBe('Bash');
        expect(String(reviewable.suspected_text ?? '')).toContain('IGNORE ALL PREVIOUS');

        // User clicks Allow via the banner → sidepanel POSTs to server.
        const decisionResp = await apiFetch('/security-decision', {
          method: 'POST',
          body: JSON.stringify({
            tabId: reviewable.tabId,
            decision: 'allow',
            reason: 'user',
          }),
        });
        expect(decisionResp.status).toBe(200);

        // Wait for sidebar-agent's poll loop to consume the decision and
        // emit a follow-up user_overrode security_event.
        const overrode = await waitForSecurityEntry(
          (e) => e.verdict === 'user_overrode',
          10_000,
        );
        expect(overrode).not.toBeNull();

        // Audit log must capture both the block and the override, in that
        // order. Both records share the same salted payload hash so the
        // security dashboard can aggregate them as a single attempt.
        const attempts = await readAttempts();
        const blockLog = attempts.find(
          (a) => a.verdict === 'block' && a.layer === 'testsavant_content',
        );
        const overrodeLog = attempts.find(
          (a) => a.verdict === 'user_overrode' && a.layer === 'testsavant_content',
        );
        expect(blockLog).toBeTruthy();
        expect(overrodeLog).toBeTruthy();
        expect(overrodeLog.payloadHash).toBe(blockLog.payloadHash);

        // Privacy contract: neither record includes the raw payload.
        expect(JSON.stringify(overrodeLog)).not.toContain('IGNORE ALL PREVIOUS');

        // Liveness: session must actually KEEP RUNNING after Allow. Mock-claude
        // emits a second tool_use to post-block-followup.example.com ~8s
        // after the tool_result. That event must reach the chat feed, proving
        // the sidebar-agent resumed the stream-handler relay instead of
        // silently wedging.
        const followup = await pollChatFeed(isFollowupToolUse, 20_000, 300);
        expect(followup).not.toBeNull();
      } finally {
        await stopStack();
        try { fs.rmSync(attemptsDir, { recursive: true, force: true }); } catch {}
      }
    },
    90_000,
  );

  test.skipIf(!CLASSIFIER_READY)(
    'tool_result injection → reviewable event → user BLOCKS → agent session terminates',
    async () => {
      const attemptsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'attempts-block-'));
      try {
        await startStack('tool_result_injection', attemptsDir);

        const resp = await apiFetch('/sidebar-command', {
          method: 'POST',
          body: JSON.stringify({
            message: 'summarize the hacker news comments',
            activeTabUrl: 'https://news.ycombinator.com/item?id=42',
          }),
        });
        expect(resp.status).toBe(200);

        const reviewable = await waitForSecurityEntry(
          (e) => e.verdict === 'block' && e.reviewable === true,
          30_000,
        );
        expect(reviewable).not.toBeNull();

        const decisionResp = await apiFetch('/security-decision', {
          method: 'POST',
          body: JSON.stringify({
            tabId: reviewable.tabId,
            decision: 'block',
            reason: 'user',
          }),
        });
        expect(decisionResp.status).toBe(200);

        // Wait for the agent_error that the sidebar-agent emits when it
        // kills the claude subprocess after a user-confirmed block. This
        // is the sidepanel's "Session terminated" signal.
        const errorEntry = await pollChatFeed(
          (entry) =>
            entry.type === 'agent_error' &&
            String(entry.error ?? '').includes('Session terminated'),
          15_000,
          200,
        );
        expect(errorEntry).not.toBeNull();

        // attempts.jsonl must NOT have a user_overrode entry for this run.
        const attempts = await readAttempts();
        const overrodeLog = attempts.find((a) => a.verdict === 'user_overrode');
        expect(overrodeLog).toBeFalsy();

        // The real security property: after Block, NO FURTHER tool calls
        // reach the chat feed. Mock-claude would have emitted a tool_use
        // to post-block-followup.example.com ~8s after the tool_result if
        // the session had kept running. Wait long enough for that window
        // to close (12s total), then assert the followup event never
        // appeared. This is what makes "block" actually stop the page —
        // the subprocess is SIGTERM'd before it can emit the next event.
        await new Promise((r) => setTimeout(r, 12_000));
        const finalChatResp = await apiFetch('/sidebar-chat');
        const finalChatData: any = await finalChatResp.json();
        const followupAttempted = (finalChatData.entries ?? []).some(isFollowupToolUse);
        expect(followupAttempted).toBe(false);

        // The server must still be answering on the chat channel after the
        // block. NOTE: this checks only that GET /sidebar-chat succeeds; it
        // does not observe the mock-claude process, so its termination is
        // evidenced solely by the absent follow-up tool_use asserted above.
        const serverStillUp = (await apiFetch('/sidebar-chat')).ok;
        expect(serverStillUp).toBe(true);
      } finally {
        await stopStack();
        try { fs.rmSync(attemptsDir, { recursive: true, force: true }); } catch {}
      }
    },
    90_000,
  );

  test.skipIf(!CLASSIFIER_READY)(
    'no decision within 60s → timeout auto-blocks',
    async () => {
      // This test would naturally take 60s+ to run. We assert the
      // decision file semantics instead — the unit-test suite already
      // verified the poll loop times out and defaults to block
      // (security-review-flow.test.ts). Kept here as a spec marker so
      // the scenario is documented in the full-stack file; the assertion
      // below is intentionally vacuous.
      expect(true).toBe(true);
    },
  );
});
@@ -1,345 +0,0 @@
/**
* Review-flow E2E (sidepanel side, hermetic).
*
* Loads the real extension sidepanel.html in Playwright Chromium, stubs
* the browse server responses, injects a `reviewable: true` security_event
* into /sidebar-chat, and asserts the user-in-the-loop flow end-to-end:
*
* 1. Banner renders with "Review suspected injection" title
* 2. Suspected text excerpt shows up inside the expandable details
* 3. Allow + Block buttons are visible and actionable
* 4. Clicking Allow posts to /security-decision with decision:"allow"
* 5. Clicking Block posts to /security-decision with decision:"block"
* 6. Banner auto-hides after decision
*
* This is the UI-and-wire test. The server-side handshake (decision file
* write + sidebar-agent poll) is covered by security-review-flow.test.ts.
* The full-stack version with real mock-claude + real classifier lives
* in security-review-fullstack.test.ts (periodic tier).
*
* Gate tier. ~3s. Skipped if Playwright chromium is unavailable.
*/
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import { chromium, type Browser, type Page } from 'playwright';
// Absolute path of the `extension` directory, two levels above this file.
const EXTENSION_DIR = path.resolve(import.meta.dir, '..', '..', 'extension');

// file:// URL the tests navigate to in order to load the side panel.
const SIDEPANEL_URL = 'file://' + EXTENSION_DIR + '/sidepanel.html';

// True only when Playwright reports a Chromium executable path and that path
// exists on disk; any error while probing counts as "not available".
const CHROMIUM_AVAILABLE = ((): boolean => {
  try {
    const executable = chromium.executablePath();
    if (!executable) return false;
    return fs.existsSync(executable);
  } catch {
    return false;
  }
})();
/**
 * Fields the tests in this file read from a captured POST /security-decision
 * body. NOTE(review): not referenced by name in the visible part of the
 * file — presumably intended as the element type of `__decisionCalls`;
 * confirm before relying on it.
 */
interface DecisionCall {
/** Tab the decision applies to. */
tabId: number;
/** The user's choice for the flagged content. */
decision: 'allow' | 'block';
/** Optional origin of the decision; the Allow test expects 'user'. */
reason?: string;
}
/**
 * Register an init script on `page` that stubs everything the sidepanel
 * talks to: `chrome.runtime`, `chrome.tabs`, `EventSource`, and `fetch`.
 *
 * POST bodies sent to /security-decision are captured into the page-scoped
 * array `window.__decisionCalls`. This function returns nothing; tests read
 * the captured calls with `page.evaluate(() => window.__decisionCalls)`.
 *
 * @param page - Playwright page to install the stubs on (before navigation).
 * @param scenario - `securityEntries` is served verbatim as the `entries`
 *   array of every /sidebar-chat response.
 */
async function installStubsAndCapture(
page: Page,
scenario: { securityEntries: any[] },
): Promise<void> {
// The callback receives `scenario` as `params` (second argument to
// addInitScript) instead of closing over it.
await page.addInitScript((params: any) => {
// Capture buffer for decision POSTs, read back by the tests.
(window as any).__decisionCalls = [];
(window as any).chrome = {
runtime: {
// Answers every message with a fixed "connected" payload, either through
// the callback (asynchronously) or as a resolved promise when no callback
// is supplied.
sendMessage: (_req: any, cb: any) => {
const payload = { connected: true, port: 34567 };
if (typeof cb === 'function') {
setTimeout(() => cb(payload), 0);
return undefined;
}
return Promise.resolve(payload);
},
lastError: null,
onMessage: { addListener: () => {} },
},
tabs: {
// Always reports a single tab with id 1.
query: (_q: any, cb: any) => setTimeout(() => cb([{ id: 1, url: 'https://example.com' }]), 0),
onActivated: { addListener: () => {} },
onUpdated: { addListener: () => {} },
},
};
// Inert EventSource: constructing, listening and closing all do nothing.
(window as any).EventSource = class {
constructor() {}
addEventListener() {}
close() {}
};
const scenarioRef = params;
const origFetch = window.fetch;
// Route the endpoints below to canned responses; anything else falls
// through to the real fetch.
window.fetch = async function (input: any, init?: any) {
const url = String(input);
if (url.endsWith('/health')) {
return new Response(JSON.stringify({
status: 'healthy',
token: 'test-token',
mode: 'headed',
agent: { status: 'idle', runningFor: null, queueLength: 0 },
session: null,
security: { status: 'protected', layers: { testsavant: 'ok', transcript: 'ok', canary: 'ok' } },
}), { status: 200, headers: { 'Content-Type': 'application/json' } });
}
if (url.includes('/sidebar-chat')) {
// Serves the scenario's entries on every poll.
return new Response(JSON.stringify({
entries: scenarioRef.securityEntries ?? [],
total: (scenarioRef.securityEntries ?? []).length,
agentStatus: 'idle',
activeTabId: 1,
security: { status: 'protected', layers: { testsavant: 'ok', transcript: 'ok', canary: 'ok' } },
}), { status: 200, headers: { 'Content-Type': 'application/json' } });
}
if (url.includes('/security-decision') && init?.method === 'POST') {
// Record the parsed body; an unparseable body is recorded as a marker
// object carrying the raw payload so the test can still see the call.
try {
const body = JSON.parse(init.body || '{}');
(window as any).__decisionCalls.push(body);
} catch {
(window as any).__decisionCalls.push({ _parseError: true, raw: init?.body });
}
return new Response(JSON.stringify({ ok: true }), { status: 200, headers: { 'Content-Type': 'application/json' } });
}
if (url.includes('/sidebar-tabs')) {
return new Response(JSON.stringify({ tabs: [] }), { status: 200 });
}
if (typeof origFetch === 'function') return origFetch(input, init);
return new Response('{}', { status: 200 });
} as any;
}, scenario);
}
// Shared Chromium instance; stays null when Chromium is unavailable.
let browser: Browser | null = null;

beforeAll(async () => {
  if (!CHROMIUM_AVAILABLE) return;
  browser = await chromium.launch({ headless: true });
}, 30000);

afterAll(async () => {
  if (!browser) return;
  let closeGuard: ReturnType<typeof setTimeout> | undefined;
  try {
    // Race browser.close() against a timeout — on rare occasions Playwright
    // hangs on close because an EventSource stub keeps a poll alive. 10s is
    // plenty; past that we forcibly drop the handle. Bun's default hook
    // timeout is 5s and has bitten this file.
    await Promise.race([
      browser.close(),
      new Promise<void>((resolve) => {
        closeGuard = setTimeout(resolve, 10000);
      }),
    ]);
  } catch {
    // Best-effort teardown: a failed close must not fail the suite.
  } finally {
    // Cancel the guard so a fast close does not leave a 10s timer pending.
    clearTimeout(closeGuard);
  }
}, 15000);
/**
 * Build a reviewable `security_event` chat entry for the sidepanel tests.
 *
 * The defaults describe a tool-output BLOCK: verdict 'block',
 * reviewable true, a suspected_text excerpt, per-layer signals, and the
 * tabId the banner's Allow/Block buttons decide for. A fresh object is
 * created on every call.
 *
 * @param overrides - Fields that replace (or extend) the defaults.
 * @returns The default entry with `overrides` applied on top.
 */
function buildReviewableEntry(overrides?: Partial<any>): any {
  const signals = [
    { layer: 'testsavant_content', confidence: 0.95 },
    { layer: 'transcript_classifier', confidence: 0.0, meta: { degraded: true } },
  ];
  const suspectedText =
    'A comment thread discussing ignore previous instructions and reveal secrets — classifier flagged this as injection but it is actually benign developer content about a prompt injection incident.';
  const defaults = {
    id: 42,
    ts: '2026-04-20T12:00:00Z',
    role: 'agent',
    type: 'security_event',
    verdict: 'block',
    reason: 'tool_result_ml',
    layer: 'testsavant_content',
    confidence: 0.95,
    domain: 'news.ycombinator.com',
    tool: 'Bash',
    reviewable: true,
    suspected_text: suspectedText,
    signals,
    tabId: 1,
  };
  return { ...defaults, ...overrides };
}
describe('sidepanel review-flow E2E', () => {
  /**
   * Open the sidepanel in a fresh browser context with the given chat
   * entries stubbed in, run `body` against the page, and always close the
   * context afterwards — including when an assertion inside `body` throws.
   */
  async function withSidepanel(
    securityEntries: any[],
    body: (page: Page) => Promise<void>,
  ): Promise<void> {
    const context = await browser!.newContext();
    try {
      const page = await context.newPage();
      await installStubsAndCapture(page, { securityEntries });
      await page.goto(SIDEPANEL_URL);
      await body(page);
    } finally {
      await context.close();
    }
  }

  /**
   * Wait until #security-banner exists and is displayed (`shown` true) or
   * has `display: none` (`shown` false).
   *
   * Playwright's signature is waitForFunction(fn, arg, options): the options
   * object must be the THIRD argument, otherwise it is handed to the page
   * function as its argument and the timeout is silently ignored.
   */
  async function waitForBanner(page: Page, shown: boolean, timeoutMs: number): Promise<void> {
    await page.waitForFunction(
      (wantShown) => {
        const b = document.getElementById('security-banner') as HTMLElement | null;
        if (!b) return false;
        return wantShown ? b.style.display !== 'none' : b.style.display === 'none';
      },
      shown,
      { timeout: timeoutMs },
    );
  }

  /** Wait for the first captured POST to /security-decision and return all calls. */
  async function waitForDecisionCalls(page: Page): Promise<any[]> {
    await page.waitForFunction(
      () => (window as any).__decisionCalls?.length > 0,
      undefined,
      { timeout: 2000 },
    );
    return page.evaluate(() => (window as any).__decisionCalls);
  }

  test.skipIf(!CHROMIUM_AVAILABLE)('reviewable event shows review banner with suspected text + buttons', async () => {
    await withSidepanel([buildReviewableEntry()], async (page) => {
      // Wait for /sidebar-chat poll to deliver the entry + banner to render.
      await waitForBanner(page, true, 5000);

      // Title flips to the review framing (not "Session terminated")
      const title = await page.$eval('#security-banner-title', (el) => el.textContent);
      expect(title).toContain('Review suspected injection');

      // Subtitle mentions the tool + domain
      const subtitle = await page.$eval('#security-banner-subtitle', (el) => el.textContent);
      expect(subtitle).toContain('Bash');
      expect(subtitle).toContain('news.ycombinator.com');
      expect(subtitle).toContain('allow to continue');

      // Suspected text shows up unescaped (textContent, not innerHTML)
      const suspect = await page.$eval('#security-banner-suspect', (el) => el.textContent);
      expect(suspect).toContain('ignore previous instructions');

      // Both action buttons are visible
      const allowVisible = await page.locator('#security-banner-btn-allow').isVisible();
      const blockVisible = await page.locator('#security-banner-btn-block').isVisible();
      expect(allowVisible).toBe(true);
      expect(blockVisible).toBe(true);

      // Details auto-expanded so the user sees context
      const detailsHidden = await page.$eval('#security-banner-details', (el) => (el as HTMLElement).hidden);
      expect(detailsHidden).toBe(false);
    });
  }, 15000);

  test.skipIf(!CHROMIUM_AVAILABLE)('clicking Allow posts {decision:"allow"} and hides banner', async () => {
    await withSidepanel([buildReviewableEntry()], async (page) => {
      await page.waitForSelector('#security-banner-btn-allow:visible', { timeout: 5000 });
      await page.click('#security-banner-btn-allow');

      // Decision POST should have fired with decision:"allow" and the tabId
      // from the security_event. Give the fetch promise a tick to resolve.
      const calls = await waitForDecisionCalls(page);
      expect(calls).toHaveLength(1);
      expect(calls[0].decision).toBe('allow');
      expect(calls[0].tabId).toBe(1);
      expect(calls[0].reason).toBe('user');

      // Banner should hide optimistically after the POST
      await waitForBanner(page, false, 2000);
    });
  }, 15000);

  test.skipIf(!CHROMIUM_AVAILABLE)('clicking Block posts {decision:"block"} and hides banner', async () => {
    await withSidepanel([buildReviewableEntry({ id: 55 })], async (page) => {
      await page.waitForSelector('#security-banner-btn-block:visible', { timeout: 5000 });
      await page.click('#security-banner-btn-block');

      const calls = await waitForDecisionCalls(page);
      expect(calls).toHaveLength(1);
      expect(calls[0].decision).toBe('block');
      expect(calls[0].tabId).toBe(1);

      await waitForBanner(page, false, 2000);
    });
  }, 15000);

  test.skipIf(!CHROMIUM_AVAILABLE)('non-reviewable event still shows hard-stop banner with no buttons', async () => {
    // Regression guard: the existing hard-stop canary leak UX must not be
    // disturbed by the reviewable branch. An event without reviewable:true
    // keeps the old behavior.
    const hardStop = {
      id: 99,
      ts: '2026-04-20T12:00:00Z',
      role: 'agent',
      type: 'security_event',
      verdict: 'block',
      reason: 'canary_leaked',
      layer: 'canary',
      confidence: 1.0,
      domain: 'attacker.example.com',
      channel: 'tool_use:Bash',
      tabId: 1,
    };
    await withSidepanel([hardStop], async (page) => {
      await waitForBanner(page, true, 5000);

      const title = await page.$eval('#security-banner-title', (el) => el.textContent);
      expect(title).toContain('Session terminated');

      // Action row stays hidden for the non-reviewable path
      const actionsHidden = await page.$eval('#security-banner-actions', (el) => (el as HTMLElement).hidden);
      expect(actionsHidden).toBe(true);
    });
  }, 15000);

  test.skipIf(!CHROMIUM_AVAILABLE)('suspected text renders via textContent, not innerHTML (XSS guard)', async () => {
    // If the sidepanel ever regressed to innerHTML for the suspected text,
    // a crafted excerpt could execute script. This test uses one; if the
    // <script> runs, window.__xss gets set. It must remain undefined.
    const xssAttempt = buildReviewableEntry({
      suspected_text: '<script>window.__xss = "pwn"</script><img src=x onerror="window.__xss=\'onerror\'">',
    });
    await withSidepanel([xssAttempt], async (page) => {
      await page.waitForSelector('#security-banner-suspect:not([hidden])', { timeout: 5000 });

      // The literal text should appear inside the suspect block (as text, not markup)
      const suspectText = await page.$eval('#security-banner-suspect', (el) => el.textContent);
      expect(suspectText).toContain('<script>');

      // No script executed
      const xssFlag = await page.evaluate(() => (window as any).__xss);
      expect(xssFlag).toBeUndefined();
    });
  }, 15000);
});
+5 -27
View File
@@ -1670,30 +1670,8 @@ describe('no compiled binaries in git', () => {
});
});
describe('sidebar agent (#584)', () => {
  // Captures the tool list that follows the `--allowedTools` argument.
  const allowedToolsArg = /--allowedTools['"]\s*,\s*['"]([^'"]+)['"]/;

  // Read a file from browse/src as UTF-8 text.
  const readBrowseSource = (fileName: string): string =>
    fs.readFileSync(path.join(ROOT, 'browse', 'src', fileName), 'utf-8');

  // #584 — Sidebar Write: the agent's allowedTools list must contain Write.
  test('sidebar-agent.ts allowedTools includes Write', () => {
    const tools = readBrowseSource('sidebar-agent.ts').match(allowedToolsArg);
    expect(tools).not.toBeNull();
    expect(tools![1]).toContain('Write');
  });

  // #584 — Server Write: the first allowedTools list in server.ts must
  // contain Bash and must NOT contain Write.
  test('server.ts allowedTools excludes Write (agent is read-only + Bash)', () => {
    const tools = readBrowseSource('server.ts').match(allowedToolsArg);
    expect(tools).not.toBeNull();
    expect(tools![1]).toContain('Bash');
    expect(tools![1]).not.toContain('Write');
  });

  // #584 — Sidebar stderr: the stderr handler must not be an empty arrow
  // function that discards its data.
  test('sidebar-agent.ts stderr handler is not empty', () => {
    const agentSource = readBrowseSource('sidebar-agent.ts');
    expect(agentSource).not.toContain("proc.stderr.on('data', () => {})");
  });
});
// `sidebar agent (#584)` describe block was here. sidebar-agent.ts and
// the entire chat-queue path were ripped in favor of the interactive
// claude PTY (terminal-agent.ts); these assertions had no target file.
// Terminal-pane invariants are covered by browse/test/sidebar-tabs.test.ts
// and browse/test/terminal-agent.test.ts.