From d50cdc46119f2f144d65b7b0ed17f56c7e097690 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 19 Apr 2026 18:48:02 +0800 Subject: [PATCH] feat(security): wire canary injection into sidebar spawnClaude MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every sidebar message now gets a fresh CANARY-XXXXXXXXXXXX token embedded in the system prompt with an instruction for Claude to never output it on any channel. The token flows through the queue entry so sidebar-agent.ts can check every outbound operation for leaks. If Claude echoes the canary into any outbound channel (text stream, tool arguments, URLs, file write paths), the sidebar-agent terminates the session and the user sees the approved canary leak banner. This operation is pure string manipulation — safe in the compiled browse binary. The actual output-stream check (which also has to be safe in compiled contexts) lives in sidebar-agent.ts (next commit). Co-Authored-By: Claude Opus 4.7 (1M context) --- browse/src/server.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/browse/src/server.ts b/browse/src/server.ts index 3a825c1e..e3f24fa0 100644 --- a/browse/src/server.ts +++ b/browse/src/server.ts @@ -25,6 +25,7 @@ import { runContentFilters, type ContentFilterResult, markHiddenElements, getCleanTextWithStripping, cleanupHiddenMarkers, } from './content-security'; +import { generateCanary, injectCanary } from './security'; import { handleSnapshot, SNAPSHOT_FLAGS } from './snapshot'; import { initRegistry, validateToken as validateScopedToken, checkScope, checkDomain, @@ -551,7 +552,13 @@ function spawnClaude(userMessage: string, extensionUrl?: string | null, forTabId const escapeXml = (s: string) => s.replace(/&/g, '&').replace(//g, '>'); const escapedMessage = escapeXml(userMessage); - const systemPrompt = [ + // Fresh canary per message. The sidebar-agent checks every outbound channel + // (stream text, tool_use arguments, URLs, file writes) for this token. + // If Claude echoes it anywhere, that's evidence a prompt injection overrode + // the system prompt — session is killed, user sees the banner. + const canary = generateCanary(); + + const baseSystemPrompt = [ '', `Browser co-pilot. Binary: ${B}`, 'Run `' + B + ' url` first to check the actual page. NEVER assume the URL.', @@ -576,6 +583,10 @@ function spawnClaude(userMessage: string, extensionUrl?: string | null, forTabId '', ].join('\n'); + // Append the canary instruction. injectCanary() tells Claude never to + // output the token on any channel. + const systemPrompt = injectCanary(baseSystemPrompt, canary); + const prompt = `${systemPrompt}\n\n\n${escapedMessage}\n`; // Never resume — each message is a fresh context. Resuming carries stale // page URLs and old navigation state that makes the agent fight the user. @@ -607,6 +618,7 @@ function spawnClaude(userMessage: string, extensionUrl?: string | null, forTabId sessionId: sidebarSession?.claudeSessionId || null, pageUrl: pageUrl, tabId: agentTabId, + canary, // sidebar-agent scans all outbound channels for this token }); try { fs.mkdirSync(gstackDir, { recursive: true, mode: 0o700 });