From d50cdc46119f2f144d65b7b0ed17f56c7e097690 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sun, 19 Apr 2026 18:48:02 +0800
Subject: [PATCH] feat(security): wire canary injection into sidebar
 spawnClaude
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every sidebar message now gets a fresh CANARY-XXXXXXXXXXXX token embedded
in the system prompt with an instruction for Claude to never output it on
any channel. The token flows through the queue entry so sidebar-agent.ts
can check every outbound operation for leaks.

If Claude echoes the canary into any outbound channel (text stream, tool
arguments, URLs, file write paths), the sidebar-agent terminates the
session and the user sees the approved canary leak banner.

This operation is pure string manipulation — safe in the compiled browse
binary. The actual output-stream check (which also has to be safe in
compiled contexts) lives in sidebar-agent.ts (next commit).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 browse/src/server.ts | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/browse/src/server.ts b/browse/src/server.ts
index 3a825c1e..e3f24fa0 100644
--- a/browse/src/server.ts
+++ b/browse/src/server.ts
@@ -25,6 +25,7 @@ import {
   runContentFilters, type ContentFilterResult,
   markHiddenElements, getCleanTextWithStripping, cleanupHiddenMarkers,
 } from './content-security';
+import { generateCanary, injectCanary } from './security';
 import { handleSnapshot, SNAPSHOT_FLAGS } from './snapshot';
 import {
   initRegistry, validateToken as validateScopedToken, checkScope, checkDomain,
@@ -551,7 +552,13 @@ function spawnClaude(userMessage: string, extensionUrl?: string | null, forTabId
   const escapeXml = (s: string) => s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
   const escapedMessage = escapeXml(userMessage);
 
-  const systemPrompt = [
+  // Fresh canary per message. The sidebar-agent checks every outbound channel
+  // (stream text, tool_use arguments, URLs, file writes) for this token.
+  // If Claude echoes it anywhere, that's evidence a prompt injection overrode
+  // the system prompt — session is killed, user sees the banner.
+  const canary = generateCanary();
+
+  const baseSystemPrompt = [
     '<system>',
     `Browser co-pilot. Binary: ${B}`,
     'Run `' + B + ' url` first to check the actual page. NEVER assume the URL.',
@@ -576,6 +583,10 @@ function spawnClaude(userMessage: string, extensionUrl?: string | null, forTabId
     '</system>',
   ].join('\n');
 
+  // Append the canary instruction. injectCanary() tells Claude never to
+  // output the token on any channel.
+  const systemPrompt = injectCanary(baseSystemPrompt, canary);
+
   const prompt = `${systemPrompt}\n\n<user-message>\n${escapedMessage}\n</user-message>`;
   // Never resume — each message is a fresh context. Resuming carries stale
   // page URLs and old navigation state that makes the agent fight the user.
@@ -607,6 +618,7 @@ function spawnClaude(userMessage: string, extensionUrl?: string | null, forTabId
     sessionId: sidebarSession?.claudeSessionId || null,
     pageUrl: pageUrl,
     tabId: agentTabId,
+    canary, // sidebar-agent scans all outbound channels for this token
   });
   try {
     fs.mkdirSync(gstackDir, { recursive: true, mode: 0o700 });