feat(security): wire canary injection into sidebar spawnClaude

Every sidebar message now gets a fresh CANARY-XXXXXXXXXXXX token embedded
in the system prompt with an instruction for Claude to never output it on
any channel. The token flows through the queue entry so sidebar-agent.ts
can check every outbound operation for leaks.

If Claude echoes the canary into any outbound channel (text stream, tool
arguments, URLs, file write paths), the sidebar-agent terminates the
session and the user sees the approved canary leak banner.

Generating the canary and injecting it into the system prompt is pure string
manipulation, so it is safe in the compiled browse binary. The actual
output-stream check (which also has to be safe in compiled contexts) lives
in sidebar-agent.ts (next commit).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-19 18:48:02 +08:00
parent 900cc0902b
commit d50cdc4611
+13 -1
View File
@@ -25,6 +25,7 @@ import {
runContentFilters, type ContentFilterResult, runContentFilters, type ContentFilterResult,
markHiddenElements, getCleanTextWithStripping, cleanupHiddenMarkers, markHiddenElements, getCleanTextWithStripping, cleanupHiddenMarkers,
} from './content-security'; } from './content-security';
import { generateCanary, injectCanary } from './security';
import { handleSnapshot, SNAPSHOT_FLAGS } from './snapshot'; import { handleSnapshot, SNAPSHOT_FLAGS } from './snapshot';
import { import {
initRegistry, validateToken as validateScopedToken, checkScope, checkDomain, initRegistry, validateToken as validateScopedToken, checkScope, checkDomain,
@@ -551,7 +552,13 @@ function spawnClaude(userMessage: string, extensionUrl?: string | null, forTabId
const escapeXml = (s: string) => s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;'); const escapeXml = (s: string) => s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
const escapedMessage = escapeXml(userMessage); const escapedMessage = escapeXml(userMessage);
const systemPrompt = [ // Fresh canary per message. The sidebar-agent checks every outbound channel
// (stream text, tool_use arguments, URLs, file writes) for this token.
// If Claude echoes it anywhere, that's evidence a prompt injection overrode
// the system prompt — session is killed, user sees the banner.
const canary = generateCanary();
const baseSystemPrompt = [
'<system>', '<system>',
`Browser co-pilot. Binary: ${B}`, `Browser co-pilot. Binary: ${B}`,
'Run `' + B + ' url` first to check the actual page. NEVER assume the URL.', 'Run `' + B + ' url` first to check the actual page. NEVER assume the URL.',
@@ -576,6 +583,10 @@ function spawnClaude(userMessage: string, extensionUrl?: string | null, forTabId
'</system>', '</system>',
].join('\n'); ].join('\n');
// Append the canary instruction. injectCanary() tells Claude never to
// output the token on any channel.
const systemPrompt = injectCanary(baseSystemPrompt, canary);
const prompt = `${systemPrompt}\n\n<user-message>\n${escapedMessage}\n</user-message>`; const prompt = `${systemPrompt}\n\n<user-message>\n${escapedMessage}\n</user-message>`;
// Never resume — each message is a fresh context. Resuming carries stale // Never resume — each message is a fresh context. Resuming carries stale
// page URLs and old navigation state that makes the agent fight the user. // page URLs and old navigation state that makes the agent fight the user.
@@ -607,6 +618,7 @@ function spawnClaude(userMessage: string, extensionUrl?: string | null, forTabId
sessionId: sidebarSession?.claudeSessionId || null, sessionId: sidebarSession?.claudeSessionId || null,
pageUrl: pageUrl, pageUrl: pageUrl,
tabId: agentTabId, tabId: agentTabId,
canary, // sidebar-agent scans all outbound channels for this token
}); });
try { try {
fs.mkdirSync(gstackDir, { recursive: true, mode: 0o700 }); fs.mkdirSync(gstackDir, { recursive: true, mode: 0o700 });