feat(security): canary leak check across all outbound channels

The sidebar-agent now scans every Claude stream event for the session's canary token before relaying any data to the sidepanel.

Channels covered (per CEO review cross-model tension #2):
  * Assistant text blocks
  * Assistant text_delta streaming
  * tool_use arguments (recursively, via checkCanaryInStructure — catches URLs, commands, file paths nested at any depth)
  * tool_use content_block_start
  * tool_input_delta partial JSON
  * Final result payload

If the canary leaks on any channel, onCanaryLeaked() fires once per session:
  1. logAttempt() writes the event to ~/.gstack/security/attempts.jsonl with the canary's salted hash (never the payload content).
  2. Sends a `security_event` to the sidepanel so it can render the approved canary-leak banner (variant A mockup — ceo-plan 2026-04-19).
  3. Sends an `agent_error` for backward compatibility with existing error surfaces.
  4. SIGTERMs the claude subprocess (SIGKILL after 2s if still alive).

The leaked content itself is never relayed to the sidepanel — the event is dropped at the boundary.

Canary detection is a pure-string substring match, so this all runs safely in the sidebar-agent (non-compiled bun) context.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-08-03 04:48:42 +02:00 · 2026-04-19 18:51:18 +08:00
parent 65bf4514b8
commit 2137417f63
1 changed files with 130 additions and 4 deletions
@@ -13,6 +13,7 @@ import { spawn } from 'child_process';
 import * as fs from 'fs';
 import * as path from 'path';
 import { safeUnlink } from './error-handling';
+import { checkCanaryInStructure, logAttempt, hashPayload, extractDomain } from './security';

 const QUEUE = process.env.SIDEBAR_QUEUE_PATH || path.join(process.env.HOME || '/tmp', '.gstack', 'sidebar-agent-queue.jsonl');
 const KILL_FILE = path.join(path.dirname(QUEUE), 'sidebar-agent-kill');
@@ -36,6 +37,7 @@ interface QueueEntry {
  pageUrl?: string | null;
  sessionId?: string | null;
  ts?: string;
+  canary?: string; // session-scoped token; leak = prompt injection evidence
 }

 function isValidQueueEntry(e: unknown): e is QueueEntry {
@@ -55,6 +57,7 @@ function isValidQueueEntry(e: unknown): e is QueueEntry {
  if (obj.message !== undefined && obj.message !== null && typeof obj.message !== 'string') return false;
  if (obj.pageUrl !== undefined && obj.pageUrl !== null && typeof obj.pageUrl !== 'string') return false;
  if (obj.sessionId !== undefined && obj.sessionId !== null && typeof obj.sessionId !== 'string') return false;
+  if (obj.canary !== undefined && typeof obj.canary !== 'string') return false;
  return true;
 }

@@ -228,7 +231,63 @@ function summarizeToolInput(tool: string, input: any): string {
  return describeToolCall(tool, input);
 }

-async function handleStreamEvent(event: any, tabId?: number): Promise<void> {
+/**
+ * Scan one Claude stream event for the session canary.
+ *
+ * Channels checked:
+ *  - `assistant` messages: text blocks (substring match) and tool_use inputs
+ *    (via checkCanaryInStructure); string-valued content is matched directly
+ *  - `content_block_start` for a tool_use block: its input
+ *  - `content_block_delta`: `text_delta` text and `input_json_delta` partial JSON
+ *  - `result`: the final result string
+ *
+ * Malformed events (non-object event, non-array content, null blocks) are
+ * reported as clean instead of throwing, so one bad line cannot abort the
+ * caller with a TypeError.
+ *
+ * NOTE(review): each event is matched on its own. A canary split across two
+ * consecutive deltas is not caught here — presumably the complete `assistant`
+ * message catches it later; confirm, or keep a rolling buffer in the caller.
+ *
+ * @param event  Parsed stream event; its shape is not trusted.
+ * @param canary Session canary token; an empty value disables the scan.
+ * @returns Label of the leaking channel (`assistant_text`, `tool_use:<name>`,
+ *          `text_delta`, `tool_input_delta` or `result`), or null when clean.
+ */
+function detectCanaryLeak(event: any, canary: string): string | null {
+  if (!canary || !event || typeof event !== 'object') return null;
+
+  // True only for strings that contain the canary verbatim.
+  const leaksIn = (value: unknown): boolean =>
+    typeof value === 'string' && value.includes(canary);
+
+  switch (event.type) {
+    case 'assistant': {
+      const content = event.message?.content;
+      // Content may arrive as a bare string rather than a block list.
+      if (leaksIn(content)) return 'assistant_text';
+      if (!Array.isArray(content)) return null;
+      for (const block of content) {
+        if (leaksIn(block)) return 'assistant_text';
+        if (!block || typeof block !== 'object') continue;
+        if (block.type === 'text' && leaksIn(block.text)) return 'assistant_text';
+        if (block.type === 'tool_use' && checkCanaryInStructure(block.input, canary)) {
+          return `tool_use:${block.name}`;
+        }
+      }
+      return null;
+    }
+    case 'content_block_start': {
+      const block = event.content_block;
+      if (block?.type === 'tool_use' && checkCanaryInStructure(block.input, canary)) {
+        return `tool_use:${block.name}`;
+      }
+      return null;
+    }
+    case 'content_block_delta': {
+      const delta = event.delta;
+      if (delta?.type === 'text_delta' && leaksIn(delta.text)) return 'text_delta';
+      if (delta?.type === 'input_json_delta' && leaksIn(delta.partial_json)) {
+        return 'tool_input_delta';
+      }
+      return null;
+    }
+    case 'result':
+      return leaksIn(event.result) ? 'result' : null;
+    default:
+      return null;
+  }
+}
+
+interface CanaryContext { // per-session canary state passed into handleStreamEvent()
+  canary: string; // token that detectCanaryLeak() searches each stream event for
+  pageUrl: string; // page URL of the session; askClaude() stores '' when the queue entry has none
+  onLeak: (channel: string) => void; // invoked with the leaking channel label; the event is then dropped
+}
+
+async function handleStreamEvent(event: any, tabId?: number, canaryCtx?: CanaryContext): Promise<void> {
+  // Canary check runs BEFORE any outbound send — we never want to relay
+  // a leaked token to the sidepanel UI.
+  if (canaryCtx) {
+    const channel = detectCanaryLeak(event, canaryCtx.canary);
+    if (channel) {
+      canaryCtx.onLeak(channel);
+      return; // drop the event — never relay content that leaked the canary
+    }
+  }
+
  if (event.type === 'system' && event.session_id) {
    // Relay claude session ID for --resume support
    await sendEvent({ type: 'system', claudeSessionId: event.session_id }, tabId);
@@ -267,14 +326,62 @@ async function handleStreamEvent(event: any, tabId?: number): Promise<void> {
  }
 }

+/**
+ * Report a detected canary leak for one tab.
+ *
+ * Steps, each guarded separately so one failure cannot suppress the others:
+ *  1. logAttempt() — local record with the page domain and a hash of the
+ *     canary (never the leaked content).
+ *  2. `security_event` (verdict `block`, reason `canary_leaked`) to the
+ *     sidepanel.
+ *  3. `agent_error` for clients that only know the older error surface.
+ *
+ * Logging and send failures are caught and written to the console: the
+ * caller invokes this without awaiting it, so a rejection here would
+ * otherwise go unhandled. Killing the claude subprocess is left to the caller.
+ *
+ * NOTE(review): extractDomain() is assumed not to throw on malformed URLs —
+ * confirm against ./security.
+ *
+ * @param params.tabId   Tab whose session leaked the canary.
+ * @param params.channel Channel label on which the leak was seen.
+ * @param params.canary  Session canary token; only its hash is logged.
+ * @param params.pageUrl Page URL; only its domain is logged and sent.
+ */
+async function onCanaryLeaked(params: {
+  tabId: number;
+  channel: string;
+  canary: string;
+  pageUrl: string;
+}): Promise<void> {
+  const { tabId, channel, canary, pageUrl } = params;
+  const domain = extractDomain(pageUrl);
+  console.warn(`[sidebar-agent] CANARY LEAK detected on ${channel} for tab ${tabId} (domain=${domain || 'unknown'})`);
+
+  // Log a failed reporting step without aborting the remaining ones.
+  const warnFailure = (step: string, err: unknown): void => {
+    const detail = err instanceof Error ? err.message : String(err);
+    console.error(`[sidebar-agent] Tab ${tabId}: canary ${step} failed:`, detail);
+  };
+
+  // Local log — salted hash + domain only, never the payload
+  try {
+    logAttempt({
+      ts: new Date().toISOString(),
+      urlDomain: domain,
+      payloadHash: hashPayload(canary), // hash the canary, not the payload (which might be leaked content)
+      confidence: 1.0,
+      layer: 'canary',
+      verdict: 'block',
+    });
+  } catch (err: unknown) {
+    warnFailure('logAttempt', err);
+  }
+
+  // Broadcast to sidepanel so it can render the approved banner
+  try {
+    await sendEvent({
+      type: 'security_event',
+      verdict: 'block',
+      reason: 'canary_leaked',
+      layer: 'canary',
+      channel,
+      domain,
+    }, tabId);
+  } catch (err: unknown) {
+    warnFailure('security_event send', err);
+  }
+
+  // Also emit agent_error so the sidepanel's existing error surface
+  // reflects that the session terminated. Keeps old clients working.
+  try {
+    await sendEvent({
+      type: 'agent_error',
+      error: `Session terminated — prompt injection detected${domain ? ` from ${domain}` : ''}`,
+    }, tabId);
+  } catch (err: unknown) {
+    warnFailure('agent_error send', err);
+  }
+}
+
 async function askClaude(queueEntry: QueueEntry): Promise<void> {
-  const { prompt, args, stateFile, cwd, tabId } = queueEntry;
+  const { prompt, args, stateFile, cwd, tabId, canary, pageUrl } = queueEntry;
  const tid = tabId ?? 0;

  processingTabs.add(tid);
  await sendEvent({ type: 'agent_start' }, tid);

  return new Promise((resolve) => {
+    // Canary context is set after proc is spawned (needs proc reference for kill).
+    let canaryCtx: CanaryContext | undefined;
+    let canaryTriggered = false;
+
    // Use args from queue entry (server sets --model, --allowedTools, prompt framing).
    // Fall back to defaults only if queue entry has no args (backward compat).
    // Write doesn't expand attack surface beyond what Bash already provides.
@@ -317,6 +424,25 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {

    proc.stdin.end();

+    // Now that proc exists, set up the canary-leak handler. It fires at most
+    // once; on fire we kill the subprocess, emit security_event + agent_error,
+    // and let the normal close handler resolve the promise.
+    if (canary) {
+      canaryCtx = {
+        canary,
+        pageUrl: pageUrl ?? '',
+        onLeak: (channel: string) => {
+          if (canaryTriggered) return;
+          canaryTriggered = true;
+          onCanaryLeaked({ tabId: tid, channel, canary, pageUrl: pageUrl ?? '' });
+          try { proc.kill('SIGTERM'); } catch (err: any) { if (err?.code !== 'ESRCH') throw err; }
+          setTimeout(() => {
+            try { proc.kill('SIGKILL'); } catch (err: any) { if (err?.code !== 'ESRCH') throw err; }
+          }, 2000);
+        },
+      };
+    }
+
    // Poll for per-tab cancel signal from server's killAgent()
    const cancelCheck = setInterval(() => {
      try {
@@ -338,7 +464,7 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
      buffer = lines.pop() || '';
      for (const line of lines) {
        if (!line.trim()) continue;
-        try { handleStreamEvent(JSON.parse(line), tid); } catch (err: any) {
+        try { handleStreamEvent(JSON.parse(line), tid, canaryCtx); } catch (err: any) {
          console.error(`[sidebar-agent] Tab ${tid}: Failed to parse stream line:`, line.slice(0, 100), err.message);
        }
      }
@@ -354,7 +480,7 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
      activeProc = null;
      activeProcs.delete(tid);
      if (buffer.trim()) {
-        try { handleStreamEvent(JSON.parse(buffer), tid); } catch (err: any) {
+        try { handleStreamEvent(JSON.parse(buffer), tid, canaryCtx); } catch (err: any) {
          console.error(`[sidebar-agent] Tab ${tid}: Failed to parse final buffer:`, buffer.slice(0, 100), err.message);
        }
      }