feat(security): wait-for-decision instead of hard-kill on tool-output BLOCK

Before: a tool-output BLOCK triggered an immediate SIGTERM — the session died and the user was stranded. A false positive on benign content (e.g. HN comments discussing prompt injection) killed the session and lost the message. Now: a tool-output BLOCK emits a security_event with reviewable:true, suspected_text, and per-layer scores, then polls ~/.gstack/security/decisions/ for up to 60s. On "allow", the override is logged to attempts.jsonl as verdict=user_overrode and the session continues. On "block" or timeout, the session is killed as before. Canary leaks stay hard-stop (no review path). User-input pre-spawn scans are unchanged in this commit; only tool-output scans gain review. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-17 23:30:09 +02:00 · 2026-04-20 20:25:20 +08:00
parent 26fd1b2825
commit 13b8db9e7a
1 changed files with 66 additions and 3 deletions
@@ -16,6 +16,7 @@ import { safeUnlink } from './error-handling';
 import {
  checkCanaryInStructure, logAttempt, hashPayload, extractDomain,
  combineVerdict, writeSessionState, readSessionState, THRESHOLDS,
+  readDecision, clearDecision, excerptForReview,
  type LayerSignal,
 } from './security';
 import {
@@ -643,15 +644,26 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
        if (result.verdict !== 'block') return;
        toolResultBlockFired = true;
        const domain = extractDomain(pageUrl ?? '');
+        const payloadHash = hashPayload(text.slice(0, 4096));
+
+        // Log the initial 'block' verdict now — if the user overrides, a second
+        // 'user_overrode' entry is appended. attempts.jsonl is append-only so both survive.
        logAttempt({
          ts: new Date().toISOString(),
          urlDomain: domain,
-          payloadHash: hashPayload(text.slice(0, 4096)),
+          payloadHash,
          confidence: result.confidence,
          layer: 'testsavant_content',
          verdict: 'block',
        });
-        console.warn(`[sidebar-agent] Tool-result BLOCK on ${toolName} for tab ${tid} (confidence=${result.confidence.toFixed(3)})`);
+        console.warn(`[sidebar-agent] Tool-result BLOCK on ${toolName} for tab ${tid} (confidence=${result.confidence.toFixed(3)}) — awaiting user decision`);
+
+        // Surface a REVIEWABLE block event. Sidepanel renders the suspected
+        // text + layer scores + [Allow and continue] / [Block session] buttons.
+        // The user has 60s to decide; default is BLOCK (safe fallback).
+        const layerScores = signals
+          .filter((s) => s.confidence > 0)
+          .map((s) => ({ layer: s.layer, confidence: s.confidence }));
        await sendEvent({
          type: 'security_event',
          verdict: 'block',
@@ -660,10 +672,61 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
          confidence: result.confidence,
          domain,
          tool: toolName,
+          reviewable: true,
+          suspected_text: excerptForReview(text),
+          signals: layerScores,
        }, tid);
+
+        // Poll for the user's decision. Default to BLOCK on timeout.
+        const REVIEW_TIMEOUT_MS = 60_000;
+        const POLL_MS = 500;
+        clearDecision(tid); // clear any stale decision from a prior session
+        const deadline = Date.now() + REVIEW_TIMEOUT_MS;
+        let decision: 'allow' | 'block' = 'block';
+        let decisionReason = 'timeout';
+        while (Date.now() < deadline) {
+          const rec = readDecision(tid);
+          if (rec?.decision === 'allow' || rec?.decision === 'block') {
+            decision = rec.decision;
+            decisionReason = rec.reason ?? 'user';
+            break;
+          }
+          await new Promise((r) => setTimeout(r, POLL_MS));
+        }
+        clearDecision(tid);
+
+        if (decision === 'allow') {
+          // User overrode. Log the override so the audit trail captures it.
+          // toolResultBlockFired is reset to false further down, so each later
+          // tool result is scanned fresh — one override covers one BLOCK event.
+          logAttempt({
+            ts: new Date().toISOString(),
+            urlDomain: domain,
+            payloadHash,
+            confidence: result.confidence,
+            layer: 'testsavant_content',
+            verdict: 'user_overrode',
+          });
+          await sendEvent({
+            type: 'security_event',
+            verdict: 'user_overrode',
+            reason: 'tool_result_ml',
+            layer: 'testsavant_content',
+            confidence: result.confidence,
+            domain,
+            tool: toolName,
+          }, tid);
+          console.warn(`[sidebar-agent] Tab ${tid}: user overrode BLOCK — session continues`);
+          // Let the block stay consumed; reset the flag so subsequent tool
+          // results get scanned fresh.
+          toolResultBlockFired = false;
+          return;
+        }
+
+        // User chose BLOCK (or timed out). Kill the session as before.
        await sendEvent({
          type: 'agent_error',
-          error: `Session terminated — prompt injection detected in ${toolName} output`,
+          error: `Session terminated — prompt injection detected in ${toolName} output${decisionReason === 'timeout' ? ' (review timeout)' : ''}`,
        }, tid);
        try { proc.kill('SIGTERM'); } catch (err: any) { if (err?.code !== 'ESRCH') throw err; }
        setTimeout(() => {