From 88b12c2b4c7550904d5c7132d328b1cca510ac5b Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Mon, 20 Apr 2026 11:07:18 +0800 Subject: [PATCH] fix(security): tool-output context allows single-layer BLOCK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit combineVerdict's 2-of-N ensemble rule was designed for user input — the Stack Overflow FP mitigation where a dev asking about injection shouldn't kill the session. For tool output (page content, Read/Grep results), the content wasn't user-authored, so that FP risk doesn't apply. Before this change: testsavant_content=0.99 on a hostile page downgraded to WARN when the transcript classifier degraded (timeout, Haiku unavailable) or voted differently. Add CombineVerdictOpts.toolOutput flag. When true, a single ML classifier >= BLOCK threshold blocks directly. User-input default path unchanged — still requires 2-of-N to block. Caller: sidebar-agent.ts tool-result scan now passes { toolOutput: true }. Co-Authored-By: Claude Opus 4.7 (1M context) --- browse/src/security.ts | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/browse/src/security.ts b/browse/src/security.ts index 121d435d..7dacaa6a 100644 --- a/browse/src/security.ts +++ b/browse/src/security.ts @@ -83,7 +83,17 @@ export interface StatusDetail { * Canary leak (confidence >= 1.0 on 'canary' layer) always BLOCKs — it's * deterministic, not a confidence signal. */ -export function combineVerdict(signals: LayerSignal[]): SecurityResult { +export interface CombineVerdictOpts { + /** + * When true, a single ML classifier at >= BLOCK threshold blocks even if + * no other classifier confirms. Used for tool-output scans where the + * content was not authored by the user, so the Stack-Overflow-FP risk + * that motivated the 2-of-N rule for user input doesn't apply. + */ + toolOutput?: boolean; +} + +export function combineVerdict(signals: LayerSignal[], opts: CombineVerdictOpts = {}): SecurityResult { const byLayer: Record = {}; for (const s of signals) { byLayer[s.layer] = Math.max(byLayer[s.layer] ?? 0, s.confidence); @@ -122,9 +132,23 @@ export function combineVerdict(signals: LayerSignal[]): SecurityResult { }; } - // Single layer >= BLOCK (no cross-confirm) degrades to WARN to avoid FPs. + // Single layer >= BLOCK (no cross-confirm). + // For user-input: degrade to WARN (Stack Overflow FP mitigation). + // For tool-output (opts.toolOutput): BLOCK directly — the content wasn't + // user-authored, so the "it might be a developer asking about injection" + // concern doesn't apply. The transcript classifier may have degraded + // (timeout, Haiku unavailable) and should not be a get-out-of-jail card + // for a hostile page. const maxMl = Math.max(content, deberta, transcript); if (maxMl >= THRESHOLDS.BLOCK) { + if (opts.toolOutput) { + return { + verdict: 'block', + reason: 'single_layer_tool_output', + signals, + confidence: maxMl, + }; + } return { verdict: 'warn', reason: 'single_layer_high',