fix(security): tool-output context allows single-layer BLOCK

combineVerdict's 2-of-N ensemble rule was designed for user input: it is
the Stack Overflow false-positive (FP) mitigation, so that a developer
asking about injection doesn't kill the session. For tool output (page
content, Read/Grep results), the content wasn't user-authored, so that FP
risk doesn't apply. Before this change, testsavant_content=0.99 on a
hostile page was downgraded to WARN whenever the transcript classifier
degraded (timeout, Haiku unavailable) or voted differently.

Add a CombineVerdictOpts.toolOutput flag. When true, a single ML
classifier at or above the BLOCK threshold blocks directly. The default
user-input path is unchanged — it still requires 2-of-N to block.

Caller: sidebar-agent.ts tool-result scan now passes { toolOutput: true }.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-20 11:07:18 +08:00
parent 407c36b48a
commit 88b12c2b4c
+26 -2
View File
@@ -83,7 +83,17 @@ export interface StatusDetail {
* Canary leak (confidence >= 1.0 on 'canary' layer) always BLOCKs — it's * Canary leak (confidence >= 1.0 on 'canary' layer) always BLOCKs — it's
* deterministic, not a confidence signal. * deterministic, not a confidence signal.
*/ */
export function combineVerdict(signals: LayerSignal[]): SecurityResult { export interface CombineVerdictOpts {
/**
* When true, a single ML classifier at >= BLOCK threshold blocks even if
* no other classifier confirms. Used for tool-output scans where the
* content was not authored by the user, so the Stack-Overflow-FP risk
* that motivated the 2-of-N rule for user input doesn't apply.
*/
toolOutput?: boolean;
}
export function combineVerdict(signals: LayerSignal[], opts: CombineVerdictOpts = {}): SecurityResult {
const byLayer: Record<string, number> = {}; const byLayer: Record<string, number> = {};
for (const s of signals) { for (const s of signals) {
byLayer[s.layer] = Math.max(byLayer[s.layer] ?? 0, s.confidence); byLayer[s.layer] = Math.max(byLayer[s.layer] ?? 0, s.confidence);
@@ -122,9 +132,23 @@ export function combineVerdict(signals: LayerSignal[]): SecurityResult {
}; };
} }
// Single layer >= BLOCK (no cross-confirm) degrades to WARN to avoid FPs. // Single layer >= BLOCK (no cross-confirm).
// For user-input: degrade to WARN (Stack Overflow FP mitigation).
// For tool-output (opts.toolOutput): BLOCK directly — the content wasn't
// user-authored, so the "it might be a developer asking about injection"
// concern doesn't apply. The transcript classifier may have degraded
// (timeout, Haiku unavailable) and should not be a get-out-of-jail card
// for a hostile page.
const maxMl = Math.max(content, deberta, transcript); const maxMl = Math.max(content, deberta, transcript);
if (maxMl >= THRESHOLDS.BLOCK) { if (maxMl >= THRESHOLDS.BLOCK) {
if (opts.toolOutput) {
return {
verdict: 'block',
reason: 'single_layer_tool_output',
signals,
confidence: maxMl,
};
}
return { return {
verdict: 'warn', verdict: 'warn',
reason: 'single_layer_high', reason: 'single_layer_high',