mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
fix(security): tool-output context allows single-layer BLOCK
combineVerdict's 2-of-N ensemble rule was designed for user input —
the Stack Overflow FP mitigation where a dev asking about injection
shouldn't kill the session. For tool output (page content, Read/Grep
results), the content wasn't user-authored, so that FP risk doesn't
apply. Before this change: testsavant_content=0.99 on a hostile page
was downgraded to WARN when the transcript classifier degraded (timeout,
Haiku unavailable) or voted differently.
Add CombineVerdictOpts.toolOutput flag. When true, a single ML
classifier >= BLOCK threshold blocks directly. User-input default
path unchanged — still requires 2-of-N to block.
Caller: sidebar-agent.ts tool-result scan now passes { toolOutput: true }.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+26
-2
@@ -83,7 +83,17 @@ export interface StatusDetail {
|
|||||||
* Canary leak (confidence >= 1.0 on 'canary' layer) always BLOCKs — it's
|
* Canary leak (confidence >= 1.0 on 'canary' layer) always BLOCKs — it's
|
||||||
* deterministic, not a confidence signal.
|
* deterministic, not a confidence signal.
|
||||||
*/
|
*/
|
||||||
export function combineVerdict(signals: LayerSignal[]): SecurityResult {
|
/**
 * Options controlling how {@link combineVerdict} escalates single-classifier
 * hits. Defaults (all flags unset) preserve the original user-input behavior:
 * a lone ML classifier at BLOCK threshold degrades to WARN.
 */
export interface CombineVerdictOpts {
  /**
   * When true, a single ML classifier at >= BLOCK threshold blocks even if
   * no other classifier confirms. Used for tool-output scans where the
   * content was not authored by the user, so the Stack-Overflow-FP risk
   * that motivated the 2-of-N rule for user input doesn't apply.
   */
  toolOutput?: boolean;
}
|
||||||
|
|
||||||
|
export function combineVerdict(signals: LayerSignal[], opts: CombineVerdictOpts = {}): SecurityResult {
|
||||||
const byLayer: Record<string, number> = {};
|
const byLayer: Record<string, number> = {};
|
||||||
for (const s of signals) {
|
for (const s of signals) {
|
||||||
byLayer[s.layer] = Math.max(byLayer[s.layer] ?? 0, s.confidence);
|
byLayer[s.layer] = Math.max(byLayer[s.layer] ?? 0, s.confidence);
|
||||||
@@ -122,9 +132,23 @@ export function combineVerdict(signals: LayerSignal[]): SecurityResult {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Single layer >= BLOCK (no cross-confirm) degrades to WARN to avoid FPs.
|
// Single layer >= BLOCK (no cross-confirm).
|
||||||
|
// For user-input: degrade to WARN (Stack Overflow FP mitigation).
|
||||||
|
// For tool-output (opts.toolOutput): BLOCK directly — the content wasn't
|
||||||
|
// user-authored, so the "it might be a developer asking about injection"
|
||||||
|
// concern doesn't apply. The transcript classifier may have degraded
|
||||||
|
// (timeout, Haiku unavailable) and should not be a get-out-of-jail card
|
||||||
|
// for a hostile page.
|
||||||
const maxMl = Math.max(content, deberta, transcript);
|
const maxMl = Math.max(content, deberta, transcript);
|
||||||
if (maxMl >= THRESHOLDS.BLOCK) {
|
if (maxMl >= THRESHOLDS.BLOCK) {
|
||||||
|
if (opts.toolOutput) {
|
||||||
|
return {
|
||||||
|
verdict: 'block',
|
||||||
|
reason: 'single_layer_tool_output',
|
||||||
|
signals,
|
||||||
|
confidence: maxMl,
|
||||||
|
};
|
||||||
|
}
|
||||||
return {
|
return {
|
||||||
verdict: 'warn',
|
verdict: 'warn',
|
||||||
reason: 'single_layer_high',
|
reason: 'single_layer_high',
|
||||||
|
|||||||
Reference in New Issue
Block a user