From b4e49d080d30d26658ab0ca3713a699a235ea0e5 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Mon, 20 Apr 2026 04:55:23 +0800
Subject: [PATCH] feat(security): 3-way ensemble verdict combiner with
 deberta_content layer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Updates combineVerdict to support a third ML signal layer (deberta_content)
for opt-in DeBERTa-v3 ensemble. Rule becomes:

  * Canary leak → BLOCK (unchanged, deterministic)
  * 2-of-N ML classifiers >= WARN → BLOCK (ensemble_agreement)
    - N = 2 when DeBERTa is disabled (testsavant + transcript)
    - N = 3 when DeBERTa is enabled (adds deberta)
  * Any single ML layer >= BLOCK without cross-confirm → WARN
    (single_layer_high)
  * Any single ML layer >= WARN without cross-confirm → WARN
    (single_layer_medium)
  * Any ML layer >= LOG_ONLY → LOG_ONLY
  * Otherwise → SAFE

Backward compatible: when the DeBERTa signal has confidence 0
(meta.disabled, or the signal is absent entirely), the combiner treats
it like any other low-confidence layer. The existing 2-of-2 ensemble
path still fires for testsavant + transcript.

BLOCK confidence reports the MIN across the layers at or above WARN —
the most conservative estimate of the agreed-upon signal strength,
rather than the max.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 browse/src/security.ts | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/browse/src/security.ts b/browse/src/security.ts
index 5d331c75..1cf0e916 100644
--- a/browse/src/security.ts
+++ b/browse/src/security.ts
@@ -42,6 +42,7 @@ export type Verdict = 'safe' | 'log_only' | 'warn' | 'block';
 
 export type LayerName =
   | 'testsavant_content'
+  | 'deberta_content'        // opt-in ensemble layer (GSTACK_SECURITY_ENSEMBLE=deberta)
   | 'transcript_classifier'
   | 'aria_regex'
   | 'canary';
@@ -88,6 +89,7 @@ export function combineVerdict(signals: LayerSignal[]): SecurityResult {
     byLayer[s.layer] = Math.max(byLayer[s.layer] ?? 0, s.confidence);
   }
   const content = byLayer.testsavant_content ?? 0;
+  const deberta = byLayer.deberta_content ?? 0;
   const transcript = byLayer.transcript_classifier ?? 0;
   const canary = byLayer.canary ?? 0;
 
@@ -101,44 +103,53 @@ export function combineVerdict(signals: LayerSignal[]): SecurityResult {
     };
   }
 
-  // Ensemble: both ML classifiers agree -> BLOCK.
-  if (content >= THRESHOLDS.WARN && transcript >= THRESHOLDS.WARN) {
+  // ML signals at >= WARN. Count how many agree.
+  const mlHighSignals = [content, deberta, transcript].filter(c => c >= THRESHOLDS.WARN);
+  const hasDebertaSignal = deberta > 0;
+
+  // Ensemble rule:
+  //   * 2-of-3 ML classifiers >= WARN → BLOCK when DeBERTa is in the pool
+  //   * 2-of-2 (content + transcript) >= WARN → BLOCK when DeBERTa disabled
+  // In both cases, cross-model agreement is what upgrades from WARN to BLOCK.
+  const requiredForBlock = hasDebertaSignal ? 2 : 2;
+  if (mlHighSignals.length >= requiredForBlock) {
     return {
       verdict: 'block',
       reason: 'ensemble_agreement',
       signals,
-      confidence: Math.min(content, transcript),
+      confidence: Math.min(...mlHighSignals),
     };
   }
 
   // Single layer >= BLOCK (no cross-confirm) degrades to WARN to avoid FPs.
-  if (content >= THRESHOLDS.BLOCK || transcript >= THRESHOLDS.BLOCK) {
+  const maxMl = Math.max(content, deberta, transcript);
+  if (maxMl >= THRESHOLDS.BLOCK) {
     return {
       verdict: 'warn',
       reason: 'single_layer_high',
       signals,
-      confidence: Math.max(content, transcript),
+      confidence: maxMl,
     };
   }
 
-  if (content >= THRESHOLDS.WARN || transcript >= THRESHOLDS.WARN) {
+  if (maxMl >= THRESHOLDS.WARN) {
     return {
       verdict: 'warn',
       reason: 'single_layer_medium',
       signals,
-      confidence: Math.max(content, transcript),
+      confidence: maxMl,
     };
   }
 
-  if (content >= THRESHOLDS.LOG_ONLY || transcript >= THRESHOLDS.LOG_ONLY) {
+  if (maxMl >= THRESHOLDS.LOG_ONLY) {
     return {
       verdict: 'log_only',
       signals,
-      confidence: Math.max(content, transcript),
+      confidence: maxMl,
     };
   }
 
-  return { verdict: 'safe', signals, confidence: Math.max(content, transcript) };
+  return { verdict: 'safe', signals, confidence: maxMl };
 }
 
 // ─── Canary (session-scoped secret token) ────────────────────