From 8e9ec52d6f30b4e9837ec71e0af0c258bcea4019 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Mon, 20 Apr 2026 04:55:23 +0800
Subject: [PATCH] feat(security): DeBERTa-v3 ensemble classifier (opt-in)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds ProtectAI DeBERTa-v3-base-injection-onnx as an optional L4c layer
for cross-model agreement. Different model family (DeBERTa-v3-base,
~184M params) than the default L4 TestSavantAI (BERT-small, ~30M params)
— when both fire together, that's a much stronger signal than either
alone.

Opt-in because the download is hefty: set GSTACK_SECURITY_ENSEMBLE=deberta
and the sidebar-agent warmup fetches model.onnx (721MB FP32) into
~/.gstack/models/deberta-v3-injection/ on first run. Subsequent runs are
cached.

Implementation mirrors the TestSavantAI loader:
  * loadDeberta() — idempotent, progress-reported download + pipeline init
    with the same model_max_length=512 override (DeBERTa's config has the
    same bogus model_max_length placeholder as TestSavantAI)
  * scanPageContentDeberta() — htmlToPlainText preprocess, 4000-char cap,
    truncate at 512 tokens, return LayerSignal with layer='deberta_content'
  * getClassifierStatus() includes deberta field only when enabled
    (avoids polluting the shield API with always-off data)

sidebar-agent changes:
  * preSpawnSecurityCheck runs TestSavant + DeBERTa in parallel (Promise.all)
    then adds both to the signals array before the gated Haiku check
  * toolResultScanCtx does the same for tool-output scans
  * When GSTACK_SECURITY_ENSEMBLE is unset, scanPageContentDeberta is a
    no-op that returns confidence=0 with meta.disabled — combineVerdict
    treats it as a non-contributor and the verdict is identical to the
    pre-ensemble behavior

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 browse/src/security-classifier.ts | 124 +++++++++++++++++++++++++++++-
 browse/src/sidebar-agent.ts       |  33 ++++++--
 2 files changed, 146 insertions(+), 11 deletions(-)

diff --git a/browse/src/security-classifier.ts b/browse/src/security-classifier.ts
index 62493e56..f934d5da 100644
--- a/browse/src/security-classifier.ts
+++ b/browse/src/security-classifier.ts
@@ -59,6 +59,31 @@ const TESTSAVANT_FILES = [
   'vocab.txt',
 ];
 
+// DeBERTa-v3 (ProtectAI) — OPT-IN ensemble layer. Adds architectural
+// diversity: TestSavantAI-small is BERT-small fine-tuned on injection +
+// jailbreak; DeBERTa-v3-base is a separate model family trained on its
+// own corpus. Agreement between the two is stronger evidence than either
+// alone.
+//
+// Size: model.onnx is 721MB (FP32). Users opt in via
+// GSTACK_SECURITY_ENSEMBLE=deberta. Not forced on every install because
+// most users won't need the higher recall and a 721MB download is a lot.
+const DEBERTA_DIR = path.join(MODELS_DIR, 'deberta-v3-injection'); // local staging directory for every DeBERTa file
+const DEBERTA_HF_URL = 'https://huggingface.co/protectai/deberta-v3-base-injection-onnx/resolve/main'; // base URL; file names are appended to it
+const DEBERTA_FILES = [ // small config/tokenizer files only; model.onnx is downloaded separately by ensureDebertaStaged
+  'config.json',
+  'tokenizer.json',
+  'tokenizer_config.json',
+  'special_tokens_map.json',
+  'spm.model',
+  'added_tokens.json',
+];
+
+function isDebertaEnabled(): boolean { // true when GSTACK_SECURITY_ENSEMBLE lists 'deberta'; the env var is re-read on every call
+  const setting = (process.env.GSTACK_SECURITY_ENSEMBLE ?? '').toLowerCase(); // unset env var becomes '' (disabled); match is case-insensitive
+  return setting.split(',').map(s => s.trim()).includes('deberta'); // value is a comma-separated list; whitespace around entries is ignored
+}
+
 // ─── Load state ──────────────────────────────────────────────
 
 type LoadState = 'uninitialized' | 'loading' | 'loaded' | 'failed';
@@ -67,9 +92,14 @@ let testsavantState: LoadState = 'uninitialized';
 let testsavantClassifier: any = null;
 let testsavantLoadError: string | null = null;
 
+let debertaState: LoadState = 'uninitialized'; // lifecycle of the DeBERTa layer; scans only run inference when this is 'loaded'
+let debertaClassifier: any = null; // text-classification pipeline instance, assigned by loadDeberta on success
+let debertaLoadError: string | null = null; // last error message, written on a failed load and on a failed inference call
+
 export interface ClassifierStatus {
   testsavant: 'ok' | 'degraded' | 'off';
   transcript: 'ok' | 'degraded' | 'off';
+  deberta?: 'ok' | 'degraded' | 'off'; // only present when ensemble enabled
 }
 
 export function getClassifierStatus(): ClassifierStatus {
@@ -77,11 +107,16 @@ export function getClassifierStatus(): ClassifierStatus {
     testsavantState === 'loaded' ? 'ok' :
     testsavantState === 'failed' ? 'degraded' :
     'off';
-  // Transcript classifier has no persistent load state — it spawns claude-haiku
-  // per-call. We report 'ok' if claude is on PATH (checked lazily on first call).
   const transcript = haikuAvailableCache === null ? 'off' :
     haikuAvailableCache ? 'ok' : 'degraded';
-  return { testsavant, transcript };
+  const status: ClassifierStatus = { testsavant, transcript };
+  if (isDebertaEnabled()) {
+    status.deberta =
+      debertaState === 'loaded' ? 'ok' :
+      debertaState === 'failed' ? 'degraded' :
+      'off';
+  }
+  return status;
 }
 
 // ─── Model download + staging ────────────────────────────────
@@ -245,6 +280,89 @@ export async function scanPageContent(text: string): Promise<LayerSignal> {
   }
 }
 
+// ─── L4c: DeBERTa-v3 ensemble (opt-in) ───────────────────────
+
+async function ensureDebertaStaged(onProgress?: (msg: string) => void): Promise<void> { // downloads any DeBERTa file missing from DEBERTA_DIR; onProgress gets one message per download
+  fs.mkdirSync(path.join(DEBERTA_DIR, 'onnx'), { recursive: true, mode: 0o700 }); // recursive: creates DEBERTA_DIR and its onnx/ subdirectory
+  for (const f of DEBERTA_FILES) {
+    const dst = path.join(DEBERTA_DIR, f);
+    if (fs.existsSync(dst)) continue; // already staged; existence is the only check made here (no size or hash validation)
+    onProgress?.(`deberta: downloading ${f}`);
+    await downloadFile(`${DEBERTA_HF_URL}/${f}`, dst); // awaited inside the loop, so files download one at a time
+  }
+  const modelDst = path.join(DEBERTA_DIR, 'onnx', 'model.onnx'); // stored under onnx/ even though it is fetched from the repo root URL; presumably the layout the pipeline loader expects — confirm
+  if (!fs.existsSync(modelDst)) {
+    onProgress?.('deberta: downloading model.onnx (721MB) — first run only');
+    await downloadFile(`${DEBERTA_HF_URL}/model.onnx`, modelDst);
+  }
+}
+
+let debertaLoadPromise: Promise<void> | null = null; // memoized load; assigned on the first enabled call and never cleared in this block
+export function loadDeberta(onProgress?: (msg: string) => void): Promise<void> { // stages files and builds the classifier; resolves even on failure (errors are recorded, not thrown)
+  if (!isDebertaEnabled()) return Promise.resolve(); // opt-in gate: nothing is downloaded or loaded when the ensemble is disabled
+  if (debertaState === 'loaded') return Promise.resolve();
+  if (debertaLoadPromise) return debertaLoadPromise; // concurrent callers share one load; NOTE(review): a failed load is returned here too, so it is not retried
+  debertaState = 'loading';
+  debertaLoadPromise = (async () => {
+    try {
+      await ensureDebertaStaged(onProgress); // fetch any missing files before the pipeline is created
+      onProgress?.('deberta: initializing classifier');
+      const { pipeline, env } = await import('@huggingface/transformers'); // dynamic import, evaluated only when this load actually runs
+      env.allowLocalModels = true;
+      env.allowRemoteModels = false; // resolve the model from local disk only; the files were staged above
+      env.localModelPath = MODELS_DIR; // the model id below presumably resolves to MODELS_DIR/deberta-v3-injection, i.e. DEBERTA_DIR — confirm
+      debertaClassifier = await pipeline(
+        'text-classification',
+        'deberta-v3-injection',
+        { dtype: 'fp32' },
+      );
+      const tok = debertaClassifier?.tokenizer as any;
+      if (tok?._tokenizerConfig) { // private field; guarded so its absence skips the override instead of throwing
+        tok._tokenizerConfig.model_max_length = 512; // replace the tokenizer's configured max length with 512
+      }
+      debertaState = 'loaded';
+    } catch (err: any) {
+      debertaState = 'failed'; // reported as 'degraded' by getClassifierStatus
+      debertaLoadError = err?.message ?? String(err);
+      console.error('[security-classifier] Failed to load DeBERTa-v3:', debertaLoadError);
+    }
+  })();
+  return debertaLoadPromise;
+}
+
+/**
+ * Scan text with the opt-in DeBERTa-v3 classifier. Always resolves (never throws) with
+ * layer='deberta_content'. confidence is the model score only for an 'INJECTION' label,
+ * otherwise 0; meta records why: disabled, degraded (not loaded / inference error), or label.
+ */
+export async function scanPageContentDeberta(text: string): Promise<LayerSignal> {
+  if (!isDebertaEnabled()) {
+    return { layer: 'deberta_content', confidence: 0, meta: { disabled: true } }; // no-op path; original note: combineVerdict treats this as safe
+  }
+  if (!text || text.length === 0) {
+    return { layer: 'deberta_content', confidence: 0 }; // nothing to classify
+  }
+  if (debertaState !== 'loaded') {
+    return { layer: 'deberta_content', confidence: 0, meta: { degraded: true } }; // still loading, never loaded, or failed: skip inference
+  }
+  try {
+    const plain = htmlToPlainText(text);
+    const input = plain.slice(0, 4000); // cap the classifier input at 4000 characters
+    const raw = await debertaClassifier(input);
+    const top = Array.isArray(raw) ? raw[0] : raw; // result may be an array or a single object; use the first entry
+    const label = top?.label ?? 'SAFE'; // a missing label is treated as 'SAFE'
+    const score = Number(top?.score ?? 0);
+    if (label === 'INJECTION') {
+      return { layer: 'deberta_content', confidence: score, meta: { label } };
+    }
+    return { layer: 'deberta_content', confidence: 0, meta: { label, safeScore: score } }; // non-injection label: score is kept only as meta.safeScore
+  } catch (err: any) {
+    debertaState = 'failed'; // NOTE(review): one inference error marks the layer failed, so later scans take the degraded path above
+    debertaLoadError = err?.message ?? String(err);
+    return { layer: 'deberta_content', confidence: 0, meta: { degraded: true, error: debertaLoadError } };
+  }
+}
+
 // ─── L4b: Claude Haiku transcript classifier ─────────────────
 
 /**
diff --git a/browse/src/sidebar-agent.ts b/browse/src/sidebar-agent.ts
index f5805b83..71bdc609 100644
--- a/browse/src/sidebar-agent.ts
+++ b/browse/src/sidebar-agent.ts
@@ -21,6 +21,7 @@ import {
 import {
   loadTestsavant, scanPageContent, checkTranscript,
   shouldRunTranscriptCheck, getClassifierStatus,
+  loadDeberta, scanPageContentDeberta,
   type ToolCallInput,
 } from './security-classifier';
 
@@ -457,11 +458,15 @@ async function preSpawnSecurityCheck(entry: QueueEntry): Promise<boolean> {
   if (!message || message.length === 0) return false;
   const tid = tabId ?? 0;
 
-  // L4: scan the user message for direct injection patterns
-  const contentSignal = await scanPageContent(message);
-  const signals: LayerSignal[] = [contentSignal];
+  // L4: scan the user message for direct injection patterns (TestSavantAI)
+  // L4c: also scan with DeBERTa-v3 when ensemble is enabled (opt-in)
+  const [contentSignal, debertaSignal] = await Promise.all([
+    scanPageContent(message),
+    scanPageContentDeberta(message),
+  ]);
+  const signals: LayerSignal[] = [contentSignal, debertaSignal];
 
-  // L4b: only bother with Haiku if L4 already lit up at >= LOG_ONLY.
+  // L4b: only bother with Haiku if another layer already lit up at >= LOG_ONLY.
   // Saves ~70% of Haiku calls per plan §E1 "gating optimization".
   if (shouldRunTranscriptCheck(signals)) {
     const transcriptSignal = await checkTranscript({
@@ -593,10 +598,16 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
     const toolResultScanCtx: ToolResultScanContext = {
       scan: async (toolName: string, text: string) => {
         if (toolResultBlockFired) return;
-        const contentSignal = await scanPageContent(text);
-        if (contentSignal.confidence < THRESHOLDS.WARN) return;
-        // Signal crossed WARN — see if ensemble upgrades to BLOCK.
-        const signals: LayerSignal[] = [contentSignal];
+        // Parallel L4 + L4c ensemble scan (DeBERTa no-op when disabled).
+        const [contentSignal, debertaSignal] = await Promise.all([
+          scanPageContent(text),
+          scanPageContentDeberta(text),
+        ]);
+        // Short-circuit if neither content layer crossed WARN — no point
+        // spinning up Haiku for a clean scan.
+        const maxContent = Math.max(contentSignal.confidence, debertaSignal.confidence);
+        if (maxContent < THRESHOLDS.WARN) return;
+        const signals: LayerSignal[] = [contentSignal, debertaSignal];
         if (shouldRunTranscriptCheck(signals)) {
           signals.push(await checkTranscript({
             user_message: queueEntry.message ?? '',
@@ -809,6 +820,12 @@ async function main() {
   console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
   console.log(`[sidebar-agent] Browse binary: ${B}`);
 
+  // If GSTACK_SECURITY_ENSEMBLE=deberta is set, also warm the DeBERTa-v3
+  // ensemble classifier. Fire-and-forget alongside TestSavantAI — they
+  // warm in parallel. No-op when the env var is unset.
+  loadDeberta((msg) => console.log(`[security-classifier] ${msg}`))
+    .catch((err) => console.warn('[sidebar-agent] DeBERTa warmup failed:', err?.message));
+
   // Warm up the ML classifier in the background. First call triggers a 112MB
   // download (~30s on average broadband). Non-blocking — the sidebar stays
   // functional on cold start; classifier just reports 'off' until warmed.