feat(security): DeBERTa-v3 ensemble classifier (opt-in)

Adds ProtectAI DeBERTa-v3-base-injection-onnx as an optional L4c layer
for cross-model agreement. It's a different model family (DeBERTa-v3-base,
~184M params) from the default L4 TestSavantAI layer (BERT-small, ~30M
params), so when both fire together that's a much stronger signal than
either alone.

Opt-in because the download is hefty: set GSTACK_SECURITY_ENSEMBLE=deberta
and the sidebar-agent warmup fetches the tokenizer files plus model.onnx
(721MB FP32) into ~/.gstack/models/deberta-v3-injection/ on first run;
subsequent runs reuse the cached files.
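
The cache layout after a successful warmup (a sketch based on
ensureDebertaStaged() below):

  ~/.gstack/models/deberta-v3-injection/
    config.json
    tokenizer.json
    tokenizer_config.json
    special_tokens_map.json
    spm.model
    added_tokens.json
    onnx/
      model.onnx        (721MB, FP32)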

Implementation mirrors the TestSavantAI loader:
  * loadDeberta() — idempotent, progress-reported download + pipeline init
    with the same model_max_length=512 override (DeBERTa's tokenizer config
    ships the same bogus model_max_length placeholder as TestSavantAI's)
  * scanPageContentDeberta() — htmlToPlainText preprocess, 4000-char cap,
    512-token truncation, returns a LayerSignal with layer='deberta_content'
    (the possible signal shapes are summarized after this list)
  * getClassifierStatus() includes deberta field only when enabled
    (avoids polluting the shield API with always-off data)
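
For reference, the signal shapes scanPageContentDeberta() produces (abridged
from the implementation below):

  ensemble disabled:   { layer: 'deberta_content', confidence: 0,     meta: { disabled: true } }
  model not loaded:    { layer: 'deberta_content', confidence: 0,     meta: { degraded: true } }
  SAFE top label:      { layer: 'deberta_content', confidence: 0,     meta: { label, safeScore } }
  INJECTION top label: { layer: 'deberta_content', confidence: score, meta: { label } }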

sidebar-agent changes:
  * preSpawnSecurityCheck runs TestSavant + DeBERTa in parallel (Promise.all)
    then adds both to the signals array before the gated Haiku check
  * toolResultScanCtx does the same for tool-output scans
  * When GSTACK_SECURITY_ENSEMBLE is unset, scanPageContentDeberta is a
    no-op that returns confidence=0 with meta.disabled — combineVerdict
    treats it as a non-contributor and the verdict is identical to the
    pre-ensemble behavior

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2 changed files with 146 additions and 11 deletions

+121 -3
@@ -59,6 +59,31 @@ const TESTSAVANT_FILES = [
'vocab.txt',
];
// DeBERTa-v3 (ProtectAI) — OPT-IN ensemble layer. Adds architectural
// diversity: TestSavantAI-small is BERT-small fine-tuned on injection +
// jailbreak; DeBERTa-v3-base is a separate model family trained on its
// own corpus. Agreement between the two is stronger evidence than either
// alone.
//
// Size: model.onnx is 721MB (FP32). Users opt in via
// GSTACK_SECURITY_ENSEMBLE=deberta. Not forced on every install because
// most users won't need the higher recall, and a 721MB download is a lot.
const DEBERTA_DIR = path.join(MODELS_DIR, 'deberta-v3-injection');
const DEBERTA_HF_URL = 'https://huggingface.co/protectai/deberta-v3-base-injection-onnx/resolve/main';
const DEBERTA_FILES = [
'config.json',
'tokenizer.json',
'tokenizer_config.json',
'special_tokens_map.json',
'spm.model',
'added_tokens.json',
];
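// Note: model.onnx is not listed above; ensureDebertaStaged() stages it
// separately under onnx/, which is where the transformers.js pipeline
// resolves the FP32 weights once localModelPath points at MODELS_DIR.

// GSTACK_SECURITY_ENSEMBLE is parsed as a comma-separated list, so additional
// ensemble members could be enabled alongside 'deberta' later on.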
function isDebertaEnabled(): boolean {
const setting = (process.env.GSTACK_SECURITY_ENSEMBLE ?? '').toLowerCase();
return setting.split(',').map(s => s.trim()).includes('deberta');
}
// ─── Load state ──────────────────────────────────────────────
type LoadState = 'uninitialized' | 'loading' | 'loaded' | 'failed';
@@ -67,9 +92,14 @@ let testsavantState: LoadState = 'uninitialized';
let testsavantClassifier: any = null;
let testsavantLoadError: string | null = null;
let debertaState: LoadState = 'uninitialized';
let debertaClassifier: any = null;
let debertaLoadError: string | null = null;
export interface ClassifierStatus {
testsavant: 'ok' | 'degraded' | 'off';
transcript: 'ok' | 'degraded' | 'off';
deberta?: 'ok' | 'degraded' | 'off'; // only present when ensemble enabled
}
export function getClassifierStatus(): ClassifierStatus {
@@ -77,11 +107,16 @@ export function getClassifierStatus(): ClassifierStatus {
testsavantState === 'loaded' ? 'ok' :
testsavantState === 'failed' ? 'degraded' :
'off';
// Transcript classifier has no persistent load state — it spawns claude-haiku
// per-call. We report 'ok' if claude is on PATH (checked lazily on first call).
const transcript = haikuAvailableCache === null ? 'off' :
haikuAvailableCache ? 'ok' : 'degraded';
return { testsavant, transcript };
const status: ClassifierStatus = { testsavant, transcript };
if (isDebertaEnabled()) {
status.deberta =
debertaState === 'loaded' ? 'ok' :
debertaState === 'failed' ? 'degraded' :
'off';
}
return status;
}
// ─── Model download + staging ────────────────────────────────
@@ -245,6 +280,89 @@ export async function scanPageContent(text: string): Promise<LayerSignal> {
}
}
// ─── L4c: DeBERTa-v3 ensemble (opt-in) ───────────────────────
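// Staging only downloads files that are not already on disk, so repeat runs
// are cheap; directories are created 0o700 to keep the model cache private
// to the user.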
async function ensureDebertaStaged(onProgress?: (msg: string) => void): Promise<void> {
fs.mkdirSync(path.join(DEBERTA_DIR, 'onnx'), { recursive: true, mode: 0o700 });
for (const f of DEBERTA_FILES) {
const dst = path.join(DEBERTA_DIR, f);
if (fs.existsSync(dst)) continue;
onProgress?.(`deberta: downloading ${f}`);
await downloadFile(`${DEBERTA_HF_URL}/${f}`, dst);
}
const modelDst = path.join(DEBERTA_DIR, 'onnx', 'model.onnx');
if (!fs.existsSync(modelDst)) {
onProgress?.('deberta: downloading model.onnx (721MB) — first run only');
await downloadFile(`${DEBERTA_HF_URL}/model.onnx`, modelDst);
}
}
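// A single in-flight promise memoizes concurrent callers: a second
// loadDeberta() call during warmup awaits the same download instead of
// kicking off another one.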
let debertaLoadPromise: Promise<void> | null = null;
export function loadDeberta(onProgress?: (msg: string) => void): Promise<void> {
if (!isDebertaEnabled()) return Promise.resolve();
if (debertaState === 'loaded') return Promise.resolve();
if (debertaLoadPromise) return debertaLoadPromise;
debertaState = 'loading';
debertaLoadPromise = (async () => {
try {
await ensureDebertaStaged(onProgress);
onProgress?.('deberta: initializing classifier');
const { pipeline, env } = await import('@huggingface/transformers');
env.allowLocalModels = true;
env.allowRemoteModels = false;
env.localModelPath = MODELS_DIR;
debertaClassifier = await pipeline(
'text-classification',
'deberta-v3-injection',
{ dtype: 'fp32' },
);
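// Same workaround as the TestSavantAI loader: the shipped tokenizer config
// carries a placeholder model_max_length, so clamp it to the real 512-token
// window to get sane truncation.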
const tok = debertaClassifier?.tokenizer as any;
if (tok?._tokenizerConfig) {
tok._tokenizerConfig.model_max_length = 512;
}
debertaState = 'loaded';
} catch (err: any) {
debertaState = 'failed';
debertaLoadError = err?.message ?? String(err);
console.error('[security-classifier] Failed to load DeBERTa-v3:', debertaLoadError);
}
})();
return debertaLoadPromise;
}
/**
* Scan text with the DeBERTa-v3 ensemble classifier. Returns a LayerSignal
* with layer='deberta_content'. No-op when ensemble is disabled — returns
* confidence=0 with meta.disabled=true so combineVerdict treats it as safe.
*/
export async function scanPageContentDeberta(text: string): Promise<LayerSignal> {
if (!isDebertaEnabled()) {
return { layer: 'deberta_content', confidence: 0, meta: { disabled: true } };
}
if (!text || text.length === 0) {
return { layer: 'deberta_content', confidence: 0 };
}
if (debertaState !== 'loaded') {
return { layer: 'deberta_content', confidence: 0, meta: { degraded: true } };
}
try {
const plain = htmlToPlainText(text);
const input = plain.slice(0, 4000);
const raw = await debertaClassifier(input);
const top = Array.isArray(raw) ? raw[0] : raw;
const label = top?.label ?? 'SAFE';
const score = Number(top?.score ?? 0);
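// The ProtectAI classifier emits SAFE / INJECTION labels; only an INJECTION
// top label contributes confidence, a SAFE result maps to confidence 0.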
if (label === 'INJECTION') {
return { layer: 'deberta_content', confidence: score, meta: { label } };
}
return { layer: 'deberta_content', confidence: 0, meta: { label, safeScore: score } };
} catch (err: any) {
debertaState = 'failed';
debertaLoadError = err?.message ?? String(err);
return { layer: 'deberta_content', confidence: 0, meta: { degraded: true, error: debertaLoadError } };
}
}
// ─── L4b: Claude Haiku transcript classifier ─────────────────
/**
+25 -8
@@ -21,6 +21,7 @@ import {
import {
loadTestsavant, scanPageContent, checkTranscript,
shouldRunTranscriptCheck, getClassifierStatus,
loadDeberta, scanPageContentDeberta,
type ToolCallInput,
} from './security-classifier';
@@ -457,11 +458,15 @@ async function preSpawnSecurityCheck(entry: QueueEntry): Promise<boolean> {
if (!message || message.length === 0) return false;
const tid = tabId ?? 0;
- // L4: scan the user message for direct injection patterns
- const contentSignal = await scanPageContent(message);
- const signals: LayerSignal[] = [contentSignal];
+ // L4: scan the user message for direct injection patterns (TestSavantAI)
+ // L4c: also scan with DeBERTa-v3 when ensemble is enabled (opt-in)
+ const [contentSignal, debertaSignal] = await Promise.all([
+   scanPageContent(message),
+   scanPageContentDeberta(message),
+ ]);
+ const signals: LayerSignal[] = [contentSignal, debertaSignal];
- // L4b: only bother with Haiku if L4 already lit up at >= LOG_ONLY.
+ // L4b: only bother with Haiku if another layer already lit up at >= LOG_ONLY.
// Saves ~70% of Haiku calls per plan §E1 "gating optimization".
if (shouldRunTranscriptCheck(signals)) {
const transcriptSignal = await checkTranscript({
@@ -593,10 +598,16 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
const toolResultScanCtx: ToolResultScanContext = {
scan: async (toolName: string, text: string) => {
if (toolResultBlockFired) return;
- const contentSignal = await scanPageContent(text);
- if (contentSignal.confidence < THRESHOLDS.WARN) return;
- // Signal crossed WARN — see if ensemble upgrades to BLOCK.
- const signals: LayerSignal[] = [contentSignal];
+ // Parallel L4 + L4c ensemble scan (DeBERTa no-op when disabled).
+ const [contentSignal, debertaSignal] = await Promise.all([
+   scanPageContent(text),
+   scanPageContentDeberta(text),
+ ]);
+ // Short-circuit if neither content layer crossed WARN — no point
+ // spinning up Haiku for a clean scan.
+ const maxContent = Math.max(contentSignal.confidence, debertaSignal.confidence);
+ if (maxContent < THRESHOLDS.WARN) return;
+ const signals: LayerSignal[] = [contentSignal, debertaSignal];
if (shouldRunTranscriptCheck(signals)) {
signals.push(await checkTranscript({
user_message: queueEntry.message ?? '',
@@ -809,6 +820,12 @@ async function main() {
console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
console.log(`[sidebar-agent] Browse binary: ${B}`);
// If GSTACK_SECURITY_ENSEMBLE=deberta is set, also warm the DeBERTa-v3
// ensemble classifier. Fire-and-forget alongside TestSavantAI — they
// warm in parallel. No-op when the env var is unset.
loadDeberta((msg) => console.log(`[security-classifier] ${msg}`))
.catch((err) => console.warn('[sidebar-agent] DeBERTa warmup failed:', err?.message));
// Warm up the ML classifier in the background. First call triggers a 112MB
// download (~30s on average broadband). Non-blocking — the sidebar stays
// functional on cold start; classifier just reports 'off' until warmed.