mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
feat(security): DeBERTa-v3 ensemble classifier (opt-in)
Adds ProtectAI DeBERTa-v3-base-injection-onnx as an optional L4c layer
for cross-model agreement. Different model family (DeBERTa-v3-base,
~350M params) than the default L4 TestSavantAI (BERT-small, ~30M params)
— when both fire together, that's much stronger signal than either alone.
Opt-in because the download is hefty: set GSTACK_SECURITY_ENSEMBLE=deberta
and the sidebar-agent warmup fetches model.onnx (721MB FP32) into
~/.gstack/models/deberta-v3-injection/ on first run. Subsequent runs are
cached.
Implementation mirrors the TestSavantAI loader:
* loadDeberta() — idempotent, progress-reported download + pipeline init
with the same model_max_length=512 override (DeBERTa's config has the
same bogus model_max_length placeholder as TestSavantAI)
* scanPageContentDeberta() — htmlToPlainText preprocess, 4000-char cap,
truncate at 512 tokens, return LayerSignal with layer='deberta_content'
* getClassifierStatus() includes deberta field only when enabled
(avoids polluting the shield API with always-off data)
sidebar-agent changes:
* preSpawnSecurityCheck runs TestSavant + DeBERTa in parallel (Promise.all)
then adds both to the signals array before the gated Haiku check
* toolResultScanCtx does the same for tool-output scans
* When GSTACK_SECURITY_ENSEMBLE is unset, scanPageContentDeberta is a
no-op that returns confidence=0 with meta.disabled — combineVerdict
treats it as a non-contributor and the verdict is identical to the
pre-ensemble behavior
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -59,6 +59,31 @@ const TESTSAVANT_FILES = [
|
|||||||
'vocab.txt',
|
'vocab.txt',
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// DeBERTa-v3 (ProtectAI) — OPT-IN ensemble layer. Adds architectural
|
||||||
|
// diversity: TestSavantAI-small is BERT-small fine-tuned on injection +
|
||||||
|
// jailbreak; DeBERTa-v3-base is a separate model family trained on its
|
||||||
|
// own corpus. Agreement between the two is stronger evidence than either
|
||||||
|
// alone.
|
||||||
|
//
|
||||||
|
// Size: model.onnx is 721MB (FP32). Users opt in via
|
||||||
|
// GSTACK_SECURITY_ENSEMBLE=deberta. Not forced on every install because
|
||||||
|
// most users won't need the higher recall and 721MB download is a lot.
|
||||||
|
const DEBERTA_DIR = path.join(MODELS_DIR, 'deberta-v3-injection');
|
||||||
|
const DEBERTA_HF_URL = 'https://huggingface.co/protectai/deberta-v3-base-injection-onnx/resolve/main';
|
||||||
|
const DEBERTA_FILES = [
|
||||||
|
'config.json',
|
||||||
|
'tokenizer.json',
|
||||||
|
'tokenizer_config.json',
|
||||||
|
'special_tokens_map.json',
|
||||||
|
'spm.model',
|
||||||
|
'added_tokens.json',
|
||||||
|
];
|
||||||
|
|
||||||
|
function isDebertaEnabled(): boolean {
|
||||||
|
const setting = (process.env.GSTACK_SECURITY_ENSEMBLE ?? '').toLowerCase();
|
||||||
|
return setting.split(',').map(s => s.trim()).includes('deberta');
|
||||||
|
}
|
||||||
|
|
||||||
// ─── Load state ──────────────────────────────────────────────
|
// ─── Load state ──────────────────────────────────────────────
|
||||||
|
|
||||||
type LoadState = 'uninitialized' | 'loading' | 'loaded' | 'failed';
|
type LoadState = 'uninitialized' | 'loading' | 'loaded' | 'failed';
|
||||||
@@ -67,9 +92,14 @@ let testsavantState: LoadState = 'uninitialized';
|
|||||||
let testsavantClassifier: any = null;
|
let testsavantClassifier: any = null;
|
||||||
let testsavantLoadError: string | null = null;
|
let testsavantLoadError: string | null = null;
|
||||||
|
|
||||||
|
// DeBERTa-v3 ensemble load state — mirrors the TestSavantAI trio above.
// All three are module singletons: written by loadDeberta() (and by
// scanPageContentDeberta on inference failure), read by getClassifierStatus().
let debertaState: LoadState = 'uninitialized';
// transformers.js text-classification pipeline once loaded; null until then.
let debertaClassifier: any = null;
// Human-readable reason for the most recent load/inference failure, if any.
let debertaLoadError: string | null = null;
|
||||||
|
|
||||||
export interface ClassifierStatus {
|
export interface ClassifierStatus {
|
||||||
testsavant: 'ok' | 'degraded' | 'off';
|
testsavant: 'ok' | 'degraded' | 'off';
|
||||||
transcript: 'ok' | 'degraded' | 'off';
|
transcript: 'ok' | 'degraded' | 'off';
|
||||||
|
deberta?: 'ok' | 'degraded' | 'off'; // only present when ensemble enabled
|
||||||
}
|
}
|
||||||
|
|
||||||
export function getClassifierStatus(): ClassifierStatus {
|
export function getClassifierStatus(): ClassifierStatus {
|
||||||
@@ -77,11 +107,16 @@ export function getClassifierStatus(): ClassifierStatus {
|
|||||||
testsavantState === 'loaded' ? 'ok' :
|
testsavantState === 'loaded' ? 'ok' :
|
||||||
testsavantState === 'failed' ? 'degraded' :
|
testsavantState === 'failed' ? 'degraded' :
|
||||||
'off';
|
'off';
|
||||||
// Transcript classifier has no persistent load state — it spawns claude-haiku
|
|
||||||
// per-call. We report 'ok' if claude is on PATH (checked lazily on first call).
|
|
||||||
const transcript = haikuAvailableCache === null ? 'off' :
|
const transcript = haikuAvailableCache === null ? 'off' :
|
||||||
haikuAvailableCache ? 'ok' : 'degraded';
|
haikuAvailableCache ? 'ok' : 'degraded';
|
||||||
return { testsavant, transcript };
|
const status: ClassifierStatus = { testsavant, transcript };
|
||||||
|
if (isDebertaEnabled()) {
|
||||||
|
status.deberta =
|
||||||
|
debertaState === 'loaded' ? 'ok' :
|
||||||
|
debertaState === 'failed' ? 'degraded' :
|
||||||
|
'off';
|
||||||
|
}
|
||||||
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─── Model download + staging ────────────────────────────────
|
// ─── Model download + staging ────────────────────────────────
|
||||||
@@ -245,6 +280,89 @@ export async function scanPageContent(text: string): Promise<LayerSignal> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─── L4c: DeBERTa-v3 ensemble (opt-in) ───────────────────────
|
||||||
|
|
||||||
|
async function ensureDebertaStaged(onProgress?: (msg: string) => void): Promise<void> {
|
||||||
|
fs.mkdirSync(path.join(DEBERTA_DIR, 'onnx'), { recursive: true, mode: 0o700 });
|
||||||
|
for (const f of DEBERTA_FILES) {
|
||||||
|
const dst = path.join(DEBERTA_DIR, f);
|
||||||
|
if (fs.existsSync(dst)) continue;
|
||||||
|
onProgress?.(`deberta: downloading ${f}`);
|
||||||
|
await downloadFile(`${DEBERTA_HF_URL}/${f}`, dst);
|
||||||
|
}
|
||||||
|
const modelDst = path.join(DEBERTA_DIR, 'onnx', 'model.onnx');
|
||||||
|
if (!fs.existsSync(modelDst)) {
|
||||||
|
onProgress?.('deberta: downloading model.onnx (721MB) — first run only');
|
||||||
|
await downloadFile(`${DEBERTA_HF_URL}/model.onnx`, modelDst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let debertaLoadPromise: Promise<void> | null = null;
|
||||||
|
export function loadDeberta(onProgress?: (msg: string) => void): Promise<void> {
|
||||||
|
if (!isDebertaEnabled()) return Promise.resolve();
|
||||||
|
if (debertaState === 'loaded') return Promise.resolve();
|
||||||
|
if (debertaLoadPromise) return debertaLoadPromise;
|
||||||
|
debertaState = 'loading';
|
||||||
|
debertaLoadPromise = (async () => {
|
||||||
|
try {
|
||||||
|
await ensureDebertaStaged(onProgress);
|
||||||
|
onProgress?.('deberta: initializing classifier');
|
||||||
|
const { pipeline, env } = await import('@huggingface/transformers');
|
||||||
|
env.allowLocalModels = true;
|
||||||
|
env.allowRemoteModels = false;
|
||||||
|
env.localModelPath = MODELS_DIR;
|
||||||
|
debertaClassifier = await pipeline(
|
||||||
|
'text-classification',
|
||||||
|
'deberta-v3-injection',
|
||||||
|
{ dtype: 'fp32' },
|
||||||
|
);
|
||||||
|
const tok = debertaClassifier?.tokenizer as any;
|
||||||
|
if (tok?._tokenizerConfig) {
|
||||||
|
tok._tokenizerConfig.model_max_length = 512;
|
||||||
|
}
|
||||||
|
debertaState = 'loaded';
|
||||||
|
} catch (err: any) {
|
||||||
|
debertaState = 'failed';
|
||||||
|
debertaLoadError = err?.message ?? String(err);
|
||||||
|
console.error('[security-classifier] Failed to load DeBERTa-v3:', debertaLoadError);
|
||||||
|
}
|
||||||
|
})();
|
||||||
|
return debertaLoadPromise;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Scan text with the DeBERTa-v3 ensemble classifier. Returns a LayerSignal
|
||||||
|
* with layer='deberta_content'. No-op when ensemble is disabled — returns
|
||||||
|
* confidence=0 with meta.disabled=true so combineVerdict treats it as safe.
|
||||||
|
*/
|
||||||
|
export async function scanPageContentDeberta(text: string): Promise<LayerSignal> {
|
||||||
|
if (!isDebertaEnabled()) {
|
||||||
|
return { layer: 'deberta_content', confidence: 0, meta: { disabled: true } };
|
||||||
|
}
|
||||||
|
if (!text || text.length === 0) {
|
||||||
|
return { layer: 'deberta_content', confidence: 0 };
|
||||||
|
}
|
||||||
|
if (debertaState !== 'loaded') {
|
||||||
|
return { layer: 'deberta_content', confidence: 0, meta: { degraded: true } };
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
const plain = htmlToPlainText(text);
|
||||||
|
const input = plain.slice(0, 4000);
|
||||||
|
const raw = await debertaClassifier(input);
|
||||||
|
const top = Array.isArray(raw) ? raw[0] : raw;
|
||||||
|
const label = top?.label ?? 'SAFE';
|
||||||
|
const score = Number(top?.score ?? 0);
|
||||||
|
if (label === 'INJECTION') {
|
||||||
|
return { layer: 'deberta_content', confidence: score, meta: { label } };
|
||||||
|
}
|
||||||
|
return { layer: 'deberta_content', confidence: 0, meta: { label, safeScore: score } };
|
||||||
|
} catch (err: any) {
|
||||||
|
debertaState = 'failed';
|
||||||
|
debertaLoadError = err?.message ?? String(err);
|
||||||
|
return { layer: 'deberta_content', confidence: 0, meta: { degraded: true, error: debertaLoadError } };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ─── L4b: Claude Haiku transcript classifier ─────────────────
|
// ─── L4b: Claude Haiku transcript classifier ─────────────────
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ import {
|
|||||||
import {
|
import {
|
||||||
loadTestsavant, scanPageContent, checkTranscript,
|
loadTestsavant, scanPageContent, checkTranscript,
|
||||||
shouldRunTranscriptCheck, getClassifierStatus,
|
shouldRunTranscriptCheck, getClassifierStatus,
|
||||||
|
loadDeberta, scanPageContentDeberta,
|
||||||
type ToolCallInput,
|
type ToolCallInput,
|
||||||
} from './security-classifier';
|
} from './security-classifier';
|
||||||
|
|
||||||
@@ -457,11 +458,15 @@ async function preSpawnSecurityCheck(entry: QueueEntry): Promise<boolean> {
|
|||||||
if (!message || message.length === 0) return false;
|
if (!message || message.length === 0) return false;
|
||||||
const tid = tabId ?? 0;
|
const tid = tabId ?? 0;
|
||||||
|
|
||||||
// L4: scan the user message for direct injection patterns
|
// L4: scan the user message for direct injection patterns (TestSavantAI)
|
||||||
const contentSignal = await scanPageContent(message);
|
// L4c: also scan with DeBERTa-v3 when ensemble is enabled (opt-in)
|
||||||
const signals: LayerSignal[] = [contentSignal];
|
const [contentSignal, debertaSignal] = await Promise.all([
|
||||||
|
scanPageContent(message),
|
||||||
|
scanPageContentDeberta(message),
|
||||||
|
]);
|
||||||
|
const signals: LayerSignal[] = [contentSignal, debertaSignal];
|
||||||
|
|
||||||
// L4b: only bother with Haiku if L4 already lit up at >= LOG_ONLY.
|
// L4b: only bother with Haiku if another layer already lit up at >= LOG_ONLY.
|
||||||
// Saves ~70% of Haiku calls per plan §E1 "gating optimization".
|
// Saves ~70% of Haiku calls per plan §E1 "gating optimization".
|
||||||
if (shouldRunTranscriptCheck(signals)) {
|
if (shouldRunTranscriptCheck(signals)) {
|
||||||
const transcriptSignal = await checkTranscript({
|
const transcriptSignal = await checkTranscript({
|
||||||
@@ -593,10 +598,16 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
|
|||||||
const toolResultScanCtx: ToolResultScanContext = {
|
const toolResultScanCtx: ToolResultScanContext = {
|
||||||
scan: async (toolName: string, text: string) => {
|
scan: async (toolName: string, text: string) => {
|
||||||
if (toolResultBlockFired) return;
|
if (toolResultBlockFired) return;
|
||||||
const contentSignal = await scanPageContent(text);
|
// Parallel L4 + L4c ensemble scan (DeBERTa no-op when disabled).
|
||||||
if (contentSignal.confidence < THRESHOLDS.WARN) return;
|
const [contentSignal, debertaSignal] = await Promise.all([
|
||||||
// Signal crossed WARN — see if ensemble upgrades to BLOCK.
|
scanPageContent(text),
|
||||||
const signals: LayerSignal[] = [contentSignal];
|
scanPageContentDeberta(text),
|
||||||
|
]);
|
||||||
|
// Short-circuit if neither content layer crossed WARN — no point
|
||||||
|
// spinning up Haiku for a clean scan.
|
||||||
|
const maxContent = Math.max(contentSignal.confidence, debertaSignal.confidence);
|
||||||
|
if (maxContent < THRESHOLDS.WARN) return;
|
||||||
|
const signals: LayerSignal[] = [contentSignal, debertaSignal];
|
||||||
if (shouldRunTranscriptCheck(signals)) {
|
if (shouldRunTranscriptCheck(signals)) {
|
||||||
signals.push(await checkTranscript({
|
signals.push(await checkTranscript({
|
||||||
user_message: queueEntry.message ?? '',
|
user_message: queueEntry.message ?? '',
|
||||||
@@ -809,6 +820,12 @@ async function main() {
|
|||||||
console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
|
console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
|
||||||
console.log(`[sidebar-agent] Browse binary: ${B}`);
|
console.log(`[sidebar-agent] Browse binary: ${B}`);
|
||||||
|
|
||||||
|
// If GSTACK_SECURITY_ENSEMBLE=deberta is set, also warm the DeBERTa-v3
|
||||||
|
// ensemble classifier. Fire-and-forget alongside TestSavantAI — they
|
||||||
|
// warm in parallel. No-op when the env var is unset.
|
||||||
|
loadDeberta((msg) => console.log(`[security-classifier] ${msg}`))
|
||||||
|
.catch((err) => console.warn('[sidebar-agent] DeBERTa warmup failed:', err?.message));
|
||||||
|
|
||||||
// Warm up the ML classifier in the background. First call triggers a 112MB
|
// Warm up the ML classifier in the background. First call triggers a 112MB
|
||||||
// download (~30s on average broadband). Non-blocking — the sidebar stays
|
// download (~30s on average broadband). Non-blocking — the sidebar stays
|
||||||
// functional on cold start; classifier just reports 'off' until warmed.
|
// functional on cold start; classifier just reports 'off' until warmed.
|
||||||
|
|||||||
Reference in New Issue
Block a user