From 8e9ec52d6f30b4e9837ec71e0af0c258bcea4019 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Mon, 20 Apr 2026 04:55:23 +0800
Subject: [PATCH] feat(security): DeBERTa-v3 ensemble classifier (opt-in)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds ProtectAI DeBERTa-v3-base-injection-onnx as an optional L4c layer
for cross-model agreement. Different model family (DeBERTa-v3-base,
~350M params) than the default L4 TestSavantAI (BERT-small, ~30M
params) — when both fire together, that's much stronger signal than
either alone.

Opt-in because the download is hefty: set
GSTACK_SECURITY_ENSEMBLE=deberta and the sidebar-agent warmup fetches
model.onnx (721MB FP32) into ~/.gstack/models/deberta-v3-injection/ on
first run. Subsequent runs are cached.

Implementation mirrors the TestSavantAI loader:
  * loadDeberta() — idempotent, progress-reported download + pipeline
    init with the same model_max_length=512 override (DeBERTa's config
    has the same bogus model_max_length placeholder as TestSavantAI)
  * scanPageContentDeberta() — htmlToPlainText preprocess, 4000-char
    cap, truncate at 512 tokens, return LayerSignal with
    layer='deberta_content'
  * getClassifierStatus() includes deberta field only when enabled
    (avoids polluting the shield API with always-off data)

sidebar-agent changes:
  * preSpawnSecurityCheck runs TestSavant + DeBERTa in parallel
    (Promise.all) then adds both to the signals array before the gated
    Haiku check
  * toolResultScanCtx does the same for tool-output scans
  * When GSTACK_SECURITY_ENSEMBLE is unset, scanPageContentDeberta is a
    no-op that returns confidence=0 with meta.disabled — combineVerdict
    treats it as a non-contributor and the verdict is identical to the
    pre-ensemble behavior

Co-Authored-By: Claude Opus 4.7 (1M context)
---
Reviewer notes (ignored by git am; not part of the commit message):

  * This patch was recovered from a copy whose line breaks had been
    flattened and whose angle-bracket spans had been stripped. Added and
    removed lines were re-split on their +/- markers; the resulting
    counts match every @@ header and the diffstat below. Indentation and
    the wrapping of unchanged context lines are reconstructed, so verify
    them against the tree before applying.
  * Promise<void> / Promise<LayerSignal> type arguments were restored
    from what each function returns and how callers use the result. The
    askClaude return type shown in its hunk header is a guess. Author
    e-mail addresses could not be recovered.
  * NOTE(review): "~350M params" does not line up with a 721MB FP32
    model.onnx (4 bytes per parameter gives roughly 180M) — confirm.
  * NOTE(review): the catch in scanPageContentDeberta sets
    debertaState = 'failed', and loadDeberta keeps returning the settled
    debertaLoadPromise, so a single inference error appears to disable
    the L4c layer for the rest of the process — confirm this is intended.
  * NOTE(review): ensureDebertaStaged skips any file that already
    exists; unless downloadFile writes atomically (not visible here), an
    interrupted 721MB download would be reused as-is — confirm.

 browse/src/security-classifier.ts | 124 +++++++++++++++++++++++++++++-
 browse/src/sidebar-agent.ts       |  33 ++++++--
 2 files changed, 146 insertions(+), 11 deletions(-)

diff --git a/browse/src/security-classifier.ts b/browse/src/security-classifier.ts
index 62493e56..f934d5da 100644
--- a/browse/src/security-classifier.ts
+++ b/browse/src/security-classifier.ts
@@ -59,6 +59,31 @@ const TESTSAVANT_FILES = [
   'vocab.txt',
 ];
 
+// DeBERTa-v3 (ProtectAI) — OPT-IN ensemble layer. Adds architectural
+// diversity: TestSavantAI-small is BERT-small fine-tuned on injection +
+// jailbreak; DeBERTa-v3-base is a separate model family trained on its
+// own corpus. Agreement between the two is stronger evidence than either
+// alone.
+//
+// Size: model.onnx is 721MB (FP32). Users opt in via
+// GSTACK_SECURITY_ENSEMBLE=deberta. Not forced on every install because
+// most users won't need the higher recall and 721MB download is a lot.
+const DEBERTA_DIR = path.join(MODELS_DIR, 'deberta-v3-injection');
+const DEBERTA_HF_URL = 'https://huggingface.co/protectai/deberta-v3-base-injection-onnx/resolve/main';
+const DEBERTA_FILES = [
+  'config.json',
+  'tokenizer.json',
+  'tokenizer_config.json',
+  'special_tokens_map.json',
+  'spm.model',
+  'added_tokens.json',
+];
+
+function isDebertaEnabled(): boolean {
+  const setting = (process.env.GSTACK_SECURITY_ENSEMBLE ?? '').toLowerCase();
+  return setting.split(',').map(s => s.trim()).includes('deberta');
+}
+
 // ─── Load state ──────────────────────────────────────────────
 
 type LoadState = 'uninitialized' | 'loading' | 'loaded' | 'failed';
@@ -67,9 +92,14 @@ let testsavantState: LoadState = 'uninitialized';
 let testsavantClassifier: any = null;
 let testsavantLoadError: string | null = null;
 
+let debertaState: LoadState = 'uninitialized';
+let debertaClassifier: any = null;
+let debertaLoadError: string | null = null;
+
 export interface ClassifierStatus {
   testsavant: 'ok' | 'degraded' | 'off';
   transcript: 'ok' | 'degraded' | 'off';
+  deberta?: 'ok' | 'degraded' | 'off'; // only present when ensemble enabled
 }
 
 export function getClassifierStatus(): ClassifierStatus {
@@ -77,11 +107,16 @@ export function getClassifierStatus(): ClassifierStatus {
     testsavantState === 'loaded' ? 'ok' :
     testsavantState === 'failed' ? 'degraded' :
     'off';
-  // Transcript classifier has no persistent load state — it spawns claude-haiku
-  // per-call. We report 'ok' if claude is on PATH (checked lazily on first call).
   const transcript = haikuAvailableCache === null ? 'off' :
     haikuAvailableCache ? 'ok' : 'degraded';
-  return { testsavant, transcript };
+  const status: ClassifierStatus = { testsavant, transcript };
+  if (isDebertaEnabled()) {
+    status.deberta =
+      debertaState === 'loaded' ? 'ok' :
+      debertaState === 'failed' ? 'degraded' :
+      'off';
+  }
+  return status;
 }
 
 // ─── Model download + staging ────────────────────────────────
@@ -245,6 +280,89 @@ export async function scanPageContent(text: string): Promise<LayerSignal> {
   }
 }
 
+// ─── L4c: DeBERTa-v3 ensemble (opt-in) ───────────────────────
+
+async function ensureDebertaStaged(onProgress?: (msg: string) => void): Promise<void> {
+  fs.mkdirSync(path.join(DEBERTA_DIR, 'onnx'), { recursive: true, mode: 0o700 });
+  for (const f of DEBERTA_FILES) {
+    const dst = path.join(DEBERTA_DIR, f);
+    if (fs.existsSync(dst)) continue;
+    onProgress?.(`deberta: downloading ${f}`);
+    await downloadFile(`${DEBERTA_HF_URL}/${f}`, dst);
+  }
+  const modelDst = path.join(DEBERTA_DIR, 'onnx', 'model.onnx');
+  if (!fs.existsSync(modelDst)) {
+    onProgress?.('deberta: downloading model.onnx (721MB) — first run only');
+    await downloadFile(`${DEBERTA_HF_URL}/model.onnx`, modelDst);
+  }
+}
+
+let debertaLoadPromise: Promise<void> | null = null;
+export function loadDeberta(onProgress?: (msg: string) => void): Promise<void> {
+  if (!isDebertaEnabled()) return Promise.resolve();
+  if (debertaState === 'loaded') return Promise.resolve();
+  if (debertaLoadPromise) return debertaLoadPromise;
+  debertaState = 'loading';
+  debertaLoadPromise = (async () => {
+    try {
+      await ensureDebertaStaged(onProgress);
+      onProgress?.('deberta: initializing classifier');
+      const { pipeline, env } = await import('@huggingface/transformers');
+      env.allowLocalModels = true;
+      env.allowRemoteModels = false;
+      env.localModelPath = MODELS_DIR;
+      debertaClassifier = await pipeline(
+        'text-classification',
+        'deberta-v3-injection',
+        { dtype: 'fp32' },
+      );
+      const tok = debertaClassifier?.tokenizer as any;
+      if (tok?._tokenizerConfig) {
+        tok._tokenizerConfig.model_max_length = 512;
+      }
+      debertaState = 'loaded';
+    } catch (err: any) {
+      debertaState = 'failed';
+      debertaLoadError = err?.message ?? String(err);
+      console.error('[security-classifier] Failed to load DeBERTa-v3:', debertaLoadError);
+    }
+  })();
+  return debertaLoadPromise;
+}
+
+/**
+ * Scan text with the DeBERTa-v3 ensemble classifier. Returns a LayerSignal
+ * with layer='deberta_content'. No-op when ensemble is disabled — returns
+ * confidence=0 with meta.disabled=true so combineVerdict treats it as safe.
+ */
+export async function scanPageContentDeberta(text: string): Promise<LayerSignal> {
+  if (!isDebertaEnabled()) {
+    return { layer: 'deberta_content', confidence: 0, meta: { disabled: true } };
+  }
+  if (!text || text.length === 0) {
+    return { layer: 'deberta_content', confidence: 0 };
+  }
+  if (debertaState !== 'loaded') {
+    return { layer: 'deberta_content', confidence: 0, meta: { degraded: true } };
+  }
+  try {
+    const plain = htmlToPlainText(text);
+    const input = plain.slice(0, 4000);
+    const raw = await debertaClassifier(input);
+    const top = Array.isArray(raw) ? raw[0] : raw;
+    const label = top?.label ?? 'SAFE';
+    const score = Number(top?.score ?? 0);
+    if (label === 'INJECTION') {
+      return { layer: 'deberta_content', confidence: score, meta: { label } };
+    }
+    return { layer: 'deberta_content', confidence: 0, meta: { label, safeScore: score } };
+  } catch (err: any) {
+    debertaState = 'failed';
+    debertaLoadError = err?.message ?? String(err);
+    return { layer: 'deberta_content', confidence: 0, meta: { degraded: true, error: debertaLoadError } };
+  }
+}
+
 // ─── L4b: Claude Haiku transcript classifier ─────────────────
 
 /**
diff --git a/browse/src/sidebar-agent.ts b/browse/src/sidebar-agent.ts
index f5805b83..71bdc609 100644
--- a/browse/src/sidebar-agent.ts
+++ b/browse/src/sidebar-agent.ts
@@ -21,6 +21,7 @@ import {
 import {
   loadTestsavant, scanPageContent, checkTranscript,
   shouldRunTranscriptCheck, getClassifierStatus,
+  loadDeberta, scanPageContentDeberta,
   type ToolCallInput,
 } from './security-classifier';
 
@@ -457,11 +458,15 @@ async function preSpawnSecurityCheck(entry: QueueEntry): Promise<boolean> {
   if (!message || message.length === 0) return false;
   const tid = tabId ?? 0;
 
-  // L4: scan the user message for direct injection patterns
-  const contentSignal = await scanPageContent(message);
-  const signals: LayerSignal[] = [contentSignal];
+  // L4: scan the user message for direct injection patterns (TestSavantAI)
+  // L4c: also scan with DeBERTa-v3 when ensemble is enabled (opt-in)
+  const [contentSignal, debertaSignal] = await Promise.all([
+    scanPageContent(message),
+    scanPageContentDeberta(message),
+  ]);
+  const signals: LayerSignal[] = [contentSignal, debertaSignal];
 
-  // L4b: only bother with Haiku if L4 already lit up at >= LOG_ONLY.
+  // L4b: only bother with Haiku if another layer already lit up at >= LOG_ONLY.
   // Saves ~70% of Haiku calls per plan §E1 "gating optimization".
   if (shouldRunTranscriptCheck(signals)) {
     const transcriptSignal = await checkTranscript({
@@ -593,10 +598,16 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
   const toolResultScanCtx: ToolResultScanContext = {
     scan: async (toolName: string, text: string) => {
       if (toolResultBlockFired) return;
-      const contentSignal = await scanPageContent(text);
-      if (contentSignal.confidence < THRESHOLDS.WARN) return;
-      // Signal crossed WARN — see if ensemble upgrades to BLOCK.
-      const signals: LayerSignal[] = [contentSignal];
+      // Parallel L4 + L4c ensemble scan (DeBERTa no-op when disabled).
+      const [contentSignal, debertaSignal] = await Promise.all([
+        scanPageContent(text),
+        scanPageContentDeberta(text),
+      ]);
+      // Short-circuit if neither content layer crossed WARN — no point
+      // spinning up Haiku for a clean scan.
+      const maxContent = Math.max(contentSignal.confidence, debertaSignal.confidence);
+      if (maxContent < THRESHOLDS.WARN) return;
+      const signals: LayerSignal[] = [contentSignal, debertaSignal];
       if (shouldRunTranscriptCheck(signals)) {
         signals.push(await checkTranscript({
           user_message: queueEntry.message ?? '',
@@ -809,6 +820,12 @@ async function main() {
   console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
   console.log(`[sidebar-agent] Browse binary: ${B}`);
 
+  // If GSTACK_SECURITY_ENSEMBLE=deberta is set, also warm the DeBERTa-v3
+  // ensemble classifier. Fire-and-forget alongside TestSavantAI — they
+  // warm in parallel. No-op when the env var is unset.
+  loadDeberta((msg) => console.log(`[security-classifier] ${msg}`))
+    .catch((err) => console.warn('[sidebar-agent] DeBERTa warmup failed:', err?.message));
+
   // Warm up the ML classifier in the background. First call triggers a 112MB
   // download (~30s on average broadband). Non-blocking — the sidebar stays
   // functional on cold start; classifier just reports 'off' until warmed.