feat(security): DeBERTa-v3 ensemble classifier (opt-in)

Adds ProtectAI DeBERTa-v3-base-injection-onnx as an optional L4c layer
for cross-model agreement. It's a different model family (DeBERTa-v3-base,
~184M params) from the default L4 TestSavantAI layer (BERT-small, ~30M
params), so when both fire together that's a much stronger signal than
either alone.

Opt-in because the download is hefty: set GSTACK_SECURITY_ENSEMBLE=deberta
and the sidebar-agent warmup fetches the tokenizer files plus model.onnx
(721MB FP32) into ~/.gstack/models/deberta-v3-injection/ on first run;
subsequent runs reuse the cached files.
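
The cache layout after a successful warmup (a sketch based on
ensureDebertaStaged() below):

  ~/.gstack/models/deberta-v3-injection/
    config.json
    tokenizer.json
    tokenizer_config.json
    special_tokens_map.json
    spm.model
    added_tokens.json
    onnx/
      model.onnx        (721MB, FP32)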

Implementation mirrors the TestSavantAI loader:
  * loadDeberta() — idempotent, progress-reported download + pipeline init
    with the same model_max_length=512 override (DeBERTa's tokenizer config
    ships the same bogus model_max_length placeholder as TestSavantAI's)
  * scanPageContentDeberta() — htmlToPlainText preprocess, 4000-char cap,
    512-token truncation, returns a LayerSignal with layer='deberta_content'
    (the possible signal shapes are summarized after this list)
  * getClassifierStatus() includes deberta field only when enabled
    (avoids polluting the shield API with always-off data)
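
For reference, the signal shapes scanPageContentDeberta() produces (abridged
from the implementation below):

  ensemble disabled:   { layer: 'deberta_content', confidence: 0,     meta: { disabled: true } }
  model not loaded:    { layer: 'deberta_content', confidence: 0,     meta: { degraded: true } }
  SAFE top label:      { layer: 'deberta_content', confidence: 0,     meta: { label, safeScore } }
  INJECTION top label: { layer: 'deberta_content', confidence: score, meta: { label } }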

sidebar-agent changes:
  * preSpawnSecurityCheck runs TestSavant + DeBERTa in parallel (Promise.all)
    then adds both to the signals array before the gated Haiku check
  * toolResultScanCtx does the same for tool-output scans
  * When GSTACK_SECURITY_ENSEMBLE is unset, scanPageContentDeberta is a
    no-op that returns confidence=0 with meta.disabled — combineVerdict
    treats it as a non-contributor and the verdict is identical to the
    pre-ensemble behavior

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2 changed files with 146 additions and 11 deletions

+121 -3
@@ -59,6 +59,31 @@ const TESTSAVANT_FILES = [
'vocab.txt',
];
// DeBERTa-v3 (ProtectAI) — OPT-IN ensemble layer. Adds architectural
// diversity: TestSavantAI-small is BERT-small fine-tuned on injection +
// jailbreak; DeBERTa-v3-base is a separate model family trained on its
// own corpus. Agreement between the two is stronger evidence than either
// alone.
//
// Size: model.onnx is 721MB (FP32). Users opt in via
// GSTACK_SECURITY_ENSEMBLE=deberta. Not forced on every install because
// most users won't need the higher recall, and a 721MB download is a lot.
const DEBERTA_DIR = path.join(MODELS_DIR, 'deberta-v3-injection');
const DEBERTA_HF_URL = 'https://huggingface.co/protectai/deberta-v3-base-injection-onnx/resolve/main';
const DEBERTA_FILES = [
'config.json',
'tokenizer.json',
'tokenizer_config.json',
'special_tokens_map.json',
'spm.model',
'added_tokens.json',
];
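// Note: model.onnx is not listed above; ensureDebertaStaged() stages it
// separately under onnx/, which is where the transformers.js pipeline
// resolves the FP32 weights once localModelPath points at MODELS_DIR.

// GSTACK_SECURITY_ENSEMBLE is parsed as a comma-separated list, so additional
// ensemble members could be enabled alongside 'deberta' later on.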
function isDebertaEnabled(): boolean {
const setting = (process.env.GSTACK_SECURITY_ENSEMBLE ?? '').toLowerCase();
return setting.split(',').map(s => s.trim()).includes('deberta');
}
// ─── Load state ──────────────────────────────────────────────
type LoadState = 'uninitialized' | 'loading' | 'loaded' | 'failed';
@@ -67,9 +92,14 @@ let testsavantState: LoadState = 'uninitialized';
let testsavantClassifier: any = null;
let testsavantLoadError: string | null = null;
let debertaState: LoadState = 'uninitialized';
let debertaClassifier: any = null;
let debertaLoadError: string | null = null;
export interface ClassifierStatus {
testsavant: 'ok' | 'degraded' | 'off';
transcript: 'ok' | 'degraded' | 'off';
deberta?: 'ok' | 'degraded' | 'off'; // only present when ensemble enabled
}
export function getClassifierStatus(): ClassifierStatus {
@@ -77,11 +107,16 @@ export function getClassifierStatus(): ClassifierStatus {
testsavantState === 'loaded' ? 'ok' :
testsavantState === 'failed' ? 'degraded' :
'off';
// Transcript classifier has no persistent load state — it spawns claude-haiku
// per-call. We report 'ok' if claude is on PATH (checked lazily on first call).
const transcript = haikuAvailableCache === null ? 'off' :
haikuAvailableCache ? 'ok' : 'degraded';
return { testsavant, transcript };
const status: ClassifierStatus = { testsavant, transcript };
if (isDebertaEnabled()) {
status.deberta =
debertaState === 'loaded' ? 'ok' :
debertaState === 'failed' ? 'degraded' :
'off';
}
return status;
}
// ─── Model download + staging ────────────────────────────────
@@ -245,6 +280,89 @@ export async function scanPageContent(text: string): Promise<LayerSignal> {
}
}
// ─── L4c: DeBERTa-v3 ensemble (opt-in) ───────────────────────
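// Staging only downloads files that are not already on disk, so repeat runs
// are cheap; directories are created 0o700 to keep the model cache private
// to the user.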
async function ensureDebertaStaged(onProgress?: (msg: string) => void): Promise<void> {
fs.mkdirSync(path.join(DEBERTA_DIR, 'onnx'), { recursive: true, mode: 0o700 });
for (const f of DEBERTA_FILES) {
const dst = path.join(DEBERTA_DIR, f);
if (fs.existsSync(dst)) continue;
onProgress?.(`deberta: downloading ${f}`);
await downloadFile(`${DEBERTA_HF_URL}/${f}`, dst);
}
const modelDst = path.join(DEBERTA_DIR, 'onnx', 'model.onnx');
if (!fs.existsSync(modelDst)) {
onProgress?.('deberta: downloading model.onnx (721MB) — first run only');
await downloadFile(`${DEBERTA_HF_URL}/model.onnx`, modelDst);
}
}
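// A single in-flight promise memoizes concurrent callers: a second
// loadDeberta() call during warmup awaits the same download instead of
// kicking off another one.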
let debertaLoadPromise: Promise<void> | null = null;
export function loadDeberta(onProgress?: (msg: string) => void): Promise<void> {
if (!isDebertaEnabled()) return Promise.resolve();
if (debertaState === 'loaded') return Promise.resolve();
if (debertaLoadPromise) return debertaLoadPromise;
debertaState = 'loading';
debertaLoadPromise = (async () => {
try {
await ensureDebertaStaged(onProgress);
onProgress?.('deberta: initializing classifier');
const { pipeline, env } = await import('@huggingface/transformers');
env.allowLocalModels = true;
env.allowRemoteModels = false;
env.localModelPath = MODELS_DIR;
debertaClassifier = await pipeline(
'text-classification',
'deberta-v3-injection',
{ dtype: 'fp32' },
);
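// Same workaround as the TestSavantAI loader: the shipped tokenizer config
// carries a placeholder model_max_length, so clamp it to the real 512-token
// window to get sane truncation.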
const tok = debertaClassifier?.tokenizer as any;
if (tok?._tokenizerConfig) {
tok._tokenizerConfig.model_max_length = 512;
}
debertaState = 'loaded';
} catch (err: any) {
debertaState = 'failed';
debertaLoadError = err?.message ?? String(err);
console.error('[security-classifier] Failed to load DeBERTa-v3:', debertaLoadError);
}
})();
return debertaLoadPromise;
}
/**
* Scan text with the DeBERTa-v3 ensemble classifier. Returns a LayerSignal
* with layer='deberta_content'. No-op when ensemble is disabled — returns
* confidence=0 with meta.disabled=true so combineVerdict treats it as safe.
*/
export async function scanPageContentDeberta(text: string): Promise<LayerSignal> {
if (!isDebertaEnabled()) {
return { layer: 'deberta_content', confidence: 0, meta: { disabled: true } };
}
if (!text || text.length === 0) {
return { layer: 'deberta_content', confidence: 0 };
}
if (debertaState !== 'loaded') {
return { layer: 'deberta_content', confidence: 0, meta: { degraded: true } };
}
try {
const plain = htmlToPlainText(text);
const input = plain.slice(0, 4000);
const raw = await debertaClassifier(input);
const top = Array.isArray(raw) ? raw[0] : raw;
const label = top?.label ?? 'SAFE';
const score = Number(top?.score ?? 0);
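// The ProtectAI classifier emits SAFE / INJECTION labels; only an INJECTION
// top label contributes confidence, a SAFE result maps to confidence 0.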
if (label === 'INJECTION') {
return { layer: 'deberta_content', confidence: score, meta: { label } };
}
return { layer: 'deberta_content', confidence: 0, meta: { label, safeScore: score } };
} catch (err: any) {
debertaState = 'failed';
debertaLoadError = err?.message ?? String(err);
return { layer: 'deberta_content', confidence: 0, meta: { degraded: true, error: debertaLoadError } };
}
}
// ─── L4b: Claude Haiku transcript classifier ─────────────────
/**
+25 -8
@@ -21,6 +21,7 @@ import {
import {
loadTestsavant, scanPageContent, checkTranscript,
shouldRunTranscriptCheck, getClassifierStatus,
loadDeberta, scanPageContentDeberta,
type ToolCallInput,
} from './security-classifier';
@@ -457,11 +458,15 @@ async function preSpawnSecurityCheck(entry: QueueEntry): Promise<boolean> {
if (!message || message.length === 0) return false;
const tid = tabId ?? 0;
- // L4: scan the user message for direct injection patterns
- const contentSignal = await scanPageContent(message);
- const signals: LayerSignal[] = [contentSignal];
+ // L4: scan the user message for direct injection patterns (TestSavantAI)
+ // L4c: also scan with DeBERTa-v3 when ensemble is enabled (opt-in)
+ const [contentSignal, debertaSignal] = await Promise.all([
+   scanPageContent(message),
+   scanPageContentDeberta(message),
+ ]);
+ const signals: LayerSignal[] = [contentSignal, debertaSignal];
- // L4b: only bother with Haiku if L4 already lit up at >= LOG_ONLY.
+ // L4b: only bother with Haiku if another layer already lit up at >= LOG_ONLY.
// Saves ~70% of Haiku calls per plan §E1 "gating optimization".
if (shouldRunTranscriptCheck(signals)) {
const transcriptSignal = await checkTranscript({
@@ -593,10 +598,16 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
const toolResultScanCtx: ToolResultScanContext = {
scan: async (toolName: string, text: string) => {
if (toolResultBlockFired) return;
- const contentSignal = await scanPageContent(text);
- if (contentSignal.confidence < THRESHOLDS.WARN) return;
- // Signal crossed WARN — see if ensemble upgrades to BLOCK.
- const signals: LayerSignal[] = [contentSignal];
+ // Parallel L4 + L4c ensemble scan (DeBERTa no-op when disabled).
+ const [contentSignal, debertaSignal] = await Promise.all([
+   scanPageContent(text),
+   scanPageContentDeberta(text),
+ ]);
+ // Short-circuit if neither content layer crossed WARN — no point
+ // spinning up Haiku for a clean scan.
+ const maxContent = Math.max(contentSignal.confidence, debertaSignal.confidence);
+ if (maxContent < THRESHOLDS.WARN) return;
+ const signals: LayerSignal[] = [contentSignal, debertaSignal];
if (shouldRunTranscriptCheck(signals)) {
signals.push(await checkTranscript({
user_message: queueEntry.message ?? '',
@@ -809,6 +820,12 @@ async function main() {
console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
console.log(`[sidebar-agent] Browse binary: ${B}`);
// If GSTACK_SECURITY_ENSEMBLE=deberta is set, also warm the DeBERTa-v3
// ensemble classifier. Fire-and-forget alongside TestSavantAI — they
// warm in parallel. No-op when the env var is unset.
loadDeberta((msg) => console.log(`[security-classifier] ${msg}`))
.catch((err) => console.warn('[sidebar-agent] DeBERTa warmup failed:', err?.message));
// Warm up the ML classifier in the background. First call triggers a 112MB
// download (~30s on average broadband). Non-blocking — the sidebar stays
// functional on cold start; classifier just reports 'off' until warmed.