mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
feat(security): DeBERTa-v3 ensemble classifier (opt-in)
Adds ProtectAI DeBERTa-v3-base-injection-onnx as an optional L4c layer
for cross-model agreement. Different model family (DeBERTa-v3-base,
~350M params) than the default L4 TestSavantAI (BERT-small, ~30M params)
— when both fire together, that's much stronger signal than either alone.
Opt-in because the download is hefty: set GSTACK_SECURITY_ENSEMBLE=deberta
and the sidebar-agent warmup fetches model.onnx (721MB FP32) into
~/.gstack/models/deberta-v3-injection/ on first run. Subsequent runs are
cached.
Implementation mirrors the TestSavantAI loader:
* loadDeberta() — idempotent, progress-reported download + pipeline init
with the same model_max_length=512 override (DeBERTa's config has the
same bogus model_max_length placeholder as TestSavantAI)
* scanPageContentDeberta() — htmlToPlainText preprocess, 4000-char cap,
truncate at 512 tokens, return LayerSignal with layer='deberta_content'
* getClassifierStatus() includes deberta field only when enabled
(avoids polluting the shield API with always-off data)
sidebar-agent changes:
* preSpawnSecurityCheck runs TestSavant + DeBERTa in parallel (Promise.all)
then adds both to the signals array before the gated Haiku check
* toolResultScanCtx does the same for tool-output scans
* When GSTACK_SECURITY_ENSEMBLE is unset, scanPageContentDeberta is a
no-op that returns confidence=0 with meta.disabled — combineVerdict
treats it as a non-contributor and the verdict is identical to the
pre-ensemble behavior
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -59,6 +59,31 @@ const TESTSAVANT_FILES = [
|
|||||||
'vocab.txt',
|
'vocab.txt',
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// DeBERTa-v3 (ProtectAI) — OPT-IN ensemble layer. Adds architectural
|
||||||
|
// diversity: TestSavantAI-small is BERT-small fine-tuned on injection +
|
||||||
|
// jailbreak; DeBERTa-v3-base is a separate model family trained on its
|
||||||
|
// own corpus. Agreement between the two is stronger evidence than either
|
||||||
|
// alone.
|
||||||
|
//
|
||||||
|
// Size: model.onnx is 721MB (FP32). Users opt in via
|
||||||
|
// GSTACK_SECURITY_ENSEMBLE=deberta. Not forced on every install because
|
||||||
|
// most users won't need the higher recall and 721MB download is a lot.
|
||||||
|
const DEBERTA_DIR = path.join(MODELS_DIR, 'deberta-v3-injection');
|
||||||
|
const DEBERTA_HF_URL = 'https://huggingface.co/protectai/deberta-v3-base-injection-onnx/resolve/main';
|
||||||
|
const DEBERTA_FILES = [
|
||||||
|
'config.json',
|
||||||
|
'tokenizer.json',
|
||||||
|
'tokenizer_config.json',
|
||||||
|
'special_tokens_map.json',
|
||||||
|
'spm.model',
|
||||||
|
'added_tokens.json',
|
||||||
|
];
|
||||||
|
|
||||||
|
function isDebertaEnabled(): boolean {
|
||||||
|
const setting = (process.env.GSTACK_SECURITY_ENSEMBLE ?? '').toLowerCase();
|
||||||
|
return setting.split(',').map(s => s.trim()).includes('deberta');
|
||||||
|
}
|
||||||
|
|
||||||
// ─── Load state ──────────────────────────────────────────────
|
// ─── Load state ──────────────────────────────────────────────
|
||||||
|
|
||||||
type LoadState = 'uninitialized' | 'loading' | 'loaded' | 'failed';
|
type LoadState = 'uninitialized' | 'loading' | 'loaded' | 'failed';
|
||||||
@@ -67,9 +92,14 @@ let testsavantState: LoadState = 'uninitialized';
|
|||||||
let testsavantClassifier: any = null;
|
let testsavantClassifier: any = null;
|
||||||
let testsavantLoadError: string | null = null;
|
let testsavantLoadError: string | null = null;
|
||||||
|
|
||||||
|
// DeBERTa-v3 ensemble load state — mirrors the TestSavantAI trio above.
// All three are module singletons: written by loadDeberta() (and by
// scanPageContentDeberta on inference failure), read by getClassifierStatus().
let debertaState: LoadState = 'uninitialized';
// transformers.js text-classification pipeline once loaded; null until then.
let debertaClassifier: any = null;
// Human-readable reason for the most recent load/inference failure, if any.
let debertaLoadError: string | null = null;
|
||||||
|
|
||||||
export interface ClassifierStatus {
|
export interface ClassifierStatus {
|
||||||
testsavant: 'ok' | 'degraded' | 'off';
|
testsavant: 'ok' | 'degraded' | 'off';
|
||||||
transcript: 'ok' | 'degraded' | 'off';
|
transcript: 'ok' | 'degraded' | 'off';
|
||||||
|
deberta?: 'ok' | 'degraded' | 'off'; // only present when ensemble enabled
|
||||||
}
|
}
|
||||||
|
|
||||||
export function getClassifierStatus(): ClassifierStatus {
|
export function getClassifierStatus(): ClassifierStatus {
|
||||||
@@ -77,11 +107,16 @@ export function getClassifierStatus(): ClassifierStatus {
|
|||||||
testsavantState === 'loaded' ? 'ok' :
|
testsavantState === 'loaded' ? 'ok' :
|
||||||
testsavantState === 'failed' ? 'degraded' :
|
testsavantState === 'failed' ? 'degraded' :
|
||||||
'off';
|
'off';
|
||||||
// Transcript classifier has no persistent load state — it spawns claude-haiku
|
|
||||||
// per-call. We report 'ok' if claude is on PATH (checked lazily on first call).
|
|
||||||
const transcript = haikuAvailableCache === null ? 'off' :
|
const transcript = haikuAvailableCache === null ? 'off' :
|
||||||
haikuAvailableCache ? 'ok' : 'degraded';
|
haikuAvailableCache ? 'ok' : 'degraded';
|
||||||
return { testsavant, transcript };
|
const status: ClassifierStatus = { testsavant, transcript };
|
||||||
|
if (isDebertaEnabled()) {
|
||||||
|
status.deberta =
|
||||||
|
debertaState === 'loaded' ? 'ok' :
|
||||||
|
debertaState === 'failed' ? 'degraded' :
|
||||||
|
'off';
|
||||||
|
}
|
||||||
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─── Model download + staging ────────────────────────────────
|
// ─── Model download + staging ────────────────────────────────
|
||||||
@@ -245,6 +280,89 @@ export async function scanPageContent(text: string): Promise<LayerSignal> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─── L4c: DeBERTa-v3 ensemble (opt-in) ───────────────────────
|
||||||
|
|
||||||
|
async function ensureDebertaStaged(onProgress?: (msg: string) => void): Promise<void> {
|
||||||
|
fs.mkdirSync(path.join(DEBERTA_DIR, 'onnx'), { recursive: true, mode: 0o700 });
|
||||||
|
for (const f of DEBERTA_FILES) {
|
||||||
|
const dst = path.join(DEBERTA_DIR, f);
|
||||||
|
if (fs.existsSync(dst)) continue;
|
||||||
|
onProgress?.(`deberta: downloading ${f}`);
|
||||||
|
await downloadFile(`${DEBERTA_HF_URL}/${f}`, dst);
|
||||||
|
}
|
||||||
|
const modelDst = path.join(DEBERTA_DIR, 'onnx', 'model.onnx');
|
||||||
|
if (!fs.existsSync(modelDst)) {
|
||||||
|
onProgress?.('deberta: downloading model.onnx (721MB) — first run only');
|
||||||
|
await downloadFile(`${DEBERTA_HF_URL}/model.onnx`, modelDst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let debertaLoadPromise: Promise<void> | null = null;
|
||||||
|
export function loadDeberta(onProgress?: (msg: string) => void): Promise<void> {
|
||||||
|
if (!isDebertaEnabled()) return Promise.resolve();
|
||||||
|
if (debertaState === 'loaded') return Promise.resolve();
|
||||||
|
if (debertaLoadPromise) return debertaLoadPromise;
|
||||||
|
debertaState = 'loading';
|
||||||
|
debertaLoadPromise = (async () => {
|
||||||
|
try {
|
||||||
|
await ensureDebertaStaged(onProgress);
|
||||||
|
onProgress?.('deberta: initializing classifier');
|
||||||
|
const { pipeline, env } = await import('@huggingface/transformers');
|
||||||
|
env.allowLocalModels = true;
|
||||||
|
env.allowRemoteModels = false;
|
||||||
|
env.localModelPath = MODELS_DIR;
|
||||||
|
debertaClassifier = await pipeline(
|
||||||
|
'text-classification',
|
||||||
|
'deberta-v3-injection',
|
||||||
|
{ dtype: 'fp32' },
|
||||||
|
);
|
||||||
|
const tok = debertaClassifier?.tokenizer as any;
|
||||||
|
if (tok?._tokenizerConfig) {
|
||||||
|
tok._tokenizerConfig.model_max_length = 512;
|
||||||
|
}
|
||||||
|
debertaState = 'loaded';
|
||||||
|
} catch (err: any) {
|
||||||
|
debertaState = 'failed';
|
||||||
|
debertaLoadError = err?.message ?? String(err);
|
||||||
|
console.error('[security-classifier] Failed to load DeBERTa-v3:', debertaLoadError);
|
||||||
|
}
|
||||||
|
})();
|
||||||
|
return debertaLoadPromise;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Scan text with the DeBERTa-v3 ensemble classifier. Returns a LayerSignal
|
||||||
|
* with layer='deberta_content'. No-op when ensemble is disabled — returns
|
||||||
|
* confidence=0 with meta.disabled=true so combineVerdict treats it as safe.
|
||||||
|
*/
|
||||||
|
export async function scanPageContentDeberta(text: string): Promise<LayerSignal> {
|
||||||
|
if (!isDebertaEnabled()) {
|
||||||
|
return { layer: 'deberta_content', confidence: 0, meta: { disabled: true } };
|
||||||
|
}
|
||||||
|
if (!text || text.length === 0) {
|
||||||
|
return { layer: 'deberta_content', confidence: 0 };
|
||||||
|
}
|
||||||
|
if (debertaState !== 'loaded') {
|
||||||
|
return { layer: 'deberta_content', confidence: 0, meta: { degraded: true } };
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
const plain = htmlToPlainText(text);
|
||||||
|
const input = plain.slice(0, 4000);
|
||||||
|
const raw = await debertaClassifier(input);
|
||||||
|
const top = Array.isArray(raw) ? raw[0] : raw;
|
||||||
|
const label = top?.label ?? 'SAFE';
|
||||||
|
const score = Number(top?.score ?? 0);
|
||||||
|
if (label === 'INJECTION') {
|
||||||
|
return { layer: 'deberta_content', confidence: score, meta: { label } };
|
||||||
|
}
|
||||||
|
return { layer: 'deberta_content', confidence: 0, meta: { label, safeScore: score } };
|
||||||
|
} catch (err: any) {
|
||||||
|
debertaState = 'failed';
|
||||||
|
debertaLoadError = err?.message ?? String(err);
|
||||||
|
return { layer: 'deberta_content', confidence: 0, meta: { degraded: true, error: debertaLoadError } };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ─── L4b: Claude Haiku transcript classifier ─────────────────
|
// ─── L4b: Claude Haiku transcript classifier ─────────────────
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ import {
|
|||||||
import {
|
import {
|
||||||
loadTestsavant, scanPageContent, checkTranscript,
|
loadTestsavant, scanPageContent, checkTranscript,
|
||||||
shouldRunTranscriptCheck, getClassifierStatus,
|
shouldRunTranscriptCheck, getClassifierStatus,
|
||||||
|
loadDeberta, scanPageContentDeberta,
|
||||||
type ToolCallInput,
|
type ToolCallInput,
|
||||||
} from './security-classifier';
|
} from './security-classifier';
|
||||||
|
|
||||||
@@ -457,11 +458,15 @@ async function preSpawnSecurityCheck(entry: QueueEntry): Promise<boolean> {
|
|||||||
if (!message || message.length === 0) return false;
|
if (!message || message.length === 0) return false;
|
||||||
const tid = tabId ?? 0;
|
const tid = tabId ?? 0;
|
||||||
|
|
||||||
// L4: scan the user message for direct injection patterns
|
// L4: scan the user message for direct injection patterns (TestSavantAI)
|
||||||
const contentSignal = await scanPageContent(message);
|
// L4c: also scan with DeBERTa-v3 when ensemble is enabled (opt-in)
|
||||||
const signals: LayerSignal[] = [contentSignal];
|
const [contentSignal, debertaSignal] = await Promise.all([
|
||||||
|
scanPageContent(message),
|
||||||
|
scanPageContentDeberta(message),
|
||||||
|
]);
|
||||||
|
const signals: LayerSignal[] = [contentSignal, debertaSignal];
|
||||||
|
|
||||||
// L4b: only bother with Haiku if L4 already lit up at >= LOG_ONLY.
|
// L4b: only bother with Haiku if another layer already lit up at >= LOG_ONLY.
|
||||||
// Saves ~70% of Haiku calls per plan §E1 "gating optimization".
|
// Saves ~70% of Haiku calls per plan §E1 "gating optimization".
|
||||||
if (shouldRunTranscriptCheck(signals)) {
|
if (shouldRunTranscriptCheck(signals)) {
|
||||||
const transcriptSignal = await checkTranscript({
|
const transcriptSignal = await checkTranscript({
|
||||||
@@ -593,10 +598,16 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
|
|||||||
const toolResultScanCtx: ToolResultScanContext = {
|
const toolResultScanCtx: ToolResultScanContext = {
|
||||||
scan: async (toolName: string, text: string) => {
|
scan: async (toolName: string, text: string) => {
|
||||||
if (toolResultBlockFired) return;
|
if (toolResultBlockFired) return;
|
||||||
const contentSignal = await scanPageContent(text);
|
// Parallel L4 + L4c ensemble scan (DeBERTa no-op when disabled).
|
||||||
if (contentSignal.confidence < THRESHOLDS.WARN) return;
|
const [contentSignal, debertaSignal] = await Promise.all([
|
||||||
// Signal crossed WARN — see if ensemble upgrades to BLOCK.
|
scanPageContent(text),
|
||||||
const signals: LayerSignal[] = [contentSignal];
|
scanPageContentDeberta(text),
|
||||||
|
]);
|
||||||
|
// Short-circuit if neither content layer crossed WARN — no point
|
||||||
|
// spinning up Haiku for a clean scan.
|
||||||
|
const maxContent = Math.max(contentSignal.confidence, debertaSignal.confidence);
|
||||||
|
if (maxContent < THRESHOLDS.WARN) return;
|
||||||
|
const signals: LayerSignal[] = [contentSignal, debertaSignal];
|
||||||
if (shouldRunTranscriptCheck(signals)) {
|
if (shouldRunTranscriptCheck(signals)) {
|
||||||
signals.push(await checkTranscript({
|
signals.push(await checkTranscript({
|
||||||
user_message: queueEntry.message ?? '',
|
user_message: queueEntry.message ?? '',
|
||||||
@@ -809,6 +820,12 @@ async function main() {
|
|||||||
console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
|
console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
|
||||||
console.log(`[sidebar-agent] Browse binary: ${B}`);
|
console.log(`[sidebar-agent] Browse binary: ${B}`);
|
||||||
|
|
||||||
|
// If GSTACK_SECURITY_ENSEMBLE=deberta is set, also warm the DeBERTa-v3
|
||||||
|
// ensemble classifier. Fire-and-forget alongside TestSavantAI — they
|
||||||
|
// warm in parallel. No-op when the env var is unset.
|
||||||
|
loadDeberta((msg) => console.log(`[security-classifier] ${msg}`))
|
||||||
|
.catch((err) => console.warn('[sidebar-agent] DeBERTa warmup failed:', err?.message));
|
||||||
|
|
||||||
// Warm up the ML classifier in the background. First call triggers a 112MB
|
// Warm up the ML classifier in the background. First call triggers a 112MB
|
||||||
// download (~30s on average broadband). Non-blocking — the sidebar stays
|
// download (~30s on average broadband). Non-blocking — the sidebar stays
|
||||||
// functional on cold start; classifier just reports 'off' until warmed.
|
// functional on cold start; classifier just reports 'off' until warmed.
|
||||||
|
|||||||
Reference in New Issue
Block a user