From d5253215c55a32359ec52fad310710e7bf6cbea3 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Mon, 20 Apr 2026 04:50:53 +0800
Subject: [PATCH] fix(security-classifier): truncation + HTML preprocessing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two real bugs found by the BrowseSafe-Bench smoke harness.

1. Truncation wasn't happening. The TextClassificationPipeline in
transformers.js v4 calls the tokenizer with `{ padding: true,
truncation: true }` — but truncation needs a max_length, which it reads
from tokenizer.model_max_length. TestSavantAI ships with
model_max_length set to 1e18 (a common "infinity" placeholder in HF
configs) so no truncation actually occurs. Inputs longer than 512
tokens (the BERT-small context limit) crash ONNXRuntime with a
broadcast-dimension error.

Fix: override tokenizer._tokenizerConfig.model_max_length = 512 right
after pipeline load. The getter now returns the real limit and the
implicit truncation: true in the pipeline actually clips inputs.

2. Classifier was receiving raw HTML. TestSavantAI is trained on
natural language, not markup. Feeding it a blob of
markup dilutes the injection signal with tag noise. When the Perplexity
BrowseSafe-Bench fixture has an attack buried inside HTML, the
classifier said SAFE at confidence 0 across the board.

Fix: added htmlToPlainText() that strips tags, drops script/style
bodies, decodes common entities, and collapses whitespace.
scanPageContent now normalizes input through this before handing to
the classifier.

Result: BrowseSafe-Bench smoke runs without errors. Detection rate is
only 15% at WARN=0.6 (see bench test docstring for why — TestSavantAI
wasn't trained on this distribution). Ensemble with Haiku transcript
classifier filters FPs in prod; DeBERTa-v3 ensemble is a tracked P2
improvement.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 browse/src/security-classifier.ts | 47 ++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/browse/src/security-classifier.ts b/browse/src/security-classifier.ts
index 6478eaed..62493e56 100644
--- a/browse/src/security-classifier.ts
+++ b/browse/src/security-classifier.ts
@@ -157,6 +157,17 @@ export function loadTestsavant(onProgress?: (msg: string) => void): Promise<...> {
[NOTE(review): the body of this hunk was lost to angle-bracket
stripping in transit. Per the commit message above, it sets
tokenizer._tokenizerConfig.model_max_length = 512 immediately after the
pipeline loads. Recover the exact lines from the original commit
d5253215 before applying. The Promise<...> generics in the hunk
headers below were stripped the same way — placeholders, not source.]
@@ ... @@ export function loadTestsavant(onProgress?: (msg: string) => void): Promise<...> {
+function htmlToPlainText(html: string): string {
+  return html
+    .replace(/<(script|style)[^>]*>[\s\S]*?<\/\1>/gi, ' ') // drop script/style bodies entirely
+    .replace(/<[^>]+>/g, ' ') // drop tags
+    .replace(/&nbsp;/g, ' ')
+    .replace(/&amp;/g, '&')
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&quot;/g, '"')
+    .replace(/\s+/g, ' ')
+    .trim();
+}
+
 export async function scanPageContent(text: string): Promise<...> {
   if (!text || text.length === 0) {
     return { layer: 'testsavant_content', confidence: 0 };
   }
@@ -187,10 +220,16 @@ export async function scanPageContent(text: string): Promise<...> {
     return { layer: 'testsavant_content', confidence: 0, meta: { degraded: true } };
   }
   try {
-    // Classify only the first 512 tokens worth of text (~2000 chars).
-    // Longer inputs get truncated by the tokenizer anyway, but explicit
-    // slicing avoids token-overflow warnings.
-    const input = text.slice(0, 2000);
+    // Normalize to plain text first — the classifier is trained on natural
+    // language, not HTML markup. A page with an injection buried in tag
+    // soup won't fire until we strip the noise.
+    const plain = htmlToPlainText(text);
+    // Character-level cap to avoid pathological memory use. The pipeline
+    // applies tokenizer truncation at 512 tokens (the BERT-small context
+    // limit — enforced via the model_max_length override in loadTestsavant)
+    // so the 4000-char cap is just a cheap upper bound. Real-world
+    // injection signals land in the first few hundred tokens anyway.
+    const input = plain.slice(0, 4000);
     const raw = await testsavantClassifier(input);
     const top = Array.isArray(raw) ? raw[0] : raw;
     const label = top?.label ?? 'SAFE';