From d5253215c55a32359ec52fad310710e7bf6cbea3 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Mon, 20 Apr 2026 04:50:53 +0800
Subject: [PATCH] fix(security-classifier): truncation + HTML preprocessing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two real bugs found by the BrowseSafe-Bench smoke harness.

1. Truncation wasn't happening. The TextClassificationPipeline in
transformers.js v4 calls the tokenizer with `{ padding: true,
truncation: true }` — but truncation needs a max_length, which it reads
from tokenizer.model_max_length. TestSavantAI ships with
model_max_length set to 1e18 (a common "infinity" placeholder in HF
configs) so no truncation actually occurs. Inputs longer than 512
tokens (the BERT-small context limit) crash ONNXRuntime with a
broadcast-dimension error.

Fix: override tokenizer._tokenizerConfig.model_max_length = 512 right
after pipeline load. The getter now returns the real limit and the
implicit truncation: true in the pipeline actually clips inputs.

2. Classifier was receiving raw HTML. TestSavantAI is trained on
natural language, not markup. Feeding it a blob of
markup dilutes the injection signal with tag noise. When the Perplexity
BrowseSafe-Bench fixture has an attack buried inside HTML, the
classifier said SAFE at confidence 0 across the board.

Fix: added htmlToPlainText() that strips tags, drops script/style
bodies, decodes common entities, and collapses whitespace.
scanPageContent now normalizes input through this before handing to
the classifier.

Result: BrowseSafe-Bench smoke runs without errors. Detection rate is
only 15% at WARN=0.6 (see bench test docstring for why — TestSavantAI
wasn't trained on this distribution). Ensemble with Haiku transcript
classifier filters FPs in prod; DeBERTa-v3 ensemble is a tracked P2
improvement.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 browse/src/security-classifier.ts | 47 ++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/browse/src/security-classifier.ts b/browse/src/security-classifier.ts
index 6478eaed..62493e56 100644
--- a/browse/src/security-classifier.ts
+++ b/browse/src/security-classifier.ts
@@ -157,6 +157,17 @@ export function loadTestsavant(onProgress?: (msg: string) => void): Promise<...> {
[NOTE(review): the body of this hunk was lost to angle-bracket
stripping in transit. Per the commit message above, it sets
tokenizer._tokenizerConfig.model_max_length = 512 immediately after the
pipeline loads. Recover the exact lines from the original commit
d5253215 before applying. The Promise<...> generics in the hunk
headers below were stripped the same way — placeholders, not source.]
@@ ... @@ export function loadTestsavant(onProgress?: (msg: string) => void): Promise<...> {
+function htmlToPlainText(html: string): string {
+  return html
+    .replace(/<(script|style)[^>]*>[\s\S]*?<\/\1>/gi, ' ') // drop script/style bodies entirely
+    .replace(/<[^>]+>/g, ' ') // drop tags
+    .replace(/&nbsp;/g, ' ')
+    .replace(/&amp;/g, '&')
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&quot;/g, '"')
+    .replace(/\s+/g, ' ')
+    .trim();
+}
+
 export async function scanPageContent(text: string): Promise<...> {
   if (!text || text.length === 0) {
     return { layer: 'testsavant_content', confidence: 0 };
   }
@@ -187,10 +220,16 @@ export async function scanPageContent(text: string): Promise<...> {
     return { layer: 'testsavant_content', confidence: 0, meta: { degraded: true } };
   }
   try {
-    // Classify only the first 512 tokens worth of text (~2000 chars).
-    // Longer inputs get truncated by the tokenizer anyway, but explicit
-    // slicing avoids token-overflow warnings.
-    const input = text.slice(0, 2000);
+    // Normalize to plain text first — the classifier is trained on natural
+    // language, not HTML markup. A page with an injection buried in tag
+    // soup won't fire until we strip the noise.
+    const plain = htmlToPlainText(text);
+    // Character-level cap to avoid pathological memory use. The pipeline
+    // applies tokenizer truncation at 512 tokens (the BERT-small context
+    // limit — enforced via the model_max_length override in loadTestsavant)
+    // so the 4000-char cap is just a cheap upper bound. Real-world
+    // injection signals land in the first few hundred tokens anyway.
+    const input = plain.slice(0, 4000);
     const raw = await testsavantClassifier(input);
     const top = Array.isArray(raw) ? raw[0] : raw;
     const label = top?.label ?? 'SAFE';