feat(security): wire TestSavantAI + ensemble into sidebar-agent pre-spawn scan

The sidebar-agent now runs an ML security check on the user message BEFORE
spawning claude. If the content classifier and (gated) transcript classifier
ensemble returns BLOCK, the session is refused with a security_event +
agent_error — the sidepanel renders the approved banner.

Two pieces:

  1. On agent startup, loadTestsavant() warms the classifier in the background.
     First run triggers a 112MB model download from HuggingFace (~30s on
     average broadband). Non-blocking — sidebar stays functional during
     cold-start, shield just reports 'off' until warmed.

  2. preSpawnSecurityCheck() runs the ensemble against the user message:
       - L4 (testsavant_content) always runs
       - L4b (transcript_classifier via Haiku) runs only if L4 flagged at
         >= LOG_ONLY — plan §E1 gating optimization, saves ~70% of Haiku spend
     combineVerdict() applies the BLOCK-requires-both-layers rule, which
     downgrades any single-layer high confidence to WARN. False positives that
     TestSavantAI alone raises on Stack Overflow-style, instruction-heavy
     writing are absorbed by this downgrade — Haiku corrects them when called.

Fail-open everywhere: any subprocess/load/inference error returns confidence=0
so the sidebar keeps working on architectural controls alone. Shield icon
reflects degraded state via getClassifierStatus().

BLOCK path emits both:
  - security_event {verdict, reason, layer, confidence, domain}  (for the
    approved canary-leak banner UX mockup — variant A)
  - agent_error "Session blocked — prompt injection detected..."
    (backward-compat with existing error surface)

Regression test suite still passes (12/12 sidebar-security tests).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-19 19:05:37 +08:00
parent 63a56e6789
commit 750161bbbe
+88 -1
View File
@@ -13,7 +13,15 @@ import { spawn } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import { safeUnlink } from './error-handling';
import { checkCanaryInStructure, logAttempt, hashPayload, extractDomain } from './security';
import {
checkCanaryInStructure, logAttempt, hashPayload, extractDomain,
combineVerdict, type LayerSignal,
} from './security';
import {
loadTestsavant, scanPageContent, checkTranscript,
shouldRunTranscriptCheck, getClassifierStatus,
type ToolCallInput,
} from './security-classifier';
// Location of the sidebar-agent queue file. SIDEBAR_QUEUE_PATH overrides the
// default of <HOME>/.gstack/sidebar-agent-queue.jsonl; '/tmp' stands in for
// HOME when that variable is unset or empty.
const QUEUE = process.env.SIDEBAR_QUEUE_PATH || path.join(process.env.HOME || '/tmp', '.gstack', 'sidebar-agent-queue.jsonl');
// Sentinel file that lives in the same directory as the queue file.
// NOTE(review): presumably the file pollKillFile watches — confirm against its body.
const KILL_FILE = path.join(path.dirname(QUEUE), 'sidebar-agent-kill');
@@ -370,6 +378,68 @@ async function onCanaryLeaked(params: {
}, tabId);
}
/**
 * Pre-spawn ML scan of the user message. If the classifier ensemble returns a
 * BLOCK verdict, we log the attempt, emit a security_event plus an agent_error
 * to the sidepanel, and tell the caller NOT to spawn claude.
 *
 * Fail-open: an empty message, a non-block verdict, or ANY exception thrown
 * while classifying returns false so the sidebar keeps working. Letting a
 * classifier exception escape would reject the caller's promise after it has
 * already marked the tab as processing. The architectural controls (XML
 * framing + command allowlist, which live in server.ts:554-577) still defend.
 *
 * @param entry Queue entry for the pending session; `message`, `pageUrl` and
 *   `tabId` are read from it.
 * @returns true if the scan blocked the session (caller must not spawn),
 *   false otherwise.
 */
async function preSpawnSecurityCheck(entry: QueueEntry): Promise<boolean> {
  const { message, pageUrl, tabId } = entry;
  if (!message || message.length === 0) return false;
  const tid = tabId ?? 0;

  const signals: LayerSignal[] = [];
  let result: ReturnType<typeof combineVerdict>;
  try {
    // L4: scan the user message for direct injection patterns.
    signals.push(await scanPageContent(message));

    // L4b: the Haiku transcript check is gated on the L4 result. Per plan §E1
    // ("gating optimization") it only runs when L4 already lit up at
    // >= LOG_ONLY, which saves ~70% of Haiku calls.
    if (shouldRunTranscriptCheck(signals)) {
      const transcriptSignal = await checkTranscript({
        user_message: message,
        tool_calls: [], // no tool calls yet at session start
      });
      signals.push(transcriptSignal);
    }

    result = combineVerdict(signals);
  } catch (err: unknown) {
    // Honor the fail-open contract: a broken classifier must never wedge the
    // sidebar. Report the failure and let the session proceed.
    const detail = err instanceof Error ? err.message : String(err);
    console.warn(`[sidebar-agent] Pre-spawn security scan failed (fail-open) for tab ${tid}: ${detail}`);
    return false;
  }

  if (result.verdict !== 'block') return false;

  // BLOCK verdict. Log + emit + refuse to spawn.
  const domain = extractDomain(pageUrl ?? '');
  // Attribute the block to whichever layer reported the highest confidence.
  // `signals` always holds at least the L4 signal here, so reduce() is safe
  // without an initial value.
  const leaderSignal = signals.reduce((a, b) => (a.confidence > b.confidence ? a : b));

  logAttempt({
    ts: new Date().toISOString(),
    urlDomain: domain,
    payloadHash: hashPayload(message),
    confidence: result.confidence,
    layer: leaderSignal.layer,
    verdict: 'block',
  });
  console.warn(`[sidebar-agent] Pre-spawn BLOCK (${result.reason}) for tab ${tid}, confidence=${result.confidence.toFixed(3)}`);

  // Structured event for the sidepanel's security banner.
  await sendEvent({
    type: 'security_event',
    verdict: 'block',
    reason: result.reason ?? 'ml_classifier',
    layer: leaderSignal.layer,
    confidence: result.confidence,
    domain,
  }, tid);
  // Also emit an agent_error alongside the structured event.
  await sendEvent({
    type: 'agent_error',
    error: `Session blocked — prompt injection detected${domain ? ` from ${domain}` : ' in your message'}`,
  }, tid);
  return true;
}
async function askClaude(queueEntry: QueueEntry): Promise<void> {
const { prompt, args, stateFile, cwd, tabId, canary, pageUrl } = queueEntry;
const tid = tabId ?? 0;
@@ -377,6 +447,13 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
processingTabs.add(tid);
await sendEvent({ type: 'agent_start' }, tid);
// Pre-spawn ML scan: if the user message trips the ensemble, refuse to
// spawn claude. Fail-open on classifier errors.
if (await preSpawnSecurityCheck(queueEntry)) {
processingTabs.delete(tid);
return;
}
return new Promise((resolve) => {
// Canary context is set after proc is spawned (needs proc reference for kill).
let canaryCtx: CanaryContext | undefined;
@@ -616,6 +693,16 @@ async function main() {
console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
console.log(`[sidebar-agent] Browse binary: ${B}`);
// Warm up the ML classifier in the background. First call triggers a 112MB
// download (~30s on average broadband). Non-blocking — the sidebar stays
// functional on cold start; classifier just reports 'off' until warmed.
loadTestsavant((msg) => console.log(`[security-classifier] ${msg}`))
.then(() => {
const s = getClassifierStatus();
console.log(`[sidebar-agent] Classifier warmup complete: ${JSON.stringify(s)}`);
})
.catch((err) => console.warn('[sidebar-agent] Classifier warmup failed (degraded mode):', err?.message));
setInterval(poll, POLL_MS);
setInterval(pollKillFile, POLL_MS);
}