mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 11:45:20 +02:00
27954de0b0
Pure-function tests for security-classifier.ts that don't need a model
download, claude CLI, or network. Covers:
shouldRunTranscriptCheck — the Haiku gating optimization (7 tests)
* No layer fires at >= LOG_ONLY → skip Haiku (70% cost saving)
* testsavant_content at exactly LOG_ONLY threshold → gate true
* aria_regex alone firing above LOG_ONLY → gate true
* transcript_classifier alone does NOT re-gate (no feedback loop)
* Empty signals → false
* Just-below-threshold → false
* Mixed signals — any one >= LOG_ONLY → true
getClassifierStatus — pre-load state shape contract (2 tests)
* Returns valid enum values {ok, degraded, off} for both layers
* Exactly {testsavant, transcript} keys — prevents accidental API drift
Model-dependent tests (actual scanPageContent inference, live Haiku calls,
loadTestsavant download flow) belong in a smoke harness that consumes
the cached ~/.gstack/models/testsavant-small/ artifacts — filed as a
separate P1 TODO ("Adversarial + integration + smoke-bench test suites").
Full security suite now 156 tests / 287 expectations, 112ms.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
92 lines
3.7 KiB
TypeScript
92 lines
3.7 KiB
TypeScript
/**
 * Unit tests for browse/src/security-classifier.ts pure functions.
 *
 * Scope: functions that do NOT require model download, claude CLI, or
 * network access. Model-dependent behavior (loadTestsavant inference,
 * checkTranscript Haiku calls) belongs in a smoke harness that pulls
 * the cached model — filed as a P1 follow-up.
 */
|
|
|
|
import { describe, test, expect } from 'bun:test';
|
|
import {
|
|
shouldRunTranscriptCheck,
|
|
getClassifierStatus,
|
|
} from '../src/security-classifier';
|
|
import { THRESHOLDS, type LayerSignal } from '../src/security';
|
|
|
|
/**
 * Gate tests for shouldRunTranscriptCheck.
 *
 * Each case hands the function a list of layer signals and pins the boolean
 * it returns: whether the Haiku transcript check should run for this call.
 */
describe('shouldRunTranscriptCheck — Haiku gating optimization', () => {
  test('returns false when no layer has fired at >= LOG_ONLY', () => {
    // A clean pre-tool-call: every content layer reports zero confidence.
    // Skipping Haiku in this situation is the 70% savings from plan §E1.
    const quietLayers: Array<LayerSignal> = [
      { layer: 'testsavant_content', confidence: 0 },
      { layer: 'aria_regex', confidence: 0 },
    ];

    const gate = shouldRunTranscriptCheck(quietLayers);

    expect(gate).toBe(false);
  });

  test('returns true when testsavant_content fires at LOG_ONLY threshold', () => {
    // Sitting exactly on the threshold (0.40) must already trigger the
    // Haiku follow-up — the comparison is inclusive.
    const atThreshold: Array<LayerSignal> = [
      { layer: 'testsavant_content', confidence: THRESHOLDS.LOG_ONLY },
    ];

    const gate = shouldRunTranscriptCheck(atThreshold);

    expect(gate).toBe(true);
  });

  test('returns true when aria_regex alone fires above LOG_ONLY', () => {
    // A regex hit by itself is suspicious enough to ask Haiku for a
    // second opinion.
    const regexOnly: Array<LayerSignal> = [{ layer: 'aria_regex', confidence: 0.6 }];

    const gate = shouldRunTranscriptCheck(regexOnly);

    expect(gate).toBe(true);
  });

  test('does NOT gate on transcript_classifier itself (no recursion)', () => {
    // A transcript-classifier signal (e.g. left over from a prior tool call)
    // must not, on its own, re-trigger Haiku for the new tool call; a fresh
    // content signal is required. Otherwise a single Haiku hit would gate
    // every later call — a feedback loop.
    const transcriptOnly: Array<LayerSignal> = [
      { layer: 'transcript_classifier', confidence: 0.9 },
    ];

    const gate = shouldRunTranscriptCheck(transcriptOnly);

    expect(gate).toBe(false);
  });

  test('empty signals list returns false (no reason to call Haiku)', () => {
    const gate = shouldRunTranscriptCheck([]);

    expect(gate).toBe(false);
  });

  test('confidence just below LOG_ONLY → false', () => {
    // One hundredth under the threshold stays below the gate.
    const belowThreshold: Array<LayerSignal> = [
      { layer: 'testsavant_content', confidence: THRESHOLDS.LOG_ONLY - 0.01 },
    ];

    const gate = shouldRunTranscriptCheck(belowThreshold);

    expect(gate).toBe(false);
  });

  test('mixed low signals — any one >= LOG_ONLY gates true', () => {
    // One layer well under the threshold, one just above it: a single
    // qualifying layer is enough to open the gate.
    const mixed: Array<LayerSignal> = [
      { layer: 'testsavant_content', confidence: 0.1 },
      { layer: 'aria_regex', confidence: 0.45 }, // just above LOG_ONLY
    ];

    const gate = shouldRunTranscriptCheck(mixed);

    expect(gate).toBe(true);
  });
});
|
|
|
|
/**
 * Shape-contract tests for getClassifierStatus.
 *
 * These tests deliberately do NOT pin a specific status value: module state
 * may already have been changed by another test file that loaded the
 * classifier, so only the set of allowed values and the exact key set of the
 * returned object are asserted.
 */
describe('getClassifierStatus — pre-load state', () => {
  // Every value a status field is allowed to take.
  const VALID_STATUSES = ['ok', 'degraded', 'off'];

  // Previously titled "returns testsavant=off before loadTestsavant has been
  // called", which promised more than the assertions check. The title now
  // matches what is actually verified.
  test('reports a valid status value for both testsavant and transcript', () => {
    // NOTE(review): in fresh-module state both fields are presumably 'off',
    // but that cannot be asserted reliably here — if another test has
    // already loaded the classifier the value may differ. Enum membership
    // is the strongest check that holds regardless of test ordering.
    const status = getClassifierStatus();

    expect(VALID_STATUSES).toContain(status.testsavant);
    expect(VALID_STATUSES).toContain(status.transcript);
  });

  test('status shape contract — exactly two keys', () => {
    // Guards against accidental API drift: adding or removing a field on
    // the status object fails here.
    const status = getClassifierStatus();

    expect(Object.keys(status).sort()).toEqual(['testsavant', 'transcript']);
  });
});
|