mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
feat(security): 3-way ensemble verdict combiner with deberta_content layer
Updates combineVerdict to support a third ML signal layer (deberta_content)
for the opt-in DeBERTa-v3 ensemble. The rule becomes:
* Canary leak → BLOCK (unchanged, deterministic)
* 2-of-N ML classifiers >= WARN → BLOCK (ensemble_agreement)
- N = 2 when DeBERTa disabled (testsavant + transcript)
- N = 3 when DeBERTa enabled (adds deberta)
* Any single layer >= BLOCK without cross-confirm → WARN (single_layer_high)
* Any single layer >= WARN without cross-confirm → WARN (single_layer_medium)
* Any layer >= LOG_ONLY → log_only
* Otherwise → safe
Backward compatible: when the DeBERTa signal has confidence 0 (meta.disabled
or absent entirely), the combiner treats it like any other low-confidence layer.
The existing 2-of-2 ensemble path still fires for testsavant + transcript.
BLOCK confidence reports the MIN of the WARN+ layers — the most conservative
estimate of the agreed-upon signal strength — rather than the max.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+21
-10
@@ -42,6 +42,7 @@ export type Verdict = 'safe' | 'log_only' | 'warn' | 'block';
|
|||||||
|
|
||||||
export type LayerName =
|
export type LayerName =
|
||||||
| 'testsavant_content'
|
| 'testsavant_content'
|
||||||
|
| 'deberta_content' // opt-in ensemble layer (GSTACK_SECURITY_ENSEMBLE=deberta)
|
||||||
| 'transcript_classifier'
|
| 'transcript_classifier'
|
||||||
| 'aria_regex'
|
| 'aria_regex'
|
||||||
| 'canary';
|
| 'canary';
|
||||||
@@ -88,6 +89,7 @@ export function combineVerdict(signals: LayerSignal[]): SecurityResult {
|
|||||||
byLayer[s.layer] = Math.max(byLayer[s.layer] ?? 0, s.confidence);
|
byLayer[s.layer] = Math.max(byLayer[s.layer] ?? 0, s.confidence);
|
||||||
}
|
}
|
||||||
const content = byLayer.testsavant_content ?? 0;
|
const content = byLayer.testsavant_content ?? 0;
|
||||||
|
const deberta = byLayer.deberta_content ?? 0;
|
||||||
const transcript = byLayer.transcript_classifier ?? 0;
|
const transcript = byLayer.transcript_classifier ?? 0;
|
||||||
const canary = byLayer.canary ?? 0;
|
const canary = byLayer.canary ?? 0;
|
||||||
|
|
||||||
@@ -101,44 +103,53 @@ export function combineVerdict(signals: LayerSignal[]): SecurityResult {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensemble: both ML classifiers agree -> BLOCK.
|
// ML signals at >= WARN. Count how many agree.
|
||||||
if (content >= THRESHOLDS.WARN && transcript >= THRESHOLDS.WARN) {
|
const mlHighSignals = [content, deberta, transcript].filter(c => c >= THRESHOLDS.WARN);
|
||||||
|
const hasDebertaSignal = deberta > 0;
|
||||||
|
|
||||||
|
// Ensemble rule:
|
||||||
|
// * 2-of-3 ML classifiers >= WARN → BLOCK when DeBERTa is in the pool
|
||||||
|
// * 2-of-2 (content + transcript) >= WARN → BLOCK when DeBERTa disabled
|
||||||
|
// In both cases, cross-model agreement is what upgrades from WARN to BLOCK.
|
||||||
|
const requiredForBlock = hasDebertaSignal ? 2 : 2;
|
||||||
|
if (mlHighSignals.length >= requiredForBlock) {
|
||||||
return {
|
return {
|
||||||
verdict: 'block',
|
verdict: 'block',
|
||||||
reason: 'ensemble_agreement',
|
reason: 'ensemble_agreement',
|
||||||
signals,
|
signals,
|
||||||
confidence: Math.min(content, transcript),
|
confidence: Math.min(...mlHighSignals),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Single layer >= BLOCK (no cross-confirm) degrades to WARN to avoid FPs.
|
// Single layer >= BLOCK (no cross-confirm) degrades to WARN to avoid FPs.
|
||||||
if (content >= THRESHOLDS.BLOCK || transcript >= THRESHOLDS.BLOCK) {
|
const maxMl = Math.max(content, deberta, transcript);
|
||||||
|
if (maxMl >= THRESHOLDS.BLOCK) {
|
||||||
return {
|
return {
|
||||||
verdict: 'warn',
|
verdict: 'warn',
|
||||||
reason: 'single_layer_high',
|
reason: 'single_layer_high',
|
||||||
signals,
|
signals,
|
||||||
confidence: Math.max(content, transcript),
|
confidence: maxMl,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
if (content >= THRESHOLDS.WARN || transcript >= THRESHOLDS.WARN) {
|
if (maxMl >= THRESHOLDS.WARN) {
|
||||||
return {
|
return {
|
||||||
verdict: 'warn',
|
verdict: 'warn',
|
||||||
reason: 'single_layer_medium',
|
reason: 'single_layer_medium',
|
||||||
signals,
|
signals,
|
||||||
confidence: Math.max(content, transcript),
|
confidence: maxMl,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
if (content >= THRESHOLDS.LOG_ONLY || transcript >= THRESHOLDS.LOG_ONLY) {
|
if (maxMl >= THRESHOLDS.LOG_ONLY) {
|
||||||
return {
|
return {
|
||||||
verdict: 'log_only',
|
verdict: 'log_only',
|
||||||
signals,
|
signals,
|
||||||
confidence: Math.max(content, transcript),
|
confidence: maxMl,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
return { verdict: 'safe', signals, confidence: Math.max(content, transcript) };
|
return { verdict: 'safe', signals, confidence: maxMl };
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─── Canary (session-scoped secret token) ────────────────────
|
// ─── Canary (session-scoped secret token) ────────────────────
|
||||||
|
|||||||
Reference in New Issue
Block a user