harness: reduce false positives (robust verdicts, severity quorum, refute pass)

- Robust verdict parsing (pool::parse_verdict): whitespace-insensitive, checks
  explicit rejection first, counts only explicit confirmations; ambiguous →
  Unclear (not confirmed). Replaces the fragile exact-JSON / loose "yes" match.
- Severity-aware quorum (pool::quorum_confirmed): when two or more validators
  reply, High/Critical now need ≥2/3 agreement (one vote out of several can no
  longer confirm a Critical); lower severities need a strict majority (>half,
  was ≥half). When only one validator replies (single-model panel), the strict-
  majority rule applies to every severity, so such panels can still confirm.
- Adversarial refute pass (REFUTE_SYS): every confirmed High/Critical is
  re-examined by a skeptical panel that assumes false-positive; findings that
  can't win a strict majority of the skeptics are dropped. If the panel returns
  no votes at all (infra failure), the finding is kept rather than dropped.
- Strengthened VOTE_SYS with an explicit false-positive checklist (reflected-not-
  executed, version/banner guesses, self-XSS, error-as-injection, thin evidence,
  inflated severity); validator query now also includes impact.
- Unit tests for parse_verdict + quorum_confirmed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
CyberSecurityUP
2026-07-01 17:33:15 -03:00
parent 0a181782a4
commit e9f81c164d
2 changed files with 138 additions and 10 deletions
+43 -4
View File
@@ -60,7 +60,11 @@ fn tool_doctrine(mcp_on: bool) -> String {
Use only what is installed; degrade gracefully. Never run destructive or DoS actions.\n\n"
)
}
const VOTE_SYS: &str = "You are an adversarial security validator. Decide if the candidate finding is a REAL, reproducible, exploitable vulnerability with proof. Reply with JSON {\"verdict\":\"confirmed\"|\"rejected\",\"reason\":\"...\"}. Default to rejected when uncertain.";
/// System prompt for validators judging a dynamically discovered candidate
/// finding. It lists the common false-positive patterns to reject, asks for a
/// JSON reply of the form `{"verdict":"confirmed"|"rejected","reason":"..."}`,
/// and tells the model to default to `rejected` when uncertain.
const VOTE_SYS: &str = "You are an adversarial security validator. Decide if the candidate finding is a REAL, reproducible, exploitable vulnerability whose EVIDENCE actually proves impact. Reject common false positives: input merely reflected but not executed; version/banner guesses with no working PoC; self-XSS; theoretical issues; an error message or stack trace mistaken for injection; missing, generic, or non-reproducible evidence; severity inflated beyond what the evidence demonstrates. Confirm only if the provided evidence (request/response) concretely proves the vulnerability. Reply with JSON {\"verdict\":\"confirmed\"|\"rejected\",\"reason\":\"...\"}. Default to rejected when uncertain.";
/// System prompt for the adversarial second pass over High/Critical findings:
/// the reviewer is told to assume a false positive until the evidence forces
/// otherwise. A finding that can't withstand the skeptics is dropped.
///
/// The reply uses the same JSON verdict shape as the other validator prompts;
/// here `confirmed` means "the vulnerability is real and proven by the
/// evidence", not "the refutation succeeded".
const REFUTE_SYS: &str = "You are a skeptical senior reviewer trying to DISPROVE a reported vulnerability. Assume it is a FALSE POSITIVE unless the evidence forces otherwise. Scrutinize: does the evidence PROVE execution/impact, or only that input was reflected/accepted? Is there a real working PoC, or just a version/banner/theory? Could it be self-XSS, an error message, or an unreachable path? Reply JSON {\"verdict\":\"confirmed\"|\"rejected\",\"reason\":\"...\"} where confirmed means the vulnerability is REAL and proven by the evidence. When in doubt, reject.";
/// System prompt for validators judging a finding reported against source
/// code (passed to `validate` by the white-box run). Asks for a JSON reply of
/// the form `{"verdict":"confirmed"|"rejected","reason":"..."}`.
const CODE_VOTE_SYS: &str = "You are an adversarial source-code reviewer. Decide if the reported issue is a REAL vulnerability in the provided code (reachable, exploitable, not a false positive). Reply JSON {\"verdict\":\"confirmed\"|\"rejected\",\"reason\":\"...\"}.";
/// ReAct loop directive: make the agent reason → act with a tool → observe →
@@ -225,6 +229,7 @@ pub async fn run(cfg: RunConfig, lib: &Library, pool: &ModelPool, tx: Sender<Str
findings.extend(extra);
findings = dedup_findings(findings);
}
let findings = refute_pass(findings, pool, cfg.vote_n, &tx).await;
finish(cfg, lib, recon, transcript, findings, selected, &mut rl, tx).await
}
@@ -286,6 +291,7 @@ pub async fn run_whitebox(cfg: RunConfig, lib: &Library, pool: &ModelPool, tx: S
let candidates = dedup_findings(raw.iter().flat_map(|(_, _, f)| f.clone()).collect());
let _ = tx.send(format!("{} candidate finding(s) (deduped) — validating", candidates.len())).await;
let findings = validate(candidates, pool, CODE_VOTE_SYS, cfg.vote_n, &tx).await;
let findings = refute_pass(findings, pool, cfg.vote_n, &tx).await;
finish(cfg, lib, "{}".into(), transcript, findings, selected, &mut rl, tx).await
}
@@ -428,6 +434,7 @@ pub async fn run_greybox(cfg: RunConfig, lib: &Library, pool: &ModelPool, tx: Se
findings.extend(extra);
findings = dedup_findings(findings);
}
let findings = refute_pass(findings, pool, cfg.vote_n, &tx).await;
finish(cfg, lib, recon, transcript, findings, selected, &mut rl, tx).await
}
@@ -603,11 +610,11 @@ async fn validate(candidates: Vec<Finding>, pool: &ModelPool, sys: &str, vote_n:
let finder = finder.clone();
async move {
let q = format!(
"Finding: {} | severity {} | {} | at {} | payload {} | evidence {}",
f.title, f.severity, f.cwe, f.endpoint, f.payload, f.evidence
"Finding: {} | severity {} | {} | at {} | payload {} | evidence {} | impact {}",
f.title, f.severity, f.cwe, f.endpoint, f.payload, f.evidence, f.impact
);
let (yes, total) = pool.vote(sys, &q, vote_n, finder.as_deref()).await;
f.validated = total > 0 && yes * 2 >= total;
f.validated = crate::pool::quorum_confirmed(&f.severity, yes, total);
f.votes = format!("{yes}/{total}");
if f.confidence == 0.0 && total > 0 {
f.confidence = yes as f64 / total as f64;
@@ -622,6 +629,37 @@ async fn validate(candidates: Vec<Finding>, pool: &ModelPool, sys: &str, vote_n:
validated.into_iter().filter(|f| f.validated).collect()
}
/// Second-opinion filter applied to already-confirmed findings.
///
/// A finding whose severity starts with "crit" or "high" (case-insensitive) is
/// sent to a skeptical panel using [`REFUTE_SYS`]. It is kept when the panel
/// returns no votes at all (treated as an infrastructure failure, not a
/// refutation) or when a strict majority of the votes confirm it; otherwise it
/// is dropped and a notice is sent on `tx`. Surviving findings that were
/// actually voted on get the refute tally appended to their `votes` string.
///
/// Every other finding is returned untouched, as is every finding once
/// `pool.stop_exploiting()` reports true. The panel is asked for at least two
/// votes regardless of `vote_n`. Input order is preserved.
async fn refute_pass(findings: Vec<Finding>, pool: &ModelPool, vote_n: usize, tx: &Sender<String>) -> Vec<Finding> {
    let finder = pool.candidates.first().map(|model| model.label());
    let mut survivors = Vec::new();

    for mut finding in findings {
        let severity = finding.severity.to_lowercase();
        let is_severe = ["crit", "high"].iter().any(|prefix| severity.starts_with(prefix));

        // Short-circuit keeps `stop_exploiting` from being consulted for
        // findings that would pass through anyway.
        let challenge = is_severe && !pool.stop_exploiting();
        if !challenge {
            survivors.push(finding);
            continue;
        }

        let query = format!(
            "Finding: {} | severity {} | {} | at {} | payload {} | evidence {} | impact {}",
            finding.title,
            finding.severity,
            finding.cwe,
            finding.endpoint,
            finding.payload,
            finding.evidence,
            finding.impact
        );
        let (yes, total) = pool.vote(REFUTE_SYS, &query, vote_n.max(2), finder.as_deref()).await;

        if total == 0 {
            // Nobody answered: do not punish the finding for an infra failure.
            survivors.push(finding);
        } else if yes * 2 > total {
            // Withstood a strict majority of skeptics; record the tally.
            finding.votes = format!("{} · refute {yes}/{total}", finding.votes);
            survivors.push(finding);
        } else {
            let _ = tx
                .send(format!("vote {} → dropped by adversarial refute ({yes}/{total})", finding.title))
                .await;
        }
    }

    survivors
}
async fn finish(cfg: RunConfig, _lib: &Library, recon: String, transcript: String, mut findings: Vec<Finding>,
selected: Vec<Agent>, rl: &mut RlState, tx: Sender<String>) -> RunOutput {
// --- Grounding gate: no claim without a tool receipt (anti-hallucination) ---
@@ -994,5 +1032,6 @@ pub async fn run_host(cfg: RunConfig, lib: &Library, pool: &ModelPool, tx: Sende
findings.extend(extra);
findings = dedup_findings(findings);
}
let findings = refute_pass(findings, pool, cfg.vote_n, &tx).await;
finish(cfg, lib, recon, transcript, findings, selected, &mut rl, tx).await
}
+95 -6
View File
@@ -312,12 +312,7 @@ impl ModelPool {
};
if let Ok(text) = self.one("validate", m, system, user).await {
total += 1;
let t = text.to_lowercase();
if t.contains("\"verdict\": \"confirmed\"")
|| t.trim_start().starts_with("yes")
|| t.contains("confirmed: true")
|| t.contains("is_real\": true")
{
if parse_verdict(&text) == Verdict::Confirmed {
confirmed += 1;
}
}
@@ -333,3 +328,97 @@ async fn wait_cancelled(flag: &Arc<AtomicBool>) {
tokio::time::sleep(Duration::from_millis(120)).await;
}
}
/// A validator's verdict on a candidate finding.
///
/// The type is a plain tag with no payload, so it is `Copy`: callers can pass
/// it around and compare it without borrowing or cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Verdict {
    /// The reply explicitly confirmed the finding.
    Confirmed,
    /// The reply explicitly rejected the finding.
    Rejected,
    /// No clear yes/no — treated conservatively as NOT confirmed.
    Unclear,
}
/// Robustly parse a validator reply into a verdict.
///
/// Matching is case- and whitespace-insensitive (so `{"verdict":"confirmed"}`
/// and `{ "verdict": "confirmed" }` both match). The checks run in this order:
///
/// 1. Any explicit rejection marker → [`Verdict::Rejected`]. Rejection is
///    checked first so a reply containing both kinds of marker is rejected.
/// 2. Any explicit confirmation marker → [`Verdict::Confirmed`].
/// 3. A reply whose first word is exactly "yes" → [`Verdict::Confirmed`]. The
///    word must end at a non-alphanumeric character or the end of the reply,
///    so prose such as "Yesterday's scan…" is not mistaken for a confirmation.
/// 4. Anything else → [`Verdict::Unclear`], which callers do not count as
///    confirmed — biasing the pipeline against false positives.
pub fn parse_verdict(text: &str) -> Verdict {
    // Markers are written without whitespace because they are matched against
    // the whitespace-stripped reply.
    const REJECTED: &[&str] = &[
        "\"verdict\":\"rejected\"", "\"verdict\":\"reject\"", "verdict:rejected",
        "\"is_real\":false", "\"isreal\":false", "\"confirmed\":false", "\"real\":false",
        "\"exploitable\":false", "\"valid\":false",
    ];
    const CONFIRMED: &[&str] = &[
        "\"verdict\":\"confirmed\"", "verdict:confirmed",
        "\"is_real\":true", "\"isreal\":true", "\"confirmed\":true", "\"real\":true",
        "\"exploitable\":true", "\"valid\":true",
    ];

    let lower = text.to_lowercase();
    let dense: String = lower.chars().filter(|c| !c.is_whitespace()).collect();

    // Explicit rejection wins (conservative).
    if REJECTED.iter().any(|marker| dense.contains(marker)) {
        return Verdict::Rejected;
    }
    // Explicit confirmation.
    if CONFIRMED.iter().any(|marker| dense.contains(marker)) {
        return Verdict::Confirmed;
    }
    // Fallback: only a leading, stand-alone "yes" counts as confirmation.
    // Checked on `lower` (not `dense`) so the word boundary is still visible.
    if let Some(rest) = lower.trim_start().strip_prefix("yes") {
        let ends_word = rest.chars().next().map_or(true, |c| !c.is_alphanumeric());
        if ends_word {
            return Verdict::Confirmed;
        }
    }
    Verdict::Unclear
}
#[cfg(test)]
mod verdict_tests {
    use super::*;

    /// JSON replies (compact and spaced), a leading "Yes" in prose, and
    /// non-committal prose each map to the expected verdict.
    #[test]
    fn parses_json_and_prose() {
        let cases = [
            (r#"{"verdict":"confirmed","reason":"x"}"#, Verdict::Confirmed),
            (r#"{ "verdict": "confirmed" }"#, Verdict::Confirmed),
            (r#"{ "verdict": "rejected" }"#, Verdict::Rejected),
            (r#"{"is_real": false}"#, Verdict::Rejected),
            ("Yes, the evidence proves RCE.", Verdict::Confirmed),
            // Non-committal prose is not counted as a confirmation.
            ("This looks theoretical.", Verdict::Unclear),
        ];
        for (reply, expected) in cases {
            assert_eq!(parse_verdict(reply), expected, "reply: {reply}");
        }
    }

    /// A reply that says `confirmed: false` must not be read as confirmed,
    /// even if the word "confirmed" also appears elsewhere in it.
    #[test]
    fn rejection_beats_confirmation_when_both_present() {
        let reply = r#"{"confirmed": false, "note": "verdict was confirmed earlier"}"#;
        assert_eq!(parse_verdict(reply), Verdict::Rejected);
    }

    /// Rows are `(severity, yes, total, expected)`.
    #[test]
    fn quorum_is_severity_aware() {
        let cases = [
            // High/Critical with >= 2 votes: need at least two-thirds.
            ("High", 1, 2, false),
            ("High", 2, 2, true),
            ("Critical", 2, 3, true),
            ("Critical", 1, 3, false),
            // Single validator: the majority rule applies to all severities.
            ("Critical", 1, 1, true),
            // Low/Medium: strict majority (more than half).
            ("Low", 1, 1, true),
            ("Medium", 1, 2, false),
            ("Low", 2, 3, true),
            ("Low", 0, 2, false),
        ];
        for (severity, yes, total, expected) in cases {
            assert_eq!(
                quorum_confirmed(severity, yes, total),
                expected,
                "{severity} {yes}/{total}"
            );
        }
    }
}
/// Decide whether `yes` confirmations out of `total` votes are enough to
/// confirm a finding of the given `severity`.
///
/// * No votes (`total == 0`) never confirms.
/// * Severities starting with "crit" or "high" (case-insensitive) need at
///   least two-thirds agreement whenever two or more votes were cast, because
///   a false High/Critical is the most costly mistake.
/// * Everything else — including High/Critical with a single vote, as on a
///   single-model panel — needs a strict majority (more than half).
pub fn quorum_confirmed(severity: &str, yes: usize, total: usize) -> bool {
    let normalized = severity.to_lowercase();
    let is_severe = ["crit", "high"]
        .iter()
        .any(|prefix| normalized.starts_with(prefix));

    match total {
        0 => false,
        // Integer form of yes / votes >= 2/3.
        votes if is_severe && votes >= 2 => yes * 3 >= votes * 2,
        // Integer form of yes / votes > 1/2.
        votes => yes * 2 > votes,
    }
}