harness: reduce false positives (robust verdicts, severity quorum, refute pass)

- Robust verdict parsing (pool::parse_verdict): whitespace-insensitive, checks explicit rejection first, counts only explicit confirmations; ambiguous → Unclear (not confirmed). Replaces the fragile exact-JSON / loose "yes" match.
- Severity-aware quorum (pool::quorum_confirmed): High/Critical now need ≥2 validators AND ≥2/3 agreement (a single vote out of two can no longer confirm a Critical); lower severities need a strict majority (>half, was ≥half). Single-model panels fall back to majority so they aren't nuked.
- Adversarial refute pass (REFUTE_SYS): every confirmed High/Critical is re-examined by a skeptical panel that assumes false-positive; findings that can't withstand a majority of skeptics are dropped. A finding survives when the refute panel returns no votes (infra failure).
- Strengthened VOTE_SYS with an explicit false-positive checklist (reflected-not-executed, version/banner guesses, self-XSS, error-as-injection, thin evidence, inflated severity); validator query now also includes impact.
- Unit tests for parse_verdict + quorum_confirmed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-07-03 18:07:51 +02:00 · 2026-07-01 17:33:15 -03:00
parent 0a181782a4
commit e9f81c164d
2 changed files with 138 additions and 10 deletions
@@ -60,7 +60,11 @@ fn tool_doctrine(mcp_on: bool) -> String {
         Use only what is installed; degrade gracefully. Never run destructive or DoS actions.\n\n"
    )
 }
-const VOTE_SYS: &str = "You are an adversarial security validator. Decide if the candidate finding is a REAL, reproducible, exploitable vulnerability with proof. Reply with JSON {\"verdict\":\"confirmed\"|\"rejected\",\"reason\":\"...\"}. Default to rejected when uncertain.";
+const VOTE_SYS: &str = "You are an adversarial security validator. Decide if the candidate finding is a REAL, reproducible, exploitable vulnerability whose EVIDENCE actually proves impact. Reject common false positives: input merely reflected but not executed; version/banner guesses with no working PoC; self-XSS; theoretical issues; an error message or stack trace mistaken for injection; missing, generic, or non-reproducible evidence; severity inflated beyond what the evidence demonstrates. Confirm only if the provided evidence (request/response) concretely proves the vulnerability. Reply with JSON {\"verdict\":\"confirmed\"|\"rejected\",\"reason\":\"...\"}. Default to rejected when uncertain.";
+/// System prompt for the adversarial refute pass (`refute_pass`) over High/Critical
+/// findings: the reviewer must assume a false positive unless the evidence proves
+/// otherwise, and replies with the same verdict JSON shape that `VOTE_SYS` asks for.
+const REFUTE_SYS: &str = "You are a skeptical senior reviewer trying to DISPROVE a reported vulnerability. Assume it is a FALSE POSITIVE unless the evidence forces otherwise. Scrutinize: does the evidence PROVE execution/impact, or only that input was reflected/accepted? Is there a real working PoC, or just a version/banner/theory? Could it be self-XSS, an error message, or an unreachable path? Reply JSON {\"verdict\":\"confirmed\"|\"rejected\",\"reason\":\"...\"} where confirmed means the vulnerability is REAL and proven by the evidence. When in doubt, reject.";
 const CODE_VOTE_SYS: &str = "You are an adversarial source-code reviewer. Decide if the reported issue is a REAL vulnerability in the provided code (reachable, exploitable, not a false positive). Reply JSON {\"verdict\":\"confirmed\"|\"rejected\",\"reason\":\"...\"}.";

 /// ReAct loop directive: make the agent reason → act with a tool → observe →
@@ -225,6 +229,7 @@ pub async fn run(cfg: RunConfig, lib: &Library, pool: &ModelPool, tx: Sender<Str
        findings.extend(extra);
        findings = dedup_findings(findings);
    }
+    let findings = refute_pass(findings, pool, cfg.vote_n, &tx).await;
    finish(cfg, lib, recon, transcript, findings, selected, &mut rl, tx).await
 }

@@ -286,6 +291,7 @@ pub async fn run_whitebox(cfg: RunConfig, lib: &Library, pool: &ModelPool, tx: S
    let candidates = dedup_findings(raw.iter().flat_map(|(_, _, f)| f.clone()).collect());
    let _ = tx.send(format!("{} candidate finding(s) (deduped) — validating", candidates.len())).await;
    let findings = validate(candidates, pool, CODE_VOTE_SYS, cfg.vote_n, &tx).await;
+    let findings = refute_pass(findings, pool, cfg.vote_n, &tx).await;
    finish(cfg, lib, "{}".into(), transcript, findings, selected, &mut rl, tx).await
 }

@@ -428,6 +434,7 @@ pub async fn run_greybox(cfg: RunConfig, lib: &Library, pool: &ModelPool, tx: Se
        findings.extend(extra);
        findings = dedup_findings(findings);
    }
+    let findings = refute_pass(findings, pool, cfg.vote_n, &tx).await;
    finish(cfg, lib, recon, transcript, findings, selected, &mut rl, tx).await
 }

@@ -603,11 +610,11 @@ async fn validate(candidates: Vec<Finding>, pool: &ModelPool, sys: &str, vote_n:
            let finder = finder.clone();
            async move {
                let q = format!(
-                    "Finding: {} | severity {} | {} | at {} | payload {} | evidence {}",
-                    f.title, f.severity, f.cwe, f.endpoint, f.payload, f.evidence
+                    "Finding: {} | severity {} | {} | at {} | payload {} | evidence {} | impact {}",
+                    f.title, f.severity, f.cwe, f.endpoint, f.payload, f.evidence, f.impact
                );
                let (yes, total) = pool.vote(sys, &q, vote_n, finder.as_deref()).await;
-                f.validated = total > 0 && yes * 2 >= total;
+                f.validated = crate::pool::quorum_confirmed(&f.severity, yes, total);
                f.votes = format!("{yes}/{total}");
                if f.confidence == 0.0 && total > 0 {
                    f.confidence = yes as f64 / total as f64;
@@ -622,6 +629,37 @@ async fn validate(candidates: Vec<Finding>, pool: &ModelPool, sys: &str, vote_n:
    validated.into_iter().filter(|f| f.validated).collect()
 }

+/// Skeptical second look at confirmed **High/Critical** findings. Each is put to
+/// a `REFUTE_SYS` panel (at least two votes requested) and is dropped unless a
+/// strict majority of the replies still confirm it; zero replies keeps the finding.
+/// Other severities, and everything once `pool.stop_exploiting()` is true, pass through.
+async fn refute_pass(findings: Vec<Finding>, pool: &ModelPool, vote_n: usize, tx: &Sender<String>) -> Vec<Finding> {
+    let finder = pool.candidates.first().map(|m| m.label());
+    let mut survivors = Vec::new();
+    for mut finding in findings {
+        let sev = finding.severity.to_lowercase();
+        let needs_refute = ["crit", "high"].iter().any(|p| sev.starts_with(*p));
+        // Short-circuit keeps `stop_exploiting()` unqueried for lower severities.
+        if needs_refute && !pool.stop_exploiting() {
+            let query = format!(
+                "Finding: {} | severity {} | {} | at {} | payload {} | evidence {} | impact {}",
+                finding.title, finding.severity, finding.cwe, finding.endpoint, finding.payload, finding.evidence, finding.impact
+            );
+            let (yes, total) = pool.vote(REFUTE_SYS, &query, vote_n.max(2), finder.as_deref()).await;
+            // Refuted only when replies came back and confirmations are not a strict majority.
+            if total != 0 && yes * 2 <= total {
+                let _ = tx.send(format!("vote {} → dropped by adversarial refute ({yes}/{total})", finding.title)).await;
+                continue;
+            }
+            if total > 0 {
+                finding.votes = format!("{} · refute {yes}/{total}", finding.votes);
+            }
+        }
+        survivors.push(finding);
+    }
+    survivors
+}
+
 async fn finish(cfg: RunConfig, _lib: &Library, recon: String, transcript: String, mut findings: Vec<Finding>,
                selected: Vec<Agent>, rl: &mut RlState, tx: Sender<String>) -> RunOutput {
    // --- Grounding gate: no claim without a tool receipt (anti-hallucination) ---
@@ -994,5 +1032,6 @@ pub async fn run_host(cfg: RunConfig, lib: &Library, pool: &ModelPool, tx: Sende
        findings.extend(extra);
        findings = dedup_findings(findings);
    }
+    let findings = refute_pass(findings, pool, cfg.vote_n, &tx).await;
    finish(cfg, lib, recon, transcript, findings, selected, &mut rl, tx).await
 }
@@ -312,12 +312,7 @@ impl ModelPool {
            };
            if let Ok(text) = self.one("validate", m, system, user).await {
                total += 1;
-                let t = text.to_lowercase();
-                if t.contains("\"verdict\": \"confirmed\"")
-                    || t.trim_start().starts_with("yes")
-                    || t.contains("confirmed: true")
-                    || t.contains("is_real\": true")
-                {
+                if parse_verdict(&text) == Verdict::Confirmed {
                    confirmed += 1;
                }
            }
@@ -333,3 +328,97 @@ async fn wait_cancelled(flag: &Arc<AtomicBool>) {
        tokio::time::sleep(Duration::from_millis(120)).await;
    }
 }
+
+/// A validator's verdict on a candidate finding.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Verdict {
+    Confirmed,
+    Rejected,
+    /// No clear yes/no — treated conservatively as NOT confirmed.
+    Unclear,
+}
+
+/// Parse a validator reply into a [`Verdict`]. Matching is case- and whitespace-
+/// insensitive, so `{"verdict":"confirmed"}` and `{ "verdict": "confirmed" }` agree.
+/// An explicit rejection marker wins over any confirmation marker; a reply with
+/// neither is `Unclear`, which is never counted as a confirmation. A leading "yes"
+/// is accepted only as a whole word, so "Yesterday…" is not read as a confirmation.
+pub fn parse_verdict(text: &str) -> Verdict {
+    let lower = text.to_lowercase();
+    let dense: String = lower.chars().filter(|c| !c.is_whitespace()).collect();
+
+    // Explicit rejection wins (conservative).
+    let rejected = [
+        "\"verdict\":\"rejected\"", "\"verdict\":\"reject\"", "verdict:rejected",
+        "\"is_real\":false", "\"isreal\":false", "\"confirmed\":false", "\"real\":false",
+        "\"exploitable\":false", "\"valid\":false",
+    ];
+    if rejected.iter().any(|k| dense.contains(k)) {
+        return Verdict::Rejected;
+    }
+    // Explicit confirmation.
+    let confirmed = [
+        "\"verdict\":\"confirmed\"", "verdict:confirmed",
+        "\"is_real\":true", "\"isreal\":true", "\"confirmed\":true", "\"real\":true",
+        "\"exploitable\":true", "\"valid\":true",
+    ];
+    if confirmed.iter().any(|k| dense.contains(k)) {
+        return Verdict::Confirmed;
+    }
+    // Fallback: a leading whole-word "yes" ("Yes, …") confirms; "yesterday" does not.
+    match lower.trim_start().strip_prefix("yes") {
+        Some(rest) if !rest.starts_with(char::is_alphanumeric) => Verdict::Confirmed,
+        _ => Verdict::Unclear,
+    }
+}
+
+#[cfg(test)]
+mod verdict_tests {
+    use super::*;
+    #[test]
+    fn parses_json_and_prose() {
+        let cases = [
+            (r#"{"verdict":"confirmed","reason":"x"}"#, Verdict::Confirmed),
+            (r#"{ "verdict": "confirmed" }"#, Verdict::Confirmed),
+            (r#"{ "verdict": "rejected" }"#, Verdict::Rejected),
+            (r#"{"is_real": false}"#, Verdict::Rejected),
+            ("Yes, the evidence proves RCE.", Verdict::Confirmed),
+            ("This looks theoretical.", Verdict::Unclear), // not counted
+        ];
+        for (reply, want) in cases {
+            assert_eq!(parse_verdict(reply), want, "reply: {reply}");
+        }
+    }
+    #[test]
+    fn rejection_beats_confirmation_when_both_present() {
+        // a reply carrying confirmed:false must stay Rejected even though it mentions "confirmed"
+        assert_eq!(parse_verdict(r#"{"confirmed": false, "note": "verdict was confirmed earlier"}"#), Verdict::Rejected);
+    }
+    #[test]
+    fn quorum_is_severity_aware() {
+        // rows are (severity, yes, total, expected): high/critical need >=2/3 once two or
+        // more validators answer; a lone validator and low/medium need a strict majority.
+        let table = [("High", 1, 2, false), ("High", 2, 2, true), ("Critical", 2, 3, true), ("Critical", 1, 3, false),
+            ("Critical", 1, 1, true), ("Low", 1, 1, true), ("Medium", 1, 2, false), ("Low", 2, 3, true), ("Low", 0, 2, false)];
+        for (sev, yes, total, want) in table {
+            assert_eq!(quorum_confirmed(sev, yes, total), want, "{sev} {yes}/{total}");
+        }
+    }
+}
+
+/// Decide whether `yes` confirmations out of `total` validator replies confirm a
+/// finding of the given `severity`.
+/// Severities starting with "crit" or "high" (case-insensitive) need at least
+/// two-thirds agreement once two or more validators replied; all other cases —
+/// including a lone validator on any severity — need a strict majority (more
+/// than half). Zero replies never confirm.
+pub fn quorum_confirmed(severity: &str, yes: usize, total: usize) -> bool {
+    let sev = severity.to_lowercase();
+    // False High/Critical findings cost the most, so they get the stricter tier.
+    let strict_tier = ["crit", "high"].iter().any(|p| sev.starts_with(*p));
+    match total {
+        0 => false, // nobody answered
+        n if strict_tier && n >= 2 => yes * 3 >= n * 2, // ≥ two-thirds
+        n => yes * 2 > n, // strict majority
+    }
+}