Files
NeuroSploit/backend/core/confidence_scorer.py
2026-02-11 10:47:33 -03:00

180 lines
6.9 KiB
Python

"""
NeuroSploit v3 - Confidence Scoring Engine
Numeric 0-100 confidence scoring for vulnerability findings.
Combines proof of execution, negative control results, and signal analysis
into a single score with transparent breakdown.
Score Thresholds:
>= 90 → "confirmed" (AI Verified, high confidence)
>= 60 → "likely" (needs manual review)
< 60 → "rejected" (auto-reject, false positive)
"""
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Result types
# ---------------------------------------------------------------------------
@dataclass
class ConfidenceResult:
"""Result of confidence scoring."""
score: int # 0-100
verdict: str # "confirmed" | "likely" | "rejected"
breakdown: Dict[str, int] = field(default_factory=dict) # Component scores
detail: str = "" # Human-readable explanation
# ---------------------------------------------------------------------------
# Scorer
# ---------------------------------------------------------------------------
class ConfidenceScorer:
"""Calculates numeric confidence score 0-100 for vulnerability findings.
Weights:
+0-60 Proof of execution (per vuln type — the most important signal)
+0-30 Proof of impact (severity-aware)
+0-20 Negative controls passed (response differs from benign)
-40 Only baseline diff signal (no actual proof of exploitation)
-60 Same behavior on negative controls (critical false positive indicator)
-40 AI interpretation says payload was ineffective
"""
# Threshold constants
THRESHOLD_CONFIRMED = 90
THRESHOLD_LIKELY = 60
# Weight caps
MAX_PROOF_SCORE = 60
MAX_IMPACT_SCORE = 30
MAX_CONTROLS_BONUS = 20
PENALTY_ONLY_DIFF = -40
PENALTY_SAME_BEHAVIOR = -60
PENALTY_AI_INEFFECTIVE = -40
# Keywords in AI interpretation that indicate payload was ineffective
INEFFECTIVE_KEYWORDS = [
"ignored", "not processed", "blocked", "filtered",
"sanitized", "rejected", "not executed", "was not",
"does not", "did not", "no effect", "no impact",
"benign", "safe", "harmless",
]
def calculate(
self,
signals: List[str],
proof_result, # ProofResult from proof_of_execution
control_result, # NegativeControlResult from negative_control
ai_interpretation: Optional[str] = None,
) -> ConfidenceResult:
"""Calculate confidence score from all verification components.
Args:
signals: List of signal names from multi_signal_verify
(e.g., ["baseline_diff", "payload_effect"])
proof_result: ProofResult from ProofOfExecution.check()
control_result: NegativeControlResult from NegativeControlEngine
ai_interpretation: Optional AI response interpretation text
Returns:
ConfidenceResult with score, verdict, breakdown, and detail
"""
breakdown: Dict[str, int] = {}
score = 0
# ── Component 1: Proof of Execution (0-60) ────────────────────
proof_score = min(proof_result.score, self.MAX_PROOF_SCORE) if proof_result else 0
score += proof_score
breakdown["proof_of_execution"] = proof_score
# ── Component 2: Proof of Impact (0-30) ───────────────────────
impact_score = 0
if proof_result and proof_result.proven:
if proof_result.impact_demonstrated:
impact_score = self.MAX_IMPACT_SCORE # Full impact shown
else:
impact_score = 15 # Proven but no impact demonstration
score += impact_score
breakdown["proof_of_impact"] = impact_score
# ── Component 3: Negative Controls (bonus/penalty) ─────────────
controls_score = 0
if control_result:
if control_result.same_behavior:
controls_score = self.PENALTY_SAME_BEHAVIOR # -60
else:
controls_score = min(
self.MAX_CONTROLS_BONUS,
control_result.confidence_adjustment
) # +20
score += controls_score
breakdown["negative_controls"] = controls_score
# ── Penalty: Only baseline diff signal ─────────────────────────
diff_penalty = 0
if signals and set(signals) <= {"baseline_diff", "new_errors"}:
# Only diff-based signals, no actual payload effect
if proof_score == 0:
diff_penalty = self.PENALTY_ONLY_DIFF # -40
score += diff_penalty
breakdown["diff_only_penalty"] = diff_penalty
# ── Penalty: AI says payload was ineffective ──────────────────
ai_penalty = 0
if ai_interpretation:
ai_lower = ai_interpretation.lower()
if any(kw in ai_lower for kw in self.INEFFECTIVE_KEYWORDS):
ai_penalty = self.PENALTY_AI_INEFFECTIVE # -40
score += ai_penalty
breakdown["ai_ineffective_penalty"] = ai_penalty
# ── Clamp and determine verdict ────────────────────────────────
score = max(0, min(100, score))
if score >= self.THRESHOLD_CONFIRMED:
verdict = "confirmed"
elif score >= self.THRESHOLD_LIKELY:
verdict = "likely"
else:
verdict = "rejected"
# Build detail string
detail_parts = []
if proof_result and proof_result.proven:
detail_parts.append(f"Proof: {proof_result.proof_type} ({proof_score}pts)")
else:
detail_parts.append("No proof of execution (0pts)")
if impact_score > 0:
detail_parts.append(f"Impact: +{impact_score}pts")
if control_result:
if control_result.same_behavior:
detail_parts.append(
f"NEGATIVE CONTROL FAIL: {control_result.controls_matching}/"
f"{control_result.controls_run} same behavior ({controls_score}pts)")
else:
detail_parts.append(f"Controls passed (+{controls_score}pts)")
if diff_penalty:
detail_parts.append(f"Only-diff penalty ({diff_penalty}pts)")
if ai_penalty:
detail_parts.append(f"AI-ineffective penalty ({ai_penalty}pts)")
detail = f"Score: {score}/100 [{verdict}] — " + "; ".join(detail_parts)
return ConfidenceResult(
score=score,
verdict=verdict,
breakdown=breakdown,
detail=detail,
)