# Shadowbroker/backend/analytics/rolling_backtest.py

"""Rolling weekly operational validation for Strategic Risk Analytics.

Freezes live GT scores each ISO week, accepts delayed outcome labels, and
scores prior-week predictions with accuracy + Wilson 95% CI. Unlike the
static historical benchmark, this measures forward operational usefulness.
"""

from __future__ import annotations

import os
from dataclasses import dataclass
from datetime import date, datetime, timezone
from typing import Any, Literal

from analytics.backtest import DEFAULT_BACKTEST_ALERT_THRESHOLD, wilson_interval
from analytics.gt_early_warning import GT_EarlyWarning
from analytics.integration import get_gt_engine
from analytics.weekly_store import (
    VALID_LABELS,
    LabelName,
    RegionSnapshot,
    WeeklySnapshot,
    list_week_ids,
    load_week,
    save_week,
    utc_now_iso,
)

# Minimum number of labeled (non-pending) regions a week needs before
# ``score_week`` marks it ``scorable``; ``rolling_report`` only uses scorable
# weeks for its accuracy series and improvement comparison.
MIN_LABELED_FOR_TREND = 5


def _env_float(name: str, default: float) -> float:
    raw = str(os.environ.get(name, "")).strip()
    if not raw:
        return default
    try:
        return float(raw)
    except ValueError:
        return default


def rolling_alert_threshold() -> float:
    """Return the fixed operational alert cutoff (not retroactively tuned per week).

    The ``GT_ROLLING_ALERT_THRESHOLD`` environment variable takes precedence;
    otherwise ``DEFAULT_BACKTEST_ALERT_THRESHOLD`` is used.
    """
    fallback = DEFAULT_BACKTEST_ALERT_THRESHOLD
    return _env_float("GT_ROLLING_ALERT_THRESHOLD", fallback)


def iso_week_id(when: datetime | date | None = None) -> str:
    """Format ``when`` as an ISO week id such as ``2026-W24``.

    ``when`` may be a ``date`` or a ``datetime`` (its calendar date is used);
    when omitted, the current UTC time is used. The year component is the ISO
    year, which can differ from the calendar year around New Year.
    """
    moment = datetime.now(timezone.utc) if when is None else when
    day = moment.date() if isinstance(moment, datetime) else moment
    iso = day.isocalendar()
    return f"{iso[0]}-W{iso[1]:02d}"


def _region_rows_from_engine(
    engine: GT_EarlyWarning,
    *,
    alert_threshold: float,
) -> list[RegionSnapshot]:
    """Turn the engine's risk heatmap into unlabeled region rows.

    Reads ``engine.get_risk_heatmap()["features"]``; entries that are not
    dicts or that carry no region name are ignored. Missing or falsy scores
    are treated as ``0.0``. A region is marked ``alerted`` when the largest of
    its four scores reaches ``alert_threshold``. Every row starts with the
    ``"pending"`` label, and the result is ordered by composite risk,
    highest first.
    """
    score_keys = ("risk", "financial", "unrest", "conflict")
    snapshots: list[RegionSnapshot] = []
    for item in engine.get_risk_heatmap().get("features") or []:
        if not isinstance(item, dict):
            continue
        attrs = item.get("properties") or {}
        name = str(attrs.get("region") or "").strip().lower()
        if not name:
            continue
        # Scores are read in a fixed order: composite, financial, unrest, conflict.
        risk, fin, unr, con = (float(attrs.get(key) or 0.0) for key in score_keys)
        snapshots.append(
            RegionSnapshot(
                region=name,
                composite_risk=risk,
                financial=fin,
                unrest=unr,
                conflict=con,
                alerted=max(risk, fin, unr, con) >= alert_threshold,
                label="pending",
            )
        )
    return sorted(snapshots, key=lambda snap: snap.composite_risk, reverse=True)


@dataclass(frozen=True)
class WeekScore:
    """Immutable scoring summary for one frozen weekly snapshot."""

    # Identity of the scored snapshot and the cutoff it was frozen with.
    week_id: str
    frozen_at: str
    alert_threshold: float
    # Region counts.
    total_regions: int
    labeled: int
    pending: int
    alerted: int
    # Headline accuracy figures with their interval bounds.
    correct: int
    accuracy: float
    confidence_rate: float
    wilson_lower_95: float
    wilson_upper_95: float
    # Confusion-matrix cells and the rates derived from them.
    true_positives: int
    true_negatives: int
    false_positives: int
    false_negatives: int
    sensitivity: float
    specificity: float
    # Whether the week has enough labels to be used in trend reporting.
    scorable: bool

    def to_dict(self) -> dict[str, Any]:
        """Return all fields as a plain dict, rounding float metrics to 4 decimals."""

        def _r4(value: float) -> float:
            return round(value, 4)

        return dict(
            week_id=self.week_id,
            frozen_at=self.frozen_at,
            alert_threshold=_r4(self.alert_threshold),
            total_regions=self.total_regions,
            labeled=self.labeled,
            pending=self.pending,
            alerted=self.alerted,
            correct=self.correct,
            accuracy=_r4(self.accuracy),
            confidence_rate=_r4(self.confidence_rate),
            wilson_lower_95=_r4(self.wilson_lower_95),
            wilson_upper_95=_r4(self.wilson_upper_95),
            true_positives=self.true_positives,
            true_negatives=self.true_negatives,
            false_positives=self.false_positives,
            false_negatives=self.false_negatives,
            sensitivity=_r4(self.sensitivity),
            specificity=_r4(self.specificity),
            scorable=self.scorable,
        )


def _predicted_positive(row: RegionSnapshot) -> bool:
    return row.alerted


def _actual_positive(label: LabelName) -> bool:
    return label == "true_escalation"


def _is_correct(row: RegionSnapshot) -> bool:
    if row.label == "pending":
        return False
    predicted = _predicted_positive(row)
    if row.label == "true_escalation":
        return predicted
    if row.label in ("false_alarm", "benign"):
        return not predicted
    return False


def score_week(snapshot: WeeklySnapshot) -> WeekScore:
    """Score a frozen week against delayed labels (pending rows excluded).

    Args:
        snapshot: The frozen week; each region carries an ``alerted`` flag
            and an outcome ``label``.

    Returns:
        A ``WeekScore`` where

        * ``labeled`` / ``pending`` split the regions by whether a label other
          than ``"pending"`` has been applied, while ``alerted`` counts every
          alerted region, labeled or not;
        * ``accuracy`` is ``correct / labeled`` (``0.0`` with no labels) and
          the Wilson bounds come from ``wilson_interval(correct, labeled)``;
          ``confidence_rate`` is the lower bound;
        * ``sensitivity`` is ``TP / (TP + FN)`` and ``specificity`` is
          ``TN / (TN + FP)``; each is ``0.0`` when its denominator is zero;
        * ``scorable`` is true once at least ``MIN_LABELED_FOR_TREND`` regions
          are labeled.
    """
    labeled_rows = [row for row in snapshot.regions if row.label != "pending"]
    total = len(labeled_rows)
    pending = len(snapshot.regions) - total

    # Single pass over the labeled rows to fill the confusion matrix. A row
    # whose label is neither an escalation nor a benign/false-alarm outcome
    # still counts toward ``total`` but lands in no cell.
    tp = tn = fp = fn = 0
    for row in labeled_rows:
        if row.label == "true_escalation":
            if row.alerted:
                tp += 1
            else:
                fn += 1
        elif row.label in ("benign", "false_alarm"):
            if row.alerted:
                fp += 1
            else:
                tn += 1

    correct = tp + tn
    accuracy = correct / total if total else 0.0
    lower, upper = wilson_interval(correct, total)

    # Both rates are conditioned on the ACTUAL outcome. The previous code
    # divided TN by the number of predicted negatives (TN + FN), which is the
    # negative predictive value rather than specificity.
    actual_pos = tp + fn
    actual_neg = tn + fp
    sensitivity = tp / actual_pos if actual_pos else 0.0
    specificity = tn / actual_neg if actual_neg else 0.0

    return WeekScore(
        week_id=snapshot.week_id,
        frozen_at=snapshot.frozen_at,
        alert_threshold=snapshot.alert_threshold,
        total_regions=len(snapshot.regions),
        labeled=total,
        pending=pending,
        alerted=sum(1 for row in snapshot.regions if row.alerted),
        correct=correct,
        accuracy=accuracy,
        confidence_rate=lower,
        wilson_lower_95=lower,
        wilson_upper_95=upper,
        true_positives=tp,
        true_negatives=tn,
        false_positives=fp,
        false_negatives=fn,
        sensitivity=sensitivity,
        specificity=specificity,
        scorable=total >= MIN_LABELED_FOR_TREND,
    )


def freeze_weekly_snapshot(
    *,
    week_id: str | None = None,
    alert_threshold: float | None = None,
    force: bool = False,
    frozen_by: str = "system",
    engine: GT_EarlyWarning | None = None,
) -> dict[str, Any]:
    """
    Freeze the current GT heatmap as the operational snapshot for a week.

    Idempotent per week unless ``force=True``: when a snapshot with regions
    is already stored, it is returned with ``"created": False`` and nothing is
    written. Otherwise a new snapshot is built from the engine, saved, and
    returned with ``"created": True`` plus ``alert_count`` / ``region_count``.

    ``week_id`` defaults to the current ISO week, ``alert_threshold`` to
    ``rolling_alert_threshold()``, and ``engine`` to ``get_gt_engine()``. If
    no engine can be resolved the result is ``{"ok": False, ...}``.
    """
    gt = engine or get_gt_engine()
    if gt is None:
        return {"ok": False, "detail": "GT analytics engine unavailable"}

    target_week = week_id or iso_week_id()
    cutoff = float(
        rolling_alert_threshold() if alert_threshold is None else alert_threshold
    )

    prior = load_week(target_week)
    if prior and prior.regions and not force:
        # Already frozen: report the stored snapshot without touching it.
        prior_score = score_week(prior)
        return {
            "ok": True,
            "created": False,
            "week_id": target_week,
            "snapshot": prior.to_dict(),
            "score": prior_score.to_dict(),
        }

    rows = _region_rows_from_engine(gt, alert_threshold=cutoff)
    fresh = WeeklySnapshot(
        week_id=target_week,
        frozen_at=utc_now_iso(),
        alert_threshold=cutoff,
        regions=rows,
        frozen_by=frozen_by,
    )
    save_week(fresh)
    fresh_score = score_week(fresh)

    result: dict[str, Any] = {
        "ok": True,
        "created": True,
        "week_id": target_week,
        "snapshot": fresh.to_dict(),
        "score": fresh_score.to_dict(),
    }
    result["alert_count"] = len([row for row in rows if row.alerted])
    result["region_count"] = len(rows)
    return result


def label_regions(
    week_id: str,
    labels: list[dict[str, Any]],
    *,
    labeled_by: str = "operator",
) -> dict[str, Any]:
    """Apply delayed outcome labels to a frozen week and re-score it.

    Each entry in ``labels`` is a dict with ``"region"`` and ``"label"`` keys
    and an optional ``"notes"`` key; region and label are compared after
    stripping and lower-casing. An entry is skipped (and its region reported
    in ``"skipped"``) when the label is not in ``VALID_LABELS``, is
    ``"pending"``, or the region is not part of the week. Non-dict entries and
    entries without a region are dropped silently.

    Returns ``{"ok": False, ...}`` when the week is not stored; otherwise the
    snapshot is saved and the update counts plus the new score are returned.
    """
    week = load_week(week_id)
    if week is None:
        return {"ok": False, "detail": f"Week {week_id} not found"}

    lookup = {row.region: row for row in week.regions}
    applied = 0
    rejected: list[str] = []
    stamp = utc_now_iso()

    for item in labels:
        if not isinstance(item, dict):
            continue
        name = str(item.get("region") or "").strip().lower()
        outcome = str(item.get("label") or "").strip().lower()

        acceptable = bool(name) and outcome in VALID_LABELS and outcome != "pending"
        target = lookup.get(name) if acceptable else None
        if target is None:
            # Either the entry was invalid or the region is unknown.
            if name:
                rejected.append(name)
            continue

        target.label = outcome  # type: ignore[assignment]
        target.labeled_at = stamp
        remark = item.get("notes")
        if remark is not None:
            target.notes = str(remark)
        applied += 1

    save_week(week)
    return {
        "ok": True,
        "week_id": week_id,
        "updated": applied,
        "skipped": rejected,
        "labeled_by": labeled_by,
        "score": score_week(week).to_dict(),
    }


def label_region(
    week_id: str,
    region: str,
    label: LabelName,
    *,
    notes: str = "",
    labeled_by: str = "operator",
) -> dict[str, Any]:
    """Label a single region of a frozen week.

    Convenience wrapper that forwards one entry to ``label_regions`` and
    returns its result unchanged. ``notes`` is always forwarded, including the
    empty-string default.
    """
    entry: dict[str, Any] = {"region": region, "label": label, "notes": notes}
    return label_regions(week_id, [entry], labeled_by=labeled_by)


def rolling_trend(*, weeks: int = 8) -> list[WeekScore]:
    """Score the most recent stored weeks, newest first.

    At most ``weeks`` week ids are considered (never fewer than one); ids
    whose snapshot cannot be loaded are left out of the result.
    """
    limit = max(1, weeks)
    recent_ids = list_week_ids(newest_first=True)[:limit]
    # Lazy generator: each week is loaded and scored before the next is loaded.
    loaded = (load_week(week_id) for week_id in recent_ids)
    return [score_week(snap) for snap in loaded if snap is not None]


def rolling_report(*, weeks: int = 8, target_confidence: float = 0.80) -> dict[str, Any]:
    """Build the aggregated operational validation report for API / OpenClaw.

    ``latest`` is the newest scorable week, falling back to the newest stored
    week when none is scorable. ``accuracy_series`` lists scorable weeks
    oldest-first. ``improving_vs_prior`` compares the two newest scorable
    weeks (equal accuracy counts as improving). ``meets_target`` requires
    ``latest`` to be scorable with ``confidence_rate`` at or above
    ``target_confidence``.
    """
    cutoff = rolling_alert_threshold()
    history = rolling_trend(weeks=weeks)
    usable = [score for score in history if score.scorable]

    if usable:
        headline = usable[0]
    elif history:
        headline = history[0]
    else:
        headline = None

    series = []
    for score in usable[::-1]:
        series.append(
            {
                "week_id": score.week_id,
                "accuracy": round(score.accuracy, 4),
                "labeled": score.labeled,
            }
        )

    trending_up = len(usable) >= 2 and usable[0].accuracy >= usable[1].accuracy
    on_target = bool(
        headline and headline.scorable and headline.confidence_rate >= target_confidence
    )

    return {
        "mode": "rolling_operational",
        "alert_threshold": cutoff,
        "target_confidence": target_confidence,
        "weeks_requested": weeks,
        "weeks_stored": len(history),
        "weeks_scorable": len(usable),
        "min_labeled_per_week": MIN_LABELED_FOR_TREND,
        "latest": headline.to_dict() if headline else None,
        "trend": [score.to_dict() for score in history],
        "accuracy_series": series,
        "improving_vs_prior": trending_up,
        "meets_target": on_target,
        "note": (
            "Operational metric: scores frozen weekly predictions against delayed "
            "labels. Unlike the static benchmark, this measures live forward utility."
        ),
    }