Shadowbroker/backend/tests/test_uap_hf_fallback_cutoff.py

"""HF NUFORC fallback honors the rolling cutoff window.

Background
----------
The UAP sightings layer is sourced primarily from a live scrape of
nuforc.org. When that fails (Cloudflare 403, curl disabled on Windows,
wdtNonce regex stale, etc.) the code falls back to a static CSV mirror
hosted on Hugging Face at ``kcimc/NUFORC/nuforc_str.csv``.

The HF mirror is maintained by a third party and refreshed sporadically.
Pre-fix, the fallback parsed every row, sorted by ``occurred`` descending,
and took the top 250 — **with no date cutoff**. When the HF mirror is
stale (its "newest" rows are ~2-3 years old), users saw a map full of
2022-2023 sightings labeled as the "last 60 days" layer.

These tests pin the new behavior:

* Rows older than ``_NUFORC_RECENT_DAYS`` are dropped before the take-top-N.
* If the HF mirror has nothing in the window, the fallback returns ``[]``
  and logs ERROR (don't silently serve stale data).
* ``fetch_uap_sightings`` records the failure when BOTH paths fail, so
  the layer shows as broken in the health registry instead of "fresh".
"""

from __future__ import annotations

import logging
from datetime import datetime as real_datetime


class _FixedDateTime(real_datetime):
    """Stand-in for ``datetime`` whose clock is frozen at 2026-05-01 12:00.

    Subclasses the real ``datetime`` so construction and arithmetic keep
    working; only the "what time is it" entry points are pinned, which makes
    the rolling-cutoff math in these tests deterministic.

    Both ``utcnow()`` and ``now()`` are overridden so the pin holds whichever
    one the code under test calls (``utcnow()`` is deprecated since Python
    3.12 in favour of ``now(timezone.utc)``).
    """

    @classmethod
    def utcnow(cls):
        """Return the pinned instant as a naive datetime."""
        return cls(2026, 5, 1, 12, 0, 0)

    @classmethod
    def now(cls, tz=None):
        """Return the pinned instant, expressed in ``tz`` when one is given.

        With no ``tz`` the naive pinned value is returned unchanged, i.e. the
        frozen test clock treats local time as UTC.
        """
        # Function-scope import: the module top only imports ``datetime``.
        from datetime import timezone

        pinned = cls.utcnow()
        if tz is None:
            return pinned
        # Attach UTC first so the conversion to ``tz`` starts from the
        # correct absolute instant.
        return pinned.replace(tzinfo=timezone.utc).astimezone(tz)


class _StubResponse:
    """Minimal response double: a fixed 200 status plus the supplied body."""

    # Class-level so every instance reports success without extra setup.
    status_code: int = 200

    def __init__(self, text: str) -> None:
        # Body text handed back to whatever reads ``.text`` on the response.
        self.text = text


def _stub_geocode_cache(*_args, **_kwargs):
    """Stand-in for the location-cache loader used by the fallback.

    Ignores whatever arguments it is called with and returns a fresh mapping
    of location string to ``[lat, lng]`` for the three places used in the CSV
    fixtures, so the code under test can resolve them from the cache instead
    of reaching out to the geocoder (Photon) during the test.
    """
    known_places = (
        ("Denver, CO, USA", 39.7392, -104.9903),
        ("Seattle, WA, USA", 47.6062, -122.3321),
        ("Phoenix, AZ, USA", 33.4484, -112.0740),
    )
    # Build new list objects on every call so one test cannot mutate
    # another test's view of the cache.
    return {place: [lat, lng] for place, lat, lng in known_places}


def test_hf_fallback_drops_rows_older_than_60_days(monkeypatch):
    """Stale mirror rows are discarded before selection and before geocoding.

    Pre-fix: a row from 2023 would make it into the layer if it was
    among the newest 250 in the HF mirror. Post-fix: it's filtered out
    before we even count to 250.
    """
    from services.fetchers import earth_observation as eo

    # With "now" pinned to 2026-05-01, a 60-day window opens on 2026-03-02.
    # Row 1 sits comfortably inside it; rows 2 and 3 are years outside it.
    csv_text = (
        "Sighting,Occurred,Location,Shape,Duration,Posted,Summary\n"
        '1,2026-04-15 21:00:00 Local,"Denver, CO, USA",Triangle,5 minutes,2026-04-16,"In-window sighting"\n'
        '2,2023-06-01 21:00:00 Local,"Seattle, WA, USA",Light,30 seconds,2023-06-02,"Three years old"\n'
        '3,2022-01-15 20:00:00 Local,"Phoenix, AZ, USA",Disk,2 minutes,2022-01-16,"Even older"\n'
    )

    monkeypatch.setattr(eo, "datetime", _FixedDateTime)
    monkeypatch.setattr(eo, "fetch_with_curl", lambda *a, **kw: _StubResponse(csv_text))
    # Only the in-window location is pre-cached. If the stale Seattle/Phoenix
    # rows were cached too, they could never reach the geocoder and the spy
    # assertion below would pass even with the cutoff missing.
    monkeypatch.setattr(
        eo,
        "_load_nuforc_location_cache",
        lambda *a, **kw: {"Denver, CO, USA": [39.7392, -104.9903]},
    )
    monkeypatch.setattr(eo, "_save_nuforc_location_cache", lambda cache: None)

    # Records every location the code under test tries to geocode.
    geocode_calls: list[str] = []

    def _geocode_spy(location, city, state, country=""):
        geocode_calls.append(location)
        return None  # Denver is cached, so any call here is a leaked stale row

    monkeypatch.setattr(eo, "_geocode_uap_location", _geocode_spy)

    sightings = eo._build_uap_sightings_from_hf_mirror()

    ids = [s["id"] for s in sightings]
    assert ids == ["NUFORC-1"], f"only the 2026 row should survive: got {ids}"
    # Stale rows must not have been geocoded — they should be dropped
    # before the geocoding loop is reached.
    assert geocode_calls == []


def test_hf_fallback_returns_empty_when_mirror_is_fully_stale(monkeypatch, caplog):
    """A mirror with nothing inside the rolling window yields no sightings.

    This is the smoking-gun case: every row in the HF mirror is years old.
    Pre-fix the fallback returned those ancient rows; post-fix it returns
    ``[]`` and logs an ERROR so the operator knows the layer is dead.
    """
    from services.fetchers import earth_observation as eo

    header = "Sighting,Occurred,Location,Shape,Duration,Posted,Summary"
    stale_rows = [
        '1,2023-04-15 21:00:00 Local,"Denver, CO, USA",Triangle,5 minutes,2023-04-16,"Old"',
        '2,2022-06-01 21:00:00 Local,"Seattle, WA, USA",Light,30 seconds,2022-06-02,"Older"',
        '3,2021-01-15 20:00:00 Local,"Phoenix, AZ, USA",Disk,2 minutes,2021-01-16,"Ancient"',
    ]
    mirror_csv = "\n".join([header, *stale_rows]) + "\n"

    def _serve_mirror(*a, **kw):
        return _StubResponse(mirror_csv)

    def _discard_cache(cache):
        return None

    def _never_resolves(*a, **kw):
        return None

    # Applied in this order: clock, download, cache load/save, geocoder.
    replacements = {
        "datetime": _FixedDateTime,
        "fetch_with_curl": _serve_mirror,
        "_load_nuforc_location_cache": _stub_geocode_cache,
        "_save_nuforc_location_cache": _discard_cache,
        "_geocode_uap_location": _never_resolves,
    }
    for attr_name, stand_in in replacements.items():
        monkeypatch.setattr(eo, attr_name, stand_in)

    with caplog.at_level(logging.ERROR, logger="services.fetchers.earth_observation"):
        sightings = eo._build_uap_sightings_from_hf_mirror()

    assert sightings == []

    # The ERROR must say how many stale rows were dropped, so the operator
    # can tell a stale mirror apart from a failed download (both give 0 rows).
    zero_row_messages = [
        message
        for message in (record.getMessage() for record in caplog.records)
        if "HF fallback yielded 0 rows" in message
    ]
    assert zero_row_messages, "expected loud ERROR when HF mirror is fully stale"
    assert any("dropped 3" in message for message in zero_row_messages)


def test_hf_fallback_still_returns_data_when_some_rows_are_in_window(monkeypatch, caplog):
    """Mixed-age mirror: some rows in the window, some not.

    The fallback should return only the in-window rows and must not log the
    "yielded 0 rows" ERROR that is reserved for a fully stale mirror.
    """
    from services.fetchers import earth_observation as eo

    csv_text = (
        "Sighting,Occurred,Location,Shape,Duration,Posted,Summary\n"
        '1,2026-04-15 21:00:00 Local,"Denver, CO, USA",Triangle,5 minutes,2026-04-16,"Fresh"\n'
        '2,2026-04-10 21:00:00 Local,"Seattle, WA, USA",Light,30 seconds,2026-04-10,"Also fresh"\n'
        '3,2020-01-15 20:00:00 Local,"Phoenix, AZ, USA",Disk,2 minutes,2020-01-16,"Ancient"\n'
    )

    monkeypatch.setattr(eo, "datetime", _FixedDateTime)
    monkeypatch.setattr(eo, "fetch_with_curl", lambda *a, **kw: _StubResponse(csv_text))
    monkeypatch.setattr(eo, "_load_nuforc_location_cache", _stub_geocode_cache)
    monkeypatch.setattr(eo, "_save_nuforc_location_cache", lambda cache: None)
    monkeypatch.setattr(eo, "_geocode_uap_location", lambda *a, **kw: None)

    # Capture ERROR records so the "no doomsday log" half of the contract is
    # actually checked rather than only described.
    with caplog.at_level(logging.ERROR, logger="services.fetchers.earth_observation"):
        sightings = eo._build_uap_sightings_from_hf_mirror()

    ids = sorted(s["id"] for s in sightings)
    assert ids == ["NUFORC-1", "NUFORC-2"], f"only in-window rows should appear: got {ids}"

    doomsday = [
        r.getMessage() for r in caplog.records if "HF fallback yielded 0 rows" in r.getMessage()
    ]
    assert not doomsday, f"zero-rows ERROR must not fire when rows survive: got {doomsday}"


def test_fetch_uap_sightings_marks_failure_when_both_paths_empty(monkeypatch, caplog):
    """A raising live path plus an empty HF fallback must read as a failure.

    In that situation ``fetch_uap_sightings`` is expected to:
      * leave the layer un-marked (pre-fix bug: it was marked fresh, so
        the layer showed as healthy-but-empty for days)
      * call ``assert_canary("uap_sightings", 0)`` so the health
        registry surfaces the broken layer
      * log an ERROR that names both failed paths
    """
    from services.fetchers import earth_observation as eo
    from services.fetchers import _store

    def _live_path_raises():
        raise RuntimeError("NUFORC live: zero rows pulled across 3 months")

    # Keys passed to ``_mark_fresh`` during the run, in call order.
    fresh_keys: list[str] = []

    def _record_fresh(*keys):
        fresh_keys.extend(keys)

    # ``(key, value)`` pairs passed to ``assert_canary`` during the run.
    canary_log: list[tuple[str, int]] = []

    def _record_canary(key, value):
        canary_log.append((key, int(value)))

    monkeypatch.setattr(_store, "is_any_active", lambda layer: True)
    monkeypatch.setattr(eo, "_load_nuforc_sightings_cache", lambda force_refresh=False: None)
    monkeypatch.setattr(eo, "_build_recent_uap_sightings", _live_path_raises)
    monkeypatch.setattr(eo, "_build_uap_sightings_from_hf_mirror", lambda: [])
    monkeypatch.setattr(eo, "_mark_fresh", _record_fresh)

    import services.slo as slo

    monkeypatch.setattr(slo, "assert_canary", _record_canary)

    with caplog.at_level(logging.ERROR, logger="services.fetchers.earth_observation"):
        eo.fetch_uap_sightings()

    assert fresh_keys == [], "broken layer must NOT be marked fresh"
    assert canary_log == [("uap_sightings", 0)], (
        f"expected canary trip when both paths fail; got {canary_log}"
    )
    # The ERROR has to say that live failed AND the fallback was empty, so
    # the operator does not have to guess which path broke.
    logged_messages = [record.getMessage() for record in caplog.records]
    assert any(
        "both live NUFORC and HF fallback" in message for message in logged_messages
    )


def test_fetch_uap_sightings_succeeds_when_fallback_returns_data(monkeypatch):
    """Positive path: the live scrape fails but the fallback supplies rows.

    The layer must be marked fresh, and ``assert_canary`` must NOT be
    tripped (the canary is only for a layer that ends up with zero data).
    """
    from services.fetchers import earth_observation as eo
    from services.fetchers import _store

    def _live_down():
        raise RuntimeError("live down")

    fallback_rows = [{"id": "NUFORC-fb-1", "date_time": "2026-04-20", "lat": 0.0, "lng": 0.0}]

    # Keys passed to ``_mark_fresh`` during the run, in call order.
    fresh_keys: list[str] = []

    def _record_fresh(*keys):
        fresh_keys.extend(keys)

    # ``(key, value)`` pairs passed to ``assert_canary`` during the run.
    canary_log: list[tuple[str, int]] = []

    def _record_canary(key, value):
        canary_log.append((key, int(value)))

    monkeypatch.setattr(_store, "is_any_active", lambda layer: True)
    monkeypatch.setattr(eo, "_load_nuforc_sightings_cache", lambda force_refresh=False: None)
    monkeypatch.setattr(eo, "_build_recent_uap_sightings", _live_down)
    monkeypatch.setattr(eo, "_build_uap_sightings_from_hf_mirror", lambda: fallback_rows)
    monkeypatch.setattr(eo, "_save_nuforc_sightings_cache", lambda s: None)
    monkeypatch.setattr(eo, "_mark_fresh", _record_fresh)

    import services.slo as slo

    monkeypatch.setattr(slo, "assert_canary", _record_canary)

    eo.fetch_uap_sightings()

    assert fresh_keys == ["uap_sightings"]
    assert canary_log == [], "canary should not trip when fallback supplies data"


def test_uap_scheduler_runs_weekly_not_daily():
    """The UAP scheduler job must be pinned to Mondays, not run daily.

    Daily was the pre-fix default; weekly matches the layer's stated cadence
    (a rolling 60-day digest) and keeps load on nuforc.org light.

    The check is static: the source of ``services.data_fetcher`` is parsed
    and the ``add_job(...)`` call that carries the ``uap_sightings_weekly``
    id is inspected for a ``day_of_week="mon"`` keyword, either on the call
    itself or on a call nested inside it (e.g. a trigger object). Only the
    weekday is pinned here; the time of day is not asserted.
    """
    import ast

    from services import data_fetcher

    with open(data_fetcher.__file__, "r", encoding="utf-8") as f:
        text = f.read()

    assert "uap_sightings_weekly" in text, (
        "scheduler id should be uap_sightings_weekly (was uap_sightings_daily pre-fix)"
    )

    def _is_add_job(node):
        """True for ``add_job(...)`` and ``<anything>.add_job(...)`` calls."""
        if not isinstance(node, ast.Call):
            return False
        func = node.func
        name = func.attr if isinstance(func, ast.Attribute) else getattr(func, "id", None)
        return name == "add_job"

    def _mentions_job_id(call):
        """True when the job id literal appears anywhere inside ``call``."""
        return any(
            isinstance(inner, ast.Constant) and inner.value == "uap_sightings_weekly"
            for inner in ast.walk(call)
        )

    # Working on the syntax tree instead of slicing text means nested calls,
    # lambdas, comments, and quote style cannot truncate or hide the block.
    uap_jobs = [
        node
        for node in ast.walk(ast.parse(text))
        if _is_add_job(node) and _mentions_job_id(node)
    ]
    assert uap_jobs, "could not locate add_job block for UAP scheduler"
    job_call = uap_jobs[0]
    job_block = ast.get_source_segment(text, job_call)

    # The day_of_week directive is the difference between daily and weekly.
    # If somebody flips it back to daily, this fires.
    day_of_week_values = [
        kw.value.value
        for call in ast.walk(job_call)
        if isinstance(call, ast.Call)
        for kw in call.keywords
        if kw.arg == "day_of_week" and isinstance(kw.value, ast.Constant)
    ]
    assert "mon" in day_of_week_values, (
        f"expected day_of_week='mon' in UAP scheduler block:\n{job_block}"
    )