Shadowbroker/backend/services/radio_intercept.py

import requests
from bs4 import BeautifulSoup
import logging
from cachetools import cached, TTLCache
import reverse_geocoder as rg
from urllib.parse import urlparse

from services.network_utils import outbound_user_agent

logger = logging.getLogger(__name__)

_OPENMHZ_AUDIO_HOSTS = {"media.openmhz.com", "media2.openmhz.com", "media3.openmhz.com"}


# Round 7a / Issues #289, #290, #291 (tg12 audit):
# We previously sent a spoofed Chrome User-Agent and (for OpenMHz) used
# cloudscraper to bypass anti-bot challenges. Both are dishonest and ToS-
# unfriendly. We now send the per-install Shadowbroker UA — the upstream
# can identify us, rate-limit us per install, and contact us if needed.
#
# If the upstream actively blocks our honest UA, the feature degrades
# gracefully (returns an empty list / cached results) rather than
# escalating to deception.


def _broadcastify_user_agent() -> str:
    return outbound_user_agent("broadcastify")


def _openmhz_user_agent() -> str:
    return outbound_user_agent("openmhz")

# Cache the top feeds for 5 minutes so we don't hammer Broadcastify
radio_cache = TTLCache(maxsize=1, ttl=300)


@cached(radio_cache)
def get_top_broadcastify_feeds():
    """
    Scrapes the Broadcastify Top 50 live audio feeds public dashboard.
    Returns a list of dictionaries containing feed metadata and direct stream URLs.
    """
    logger.info("Scraping Broadcastify Top Feeds (Cache Miss)")
    headers = {
        # Issue #289 (tg12) + Round 7a: identify ourselves honestly as a
        # per-install Shadowbroker scraper. Broadcastify can rate-limit
        # us per install or block us; either way we stop pretending to be
        # a browser. If they block, the panel degrades gracefully.
        "User-Agent": _broadcastify_user_agent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }

    try:
        res = requests.get("https://www.broadcastify.com/listen/top", headers=headers, timeout=10)
        if res.status_code != 200:
            logger.error(f"Broadcastify Scrape Failed: HTTP {res.status_code}")
            return []

        soup = BeautifulSoup(res.text, "html.parser")

        table = soup.find("table", {"class": "btable"})
        if not table:
            logger.error("Could not find feeds table on Broadcastify.")
            return []

        feeds = []
        rows = table.find_all("tr")[1:]  # Skip header row

        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 5:
                # Top layout: [Listeners, Feed ID (hidden), Location, Feed Name, Category, Genre]
                listeners_str = cols[0].text.strip().replace(",", "")
                listeners = int(listeners_str) if listeners_str.isdigit() else 0

                link_tag = cols[2].find("a")
                if not link_tag:
                    continue

                href = link_tag.get("href", "")
                feed_id = href.split("/")[-1] if "/listen/feed/" in href else None

                if not feed_id:
                    continue

                location = cols[1].text.strip()
                name = cols[2].text.strip()
                category = cols[3].text.strip()

                feeds.append(
                    {
                        "id": feed_id,
                        "listeners": listeners,
                        "location": location,
                        "name": name,
                        "category": category,
                        "stream_url": f"https://broadcastify.cdnstream1.com/{feed_id}",
                    }
                )

        logger.info(f"Successfully scraped {len(feeds)} top feeds from Broadcastify.")
        return feeds

    except (requests.RequestException, ConnectionError, TimeoutError, ValueError, KeyError) as e:
        logger.error(f"Broadcastify Scrape Exception: {e}")
        return []


# Cache OpenMHZ systems mapping so we don't have to fetch all 450+ every time
openmhz_systems_cache = TTLCache(maxsize=1, ttl=3600)


@cached(openmhz_systems_cache)
def get_openmhz_systems():
    """Fetches the full directory of OpenMHZ systems.

    Issue #290 (tg12) + Round 7a: replaced cloudscraper-based Chrome
    impersonation with an honest per-install Shadowbroker User-Agent.
    If OpenMHz's Cloudflare layer blocks honest traffic, we accept
    that degradation (return empty list) rather than spoof a browser.
    """
    logger.info("Fetching OpenMHZ Systems (Cache Miss)")
    try:
        res = requests.get(
            "https://api.openmhz.com/systems",
            timeout=15,
            headers={"User-Agent": _openmhz_user_agent(), "Accept": "application/json"},
        )
        if res.status_code == 200:
            data = res.json()
            return data.get("systems", []) if isinstance(data, dict) else []
        if res.status_code in (403, 503):
            logger.warning(
                "OpenMHZ returned %s for systems directory — Cloudflare may "
                "be blocking our honest UA. Feature degrades to empty result.",
                res.status_code,
            )
        return []
    except (requests.RequestException, ConnectionError, TimeoutError, ValueError, KeyError) as e:
        logger.error(f"OpenMHZ Systems Fetch Exception: {e}")
        return []


# Cache specific city calls briefly (15-30s) to limit our polling rate
openmhz_calls_cache = TTLCache(maxsize=100, ttl=20)


@cached(openmhz_calls_cache)
def get_recent_openmhz_calls(sys_name: str):
    """Fetches the actual audio burst .m4a URLs for a specific system (e.g., 'wmata').

    Issue #290 (tg12) + Round 7a: same honest-UA model as
    ``get_openmhz_systems``.
    """
    logger.info(f"Fetching OpenMHZ calls for {sys_name} (Cache Miss)")
    try:
        url = f"https://api.openmhz.com/{sys_name}/calls"
        res = requests.get(
            url,
            timeout=15,
            headers={"User-Agent": _openmhz_user_agent(), "Accept": "application/json"},
        )
        if res.status_code == 200:
            data = res.json()
            return data.get("calls", []) if isinstance(data, dict) else []
        return []
    except (requests.RequestException, ConnectionError, TimeoutError, ValueError, KeyError) as e:
        logger.error(f"OpenMHZ Calls Fetch Exception ({sys_name}): {e}")
        return []


_OPENMHZ_MAX_REDIRECTS = 5


def openmhz_audio_response(target_url: str):
    """Fetch an OpenMHz audio object through the backend with browser-safe headers.

    Redirects are followed manually so each hop's host can be re-validated
    against ``_OPENMHZ_AUDIO_HOSTS``. Without this, the upstream could
    302-redirect to an internal address (e.g. ``http://127.0.0.1:8000/...``
    or an RFC1918 range), and the backend would dutifully fetch and stream
    that response back to the browser — a classic open-redirect-to-SSRF
    chain. Same-host redirects (CDN edge selection) still work normally.
    """
    from fastapi import HTTPException
    from fastapi.responses import StreamingResponse
    from urllib.parse import urljoin

    parsed = urlparse(str(target_url or ""))
    host = (parsed.hostname or "").lower()
    if parsed.scheme != "https" or host not in _OPENMHZ_AUDIO_HOSTS:
        raise HTTPException(status_code=400, detail="Unsupported OpenMHz audio URL")

    current_url = target_url
    hops = 0
    try:
        while True:
            upstream = requests.get(
                current_url,
                stream=True,
                timeout=(5, 20),
                allow_redirects=False,
                headers={
                    # Issue #291 (tg12) + Round 7a: drop spoofed Mozilla
                    # UA and the fake first-party Referer. Identify as
                    # the per-install Shadowbroker proxy honestly.
                    "User-Agent": _openmhz_user_agent(),
                    "Accept": "audio/mpeg,audio/*,*/*;q=0.8",
                },
            )
            if upstream.is_redirect or upstream.status_code in (301, 302, 303, 307, 308):
                location = upstream.headers.get("Location", "")
                upstream.close()
                if hops >= _OPENMHZ_MAX_REDIRECTS or not location:
                    raise HTTPException(status_code=502, detail="OpenMHz redirect rejected")
                next_url = urljoin(current_url, location)
                next_parsed = urlparse(next_url)
                next_host = (next_parsed.hostname or "").lower()
                # Re-validate the next hop against the same allowlist used for
                # the original URL. Cross-host redirects to disallowed hosts
                # are rejected silently; the browser audio element handles
                # the resulting 502 gracefully and moves on.
                if next_parsed.scheme != "https" or next_host not in _OPENMHZ_AUDIO_HOSTS:
                    raise HTTPException(status_code=502, detail="OpenMHz redirect rejected")
                current_url = next_url
                hops += 1
                continue
            break
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail="OpenMHz audio fetch failed") from exc

    if upstream.status_code >= 400:
        upstream.close()
        raise HTTPException(status_code=upstream.status_code, detail="OpenMHz audio unavailable")

    def chunks():
        try:
            for chunk in upstream.iter_content(chunk_size=64 * 1024):
                if chunk:
                    yield chunk
        finally:
            upstream.close()

    return StreamingResponse(
        chunks(),
        media_type="audio/mpeg",
        headers={
            "Cache-Control": "public, max-age=300",
            "Accept-Ranges": "bytes",
        },
    )


US_STATES = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "Washington, D.C.": "DC",
    "District of Columbia": "DC",
}

import math


def haversine_distance(lat1, lon1, lat2, lon2):
    R = 3958.8  # Earth radius in miles
    dLat = math.radians(lat2 - lat1)
    dLon = math.radians(lon2 - lon1)
    a = math.sin(dLat / 2) * math.sin(dLat / 2) + math.cos(math.radians(lat1)) * math.cos(
        math.radians(lat2)
    ) * math.sin(dLon / 2) * math.sin(dLon / 2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c


def find_nearest_openmhz_systems_list(lat: float, lng: float, limit: int = 5):
    """
    Finds the strictly nearest OpenMHZ systems by distance.
    """
    systems = get_openmhz_systems()
    if not systems:
        return []

    # Calculate distance for all systems that provide coordinates
    valid_systems = []
    for s in systems:
        s_lat = s.get("lat")
        s_lng = s.get("lng")
        if s_lat is not None and s_lng is not None:
            dist = haversine_distance(lat, lng, float(s_lat), float(s_lng))
            s["distance_miles"] = dist
            valid_systems.append(s)

    if not valid_systems:
        return []

    # Sort strictly by distance
    valid_systems.sort(key=lambda x: x["distance_miles"])
    return valid_systems[:limit]


def find_nearest_openmhz_system(lat: float, lng: float):
    """
    Returns the single closest OpenMHZ system by distance.
    """
    nearest = find_nearest_openmhz_systems_list(lat, lng, limit=1)
    if nearest:
        return nearest[0]
    return None