diff --git a/.gitignore b/.gitignore index bbfc8b3..89bd512 100644 --- a/.gitignore +++ b/.gitignore @@ -198,6 +198,7 @@ graphify-out/ # Internal docs & brainstorming (never commit) # ======================== docs/* +!docs/OUTBOUND_DATA.md !docs/mesh/ docs/mesh/* !docs/mesh/threat-model.md diff --git a/backend/.env.example b/backend/.env.example index 1e6a24b..93e9677 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -87,6 +87,12 @@ AIS_API_KEY= # https://aisstream.io/ — free tier WebSocket key # Free MAP_KEY from https://firms.modaps.eosdis.nasa.gov/map/#d:24hrs;@0.0,0.0,3.0z # FIRMS_MAP_KEY= +# Ukraine frontline mirror (GitHub). Default follows cyterat/deepstate-map-data@main. +# Pin an immutable commit SHA so ingest cannot silently change if main is force-pushed (#362). +# Example (verify on GitHub before use): main @ b479954e94696bc5622c7818fd20a64a699f4fe8 +# DEEPSTATE_MIRROR_COMMIT=b479954e94696bc5622c7818fd20a64a699f4fe8 +# DEEPSTATE_MIRROR_REPO=cyterat/deepstate-map-data + # Ukraine air raid alerts from alerts.in.ua — free token from https://alerts.in.ua/ # ALERTS_IN_UA_TOKEN= diff --git a/backend/services/cctv_pipeline.py b/backend/services/cctv_pipeline.py index a489396..380b0a0 100644 --- a/backend/services/cctv_pipeline.py +++ b/backend/services/cctv_pipeline.py @@ -1012,14 +1012,33 @@ def _extract_img_src(html_fragment: str): class MadridCityIngestor(BaseCCTVIngestor): """Madrid City Hall traffic cameras from datos.madrid.es KML feed.""" - KML_URL = "http://datos.madrid.es/egob/catalogo/202088-0-trafico-camaras.kml" + KML_URL_HTTPS = "https://datos.madrid.es/egob/catalogo/202088-0-trafico-camaras.kml" + KML_URL_HTTP = "http://datos.madrid.es/egob/catalogo/202088-0-trafico-camaras.kml" + + def _fetch_kml(self): + """Prefer HTTPS; fall back to legacy HTTP if the catalog is HTTP-only (#363).""" + last_error: Exception | None = None + for url in (self.KML_URL_HTTPS, self.KML_URL_HTTP): + try: + response = fetch_with_curl(url, timeout=20) + response.raise_for_status() + if url == self.KML_URL_HTTP: + logger.warning( + "MadridCityIngestor: HTTPS KML unavailable, using HTTP catalog feed" + ) + return response + except Exception as e: + last_error = e + logger.debug("MadridCityIngestor: KML fetch failed for %s: %s", url, e) + if last_error is not None: + raise last_error + raise RuntimeError("Madrid KML fetch failed") def fetch_data(self) -> List[Dict[str, Any]]: import defusedxml.ElementTree as ET try: - response = fetch_with_curl(self.KML_URL, timeout=20) - response.raise_for_status() + response = self._fetch_kml() except Exception as e: logger.error(f"MadridCityIngestor: failed to fetch KML: {e}") return [] diff --git a/backend/services/geopolitics.py b/backend/services/geopolitics.py index 47357db..ee0fd11 100644 --- a/backend/services/geopolitics.py +++ b/backend/services/geopolitics.py @@ -1,3 +1,4 @@ +import os import requests import logging import zipfile @@ -20,6 +21,50 @@ logger = logging.getLogger(__name__) # Cache Frontline data for 30 minutes, it doesn't move that fast frontline_cache = TTLCache(maxsize=1, ttl=1800) +_DEFAULT_DEEPSTATE_MIRROR_REPO = "cyterat/deepstate-map-data" + + +def _deepstate_mirror_ref() -> tuple[str, str]: + """Return (github_repo_slug, git_ref) for the DeepState mirror. + + When ``DEEPSTATE_MIRROR_COMMIT`` is set, ingest is pinned to that immutable + SHA instead of following the mutable ``main`` branch (#362). + """ + repo = (os.environ.get("DEEPSTATE_MIRROR_REPO") or _DEFAULT_DEEPSTATE_MIRROR_REPO).strip() + if repo.count("/") != 1: + repo = _DEFAULT_DEEPSTATE_MIRROR_REPO + commit = (os.environ.get("DEEPSTATE_MIRROR_COMMIT") or "").strip() + ref = commit if commit else "main" + return repo, ref + + +def _latest_deepstate_geo_path(tree_items: list) -> str | None: + geo_files = [ + item["path"] + for item in tree_items + if isinstance(item, dict) + and str(item.get("path", "")).startswith("data/deepstatemap_data_") + and str(item.get("path", "")).endswith(".geojson") + ] + return sorted(geo_files)[-1] if geo_files else None + + +def _annotate_deepstate_geojson(data: dict) -> dict: + name_map = { + 0: "Russian-occupied areas", + 1: "Russian advance", + 2: "Liberated area", + 3: "Russian-occupied areas", # Crimea / LPR / DPR + 4: "Directions of UA attacks", + } + if "features" in data: + for idx, feature in enumerate(data["features"]): + if "properties" not in feature or feature["properties"] is None: + feature["properties"] = {} + feature["properties"]["name"] = name_map.get(idx, "Russian-occupied areas") + feature["properties"]["zone_id"] = idx + return data + @cached(frontline_cache) def fetch_ukraine_frontlines(): @@ -27,67 +72,34 @@ def fetch_ukraine_frontlines(): Fetches the latest GeoJSON data representing the Ukraine frontline. We use the cyterat/deepstate-map-data github mirror since the public API is locked. """ + repo, ref = _deepstate_mirror_ref() try: - logger.info("Fetching DeepStateMap from GitHub mirror...") + logger.info("Fetching DeepStateMap from GitHub mirror (%s @ %s)...", repo, ref) - # First, query the repo tree to find the latest file name - tree_url = ( - "https://api.github.com/repos/cyterat/deepstate-map-data/git/trees/main?recursive=1" - ) + tree_url = f"https://api.github.com/repos/{repo}/git/trees/{ref}?recursive=1" res_tree = requests.get(tree_url, timeout=10) if res_tree.status_code == 200: - tree_data = res_tree.json().get("tree", []) - # Filter for geojson files in data folder - geo_files = [ - item["path"] - for item in tree_data - if item["path"].startswith("data/deepstatemap_data_") - and item["path"].endswith(".geojson") - ] - - if geo_files: - # Get the alphabetically latest file (since it's named with YYYYMMDD) - latest_file = sorted(geo_files)[-1] - - raw_url = f"https://raw.githubusercontent.com/cyterat/deepstate-map-data/main/{latest_file}" - logger.info(f"Downloading latest DeepStateMap: {raw_url}") + latest_file = _latest_deepstate_geo_path(res_tree.json().get("tree", [])) + if latest_file: + raw_url = f"https://raw.githubusercontent.com/{repo}/{ref}/{latest_file}" + logger.info("Downloading DeepStateMap: %s", raw_url) res_geo = requests.get(raw_url, timeout=20) if res_geo.status_code == 200: - data = res_geo.json() - - # The Cyterat GitHub mirror strips all properties and just provides a raw array of Feature polygons. - # Based on DeepStateMap's frontend mapping, the array index corresponds to the zone type: - # 0: Russian-occupied areas - # 1: Russian advance - # 2: Liberated area - # 3: Uncontested/Crimea (often folded into occupied) - name_map = { - 0: "Russian-occupied areas", - 1: "Russian advance", - 2: "Liberated area", - 3: "Russian-occupied areas", # Crimea / LPR / DPR - 4: "Directions of UA attacks", - } - - if "features" in data: - for idx, feature in enumerate(data["features"]): - if "properties" not in feature or feature["properties"] is None: - feature["properties"] = {} - - feature["properties"]["name"] = name_map.get( - idx, "Russian-occupied areas" - ) - feature["properties"]["zone_id"] = idx - - return data - else: - logger.error( - f"Failed to fetch parsed Github Raw GeoJSON: {res_geo.status_code}" - ) + return _annotate_deepstate_geojson(res_geo.json()) + logger.error( + "Failed to fetch parsed Github Raw GeoJSON: %s", res_geo.status_code + ) + else: + logger.error("No deepstatemap_data_*.geojson files in mirror tree at %s", ref) else: - logger.error(f"Failed to fetch Github Tree for Deepstatemap: {res_tree.status_code}") + logger.error( + "Failed to fetch Github tree for Deepstatemap (%s @ %s): %s", + repo, + ref, + res_tree.status_code, + ) except (requests.RequestException, ConnectionError, TimeoutError, ValueError, KeyError) as e: logger.error(f"Error fetching DeepStateMap: {e}") return None diff --git a/backend/services/kiwisdr_fetcher.py b/backend/services/kiwisdr_fetcher.py index 47b1f9b..6408cc1 100644 --- a/backend/services/kiwisdr_fetcher.py +++ b/backend/services/kiwisdr_fetcher.py @@ -32,7 +32,8 @@ logger = logging.getLogger(__name__) _REFRESH_SECONDS = 24 * 3600 kiwisdr_cache: TTLCache = TTLCache(maxsize=1, ttl=_REFRESH_SECONDS) -_SOURCE_URL = "http://rx.linkfanel.net/kiwisdr_com.js" +_SOURCE_URL_HTTP = "http://rx.linkfanel.net/kiwisdr_com.js" +_SOURCE_URL_HTTPS = "https://rx.linkfanel.net/kiwisdr_com.js" _CACHE_FILE = Path(__file__).resolve().parent.parent / "data" / "kiwisdr_cache.json" # Bundled fallback — shipped with the codebase so the KiwiSDR layer always # has something to render even when the upstream is unreachable, returns @@ -184,6 +185,29 @@ def _validate_fetched_nodes(nodes: list[dict]) -> bool: return True +def _fetch_mirror_payload_text() -> str | None: + """Try HTTPS first, then HTTP. Shape validation still applies (#364).""" + from services.network_utils import fetch_with_curl + + last_error: Exception | None = None + for url in (_SOURCE_URL_HTTPS, _SOURCE_URL_HTTP): + try: + res = fetch_with_curl(url, timeout=20) + if res and res.status_code == 200: + if url == _SOURCE_URL_HTTP: + logger.info( + "KiwiSDR: HTTPS mirror unavailable; using HTTP with shape validation" + ) + return res.text + last_error = RuntimeError(f"HTTP {getattr(res, 'status_code', 'unknown')}") + except Exception as e: + last_error = e + logger.debug("KiwiSDR mirror fetch failed for %s: %s", url, e) + if last_error is not None: + logger.warning("KiwiSDR mirror fetch failed: %s", last_error) + return None + + def _load_bundled_fallback() -> list[dict]: """Last-resort directory shipped with the codebase. Always returns a list (may be empty if the bundle is missing in older deployments).""" @@ -216,8 +240,6 @@ def fetch_kiwisdr_nodes() -> list[dict]: tampered upstream returning garbage is caught by _validate_fetched_nodes() and falls through to whatever previously-trusted snapshot we have. """ - from services.network_utils import fetch_with_curl - # 1. Trust on-disk cache if fresh. cached_nodes = _load_disk_cache() if cached_nodes is not None: @@ -230,14 +252,12 @@ def fetch_kiwisdr_nodes() -> list[dict]: fresh_nodes: list[dict] = [] fetch_succeeded = False try: - res = fetch_with_curl(_SOURCE_URL, timeout=20) - if res and res.status_code == 200: - fresh_nodes = _parse_mirror_payload(res.text) + body = _fetch_mirror_payload_text() + if body: + fresh_nodes = _parse_mirror_payload(body) fetch_succeeded = True else: - logger.warning( - f"KiwiSDR fetch returned HTTP {res.status_code if res else 'no response'}" - ) + logger.warning("KiwiSDR fetch returned no usable mirror payload") except (requests.RequestException, ConnectionError, TimeoutError, ValueError, KeyError) as e: logger.warning(f"KiwiSDR fetch exception: {e}") diff --git a/backend/tests/test_deepstate_mirror_pin.py b/backend/tests/test_deepstate_mirror_pin.py new file mode 100644 index 0000000..4ef3ba4 --- /dev/null +++ b/backend/tests/test_deepstate_mirror_pin.py @@ -0,0 +1,46 @@ +"""DeepState GitHub mirror pinning (#362).""" +from __future__ import annotations + +import os +from unittest.mock import MagicMock, patch + +import services.geopolitics as gp + + +def test_deepstate_mirror_ref_defaults(monkeypatch): + monkeypatch.delenv("DEEPSTATE_MIRROR_COMMIT", raising=False) + monkeypatch.delenv("DEEPSTATE_MIRROR_REPO", raising=False) + repo, ref = gp._deepstate_mirror_ref() + assert repo == "cyterat/deepstate-map-data" + assert ref == "main" + + +def test_deepstate_mirror_ref_pinned_commit(monkeypatch): + monkeypatch.setenv("DEEPSTATE_MIRROR_COMMIT", "abc123def456") + monkeypatch.setenv("DEEPSTATE_MIRROR_REPO", "cyterat/deepstate-map-data") + repo, ref = gp._deepstate_mirror_ref() + assert repo == "cyterat/deepstate-map-data" + assert ref == "abc123def456" + + +def test_fetch_ukraine_frontlines_uses_pinned_tree_url(monkeypatch): + monkeypatch.setenv("DEEPSTATE_MIRROR_COMMIT", "deadbeef") + gp.frontline_cache.clear() + + tree_resp = MagicMock(status_code=200) + tree_resp.json.return_value = { + "tree": [{"path": "data/deepstatemap_data_20260101.geojson"}] + } + geo_resp = MagicMock(status_code=200) + geo_resp.json.return_value = {"features": []} + + with patch("services.geopolitics.requests.get", side_effect=[tree_resp, geo_resp]) as get: + result = gp.fetch_ukraine_frontlines() + + assert result == {"features": []} + tree_call = get.call_args_list[0][0][0] + raw_call = get.call_args_list[1][0][0] + assert "/git/trees/deadbeef" in tree_call + assert "raw.githubusercontent.com/cyterat/deepstate-map-data/deadbeef/" in raw_call + + gp.frontline_cache.clear() diff --git a/backend/tests/test_kiwisdr_https_first.py b/backend/tests/test_kiwisdr_https_first.py new file mode 100644 index 0000000..85fb066 --- /dev/null +++ b/backend/tests/test_kiwisdr_https_first.py @@ -0,0 +1,29 @@ +"""KiwiSDR mirror prefers HTTPS (#364).""" +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +from services.kiwisdr_fetcher import ( + _SOURCE_URL_HTTP, + _SOURCE_URL_HTTPS, + _fetch_mirror_payload_text, +) + + +def test_fetch_mirror_tries_https_before_http(): + calls: list[str] = [] + + def fake_fetch(url, **kwargs): + calls.append(url) + if url == _SOURCE_URL_HTTPS: + raise ConnectionError("tls not available") + res = MagicMock() + res.status_code = 200 + res.text = "var kiwisdr_com = [];" + return res + + with patch("services.network_utils.fetch_with_curl", side_effect=fake_fetch): + body = _fetch_mirror_payload_text() + + assert body == "var kiwisdr_com = [];" + assert calls == [_SOURCE_URL_HTTPS, _SOURCE_URL_HTTP] diff --git a/docs/OUTBOUND_DATA.md b/docs/OUTBOUND_DATA.md new file mode 100644 index 0000000..df78fee --- /dev/null +++ b/docs/OUTBOUND_DATA.md @@ -0,0 +1,43 @@ +# Outbound data and third-party exposure + +Shadowbroker is **self-hosted**: each install uses its own backend egress IP (and optional `OPERATOR_HANDLE` in `User-Agent`). This documents intentional third-party contact for audit issues #348–#366. + +## Architecture + +| Path | Who calls third parties | +|------|-------------------------| +| UI → `/api/*` → fetchers | **Backend** | +| Map basemap tiles/fonts | **Browser** (CARTO, demotiles.maplibre.org) | +| CCTV proxy | **Backend** (with upstream-required `Referer` / `Origin`) | + +## Ukraine frontline mirror (#362) + +- **Layer:** `ukraine_frontline` → `frontlines` on the map (DeepStateMap polygons). **Not** UAP (`uap_sightings` / NUFORC). +- **Code:** `backend/services/geopolitics.py` +- **Default:** `cyterat/deepstate-map-data` @ `main`, latest `data/deepstatemap_data_*.geojson` +- **Pin:** `DEEPSTATE_MIRROR_COMMIT=` — immutable Git snapshot; bump SHA when you want newer lines +- **Optional:** `DEEPSTATE_MIRROR_REPO=owner/repo` + +## Madrid CCTV (#363) + +- **Ingest:** HTTPS-first KML on `datos.madrid.es` (catalog only); HTTP fallback if needed +- **Feeds:** Still images from URLs inside the KML (`informo.madrid.es`, etc.), proxied with `Referer: https://informo.madrid.es/` — unchanged by KML transport + +## KiwiSDR (#364) + +- HTTPS first, then HTTP; shape validation + bundled `backend/data/kiwisdr_directory.json` + +## Other documented exposures + +- **#354 Basemap:** browser → `*.basemaps.cartocdn.com`, `demotiles.maplibre.org` +- **#349 CCTV Referer:** required for many DOT/city streams; backend proxy only +- **#361 Operator UA:** `OPERATOR_HANDLE` / `outbound_user_agent()` per install +- **#366 Broadcastify:** backend scrape with honest UA +- **#348 LiveUAMap:** `SHADOWBROKER_ENABLE_LIVEUAMAP_SCRAPER` (default on Linux, off Windows) + +## Operator checklist + +1. Set `OPERATOR_HANDLE` if you want a recognizable contact on upstream logs. +2. Pin `DEEPSTATE_MIRROR_COMMIT` after reviewing a mirror commit (see `backend/.env.example`). +3. Set `SHADOWBROKER_ENABLE_LIVEUAMAP_SCRAPER=false` to disable LiveUAMap contact. +4. Self-host map tiles if basemap CDN exposure matters.