mirror of
https://github.com/BigBodyCobain/Shadowbroker.git
synced 2026-05-28 18:11:31 +02:00
76750caa92
== Per-install operator handle for every third-party API call ==
Before this PR, every Shadowbroker install identified itself to
Wikipedia, Wikidata, Nominatim, GDELT, OpenMHz, Broadcastify,
weather.gov, NUFORC, Sentinel/Planetary Computer, TinyGS / CelesTrak,
Shodan, Finnhub, and others with a single project-wide User-Agent
("Shadowbroker/1.0" or "ShadowBroker-OSINT/1.0"). From the upstream's
perspective every install in the world looked like one giant scraper.
If one install misbehaved, the upstream's only recourse was to block
"Shadowbroker" as a whole.
PR #284 inadvertently doubled down on this in the frontend by
introducing a shared `WIKIMEDIA_API_USER_AGENT` constant. This PR
retrofits both the backend and the frontend to per-operator attribution.
New setting: OPERATOR_HANDLE (env var / settings UI / auto-gen)
New helper: network_utils.outbound_user_agent("purpose")
The handle is auto-generated as "operator-XXXXXX" on first call (the
"shadow-" prefix from earlier drafts was deliberately dropped — too
suspicious-looking for abuse-detection systems). Operators can
override via OPERATOR_HANDLE; the value is sanitized to lowercase
alphanumeric+dash+underscore and capped at 48 chars. Persisted to
backend/data/operator_handle.json so it survives container restarts.
Retrofitted call sites (every User-Agent that was previously project-wide):
- services/region_dossier.py (Wikipedia + Wikidata + Nominatim)
- services/geocode.py (Nominatim)
- services/sentinel_search.py (Microsoft Planetary Computer)
- services/feed_ingester.py (operator-curated RSS feeds)
- services/fetchers/earth_observation.py (weather.gov, NUFORC)
- services/fetchers/infrastructure.py
- services/fetchers/aircraft_database.py
- services/fetchers/route_database.py
- services/fetchers/trains.py
- services/fetchers/meshtastic_map.py
- services/shodan_connector.py
- services/unusual_whales_connector.py (Finnhub)
- services/tinygs_fetcher.py (CelesTrak + TinyGS)
- services/sar/sar_products_client.py
- services/geopolitics.py (GDELT)
- services/radio_intercept.py (Broadcastify + OpenMHz)
- routers/cctv.py + main.py (CCTV proxy)
- routers/ai_intel.py
- scripts/convert_power_plants.py (release-time data refresh)
Spoofed browser UAs removed (issues #289 / #290 / #291 — tg12 audit):
- cloudscraper-based Chrome impersonation against api.openmhz.com
-> replaced with honest requests + per-install UA
- Mozilla/5.0 spoofed UA on Broadcastify scrape
-> replaced with honest UA
- Mozilla/5.0 + fake first-party Referer on OpenMHz audio relay
-> replaced with honest UA
- cloudscraper dependency dropped from pyproject.toml + uv.lock
Frontend retrofit:
- new GET /api/settings/operator-handle endpoint (local-operator
gated) returns the install's handle
- frontend/src/lib/wikimediaClient.ts fetches the handle once on
first use, caches it for page lifetime, embeds it in the
Api-User-Agent for every Wikipedia / Wikidata browser-direct call
== GDELT GCS-direct fix ==
GDELT's data.gdeltproject.org is a CNAME to a Google Cloud Storage
bucket. GCS responds with the wildcard *.storage.googleapis.com cert
which legitimately does NOT cover the GDELT custom domain, so Python's
TLS verification correctly refuses the connection. Some networks
happen to route through a path where this works; many (notably Docker
Desktop's outbound NAT on local installs) do not. Verified on the
maintainer's local install: GDELT was unreachable; 1610 geopolitical
events / 48 export files were dropping silently.
Fix: services/geopolitics._gcs_direct_gdelt_url() rewrites any
data.gdeltproject.org URL to its GCS-direct equivalent
(storage.googleapis.com/data.gdeltproject.org/...) where the standard
GCS cert is genuinely valid. api.gdeltproject.org and every other host
are left untouched.
Confirmed live: backend log goes from
GDELT lastupdate failed: 500
to
Downloading 48 GDELT export files...
Downloaded 48/48 GDELT exports
GDELT parsed: 1610 conflict locations from 48 files
== Tests ==
backend/tests/test_per_operator_outbound_attribution.py (12 tests)
backend/tests/test_gdelt_gcs_direct_rewrite.py (6 tests)
backend/tests/test_region_dossier_wikimedia_ua.py (updated to
pin the helper + per-operator handle, not the old constant)
frontend/src/__tests__/utils/wikimediaClient.test.ts (rewritten
to mock /api/settings/operator-handle and assert per-operator UA)
Local: backend 114/114 security+audit+round7a suite green;
frontend 718/718 vitest suite green.
Credit: tg12 (external security audit, issues #289/#290/#291
relating to spoofed UAs); BigBodyCobain (operator-prefix call,
GDELT cloud-vs-local diagnosis).
169 lines
5.4 KiB
Python
169 lines
5.4 KiB
Python
"""Static route + airport database loaded from vrs-standing-data.adsb.lol.
|
|
|
|
Replaces the per-batch /api/0/routeset POST with a single daily bulk download.
|
|
Routes change ~weekly when airlines update schedules, so a 24h refresh cadence
|
|
is far more than sufficient and removes ~all live-API pressure on adsb.lol.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
import gzip
|
|
import io
|
|
import logging
|
|
import threading
|
|
import time
|
|
from typing import Any
|
|
|
|
import requests
|
|
|
|
|
|
|
|
def _route_db_user_agent() -> str:
    """Return the User-Agent header value used for route-database downloads.

    The network helper module is imported at call time rather than at module
    import time, and the value is requested with the ``"route-database"``
    purpose tag.
    """
    from services import network_utils

    return network_utils.outbound_user_agent("route-database")
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_ROUTES_URL = "https://vrs-standing-data.adsb.lol/routes.csv.gz"
|
|
_AIRPORTS_URL = "https://vrs-standing-data.adsb.lol/airports.csv.gz"
|
|
_REFRESH_INTERVAL_S = 5 * 24 * 3600
|
|
_HTTP_TIMEOUT_S = 60
|
|
|
|
from services.network_utils import DEFAULT_USER_AGENT as _USER_AGENT
|
|
|
|
_lock = threading.RLock()
|
|
_routes_by_callsign: dict[str, dict[str, Any]] = {}
|
|
_airports_by_icao: dict[str, dict[str, Any]] = {}
|
|
_last_refresh = 0.0
|
|
_refresh_in_progress = False
|
|
|
|
|
|
def _fetch_csv_gz(url: str) -> list[dict[str, str]]:
    """Download a gzip-compressed CSV file and return its rows as dicts.

    Args:
        url: Location of the ``.csv.gz`` resource to download.

    Returns:
        One dict per CSV data row, keyed by the column names of the header row.

    Raises:
        requests.RequestException: On a network failure or a non-success
            HTTP status (via ``raise_for_status``).
        OSError: If the payload is not a gzip stream (``gzip.BadGzipFile``).
        ValueError: If the gzip stream is truncated or corrupt, the bytes are
            not valid UTF-8, or the CSV text cannot be parsed.
    """
    import zlib  # function-scope import: only needed to name zlib.error

    response = requests.get(
        url,
        timeout=_HTTP_TIMEOUT_S,
        headers={"User-Agent": _route_db_user_agent(), "Accept-Encoding": "gzip"},
    )
    response.raise_for_status()

    try:
        raw = gzip.decompress(response.content)
    except (EOFError, zlib.error) as exc:
        # Neither type derives from OSError/ValueError, so without this
        # translation a truncated download would bypass the handler in
        # refresh_route_database and propagate to its caller.
        raise ValueError(f"corrupt or truncated gzip payload from {url}: {exc}") from exc

    # "utf-8-sig" drops a leading byte-order mark if the file carries one.
    text = raw.decode("utf-8-sig")
    try:
        return list(csv.DictReader(io.StringIO(text)))
    except csv.Error as exc:
        raise ValueError(f"malformed CSV payload from {url}: {exc}") from exc
|
|
|
|
|
|
def _build_route_index(rows: list[dict[str, str]]) -> dict[str, dict[str, Any]]:
|
|
index: dict[str, dict[str, Any]] = {}
|
|
for row in rows:
|
|
callsign = (row.get("Callsign") or "").strip().upper()
|
|
airport_codes = (row.get("AirportCodes") or "").strip()
|
|
if not callsign or not airport_codes:
|
|
continue
|
|
icaos = [c.strip() for c in airport_codes.split("-") if c.strip()]
|
|
if len(icaos) < 2:
|
|
continue
|
|
index[callsign] = {
|
|
"airline_code": (row.get("AirlineCode") or "").strip(),
|
|
"airport_codes": airport_codes,
|
|
"airport_icaos": icaos,
|
|
}
|
|
return index
|
|
|
|
|
|
def _build_airport_index(rows: list[dict[str, str]]) -> dict[str, dict[str, Any]]:
|
|
index: dict[str, dict[str, Any]] = {}
|
|
for row in rows:
|
|
icao = (row.get("ICAO") or "").strip().upper()
|
|
if not icao:
|
|
continue
|
|
try:
|
|
lat = float(row.get("Latitude") or 0)
|
|
lon = float(row.get("Longitude") or 0)
|
|
except (TypeError, ValueError):
|
|
continue
|
|
index[icao] = {
|
|
"name": (row.get("Name") or "").strip(),
|
|
"iata": (row.get("IATA") or "").strip(),
|
|
"country": (row.get("CountryISO2") or "").strip(),
|
|
"lat": lat,
|
|
"lon": lon,
|
|
}
|
|
return index
|
|
|
|
|
|
def refresh_route_database(force: bool = False) -> bool:
    """Download airports.csv.gz + routes.csv.gz and swap in fresh indexes.

    Args:
        force: When true, refresh even if the cached data is younger than
            ``_REFRESH_INTERVAL_S``. A refresh already in flight is still
            never duplicated.

    Returns:
        ``True`` when this call attempted a refresh, whether the download
        succeeded or failed; ``False`` when it was skipped because another
        refresh is running or the populated cache is still fresh.
    """
    global _last_refresh, _refresh_in_progress

    checked_at = time.time()
    with _lock:
        if _refresh_in_progress:
            return False
        cache_is_fresh = (checked_at - _last_refresh) < _REFRESH_INTERVAL_S
        if cache_is_fresh and _routes_by_callsign and not force:
            return False
        # Claim the refresh so that concurrent callers return early above.
        _refresh_in_progress = True

    try:
        began = time.time()
        # Downloads and parsing run without holding the lock; only the final
        # swap of the two indexes is done under it.
        raw_airports = _fetch_csv_gz(_AIRPORTS_URL)
        raw_routes = _fetch_csv_gz(_ROUTES_URL)
        new_airports = _build_airport_index(raw_airports)
        new_routes = _build_route_index(raw_routes)
        with _lock:
            _airports_by_icao.clear()
            _airports_by_icao.update(new_airports)
            _routes_by_callsign.clear()
            _routes_by_callsign.update(new_routes)
            _last_refresh = time.time()
        logger.info(
            "route database refreshed in %.1fs: %d routes, %d airports",
            time.time() - began,
            len(new_routes),
            len(new_airports),
        )
    except (requests.RequestException, OSError, ValueError) as exc:
        # Best effort: existing indexes and _last_refresh stay untouched, so
        # the next non-forced call tries again.
        logger.warning("route database refresh failed: %s", exc)
    finally:
        with _lock:
            _refresh_in_progress = False
    return True
|
|
|
|
|
|
def lookup_route(callsign: str) -> dict[str, Any] | None:
    """Look up the origin and destination airports for a callsign.

    Matches the shape produced by the legacy fetch_routes_background cache so
    the caller in flights.py can be a drop-in replacement.

    Args:
        callsign: Flight callsign; surrounding whitespace and letter case are
            ignored.

    Returns:
        A dict with ``orig_name``, ``dest_name``, ``orig_loc`` and
        ``dest_loc`` (each location is ``[lon, lat]``), or ``None`` when the
        callsign is blank, unknown, or either endpoint airport is not in the
        airport index.
    """
    normalized = (callsign or "").strip().upper()
    if not normalized:
        return None

    with _lock:
        entry = _routes_by_callsign.get(normalized)
        if not entry:
            return None
        stops = entry["airport_icaos"]
        # First and last stop are the endpoints; intermediate stops are ignored.
        origin = _airports_by_icao.get(stops[0].upper())
        destination = _airports_by_icao.get(stops[-1].upper())

    if not (origin and destination):
        return None

    def _label(airport: dict[str, Any]) -> Any:
        # Prefix the airport name with its IATA code when one is present.
        return f"{airport['iata']}: {airport['name']}" if airport["iata"] else airport["name"]

    return {
        "orig_name": _label(origin),
        "dest_name": _label(destination),
        "orig_loc": [origin["lon"], origin["lat"]],
        "dest_loc": [destination["lon"], destination["lat"]],
    }
|
|
|
|
|
|
def route_database_status() -> dict[str, Any]:
    """Return a snapshot of the route cache's refresh state and sizes.

    Returns:
        A dict with ``last_refresh`` (timestamp of the last successful
        refresh, ``0.0`` if none), ``routes`` and ``airports`` (entry counts)
        and ``in_progress`` (whether a refresh is currently running).
    """
    with _lock:
        # All four values are read under one lock acquisition.
        snapshot: dict[str, Any] = dict(
            last_refresh=_last_refresh,
            routes=len(_routes_by_callsign),
            airports=len(_airports_by_icao),
            in_progress=_refresh_in_progress,
        )
    return snapshot
|