diff --git a/backend/.env.example b/backend/.env.example index 54779ff..545c4b0 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -24,14 +24,28 @@ AIS_API_KEY= # https://aisstream.io/ — free tier WebSocket key # Requires MESH_DEBUG_MODE=true; do not enable this for ordinary use. # ALLOW_INSECURE_ADMIN=false -# Default outbound User-Agent for all third-party HTTP fetchers. -# Project-generic by default — does NOT include any personal contact info or -# operator-specific identifier. Override only if you run a public relay and -# want upstreams to be able to reach you (e.g. Nominatim/OSM usage policy). -# SHADOWBROKER_USER_AGENT=ShadowBroker-OSINT/0.9 (contact: ops@example.com) +# Per-install operator handle. Round 7a: every outbound third-party API +# call (Wikipedia, Wikidata, Nominatim, GDELT, OpenMHz, Broadcastify, +# weather.gov, NUFORC, etc.) includes this handle in the User-Agent so +# upstreams can rate-limit / contact the specific install instead of +# treating every Shadowbroker user as one entity. +# +# Default empty -> a stable pseudonymous handle (e.g. "operator-7f3a92") is +# auto-generated on first run and persisted to backend/data/operator_handle.json. +# Operators who want a meaningful handle (real name, org, GitHub login) can +# set it here. Special characters are sanitized to dashes. +# OPERATOR_HANDLE= -# User-Agent for Nominatim geocoding requests (per OSM usage policy). -# NOMINATIM_USER_AGENT=ShadowBroker/1.0 +# Default outbound User-Agent for all third-party HTTP fetchers. Operators +# who run a public relay and want a completely custom UA can set this; it +# bypasses the per-operator helper entirely. Most installs should leave it +# unset and use OPERATOR_HANDLE instead. +# SHADOWBROKER_USER_AGENT= + +# Nominatim-specific User-Agent override (OSM usage policy). Leave unset to +# use the per-install handle (default) — set only if you have a registered +# Nominatim relay identity. 
+# NOMINATIM_USER_AGENT= # ── Third-party fetcher opt-ins ──────────────────────────────── # These data sources phone home to politically/commercially sensitive diff --git a/backend/main.py b/backend/main.py index aa6710f..d51525b 100644 --- a/backend/main.py +++ b/backend/main.py @@ -8148,8 +8148,12 @@ def _cctv_proxy_profile_for_url(target_url: str) -> _CCTVProxyProfile: def _cctv_upstream_headers(request: Request, profile: _CCTVProxyProfile) -> dict[str, str]: + # Round 7a: per-install operator handle. See routers/cctv.py for the + # canonical handler; this duplicate stays in lockstep until the #239 + # dedup ladder removes it. + from services.network_utils import outbound_user_agent headers = { - "User-Agent": "Mozilla/5.0 (compatible; ShadowBroker CCTV proxy)", + "User-Agent": f"Mozilla/5.0 (compatible; {outbound_user_agent('cctv-proxy')})", **profile.headers, } range_header = request.headers.get("range") diff --git a/backend/pyproject.toml b/backend/pyproject.toml index e0dbe40..25617ea 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -13,7 +13,6 @@ dependencies = [ "apscheduler==3.10.3", "beautifulsoup4>=4.9.0", "cachetools==5.5.2", - "cloudscraper==1.2.71", "cryptography>=41.0.0", "defusedxml>=0.7.1", "fastapi==0.115.12", diff --git a/backend/routers/admin.py b/backend/routers/admin.py index 5678623..5bb3b25 100644 --- a/backend/routers/admin.py +++ b/backend/routers/admin.py @@ -82,6 +82,28 @@ async def api_get_keys_meta(request: Request): return get_env_path_info() +@router.get( + "/api/settings/operator-handle", + dependencies=[Depends(require_local_operator)], +) +@limiter.limit("60/minute") +async def api_get_operator_handle(request: Request): + """Round 7a: return the per-install operator handle so the frontend + can include it in browser-direct third-party API calls (Wikipedia / + Wikidata via lib/wikimediaClient). 
The handle is auto-generated on + first use; operators can override it via the OPERATOR_HANDLE setting + or the env var of the same name. + + Gated on local-operator: legitimate browser usage goes through the + Next.js proxy which auto-attaches the admin key; remote scanners get + 403. The handle itself isn't a secret (it's sent to every third-party + API the operator touches), but admin-gating it matches the rest of + the settings endpoints and follows least-privilege. + """ + from services.network_utils import get_operator_handle + return {"handle": get_operator_handle()} + + @router.get( "/api/settings/news-feeds", dependencies=[Depends(require_local_operator)], diff --git a/backend/routers/ai_intel.py b/backend/routers/ai_intel.py index bea38d4..aab4b39 100644 --- a/backend/routers/ai_intel.py +++ b/backend/routers/ai_intel.py @@ -18,6 +18,12 @@ from auth import require_local_operator, require_openclaw_or_local from limiter import limiter from services.fetchers._store import latest_data as _latest_data + + +def _ai_intel_user_agent() -> str: + from services.network_utils import outbound_user_agent + return outbound_user_agent("ai-intel") + logger = logging.getLogger(__name__) router = APIRouter() @@ -447,7 +453,7 @@ async def ai_satellite_images( "https://planetarycomputer.microsoft.com/api/stac/v1/search", json=search_payload, timeout=10, - headers={"User-Agent": "ShadowBroker-OSINT/1.0 (ai-intel)"}, + headers={"User-Agent": _ai_intel_user_agent()}, ) resp.raise_for_status() features = resp.json().get("features", []) diff --git a/backend/routers/cctv.py b/backend/routers/cctv.py index f4b90e5..599b84c 100644 --- a/backend/routers/cctv.py +++ b/backend/routers/cctv.py @@ -165,7 +165,13 @@ def _cctv_proxy_profile_for_url(target_url: str) -> _CCTVProxyProfile: def _cctv_upstream_headers(request: Request, profile: _CCTVProxyProfile) -> dict: - headers = {"User-Agent": "Mozilla/5.0 (compatible; ShadowBroker CCTV proxy)", **profile.headers} + # Round 7a: per-install 
operator handle. Mozilla/5.0 prefix retained + # because many CCTV endpoints sniff for a browser-like prefix. + from services.network_utils import outbound_user_agent + headers = { + "User-Agent": f"Mozilla/5.0 (compatible; {outbound_user_agent('cctv-proxy')})", + **profile.headers, + } range_header = request.headers.get("range") if range_header: headers["Range"] = range_header diff --git a/backend/scripts/convert_power_plants.py b/backend/scripts/convert_power_plants.py index 9de69ff..18dc46c 100644 --- a/backend/scripts/convert_power_plants.py +++ b/backend/scripts/convert_power_plants.py @@ -20,7 +20,17 @@ OUT_PATH = Path(__file__).parent.parent / "data" / "power_plants.json" def main() -> None: print(f"Downloading WRI Global Power Plant Database from GitHub...") - req = urllib.request.Request(CSV_URL, headers={"User-Agent": "ShadowBroker-OSINT/1.0"}) + # Round 7a: release-time data refresher. Uses the per-operator UA if + # available, otherwise a release-script-specific identifier. This + # script is run by the maintainer at release time, NOT at runtime, + # so an aggregate UA is acceptable; we still use the helper so the + # behavior matches the rest of the project. + try: + from services.network_utils import outbound_user_agent + ua = outbound_user_agent("release-script-power-plants") + except Exception: + ua = "Shadowbroker/0.9 (release-script-power-plants; +https://github.com/BigBodyCobain/Shadowbroker/issues)" + req = urllib.request.Request(CSV_URL, headers={"User-Agent": ua}) with urllib.request.urlopen(req, timeout=60) as resp: raw = resp.read().decode("utf-8") diff --git a/backend/services/config.py b/backend/services/config.py index ca70313..518e09c 100644 --- a/backend/services/config.py +++ b/backend/services/config.py @@ -295,6 +295,19 @@ class Settings(BaseSettings): # service operator can identify per-install traffic instead of a generic # "ShadowBroker" aggregate. 
MESHTASTIC_OPERATOR_CALLSIGN: str = "" + # Per-install operator handle used in the User-Agent for EVERY third-party + # API the backend calls (Wikipedia, Wikidata, Nominatim, GDELT, OpenMHz, + # Broadcastify, weather.gov, NUFORC, etc.). The default is empty, in which + # case backend/services/network_utils.py auto-generates a stable + # pseudonymous handle like "operator-7f3a92" on first use and caches it. + # Operators who want to identify themselves with a real handle can set + # this; operators who want to stay pseudonymous can leave it empty. + # + # The handle is sent ONLY to public third-party APIs. It is NEVER mixed + # into mesh / Wormhole / Infonet identity (those have their own crypto + # identity layer; conflating the two would leak public attribution into + # private mesh state). + OPERATOR_HANDLE: str = "" # SAR (Synthetic Aperture Radar) data layer # Mode A — free catalog metadata, no account, default-on diff --git a/backend/services/feed_ingester.py b/backend/services/feed_ingester.py index 03f0ea7..85baff0 100644 --- a/backend/services/feed_ingester.py +++ b/backend/services/feed_ingester.py @@ -16,8 +16,15 @@ from typing import Any import requests +from services.network_utils import outbound_user_agent + logger = logging.getLogger(__name__) + +def _feed_ingester_user_agent() -> str: + # Round 7a: per-install attribution for operator-curated feed URLs. 
+ return outbound_user_agent("feed-ingester") + # --------------------------------------------------------------------------- # State # --------------------------------------------------------------------------- @@ -157,7 +164,7 @@ def _fetch_layer_feed(layer: dict[str, Any]) -> None: resp = requests.get( feed_url, timeout=_FETCH_TIMEOUT, - headers={"User-Agent": "ShadowBroker-FeedIngester/1.0"}, + headers={"User-Agent": _feed_ingester_user_agent()}, ) resp.raise_for_status() data = resp.json() diff --git a/backend/services/fetchers/aircraft_database.py b/backend/services/fetchers/aircraft_database.py index 1e2d1ea..7d51954 100644 --- a/backend/services/fetchers/aircraft_database.py +++ b/backend/services/fetchers/aircraft_database.py @@ -21,6 +21,13 @@ from typing import Any import defusedxml.ElementTree as ET import requests + + +def _aircraft_db_user_agent() -> str: + """Round 7a: lazy import so the per-install operator handle is included.""" + from services.network_utils import outbound_user_agent + return outbound_user_agent("aircraft-database") + logger = logging.getLogger(__name__) _BUCKET_LIST_URL = ( @@ -44,7 +51,7 @@ def _latest_snapshot_key() -> str: response = requests.get( _BUCKET_LIST_URL, timeout=_LIST_TIMEOUT_S, - headers={"User-Agent": _USER_AGENT}, + headers={"User-Agent": _aircraft_db_user_agent()}, ) response.raise_for_status() root = ET.fromstring(response.text) @@ -71,7 +78,7 @@ def _stream_csv_index(url: str) -> dict[str, dict[str, str]]: url, timeout=_DOWNLOAD_TIMEOUT_S, stream=True, - headers={"User-Agent": _USER_AGENT}, + headers={"User-Agent": _aircraft_db_user_agent()}, ) as response: response.raise_for_status() line_iter = ( diff --git a/backend/services/fetchers/earth_observation.py b/backend/services/fetchers/earth_observation.py index 6274766..6712a3c 100644 --- a/backend/services/fetchers/earth_observation.py +++ b/backend/services/fetchers/earth_observation.py @@ -15,7 +15,11 @@ import time import heapq from datetime import 
datetime, timedelta from pathlib import Path -from services.network_utils import external_curl_fallback_enabled, fetch_with_curl +from services.network_utils import ( + external_curl_fallback_enabled, + fetch_with_curl, + outbound_user_agent, +) from services.fetchers._store import latest_data, _data_lock, _mark_fresh from services.fetchers.nuforc_enrichment import enrich_sighting from services.fetchers.retry import with_retry @@ -279,13 +283,13 @@ def fetch_weather_alerts(): return alerts = [] try: - # weather.gov requires a User-Agent per their API policy, but it - # need not identify the operator. Use a project-generic string and - # let the user override via SHADOWBROKER_USER_AGENT if needed. - from services.network_utils import DEFAULT_USER_AGENT + # weather.gov requires a User-Agent per their API policy. Round 7a: + # send the per-install operator handle so they can rate-limit per + # operator instead of treating "Shadowbroker" as one entity. + from services.network_utils import outbound_user_agent url = "https://api.weather.gov/alerts/active?status=actual" headers = { - "User-Agent": DEFAULT_USER_AGENT, + "User-Agent": outbound_user_agent("weather-gov"), "Accept": "application/geo+json", } response = fetch_with_curl(url, timeout=15, headers=headers) @@ -713,7 +717,12 @@ _NUFORC_LIVE_NONCE_RE = re.compile( r'id=["\']wdtNonceFrontendServerSide_1["\'][^>]*value=["\']([a-f0-9]+)["\']' ) _NUFORC_LIVE_SIGHTING_ID_RE = re.compile(r"id=(\d+)") -_NUFORC_LIVE_USER_AGENT = "Mozilla/5.0 (ShadowBroker-OSINT NUFORC-fetcher)" +# Round 7a: NUFORC's site is sensitive to non-browser UAs but we send a +# per-install operator handle prefixed by Mozilla/5.0 so we're identifiable +# without being aggregately blocked. Operators who want a different +# handle can set OPERATOR_HANDLE (SHADOWBROKER_USER_AGENT is not read here). 
+def _nuforc_live_user_agent() -> str: + return f"Mozilla/5.0 ({outbound_user_agent('nuforc-live')})" _NUFORC_LIVE_SESSION_COOKIES = _NUFORC_DATA_DIR / "nuforc_session.cookies" # Sample grid covering continental US, Alaska, Hawaii, Canada, UK, Australia @@ -957,7 +966,7 @@ def _photon_lookup(query: str) -> list[float] | None: res = fetch_with_curl( url, headers={ - "User-Agent": "ShadowBroker-OSINT/1.0 (NUFORC-UAP-layer)", + "User-Agent": outbound_user_agent("nuforc-uap-geocode"), "Accept-Language": "en", }, timeout=10, @@ -1053,7 +1062,7 @@ def _nuforc_fetch_month_live(yyyymm: str, cookie_jar: Path) -> list[dict]: index_res = subprocess.run( [ curl_bin, "-sL", - "-A", _NUFORC_LIVE_USER_AGENT, + "-A", _nuforc_live_user_agent(), "-c", str(cookie_jar), "-b", str(cookie_jar), index_url, @@ -1089,7 +1098,7 @@ def _nuforc_fetch_month_live(yyyymm: str, cookie_jar: Path) -> list[dict]: ajax_res = subprocess.run( [ curl_bin, "-sL", - "-A", _NUFORC_LIVE_USER_AGENT, + "-A", _nuforc_live_user_agent(), "-c", str(cookie_jar), "-b", str(cookie_jar), "-X", "POST", diff --git a/backend/services/fetchers/infrastructure.py b/backend/services/fetchers/infrastructure.py index a25d3fe..0372f8d 100644 --- a/backend/services/fetchers/infrastructure.py +++ b/backend/services/fetchers/infrastructure.py @@ -6,7 +6,7 @@ import heapq import logging from pathlib import Path from cachetools import TTLCache -from services.network_utils import fetch_with_curl +from services.network_utils import fetch_with_curl, outbound_user_agent from services.fetchers._store import latest_data, _data_lock, _mark_fresh from services.fetchers.retry import with_retry @@ -29,7 +29,7 @@ def _geocode_region(region_name: str, country_name: str) -> tuple: query = urllib.parse.quote(f"{region_name}, {country_name}") url = f"https://nominatim.openstreetmap.org/search?q={query}&format=json&limit=1" - response = fetch_with_curl(url, timeout=8, headers={"User-Agent": "ShadowBroker-OSINT/1.0"}) + response = 
fetch_with_curl(url, timeout=8, headers={"User-Agent": outbound_user_agent("infrastructure-data")}) if response.status_code == 200: results = response.json() if results: diff --git a/backend/services/fetchers/meshtastic_map.py b/backend/services/fetchers/meshtastic_map.py index 91893c2..21b212f 100644 --- a/backend/services/fetchers/meshtastic_map.py +++ b/backend/services/fetchers/meshtastic_map.py @@ -191,8 +191,13 @@ def fetch_meshtastic_nodes(): _os.environ.get("MESHTASTIC_SEND_CALLSIGN_HEADER", "true") ).strip().lower() not in {"0", "false", "no", "off", ""} - from services.network_utils import DEFAULT_USER_AGENT - ua_base = f"{DEFAULT_USER_AGENT}; 24h polling" + # Round 7a: outbound_user_agent already includes the per-install handle. + # The optional Meshtastic callsign is appended as additional context so + # meshtastic.liamcottle.net's operator can identify both the install AND + # the registered radio operator (when MESHTASTIC_OPERATOR_CALLSIGN is set + # and MESHTASTIC_SEND_CALLSIGN_HEADER is true; see issue #203). 
+ from services.network_utils import outbound_user_agent + ua_base = f"{outbound_user_agent('meshtastic-map')}; 24h polling" if callsign and send_callsign_header: user_agent = f"{ua_base}; node={callsign}" else: diff --git a/backend/services/fetchers/route_database.py b/backend/services/fetchers/route_database.py index c65fee3..52b3b61 100644 --- a/backend/services/fetchers/route_database.py +++ b/backend/services/fetchers/route_database.py @@ -17,6 +17,12 @@ from typing import Any import requests + + +def _route_db_user_agent() -> str: + from services.network_utils import outbound_user_agent + return outbound_user_agent("route-database") + logger = logging.getLogger(__name__) _ROUTES_URL = "https://vrs-standing-data.adsb.lol/routes.csv.gz" @@ -37,7 +43,7 @@ def _fetch_csv_gz(url: str) -> list[dict[str, str]]: response = requests.get( url, timeout=_HTTP_TIMEOUT_S, - headers={"User-Agent": _USER_AGENT, "Accept-Encoding": "gzip"}, + headers={"User-Agent": _route_db_user_agent(), "Accept-Encoding": "gzip"}, ) response.raise_for_status() text = gzip.decompress(response.content).decode("utf-8-sig") diff --git a/backend/services/fetchers/trains.py b/backend/services/fetchers/trains.py index 7f8528b..bb5af80 100644 --- a/backend/services/fetchers/trains.py +++ b/backend/services/fetchers/trains.py @@ -10,6 +10,12 @@ from datetime import datetime, timezone from services.fetchers._store import _data_lock, _mark_fresh, latest_data from services.network_utils import fetch_with_curl + + +def _trains_user_agent() -> str: + from services.network_utils import outbound_user_agent + return outbound_user_agent("trains") + logger = logging.getLogger(__name__) _EARTH_RADIUS_KM = 6371.0 @@ -379,7 +385,7 @@ def _fetch_digitraffic() -> list[dict]: timeout=15, headers={ "Accept-Encoding": "gzip", - "User-Agent": "ShadowBroker-OSINT/1.0", + "User-Agent": _trains_user_agent(), }, ) if resp.status_code != 200: diff --git a/backend/services/geocode.py b/backend/services/geocode.py index 
791e2b7..204fdb0 100644 --- a/backend/services/geocode.py +++ b/backend/services/geocode.py @@ -21,9 +21,17 @@ _cache_lock = threading.Lock() _local_search_cache: List[Dict[str, Any]] | None = None _local_search_lock = threading.Lock() -_USER_AGENT = os.environ.get( - "NOMINATIM_USER_AGENT", "ShadowBroker/1.0 (https://github.com/BigBodyCobain/Shadowbroker)" -) +# Round 7a: per-install operator handle threads through every Nominatim +# call. NOMINATIM_USER_AGENT env override is still honored for operators +# who run a custom relay / known good identity, but the default uses the +# per-install handle so OpenStreetMap can rate-limit per install instead +# of treating "Shadowbroker" as one big offender. +def _nominatim_user_agent() -> str: + override = os.environ.get("NOMINATIM_USER_AGENT", "").strip() + if override: + return override + from services.network_utils import outbound_user_agent + return outbound_user_agent("nominatim") def _get_cache(key: str): @@ -178,7 +186,7 @@ def search_geocode(query: str, limit: int = 5, local_only: bool = False) -> List res = fetch_with_curl( url, headers={ - "User-Agent": _USER_AGENT, + "User-Agent": _nominatim_user_agent(), "Accept-Language": "en", }, timeout=6, @@ -241,7 +249,7 @@ def reverse_geocode(lat: float, lng: float, local_only: bool = False) -> Dict[st res = fetch_with_curl( url, headers={ - "User-Agent": _USER_AGENT, + "User-Agent": _nominatim_user_agent(), "Accept-Language": "en", }, timeout=6, diff --git a/backend/services/geopolitics.py b/backend/services/geopolitics.py index e07abf2..47357db 100644 --- a/backend/services/geopolitics.py +++ b/backend/services/geopolitics.py @@ -8,6 +8,13 @@ from datetime import datetime from urllib.parse import urljoin, urlparse from services.network_utils import fetch_with_curl + + +def _geopolitics_user_agent() -> str: + """Round 7a: GDELT geopolitics fetcher attribution.""" + from services.network_utils import outbound_user_agent + return outbound_user_agent("geopolitics-gdelt") + 
logger = logging.getLogger(__name__) # Cache Frontline data for 30 minutes, it doesn't move that fast @@ -316,7 +323,7 @@ def _fetch_article_title(url): resp = requests.get( current_url, timeout=4, - headers={"User-Agent": "Mozilla/5.0 (compatible; OSINT Dashboard/1.0)"}, + headers={"User-Agent": _geopolitics_user_agent()}, stream=True, allow_redirects=False, ) @@ -521,10 +528,29 @@ def _parse_gdelt_export_zip(zip_bytes, conflict_codes, seen_locs, features, loc_ logger.warning(f"Failed to parse GDELT export zip: {e}") +# GDELT's data.gdeltproject.org is a CNAME to a Google Cloud Storage +# bucket of the same name. GCS returns the wildcard ``*.storage.googleapis.com`` +# certificate, which legitimately does NOT cover the GDELT custom domain +# — Python's TLS verification correctly refuses it. Some networks/POPs +# happen to route through a path where this works; many do not (notably +# Docker Desktop's outbound NAT on local installs). +# +# Fix: rewrite the URL to hit GCS directly with a path-style bucket +# reference, where the standard GCS cert is genuinely valid. Same data, +# verified TLS, no operator-side workaround needed. +def _gcs_direct_gdelt_url(url: str) -> str: + """If ``url`` points at data.gdeltproject.org, return the equivalent + GCS-direct URL. Otherwise return the URL unchanged.""" + prefix = "://data.gdeltproject.org/" + if prefix in url: + return url.replace(prefix, "://storage.googleapis.com/data.gdeltproject.org/", 1) + return url + + def _download_gdelt_export(url): """Download a single GDELT export file, return bytes or None.""" try: - res = fetch_with_curl(url, timeout=15) + res = fetch_with_curl(_gcs_direct_gdelt_url(url), timeout=15) if res.status_code == 200: return res.content except (ConnectionError, TimeoutError, OSError): # non-critical @@ -620,8 +646,12 @@ def fetch_global_military_incidents(): # HTTPS is used to prevent passive network observers from injecting # poisoned export records into the global incident map via MITM. 
# GDELT serves the same content over HTTPS as HTTP. + # Use the GCS-direct URL because data.gdeltproject.org's CNAME + # serves a wildcard *.storage.googleapis.com cert that legitimately + # doesn't cover the GDELT hostname. See _gcs_direct_gdelt_url above. index_res = fetch_with_curl( - "https://data.gdeltproject.org/gdeltv2/lastupdate.txt", timeout=10 + _gcs_direct_gdelt_url("https://data.gdeltproject.org/gdeltv2/lastupdate.txt"), + timeout=10, ) if index_res.status_code != 200: logger.error(f"GDELT lastupdate failed: {index_res.status_code}") diff --git a/backend/services/network_utils.py b/backend/services/network_utils.py index 3479c26..a94a64f 100644 --- a/backend/services/network_utils.py +++ b/backend/services/network_utils.py @@ -5,7 +5,9 @@ import subprocess import shutil import time import threading +import uuid import requests +from pathlib import Path from urllib.parse import urlparse from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry @@ -20,14 +22,211 @@ _session.mount("https://", HTTPAdapter(max_retries=_retry, pool_maxsize=20)) _session.mount("http://", HTTPAdapter(max_retries=_retry, pool_maxsize=10)) -# Default outbound User-Agent. Generic by design — does NOT include any -# personal contact info or a fork-specific repo URL. Operators who run a -# public-facing relay and want to identify themselves to upstreams (e.g. -# for Nominatim / weather.gov usage-policy compliance) can override this -# via the SHADOWBROKER_USER_AGENT env var. +# --------------------------------------------------------------------------- +# Per-operator outbound identification +# --------------------------------------------------------------------------- +# +# Issues #289 / #290 / #291 and the retrofit of PR #284 (#218 / #219 / #220): +# every third-party API the backend calls used to identify itself with a +# single "Shadowbroker" aggregate User-Agent. 
From the upstream's +# perspective, that meant every Shadowbroker install in the world looked +# like one giant entity hammering them. If one install misbehaved, the +# upstream's only recourse was to block "Shadowbroker" as a whole — which +# would take out every other install too. +# +# Fix: give each install a stable pseudonymous handle and include it in +# the User-Agent. Now an upstream can rate-limit or block the offending +# operator without affecting anyone else. +# +# The handle: +# +# - Is auto-generated on first call if no `OPERATOR_HANDLE` is configured +# (looks like "operator-7f3a92" — 6 hex chars from uuid4()). +# - Is persisted to ``backend/data/operator_handle.json`` so it survives +# restarts. Under Docker compose that file lives in the volume mount +# alongside `carrier_cache.json` and the other persistent state. +# - Can be overridden by the operator via the `OPERATOR_HANDLE` setting +# (env var or settings UI). Operators with their own GitHub handle, +# organization name, etc. can use that for traceability. +# - Is NEVER mixed into mesh / Wormhole / Infonet identity. This layer is +# strictly for public third-party API attribution. + +_SHADOWBROKER_VERSION = "0.9" +_OPERATOR_HANDLE_FILE = ( + Path(__file__).parent.parent / "data" / "operator_handle.json" +) +_OPERATOR_HANDLE_CACHE: str = "" +_OPERATOR_HANDLE_LOCK = threading.Lock() + + +def _generate_operator_handle() -> str: + """Produce a stable pseudonymous handle for first-launch installs. + + Format: ``operator-7f3a92`` (6 hex chars from a fresh uuid4()). + Distinct per install. Carries no real-world identity by default — + operators who want one can override via ``OPERATOR_HANDLE``. + + Note: the prefix is deliberately neutral. Earlier drafts used + ``shadow-`` which, while accurate to the project name, looks + exactly like the kind of pattern a third-party abuse-detection + system would auto-block as suspicious. 
``operator-`` describes + what the value actually is and doesn't pattern-match malware. + """ + return f"operator-{uuid.uuid4().hex[:6]}" + + +def _load_persisted_operator_handle() -> str: + """Return the previously-saved handle from disk, or empty if none. + + Reads ``backend/data/operator_handle.json`` if it exists. Any read + error returns empty so a fresh handle gets generated rather than + crashing the request. + """ + try: + if _OPERATOR_HANDLE_FILE.exists(): + data = json.loads(_OPERATOR_HANDLE_FILE.read_text(encoding="utf-8")) + return str(data.get("handle", "") or "").strip() + except (OSError, json.JSONDecodeError, ValueError): + pass + return "" + + +def _persist_operator_handle(handle: str) -> None: + """Atomically save the auto-generated handle so subsequent restarts + use the same one. Failure to persist is non-fatal — the request still + succeeds with the in-memory handle, we just may generate a different + one on the next process restart.""" + try: + _OPERATOR_HANDLE_FILE.parent.mkdir(parents=True, exist_ok=True) + tmp = _OPERATOR_HANDLE_FILE.with_suffix(_OPERATOR_HANDLE_FILE.suffix + ".tmp") + tmp.write_text( + json.dumps({"handle": handle, "_meta": { + "purpose": "Per-install operator handle for outbound third-party API attribution.", + "see": "backend/services/network_utils.py:outbound_user_agent", + }}, indent=2), + encoding="utf-8", + ) + os.replace(tmp, _OPERATOR_HANDLE_FILE) + except OSError as exc: + logger.debug("Could not persist operator_handle (continuing in-memory): %s", exc) + + +def get_operator_handle() -> str: + """Return the stable per-install operator handle. + + Resolution order: + 1. ``OPERATOR_HANDLE`` setting (env var / settings UI) if non-empty. + 2. Process-cached value from previous call this run. + 3. Value persisted to ``operator_handle.json`` (from a previous run). + 4. Newly generated pseudonymous handle, persisted to disk. 
+ + The handle is normalized: stripped of whitespace, lowercased, + non-ASCII-alphanumeric chars (except ``-`` and ``_``) replaced with ``-``. + This both sanitizes any HTTP-header-unsafe characters AND prevents + the operator from impersonating real third-party projects via + inventive whitespace. + """ + global _OPERATOR_HANDLE_CACHE + with _OPERATOR_HANDLE_LOCK: + # 1. Configured override always wins. + configured = "" + try: + from services.config import get_settings + + configured = str(getattr(get_settings(), "OPERATOR_HANDLE", "") or "").strip() + except Exception: + configured = "" + if configured: + return _normalize_handle(configured) + + # 2. In-memory cache (fast path for repeated calls). + if _OPERATOR_HANDLE_CACHE: + return _OPERATOR_HANDLE_CACHE + + # 3. On-disk handle from a previous run. + persisted = _load_persisted_operator_handle() + if persisted: + _OPERATOR_HANDLE_CACHE = _normalize_handle(persisted) + return _OPERATOR_HANDLE_CACHE + + # 4. Generate, persist, return. + fresh = _generate_operator_handle() + _persist_operator_handle(fresh) + _OPERATOR_HANDLE_CACHE = fresh + return fresh + + +def _normalize_handle(raw: str) -> str: + """Strip whitespace, lowercase, and dash out anything but ASCII [a-z0-9_-].""" + safe = "".join( + ch if ((ch.isascii() and ch.isalnum()) or ch in "-_") else "-" + for ch in raw.strip().lower() + ) + # Collapse runs of dashes and trim to a reasonable length so an + # operator can't make our outbound logs unreadable. + while "--" in safe: + safe = safe.replace("--", "-") + safe = safe.strip("-") + return safe[:48] if safe else "anonymous" + + +_CONTACT_URL = "https://github.com/BigBodyCobain/Shadowbroker/issues" + + +def outbound_user_agent(purpose: str = "") -> str: + """Build a User-Agent for an outbound third-party HTTP request. 
+ + Returns something like:: + + Shadowbroker/0.9 (operator: operator-7f3a92; purpose: wikipedia; + +https://github.com/BigBodyCobain/Shadowbroker/issues) + + The ``purpose`` is optional but recommended — it tells the upstream + what feature of ours is making the call (``wikipedia``, ``openmhz``, + ``nominatim``, etc.), which makes their logs and our complaints + actionable. + + Every outbound call in the backend that previously sent a custom + User-Agent should call this helper instead. Centralizing here means: + - one place to change the contact URL, + - one place to bump the version on release, + - one place a Wikimedia / OpenMHz operator can reach to ask for + the project to back off, with a per-install handle so they can + target the specific install instead of the project as a whole. + """ + handle = get_operator_handle() + if purpose: + purpose_clean = _normalize_handle(purpose) + return ( + f"Shadowbroker/{_SHADOWBROKER_VERSION} " + f"(operator: {handle}; purpose: {purpose_clean}; +{_CONTACT_URL})" + ) + return ( + f"Shadowbroker/{_SHADOWBROKER_VERSION} " + f"(operator: {handle}; +{_CONTACT_URL})" + ) + + +def _reset_operator_handle_cache_for_tests() -> None: + """Test-only: invalidate the in-memory cache so a test can set a + new ``OPERATOR_HANDLE`` env var and see it picked up immediately.""" + global _OPERATOR_HANDLE_CACHE + with _OPERATOR_HANDLE_LOCK: + _OPERATOR_HANDLE_CACHE = "" + + +# Default outbound User-Agent. Retained for backwards compatibility with +# call sites that haven't been migrated to ``outbound_user_agent()`` yet. +# Operators who want full per-install attribution should set the +# ``OPERATOR_HANDLE`` setting and migrate call sites incrementally. +# +# Operators who run a public-facing relay can also override the whole UA +# string via the ``SHADOWBROKER_USER_AGENT`` env var. NOTE: that override +# only reaches call sites that still read ``DEFAULT_USER_AGENT``; +# ``outbound_user_agent()`` above does not consult it. 
DEFAULT_USER_AGENT = os.environ.get( "SHADOWBROKER_USER_AGENT", - "ShadowBroker-OSINT/0.9", + f"Shadowbroker/{_SHADOWBROKER_VERSION}", ) # Find bash for curl fallback — Git bash's curl has the TLS features diff --git a/backend/services/radio_intercept.py b/backend/services/radio_intercept.py index 679ad98..147477b 100644 --- a/backend/services/radio_intercept.py +++ b/backend/services/radio_intercept.py @@ -2,14 +2,34 @@ import requests from bs4 import BeautifulSoup import logging from cachetools import cached, TTLCache -import cloudscraper import reverse_geocoder as rg from urllib.parse import urlparse +from services.network_utils import outbound_user_agent + logger = logging.getLogger(__name__) _OPENMHZ_AUDIO_HOSTS = {"media.openmhz.com", "media2.openmhz.com", "media3.openmhz.com"} + +# Round 7a / Issues #289, #290, #291 (tg12 audit): +# We previously sent a spoofed Chrome User-Agent and (for OpenMHz) used +# cloudscraper to bypass anti-bot challenges. Both are dishonest and ToS- +# unfriendly. We now send the per-install Shadowbroker UA — the upstream +# can identify us, rate-limit us per install, and contact us if needed. +# +# If the upstream actively blocks our honest UA, the feature degrades +# gracefully (returns an empty list / cached results) rather than +# escalating to deception. 
+ + +def _broadcastify_user_agent() -> str: + return outbound_user_agent("broadcastify") + + +def _openmhz_user_agent() -> str: + return outbound_user_agent("openmhz") + # Cache the top feeds for 5 minutes so we don't hammer Broadcastify radio_cache = TTLCache(maxsize=1, ttl=300) @@ -22,8 +42,12 @@ def get_top_broadcastify_feeds(): """ logger.info("Scraping Broadcastify Top Feeds (Cache Miss)") headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", + # Issue #289 (tg12) + Round 7a: identify ourselves honestly as a + # per-install Shadowbroker scraper. Broadcastify can rate-limit + # us per install or block us; either way we stop pretending to be + # a browser. If they block, the panel degrades gracefully. + "User-Agent": _broadcastify_user_agent(), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.9", } @@ -89,21 +113,32 @@ openmhz_systems_cache = TTLCache(maxsize=1, ttl=3600) @cached(openmhz_systems_cache) def get_openmhz_systems(): - """Fetches the full directory of OpenMHZ systems.""" - logger.info("Scraping OpenMHZ Systems (Cache Miss)") - scraper = cloudscraper.create_scraper( - browser={"browser": "chrome", "platform": "windows", "desktop": True} - ) + """Fetches the full directory of OpenMHZ systems. + Issue #290 (tg12) + Round 7a: replaced cloudscraper-based Chrome + impersonation with an honest per-install Shadowbroker User-Agent. + If OpenMHz's Cloudflare layer blocks honest traffic, we accept + that degradation (return empty list) rather than spoof a browser. 
+ """ + logger.info("Fetching OpenMHZ Systems (Cache Miss)") try: - res = scraper.get("https://api.openmhz.com/systems", timeout=15) + res = requests.get( + "https://api.openmhz.com/systems", + timeout=15, + headers={"User-Agent": _openmhz_user_agent(), "Accept": "application/json"}, + ) if res.status_code == 200: data = res.json() - # Return list of systems return data.get("systems", []) if isinstance(data, dict) else [] + if res.status_code in (403, 503): + logger.warning( + "OpenMHZ returned %s for systems directory — Cloudflare may " + "be blocking our honest UA. Feature degrades to empty result.", + res.status_code, + ) return [] except (requests.RequestException, ConnectionError, TimeoutError, ValueError, KeyError) as e: - logger.error(f"OpenMHZ Systems Scrape Exception: {e}") + logger.error(f"OpenMHZ Systems Fetch Exception: {e}") return [] @@ -113,21 +148,25 @@ openmhz_calls_cache = TTLCache(maxsize=100, ttl=20) @cached(openmhz_calls_cache) def get_recent_openmhz_calls(sys_name: str): - """Fetches the actual audio burst .m4a URLs for a specific system (e.g., 'wmata').""" - logger.info(f"Fetching OpenMHZ calls for {sys_name} (Cache Miss)") - scraper = cloudscraper.create_scraper( - browser={"browser": "chrome", "platform": "windows", "desktop": True} - ) + """Fetches the actual audio burst .m4a URLs for a specific system (e.g., 'wmata'). + Issue #290 (tg12) + Round 7a: same honest-UA model as + ``get_openmhz_systems``. 
+ """ + logger.info(f"Fetching OpenMHZ calls for {sys_name} (Cache Miss)") try: url = f"https://api.openmhz.com/{sys_name}/calls" - res = scraper.get(url, timeout=15) + res = requests.get( + url, + timeout=15, + headers={"User-Agent": _openmhz_user_agent(), "Accept": "application/json"}, + ) if res.status_code == 200: data = res.json() return data.get("calls", []) if isinstance(data, dict) else [] return [] except (requests.RequestException, ConnectionError, TimeoutError, ValueError, KeyError) as e: - logger.error(f"OpenMHZ Calls Scrape Exception ({sys_name}): {e}") + logger.error(f"OpenMHZ Calls Fetch Exception ({sys_name}): {e}") return [] @@ -163,9 +202,11 @@ def openmhz_audio_response(target_url: str): timeout=(5, 20), allow_redirects=False, headers={ - "User-Agent": "Mozilla/5.0", + # Issue #291 (tg12) + Round 7a: drop spoofed Mozilla + # UA and the fake first-party Referer. Identify as + # the per-install Shadowbroker proxy honestly. + "User-Agent": _openmhz_user_agent(), "Accept": "audio/mpeg,audio/*,*/*;q=0.8", - "Referer": "https://openmhz.com/", }, ) if upstream.is_redirect or upstream.status_code in (301, 302, 303, 307, 308): diff --git a/backend/services/region_dossier.py b/backend/services/region_dossier.py index 2dd6157..89871c2 100644 --- a/backend/services/region_dossier.py +++ b/backend/services/region_dossier.py @@ -4,7 +4,7 @@ import concurrent.futures from urllib.parse import quote import requests as _requests from cachetools import TTLCache -from services.network_utils import fetch_with_curl, DEFAULT_USER_AGENT +from services.network_utils import fetch_with_curl, outbound_user_agent logger = logging.getLogger(__name__) @@ -15,24 +15,30 @@ dossier_cache = TTLCache(maxsize=500, ttl=86400) # Nominatim requires max 1 req/sec — track last call time _nominatim_last_call = 0.0 -# Issue #218 / #219 (tg12): Wikimedia's User-Agent policy requires API +# Issues #218 / #219 (tg12): Wikimedia's User-Agent policy requires API # clients to identify themselves 
with a stable User-Agent that includes -# a contact path. Bare "python-requests/x.y" or generic strings violate -# the policy and risk getting blocked. We send the project default UA -# (operator-overridable via SHADOWBROKER_USER_AGENT) on EVERY outbound -# Wikimedia request, plus the policy-recommended Api-User-Agent which -# Wikimedia explicitly accepts on top of the regular UA. +# a contact path. # -# This is documented and stable so a Wikimedia operator who wants to -# rate-limit or contact us has a fixed identifier to grep for. -_WIKIMEDIA_REQUEST_HEADERS = { - "User-Agent": DEFAULT_USER_AGENT, - "Api-User-Agent": ( - f"{DEFAULT_USER_AGENT} " - "(+https://github.com/BigBodyCobain/Shadowbroker; " - "report issues at /issues)" - ), -} +# Round 7a: the original fix in PR #284 used a single project-wide +# identifier, which from Wikimedia's perspective made every Shadowbroker +# install in the world look like one giant scraper. If one install +# misbehaved, their only recourse was to block "Shadowbroker" as a +# whole. We now build the headers from ``outbound_user_agent('wikimedia')`` +# which embeds the per-install operator handle (auto-generated or +# operator-chosen), so Wikimedia can rate-limit / contact the specific +# install instead of the project. + + +def _wikimedia_request_headers() -> dict[str, str]: + ua = outbound_user_agent("wikimedia") + return { + "User-Agent": ua, + # Browser-JS-style header that Wikimedia's policy explicitly + # accepts on top of (or instead of) User-Agent. We send both so + # whichever the upstream prefers, the per-operator handle is + # always available. + "Api-User-Agent": ua, + } def _reverse_geocode_offline(lat: float, lng: float) -> dict: @@ -64,9 +70,7 @@ def _reverse_geocode(lat: float, lng: float) -> dict: f"https://nominatim.openstreetmap.org/reverse?" 
f"lat={lat}&lon={lng}&format=json&zoom=10&addressdetails=1&accept-language=en" ) - headers = { - "User-Agent": "ShadowBroker-OSINT/1.0 (live-risk-dashboard; contact@shadowbroker.app)" - } + headers = {"User-Agent": outbound_user_agent("nominatim")} for attempt in range(2): # Enforce Nominatim's 1 req/sec policy @@ -146,7 +150,7 @@ def _fetch_wikidata_leader(country_name: str) -> dict: # specific Api-User-Agent that the policy specifically asks # for, since this request originates from a backend service # that proxies on behalf of (potentially many) browser users. - res = fetch_with_curl(url, timeout=6, headers=_WIKIMEDIA_REQUEST_HEADERS) + res = fetch_with_curl(url, timeout=6, headers=_wikimedia_request_headers()) if res.status_code == 200: results = res.json().get("results", {}).get("bindings", []) if results: @@ -174,7 +178,7 @@ def _fetch_local_wiki_summary(place_name: str, country_name: str = "") -> dict: try: # Issue #219 (tg12): identify ourselves to Wikimedia per # their UA policy; see _fetch_wikidata_leader above. - res = fetch_with_curl(url, timeout=5, headers=_WIKIMEDIA_REQUEST_HEADERS) + res = fetch_with_curl(url, timeout=5, headers=_wikimedia_request_headers()) if res.status_code == 200: data = res.json() if data.get("type") != "disambiguation": diff --git a/backend/services/sar/sar_products_client.py b/backend/services/sar/sar_products_client.py index a808eae..c2204cc 100644 --- a/backend/services/sar/sar_products_client.py +++ b/backend/services/sar/sar_products_client.py @@ -34,6 +34,11 @@ from services.sar.sar_config import ( copernicus_token, earthdata_token, ) + + +def _sar_user_agent() -> str: + from services.network_utils import outbound_user_agent + return outbound_user_agent("sar-products") from services.sar.sar_normalize import ( SarAnomaly, evidence_hash_for_payload, @@ -442,7 +447,7 @@ def _fetch_unosat_packages() -> list[dict[str, Any]]: # HDX CKAN returns 406 without explicit Accept + a browser-ish UA. 
hdx_headers = { "Accept": "application/json", - "User-Agent": "Mozilla/5.0 (compatible; ShadowBroker-SAR/1.0)", + "User-Agent": _sar_user_agent(), } try: resp = fetch_with_curl(url, timeout=20, headers=hdx_headers) diff --git a/backend/services/sentinel_search.py b/backend/services/sentinel_search.py index ec66896..9f53033 100644 --- a/backend/services/sentinel_search.py +++ b/backend/services/sentinel_search.py @@ -11,12 +11,21 @@ import requests from datetime import datetime, timedelta from cachetools import TTLCache +from services.network_utils import outbound_user_agent + logger = logging.getLogger(__name__) # Cache by rounded lat/lon (0.02° grid ~= 2km), TTL 1 hour _sentinel_cache = TTLCache(maxsize=200, ttl=3600) +def _planetary_user_agent() -> str: + # Round 7a: per-install handle so Microsoft Planetary Computer can + # attribute requests to the specific operator rather than treating + # the whole Shadowbroker user base as one entity. + return outbound_user_agent("sentinel2-planetary-computer") + + def _esri_imagery_fallback(lat: float, lng: float) -> dict: lat_span = 0.18 lng_span = 0.24 @@ -64,7 +73,7 @@ def search_sentinel2_scene(lat: float, lng: float) -> dict: "https://planetarycomputer.microsoft.com/api/stac/v1/search", json=search_payload, timeout=8, - headers={"User-Agent": "ShadowBroker-OSINT/1.0 (live-risk-dashboard)"}, + headers={"User-Agent": _planetary_user_agent()}, ) search_res.raise_for_status() data = search_res.json() diff --git a/backend/services/shodan_connector.py b/backend/services/shodan_connector.py index aaa124c..7fa1f5f 100644 --- a/backend/services/shodan_connector.py +++ b/backend/services/shodan_connector.py @@ -20,7 +20,11 @@ from cachetools import TTLCache logger = logging.getLogger(__name__) _SHODAN_BASE = "https://api.shodan.io" -_USER_AGENT = "ShadowBroker/0.9.79 local Shodan connector" +# Round 7a: per-install attribution. Shodan already has the operator API +# key for billing, but the UA still identifies the install. 
+def _shodan_user_agent(): + from services.network_utils import outbound_user_agent + return outbound_user_agent("shodan") _REQUEST_TIMEOUT = 15 _MIN_INTERVAL_SECONDS = 1.05 # Shodan docs say API plans are rate limited to ~1 req/sec. _DEFAULT_SEARCH_PAGES = 1 @@ -179,7 +183,7 @@ def _request(path: str, *, params: dict[str, Any], cache: TTLCache[str, dict[str f"{_SHODAN_BASE}{path}", params=payload, timeout=_REQUEST_TIMEOUT, - headers={"User-Agent": _USER_AGENT, "Accept": "application/json"}, + headers={"User-Agent": _shodan_user_agent(), "Accept": "application/json"}, ) finally: _last_request_at = time.monotonic() diff --git a/backend/services/tinygs_fetcher.py b/backend/services/tinygs_fetcher.py index cdf806b..f123baf 100644 --- a/backend/services/tinygs_fetcher.py +++ b/backend/services/tinygs_fetcher.py @@ -19,6 +19,13 @@ from pathlib import Path import requests from sgp4.api import Satrec, WGS72, jday + + +def _tinygs_user_agent(purpose: str) -> str: + """Round 7a: per-install handle for CelesTrak / TinyGS attribution.""" + from services.network_utils import outbound_user_agent + return outbound_user_agent(f"tinygs-{purpose}") + logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- @@ -113,7 +120,7 @@ def _fetch_celestrak_tles() -> list[dict]: params={"GROUP": group, "FORMAT": "json"}, timeout=20, headers={ - "User-Agent": "ShadowBroker-OSINT/1.0 (CelesTrak fair-use)", + "User-Agent": _tinygs_user_agent("celestrak"), "Accept": "application/json", }, ) @@ -259,7 +266,7 @@ def _fetch_tinygs_telemetry() -> None: timeout=15, headers={ "Accept": "application/json", - "User-Agent": "ShadowBroker-OSINT/1.0", + "User-Agent": _tinygs_user_agent("tinygs"), }, ) resp.raise_for_status() diff --git a/backend/services/unusual_whales_connector.py b/backend/services/unusual_whales_connector.py index acf97fc..6fdb99c 100644 --- a/backend/services/unusual_whales_connector.py +++ 
b/backend/services/unusual_whales_connector.py @@ -24,7 +24,9 @@ from cachetools import TTLCache logger = logging.getLogger(__name__) _FINNHUB_BASE = "https://finnhub.io/api/v1" -_USER_AGENT = "ShadowBroker/0.9.79 Finnhub connector" +def _finnhub_user_agent(): + from services.network_utils import outbound_user_agent + return outbound_user_agent("finnhub") _REQUEST_TIMEOUT = 12 _MIN_INTERVAL_SECONDS = 0.35 # Stay well under 60 calls/min @@ -89,7 +91,7 @@ def _request(path: str, params: dict[str, Any] | None = None) -> Any: f"{_FINNHUB_BASE}{path}", params=payload, timeout=_REQUEST_TIMEOUT, - headers={"User-Agent": _USER_AGENT, "Accept": "application/json"}, + headers={"User-Agent": _finnhub_user_agent(), "Accept": "application/json"}, ) finally: _last_request_at = time.monotonic() diff --git a/backend/tests/test_gdelt_gcs_direct_rewrite.py b/backend/tests/test_gdelt_gcs_direct_rewrite.py new file mode 100644 index 0000000..deaaf43 --- /dev/null +++ b/backend/tests/test_gdelt_gcs_direct_rewrite.py @@ -0,0 +1,83 @@ +"""GDELT's ``data.gdeltproject.org`` is a CNAME to a Google Cloud Storage +bucket. GCS responds with the wildcard ``*.storage.googleapis.com`` +certificate, which legitimately does NOT cover the GDELT custom +domain, so Python's TLS verification refuses the connection. Some +networks happen to route through a path where this works; many +(notably Docker Desktop's outbound NAT on local installs) do not. + +The fix in ``services.geopolitics._gcs_direct_gdelt_url`` rewrites any +URL pointing at ``data.gdeltproject.org`` to its GCS-direct equivalent +(``storage.googleapis.com/data.gdeltproject.org/...``), where the +standard GCS certificate is genuinely valid. ``api.gdeltproject.org`` +and every other host are left untouched. + +These tests pin that behavior so a future refactor that drops the +helper or accidentally rewrites the wrong host gets a loud failure. 
+""" +from __future__ import annotations + +import pytest + + +def test_rewrites_data_gdeltproject_https(): + from services.geopolitics import _gcs_direct_gdelt_url + + assert _gcs_direct_gdelt_url( + "https://data.gdeltproject.org/gdeltv2/lastupdate.txt" + ) == "https://storage.googleapis.com/data.gdeltproject.org/gdeltv2/lastupdate.txt" + + +def test_rewrites_data_gdeltproject_http(): + """GDELT's lastupdate.txt sometimes lists URLs with http:// — we + rewrite those too (the downstream call upgrades them to https).""" + from services.geopolitics import _gcs_direct_gdelt_url + + assert _gcs_direct_gdelt_url( + "http://data.gdeltproject.org/gdeltv2/20260301120000.export.CSV.zip" + ) == "http://storage.googleapis.com/data.gdeltproject.org/gdeltv2/20260301120000.export.CSV.zip" + + +def test_rewrites_preserve_query_string_and_path(): + from services.geopolitics import _gcs_direct_gdelt_url + + url = "https://data.gdeltproject.org/some/deep/path?a=1&b=2&c=hello%20world" + rewritten = _gcs_direct_gdelt_url(url) + assert rewritten == ( + "https://storage.googleapis.com/data.gdeltproject.org" + "/some/deep/path?a=1&b=2&c=hello%20world" + ) + + +def test_does_not_touch_api_gdeltproject_org(): + """The API host is NOT a CNAME to GCS; rewriting it would break the + actual GDELT API endpoint.""" + from services.geopolitics import _gcs_direct_gdelt_url + + url = "https://api.gdeltproject.org/api/v2/doc/doc?query=carrier" + assert _gcs_direct_gdelt_url(url) == url + + +def test_does_not_touch_other_hosts(): + from services.geopolitics import _gcs_direct_gdelt_url + + for url in ( + "https://en.wikipedia.org/wiki/Boeing_747", + "https://query.wikidata.org/sparql", + "https://storage.googleapis.com/already-correct/path", + "https://nominatim.openstreetmap.org/search", + ): + assert _gcs_direct_gdelt_url(url) == url + + +def test_does_not_partially_match_strings(): + """``data.gdeltproject.org`` is matched exactly; URLs that merely + contain that substring elsewhere (in a query 
parameter, for example) + are left alone. Otherwise we'd rewrite something like + ``https://example.com/?ref=data.gdeltproject.org/x`` which is wrong.""" + from services.geopolitics import _gcs_direct_gdelt_url + + # The match requires ``://`` immediately before the host, so a host + # like ``example-data.gdeltproject.org`` would also be left alone + # (treated as a different host, which is correct). + url = "https://example-data.gdeltproject.org/path" + assert _gcs_direct_gdelt_url(url) == url diff --git a/backend/tests/test_per_operator_outbound_attribution.py b/backend/tests/test_per_operator_outbound_attribution.py new file mode 100644 index 0000000..e2aa643 --- /dev/null +++ b/backend/tests/test_per_operator_outbound_attribution.py @@ -0,0 +1,277 @@ +"""Round 7a: per-install operator handle threads through every outbound +third-party API call. + +Background: before this change every Shadowbroker install identified +itself to Wikipedia, Wikidata, Nominatim, GDELT, OpenMHz, Broadcastify, +weather.gov, NUFORC, etc. with a single project-wide ``Shadowbroker`` +User-Agent. From the upstream's perspective, every install in the world +looked like one giant scraper. If one install misbehaved, the upstream's +only recourse was to block ``Shadowbroker`` as a whole, taking out every +other install. + +Fix: each install gets a stable pseudonymous handle (auto-generated like +``operator-7f3a92`` or operator-overridden via ``OPERATOR_HANDLE``) that +gets embedded in the User-Agent for every outbound call. Upstreams can +now rate-limit / contact the specific operator instead of the project. + +These tests pin: + + 1. The handle is auto-generated on first call if no override exists. + 2. The handle survives process restart (persisted to disk). + 3. ``OPERATOR_HANDLE`` env var override wins over the auto-gen handle. + 4. The handle is sanitized (whitespace, special chars, length). + 5. Every previously-MONSTER-UA call site now sends the per-operator UA. 
+""" +from __future__ import annotations + +import json +import os +from pathlib import Path +from unittest.mock import patch + +import pytest + + +@pytest.fixture +def isolated_handle(tmp_path, monkeypatch): + """Redirect the persistence path to tmp and reset caches between tests.""" + from services import network_utils + + handle_file = tmp_path / "operator_handle.json" + monkeypatch.setattr(network_utils, "_OPERATOR_HANDLE_FILE", handle_file) + network_utils._reset_operator_handle_cache_for_tests() + monkeypatch.delenv("OPERATOR_HANDLE", raising=False) + + # Reset Settings cache so OPERATOR_HANDLE env changes are picked up. + from services.config import get_settings + get_settings.cache_clear() + + yield network_utils + + network_utils._reset_operator_handle_cache_for_tests() + get_settings.cache_clear() + + +# --------------------------------------------------------------------------- +# Core handle generation / persistence / override +# --------------------------------------------------------------------------- + + +class TestOperatorHandleGeneration: + def test_auto_generates_on_first_call(self, isolated_handle): + h = isolated_handle.get_operator_handle() + # Prefix is "operator-" (deliberately neutral; "shadow-" looked + # exactly like a pattern abuse-detection systems would auto-block). + assert h.startswith("operator-") + assert len(h) == len("operator-") + 6 + # Hex suffix. + suffix = h.split("-", 1)[1] + int(suffix, 16) # raises if not hex + + def test_persists_to_disk_so_handle_survives_restart(self, isolated_handle): + first = isolated_handle.get_operator_handle() + # Simulate process restart: clear in-memory cache, then ask again. + isolated_handle._reset_operator_handle_cache_for_tests() + second = isolated_handle.get_operator_handle() + assert second == first + # The file actually exists. 
+ assert isolated_handle._OPERATOR_HANDLE_FILE.exists() + body = json.loads(isolated_handle._OPERATOR_HANDLE_FILE.read_text()) + assert body["handle"] == first + + def test_env_override_wins_over_auto_generated(self, isolated_handle, monkeypatch): + # First call without env var auto-generates. + auto = isolated_handle.get_operator_handle() + assert auto.startswith("operator-") + # Setting env var changes the resolved handle without touching the disk file. + monkeypatch.setenv("OPERATOR_HANDLE", "alice") + from services.config import get_settings + get_settings.cache_clear() + isolated_handle._reset_operator_handle_cache_for_tests() + assert isolated_handle.get_operator_handle() == "alice" + + def test_handle_is_sanitized(self, isolated_handle, monkeypatch): + from services.config import get_settings + + # Sanitization tests run against the normalizer directly so the + # empty-string case can be asserted independently of the env-var + # resolution path (where empty means "use auto-gen", not "use + # 'anonymous'"). 
+ from services.network_utils import _normalize_handle + + cases = [ + ("Alice Smith", "alice-smith"), + ("user@example.com", "user-example-com"), + (" whitespace ", "whitespace"), + ("UPPER-CASE", "upper-case"), + ("multiple---dashes", "multiple-dashes"), + ("/leading/slash", "leading-slash"), + ("trailing-", "trailing"), + ("", "anonymous"), + ] + for raw, expected in cases: + got = _normalize_handle(raw) + assert got == expected, f"{raw!r} -> {got!r}, expected {expected!r}" + assert got == got.lower() + for ch in got: + assert ch.isalnum() or ch in "-_", f"unsafe char {ch!r} in {got!r}" + assert "--" not in got + + def test_handle_is_length_capped(self, isolated_handle, monkeypatch): + from services.config import get_settings + + monkeypatch.setenv("OPERATOR_HANDLE", "x" * 1000) + get_settings.cache_clear() + isolated_handle._reset_operator_handle_cache_for_tests() + got = isolated_handle.get_operator_handle() + assert len(got) <= 48 + + +# --------------------------------------------------------------------------- +# outbound_user_agent() builds the right header +# --------------------------------------------------------------------------- + + +class TestOutboundUserAgentString: + def test_includes_operator_handle(self, isolated_handle): + ua = isolated_handle.outbound_user_agent() + handle = isolated_handle.get_operator_handle() + assert f"operator: {handle}" in ua + + def test_includes_purpose_when_provided(self, isolated_handle): + ua = isolated_handle.outbound_user_agent("wikipedia") + assert "purpose: wikipedia" in ua + + def test_includes_contact_path(self, isolated_handle): + ua = isolated_handle.outbound_user_agent() + assert "github.com" in ua.lower() + assert "shadowbroker" in ua.lower() + + def test_version_prefix(self, isolated_handle): + ua = isolated_handle.outbound_user_agent() + assert ua.startswith("Shadowbroker/") + + +# --------------------------------------------------------------------------- +# Wikipedia / Wikidata — retroactive fix for PR 
#284's MONSTER pattern +# --------------------------------------------------------------------------- + + +class TestWikimediaCallsAreNowPerOperator: + def test_wikidata_call_uses_per_operator_ua(self, isolated_handle, monkeypatch): + from services import region_dossier + + captured = [] + + class _FakeResp: + status_code = 200 + def json(self): + return {"results": {"bindings": []}} + + def fake_fetch(url, **kwargs): + captured.append(kwargs.get("headers") or {}) + return _FakeResp() + + monkeypatch.setattr(region_dossier, "fetch_with_curl", fake_fetch) + region_dossier._fetch_wikidata_leader("Testlandia") + + assert captured, "Wikidata fetcher was not called" + headers = captured[0] + assert "User-Agent" in headers + assert "Api-User-Agent" in headers + handle = isolated_handle.get_operator_handle() + for header_value in (headers["User-Agent"], headers["Api-User-Agent"]): + assert f"operator: {handle}" in header_value, ( + f"Wikimedia UA must include the per-operator handle; got {header_value!r}" + ) + + def test_wikipedia_summary_uses_per_operator_ua(self, isolated_handle, monkeypatch): + from services import region_dossier + + captured = [] + + class _FakeResp: + status_code = 200 + def json(self): + return { + "type": "standard", + "description": "x", + "extract": "y", + "thumbnail": {"source": ""}, + } + + def fake_fetch(url, **kwargs): + captured.append((url, kwargs.get("headers") or {})) + return _FakeResp() + + monkeypatch.setattr(region_dossier, "fetch_with_curl", fake_fetch) + region_dossier._fetch_local_wiki_summary("Paris", "France") + + wikipedia_hits = [c for c in captured if "wikipedia.org" in c[0]] + assert wikipedia_hits, "Wikipedia summary fetch was not called" + for _url, headers in wikipedia_hits: + handle = isolated_handle.get_operator_handle() + assert f"operator: {handle}" in headers.get("User-Agent", "") + + +# --------------------------------------------------------------------------- +# Generic round-7a regression guard +# 
--------------------------------------------------------------------------- + + +class TestNoMonsterUserAgentRemains: + """The audit's underlying concern was that every Shadowbroker install + looked like one entity. This test scans the codebase for the OLD + aggregate identifier patterns and fails if a new one sneaks back in. + + We allow the strings to appear in: + - comments (audit prose, change-log notes) + - tests + - .env.example (documentation) + The test only fails if the string lives in actual outbound-request + HEADER values without going through the per-operator helper. + """ + + BANNED_LITERALS = ( + "ShadowBroker-OSINT/1.0", + "ShadowBroker-OSINT/0.9", + "ShadowBroker-FeedIngester/1.0", + "ShadowBroker/0.9.79 local Shodan connector", + "ShadowBroker/0.9.79 Finnhub connector", + "Mozilla/5.0 (compatible; ShadowBroker CCTV proxy)", + ) + + def test_no_banned_aggregate_user_agent_strings(self): + from pathlib import Path + + backend_root = Path(__file__).parent.parent + offenders = [] + for py in backend_root.rglob("*.py"): + # Skip test files and any audit-context comments. + rel = py.relative_to(backend_root).as_posix() + if rel.startswith("tests/"): + continue + text = py.read_text(encoding="utf-8", errors="ignore") + # Look only for the literal as part of a string in a User-Agent + # context: cheap heuristic via "User-Agent" + literal coexisting + # in the same file. A literal in a comment block won't trigger + # because the same line won't have User-Agent surrounding it. + for banned in self.BANNED_LITERALS: + if banned in text: + # Walk lines to ensure it's a real header value. + for i, line in enumerate(text.splitlines(), 1): + if banned in line: + # Comments / docstrings are allowed — only fail + # if the line looks like a header assignment. 
+ stripped = line.strip() + if stripped.startswith("#"): + continue + if '"User-Agent"' in line or "'User-Agent'" in line: + offenders.append(f"{rel}:{i}: {stripped[:120]}") + assert not offenders, ( + "Round 7a regression: the following lines reintroduced an " + "aggregate Shadowbroker User-Agent. Use " + "outbound_user_agent('purpose') instead so the per-install " + "operator handle is embedded.\n" + + "\n".join(offenders) + ) diff --git a/backend/tests/test_region_dossier_wikimedia_ua.py b/backend/tests/test_region_dossier_wikimedia_ua.py index 96de216..36ca604 100644 --- a/backend/tests/test_region_dossier_wikimedia_ua.py +++ b/backend/tests/test_region_dossier_wikimedia_ua.py @@ -77,15 +77,25 @@ def test_wikipedia_summary_call_passes_wikimedia_request_headers(): assert "github.com" in headers["Api-User-Agent"].lower() -def test_wikimedia_headers_constant_is_stable(): - """Regression guard: if someone removes the contact path from the - Api-User-Agent we want a loud test failure, not a silent ToS drift. - """ - from services.region_dossier import _WIKIMEDIA_REQUEST_HEADERS +def test_wikimedia_headers_helper_is_stable(): + """Regression guard: if someone removes the contact path or the + per-operator handle from the Wikimedia headers, we want a loud + test failure, not a silent ToS drift. - aua = _WIKIMEDIA_REQUEST_HEADERS.get("Api-User-Agent", "") - assert "Shadowbroker" in aua or "ShadowBroker" in aua - assert "github.com" in aua.lower() - # Must include a path Wikimedia operators can use to contact us - # (we use /issues against the public repo). - assert "issues" in aua.lower() + Round 7a: the original ``_WIKIMEDIA_REQUEST_HEADERS`` constant was + replaced with the ``_wikimedia_request_headers()`` function so the + per-install operator handle is embedded at call time. This test + pins both the project identifier AND the contact path AND the + per-operator format. 
+ """ + from services.region_dossier import _wikimedia_request_headers + + headers = _wikimedia_request_headers() + aua = headers.get("Api-User-Agent", "") + ua = headers.get("User-Agent", "") + for h, label in ((ua, "User-Agent"), (aua, "Api-User-Agent")): + assert "Shadowbroker" in h or "ShadowBroker" in h, f"{label} missing project id" + assert "github.com" in h.lower(), f"{label} missing contact URL" + assert "issues" in h.lower(), f"{label} missing /issues contact path" + # Round 7a: must include the per-operator handle. + assert "operator:" in h, f"{label} missing per-operator handle: {h!r}" diff --git a/frontend/src/__tests__/utils/wikimediaClient.test.ts b/frontend/src/__tests__/utils/wikimediaClient.test.ts index 1e17aad..318dd17 100644 --- a/frontend/src/__tests__/utils/wikimediaClient.test.ts +++ b/frontend/src/__tests__/utils/wikimediaClient.test.ts @@ -1,16 +1,21 @@ /** - * Issues #218 / #219 / #220 (tg12 external audit): + * Issues #218 / #219 / #220 (tg12 external audit) + Round 7a: * * Every browser-direct call to Wikipedia or Wikidata must send the - * `Api-User-Agent` header that Wikimedia's UA policy asks for. These - * tests pin that requirement on the shared `lib/wikimediaClient` + * `Api-User-Agent` header that Wikimedia's UA policy asks for, AND must + * embed the per-install operator handle so Wikimedia can rate-limit / + * contact the specific operator instead of treating "Shadowbroker" as + * one giant entity. + * + * These tests pin both requirements on the shared `lib/wikimediaClient` * helper that WikiImage, NewsFeed, and useRegionDossier all route - * through, so a future refactor that drops the header gets a loud - * test failure rather than a silent ToS regression. + * through. A future refactor that drops either the header OR the + * per-operator handle gets a loud test failure rather than a silent + * ToS / privacy regression. 
*/ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; import { - WIKIMEDIA_API_USER_AGENT, + buildWikimediaUserAgent, fetchWikipediaSummary, fetchWikidataSparql, _resetWikimediaClientCacheForTests, @@ -18,6 +23,18 @@ import { const originalFetch = globalThis.fetch; +// Helper: stub fetch so calls to /api/settings/operator-handle return a +// known handle, and everything else proxies to whatever the test set up. +function withHandle(handle: string, otherFetch: typeof globalThis.fetch) { + return vi.fn(async (input: any, init?: RequestInit) => { + const url = String(input); + if (url.endsWith('/api/settings/operator-handle')) { + return new Response(JSON.stringify({ handle }), { status: 200 }); + } + return otherFetch(input, init); + }); +} + describe('lib/wikimediaClient', () => { beforeEach(() => { _resetWikimediaClientCacheForTests(); @@ -28,16 +45,35 @@ describe('lib/wikimediaClient', () => { vi.restoreAllMocks(); }); - it('exposes a stable Api-User-Agent identifier with a contact path', () => { - expect(WIKIMEDIA_API_USER_AGENT).toContain('Shadowbroker'); - expect(WIKIMEDIA_API_USER_AGENT.toLowerCase()).toContain('github.com'); - expect(WIKIMEDIA_API_USER_AGENT.toLowerCase()).toContain('issues'); + it('builds a stable per-operator Api-User-Agent with contact path', async () => { + globalThis.fetch = withHandle( + 'operator-abc123', + vi.fn(async () => new Response('{}', { status: 200 })) as any, + ) as any; + const ua = await buildWikimediaUserAgent('wikipedia-summary'); + expect(ua).toContain('Shadowbroker'); + expect(ua.toLowerCase()).toContain('github.com'); + expect(ua.toLowerCase()).toContain('issues'); + expect(ua).toContain('operator: operator-abc123'); + expect(ua).toContain('purpose: wikipedia-summary'); }); - it('sends Api-User-Agent on Wikipedia summary fetch', async () => { - const calls: Array<{ url: string; init?: RequestInit }> = []; - globalThis.fetch = vi.fn(async (url: any, init?: RequestInit) => { - calls.push({ url: 
String(url), init }); + it('falls back to "operator-offline" when handle endpoint is unreachable', async () => { + globalThis.fetch = vi.fn(async (input: any) => { + const url = String(input); + if (url.endsWith('/api/settings/operator-handle')) { + return new Response('forbidden', { status: 403 }); + } + return new Response('{}', { status: 200 }); + }) as any; + const ua = await buildWikimediaUserAgent('test'); + expect(ua).toContain('operator: operator-offline'); + }); + + it('sends per-operator Api-User-Agent on Wikipedia summary fetch', async () => { + const wikiCalls: Array<{ url: string; init?: RequestInit }> = []; + const baseFetch = vi.fn(async (url: any, init?: RequestInit) => { + wikiCalls.push({ url: String(url), init }); return new Response( JSON.stringify({ type: 'standard', @@ -48,44 +84,71 @@ describe('lib/wikimediaClient', () => { }), { status: 200 }, ); - }) as any; + }); + globalThis.fetch = withHandle('operator-test01', baseFetch as any) as any; const summary = await fetchWikipediaSummary('Boeing 747'); expect(summary?.thumbnail).toBe('https://example.org/thumb.jpg'); - expect(calls).toHaveLength(1); - const headers = (calls[0].init?.headers || {}) as Record; - expect(headers['Api-User-Agent']).toBe(WIKIMEDIA_API_USER_AGENT); + // wikiCalls only captures calls to non-handle URLs. 
+ expect(wikiCalls).toHaveLength(1); + const headers = (wikiCalls[0].init?.headers || {}) as Record; + expect(headers['Api-User-Agent']).toContain('operator: operator-test01'); + expect(headers['Api-User-Agent']).toContain('purpose: wikipedia-summary'); }); - it('sends Api-User-Agent on Wikidata SPARQL fetch', async () => { + it('sends per-operator Api-User-Agent on Wikidata SPARQL fetch', async () => { const calls: Array<{ url: string; init?: RequestInit }> = []; - globalThis.fetch = vi.fn(async (url: any, init?: RequestInit) => { + const baseFetch = vi.fn(async (url: any, init?: RequestInit) => { calls.push({ url: String(url), init }); return new Response( JSON.stringify({ - results: { - bindings: [ - { - leaderLabel: { value: 'Test Leader' }, - govTypeLabel: { value: 'Test Government' }, - }, - ], - }, + results: { bindings: [{ leaderLabel: { value: 'Test Leader' } }] }, + }), + { status: 200 }, + ); + }); + globalThis.fetch = withHandle('operator-sparql', baseFetch as any) as any; + + const bindings = await fetchWikidataSparql('SELECT * WHERE { ?s ?p ?o }'); + expect(bindings).toHaveLength(1); + const headers = (calls[0].init?.headers || {}) as Record; + expect(headers['Api-User-Agent']).toContain('operator: operator-sparql'); + expect(headers['Api-User-Agent']).toContain('purpose: wikidata-sparql'); + expect(headers['Accept']).toBe('application/sparql-results+json'); + }); + + it('handle endpoint is queried only ONCE across many wiki fetches', async () => { + let handleCalls = 0; + let wikiCalls = 0; + globalThis.fetch = vi.fn(async (input: any) => { + const url = String(input); + if (url.endsWith('/api/settings/operator-handle')) { + handleCalls++; + return new Response(JSON.stringify({ handle: 'operator-cache' }), { status: 200 }); + } + wikiCalls++; + return new Response( + JSON.stringify({ + type: 'standard', + title: 'X', + description: '', + extract: '', + thumbnail: { source: 'https://example.org/x.jpg' }, }), { status: 200 }, ); }) as any; - const 
bindings = await fetchWikidataSparql('SELECT * WHERE { ?s ?p ?o }'); - expect(bindings).toHaveLength(1); - const headers = (calls[0].init?.headers || {}) as Record; - expect(headers['Api-User-Agent']).toBe(WIKIMEDIA_API_USER_AGENT); - expect(headers['Accept']).toBe('application/sparql-results+json'); + await fetchWikipediaSummary('Eiffel Tower'); + await fetchWikipediaSummary('Mount Fuji'); + await fetchWikipediaSummary('Statue of Liberty'); + expect(handleCalls).toBe(1); + expect(wikiCalls).toBe(3); }); it('shares cache across consecutive callers for the same Wikipedia title', async () => { let fetchCount = 0; - globalThis.fetch = vi.fn(async () => { + const baseFetch = vi.fn(async () => { fetchCount++; return new Response( JSON.stringify({ @@ -97,7 +160,8 @@ describe('lib/wikimediaClient', () => { }), { status: 200 }, ); - }) as any; + }); + globalThis.fetch = withHandle('operator-cache', baseFetch as any) as any; const a = await fetchWikipediaSummary('Eiffel Tower'); const b = await fetchWikipediaSummary('Eiffel Tower'); @@ -107,7 +171,7 @@ describe('lib/wikimediaClient', () => { it('deduplicates concurrent in-flight requests for the same title', async () => { let fetchCount = 0; - globalThis.fetch = vi.fn(async () => { + const baseFetch = vi.fn(async () => { fetchCount++; await new Promise((r) => setTimeout(r, 5)); return new Response( @@ -120,7 +184,8 @@ describe('lib/wikimediaClient', () => { }), { status: 200 }, ); - }) as any; + }); + globalThis.fetch = withHandle('operator-cache', baseFetch as any) as any; const [a, b, c] = await Promise.all([ fetchWikipediaSummary('Mount Fuji'), @@ -134,28 +199,37 @@ describe('lib/wikimediaClient', () => { }); it('returns null on disambiguation pages without throwing', async () => { - globalThis.fetch = vi.fn(async () => - new Response(JSON.stringify({ type: 'disambiguation' }), { status: 200 }), + globalThis.fetch = withHandle( + 'operator-cache', + vi.fn(async () => + new Response(JSON.stringify({ type: 'disambiguation' 
}), { status: 200 }), + ) as any, ) as any; const summary = await fetchWikipediaSummary('Mercury'); expect(summary).toBeNull(); }); it('returns null on HTTP error without throwing', async () => { - globalThis.fetch = vi.fn(async () => new Response('not found', { status: 404 })) as any; + globalThis.fetch = withHandle( + 'operator-cache', + vi.fn(async () => new Response('not found', { status: 404 })) as any, + ) as any; const summary = await fetchWikipediaSummary('Nonexistent Article 12345'); expect(summary).toBeNull(); }); it('returns null on network error without throwing', async () => { - globalThis.fetch = vi.fn(async () => { - throw new Error('network down'); - }) as any; + globalThis.fetch = withHandle( + 'operator-cache', + vi.fn(async () => { + throw new Error('network down'); + }) as any, + ) as any; const summary = await fetchWikipediaSummary('Anything'); expect(summary).toBeNull(); }); - it('returns null on empty input', async () => { + it('returns null on empty input without fetching anything', async () => { globalThis.fetch = vi.fn(async () => new Response('{}', { status: 200 })) as any; expect(await fetchWikipediaSummary('')).toBeNull(); expect(await fetchWikipediaSummary(' ')).toBeNull(); diff --git a/frontend/src/lib/wikimediaClient.ts b/frontend/src/lib/wikimediaClient.ts index 8b0bbea..7a92aa3 100644 --- a/frontend/src/lib/wikimediaClient.ts +++ b/frontend/src/lib/wikimediaClient.ts @@ -1,51 +1,37 @@ /** * wikimediaClient — single fetch surface for Wikipedia / Wikidata. * - * Issues #218, #219, #220 (tg12 external audit): + * Issues #218, #219, #220 (tg12 external audit) + Round 7a: * * Wikimedia's User-Agent policy asks API clients to identify themselves * via `Api-User-Agent` when calling from browser JavaScript (because the - * browser does not let JS set `User-Agent` directly). 
Before this - * module existed, three independent components issued anonymous browser - * fetches against Wikipedia / Wikidata: + * browser does not let JS set `User-Agent` directly). Three independent + * components used to issue anonymous browser fetches against Wikipedia / + * Wikidata: * * - useRegionDossier (Wikidata SPARQL + Wikipedia REST summary) * - WikiImage (Wikipedia REST summary) * - NewsFeed (Wikipedia REST summary) * - * Each component shipped its own copy-pasted fetch + module-local cache. - * Provider-policy compliance was missing in all three places. + * PR #284 collapsed them into this shared helper with one stable + * `Api-User-Agent`. That fixed compliance but introduced a new problem: + * the `Api-User-Agent` was project-wide, so from Wikimedia's perspective + * every Shadowbroker install looked like one giant scraper. If one + * install misbehaved, Wikimedia's only recourse was to block the project + * as a whole. * - * This module centralizes: + * Round 7a fixes that. The frontend fetches the per-install operator + * handle from `GET /api/settings/operator-handle` once on first use and + * embeds it in the `Api-User-Agent`. Wikimedia can now rate-limit / + * contact the specific install instead of the project. The handle is + * auto-generated on the backend (`operator-XXXXXX`) or operator-chosen via + * the `OPERATOR_HANDLE` setting. * - * 1. The `Api-User-Agent` header on every request. - * 2. A single LRU cache for Wikipedia summary lookups (keyed by article - * title). Multiple components asking for the same article share - * one in-flight request and one cache slot. - * 3. One predictable kill switch — if Wikimedia ever asks us to back - * off, we change `WIKIMEDIA_API_USER_AGENT` here and the whole - * frontend updates. - * - * This does NOT change end-user UX: - * - * - WikiImage still shows the same thumbnails. - * - NewsFeed still shows aircraft thumbnails. - * - useRegionDossier still returns the same place summary + leader.
- * - * What changes: - * - * - Wikimedia can identify our traffic from any other anonymous - * browser visitor pool. - * - Provider-policy fixes happen here once, not in three places. + * UX impact: zero. Same thumbnails, same summaries, same load behavior. + * The only observable change is the value of the outgoing + * `Api-User-Agent` header. */ -// Stable identifier per Wikimedia UA policy. Includes a contact path so -// Wikimedia's operators can reach the project if they need to rate-limit -// or coordinate. Bump the version when the contact path changes. -export const WIKIMEDIA_API_USER_AGENT = - 'Shadowbroker/1.0 (+https://github.com/BigBodyCobain/Shadowbroker; ' + - 'report issues at /issues)'; - // Module-level cache shared by WikiImage, NewsFeed, and useRegionDossier. // Keyed by Wikipedia article title (NOT slug — we keep the human-readable // form so debugging the cache is easier). Values track in-flight state @@ -73,6 +59,66 @@ function evictIfOverCap() { if (oldest) _summaryCache.delete(oldest); } +// ─── Per-operator handle (Round 7a) ──────────────────────────────────────── + +// Fetched once from the backend on first need and cached for the page +// lifetime. The handle is NOT a secret — Wikimedia will see it on every +// Wikipedia / Wikidata request we make — but caching it locally avoids a +// round-trip on every Wikipedia fetch and lets the offline / no-backend +// case still produce a stable UA (the fallback handle). +let _handlePromise: Promise | null = null; +let _cachedHandle: string | null = null; + +const FALLBACK_HANDLE = 'operator-offline'; +const HANDLE_ENDPOINT = '/api/settings/operator-handle'; + +async function fetchOperatorHandle(): Promise { + try { + const res = await fetch(HANDLE_ENDPOINT, { + // Use the standard relative-path proxy so the Next.js admin-key + // injection (same-origin) flows naturally for legitimate browser + // sessions. 
A cross-origin scanner will be blocked by the proxy + // before this even leaves their browser. + credentials: 'same-origin', + }); + if (!res.ok) return FALLBACK_HANDLE; + const data = await res.json(); + const h = (data && typeof data.handle === 'string' && data.handle.trim()) || ''; + return h || FALLBACK_HANDLE; + } catch { + return FALLBACK_HANDLE; + } +} + +async function getOperatorHandle(): Promise { + if (_cachedHandle) return _cachedHandle; + if (!_handlePromise) { + _handlePromise = fetchOperatorHandle().then((h) => { + _cachedHandle = h; + return h; + }); + } + return _handlePromise; +} + +/** Build the Wikimedia Api-User-Agent for this install. + * + * Includes the per-install operator handle so Wikimedia can rate-limit / + * contact the specific operator instead of the project as a whole. + * Exported for tests; production callers should let + * `fetchWikipediaSummary` / `fetchWikidataSparql` build it implicitly. + */ +export async function buildWikimediaUserAgent(purpose: string): Promise { + const handle = await getOperatorHandle(); + const safePurpose = (purpose || '').replace(/[^a-zA-Z0-9_-]/g, '-').toLowerCase(); + return ( + `Shadowbroker/1.0 (operator: ${handle}; purpose: ${safePurpose}; ` + + '+https://github.com/BigBodyCobain/Shadowbroker; report issues at /issues)' + ); +} + +// ─── Wikipedia summary fetch ─────────────────────────────────────────────── + /** Fetch a Wikipedia article summary (titles, NOT URLs). * * Empty / invalid input resolves to `null`. 
Network errors and disambig @@ -92,40 +138,42 @@ export async function fetchWikipediaSummary( const slug = encodeURIComponent(trimmed.replace(/ /g, '_')); const url = `https://en.wikipedia.org/api/rest_v1/page/summary/${slug}`; - const promise = fetch(url, { - headers: { 'Api-User-Agent': WIKIMEDIA_API_USER_AGENT }, - }) - .then(async (r) => { + const promise = (async (): Promise => { + try { + const ua = await buildWikimediaUserAgent('wikipedia-summary'); + const r = await fetch(url, { headers: { 'Api-User-Agent': ua } }); if (!r.ok) return null; const d = await r.json(); if (d?.type === 'disambiguation') return null; - const summary: WikipediaSummary = { + return { title: trimmed, description: d?.description || '', extract: d?.extract || '', thumbnail: d?.thumbnail?.source || d?.originalimage?.source || '', type: d?.type || 'standard', }; - return summary; - }) - .catch(() => null) - .then((summary) => { - _summaryCache.set(trimmed, { summary, inflight: null, loaded: true }); - evictIfOverCap(); - return summary; - }); + } catch { + return null; + } + })().then((summary) => { + _summaryCache.set(trimmed, { summary, inflight: null, loaded: true }); + evictIfOverCap(); + return summary; + }); _summaryCache.set(trimmed, { summary: null, inflight: promise, loaded: false }); evictIfOverCap(); return promise; } +// ─── Wikidata SPARQL ─────────────────────────────────────────────────────── + /** Fetch a Wikidata SPARQL query result. * * Returns the parsed JSON `results.bindings` array on success; `null` * (not throwing) on any failure so callers can render fallbacks - * silently. Kept as a thin wrapper so the audit-required UA header is - * applied in exactly one place. + * silently. Per-install operator handle threaded through `Api-User-Agent` + * (Round 7a). 
*/ export async function fetchWikidataSparql>( sparql: string, @@ -136,9 +184,10 @@ export async function fetchWikidataSparql> trimmed, )}&format=json`; try { + const ua = await buildWikimediaUserAgent('wikidata-sparql'); const res = await fetch(url, { headers: { - 'Api-User-Agent': WIKIMEDIA_API_USER_AGENT, + 'Api-User-Agent': ua, Accept: 'application/sparql-results+json', }, }); @@ -151,7 +200,11 @@ export async function fetchWikidataSparql> } } -/** Internal: clear the shared cache. Exposed for tests only. */ +// ─── Test helpers ────────────────────────────────────────────────────────── + +/** Internal: clear the shared cache + the handle cache. Exposed for tests only. */ export function _resetWikimediaClientCacheForTests() { _summaryCache.clear(); + _handlePromise = null; + _cachedHandle = null; } diff --git a/uv.lock b/uv.lock index fa417f4..6a28a8a 100644 --- a/uv.lock +++ b/uv.lock @@ -80,7 +80,6 @@ dependencies = [ { name = "apscheduler" }, { name = "beautifulsoup4" }, { name = "cachetools" }, - { name = "cloudscraper" }, { name = "cryptography" }, { name = "defusedxml" }, { name = "fastapi" }, @@ -119,7 +118,6 @@ requires-dist = [ { name = "apscheduler", specifier = "==3.10.3" }, { name = "beautifulsoup4", specifier = ">=4.9.0" }, { name = "cachetools", specifier = "==5.5.2" }, - { name = "cloudscraper", specifier = "==1.2.71" }, { name = "cryptography", specifier = ">=41.0.0" }, { name = "defusedxml", specifier = ">=0.7.1" }, { name = "fastapi", specifier = "==0.115.12" }, @@ -453,20 +451,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, ] -[[package]] -name = "cloudscraper" -version = "1.2.71" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyparsing" }, - { name = 
"requests" }, - { name = "requests-toolbelt" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ac/25/6d0481860583f44953bd791de0b7c4f6d7ead7223f8a17e776247b34a5b4/cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3", size = 93261, upload-time = "2023-04-25T23:20:19.467Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/97/fc88803a451029688dffd7eb446dc1b529657577aec13aceff1cc9628c5d/cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0", size = 99652, upload-time = "2023-04-25T23:20:15.974Z" }, -] - [[package]] name = "colorama" version = "0.4.6" @@ -1643,15 +1627,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/99/32/15e08a0c4bb536303e1568e2ba5cae1ce39a2e026a03aea46173af4c7a2d/pyobjc_framework_libdispatch-12.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:23fc9915cba328216b6a736c7a48438a16213f16dfb467f69506300b95938cc7", size = 15976, upload-time = "2025-11-14T09:53:07.936Z" }, ] -[[package]] -name = "pyparsing" -version = "3.3.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f3/91/9c6ee907786a473bf81c5f53cf703ba0957b23ab84c264080fb5a450416f/pyparsing-3.3.2.tar.gz", hash = "sha256:c777f4d763f140633dcb6d8a3eda953bf7a214dc4eff598413c070bcdc117cbc", size = 6851574, upload-time = "2026-01-21T03:57:59.36Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/10/bd/c038d7cc38edc1aa5bf91ab8068b63d4308c66c4c8bb3cbba7dfbc049f9c/pyparsing-3.3.2-py3-none-any.whl", hash = "sha256:850ba148bd908d7e2411587e247a1e4f0327839c40e2e5e6d05a007ecc69911d", size = 122781, upload-time = "2026-01-21T03:57:55.912Z" }, -] - [[package]] name = "pypubsub" version = "4.0.7" @@ -1901,18 +1876,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/70/8e/0e2d847013cb52cd35b38c009bb167a1a26b2ce6cd6965bf26b47bc0bf44/requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f", size = 62574, upload-time = "2023-05-22T15:12:42.313Z" }, ] -[[package]] -name = "requests-toolbelt" -version = "1.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f3/61/d7545dafb7ac2230c70d38d31cbfe4cc64f7144dc41f6e4e4b78ecd9f5bb/requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6", size = 206888, upload-time = "2023-05-01T04:11:33.229Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481, upload-time = "2023-05-01T04:11:28.427Z" }, -] - [[package]] name = "reverse-geocoder" version = "1.5.1"