mirror of
https://github.com/BigBodyCobain/Shadowbroker.git
synced 2026-05-28 18:11:31 +02:00
76750caa92
== Per-install operator handle for every third-party API call ==
Before this PR, every Shadowbroker install identified itself to
Wikipedia, Wikidata, Nominatim, GDELT, OpenMHz, Broadcastify,
weather.gov, NUFORC, Sentinel/Planetary Computer, TinyGS / CelesTrak,
Shodan, Finnhub, and others with a single project-wide User-Agent
("Shadowbroker/1.0" or "ShadowBroker-OSINT/1.0"). From the upstream's
perspective every install in the world looked like one giant scraper.
If one install misbehaved, the upstream's only recourse was to block
"Shadowbroker" as a whole.
PR #284 inadvertently doubled down on this in the frontend by
introducing a shared `WIKIMEDIA_API_USER_AGENT` constant. This PR
retrofits both backends to per-operator attribution.
New setting: OPERATOR_HANDLE (env var / settings UI / auto-gen)
New helper: network_utils.outbound_user_agent("purpose")
The handle is auto-generated as "operator-XXXXXX" on first call (the
"shadow-" prefix from earlier drafts was deliberately dropped — too
suspicious-looking for abuse-detection systems). Operators can
override via OPERATOR_HANDLE; the value is sanitized to lowercase
alphanumeric+dash+underscore and capped at 48 chars. Persisted to
backend/data/operator_handle.json so it survives container restarts.
Retrofitted call sites (every one that previously sent the single project-wide User-Agent):
- services/region_dossier.py (Wikipedia + Wikidata + Nominatim)
- services/geocode.py (Nominatim)
- services/sentinel_search.py (Microsoft Planetary Computer)
- services/feed_ingester.py (operator-curated RSS feeds)
- services/fetchers/earth_observation.py (weather.gov, NUFORC)
- services/fetchers/infrastructure.py
- services/fetchers/aircraft_database.py
- services/fetchers/route_database.py
- services/fetchers/trains.py
- services/fetchers/meshtastic_map.py
- services/shodan_connector.py
- services/unusual_whales_connector.py (Finnhub)
- services/tinygs_fetcher.py (CelesTrak + TinyGS)
- services/sar/sar_products_client.py
- services/geopolitics.py (GDELT)
- services/radio_intercept.py (Broadcastify + OpenMHz)
- routers/cctv.py + main.py (CCTV proxy)
- routers/ai_intel.py
- scripts/convert_power_plants.py (release-time data refresh)
Spoofed browser UAs removed (issues #289 / #290 / #291 — tg12 audit):
- cloudscraper-based Chrome impersonation against api.openmhz.com
-> replaced with honest requests + per-install UA
- Mozilla/5.0 spoofed UA on Broadcastify scrape
-> replaced with honest UA
- Mozilla/5.0 + fake first-party Referer on OpenMHz audio relay
-> replaced with honest UA
- cloudscraper dependency dropped from pyproject.toml + uv.lock
Frontend retrofit:
- new GET /api/settings/operator-handle endpoint (local-operator
gated) returns the install's handle
- frontend/src/lib/wikimediaClient.ts fetches the handle once on
first use, caches it for page lifetime, embeds it in the
Api-User-Agent for every Wikipedia / Wikidata browser-direct call
== GDELT GCS-direct fix ==
GDELT's data.gdeltproject.org is a CNAME to a Google Cloud Storage
bucket. GCS responds with the wildcard *.storage.googleapis.com cert
which legitimately does NOT cover the GDELT custom domain, so Python's
TLS verification correctly refuses the connection. Some networks
happen to route through a path where this works; many (notably Docker
Desktop's outbound NAT on local installs) do not. Verified on the
maintainer's local install: GDELT was unreachable; 1610 geopolitical
events / 48 export files were dropping silently.
Fix: services/geopolitics._gcs_direct_gdelt_url() rewrites any
data.gdeltproject.org URL to its GCS-direct equivalent
(storage.googleapis.com/data.gdeltproject.org/...) where the standard
GCS cert is genuinely valid. api.gdeltproject.org and every other host
are left untouched.
Confirmed live: backend log goes from
GDELT lastupdate failed: 500
to
Downloading 48 GDELT export files...
Downloaded 48/48 GDELT exports
GDELT parsed: 1610 conflict locations from 48 files
== Tests ==
backend/tests/test_per_operator_outbound_attribution.py (12 tests)
backend/tests/test_gdelt_gcs_direct_rewrite.py (6 tests)
backend/tests/test_region_dossier_wikimedia_ua.py (updated to
pin the helper + per-operator handle, not the old constant)
frontend/src/__tests__/utils/wikimediaClient.test.ts (rewritten
to mock /api/settings/operator-handle and assert per-operator UA)
Local: backend 114/114 security+audit+round7a suite green;
frontend 718/718 vitest suite green.
Credit: tg12 (external security audit, issues #289/#290/#291
relating to spoofed UAs); BigBodyCobain (operator-prefix call,
GDELT cloud-vs-local diagnosis).
181 lines
6.0 KiB
Python
181 lines
6.0 KiB
Python
"""OpenSky aircraft metadata: ICAO24 hex -> ICAO type code + friendly model.
|
|
|
|
OpenSky's /states/all does not include aircraft type, so OpenSky-sourced
|
|
flights arrive with ``t`` field empty. This module bulk-loads the public
|
|
OpenSky aircraft database (one snapshot CSV per month, ~108 MB uncompressed,
|
|
~600k aircraft) once every 5 days and exposes a fast in-memory hex lookup.
|
|
|
|
The data is also useful when adsb.lol's live API is degraded: even the
|
|
adsb.lol /v2 feed sometimes returns aircraft with empty ``t`` for newly seen
|
|
transponders, and the lookup gracefully fills those in too.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
import logging
|
|
import threading
|
|
import time
|
|
from typing import Any
|
|
|
|
import defusedxml.ElementTree as ET
|
|
import requests
|
|
|
|
|
|
|
|
def _aircraft_db_user_agent() -> str:
    """Build the User-Agent header value for aircraft-database requests.

    Round 7a: the helper is imported lazily, at call time, so that the
    per-install operator handle is included in the returned value.
    """
    from services.network_utils import outbound_user_agent as build_user_agent

    return build_user_agent("aircraft-database")
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Bucket listing endpoint, restricted to the ``metadata/`` prefix.
_BUCKET_LIST_URL = "https://s3.opensky-network.org/data-samples?prefix=metadata/&list-type=2"
# A snapshot's download URL is this base followed by its bucket key.
_BUCKET_BASE = "https://s3.opensky-network.org/data-samples/"
# XML namespace prefix applied to element names when walking the listing.
_S3_NS = "{http://s3.amazonaws.com/doc/2006-03-01/}"

# Minimum age of a non-empty cache before it is refreshed again: 5 days.
_REFRESH_INTERVAL_S = 5 * 24 * 60 * 60
# ``requests`` timeouts (seconds) for the bucket listing and the CSV download.
_LIST_TIMEOUT_S = 30
_DOWNLOAD_TIMEOUT_S = 600

# NOTE(review): ``_USER_AGENT`` is not referenced in the visible part of this
# module (requests use ``_aircraft_db_user_agent()``); confirm before removing.
from services.network_utils import DEFAULT_USER_AGENT as _USER_AGENT  # noqa: E402

# Shared cache state; ``_lock`` is held whenever the three values below are
# read or written by the functions in this module.
_lock = threading.RLock()
_aircraft_by_hex: dict[str, dict[str, str]] = {}  # icao24 hex -> metadata
_last_refresh: float = 0.0  # time.time() of the last successful refresh
_in_progress: bool = False  # True while a refresh is running
|
|
|
|
|
|
def _latest_snapshot_key() -> str:
    """Return the bucket key of the newest aircraft-database-complete CSV.

    Fetches the bucket listing, keeps every ``Key`` that contains
    ``aircraft-database-complete-`` and ends with ``.csv``, and returns the
    one that sorts last lexicographically.

    Raises:
        requests.RequestException: if the listing request fails or returns
            an error status.
        RuntimeError: if the listing contains no matching snapshot key.
    """
    listing = requests.get(
        _BUCKET_LIST_URL,
        headers={"User-Agent": _aircraft_db_user_agent()},
        timeout=_LIST_TIMEOUT_S,
    )
    listing.raise_for_status()
    tree = ET.fromstring(listing.text)

    contents_tag = f"{_S3_NS}Contents"
    key_tag = f"{_S3_NS}Key"
    # findtext() gives None for a missing <Key> and "" for an empty one;
    # both are falsy and therefore skipped.
    names = (node.findtext(key_tag) for node in tree.iter(contents_tag))
    snapshots = [
        name
        for name in names
        if name and "aircraft-database-complete-" in name and name.endswith(".csv")
    ]
    if not snapshots:
        raise RuntimeError("no aircraft-database-complete snapshot found in bucket listing")
    return max(snapshots)
|
|
|
|
|
|
def _stream_csv_index(url: str) -> dict[str, dict[str, str]]:
    """Download the OpenSky aircraft CSV at *url* and index it by ICAO24 hex.

    The CSV uses single-quote quoting, hence ``quotechar="'"``. The response
    is streamed and decoded one line at a time via ``iter_lines()`` so the
    ~108 MB file is never held in memory as a whole.

    Rows are skipped when ``icao24`` is blank or ``000000``, or when both
    ``typecode`` and ``model`` are blank. Each entry carries only the
    non-empty fields among ``typecode``, ``model``, ``manufacturer``,
    ``registration`` and ``operator``. A later row with the same hex
    replaces an earlier one.

    Raises:
        requests.RequestException: if the download fails or returns an
            error status.
    """
    by_hex: dict[str, dict[str, str]] = {}
    with requests.get(
        url,
        stream=True,
        headers={"User-Agent": _aircraft_db_user_agent()},
        timeout=_DOWNLOAD_TIMEOUT_S,
    ) as resp:
        resp.raise_for_status()
        # Empty lines are dropped; undecodable bytes become U+FFFD.
        decoded_lines = (
            raw.decode("utf-8", errors="replace")
            for raw in resp.iter_lines(decode_unicode=False)
            if raw
        )
        for record in csv.DictReader(decoded_lines, quotechar="'"):
            icao24 = (record.get("icao24") or "").strip().lower()
            if icao24 in ("", "000000"):
                continue
            typecode = (record.get("typecode") or "").strip().upper()
            model = (record.get("model") or "").strip()
            if not typecode and not model:
                continue
            # Output key -> cleaned value, in the order keys are inserted.
            candidates = (
                ("typecode", typecode),
                ("model", model),
                ("manufacturer", (record.get("manufacturerName") or "").strip()),
                ("registration", (record.get("registration") or "").strip().upper()),
                ("operator", (record.get("operator") or "").strip()),
            )
            by_hex[icao24] = {field: value for field, value in candidates if value}
    return by_hex
|
|
|
|
|
|
def refresh_aircraft_database(force: bool = False) -> bool:
    """Download the latest OpenSky aircraft snapshot and rebuild the index.

    Args:
        force: When True, refresh even if the cached index is still fresh.

    Returns:
        True if a refresh was attempted (whether it succeeded or failed).
        False if it was skipped because another refresh is in flight, or
        because the index is non-empty and younger than
        ``_REFRESH_INTERVAL_S`` and *force* is False.

    Download and parse failures are logged as warnings and reported as an
    attempted refresh; the previous index is left untouched in that case.
    """
    global _last_refresh, _in_progress

    now = time.time()
    with _lock:
        if _in_progress:
            return False
        if not force and (now - _last_refresh) < _REFRESH_INTERVAL_S and _aircraft_by_hex:
            return False
        _in_progress = True

    try:
        started = time.time()
        key = _latest_snapshot_key()
        # The slow download/parse runs outside the lock so lookups are not
        # blocked; only the final swap of the index contents is locked.
        index = _stream_csv_index(_BUCKET_BASE + key)
        with _lock:
            _aircraft_by_hex.clear()
            _aircraft_by_hex.update(index)
            _last_refresh = time.time()
        logger.info(
            "aircraft database refreshed in %.1fs from %s: %d aircraft",
            time.time() - started,
            key,
            len(index),
        )
        return True
    except (
        requests.RequestException,
        OSError,
        ValueError,
        # Raised by _latest_snapshot_key() when the listing has no snapshot.
        RuntimeError,
        # Raised by csv.DictReader on malformed CSV; not a ValueError subclass.
        csv.Error,
        ET.ParseError,
    ) as exc:
        logger.warning("aircraft database refresh failed: %s", exc)
        return True
    finally:
        # Always clear the flag so a failed attempt cannot block later ones.
        with _lock:
            _in_progress = False
|
|
|
|
|
|
def lookup_aircraft(icao24: str) -> dict[str, str] | None:
    """Look up the metadata record for an ICAO24 hex code.

    The code is stripped and lower-cased before the lookup. Returns a copy
    of the stored record, so callers may mutate it freely, or None when the
    code is empty or not present in the index.
    """
    normalized = (icao24 or "").strip().lower()
    if normalized:
        with _lock:
            record = _aircraft_by_hex.get(normalized)
        if record:
            return dict(record)
    return None
|
|
|
|
|
|
def lookup_aircraft_type(icao24: str) -> str:
    """Return the ICAO type code (e.g. 'B738', 'GLF4') for an ICAO24 hex.

    Returns '' when the aircraft is unknown or its record has no type code.
    """
    record = lookup_aircraft(icao24)
    return record.get("typecode", "") if record else ""
|
|
|
|
|
|
def aircraft_database_status() -> dict[str, Any]:
    """Return a snapshot of the cache state, read under the module lock.

    Keys:
        last_refresh: current value of ``_last_refresh``.
        aircraft: number of records currently in the index.
        in_progress: whether a refresh is running right now.
    """
    with _lock:
        snapshot: dict[str, Any] = dict(
            last_refresh=_last_refresh,
            aircraft=len(_aircraft_by_hex),
            in_progress=_in_progress,
        )
    return snapshot
|