mirror of
https://github.com/BigBodyCobain/Shadowbroker.git
synced 2026-05-28 10:01:31 +02:00
76750caa92
== Per-install operator handle for every third-party API call ==
Before this PR, every Shadowbroker install identified itself to
Wikipedia, Wikidata, Nominatim, GDELT, OpenMHz, Broadcastify,
weather.gov, NUFORC, Sentinel/Planetary Computer, TinyGS / CelesTrak,
Shodan, Finnhub, and others with a single project-wide User-Agent
("Shadowbroker/1.0" or "ShadowBroker-OSINT/1.0"). From the upstream's
perspective every install in the world looked like one giant scraper.
If one install misbehaved, the upstream's only recourse was to block
"Shadowbroker" as a whole.
PR #284 inadvertently doubled down on this in the frontend by
introducing a shared `WIKIMEDIA_API_USER_AGENT` constant. This PR
retrofits both backends to per-operator attribution.
New setting: OPERATOR_HANDLE (env var / settings UI / auto-gen)
New helper: network_utils.outbound_user_agent("purpose")
The handle is auto-generated as "operator-XXXXXX" on first call (the
"shadow-" prefix from earlier drafts was deliberately dropped — too
suspicious-looking for abuse-detection systems). Operators can
override via OPERATOR_HANDLE; the value is sanitized to lowercase
alphanumeric+dash+underscore and capped at 48 chars. Persisted to
backend/data/operator_handle.json so it survives container restarts.
Retrofitted call sites (every call site that previously sent the shared, project-wide User-Agent):
- services/region_dossier.py (Wikipedia + Wikidata + Nominatim)
- services/geocode.py (Nominatim)
- services/sentinel_search.py (Microsoft Planetary Computer)
- services/feed_ingester.py (operator-curated RSS feeds)
- services/fetchers/earth_observation.py (weather.gov, NUFORC)
- services/fetchers/infrastructure.py
- services/fetchers/aircraft_database.py
- services/fetchers/route_database.py
- services/fetchers/trains.py
- services/fetchers/meshtastic_map.py
- services/shodan_connector.py
- services/unusual_whales_connector.py (Finnhub)
- services/tinygs_fetcher.py (CelesTrak + TinyGS)
- services/sar/sar_products_client.py
- services/geopolitics.py (GDELT)
- services/radio_intercept.py (Broadcastify + OpenMHz)
- routers/cctv.py + main.py (CCTV proxy)
- routers/ai_intel.py
- scripts/convert_power_plants.py (release-time data refresh)
Spoofed browser UAs removed (issues #289 / #290 / #291 — tg12 audit):
- cloudscraper-based Chrome impersonation against api.openmhz.com
-> replaced with honest requests + per-install UA
- Mozilla/5.0 spoofed UA on Broadcastify scrape
-> replaced with honest UA
- Mozilla/5.0 + fake first-party Referer on OpenMHz audio relay
-> replaced with honest UA
- cloudscraper dependency dropped from pyproject.toml + uv.lock
Frontend retrofit:
- new GET /api/settings/operator-handle endpoint (local-operator
gated) returns the install's handle
- frontend/src/lib/wikimediaClient.ts fetches the handle once on
first use, caches it for page lifetime, embeds it in the
Api-User-Agent for every Wikipedia / Wikidata browser-direct call
== GDELT GCS-direct fix ==
GDELT's data.gdeltproject.org is a CNAME to a Google Cloud Storage
bucket. GCS responds with the wildcard *.storage.googleapis.com cert
which legitimately does NOT cover the GDELT custom domain, so Python's
TLS verification correctly refuses the connection. Some networks
happen to route through a path where this works; many (notably Docker
Desktop's outbound NAT on local installs) do not. Verified on the
maintainer's local install: GDELT was unreachable; 1610 geopolitical
events / 48 export files were dropping silently.
Fix: services/geopolitics._gcs_direct_gdelt_url() rewrites any
data.gdeltproject.org URL to its GCS-direct equivalent
(storage.googleapis.com/data.gdeltproject.org/...) where the standard
GCS cert is genuinely valid. api.gdeltproject.org and every other host
are left untouched.
Confirmed live: backend log goes from
GDELT lastupdate failed: 500
to
Downloading 48 GDELT export files...
Downloaded 48/48 GDELT exports
GDELT parsed: 1610 conflict locations from 48 files
== Tests ==
backend/tests/test_per_operator_outbound_attribution.py (12 tests)
backend/tests/test_gdelt_gcs_direct_rewrite.py (6 tests)
backend/tests/test_region_dossier_wikimedia_ua.py (updated to
pin the helper + per-operator handle, not the old constant)
frontend/src/__tests__/utils/wikimediaClient.test.ts (rewritten
to mock /api/settings/operator-handle and assert per-operator UA)
Local: backend 114/114 security+audit+round7a suite green;
frontend 718/718 vitest suite green.
Credit: tg12 (external security audit, issues #289/#290/#291
relating to spoofed UAs); BigBodyCobain (operator-prefix call,
GDELT cloud-vs-local diagnosis).
464 lines
16 KiB
Python
464 lines
16 KiB
Python
"""Train tracking fetchers with normalized metadata and non-redundant merging."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import math
|
|
from collections.abc import Callable
|
|
from datetime import datetime, timezone
|
|
|
|
from services.fetchers._store import _data_lock, _mark_fresh, latest_data
|
|
from services.network_utils import fetch_with_curl
|
|
|
|
|
|
|
|
def _trains_user_agent() -> str:
    """Return the outbound User-Agent string for train-feed requests.

    Delegates to ``network_utils.outbound_user_agent`` with the purpose tag
    ``"trains"``.
    """
    # Resolved at call time (function-scope import), as in the original.
    from services.network_utils import outbound_user_agent as build_user_agent

    return build_user_agent("trains")
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Sphere radius (km) used by _haversine_km.
_EARTH_RADIUS_KM = 6371.0
# Maximum separation (km) at which _should_merge still treats two records
# with the same merge key as one train.
_MERGE_DISTANCE_KM = 5.0
# Upper bound (km/h) applied to speeds inferred from consecutive positions.
_MAX_INFERRED_SPEED_KMH = 350.0
# Track-cache entries older than this many seconds (6 hours) are pruned.
_TRACK_CACHE_TTL_S = 6 * 60 * 60
|
|
|
|
# Per-source defaults keyed by the ``source`` id passed to _normalize_train.
# Each entry supplies the fallback source_label / operator / country /
# telemetry_quality for a record, plus a numeric ``priority`` that
# _merge_train_pair compares (higher wins) when two records are merged.
_SOURCE_METADATA: dict[str, dict[str, object]] = {
    "amtrak": {
        "source_label": "Amtraker",
        "operator": "Amtrak",
        "country": "US",
        "telemetry_quality": "aggregated",
        "priority": 70,
    },
    "digitraffic": {
        "source_label": "Digitraffic Finland",
        "operator": "Finnish Rail",
        "country": "FI",
        "telemetry_quality": "official",
        "priority": 100,
    },
    # Future slots so better official feeds can be merged without changing the
    # rest of the train pipeline or duplicating map entities.
    "networkrail": {
        "source_label": "Network Rail Open Data",
        "operator": "Network Rail",
        "country": "GB",
        "telemetry_quality": "official",
        "priority": 98,
    },
    "dbcargo": {
        "source_label": "DB Cargo link2rail",
        "operator": "DB Cargo",
        "country": "DE",
        "telemetry_quality": "commercial",
        "priority": 96,
    },
    "railinc": {
        "source_label": "Railinc RailSight",
        "operator": "Railinc",
        "country": "US",
        "telemetry_quality": "commercial",
        "priority": 97,
    },
    "sncf": {
        "source_label": "SNCF Open Data",
        "operator": "SNCF",
        "country": "FR",
        "telemetry_quality": "official",
        "priority": 94,
    },
}
|
|
|
|
# Last known position per train id: {"lat": ..., "lng": ..., "ts": ...}.
# Written by _apply_motion_estimates and trimmed by _prune_track_cache.
_TRAIN_TRACK_CACHE: dict[str, dict[str, float]] = {}
|
|
|
|
|
|
def _safe_float(value) -> float | None:
|
|
try:
|
|
if value is None or value == "":
|
|
return None
|
|
return float(value)
|
|
except (TypeError, ValueError):
|
|
return None
|
|
|
|
|
|
def _parse_observed_at(value) -> float | None:
|
|
if value is None or value == "":
|
|
return None
|
|
if isinstance(value, (int, float)):
|
|
raw = float(value)
|
|
return raw / 1000.0 if raw > 1_000_000_000_000 else raw
|
|
if not isinstance(value, str):
|
|
return None
|
|
text = value.strip()
|
|
if not text:
|
|
return None
|
|
if text.endswith("Z"):
|
|
text = f"{text[:-1]}+00:00"
|
|
try:
|
|
return datetime.fromisoformat(text).timestamp()
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def _haversine_km(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Return the great-circle distance in kilometres between two points.

    Inputs are latitude/longitude in degrees. Uses the haversine formula on a
    sphere of radius ``_EARTH_RADIUS_KM``.
    """
    lat1_rad, lon1_rad = math.radians(lat1), math.radians(lon1)
    lat2_rad, lon2_rad = math.radians(lat2), math.radians(lon2)
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    a = (
        math.sin(dlat / 2.0) ** 2
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2.0) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # near-antipodal points, which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))
    return 2.0 * _EARTH_RADIUS_KM * math.asin(math.sqrt(a))
|
|
|
|
|
|
def _bearing_degrees(lat1: float, lon1: float, lat2: float, lon2: float) -> float | None:
|
|
if lat1 == lat2 and lon1 == lon2:
|
|
return None
|
|
lat1_rad, lat2_rad = math.radians(lat1), math.radians(lat2)
|
|
dlon_rad = math.radians(lon2 - lon1)
|
|
y = math.sin(dlon_rad) * math.cos(lat2_rad)
|
|
x = (
|
|
math.cos(lat1_rad) * math.sin(lat2_rad)
|
|
- math.sin(lat1_rad) * math.cos(lat2_rad) * math.cos(dlon_rad)
|
|
)
|
|
return (math.degrees(math.atan2(y, x)) + 360.0) % 360.0
|
|
|
|
|
|
def _source_meta(source: str) -> dict[str, object]:
    """Return a shallow copy of the metadata defaults registered for *source*.

    Unknown sources yield an empty dict. A copy is returned so callers cannot
    mutate the shared ``_SOURCE_METADATA`` table.
    """
    if source in _SOURCE_METADATA:
        return dict(_SOURCE_METADATA[source])
    return {}
|
|
|
|
|
|
def _normalize_train(
    *,
    source: str,
    raw_id: str,
    number: str,
    lat,
    lng,
    name: str = "",
    status: str = "Active",
    route: str = "",
    speed_kmh=None,
    heading=None,
    operator: str | None = None,
    country: str | None = None,
    source_label: str | None = None,
    telemetry_quality: str | None = None,
    observed_at=None,
) -> dict | None:
    """Build the common train record from one feed item.

    Returns None when the coordinates cannot be parsed or lie outside
    [-90, 90] / [-180, 180]. Explicit keyword values take precedence over the
    per-source defaults from ``_source_meta``. The record carries two private
    keys, ``_source_priority`` and ``_observed_ts``, that the merge step uses
    and later strips. Before returning, the record is passed through
    ``_apply_motion_estimates``, which may fill in missing speed/heading and
    updates the track cache.
    """
    latitude = _safe_float(lat)
    longitude = _safe_float(lng)
    if latitude is None or longitude is None:
        return None
    lat_in_range = -90.0 <= latitude <= 90.0
    lng_in_range = -180.0 <= longitude <= 180.0
    if not lat_in_range or not lng_in_range:
        return None

    train_number = str(number or "").strip()
    defaults = _source_meta(source)
    # Fall back to "now" when the feed supplies no usable observation time.
    seen_at = _parse_observed_at(observed_at) or datetime.now(timezone.utc).timestamp()
    speed_value = _safe_float(speed_kmh)
    heading_value = _safe_float(heading)

    quality = telemetry_quality or defaults.get("telemetry_quality") or "unknown"
    record = {
        "id": str(raw_id or f"{source}-{train_number or 'unknown'}"),
        "name": str(name or f"Train {train_number or '?'}").strip(),
        "number": train_number,
        "source": source,
        "source_label": str(source_label or defaults.get("source_label") or source.upper()),
        "operator": str(operator or defaults.get("operator") or "").strip(),
        "country": str(country or defaults.get("country") or "").strip(),
        "telemetry_quality": str(quality).strip(),
        "lat": latitude,
        "lng": longitude,
        "speed_kmh": speed_value,
        "heading": heading_value,
        "status": str(status or "Active").strip(),
        "route": str(route or "").strip(),
        "_source_priority": int(defaults.get("priority") or 0),
        "_observed_ts": seen_at,
    }
    _apply_motion_estimates(record)
    return record
|
|
|
|
|
|
def _prune_track_cache(now_ts: float) -> None:
    """Drop track-cache entries older than ``_TRACK_CACHE_TTL_S`` relative to *now_ts*."""
    cutoff = now_ts - _TRACK_CACHE_TTL_S
    # Collect first, then remove, so the dict is not mutated while iterating.
    expired_ids = []
    for cached_id, snapshot in _TRAIN_TRACK_CACHE.items():
        if snapshot["ts"] < cutoff:
            expired_ids.append(cached_id)
    for cached_id in expired_ids:
        _TRAIN_TRACK_CACHE.pop(cached_id, None)
|
|
|
|
|
|
def _apply_motion_estimates(train: dict) -> None:
    """Fill in missing speed/heading from the train's previous cached position.

    Mutates *train* in place and always records its current position in
    ``_TRAIN_TRACK_CACHE`` (unless the record has no id). Inference only
    happens when the previous fix is 5 s to 15 min old and the distance moved
    is at least 0.02 km but no more than ``_MAX_INFERRED_SPEED_KMH`` would
    allow in that time. Values already present on *train* are never
    overwritten.
    """
    cache_key = str(train.get("id") or "")
    if not cache_key:
        return

    observed = float(train.get("_observed_ts") or datetime.now(timezone.utc).timestamp())
    _prune_track_cache(observed)

    last_fix = _TRAIN_TRACK_CACHE.get(cache_key)
    if last_fix:
        elapsed_s = observed - last_fix["ts"]
        if 5.0 <= elapsed_s <= 15.0 * 60.0:
            elapsed_h = elapsed_s / 3600.0
            prev_lat, prev_lng = float(last_fix["lat"]), float(last_fix["lng"])
            cur_lat, cur_lng = float(train["lat"]), float(train["lng"])
            travelled_km = _haversine_km(prev_lat, prev_lng, cur_lat, cur_lng)
            # Reject jitter (too small) and teleports (faster than the cap).
            if 0.02 <= travelled_km <= (_MAX_INFERRED_SPEED_KMH * elapsed_h):
                if train.get("speed_kmh") is None:
                    estimated_speed = travelled_km / elapsed_h
                    train["speed_kmh"] = round(min(estimated_speed, _MAX_INFERRED_SPEED_KMH), 1)
                if train.get("heading") is None:
                    estimated_heading = _bearing_degrees(prev_lat, prev_lng, cur_lat, cur_lng)
                    if estimated_heading is not None:
                        train["heading"] = round(estimated_heading, 1)

    _TRAIN_TRACK_CACHE[cache_key] = {
        "lat": float(train["lat"]),
        "lng": float(train["lng"]),
        "ts": observed,
    }
|
|
|
|
|
|
def _train_merge_key(train: dict) -> str:
|
|
operator = str(train.get("operator") or "").strip().lower()
|
|
country = str(train.get("country") or "").strip().lower()
|
|
number = str(train.get("number") or "").strip().lower()
|
|
if operator and number:
|
|
return f"{country}|{operator}|{number}"
|
|
return f"{str(train.get('source') or '').lower()}|{str(train.get('id') or '').lower()}"
|
|
|
|
|
|
def _train_completeness(train: dict) -> tuple[int, int, int]:
|
|
return (
|
|
1 if train.get("speed_kmh") is not None else 0,
|
|
1 if train.get("heading") is not None else 0,
|
|
1 if train.get("route") else 0,
|
|
)
|
|
|
|
|
|
def _should_merge(existing: dict, candidate: dict) -> bool:
    """Return True when two records share a merge key and lie close together.

    "Close" means their great-circle separation is at most
    ``_MERGE_DISTANCE_KM``.
    """
    if _train_merge_key(existing) == _train_merge_key(candidate):
        gap_km = _haversine_km(
            float(existing["lat"]),
            float(existing["lng"]),
            float(candidate["lat"]),
            float(candidate["lng"]),
        )
        return gap_km <= _MERGE_DISTANCE_KM
    return False
|
|
|
|
|
|
def _merge_train_pair(existing: dict, candidate: dict) -> dict:
    """Merge two records for the same train into a new dict.

    The primary record is the one with the higher ``(_source_priority,
    completeness)`` score; ties keep *existing*. Fields that are missing on
    the primary (None, "", or the placeholder "Active") are filled from the
    other record. Whichever record carries the newer ``_observed_ts``
    supplies the position, regardless of which one is primary. Neither input
    is mutated.
    """
    existing_score = (
        int(existing.get("_source_priority") or 0),
        _train_completeness(existing),
    )
    candidate_score = (
        int(candidate.get("_source_priority") or 0),
        _train_completeness(candidate),
    )
    if candidate_score > existing_score:
        primary, secondary = candidate, existing
    else:
        primary, secondary = existing, candidate
    merged = dict(primary)

    for field in (
        "speed_kmh",
        "heading",
        "route",
        "status",
        "operator",
        "country",
        "source_label",
        "telemetry_quality",
    ):
        if merged.get(field) in (None, "", "Active"):
            replacement = secondary.get(field)
            if replacement not in (None, ""):
                merged[field] = replacement

    # Prefer the freshest position. The original only checked this when the
    # candidate lost the primary selection, so a newer position on `existing`
    # was discarded whenever the candidate won.
    if float(secondary.get("_observed_ts") or 0) > float(primary.get("_observed_ts") or 0):
        merged["lat"] = secondary["lat"]
        merged["lng"] = secondary["lng"]
        merged["_observed_ts"] = secondary["_observed_ts"]
    return merged
|
|
|
|
|
|
def _merge_nonredundant_trains(*sources: list[dict]) -> list[dict]:
    """Combine per-source train lists into one de-duplicated, sorted list.

    Each incoming record is folded into the first already-kept record with
    the same ``source`` and ``id``; failing that, into the first kept record
    that ``_should_merge`` accepts; otherwise it is kept as a new entry. The
    result is sorted by country, operator, number and id, and the private
    ``_source_priority`` / ``_observed_ts`` keys are removed.

    Note: records that are kept unmerged are the caller's own dict objects,
    so stripping the private keys mutates them in place.
    """

    def _sort_key(record: dict) -> tuple[str, str, str, str]:
        return (
            str(record.get("country") or ""),
            str(record.get("operator") or ""),
            str(record.get("number") or ""),
            str(record.get("id") or ""),
        )

    merged: list[dict] = []
    for batch in sources:
        for incoming in batch:
            slot = None
            # First preference: the exact same feed record seen again.
            for position, kept in enumerate(merged):
                if kept.get("source") == incoming.get("source") and kept.get("id") == incoming.get("id"):
                    slot = position
                    break
            # Second preference: same train reported by another record nearby.
            if slot is None:
                for position, kept in enumerate(merged):
                    if _should_merge(kept, incoming):
                        slot = position
                        break

            if slot is None:
                merged.append(incoming)
            else:
                merged[slot] = _merge_train_pair(merged[slot], incoming)

    merged.sort(key=_sort_key)
    for record in merged:
        for private_key in ("_source_priority", "_observed_ts"):
            record.pop(private_key, None)
    return merged
|
|
|
|
|
|
def _first_present(record: dict, *keys: str):
    """Return the first value in *record* under *keys* that is neither None nor ""."""
    for key in keys:
        value = record.get(key)
        if value is not None and value != "":
            return value
    return None


def _fetch_amtraker() -> list[dict]:
    """Fetch all active Amtrak trains from the Amtraker API.

    Returns a list of normalized train records. Any failure (non-200 status,
    unexpected payload shape, network or parsing error) is logged as a
    warning and yields an empty list; this function does not raise.
    """
    try:
        resp = fetch_with_curl(
            "https://api.amtraker.com/v3/trains",
            timeout=20,
            headers={
                # Identify honestly with the train-feed User-Agent, matching
                # _fetch_digitraffic; no spoofed browser UA or fake Referer.
                "User-Agent": _trains_user_agent(),
                "Accept": "application/json,text/plain,*/*",
            },
        )
        if resp.status_code != 200:
            logger.warning("Amtraker returned %s", resp.status_code)
            return []
        raw = resp.json()
        if not isinstance(raw, dict):
            logger.warning("Amtraker returned unexpected payload type %s", type(raw).__name__)
            return []
        trains: list[dict] = []
        for train_num, variants in raw.items():
            if not isinstance(variants, list):
                continue
            for item in variants:
                if not isinstance(item, dict):
                    # Skip a malformed entry instead of losing the whole batch.
                    continue
                normalized = _normalize_train(
                    source="amtrak",
                    raw_id=f"AMTK-{item.get('trainID', train_num)}",
                    name=item.get("routeName", f"Train {train_num}"),
                    number=str(item.get("trainNum", train_num) or train_num),
                    lat=item.get("lat"),
                    lng=item.get("lon"),
                    # _first_present keeps a legitimate 0 (stationary / due north),
                    # which `a or b` would have discarded.
                    # NOTE(review): upstream may report velocity in mph rather
                    # than km/h; confirm against the Amtraker docs and convert
                    # if so.
                    speed_kmh=_first_present(item, "velocity", "speed"),
                    heading=_first_present(item, "heading", "bearing"),
                    status=item.get("trainTimely") or "On Time",
                    route=item.get("routeName", ""),
                    observed_at=item.get("updatedAt")
                    or item.get("lastValTS")
                    or item.get("eventDT"),
                )
                if normalized:
                    trains.append(normalized)
        return trains
    except Exception as exc:
        # Deliberate best-effort boundary: a failing feed must not break the poll.
        logger.warning("Amtraker fetch error: %s", exc)
        return []
|
|
|
|
|
|
def _fetch_digitraffic() -> list[dict]:
    """Fetch live train positions from Finnish DigiTraffic API.

    Returns a list of normalized train records. Records without usable
    coordinates are skipped individually. Any failure (non-200 status,
    unexpected payload shape, network or parsing error) is logged as a
    warning and yields an empty list; this function does not raise.
    """
    try:
        resp = fetch_with_curl(
            "https://rata.digitraffic.fi/api/v1/train-locations/latest",
            timeout=15,
            headers={
                "Accept-Encoding": "gzip",
                "User-Agent": _trains_user_agent(),
            },
        )
        if resp.status_code != 200:
            logger.warning("DigiTraffic returned %s", resp.status_code)
            return []
        raw = resp.json()
        if not isinstance(raw, list):
            logger.warning("DigiTraffic returned unexpected payload type %s", type(raw).__name__)
            return []
        trains: list[dict] = []
        for item in raw:
            if not isinstance(item, dict):
                continue
            # `location` may be absent or null; either way skip just this
            # record instead of failing the whole batch.
            location = item.get("location")
            coords = location.get("coordinates") if isinstance(location, dict) else None
            if not isinstance(coords, (list, tuple)) or len(coords) < 2:
                continue
            # The code reads coordinates as (longitude, latitude).
            lon, lat = coords[0], coords[1]
            train_number = str(item.get("trainNumber", "") or "").strip()
            route_bits = [
                str(item.get("departureStationShortCode") or "").strip(),
                str(item.get("stationShortCode") or "").strip(),
            ]
            route = " -> ".join([bit for bit in route_bits if bit])
            train_type = str(item.get("trainType") or "").strip()
            normalized = _normalize_train(
                source="digitraffic",
                raw_id=f"FIN-{train_number or len(trains)}",
                name=f"{train_type} {train_number}".strip() or f"Train {train_number or '?'}",
                number=train_number,
                lat=lat,
                lng=lon,
                speed_kmh=item.get("speed"),
                heading=item.get("heading"),
                status="Active",
                route=route,
                observed_at=item.get("timestamp"),
            )
            if normalized:
                trains.append(normalized)
        return trains
    except Exception as exc:
        # Deliberate best-effort boundary: a failing feed must not break the poll.
        logger.warning("DigiTraffic fetch error: %s", exc)
        return []
|
|
|
|
|
|
# Registry of (source name, zero-argument fetcher) pairs that fetch_trains
# polls in this order. The name is used only in the summary log line.
_TRAIN_FETCHERS: tuple[tuple[str, Callable[[], list[dict]]], ...] = (
    ("amtrak", _fetch_amtraker),
    ("digitraffic", _fetch_digitraffic),
)
|
|
|
|
|
|
def fetch_trains():
    """Fetch trains from all configured sources and merge without duplicates.

    Polls every fetcher in ``_TRAIN_FETCHERS``, merges the batches, and
    publishes the result under ``latest_data["trains"]``. When the poll
    yields nothing but a previous list exists, the previous list is kept so
    the layer does not go blank on a transient outage.
    """
    # Snapshot the current list under the lock; the network calls below run
    # without holding it.
    with _data_lock:
        cached_trains = list(latest_data.get("trains") or [])

    batches: list[list[dict]] = []
    per_source_summary: list[str] = []
    for label, fetch in _TRAIN_FETCHERS:
        fetched = fetch()
        batches.append(fetched)
        if fetched:
            per_source_summary.append(f"{label}:{len(fetched)}")

    trains = _merge_nonredundant_trains(*batches)
    if cached_trains and not trains:
        logger.warning(
            "Train refresh returned 0 records — preserving %s cached trains until the next successful poll",
            len(cached_trains),
        )
        trains = cached_trains

    with _data_lock:
        latest_data["trains"] = trains
    # NOTE(review): called outside the lock; confirm _mark_fresh takes
    # _data_lock itself (or needs none).
    _mark_fresh("trains")

    suffix = f" ({', '.join(per_source_summary)})" if per_source_summary else ""
    logger.info("Trains: %s total%s", len(trains), suffix)
|