mirror of
https://github.com/BigBodyCobain/Shadowbroker.git
synced 2026-05-28 18:11:31 +02:00
76750caa92
== Per-install operator handle for every third-party API call ==
Before this PR, every Shadowbroker install identified itself to
Wikipedia, Wikidata, Nominatim, GDELT, OpenMHz, Broadcastify,
weather.gov, NUFORC, Sentinel/Planetary Computer, TinyGS / CelesTrak,
Shodan, Finnhub, and others with a single project-wide User-Agent
("Shadowbroker/1.0" or "ShadowBroker-OSINT/1.0"). From the upstream's
perspective every install in the world looked like one giant scraper.
If one install misbehaved, the upstream's only recourse was to block
"Shadowbroker" as a whole.
PR #284 inadvertently doubled down on this in the frontend by
introducing a shared `WIKIMEDIA_API_USER_AGENT` constant. This PR
retrofits both backends to per-operator attribution.
New setting: OPERATOR_HANDLE (env var / settings UI / auto-gen)
New helper: network_utils.outbound_user_agent("purpose")
The handle is auto-generated as "operator-XXXXXX" on first call (the
"shadow-" prefix from earlier drafts was deliberately dropped — too
suspicious-looking for abuse-detection systems). Operators can
override via OPERATOR_HANDLE; the value is sanitized to lowercase
alphanumeric+dash+underscore and capped at 48 chars. Persisted to
backend/data/operator_handle.json so it survives container restarts.
Retrofitted call sites (every place that previously sent the shared, project-wide User-Agent):
- services/region_dossier.py (Wikipedia + Wikidata + Nominatim)
- services/geocode.py (Nominatim)
- services/sentinel_search.py (Microsoft Planetary Computer)
- services/feed_ingester.py (operator-curated RSS feeds)
- services/fetchers/earth_observation.py (weather.gov, NUFORC)
- services/fetchers/infrastructure.py
- services/fetchers/aircraft_database.py
- services/fetchers/route_database.py
- services/fetchers/trains.py
- services/fetchers/meshtastic_map.py
- services/shodan_connector.py
- services/unusual_whales_connector.py (Finnhub)
- services/tinygs_fetcher.py (CelesTrak + TinyGS)
- services/sar/sar_products_client.py
- services/geopolitics.py (GDELT)
- services/radio_intercept.py (Broadcastify + OpenMHz)
- routers/cctv.py + main.py (CCTV proxy)
- routers/ai_intel.py
- scripts/convert_power_plants.py (release-time data refresh)
Spoofed browser UAs removed (issues #289 / #290 / #291 — tg12 audit):
- cloudscraper-based Chrome impersonation against api.openmhz.com
-> replaced with honest requests + per-install UA
- Mozilla/5.0 spoofed UA on Broadcastify scrape
-> replaced with honest UA
- Mozilla/5.0 + fake first-party Referer on OpenMHz audio relay
-> replaced with honest UA
- cloudscraper dependency dropped from pyproject.toml + uv.lock
Frontend retrofit:
- new GET /api/settings/operator-handle endpoint (local-operator
gated) returns the install's handle
- frontend/src/lib/wikimediaClient.ts fetches the handle once on
first use, caches it for page lifetime, embeds it in the
Api-User-Agent for every Wikipedia / Wikidata browser-direct call
== GDELT GCS-direct fix ==
GDELT's data.gdeltproject.org is a CNAME to a Google Cloud Storage
bucket. GCS responds with the wildcard *.storage.googleapis.com cert
which legitimately does NOT cover the GDELT custom domain, so Python's
TLS verification correctly refuses the connection. Some networks
happen to route through a path where this works; many (notably Docker
Desktop's outbound NAT on local installs) do not. Verified on the
maintainer's local install: GDELT was unreachable; 1610 geopolitical
events / 48 export files were dropping silently.
Fix: services/geopolitics._gcs_direct_gdelt_url() rewrites any
data.gdeltproject.org URL to its GCS-direct equivalent
(storage.googleapis.com/data.gdeltproject.org/...) where the standard
GCS cert is genuinely valid. api.gdeltproject.org and every other host
are left untouched.
Confirmed live: backend log goes from
GDELT lastupdate failed: 500
to
Downloading 48 GDELT export files...
Downloaded 48/48 GDELT exports
GDELT parsed: 1610 conflict locations from 48 files
== Tests ==
backend/tests/test_per_operator_outbound_attribution.py (12 tests)
backend/tests/test_gdelt_gcs_direct_rewrite.py (6 tests)
backend/tests/test_region_dossier_wikimedia_ua.py (updated to
pin the helper + per-operator handle, not the old constant)
frontend/src/__tests__/utils/wikimediaClient.test.ts (rewritten
to mock /api/settings/operator-handle and assert per-operator UA)
Local: backend 114/114 security+audit+round7a suite green;
frontend 718/718 vitest suite green.
Credit: tg12 (external security audit, issues #289/#290/#291
relating to spoofed UAs); BigBodyCobain (operator-prefix call,
GDELT cloud-vs-local diagnosis).
321 lines
12 KiB
Python
321 lines
12 KiB
Python
"""
|
|
Local-only Shodan connector.
|
|
|
|
This module intentionally does NOT merge Shodan results into the dashboard's
|
|
canonical live-data store. It exposes manual, operator-triggered lookups that
|
|
can be rendered locally in the UI as a temporary investigative overlay.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import threading
|
|
import time
|
|
from typing import Any
|
|
|
|
import requests
|
|
from cachetools import TTLCache
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_SHODAN_BASE = "https://api.shodan.io"
|
|
# Round 7a: per-install attribution. Shodan already has the operator API
|
|
# key for billing, but the UA still identifies the install.
|
|
def _shodan_user_agent():
    """Return the User-Agent header value sent with Shodan API requests.

    The value is whatever ``outbound_user_agent("shodan")`` produces. The
    helper is imported when this function is called, not when the module
    is imported.
    """
    from services.network_utils import outbound_user_agent as _build_user_agent

    return _build_user_agent("shodan")
|
|
_REQUEST_TIMEOUT = 15
|
|
_MIN_INTERVAL_SECONDS = 1.05 # Shodan docs say API plans are rate limited to ~1 req/sec.
|
|
_DEFAULT_SEARCH_PAGES = 1
|
|
_MAX_SEARCH_PAGES = 2
|
|
|
|
_search_cache: TTLCache[str, dict[str, Any]] = TTLCache(maxsize=24, ttl=90)
|
|
_count_cache: TTLCache[str, dict[str, Any]] = TTLCache(maxsize=24, ttl=120)
|
|
_host_cache: TTLCache[str, dict[str, Any]] = TTLCache(maxsize=32, ttl=300)
|
|
|
|
_request_lock = threading.Lock()
|
|
_last_request_at = 0.0
|
|
|
|
|
|
class ShodanConnectorError(Exception):
    """Connector failure carrying a message and an HTTP-style status code."""

    def __init__(self, detail: str, status_code: int = 400):
        """Store the message and status code.

        Args:
            detail: Human-readable description; also used as the exception
                message, so ``str(exc)`` returns it.
            status_code: Numeric status attached to the error (default 400).
        """
        super().__init__(detail)
        # Both values are kept as plain attributes for callers to read.
        self.status_code = status_code
        self.detail = detail
|
|
|
|
|
|
def _get_api_key() -> str:
    """Return the Shodan API key from the ``SHODAN_API_KEY`` environment variable.

    Surrounding whitespace is stripped from the value.

    Raises:
        ShodanConnectorError: With status 428 when the variable is unset or
            contains only whitespace.
    """
    configured_key = os.environ.get("SHODAN_API_KEY", "").strip()
    if configured_key:
        return configured_key
    raise ShodanConnectorError(
        "Shodan API key not configured. Add SHODAN_API_KEY in Settings > API Keys.",
        status_code=428,
    )
|
|
|
|
|
|
def _clean_query(value: str | None) -> str:
    """Strip and validate an operator-supplied Shodan query.

    Args:
        value: Raw query text; ``None`` is treated as an empty string.

    Returns:
        The query with leading and trailing whitespace removed.

    Raises:
        ShodanConnectorError: With status 400 when the stripped query is
            empty, or when it still contains a line feed or carriage return.
    """
    stripped = "" if not value else value.strip()
    if stripped == "":
        raise ShodanConnectorError("Shodan query cannot be empty.", status_code=400)
    if any(line_break in stripped for line_break in ("\n", "\r")):
        raise ShodanConnectorError("Shodan query must be a single line.", status_code=400)
    return stripped
|
|
|
|
|
|
def _cache_key(prefix: str, payload: dict[str, Any]) -> str:
    """Build a deterministic cache key from a prefix and a parameter mapping.

    Every key and value is converted with ``str`` and the resulting pairs are
    sorted, so the key does not depend on the mapping's insertion order.

    Returns:
        ``"<prefix>:<repr of the sorted tuple of (key, value) pairs>"``.
    """
    pairs = [(str(name), str(setting)) for name, setting in payload.items()]
    pairs.sort()
    return f"{prefix}:{tuple(pairs)!r}"
|
|
|
|
|
|
def _normalize_string_list(values: Any, limit: int = 10) -> list[str]:
    """Return up to ``limit`` non-empty, stripped strings taken from a list.

    Args:
        values: Candidate list. Anything that is not a ``list`` yields ``[]``.
        limit: Maximum number of strings to return. Zero or a negative
            number yields ``[]``.

    Returns:
        The entries converted with ``str`` and stripped, in their original
        order. ``None`` entries and entries that are empty after stripping
        are skipped.
    """
    # A non-positive limit must yield nothing; checking the cap only after
    # appending would let one item through.
    if not isinstance(values, list) or limit <= 0:
        return []
    cleaned: list[str] = []
    for item in values:
        # str(None) is the literal "None", which is not real data.
        if item is None:
            continue
        text = str(item).strip()
        if not text:
            continue
        cleaned.append(text)
        if len(cleaned) >= limit:
            break
    return cleaned
|
|
|
|
|
|
def _location_label(location: dict[str, Any]) -> str | None:
    """Join city, region code and country code into one display label.

    The ``city``, ``region_code`` and ``country_code`` entries are read in
    that order, converted with ``str`` and stripped. Entries that are missing,
    falsy, or blank after stripping are left out.

    Returns:
        The remaining parts joined with ``", "``, or ``None`` when no part
        is left.
    """
    pieces: list[str] = []
    for field in ("city", "region_code", "country_code"):
        piece = str(location.get(field) or "").strip()
        if piece:
            pieces.append(piece)
    if not pieces:
        return None
    return ", ".join(pieces)
|
|
|
|
|
|
def _normalize_match(match: dict[str, Any]) -> dict[str, Any]:
    """Flatten one search match into the record shape returned to callers.

    Args:
        match: A single entry from the ``matches`` list of a search response.

    Returns:
        A dict with identity fields (``id``, ``ip``, ``port``), location
        fields, organisation fields, capped string lists and a banner excerpt
        of at most 280 characters.
    """
    location = match.get("location") or {}
    latitude = location.get("latitude")
    longitude = location.get("longitude")
    port = match.get("port")

    # Prefer the dotted string form; fall back to whatever "ip" holds.
    ip_str = str(match.get("ip_str") or match.get("ip") or "").strip()

    # When "vulns" is a mapping only its keys are kept; otherwise the value
    # is passed through the list normaliser as-is.
    raw_vulns = match.get("vulns") or []
    vuln_source = list(raw_vulns.keys()) if isinstance(raw_vulns, dict) else raw_vulns

    banner = str(match.get("data") or "").strip()[:280]

    return {
        "id": f"shodan-{ip_str or 'unknown'}-{port or 'na'}",
        "ip": ip_str or "UNKNOWN",
        "port": port,
        "transport": match.get("transport"),
        "timestamp": match.get("timestamp"),
        # Coordinates are only kept when they are numeric.
        "lat": latitude if isinstance(latitude, (int, float)) else None,
        "lng": longitude if isinstance(longitude, (int, float)) else None,
        "city": location.get("city"),
        "region_code": location.get("region_code"),
        "country_code": location.get("country_code"),
        "country_name": location.get("country_name"),
        "location_label": _location_label(location),
        "asn": match.get("asn"),
        "org": match.get("org"),
        "isp": match.get("isp"),
        "product": match.get("product"),
        "os": match.get("os"),
        "hostnames": _normalize_string_list(match.get("hostnames")),
        "domains": _normalize_string_list(match.get("domains")),
        "tags": _normalize_string_list(match.get("tags")),
        "vulns": _normalize_string_list(vuln_source, limit=12),
        "data_snippet": banner or None,
        "attribution": "Data from Shodan",
    }
|
|
|
|
|
|
def _normalize_services(items: Any) -> list[dict[str, Any]]:
    """Reduce a host's service entries to a compact per-service summary.

    Args:
        items: Candidate list of service dicts. Anything that is not a
            ``list`` yields ``[]``.

    Returns:
        One summary dict per dict entry among the first 30 items. Non-dict
        entries are skipped. Each summary carries at most 8 tags and a
        banner excerpt of at most 320 characters (``None`` when blank).
    """
    if not isinstance(items, list):
        return []
    return [
        {
            "port": entry.get("port"),
            "transport": entry.get("transport"),
            "product": entry.get("product"),
            "timestamp": entry.get("timestamp"),
            "tags": _normalize_string_list(entry.get("tags"), limit=8),
            "banner_excerpt": str(entry.get("data") or "").strip()[:320] or None,
        }
        for entry in items[:30]
        if isinstance(entry, dict)
    ]
|
|
|
|
|
|
def _normalize_facets(raw_facets: Any) -> dict[str, list[dict[str, Any]]]:
    """Convert a raw facet mapping into ``{name: [{"value", "count"}, ...]}``.

    Args:
        raw_facets: Candidate mapping of facet name to a list of bucket
            dicts. Anything that is not a ``dict`` yields ``{}``.

    Returns:
        For every facet whose value is a list, the first 12 buckets, keeping
        only dict buckets. Each is reduced to a string ``value`` and an
        integer ``count``. A missing ``value`` becomes ``""``; a missing or
        unparseable ``count`` becomes ``0``.
    """
    if not isinstance(raw_facets, dict):
        return {}
    normalized: dict[str, list[dict[str, Any]]] = {}
    for key, bucket_list in raw_facets.items():
        if not isinstance(bucket_list, list):
            continue
        buckets: list[dict[str, Any]] = []
        for bucket in bucket_list[:12]:
            if not isinstance(bucket, dict):
                continue
            value = bucket.get("value")
            try:
                count = int(bucket.get("count") or 0)
            except (TypeError, ValueError, OverflowError):
                # One malformed bucket must not abort the whole response.
                count = 0
            buckets.append(
                {
                    # Only None maps to ""; a legitimate falsy value such as
                    # 0 must survive as "0".
                    "value": "" if value is None else str(value),
                    "count": count,
                }
            )
        normalized[str(key)] = buckets
    return normalized
|
|
|
|
|
|
def _request(
    path: str,
    *,
    params: dict[str, Any],
    cache: TTLCache[str, dict[str, Any]] | None = None,
) -> dict[str, Any]:
    """Perform a rate-limited GET against the Shodan API and return its JSON.

    Args:
        path: API path appended to ``_SHODAN_BASE`` (for example
            ``"/shodan/host/search"``).
        params: Query parameters. The API key is added as ``key``.
        cache: Optional TTL cache. A hit is returned without any network
            call; a successful response is stored in it.

    Returns:
        The decoded JSON object.

    Raises:
        ShodanConnectorError: 428 when no API key is configured (raised by
            ``_get_api_key``); 401, 402 or 429 for the matching HTTP
            statuses; the upstream status for any other status >= 400; 502
            when the body is not valid JSON or is not a JSON object.

    Exceptions raised by ``requests.get`` itself (timeouts, connection
    failures) are not translated and propagate to the caller.
    """
    global _last_request_at

    api_key = _get_api_key()
    payload = {**params, "key": api_key}
    # The API key is deliberately left out of the cache key.
    cache_key = _cache_key(path, {k: v for k, v in payload.items() if k != "key"})

    if cache is not None:
        # Single lookup: an "in" check followed by indexing can raise
        # KeyError when the TTL entry expires between the two steps.
        cached = cache.get(cache_key)
        if cached is not None:
            return cached

    # The lock serialises outbound calls so the minimum interval between
    # requests holds across threads.
    with _request_lock:
        elapsed = time.monotonic() - _last_request_at
        if elapsed < _MIN_INTERVAL_SECONDS:
            time.sleep(_MIN_INTERVAL_SECONDS - elapsed)
        try:
            response = requests.get(
                f"{_SHODAN_BASE}{path}",
                params=payload,
                timeout=_REQUEST_TIMEOUT,
                headers={"User-Agent": _shodan_user_agent(), "Accept": "application/json"},
            )
        finally:
            # Recorded even when the call raises, so a failed attempt still
            # counts toward the interval.
            _last_request_at = time.monotonic()

    if response.status_code == 401:
        raise ShodanConnectorError("Shodan rejected the API key. Check SHODAN_API_KEY.", 401)
    if response.status_code == 402:
        raise ShodanConnectorError(
            "Shodan returned payment/plan required. This feature needs a paid Shodan API plan.",
            402,
        )
    if response.status_code == 429:
        raise ShodanConnectorError(
            "Shodan rate limit reached. Slow down queries and try again shortly.",
            429,
        )
    if response.status_code >= 400:
        detail = response.text.strip()[:240] or "Unexpected Shodan API error."
        raise ShodanConnectorError(f"Shodan request failed: {detail}", response.status_code)

    try:
        parsed = response.json()
    except ValueError as exc:
        raise ShodanConnectorError(f"Shodan returned invalid JSON: {exc}", 502) from exc

    # Callers use dict methods on the result, so a JSON array or scalar is
    # rejected here instead of being cached and failing later.
    if not isinstance(parsed, dict):
        raise ShodanConnectorError("Shodan returned an unexpected response shape.", 502)

    if cache is not None:
        cache[cache_key] = parsed
    return parsed
|
|
|
|
|
|
def get_shodan_connector_status() -> dict[str, Any]:
    """Describe the connector: whether a key is configured, plus its limits.

    No network call is made. ``configured`` is true when the
    ``SHODAN_API_KEY`` environment variable holds a non-blank value; the
    remaining fields are fixed descriptors and the module's paging and
    cooldown constants.
    """
    configured = bool(os.environ.get("SHODAN_API_KEY", "").strip())
    warning = (
        "Shodan is a paid API. Searches use your local SHODAN_API_KEY, results stay local to "
        "your ShadowBroker session by default, and any downstream use is your responsibility."
    )
    limits = {
        "default_pages_per_search": _DEFAULT_SEARCH_PAGES,
        "max_pages_per_search": _MAX_SEARCH_PAGES,
        "cooldown_seconds": _MIN_INTERVAL_SECONDS,
    }
    status: dict[str, Any] = {
        "ok": True,
        "configured": configured,
        "source": "Shodan",
        "mode": "operator-supplied local overlay",
        "paid_api": True,
        "manual_only": True,
        "background_polling": False,
        "local_only": True,
        "attribution": "Data from Shodan",
        "warning": warning,
        "limits": limits,
    }
    return status
|
|
|
|
|
|
def search_shodan(query: str, page: int = 1, facets: list[str] | None = None) -> dict[str, Any]:
    """Run an operator-triggered Shodan host search and normalise the result.

    Args:
        query: Search expression; validated by ``_clean_query``.
        page: Requested result page, clamped to ``1.._MAX_SEARCH_PAGES``.
            A falsy value is treated as 1.
        facets: Optional facet names. Blank entries are dropped and at most
            six are sent.

    Returns:
        A dict with the cleaned query, the effective page, the total hit
        count, the normalised matches and the normalised facets.

    Raises:
        ShodanConnectorError: Propagated from ``_clean_query`` or ``_request``.
    """
    cleaned_query = _clean_query(query)

    # Clamp the page into the supported window.
    safe_page = min(int(page or 1), _MAX_SEARCH_PAGES)
    if safe_page < 1:
        safe_page = 1

    facet_names: list[str] = []
    for facet in facets or []:
        if str(facet).strip():
            facet_names.append(str(facet).strip())
    facet_names = facet_names[:6]

    params: dict[str, Any] = {"query": cleaned_query, "page": safe_page}
    if facet_names:
        params["facets"] = ",".join(facet_names)

    raw = _request("/shodan/host/search", params=params, cache=_search_cache)

    matches = []
    for candidate in raw.get("matches") or []:
        if isinstance(candidate, dict):
            matches.append(_normalize_match(candidate))

    return {
        "ok": True,
        "source": "Shodan",
        "attribution": "Data from Shodan",
        "query": cleaned_query,
        "page": safe_page,
        "total": int(raw.get("total") or 0),
        "matches": matches,
        "facets": _normalize_facets(raw.get("facets")),
        "note": "Operator-triggered Shodan results. Not part of ShadowBroker core feeds.",
    }
|
|
|
|
|
|
def count_shodan(query: str, facets: list[str] | None = None) -> dict[str, Any]:
    """Run a count/facets-only Shodan query and normalise the result.

    Args:
        query: Search expression; validated by ``_clean_query``.
        facets: Optional facet names. Blank entries are dropped and at most
            eight are sent.

    Returns:
        A dict with the cleaned query, the total hit count and the
        normalised facets.

    Raises:
        ShodanConnectorError: Propagated from ``_clean_query`` or ``_request``.
    """
    cleaned_query = _clean_query(query)

    facet_names: list[str] = []
    for facet in facets or []:
        if str(facet).strip():
            facet_names.append(str(facet).strip())
    facet_names = facet_names[:8]

    params: dict[str, Any] = {"query": cleaned_query}
    if facet_names:
        params["facets"] = ",".join(facet_names)

    raw = _request("/shodan/host/count", params=params, cache=_count_cache)

    total = int(raw.get("total") or 0)
    facet_summary = _normalize_facets(raw.get("facets"))
    return {
        "ok": True,
        "source": "Shodan",
        "attribution": "Data from Shodan",
        "query": cleaned_query,
        "total": total,
        "facets": facet_summary,
        "note": "Count/facets query only. No persistent ShadowBroker storage.",
    }
|
|
|
|
|
|
def lookup_shodan_host(ip: str, history: bool = False) -> dict[str, Any]:
    """Look up a single host on Shodan by IP address and normalise the result.

    Args:
        ip: IPv4 or IPv6 address to look up; surrounding whitespace is ignored.
        history: When true, ``history=true`` is sent with the request.

    Returns:
        A dict whose ``host`` entry holds the normalised host record
        (location, organisation fields, ports, services, vulns).

    Raises:
        ShodanConnectorError: 400 when ``ip`` is empty or is not a valid IP
            address; otherwise whatever ``_request`` raises.
    """
    import ipaddress

    clean_ip = str(ip or "").strip()
    if not clean_ip:
        raise ShodanConnectorError("Host lookup requires an IP address.", 400)
    # The value is interpolated into the request path, so anything that is
    # not a literal IP address (for example "../account/profile" or
    # "search?query=x") is rejected before it can reach another endpoint.
    try:
        ipaddress.ip_address(clean_ip)
    except ValueError as exc:
        raise ShodanConnectorError(
            "Host lookup requires a valid IPv4 or IPv6 address.", 400
        ) from exc

    raw = _request(
        f"/shodan/host/{clean_ip}",
        params={"history": "true" if history else "false"},
        cache=_host_cache,
    )

    location = raw.get("location") or {}
    latitude = location.get("latitude")
    longitude = location.get("longitude")

    # When "vulns" is a mapping only its keys are kept; otherwise the value
    # is passed through the list normaliser as-is.
    raw_vulns = raw.get("vulns")
    vuln_source = list(raw_vulns.keys()) if isinstance(raw_vulns, dict) else raw_vulns

    host = {
        "id": f"shodan-{clean_ip}-host",
        "ip": str(raw.get("ip_str") or clean_ip),
        # Coordinates are only kept when they are numeric.
        "lat": latitude if isinstance(latitude, (int, float)) else None,
        "lng": longitude if isinstance(longitude, (int, float)) else None,
        "city": location.get("city"),
        "region_code": location.get("region_code"),
        "country_code": location.get("country_code"),
        "country_name": location.get("country_name"),
        "location_label": _location_label(location),
        "asn": raw.get("asn"),
        "org": raw.get("org"),
        "isp": raw.get("isp"),
        "os": raw.get("os"),
        "hostnames": _normalize_string_list(raw.get("hostnames")),
        "domains": _normalize_string_list(raw.get("domains")),
        "tags": _normalize_string_list(raw.get("tags")),
        "ports": [int(p) for p in (raw.get("ports") or []) if isinstance(p, int)],
        "services": _normalize_services(raw.get("data")),
        "vulns": _normalize_string_list(vuln_source, limit=20),
        "attribution": "Data from Shodan",
    }
    return {
        "ok": True,
        "source": "Shodan",
        "attribution": "Data from Shodan",
        "host": host,
        "history": bool(history),
        "note": "Operator-triggered Shodan host lookup. Not merged into ShadowBroker datasets.",
    }
|