mirror of
https://github.com/BigBodyCobain/Shadowbroker.git
synced 2026-06-05 13:58:15 +02:00
363b5a49c8
- User-Agent is per-install handle only (no Shadowbroker product token) - LiveUAMap: Windows UI consent when enabling Global Incidents; env override - Meshtastic callsign upstream header off by default (opt-in true) - Expanded docs/OUTBOUND_DATA.md and README link for CCTV, basemap, Broadcastify Co-authored-by: Cursor <cursoragent@cursor.com>
280 lines
12 KiB
Python
280 lines
12 KiB
Python
"""Round 7a: per-install operator handle threads through every outbound
|
|
third-party API call.
|
|
|
|
Background: before this change every Shadowbroker install identified
|
|
itself to Wikipedia, Wikidata, Nominatim, GDELT, OpenMHz, Broadcastify,
|
|
weather.gov, NUFORC, etc. with a single project-wide ``Shadowbroker``
|
|
User-Agent. From the upstream's perspective, every install in the world
|
|
looked like one giant scraper. If one install misbehaved, the upstream's
|
|
only recourse was to block ``Shadowbroker`` as a whole, taking out every
|
|
other install.
|
|
|
|
Fix: each install gets a stable pseudonymous handle (auto-generated like
|
|
``operator-7f3a92`` or operator-overridden via ``OPERATOR_HANDLE``) that
|
|
gets embedded in the User-Agent for every outbound call. Upstreams can
|
|
now rate-limit / contact the specific operator instead of the project.
|
|
|
|
These tests pin:
|
|
|
|
1. The handle is auto-generated on first call if no override exists.
|
|
2. The handle survives process restart (persisted to disk).
|
|
3. ``OPERATOR_HANDLE`` env var override wins over the auto-gen handle.
|
|
4. The handle is sanitized (whitespace, special chars, length).
|
|
5. Every previously-MONSTER-UA call site now sends the per-operator UA.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.fixture
def isolated_handle(tmp_path, monkeypatch):
    """Yield ``services.network_utils`` wired to a throwaway handle file.

    The module's ``_OPERATOR_HANDLE_FILE`` is pointed at a file under
    ``tmp_path``, ``OPERATOR_HANDLE`` is removed from the environment, and
    both the operator-handle cache and the settings cache are cleared before
    the test runs and again after it finishes.
    """
    from services import network_utils as nu

    # Persist into the per-test temp directory instead of the real location.
    monkeypatch.setattr(
        nu, "_OPERATOR_HANDLE_FILE", tmp_path.joinpath("operator_handle.json")
    )
    nu._reset_operator_handle_cache_for_tests()
    monkeypatch.delenv("OPERATOR_HANDLE", raising=False)

    # Drop the settings cache so OPERATOR_HANDLE env changes are picked up.
    from services.config import get_settings as settings_factory

    settings_factory.cache_clear()

    yield nu

    # Teardown: leave no cached handle or settings behind for the next test.
    nu._reset_operator_handle_cache_for_tests()
    settings_factory.cache_clear()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Core handle generation / persistence / override
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestOperatorHandleGeneration:
    """Pin generation, persistence, env override and sanitization of the handle."""

    def test_auto_generates_on_first_call(self, isolated_handle):
        """With no override, the handle is ``operator-`` plus six hex digits."""
        h = isolated_handle.get_operator_handle()
        # Prefix is "operator-" (deliberately neutral; "shadow-" looked
        # exactly like a pattern abuse-detection systems would auto-block).
        assert h.startswith("operator-")
        assert len(h) == len("operator-") + 6
        suffix = h.split("-", 1)[1]
        # Check digit by digit: int(suffix, 16) would also accept a sign, a
        # "0x" prefix, underscores and surrounding whitespace.
        assert all(ch in "0123456789abcdefABCDEF" for ch in suffix), (
            f"suffix {suffix!r} is not pure hex"
        )

    def test_persists_to_disk_so_handle_survives_restart(self, isolated_handle):
        """The handle is written to the handle file and re-read after a cache reset."""
        first = isolated_handle.get_operator_handle()
        # Simulate process restart: clear in-memory cache, then ask again.
        isolated_handle._reset_operator_handle_cache_for_tests()
        second = isolated_handle.get_operator_handle()
        assert second == first
        # The file actually exists.
        assert isolated_handle._OPERATOR_HANDLE_FILE.exists()
        body = json.loads(isolated_handle._OPERATOR_HANDLE_FILE.read_text())
        assert body["handle"] == first

    def test_env_override_wins_over_auto_generated(self, isolated_handle, monkeypatch):
        """``OPERATOR_HANDLE`` takes precedence over an already generated handle."""
        # First call without env var auto-generates.
        auto = isolated_handle.get_operator_handle()
        assert auto.startswith("operator-")
        # Setting the env var changes the resolved handle once both caches
        # (settings and handle) have been cleared.
        monkeypatch.setenv("OPERATOR_HANDLE", "alice")
        from services.config import get_settings

        get_settings.cache_clear()
        isolated_handle._reset_operator_handle_cache_for_tests()
        assert isolated_handle.get_operator_handle() == "alice"

    def test_handle_is_sanitized(self, isolated_handle, monkeypatch):
        """``_normalize_handle`` lowercases, strips and collapses unsafe input."""
        # Sanitization tests run against the normalizer directly so the
        # empty-string case can be asserted independently of the env-var
        # resolution path (where empty means "use auto-gen", not "use
        # 'anonymous'").
        from services.network_utils import _normalize_handle

        cases = [
            ("Alice Smith", "alice-smith"),
            ("user@example.com", "user-example-com"),
            (" whitespace ", "whitespace"),
            ("UPPER-CASE", "upper-case"),
            ("multiple---dashes", "multiple-dashes"),
            ("/leading/slash", "leading-slash"),
            ("trailing-", "trailing"),
            ("", "anonymous"),
        ]
        for raw, expected in cases:
            got = _normalize_handle(raw)
            assert got == expected, f"{raw!r} -> {got!r}, expected {expected!r}"
            assert got == got.lower()
            for ch in got:
                assert ch.isalnum() or ch in "-_", f"unsafe char {ch!r} in {got!r}"
            assert "--" not in got

    def test_handle_is_length_capped(self, isolated_handle, monkeypatch):
        """An oversized ``OPERATOR_HANDLE`` resolves to at most 48 characters."""
        from services.config import get_settings

        monkeypatch.setenv("OPERATOR_HANDLE", "x" * 1000)
        get_settings.cache_clear()
        isolated_handle._reset_operator_handle_cache_for_tests()
        got = isolated_handle.get_operator_handle()
        assert len(got) <= 48
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# outbound_user_agent() builds the right header
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestOutboundUserAgentString:
    """Shape of the string returned by ``outbound_user_agent()``."""

    def test_ua_is_operator_handle(self, isolated_handle):
        """Called without a purpose, the UA equals the operator handle."""
        user_agent = isolated_handle.outbound_user_agent()
        assert user_agent == isolated_handle.get_operator_handle()

    def test_includes_purpose_when_provided(self, isolated_handle):
        """A purpose argument is appended in parentheses after the handle."""
        user_agent = isolated_handle.outbound_user_agent("wikipedia")
        handle = isolated_handle.get_operator_handle()
        expected = f"{handle} (purpose: wikipedia)"
        assert user_agent == expected

    def test_no_shadowbroker_product_token(self, isolated_handle):
        """The project name must not appear in the UA, in any letter case."""
        lowered = isolated_handle.outbound_user_agent("nominatim").lower()
        assert "shadowbroker" not in lowered
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Wikipedia / Wikidata — retroactive fix for PR #284's MONSTER pattern
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestWikimediaCallsAreNowPerOperator:
    """Wikidata and Wikipedia fetches must send the per-operator UA."""

    def test_wikidata_call_uses_per_operator_ua(self, isolated_handle, monkeypatch):
        """Both UA headers of the Wikidata request start with the handle."""
        from services import region_dossier

        seen_headers = []

        class _StubResponse:
            status_code = 200

            def json(self):
                return {"results": {"bindings": []}}

        def recording_fetch(url, **kwargs):
            # Record the headers keyword (or an empty dict) of every call.
            seen_headers.append(kwargs.get("headers") or {})
            return _StubResponse()

        monkeypatch.setattr(region_dossier, "fetch_with_curl", recording_fetch)
        region_dossier._fetch_wikidata_leader("Testlandia")

        assert seen_headers, "Wikidata fetcher was not called"
        first_call = seen_headers[0]
        assert "User-Agent" in first_call
        assert "Api-User-Agent" in first_call
        expected_prefix = isolated_handle.get_operator_handle()
        for key in ("User-Agent", "Api-User-Agent"):
            header_value = first_call[key]
            assert header_value.startswith(expected_prefix), (
                f"Wikimedia UA must be the per-operator handle; got {header_value!r}"
            )

    def test_wikipedia_summary_uses_per_operator_ua(self, isolated_handle, monkeypatch):
        """Every request to a wikipedia.org URL carries the handle as UA prefix."""
        from services import region_dossier

        calls = []

        class _StubResponse:
            status_code = 200

            def json(self):
                payload = {
                    "type": "standard",
                    "description": "x",
                    "extract": "y",
                    "thumbnail": {"source": ""},
                }
                return payload

        def recording_fetch(url, **kwargs):
            # Keep the URL too, so non-Wikipedia requests can be filtered out.
            calls.append((url, kwargs.get("headers") or {}))
            return _StubResponse()

        monkeypatch.setattr(region_dossier, "fetch_with_curl", recording_fetch)
        region_dossier._fetch_local_wiki_summary("Paris", "France")

        wikipedia_hits = [
            (target, sent) for target, sent in calls if "wikipedia.org" in target
        ]
        assert wikipedia_hits, "Wikipedia summary fetch was not called"
        for _target, sent in wikipedia_hits:
            handle = isolated_handle.get_operator_handle()
            ua = sent.get("User-Agent", "")
            assert ua.startswith(handle), f"Wikipedia UA must be the operator handle; got {ua!r}"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Generic round-7a regression guard
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestNoMonsterUserAgentRemains:
    """Regression guard against project-wide User-Agent literals.

    The audit's underlying concern was that every Shadowbroker install
    looked like one entity. This test scans the ``*.py`` files under the
    backend root for the OLD aggregate identifier patterns and fails if one
    sneaks back into a header assignment.

    The literals may still appear in:
    - ``#`` comment lines (audit prose, change-log notes)
    - anything under ``tests/``
    - non-Python files such as .env.example (only ``*.py`` is scanned)

    A line is reported only when it is not a ``#`` comment line and contains
    both a banned literal and the quoted ``"User-Agent"`` / ``'User-Agent'``
    key. The check is per line, so a header dict whose key and value sit on
    different lines is not detected.
    """

    BANNED_LITERALS = (
        "Shadowbroker/",
        "ShadowBroker-OSINT/1.0",
        "ShadowBroker-OSINT/0.9",
        "ShadowBroker-FeedIngester/1.0",
        "ShadowBroker/0.9.79 local Shodan connector",
        "ShadowBroker/0.9.79 Finnhub connector",
        "ShadowBroker/0.9.8 local Shodan connector",
        "ShadowBroker/0.9.8 Finnhub connector",
        "ShadowBroker/0.9.81 local Shodan connector",
        "ShadowBroker/0.9.81 Finnhub connector",
        "Mozilla/5.0 (compatible; ShadowBroker CCTV proxy)",
    )

    def test_no_banned_aggregate_user_agent_strings(self):
        """Fail if a banned literal shares a line with a quoted User-Agent key."""
        backend_root = Path(__file__).parent.parent
        offenders = []
        for py in backend_root.rglob("*.py"):
            # Test files may quote the banned literals freely.
            rel = py.relative_to(backend_root).as_posix()
            if rel.startswith("tests/"):
                continue
            text = py.read_text(encoding="utf-8", errors="ignore")
            # Cheap whole-file pre-check so clean files are never split.
            if not any(banned in text for banned in self.BANNED_LITERALS):
                continue
            # Single pass over the lines: each offending line is reported
            # once, however many banned literals it contains.
            for i, line in enumerate(text.splitlines(), 1):
                stripped = line.strip()
                # Comment lines are allowed to mention the old identifiers.
                if stripped.startswith("#"):
                    continue
                # Only lines that look like a header assignment count.
                if '"User-Agent"' not in line and "'User-Agent'" not in line:
                    continue
                if any(banned in line for banned in self.BANNED_LITERALS):
                    offenders.append(f"{rel}:{i}: {stripped[:120]}")
        assert not offenders, (
            "Round 7a regression: the following lines reintroduced an "
            "aggregate Shadowbroker User-Agent. Use "
            "outbound_user_agent('purpose') instead so the per-install "
            "operator handle is embedded.\n"
            + "\n".join(offenders)
        )
|