Files
Shadowbroker/backend/services/fetchers/fimi.py
T
BigBodyCobain 71a9d9e144 [security] Close post-#227 control-surface and fetcher gaps
PR #227 hardened most Wormhole/Infonet control surfaces behind
require_local_operator and made the CrowdThreat fetcher opt-in. An
audit of the codebase against that PR's stated goals turned up four
classes of gap that the original change missed:

1. Two operator-only endpoints were left unprotected:
   - POST /api/wormhole/join: calls bootstrap_wormhole_identity() and
     flips the node into Tor mode, exactly the surface #227 hardened
     on /api/wormhole/identity/bootstrap.
   - POST /api/sigint/transmit: relays APRS-IS packets over radio
     using operator-supplied credentials. Anything that reached the
     API could transmit on the operator's authority.

   Both are now gated behind require_local_operator. test_control_surface_auth.py
   was extended with regression coverage for both.

2. Five third-party fetchers were still default-on, phoning home to
   politically/commercially sensitive upstreams on every poll cycle:
   - fimi.py            -> euvsdisinfo.eu        -> FIMI_ENABLED
   - prediction_markets -> Polymarket + Kalshi   -> PREDICTION_MARKETS_ENABLED
   - financial.py       -> Finnhub / yfinance    -> FINANCIAL_ENABLED or FINNHUB_API_KEY
   - nuforc_enrichment  -> huggingface.co        -> NUFORC_ENABLED
   - news.py            -> configured RSS feeds  -> NEWS_ENABLED (default on, kill switch)

   Same CrowdThreat-style pattern: explicit env-var opt-in, empty
   the data slot and mark_fresh when disabled. New regression test
   file test_third_party_fetchers_opt_in.py asserts each fetcher's
   network entry point is not called when its gate is off.

3. The outbound User-Agent leaked both the operator's personal email
   and a fork-specific GitHub URL on every fetcher request. Consolidated
   to a single DEFAULT_USER_AGENT in network_utils.py, project-generic
   by default (no contact info), overridable via SHADOWBROKER_USER_AGENT
   for operators who want to identify themselves (e.g. for Nominatim or
   weather.gov usage-policy compliance). Six call sites updated; the
   Nominatim-specific override is preserved.

4. The same generic UA now also flows through the peer prekey lookup
   in mesh_wormhole_prekey.py, so DM first-contact requests no longer
   identify the caller as a Shadowbroker fork to the peer being
   queried.

.env.example updated to document all new opt-in env vars.

Tests: backend/tests/test_control_surface_auth.py (extended),
       backend/tests/test_crowdthreat_opt_in.py (unchanged, still passes),
       backend/tests/test_third_party_fetchers_opt_in.py (new, 7 tests).
All 31 tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-18 13:53:33 -06:00

292 lines
9.2 KiB
Python

"""EUvsDisinfo FIMI (Foreign Information Manipulation & Interference) fetcher.
Parses the EUvsDisinfo RSS feed to extract disinformation narratives,
debunked claims, threat actor mentions, and target country references.
Refreshes every 12 hours (FIMI data updates weekly).
"""
import os
import re
import logging
from datetime import datetime, timezone
import feedparser
from services.network_utils import fetch_with_curl
from services.fetchers._store import latest_data, _data_lock, _mark_fresh
from services.fetchers.retry import with_retry
# Log under the fixed "services.data_fetcher" logger name rather than this
# module's own name.
logger = logging.getLogger("services.data_fetcher")
# RSS feed URL requested by fetch_fimi().
_FIMI_FEED_URL = "https://euvsdisinfo.eu/feed/"
def fimi_fetch_enabled() -> bool:
    """Tell whether the operator has opted into FIMI pulls.

    Reads the ``FIMI_ENABLED`` environment variable on every call. The value
    is trimmed and lowercased, and only "1", "true", "yes" or "on" count as
    enabled; an unset or empty variable means disabled.
    """
    raw_flag = os.environ.get("FIMI_ENABLED", "")
    normalized_flag = str(raw_flag).strip().lower()
    return normalized_flag in ("1", "true", "yes", "on")
# ── Threat actor keywords ──────────────────────────────────────────────────
# Map of keyword → canonical actor name. Checked case-insensitively.
# Keywords are written in lowercase because the scanned text is lowercased
# before matching. Several keywords may share one canonical name; their
# counts are added together by _count_mentions().
_THREAT_ACTORS: dict[str, str] = {
    "russia": "Russia",
    "russian": "Russia",
    "kremlin": "Russia",
    "pro-kremlin": "Russia",
    "moscow": "Russia",
    "china": "China",
    "chinese": "China",
    "beijing": "China",
    "iran": "Iran",
    "iranian": "Iran",
    "tehran": "Iran",
    "north korea": "North Korea",
    "pyongyang": "North Korea",
    "dprk": "North Korea",
    "belarus": "Belarus",
    "belarusian": "Belarus",
    "minsk": "Belarus",
}
# ── Target country/region keywords ─────────────────────────────────────────
# Map of lowercase keyword → canonical target name, passed to
# _count_mentions() in the same way as the threat-actor table.
# Note that "lithuania", "latvia" and "estonia" are all folded into "Baltics".
_TARGET_KEYWORDS: dict[str, str] = {
    "ukraine": "Ukraine",
    "kyiv": "Ukraine",
    "moldova": "Moldova",
    "georgia": "Georgia",
    "tbilisi": "Georgia",
    "eu": "EU",
    "european union": "EU",
    "europe": "Europe",
    "nato": "NATO",
    "united states": "United States",
    "usa": "United States",
    "germany": "Germany",
    "france": "France",
    "poland": "Poland",
    "baltic": "Baltics",
    "lithuania": "Baltics",
    "latvia": "Baltics",
    "estonia": "Baltics",
    "romania": "Romania",
    "czech": "Czech Republic",
    "slovakia": "Slovakia",
    "armenia": "Armenia",
    "africa": "Africa",
    "middle east": "Middle East",
    "syria": "Syria",
    "israel": "Israel",
    "serbia": "Serbia",
    "india": "India",
    "brazil": "Brazil",
}
# ── Disinformation topic keywords (for cross-referencing news) ─────────────
# Lowercase phrases that _extract_disinfo_keywords() looks for as plain
# substrings of the lowercased text (no word-boundary check), so e.g. "coup"
# also matches inside longer words.
_DISINFO_TOPICS = [
    "sanctions",
    "energy crisis",
    "gas supply",
    "nuclear threat",
    "nato expansion",
    "biolab",
    "biological weapon",
    "provocation",
    "false flag",
    "staged",
    "nazi",
    "genocide",
    "referendum",
    "regime change",
    "coup",
    "puppet government",
    "election interference",
    "election meddling",
    "voter fraud",
    "migrant invasion",
    "refugee crisis",
    "civil war",
    "food crisis",
    "grain deal",
]
# Regex for extracting debunked report URLs from feed HTML.
# Matches http:// or https:// links of the form euvsdisinfo.eu/report/<slug>
# with an optional trailing slash; the slug may contain letters, digits and
# hyphens only. Matching is case-insensitive.
_REPORT_URL_RE = re.compile(
    r'https?://euvsdisinfo\.eu/report/[a-z0-9\-]+/?',
    re.IGNORECASE,
)
# Regex for extracting the claim title from a report URL slug
_SLUG_RE = re.compile(r'/report/([a-z0-9\-]+)/?$', re.IGNORECASE)
def _slug_to_title(url: str) -> str:
"""Convert a report URL slug to a human-readable title."""
m = _SLUG_RE.search(url)
if not m:
return url
return m.group(1).replace("-", " ").title()
def _count_mentions(text: str, keywords: dict[str, str]) -> dict[str, int]:
"""Count keyword mentions, mapping to canonical names."""
counts: dict[str, int] = {}
text_lower = text.lower()
for kw, canonical in keywords.items():
# Word-boundary match, case-insensitive
pattern = r'\b' + re.escape(kw) + r'\b'
matches = re.findall(pattern, text_lower)
if matches:
counts[canonical] = counts.get(canonical, 0) + len(matches)
return counts
def _extract_disinfo_keywords(text: str) -> list[str]:
    """List the disinformation topics mentioned in ``text``.

    Each entry of ``_DISINFO_TOPICS`` is checked as a plain substring of the
    lowercased text. Matching topics are returned in the order they are
    declared in ``_DISINFO_TOPICS``.
    """
    haystack = text.lower()
    return [topic for topic in _DISINFO_TOPICS if topic in haystack]
def _is_major_wave(narratives: list[dict], targets: dict[str, int]) -> bool:
"""Heuristic: detect a 'major disinformation wave'.
Triggers when:
- 3+ narratives in the feed mention the same target, OR
- A single target has 10+ total mentions across all narratives, OR
- 5+ distinct debunked claims extracted in one fetch
"""
if not narratives:
return False
# Check per-target narrative count
target_narrative_counts: dict[str, int] = {}
total_claims = 0
for n in narratives:
for t in n.get("targets", []):
target_narrative_counts[t] = target_narrative_counts.get(t, 0) + 1
total_claims += len(n.get("claims", []))
if any(c >= 3 for c in target_narrative_counts.values()):
return True
if any(c >= 10 for c in targets.values()):
return True
if total_claims >= 5:
return True
return False
# Only the first 15 feed entries are analysed per fetch.
_FIMI_MAX_ENTRIES = 15
# Maximum number of summary characters stored per narrative snippet.
_FIMI_SNIPPET_CHARS = 300


def _report_url_key(url: str) -> str:
    """Return a comparison key under which variants of one report URL collapse."""
    # Ignore letter case and a trailing slash, both of which the report-URL
    # regex accepts, so one report is not listed twice.
    return url.rstrip("/").lower()


def _extract_claims(summary_html: str) -> list[dict]:
    """Return one claim dict per distinct report URL, in first-seen order."""
    claims: list[dict] = []
    seen_keys: set[str] = set()
    for url in _REPORT_URL_RE.findall(summary_html):
        key = _report_url_key(url)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        claims.append({"url": url, "title": _slug_to_title(url)})
    return claims


@with_retry(max_retries=1, base_delay=5)
def fetch_fimi():
    """Fetch and parse the EUvsDisinfo RSS feed.

    When FIMI pulls are disabled, ``latest_data["fimi"]`` is set to an empty
    list and the slot is marked fresh without any network access.

    Otherwise the feed is downloaded and parsed, and the first
    ``_FIMI_MAX_ENTRIES`` entries are analysed for debunked-report links,
    threat-actor mentions, target mentions and disinformation topics. The
    result is stored as a dict under ``latest_data["fimi"]`` and the slot is
    marked fresh. If the download or parse raises, or the feed has no
    entries, a warning is logged and the stored data is left untouched.

    Returns:
        None. Results are published through ``latest_data``.
    """
    if not fimi_fetch_enabled():
        logger.debug("FIMI fetch skipped; set FIMI_ENABLED=true to opt in")
        with _data_lock:
            latest_data["fimi"] = []
        # Called after the lock is released: safe whether or not _mark_fresh
        # acquires _data_lock itself.
        _mark_fresh("fimi")
        return

    # NOTE(review): errors are caught and logged here, so the with_retry
    # decorator never sees a fetch failure from this block. Confirm that
    # skipping the retry is intended.
    try:
        resp = fetch_with_curl(_FIMI_FEED_URL, timeout=15)
        feed = feedparser.parse(resp.text)
    except Exception as e:
        logger.warning("FIMI feed fetch failed: %s", e)
        return

    if not feed.entries:
        logger.warning("FIMI feed: no entries found")
        return

    narratives: list[dict] = []
    unique_claims: list[dict] = []
    seen_claim_keys: set[str] = set()
    agg_actors: dict[str, int] = {}
    agg_targets: dict[str, int] = {}
    all_disinfo_kw: set[str] = set()

    for entry in feed.entries[:_FIMI_MAX_ENTRIES]:
        title = entry.get("title", "")
        link = entry.get("link", "")
        published = entry.get("published", "")
        summary_html = entry.get("summary", "") or entry.get("description", "")

        # Strip HTML tags and collapse whitespace for text analysis.
        summary_text = re.sub(r"<[^>]+>", " ", summary_html)
        summary_text = re.sub(r"\s+", " ", summary_text).strip()
        full_text = f"{title} {summary_text}"

        # Report links are taken from the raw HTML because stripping the
        # tags also removes href attributes.
        claims = _extract_claims(summary_html)
        for claim in claims:
            key = _report_url_key(claim["url"])
            if key not in seen_claim_keys:
                seen_claim_keys.add(key)
                unique_claims.append(claim)

        # Count threat actors.
        actors = _count_mentions(full_text, _THREAT_ACTORS)
        for actor, count in actors.items():
            agg_actors[actor] = agg_actors.get(actor, 0) + count

        # Count target countries/regions.
        targets = _count_mentions(full_text, _TARGET_KEYWORDS)
        for target, count in targets.items():
            agg_targets[target] = agg_targets.get(target, 0) + count

        # Extract disinformation topic keywords.
        disinfo_kw = _extract_disinfo_keywords(full_text)
        all_disinfo_kw.update(disinfo_kw)

        # Truncate the summary for storage.
        snippet = summary_text[:_FIMI_SNIPPET_CHARS]
        if len(summary_text) > _FIMI_SNIPPET_CHARS:
            snippet += "..."

        narratives.append({
            "title": title,
            "link": link,
            "published": published,
            "snippet": snippet,
            "claims": claims,
            "actors": list(actors),
            "targets": list(targets),
            "disinfo_keywords": disinfo_kw,
        })

    # Sort actors and targets by mention count, highest first.
    sorted_actors = dict(sorted(agg_actors.items(), key=lambda x: x[1], reverse=True))
    sorted_targets = dict(sorted(agg_targets.items(), key=lambda x: x[1], reverse=True))

    major_wave = _is_major_wave(narratives, sorted_targets)

    fimi_data = {
        "narratives": narratives,
        "claims": unique_claims,
        "threat_actors": sorted_actors,
        "targets": sorted_targets,
        "disinfo_keywords": sorted(all_disinfo_kw),
        "major_wave": major_wave,
        # The most-mentioned target, reported only when a wave was detected.
        "major_wave_target": (
            max(sorted_targets, key=sorted_targets.get) if major_wave and sorted_targets else None
        ),
        "last_fetched": datetime.now(timezone.utc).isoformat(),
        "source": "EUvsDisinfo",
        "source_url": "https://euvsdisinfo.eu",
    }

    with _data_lock:
        latest_data["fimi"] = fimi_data
    # Called after the lock is released: safe whether or not _mark_fresh
    # acquires _data_lock itself.
    _mark_fresh("fimi")

    logger.info(
        "FIMI fetch complete: %d narratives, %d claims, %d actors, major_wave=%s",
        len(narratives),
        len(unique_claims),
        len(sorted_actors),
        major_wave,
    )