import requests
import logging
import zipfile
import socket
import ipaddress
from cachetools import cached, TTLCache
from datetime import datetime
from urllib.parse import urljoin, urlparse
from services.network_utils import fetch_with_curl
def _geopolitics_user_agent() -> str:
    """Return the outbound User-Agent used by the GDELT geopolitics fetcher.

    The value is whatever ``outbound_user_agent`` produces for the
    ``"geopolitics-gdelt"`` label; the helper is imported at call time rather
    than at module import.
    """
    from services.network_utils import outbound_user_agent as _agent_for

    agent = _agent_for("geopolitics-gdelt")
    return agent
# Module-level logger used by the fetchers below.
logger = logging.getLogger(__name__)
# Cache Frontline data for 30 minutes, it doesn't move that fast
# (ttl is in seconds: 1800 s = 30 min; maxsize=1 is enough because the cached
# function takes no arguments, so there is only ever one entry).
frontline_cache = TTLCache(maxsize=1, ttl=1800)
@cached(frontline_cache)
def fetch_ukraine_frontlines():
    """
    Fetch the latest GeoJSON data representing the Ukraine frontline.

    We use the cyterat/deepstate-map-data GitHub mirror since the public API is
    locked. The repository tree is listed first to find the newest
    ``data/deepstatemap_data_*.geojson`` file, which is then downloaded from
    raw.githubusercontent.com.

    Every feature in the payload is annotated in place with ``name`` (a zone
    label derived from the feature's position in the array) and ``zone_id``
    (that position).

    Returns:
        The parsed GeoJSON object, or ``None`` when a request fails, no
        matching file is listed, or the response has an unexpected shape.

    NOTE(review): failures are returned as ``None`` rather than raised, and the
    function is wrapped in ``@cached(frontline_cache)``, so a failed fetch
    appears to be cached for the full TTL as well - confirm that is intended.
    """
    # The Cyterat GitHub mirror strips all properties and just provides a raw
    # array of Feature polygons. Based on DeepStateMap's frontend mapping, the
    # array index corresponds to the zone type. Indexes not listed here fall
    # back to "Russian-occupied areas".
    zone_names = {
        0: "Russian-occupied areas",
        1: "Russian advance",
        2: "Liberated area",
        3: "Russian-occupied areas",  # Crimea / LPR / DPR
        4: "Directions of UA attacks",
    }
    try:
        logger.info("Fetching DeepStateMap from GitHub mirror...")
        # First, query the repo tree to find the latest file name
        tree_url = (
            "https://api.github.com/repos/cyterat/deepstate-map-data/git/trees/main?recursive=1"
        )
        res_tree = requests.get(tree_url, timeout=10)
        if res_tree.status_code != 200:
            logger.error(f"Failed to fetch Github Tree for Deepstatemap: {res_tree.status_code}")
            return None

        tree_data = res_tree.json().get("tree", [])
        # Filter for geojson files in data folder
        geo_files = [
            item["path"]
            for item in tree_data
            if item["path"].startswith("data/deepstatemap_data_")
            and item["path"].endswith(".geojson")
        ]
        if not geo_files:
            # Previously this case returned None without leaving any trace.
            logger.error("No DeepStateMap GeoJSON files found in Github Tree")
            return None

        # File names embed YYYYMMDD, so the lexicographic maximum is the newest.
        latest_file = max(geo_files)
        raw_url = f"https://raw.githubusercontent.com/cyterat/deepstate-map-data/main/{latest_file}"
        logger.info(f"Downloading latest DeepStateMap: {raw_url}")
        res_geo = requests.get(raw_url, timeout=20)
        if res_geo.status_code != 200:
            logger.error(f"Failed to fetch parsed Github Raw GeoJSON: {res_geo.status_code}")
            return None

        data = res_geo.json()
        if "features" in data:
            for idx, feature in enumerate(data["features"]):
                if feature.get("properties") is None:
                    feature["properties"] = {}
                feature["properties"]["name"] = zone_names.get(idx, "Russian-occupied areas")
                feature["properties"]["zone_id"] = idx
        return data
    except (
        requests.RequestException,
        ConnectionError,
        TimeoutError,
        ValueError,
        KeyError,
        # A JSON body of an unexpected shape (e.g. a list where a dict is
        # expected) surfaces as one of these two.
        AttributeError,
        TypeError,
    ) as e:
        logger.error(f"Error fetching DeepStateMap: {e}")
    return None
# Cache GDELT data for 6 hours - heavy aggregation, data doesn't change rapidly
# (ttl is in seconds: 21600 s = 6 h).
# NOTE(review): nothing in this part of the file reads or writes gdelt_cache;
# presumably it is applied to the GDELT fetcher elsewhere - confirm it is still used.
gdelt_cache = TTLCache(maxsize=1, ttl=21600)
def _extract_domain(url):
    """Return the bare host of ``url`` without a leading ``www.``.

    e.g. 'nytimes.com' for 'https://www.nytimes.com/...'. A URL without a host
    yields an empty string; if the URL cannot be parsed at all, its first 40
    characters are returned instead.
    """
    try:
        hostname = urlparse(url).hostname
        if not hostname:
            return ""
        return hostname.removeprefix("www.")
    except (ValueError, AttributeError, KeyError):  # non-critical
        return url[:40]
def _url_to_headline(url):
    """Extract a human-readable headline from a URL path.

    e.g. 'https://nytimes.com/2026/03/us-strikes-iran-nuclear-sites.html' -> 'Us Strikes Iran Nuclear Sites'

    The path is walked from its last segment backwards and the first segment
    that ``_is_gibberish`` does not reject becomes the slug. Falls back to the
    domain name if the URL slug is gibberish (hex IDs, UUIDs, etc.) or too
    short, and to the first 60 characters of the URL if it cannot be parsed.
    """
    import re
    from urllib.parse import unquote, urlparse

    try:
        parsed = urlparse(url)
        domain = parsed.hostname or ""
        if domain.startswith("www."):
            domain = domain[4:]

        path = unquote(parsed.path).strip("/")
        if not path:
            return domain

        # Try the last path segment first, then walk backwards
        slug = ""
        for seg in reversed([s for s in path.split("/") if s]):
            # Remove file extensions
            for ext in (".html", ".htm", ".php", ".asp", ".aspx", ".shtml"):
                if seg.lower().endswith(ext):
                    seg = seg[: -len(ext)]
            # Skip segments that are clearly not headlines
            if _is_gibberish(seg):
                continue
            slug = seg
            break
        if not slug:
            return domain

        # Remove common ID patterns at start/end
        slug = re.sub(r"^[\d]+-", "", slug)  # leading "13847569-"
        slug = re.sub(r"-[\da-f]{6,}$", "", slug)  # trailing hex IDs
        slug = re.sub(r"[-_]c-\d+$", "", slug)  # trailing "-c-21803431"
        slug = re.sub(r"^p=\d+$", "", slug)  # WordPress-style "p=1234" segment
        # Convert slug separators to spaces
        slug = slug.replace("-", " ").replace("_", " ")
        slug = re.sub(r"\s+", " ", slug).strip()

        # Final gibberish check after cleanup
        if len(slug) < 8 or _is_gibberish(slug.replace(" ", "-")):
            return domain

        # Title-case by hand: str.title() treats an apostrophe as a word
        # boundary ("trump's" -> "Trump'S"), so capitalize each run of letters
        # and keep an apostrophe suffix attached to its word.
        headline = re.sub(
            r"[^\W\d_]+(?:['’][^\W\d_]+)?",
            lambda word: word.group(0).capitalize(),
            slug,
        )
        if len(headline) > 90:
            headline = headline[:87] + "..."
        return headline
    except (ValueError, AttributeError, KeyError):  # non-critical
        return url[:60]
def _is_gibberish(text):
    """Return True when a URL segment looks like an identifier, not words.

    Flags empty, all-digit, UUID-shaped, hex-heavy, digit-heavy, very short
    (under 5 characters) and ``key=value`` style segments; a readable slug
    such as 'us-strikes-iran' returns False.
    """
    import re

    candidate = text.strip()
    if not candidate:
        return True

    # Pure numbers
    if re.match(r"^\d+$", candidate):
        return True

    # UUID pattern (with or without dashes)
    looks_like_uuid = re.match(
        r"^[0-9a-f]{8}[_-]?[0-9a-f]{4}[_-]?[0-9a-f]{4}[_-]?[0-9a-f]{4}[_-]?[0-9a-f]{12}$",
        candidate,
        re.I,
    )
    if looks_like_uuid:
        return True

    letters_digits = re.sub(r"[^a-zA-Z0-9]", "", candidate)
    if letters_digits:
        total = len(letters_digits)
        # Hex-heavy string: more than 40% hex digits among alphanumeric chars
        hex_total = sum(ch in "0123456789abcdefABCDEF" for ch in letters_digits)
        if total > 6 and hex_total / total > 0.4:
            return True
        # Mostly digits with a few alpha
        digit_total = sum(ch.isdigit() for ch in letters_digits)
        if digit_total / total > 0.5:
            return True

    # Too short to be a headline slug, or a query-param style segment
    return len(candidate) < 5 or "=" in candidate
# Persistent cache for article titles — survives across GDELT cache refreshes
# Bounded to 5000 entries with 24hr TTL to prevent unbounded memory growth
_article_title_cache = TTLCache(maxsize=5000, ttl=86400)
# URL -> description snippet (None when the page had no usable description).
# NOTE(review): unlike the title cache this is a plain dict with no size cap or
# expiry, so it grows for the life of the process - consider bounding it too.
_article_snippet_cache: dict[str, str | None] = {}
# URL -> (allowed, reason) verdicts from _is_safe_public_article_url, kept for 1 hour.
_article_url_safety_cache = TTLCache(maxsize=5000, ttl=3600)
# Redirect hops followed per title fetch; every hop is re-validated before it is requested.
_TITLE_FETCH_MAX_REDIRECTS = 3
# Only the first 32 KiB of a page is read when looking for its title/description.
_TITLE_FETCH_READ_BYTES = 32768
# Ports an article URL may target; anything else is rejected with reason "port".
_ALLOWED_ARTICLE_PORTS = {80, 443, 8080, 8443}
# Maximum snippet length in characters, including the trailing "...".
_MAX_SNIPPET_LEN = 200
def _hostname_resolves_public(hostname: str, port: int) -> bool:
    """Return True only if ``hostname`` resolves exclusively to global IPs.

    Args:
        hostname: Host name to resolve.
        port: Port passed through to ``socket.getaddrinfo``.

    Returns:
        False when resolution fails, when the name cannot be encoded for
        lookup, when no address could be parsed, or when any resolved address
        is not ``is_global``; True otherwise.
    """
    try:
        infos = socket.getaddrinfo(hostname, port, type=socket.SOCK_STREAM)
    except (socket.gaierror, OSError, UnicodeError):
        # UnicodeError: getaddrinfo IDNA-encodes the name first and rejects
        # empty or over-long labels ("a..com", a 70-character label) before
        # any lookup happens. Such a host cannot be fetched, so it is unsafe.
        return False

    address_is_global = []
    for info in infos:
        sockaddr = info[4] if len(info) > 4 else None
        if not sockaddr:
            continue
        # Drop an IPv6 zone id ("fe80::1%eth0") before parsing.
        raw_addr = str(sockaddr[0] or "").split("%", 1)[0]
        if not raw_addr:
            continue
        try:
            address_is_global.append(ipaddress.ip_address(raw_addr).is_global)
        except ValueError:
            continue
    # At least one parsable address is required, and every one must be public.
    return bool(address_is_global) and all(address_is_global)
def _classify_article_url(url) -> tuple[bool, str]:
    """Compute the ``(allowed, reason)`` verdict for ``url`` without caching."""
    try:
        parsed = urlparse(str(url or "").strip())
    except ValueError:
        return (False, "parse_error")

    scheme = str(parsed.scheme or "").lower()
    host = str(parsed.hostname or "").strip().lower()
    if scheme not in {"http", "https"}:
        return (False, "scheme")
    if not host:
        return (False, "host")
    if parsed.username or parsed.password:
        return (False, "userinfo")
    if host in {"localhost", "localhost.localdomain"}:
        return (False, "localhost")

    try:
        explicit_port = parsed.port
    except ValueError:
        # Reading .port raises for a non-numeric or out-of-range port.
        return (False, "port")
    port = explicit_port or (443 if scheme == "https" else 80)
    if port not in _ALLOWED_ARTICLE_PORTS:
        return (False, "port")

    try:
        target_ip = ipaddress.ip_address(host.split("%", 1)[0])
    except ValueError:
        target_ip = None
    if target_ip is not None:
        # Literal IP in the URL: no DNS lookup needed.
        return (True, "") if target_ip.is_global else (False, "private_ip")

    try:
        resolves_public = _hostname_resolves_public(host, port)
    except ValueError:
        # e.g. UnicodeError for a host label that cannot be encoded for lookup
        resolves_public = False
    return (True, "") if resolves_public else (False, "private_dns")


def _is_safe_public_article_url(url: str) -> tuple[bool, str]:
    """Decide whether an article URL may be fetched.

    Only http/https URLs on an allowed port, without credentials, whose host
    is a global IP literal or resolves only to global addresses are accepted.

    Args:
        url: Candidate article URL.

    Returns:
        ``(True, "")`` when the URL is acceptable, otherwise ``(False, reason)``
        with reason one of ``parse_error``, ``scheme``, ``host``, ``userinfo``,
        ``localhost``, ``port``, ``private_ip`` or ``private_dns``. Verdicts
        are memoized in ``_article_url_safety_cache`` under the URL as passed.
    """
    verdict = _article_url_safety_cache.get(url)
    if verdict is None:
        verdict = _classify_article_url(url)
        _article_url_safety_cache[url] = verdict
    return verdict
def _extract_snippet(url: str, chunk: str) -> None:
    """Extract og:description or meta description from an already-fetched HTML chunk.

    The result is stored in ``_article_snippet_cache[url]``: the unescaped
    description, shortened to at most ``_MAX_SNIPPET_LEN`` characters, or
    ``None`` when no description longer than 15 characters was found. A URL
    that already has a cache entry is left untouched.

    Args:
        url: Article URL, used as the cache key.
        chunk: Leading portion of the article's HTML.
    """
    import re
    import html as html_mod

    if url in _article_snippet_cache:
        return

    # A quoted attribute value. The closing quote must be the same kind as the
    # opening one, so an apostrophe inside a double-quoted value is kept.
    attr_value = r'(?:"([^">]+)"|\'([^\'>]+)\')'
    og_description = r'property=["\']og:description["\']'
    meta_description = r'name=["\']description["\']'

    snippet = None
    # Try og:description first, then the plain meta description; each is
    # accepted with its attributes in either order.
    for pattern in (
        rf"<meta[^>]+{og_description}[^>]+content={attr_value}",
        rf"<meta[^>]+content={attr_value}[^>]+{og_description}",
        rf"<meta[^>]+{meta_description}[^>]+content={attr_value}",
        rf"<meta[^>]+content={attr_value}[^>]+{meta_description}",
    ):
        m = re.search(pattern, chunk, re.I)
        if m:
            snippet = html_mod.unescape(m.group(1) or m.group(2)).strip()
            break

    if snippet and len(snippet) > _MAX_SNIPPET_LEN:
        # Cut at a word boundary and leave room for the ellipsis.
        snippet = snippet[: _MAX_SNIPPET_LEN - 3].rsplit(" ", 1)[0] + "..."
    _article_snippet_cache[url] = snippet if snippet and len(snippet) > 15 else None
def _parse_article_title(chunk):
    """Return the cleaned headline found in an HTML fragment, or None.

    Prefers the og:title meta tag (attributes in either order) and falls back
    to the <title> element. HTML entities are unescaped, a trailing site-name
    suffix is removed and titles over 120 characters are truncated. Titles of
    10 characters or fewer are rejected.
    """
    import re
    import html as html_mod

    # A quoted attribute value whose closing quote matches its opening quote,
    # so "Ukraine's army ..." is not cut off at the apostrophe.
    attr_value = r'(?:"([^">]+)"|\'([^\'>]+)\')'
    og_title = r'property=["\']og:title["\']'

    title = None
    # Try og:title first (usually the cleanest)
    for pattern in (
        rf"<meta[^>]+{og_title}[^>]+content={attr_value}",
        rf"<meta[^>]+content={attr_value}[^>]+{og_title}",
    ):
        og_match = re.search(pattern, chunk, re.I)
        if og_match:
            title = (og_match.group(1) or og_match.group(2)).strip()
            break
    # Fall back to the <title> tag
    if not title:
        title_match = re.search(r"<title[^>]*>([^<]+)", chunk, re.I)
        if title_match:
            title = title_match.group(1).strip()
    if not title:
        return None

    # Clean up HTML entities
    title = html_mod.unescape(title)
    # Remove site name suffixes like " | CNN" or " - BBC News". A dash only
    # counts as a separator when surrounded by whitespace, so hyphenated words
    # ("cease-fire") at the end of a headline are left intact.
    title = re.sub(r"(?:\s*\|\s*|\s+[\-–—]\s+)[^|\-–—]{2,30}$", "", title).strip()
    # Truncate very long titles
    if len(title) > 120:
        title = title[:117] + "..."
    return title if len(title) > 10 else None


def _fetch_article_title(url):
    """Fetch the real headline from an article's HTML <title> or og:title tag.

    Up to ``_TITLE_FETCH_MAX_REDIRECTS`` redirects are followed by hand, and
    every hop is checked with ``_is_safe_public_article_url`` before it is
    requested. Only the first ``_TITLE_FETCH_READ_BYTES`` bytes of the body
    are inspected. When a title is found, the page's description snippet is
    extracted from the same chunk via ``_extract_snippet``.

    Returns the title string, or None if it can't be fetched.
    Uses a persistent cache to avoid refetching (failures are cached as None).
    """
    if url in _article_title_cache:
        return _article_title_cache[url]

    title = None
    try:
        current_url = str(url or "").strip()
        chunk = None
        for _ in range(_TITLE_FETCH_MAX_REDIRECTS + 1):
            allowed, _reason = _is_safe_public_article_url(current_url)
            if not allowed:
                break
            resp = requests.get(
                current_url,
                timeout=4,
                headers={"User-Agent": _geopolitics_user_agent()},
                stream=True,
                allow_redirects=False,
            )
            try:
                location = str(resp.headers.get("Location") or "").strip()
                if 300 <= resp.status_code < 400 and location:
                    current_url = urljoin(current_url, location)
                    continue
                if resp.status_code == 200:
                    # iter_content undoes gzip/deflate transfer encoding;
                    # reading resp.raw directly would hand back the still
                    # compressed bytes and no tag would ever match.
                    head = bytearray()
                    for piece in resp.iter_content(chunk_size=8192):
                        head += piece
                        if len(head) >= _TITLE_FETCH_READ_BYTES:
                            break
                    chunk = bytes(head[:_TITLE_FETCH_READ_BYTES]).decode(
                        "utf-8", errors="replace"
                    )
                break
            finally:
                resp.close()
        # chunk stays None when the URL was rejected, the response was not a
        # 200, or the redirect budget ran out.
        if chunk is not None:
            title = _parse_article_title(chunk)
            if title:
                # Also extract og:description / meta description for snippet
                _extract_snippet(url, chunk)
    except (
        requests.RequestException,
        ConnectionError,
        TimeoutError,
        ValueError,
        AttributeError,
    ):  # non-critical
        title = None
    _article_title_cache[url] = title
    return title
def _batch_fetch_titles(urls):
    """Resolve article titles for many URLs concurrently.

    Each URL is handed to ``_fetch_article_title`` on a pool of up to 16
    worker threads. Returns a dict mapping every URL to its title, or to None
    when the fetch produced nothing or raised.
    """
    from concurrent.futures import ThreadPoolExecutor

    titles_by_url = {}
    with ThreadPoolExecutor(max_workers=16) as pool:
        submitted = [(link, pool.submit(_fetch_article_title, link)) for link in urls]
        for link, pending in submitted:
            try:
                titles_by_url[link] = pending.result()
            except Exception:  # non-critical: optional title enrichment
                titles_by_url[link] = None
    return titles_by_url
def _ingest_gdelt_row(row, conflict_codes, seen_locs, features, loc_index):
    """Fold one GDELT export row into ``features``.

    Rows that are too short, are not conflict events or have no usable
    coordinates are ignored. All fields are parsed before anything is mutated,
    so a ValueError raised by a malformed number leaves the accumulators
    untouched.
    """
    if len(row) < 61:
        return
    # The first two characters of the event code are matched against conflict_codes.
    event_code = row[26][:2] if len(row[26]) >= 2 else ""
    if event_code not in conflict_codes:
        return
    lat = float(row[56]) if row[56] else None
    lng = float(row[57]) if row[57] else None
    if lat is None or lng is None or (lat == 0 and lng == 0):
        return

    source_url = row[60].strip()
    location = row[52].strip()
    actor1 = row[6].strip()
    actor2 = row[16].strip()
    # Enrichment fields from the GDELT CSV
    event_date = row[1].strip()
    full_event_code = row[26].strip()
    quad_class = int(row[29]) if row[29].strip().isdigit() else 0
    goldstein = float(row[30]) if row[30].strip() else 0.0
    num_mentions = int(row[31]) if row[31].strip().isdigit() else 0
    num_sources = int(row[32]) if row[32].strip().isdigit() else 0
    num_articles = int(row[33]) if row[33].strip().isdigit() else 0
    avg_tone = float(row[34]) if row[34].strip() else 0.0

    # Events are bucketed by coordinates rounded to one decimal place.
    loc_key = f"{round(lat, 1)}_{round(lng, 1)}"
    row_actors = {actor for actor in (actor1, actor2) if actor}

    if loc_key in seen_locs:
        # Merge: increment count, accumulate intensity, add source URL
        props = features[loc_index[loc_key]]["properties"]
        props["count"] = props.get("count", 1) + 1
        # Track worst Goldstein score (most negative = most intense)
        if goldstein < props.get("goldstein", 0):
            props["goldstein"] = round(goldstein, 1)
        # Accumulate mentions/sources for importance ranking
        props["num_mentions"] = props.get("num_mentions", 0) + num_mentions
        props["num_sources"] = props.get("num_sources", 0) + num_sources
        props["num_articles"] = props.get("num_articles", 0) + num_articles
        # Track latest date
        if event_date and event_date > props.get("event_date", ""):
            props["event_date"] = event_date
        props.setdefault("_actors_set", set()).update(row_actors)
        urls = props.setdefault("_urls", [])
        seen_domains = props.setdefault("_domains", set())
        if source_url:
            # Keep at most 10 URLs per location, one per source domain.
            domain = _extract_domain(source_url)
            if domain not in seen_domains and len(urls) < 10:
                urls.append(source_url)
                seen_domains.add(domain)
        return

    seen_locs.add(loc_key)
    name = (
        location
        or (f"{actor1} vs {actor2}" if actor1 and actor2 else actor1)
        or "Unknown Incident"
    )
    domain = _extract_domain(source_url) if source_url else ""
    loc_index[loc_key] = len(features)
    features.append(
        {
            "type": "Feature",
            "properties": {
                "name": name,
                "count": 1,
                "event_date": event_date,
                "event_code": full_event_code,
                "quad_class": quad_class,
                "goldstein": round(goldstein, 1),
                "num_mentions": num_mentions,
                "num_sources": num_sources,
                "num_articles": num_articles,
                "avg_tone": round(avg_tone, 1),
                "actor1": actor1,
                "actor2": actor2,
                "_actors_set": row_actors,
                "_urls": [source_url] if source_url else [],
                "_domains": {domain} if domain else set(),
            },
            "geometry": {"type": "Point", "coordinates": [lng, lat]},
            "_loc_key": loc_key,
        }
    )


def _parse_gdelt_export_zip(zip_bytes, conflict_codes, seen_locs, features, loc_index):
    """Parse a single GDELT export ZIP and append conflict features.

    The first member of the archive is read as tab-separated text. Each
    conflict row either creates a new feature or is merged into the existing
    feature for the same rounded location.

    Args:
        zip_bytes: Raw bytes of the export archive.
        conflict_codes: Two-character event-code prefixes to keep.
        seen_locs: Set of location keys already present (mutated).
        features: List of GeoJSON-style feature dicts (mutated).
        loc_index: Maps loc_key -> index in features list for fast duplicate
            merging (mutated).

    Returns:
        None. A corrupt, empty or unreadable archive is logged as a warning
        and otherwise ignored; rows that fail to parse are skipped.
    """
    import csv
    import io
    import zipfile

    try:
        # Context managers make sure the archive and member are closed.
        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
            members = zf.namelist()
            if not members:
                # Indexing an empty namelist used to raise an IndexError that
                # no handler here caught.
                logger.warning("Failed to parse GDELT export zip: archive is empty")
                return
            with zf.open(members[0]) as cf:
                reader = csv.reader(
                    io.TextIOWrapper(cf, encoding="utf-8", errors="replace"), delimiter="\t"
                )
                for row in reader:
                    try:
                        _ingest_gdelt_row(row, conflict_codes, seen_locs, features, loc_index)
                    except (ValueError, IndexError):
                        continue
    except (IOError, OSError, ValueError, KeyError, csv.Error, zipfile.BadZipFile) as e:
        # csv.Error is raised while iterating the reader, e.g. for an
        # oversized field.
        logger.warning(f"Failed to parse GDELT export zip: {e}")
# GDELT's data.gdeltproject.org is a CNAME to a Google Cloud Storage
# bucket of the same name. GCS returns the wildcard ``*.storage.googleapis.com``
# certificate, which legitimately does NOT cover the GDELT custom domain
# — Python's TLS verification correctly refuses it. Some networks/POPs
# happen to route through a path where this works; many do not (notably
# Docker Desktop's outbound NAT on local installs).
#
# Fix: rewrite the URL to hit GCS directly with a path-style bucket
# reference, where the standard GCS cert is genuinely valid. Same data,
# verified TLS, no operator-side workaround needed.
def _gcs_direct_gdelt_url(url: str) -> str:
    """Map a data.gdeltproject.org URL onto its path-style GCS equivalent.

    Only the first ``://data.gdeltproject.org/`` occurrence is rewritten to
    ``://storage.googleapis.com/data.gdeltproject.org/``; a URL that does not
    contain it is returned unchanged.
    """
    before, marker, after = url.partition("://data.gdeltproject.org/")
    if not marker:
        return url
    return before + "://storage.googleapis.com/data.gdeltproject.org/" + after
def _download_gdelt_export(url):
    """Download a single GDELT export file, return bytes or None.

    The URL is rewritten with ``_gcs_direct_gdelt_url`` before it is fetched.
    A missing export is tolerated, so failures are only logged at debug level
    and reported to the caller as ``None``.

    Args:
        url: Export URL on data.gdeltproject.org.

    Returns:
        The response body for an HTTP 200, otherwise ``None``.
    """
    try:
        res = fetch_with_curl(_gcs_direct_gdelt_url(url), timeout=15)
        if res.status_code == 200:
            return res.content
        logger.debug(f"GDELT export download returned {res.status_code} for {url}")
    except (ConnectionError, TimeoutError, OSError) as e:  # non-critical
        logger.debug(f"GDELT export download failed for {url}: {e}")
    return None
def _render_article_links(urls, headlines):
    """Return one HTML link per (url, headline) pair, concatenated.

    NOTE(review): the link template in this file was an empty string, so the
    markup below is a minimal reconstruction - adjust classes/styling to
    whatever the frontend expects.
    """
    import html as html_mod

    links = []
    for u, h in zip(urls, headlines):
        # Only http(s) links are emitted; anything else becomes about:blank.
        safe_url = u if u.startswith(("http://", "https://")) else "about:blank"
        # The URL comes from external data, so escape it for the attribute
        # context as well as escaping the visible text.
        safe_href = html_mod.escape(safe_url, quote=True)
        safe_h = html_mod.escape(h)
        links.append(
            f'<div><a href="{safe_href}" target="_blank" rel="noopener noreferrer">{safe_h}</a></div>'
        )
    return "".join(links)


def _collect_headlines(urls, fetched_titles):
    """Return parallel ``(headlines, snippets)`` lists for ``urls``.

    A fetched title is used when one is available; otherwise the headline is
    derived from the URL slug. Snippets come from ``_article_snippet_cache``
    and default to an empty string.
    """
    headlines = []
    snippets = []
    for u in urls:
        real_title = fetched_titles.get(u) if fetched_titles else None
        headlines.append(real_title if real_title else _url_to_headline(u))
        snippets.append(_article_snippet_cache.get(u) or "")
    return headlines, snippets


def _build_feature_html(features, fetched_titles=None):
    """Build URL + headline arrays for frontend rendering.

    Uses fetched_titles (real article titles) when available, falls back to
    URL slug parsing. Each feature is modified in place: the internal
    ``_urls``, ``_domains``, ``_actors_set`` and ``_loc_key`` entries are
    removed and replaced by ``actors`` (at most 6, sorted), ``_urls_list``,
    ``_headlines_list``, ``_snippets_list`` and ``html``.

    Args:
        features: Feature dicts produced by the GDELT parser.
        fetched_titles: Optional mapping of URL -> real article title.
    """
    import html as html_mod

    for f in features:
        props = f["properties"]
        urls = props.pop("_urls", [])
        props.pop("_domains", None)
        # Convert actors set to sorted list for JSON serialization
        actors_set = props.pop("_actors_set", set())
        if actors_set:
            props["actors"] = sorted(actors_set)[:6]

        headlines, snippets = _collect_headlines(urls, fetched_titles)
        props["_urls_list"] = urls
        props["_headlines_list"] = headlines
        props["_snippets_list"] = snippets
        if urls:
            props["html"] = _render_article_links(urls, headlines)
        else:
            props["html"] = html_mod.escape(props["name"])
        f.pop("_loc_key", None)


def _enrich_gdelt_titles_background(features, all_article_urls):
    """Background thread: fetch real article titles then update features in-place.

    Features that carry no URLs are left untouched. Any exception is logged
    and swallowed so it stays inside the thread.

    Args:
        features: Feature dicts already processed by ``_build_feature_html``.
        all_article_urls: Every article URL referenced by ``features``.
    """
    try:
        logger.info(f"[BG] Fetching real article titles for {len(all_article_urls)} URLs...")
        fetched_titles = _batch_fetch_titles(all_article_urls)
        fetched_count = sum(1 for v in fetched_titles.values() if v)
        logger.info(f"[BG] Resolved {fetched_count}/{len(all_article_urls)} article titles")
        # Update features in-place with real titles and snippets
        for f in features:
            props = f["properties"]
            urls = props.get("_urls_list", [])
            if not urls:
                continue
            headlines, snippets = _collect_headlines(urls, fetched_titles)
            props["_headlines_list"] = headlines
            props["_snippets_list"] = snippets
            props["html"] = _render_article_links(urls, headlines)
        logger.info("[BG] GDELT title enrichment complete")
    except Exception as e:
        logger.error(f"[BG] GDELT title enrichment failed: {e}")
def fetch_global_military_incidents():
    """
    Build the global military/conflict incident list from GDELT Events exports.

    Reads ``lastupdate.txt`` to find the newest 15-minute export, derives the
    URLs of the 48 most recent exports (about 12 hours), downloads them in
    parallel and merges the conflict events by location.

    Returns immediately with URL-slug headlines; a daemon thread then enriches
    the same feature dicts in place with real article titles. Returns ``[]``
    when the index cannot be read or one of the handled errors occurs.
    """
    import re
    import threading
    from concurrent.futures import ThreadPoolExecutor
    from datetime import timedelta

    try:
        logger.info("Fetching GDELT events via export CDN (multi-file)...")
        # The index is fetched over HTTPS from the GCS-direct URL: the GDELT
        # hostname is served with a *.storage.googleapis.com certificate that
        # does not cover it, and HTTPS keeps network observers from injecting
        # forged export records.
        index_res = fetch_with_curl(
            _gcs_direct_gdelt_url("https://data.gdeltproject.org/gdeltv2/lastupdate.txt"),
            timeout=10,
        )
        if index_res.status_code != 200:
            logger.error(f"GDELT lastupdate failed: {index_res.status_code}")
            return []

        # Each index line is whitespace-separated; the third field is a URL.
        index_rows = (entry.strip().split() for entry in index_res.text.strip().split("\n"))
        latest_url = next(
            (
                cols[2]
                for cols in index_rows
                if len(cols) >= 3 and cols[2].endswith(".export.CSV.zip")
            ),
            None,
        )
        if not latest_url:
            logger.error("Could not find GDELT export URL")
            return []

        # Only the timestamp is taken from the listed URL; the scheme in
        # lastupdate.txt is ignored and every download URL is rebuilt as https.
        ts_match = re.search(r"(\d{14})\.export\.CSV\.zip", latest_url)
        if not ts_match:
            logger.error("Could not parse GDELT export timestamp")
            return []
        latest_ts = datetime.strptime(ts_match.group(1), "%Y%m%d%H%M%S")

        # Last 12 hours: 48 files at 15-minute intervals, newest first.
        file_count = 48
        file_names = [
            (latest_ts - timedelta(minutes=15 * step)).strftime("%Y%m%d%H%M%S")
            + ".export.CSV.zip"
            for step in range(file_count)
        ]
        urls = [f"https://data.gdeltproject.org/gdeltv2/{fname}" for fname in file_names]
        logger.info(f"Downloading {len(urls)} GDELT export files...")

        # Download in parallel (8 threads)
        with ThreadPoolExecutor(max_workers=8) as pool:
            zip_results = list(pool.map(_download_gdelt_export, urls))
        successful = sum(payload is not None for payload in zip_results)
        logger.info(f"Downloaded {successful}/{len(urls)} GDELT exports")

        # Parse every archive that came back with content.
        conflict_roots = {"13", "14", "15", "16", "17", "18", "19", "20"}
        features = []
        seen_locs = set()
        loc_index = {}  # loc_key -> index in features
        for payload in filter(None, zip_results):
            _parse_gdelt_export_zip(payload, conflict_roots, seen_locs, features, loc_index)

        # Collect the unique article URLs before _build_feature_html removes "_urls".
        all_article_urls = {
            link for feat in features for link in feat["properties"].get("_urls", []) if link
        }

        # Build HTML immediately with URL-slug headlines (instant, no network)
        _build_feature_html(features)
        logger.info(
            f"GDELT parsed: {len(features)} conflict locations from {successful} files (titles enriching in background)"
        )

        # The features list is shared: the background thread updates it in place.
        enricher = threading.Thread(
            target=_enrich_gdelt_titles_background,
            args=(features, all_article_urls),
            daemon=True,
        )
        enricher.start()
        return features
    except (
        requests.RequestException,
        ConnectionError,
        TimeoutError,
        ValueError,
        KeyError,
        OSError,
    ) as e:
        logger.error(f"Error fetching GDELT data: {e}")
        return []