v0.7.0: performance hardening — parallel fetches, deferred icons, AIS stability

Optimizations:
- Parallelized yfinance stock/oil fetches via ThreadPoolExecutor (~2s vs ~8s)
- AIS backoff reset after 200 successes; removed hot-loop pruning (lock contention)
- Single-pass ETag serialization (was double-serializing JSON)
- Deferred ~50 non-critical map icons via setTimeout(0)
- News feed animation capped at 15 items (was 100+ simultaneous)
- heapq.nlargest() for FIRMS fires (60K→5K) and internet outages
- Removed satellite duplication from fast endpoint
- Geopolitics interval 5min → 30min
- Ship counts single-pass memoized; color maps module-level constants
- Improved GDELT URL-to-headline extraction (skip gibberish slugs)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Former-commit-id: 4a14a2f078
This commit is contained in:
anoracleofra-code
2026-03-11 06:25:31 -06:00
parent 10f376d4d7
commit cf68f1978d
8 changed files with 511 additions and 221 deletions
+41 -9
View File
@@ -1,12 +1,44 @@
{
"feeds": [
{ "name": "NPR", "url": "https://feeds.npr.org/1004/rss.xml", "weight": 4 },
{ "name": "BBC", "url": "http://feeds.bbci.co.uk/news/world/rss.xml", "weight": 3 },
{ "name": "AlJazeera", "url": "https://www.aljazeera.com/xml/rss/all.xml", "weight": 2 },
{ "name": "NYT", "url": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml", "weight": 1 },
{ "name": "GDACS", "url": "https://www.gdacs.org/xml/rss.xml", "weight": 5 },
{ "name": "NHK", "url": "https://www3.nhk.or.jp/nhkworld/rss/world.xml", "weight": 3 },
{ "name": "CNA", "url": "https://www.channelnewsasia.com/rssfeed/8395986", "weight": 3 },
{ "name": "Mercopress", "url": "https://en.mercopress.com/rss/", "weight": 3 }
{
"name": "NPR",
"url": "https://feeds.npr.org/1004/rss.xml",
"weight": 4
},
{
"name": "BBC",
"url": "http://feeds.bbci.co.uk/news/world/rss.xml",
"weight": 3
},
{
"name": "AlJazeera",
"url": "https://www.aljazeera.com/xml/rss/all.xml",
"weight": 2
},
{
"name": "NYT",
"url": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
"weight": 1
},
{
"name": "GDACS",
"url": "https://www.gdacs.org/xml/rss.xml",
"weight": 5
},
{
"name": "NHK",
"url": "https://www3.nhk.or.jp/nhkworld/rss/world.xml",
"weight": 3
},
{
"name": "CNA",
"url": "https://www.channelnewsasia.com/rssfeed/8395986",
"weight": 3
},
{
"name": "Mercopress",
"url": "https://en.mercopress.com/rss/",
"weight": 3
}
]
}
}
+11 -24
View File
@@ -77,6 +77,15 @@ async def force_refresh():
async def live_data():
return get_latest_data()
def _etag_response(request: Request, payload: dict, prefix: str = "", default=None):
"""Serialize once, hash the bytes for ETag, return 304 or full response."""
content = json_mod.dumps(payload, default=default)
etag = hashlib.md5(f"{prefix}{content[:256]}".encode()).hexdigest()[:16]
if request.headers.get("if-none-match") == etag:
return Response(status_code=304, headers={"ETag": etag, "Cache-Control": "no-cache"})
return Response(content=content, media_type="application/json",
headers={"ETag": etag, "Cache-Control": "no-cache"})
@app.get("/api/live-data/fast")
async def live_data_fast(request: Request):
d = get_latest_data()
@@ -87,25 +96,13 @@ async def live_data_fast(request: Request):
"private_jets": d.get("private_jets", []),
"tracked_flights": d.get("tracked_flights", []),
"ships": d.get("ships", []),
"satellites": d.get("satellites", []),
"cctv": d.get("cctv", []),
"uavs": d.get("uavs", []),
"liveuamap": d.get("liveuamap", []),
"gps_jamming": d.get("gps_jamming", []),
"freshness": dict(source_timestamps),
}
# ETag includes last_updated timestamp so it changes on every data refresh,
# not just when item counts change (old bug: positions went stale)
last_updated = d.get("last_updated", "")
counts = "|".join(f"{k}:{len(v) if isinstance(v, list) else 0}" for k, v in payload.items() if k != "freshness")
etag = hashlib.md5(f"{last_updated}|{counts}".encode()).hexdigest()[:16]
if request.headers.get("if-none-match") == etag:
return Response(status_code=304, headers={"ETag": etag, "Cache-Control": "no-cache"})
return Response(
content=json_mod.dumps(payload),
media_type="application/json",
headers={"ETag": etag, "Cache-Control": "no-cache"}
)
return _etag_response(request, payload, prefix="fast|")
@app.get("/api/live-data/slow")
async def live_data_slow(request: Request):
@@ -129,17 +126,7 @@ async def live_data_slow(request: Request):
"datacenters": d.get("datacenters", []),
"freshness": dict(source_timestamps),
}
# ETag based on last_updated + item counts
last_updated = d.get("last_updated", "")
counts = "|".join(f"{k}:{len(v) if isinstance(v, list) else 0}" for k, v in payload.items() if k != "freshness")
etag = hashlib.md5(f"slow|{last_updated}|{counts}".encode()).hexdigest()[:16]
if request.headers.get("if-none-match") == etag:
return Response(status_code=304, headers={"ETag": etag, "Cache-Control": "no-cache"})
return Response(
content=json_mod.dumps(payload, default=str),
media_type="application/json",
headers={"ETag": etag, "Cache-Control": "no-cache"}
)
return _etag_response(request, payload, prefix="slow|", default=str)
@app.get("/api/debug-latest")
async def debug_latest_data():
+27 -24
View File
@@ -238,49 +238,51 @@ def _ais_stream_loop():
logger.info("AIS Stream proxy started — receiving vessel data")
msg_count = 0
ok_streak = 0 # Track consecutive successful messages for backoff reset
last_log_time = time.time()
for raw_msg in iter(process.stdout.readline, ''):
if not _ws_running:
process.terminate()
break
raw_msg = raw_msg.strip()
if not raw_msg:
continue
try:
data = json.loads(raw_msg)
except json.JSONDecodeError:
continue
if "error" in data:
logger.error(f"AIS Stream error: {data['error']}")
continue
msg_type = data.get("MessageType", "")
metadata = data.get("MetaData", {})
message = data.get("Message", {})
mmsi = metadata.get("MMSI", 0)
if not mmsi:
continue
with _vessels_lock:
if mmsi not in _vessels:
_vessels[mmsi] = {"_updated": time.time()}
vessel = _vessels[mmsi]
# Update position from PositionReport or StandardClassBPositionReport
if msg_type in ("PositionReport", "StandardClassBPositionReport"):
report = message.get(msg_type, {})
lat = report.get("Latitude", metadata.get("latitude", 0))
lng = report.get("Longitude", metadata.get("longitude", 0))
# Skip invalid positions
if lat == 0 and lng == 0:
continue
if abs(lat) > 90 or abs(lng) > 180:
continue
with _vessels_lock:
vessel["lat"] = lat
vessel["lng"] = lng
@@ -292,12 +294,12 @@ def _ais_stream_loop():
# Use metadata name if we don't have one yet
if not vessel.get("name") or vessel["name"] == "UNKNOWN":
vessel["name"] = metadata.get("ShipName", "UNKNOWN").strip() or "UNKNOWN"
# Update static data from ShipStaticData
elif msg_type == "ShipStaticData":
static = message.get("ShipStaticData", {})
ais_type = static.get("Type", 0)
with _vessels_lock:
vessel["name"] = (static.get("Name", "") or metadata.get("ShipName", "UNKNOWN")).strip() or "UNKNOWN"
vessel["callsign"] = (static.get("CallSign", "") or "").strip()
@@ -306,21 +308,24 @@ def _ais_stream_loop():
vessel["ais_type_code"] = ais_type
vessel["type"] = classify_vessel(ais_type, mmsi)
vessel["_updated"] = time.time()
msg_count += 1
if msg_count % 5000 == 0:
ok_streak += 1
# Reset backoff after 200 consecutive successful messages
if ok_streak >= 200 and backoff > 1:
backoff = 1
ok_streak = 0
# Periodic logging + cache save (time-based instead of count-based to avoid lock in hot loop)
now = time.time()
if now - last_log_time >= 60:
with _vessels_lock:
# Inline pruning: remove vessels not updated in 15 minutes
prune_cutoff = time.time() - 900
stale = [k for k, v in _vessels.items() if v.get("_updated", 0) < prune_cutoff]
for k in stale:
del _vessels[k]
count = len(_vessels)
if stale:
logger.info(f"AIS pruned {len(stale)} stale vessels")
logger.info(f"AIS Stream: processed {msg_count} messages, tracking {count} vessels")
_save_cache() # Auto-save every 5000 messages (~60 seconds)
_save_cache()
last_log_time = now
except Exception as e:
logger.error(f"AIS proxy connection error: {e}")
if _ws_running:
@@ -328,8 +333,6 @@ def _ais_stream_loop():
time.sleep(backoff)
backoff = min(backoff * 2, 60) # Double up to 60s max
continue
# Reset backoff on successful connection (got at least some messages)
backoff = 1
def _run_ais_loop():
+148 -45
View File
@@ -15,6 +15,7 @@ import threading
import io
from apscheduler.schedulers.background import BackgroundScheduler
import concurrent.futures
import heapq
from sgp4.api import Satrec, WGS72
from sgp4.api import jday
from datetime import datetime
@@ -81,6 +82,25 @@ opensky_client = OpenSkyClient(
last_opensky_fetch = 0
cached_opensky_flights = []
# ---------------------------------------------------------------------------
# Supplemental ADS-B sources for blind-spot gap-filling (Russia/China/Africa)
# These aggregators have different feeder pools than adsb.lol and can surface
# aircraft invisible to our primary source. Only gap-fill planes are kept.
# ---------------------------------------------------------------------------
_BLIND_SPOT_REGIONS = [
{"name": "Yekaterinburg", "lat": 56.8, "lon": 60.6, "radius_nm": 250},
{"name": "Novosibirsk", "lat": 55.0, "lon": 82.9, "radius_nm": 250},
{"name": "Krasnoyarsk", "lat": 56.0, "lon": 92.9, "radius_nm": 250},
{"name": "Vladivostok", "lat": 43.1, "lon": 131.9, "radius_nm": 250},
{"name": "Urumqi", "lat": 43.8, "lon": 87.6, "radius_nm": 250},
{"name": "Chengdu", "lat": 30.6, "lon": 104.1, "radius_nm": 250},
{"name": "Lagos-Accra", "lat": 6.5, "lon": 3.4, "radius_nm": 250},
{"name": "Addis Ababa", "lat": 9.0, "lon": 38.7, "radius_nm": 250},
]
_SUPPLEMENTAL_FETCH_INTERVAL = 120 # seconds — only query every 2 min
last_supplemental_fetch = 0
cached_supplemental_flights = []
# In-memory store
@@ -480,27 +500,31 @@ def fetch_news():
latest_data['news'] = news_items
_mark_fresh("news")
def _fetch_single_ticker(symbol: str, period: str = "2d"):
"""Fetch a single yfinance ticker. Returns (symbol, data_dict) or (symbol, None)."""
try:
ticker = yf.Ticker(symbol)
hist = ticker.history(period=period)
if len(hist) >= 1:
current_price = hist['Close'].iloc[-1]
prev_close = hist['Close'].iloc[0] if len(hist) > 1 else current_price
change_percent = ((current_price - prev_close) / prev_close) * 100 if prev_close else 0
return symbol, {
"price": round(float(current_price), 2),
"change_percent": round(float(change_percent), 2),
"up": bool(change_percent >= 0)
}
except Exception as e:
logger.warning(f"Could not fetch data for {symbol}: {e}")
return symbol, None
def fetch_defense_stocks():
tickers = ["RTX", "LMT", "NOC", "GD", "BA", "PLTR"]
stocks_data = {}
try:
for t in tickers:
try:
ticker = yf.Ticker(t)
hist = ticker.history(period="2d")
if len(hist) >= 1:
current_price = hist['Close'].iloc[-1]
prev_close = hist['Close'].iloc[0] if len(hist) > 1 else current_price
change_percent = ((current_price - prev_close) / prev_close) * 100 if prev_close else 0
stocks_data[t] = {
"price": round(float(current_price), 2),
"change_percent": round(float(change_percent), 2),
"up": bool(change_percent >= 0)
}
except Exception as e:
logger.warning(f"Could not fetch data for {t}: {e}")
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
results = pool.map(lambda t: _fetch_single_ticker(t, "2d"), tickers)
stocks_data = {sym: data for sym, data in results if data}
latest_data['stocks'] = stocks_data
_mark_fresh("stocks")
except Exception as e:
@@ -509,25 +533,10 @@ def fetch_defense_stocks():
def fetch_oil_prices():
# CL=F is Crude Oil, BZ=F is Brent Crude
tickers = {"WTI Crude": "CL=F", "Brent Crude": "BZ=F"}
oil_data = {}
try:
for name, symbol in tickers.items():
try:
ticker = yf.Ticker(symbol)
hist = ticker.history(period="5d")
if len(hist) >= 2:
current_price = hist['Close'].iloc[-1]
prev_close = hist['Close'].iloc[-2]
change_percent = ((current_price - prev_close) / prev_close) * 100 if prev_close else 0
oil_data[name] = {
"price": round(float(current_price), 2),
"change_percent": round(float(change_percent), 2),
"up": bool(change_percent >= 0)
}
except Exception as e:
logger.warning(f"Could not fetch data for {symbol}: {e}")
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
results = pool.map(lambda item: (_fetch_single_ticker(item[1], "5d")[1], item[0]), tickers.items())
oil_data = {name: data for data, name in results if data}
latest_data['oil'] = oil_data
_mark_fresh("oil")
except Exception as e:
@@ -612,6 +621,87 @@ _HELI_TYPES_BACKEND = {
"B47G", "HUEY", "GAMA", "CABR", "EXE",
}
def _fetch_supplemental_sources(seen_hex: set) -> list:
"""Fetch from airplanes.live and adsb.fi to fill blind-spot gaps.
Only returns aircraft whose ICAO hex is NOT already in seen_hex.
Throttled to run every _SUPPLEMENTAL_FETCH_INTERVAL seconds.
Fully wrapped in try/except — returns [] on any failure.
"""
global last_supplemental_fetch, cached_supplemental_flights
now = time.time()
if now - last_supplemental_fetch < _SUPPLEMENTAL_FETCH_INTERVAL:
# Return cached results, but still filter against current seen_hex
return [f for f in cached_supplemental_flights
if f.get("hex", "").lower().strip() not in seen_hex]
new_supplemental = []
supplemental_hex = set() # track hex within supplemental to avoid internal dupes
# --- airplanes.live (parallel, all hotspots) ---
def _fetch_airplaneslive(region):
try:
url = (f"https://api.airplanes.live/v2/point/"
f"{region['lat']}/{region['lon']}/{region['radius_nm']}")
res = fetch_with_curl(url, timeout=10)
if res.status_code == 200:
data = res.json()
return data.get("ac", [])
except Exception as e:
logger.debug(f"airplanes.live {region['name']} failed: {e}")
return []
try:
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
results = list(pool.map(_fetch_airplaneslive, _BLIND_SPOT_REGIONS))
for region_flights in results:
for f in region_flights:
h = f.get("hex", "").lower().strip()
if h and h not in seen_hex and h not in supplemental_hex:
f["supplemental_source"] = "airplanes.live"
new_supplemental.append(f)
supplemental_hex.add(h)
except Exception as e:
logger.warning(f"airplanes.live supplemental fetch failed: {e}")
ap_count = len(new_supplemental)
# --- adsb.fi (sequential, 1.1s between requests to respect 1 req/sec limit) ---
try:
for region in _BLIND_SPOT_REGIONS:
try:
url = (f"https://opendata.adsb.fi/api/v3/lat/"
f"{region['lat']}/lon/{region['lon']}/dist/{region['radius_nm']}")
res = fetch_with_curl(url, timeout=10)
if res.status_code == 200:
data = res.json()
for f in data.get("ac", []):
h = f.get("hex", "").lower().strip()
if h and h not in seen_hex and h not in supplemental_hex:
f["supplemental_source"] = "adsb.fi"
new_supplemental.append(f)
supplemental_hex.add(h)
except Exception as e:
logger.debug(f"adsb.fi {region['name']} failed: {e}")
time.sleep(1.1) # Rate limit: 1 req/sec
except Exception as e:
logger.warning(f"adsb.fi supplemental fetch failed: {e}")
fi_count = len(new_supplemental) - ap_count
cached_supplemental_flights = new_supplemental
last_supplemental_fetch = now
if new_supplemental:
_mark_fresh("supplemental_flights")
logger.info(f"Supplemental: +{len(new_supplemental)} new aircraft from blind-spot "
f"hotspots (airplanes.live: {ap_count}, adsb.fi: {fi_count})")
return new_supplemental
def fetch_flights():
# OpenSky Network public API for flights. We want to demonstrate global coverage.
flights = []
@@ -712,7 +802,22 @@ def fetch_flights():
all_adsb_flights.append(osf)
seen_hex.add(h.lower().strip())
# -------------------------------------------------------------------
# Supplemental Sources: airplanes.live + adsb.fi (blind-spot gap-fill)
# Only adds aircraft whose ICAO hex is NOT already in seen_hex.
# -------------------------------------------------------------------
try:
gap_fill = _fetch_supplemental_sources(seen_hex)
for f in gap_fill:
all_adsb_flights.append(f)
h = f.get("hex", "").lower().strip()
if h:
seen_hex.add(h)
if gap_fill:
logger.info(f"Gap-fill: added {len(gap_fill)} aircraft to pipeline")
except Exception as e:
logger.warning(f"Supplemental source fetch failed (non-fatal): {e}")
if all_adsb_flights:
# The user requested maximum flight density. Rendering all available aircraft.
@@ -1333,9 +1438,8 @@ def fetch_firms_fires():
})
except (ValueError, TypeError):
continue
# Sort by FRP descending, keep top 5000 (most intense fires first)
all_rows.sort(key=lambda x: x["frp"], reverse=True)
fires = all_rows[:5000]
# Keep top 5000 by FRP (most intense fires first) — heapq is O(n) vs O(n log n) sort
fires = heapq.nlargest(5000, all_rows, key=lambda x: x["frp"])
logger.info(f"FIRMS fires: {len(fires)} hotspots (from {response.status_code})")
except Exception as e:
logger.error(f"Error fetching FIRMS fires: {e}")
@@ -1471,9 +1575,8 @@ def fetch_internet_outages():
r["lat"] = coords[0]
r["lng"] = coords[1]
geocoded.append(r)
# Sort by severity descending, cap at 100
geocoded.sort(key=lambda x: x["severity"], reverse=True)
outages = geocoded[:100]
# Keep top 100 by severity
outages = heapq.nlargest(100, geocoded, key=lambda x: x["severity"])
logger.info(f"Internet outages: {len(outages)} regions affected")
except Exception as e:
logger.error(f"Error fetching internet outages: {e}")
@@ -2219,8 +2322,8 @@ def start_scheduler():
scheduler.add_job(update_liveuamap, 'date', run_date=datetime.now())
scheduler.add_job(update_liveuamap, 'interval', hours=12)
# Geopolitics (frontlines) more frequently than other slow data
scheduler.add_job(fetch_geopolitics, 'interval', minutes=5)
# Geopolitics (frontlines) aligned with slow-data tier
scheduler.add_job(fetch_geopolitics, 'interval', minutes=30)
scheduler.start()
+153 -27
View File
@@ -86,8 +86,10 @@ def _extract_domain(url):
def _url_to_headline(url):
"""Extract a human-readable headline from a URL path.
e.g. 'https://nytimes.com/2026/03/us-strikes-iran-nuclear-sites.html' -> 'Us Strikes Iran Nuclear Sites (nytimes.com)'
e.g. 'https://nytimes.com/2026/03/us-strikes-iran-nuclear-sites.html' -> 'Us Strikes Iran Nuclear Sites'
Falls back to domain name if the URL slug is gibberish (hex IDs, UUIDs, etc.).
"""
import re
try:
from urllib.parse import urlparse, unquote
parsed = urlparse(url)
@@ -100,43 +102,151 @@ def _url_to_headline(url):
if not path:
return domain
# Take the last path segment (usually the slug)
slug = path.split('/')[-1]
# Remove file extensions
for ext in ['.html', '.htm', '.php', '.asp', '.aspx', '.shtml']:
if slug.lower().endswith(ext):
slug = slug[:-len(ext)]
# If slug is purely numeric or a short ID, try the second-to-last segment
import re
if re.match(r'^[a-z]?\d{5,}$', slug, re.IGNORECASE):
segments = path.split('/')
if len(segments) >= 2:
slug = segments[-2]
for ext in ['.html', '.htm', '.php']:
if slug.lower().endswith(ext):
slug = slug[:-len(ext)]
# Try the last path segment first, then walk backwards
segments = [s for s in path.split('/') if s]
slug = ''
for seg in reversed(segments):
# Remove file extensions
for ext in ['.html', '.htm', '.php', '.asp', '.aspx', '.shtml']:
if seg.lower().endswith(ext):
seg = seg[:-len(ext)]
# Skip segments that are clearly not headlines
if _is_gibberish(seg):
continue
slug = seg
break
if not slug:
return domain
# Remove common ID patterns at start/end
slug = re.sub(r'^[\d]+-', '', slug) # leading numbers like "13847569-"
slug = re.sub(r'-[\da-f]{6,}$', '', slug) # trailing hex IDs
slug = re.sub(r'[-_]c-\d+$', '', slug) # trailing "-c-21803431"
slug = re.sub(r'^p=\d+$', '', slug) # WordPress ?p=1234
slug = re.sub(r'^[\d]+-', '', slug) # leading "13847569-"
slug = re.sub(r'-[\da-f]{6,}$', '', slug) # trailing hex IDs
slug = re.sub(r'[-_]c-\d+$', '', slug) # trailing "-c-21803431"
slug = re.sub(r'^p=\d+$', '', slug) # WordPress ?p=1234
# Convert slug separators to spaces
slug = slug.replace('-', ' ').replace('_', ' ')
# Clean up multiple spaces
slug = re.sub(r'\s+', ' ', slug).strip()
# If slug is still just a number or too short, fall back to domain
if len(slug) < 5 or re.match(r'^\d+$', slug):
# Final gibberish check after cleanup
if len(slug) < 8 or _is_gibberish(slug.replace(' ', '-')):
return domain
# Title case and truncate
headline = slug.title()
if len(headline) > 80:
headline = headline[:77] + '...'
return f"{headline} ({domain})"
if len(headline) > 90:
headline = headline[:87] + '...'
return headline
except Exception:
return url[:60]
def _is_gibberish(text):
"""Detect if a URL segment is gibberish (hex IDs, UUIDs, numeric IDs, etc.)
rather than a real human-readable slug like 'us-strikes-iran'."""
import re
t = text.strip()
if not t:
return True
# Pure numbers
if re.match(r'^\d+$', t):
return True
# UUID pattern (with or without dashes)
if re.match(r'^[0-9a-f]{8}[_-]?[0-9a-f]{4}[_-]?[0-9a-f]{4}[_-]?[0-9a-f]{4}[_-]?[0-9a-f]{12}$', t, re.I):
return True
# Hex-heavy string: more than 40% hex digits among alphanumeric chars
alnum = re.sub(r'[^a-zA-Z0-9]', '', t)
if alnum:
hex_chars = sum(1 for c in alnum if c in '0123456789abcdefABCDEF')
if hex_chars / len(alnum) > 0.4 and len(alnum) > 6:
return True
# Mostly digits with a few alpha (like "article8efa6c53")
digits = sum(1 for c in alnum if c.isdigit())
if alnum and digits / len(alnum) > 0.5:
return True
# Too short to be a headline slug
if len(t) < 5:
return True
# Query-param style segments
if '=' in t:
return True
return False
# Persistent cache for article titles — survives across GDELT cache refreshes
_article_title_cache = {}
def _fetch_article_title(url):
"""Fetch the real headline from an article's HTML <title> or og:title tag.
Returns the title string, or None if it can't be fetched.
Uses a persistent cache to avoid refetching."""
if url in _article_title_cache:
return _article_title_cache[url]
import re
try:
# Only read the first 32KB — the <title> is always in <head>
resp = requests.get(url, timeout=4, headers={
'User-Agent': 'Mozilla/5.0 (compatible; OSINT Dashboard/1.0)'
}, stream=True)
if resp.status_code != 200:
_article_title_cache[url] = None
return None
chunk = resp.raw.read(32768).decode('utf-8', errors='replace')
resp.close()
title = None
# Try og:title first (usually the cleanest)
og_match = re.search(r'<meta[^>]+property=["\']og:title["\'][^>]+content=["\']([^"\'>]+)["\']', chunk, re.I)
if not og_match:
og_match = re.search(r'<meta[^>]+content=["\']([^"\'>]+)["\'][^>]+property=["\']og:title["\']', chunk, re.I)
if og_match:
title = og_match.group(1).strip()
# Fall back to <title> tag
if not title:
title_match = re.search(r'<title[^>]*>([^<]+)</title>', chunk, re.I)
if title_match:
title = title_match.group(1).strip()
if title:
# Clean up HTML entities
import html as html_mod
title = html_mod.unescape(title)
# Remove site name suffixes like " | CNN" or " - BBC News"
title = re.sub(r'\s*[|\-–—]\s*[^|\-–—]{2,30}$', '', title).strip()
# Truncate very long titles
if len(title) > 120:
title = title[:117] + '...'
if len(title) > 10:
_article_title_cache[url] = title
return title
_article_title_cache[url] = None
return None
except Exception:
_article_title_cache[url] = None
return None
def _batch_fetch_titles(urls):
"""Fetch real article titles for a list of URLs in parallel.
Returns a dict of url -> title (or None if fetch failed)."""
from concurrent.futures import ThreadPoolExecutor
results = {}
with ThreadPoolExecutor(max_workers=16) as executor:
futures = {executor.submit(_fetch_article_title, u): u for u in urls}
for future in futures:
url = futures[future]
try:
results[url] = future.result()
except Exception:
results[url] = None
return results
def _parse_gdelt_export_zip(zip_bytes, conflict_codes, seen_locs, features, loc_index):
"""Parse a single GDELT export ZIP and append conflict features.
loc_index maps loc_key -> index in features list for fast duplicate merging.
@@ -278,11 +388,27 @@ def fetch_global_military_incidents():
if zip_bytes:
_parse_gdelt_export_zip(zip_bytes, CONFLICT_CODES, seen_locs, features, loc_index)
# Collect all unique article URLs for batch title fetching
all_article_urls = set()
for f in features:
for u in f["properties"].get("_urls", []):
if u:
all_article_urls.add(u)
logger.info(f"Fetching real article titles for {len(all_article_urls)} unique URLs...")
fetched_titles = _batch_fetch_titles(all_article_urls)
fetched_count = sum(1 for v in fetched_titles.values() if v)
logger.info(f"Resolved {fetched_count}/{len(all_article_urls)} article titles from HTML")
# Build URL + headline arrays for frontend rendering
for f in features:
urls = f["properties"].pop("_urls", [])
f["properties"].pop("_domains", None)
headlines = [_url_to_headline(u) for u in urls]
headlines = []
for u in urls:
# Try the real fetched title first, then fall back to URL slug parsing
real_title = fetched_titles.get(u)
headlines.append(real_title if real_title else _url_to_headline(u))
f["properties"]["_urls_list"] = urls
f["properties"]["_headlines_list"] = headlines
import html