fix(gdelt): stop background thread mutating already-published features (dictionary changed size during iteration) (#388)

* fix(gdelt): publish enriched copies instead of mutating live features

_enrich_gdelt_titles_background ran in a daemon thread that mutated the
nested properties dicts of GDELT features already published into
latest_data[gdelt]. HTTP readers hold live references to those dicts and
serialize them outside the data lock, so the in-place mutation raced the
serializer and raised RuntimeError: dictionary changed size during
iteration on /api/live-data/slow and /api/bootstrap/critical.

Enrich deep copies instead and atomically swap the top-level key under
_data_lock, with an identity guard so a newer fetch_gdelt() is not clobbered.
Honors the replace-don't-mutate contract documented in fetchers/_store.py.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

* test(gdelt): regression test for background enrichment isolation

Asserts _enrich_gdelt_titles_background does not mutate already-published features and instead atomically swaps latest_data["gdelt"] with enriched copies (with the identity guard). Locks in the fix for the dictionary-changed-size race.

---------

Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
TheYellowBeanieGuy
2026-06-16 02:23:25 +02:00
committed by GitHub
parent 71a2ef4ce7
commit 9c5a4054f6
2 changed files with 120 additions and 22 deletions
+49 -22
View File
@@ -606,8 +606,19 @@ def _build_feature_html(features, fetched_titles=None):
def _enrich_gdelt_titles_background(features, all_article_urls):
"""Background thread: fetch real article titles then update features in-place."""
"""Background thread: fetch real article titles, then publish enriched COPIES.
The ``features`` handed to us were already published into
``latest_data["gdelt"]`` by ``fetch_gdelt()``. Per the store's thread-safety
contract (see ``get_latest_data_subset_refs`` in fetchers/_store.py), HTTP
readers hold live references to these nested ``properties`` dicts and
serialize them OUTSIDE the data lock. Mutating the published dicts in place
here races that serialization and raises
``RuntimeError: dictionary changed size during iteration``. So we enrich
copies and atomically swap the top-level key under the lock instead.
"""
import html as html_mod
from services.fetchers._store import latest_data, _data_lock, _mark_fresh
try:
logger.info(f"[BG] Fetching real article titles for {len(all_article_urls)} URLs...")
@@ -615,28 +626,44 @@ def _enrich_gdelt_titles_background(features, all_article_urls):
fetched_count = sum(1 for v in fetched_titles.values() if v)
logger.info(f"[BG] Resolved {fetched_count}/{len(all_article_urls)} article titles")
# Update features in-place with real titles and snippets
# Build enriched copies — never touch the already-published objects.
enriched = []
for f in features:
urls = f["properties"].get("_urls_list", [])
if not urls:
continue
headlines = []
snippets = []
for u in urls:
real_title = fetched_titles.get(u)
headlines.append(real_title if real_title else _url_to_headline(u))
snippets.append(_article_snippet_cache.get(u) or "")
f["properties"]["_headlines_list"] = headlines
f["properties"]["_snippets_list"] = snippets
links = []
for u, h in zip(urls, headlines):
safe_url = u if u.startswith(("http://", "https://")) else "about:blank"
safe_h = html_mod.escape(h)
links.append(
f'<div style="margin-bottom:6px;"><a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_h}</a></div>'
)
f["properties"]["html"] = "".join(links)
logger.info(f"[BG] GDELT title enrichment complete")
nf = dict(f)
props = dict(f.get("properties", {}))
urls = props.get("_urls_list", [])
if urls:
headlines = []
snippets = []
for u in urls:
real_title = fetched_titles.get(u)
headlines.append(real_title if real_title else _url_to_headline(u))
snippets.append(_article_snippet_cache.get(u) or "")
props["_headlines_list"] = headlines
props["_snippets_list"] = snippets
links = []
for u, h in zip(urls, headlines):
safe_url = u if u.startswith(("http://", "https://")) else "about:blank"
safe_h = html_mod.escape(h)
links.append(
f'<div style="margin-bottom:6px;"><a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_h}</a></div>'
)
props["html"] = "".join(links)
nf["properties"] = props
enriched.append(nf)
# Atomically publish — but only if a newer fetch_gdelt() hasn't already
# replaced the layer while we were fetching titles (identity guard).
published = False
with _data_lock:
if latest_data.get("gdelt") is features:
latest_data["gdelt"] = enriched
published = True
if published:
_mark_fresh("gdelt")
logger.info(f"[BG] GDELT title enrichment complete ({len(enriched)} features)")
else:
logger.info("[BG] GDELT layer changed under us; skipping stale enrichment swap")
except Exception as e:
logger.error(f"[BG] GDELT title enrichment failed: {e}")
@@ -0,0 +1,71 @@
"""Regression tests for the GDELT background title enrichment.
The background enrichment thread used to mutate the nested ``properties`` dicts
of GDELT features *after* they were already published into
``latest_data["gdelt"]``. HTTP readers serialize those dicts outside the data
lock, so the in-place mutation raced the serializer and raised
``RuntimeError: dictionary changed size during iteration``.
These tests pin the contract: the enrichment must NOT touch the
already-published feature objects, and must instead publish enriched copies via
an atomic swap (with an identity guard so a newer fetch is not clobbered).
"""
from services.fetchers import _store
from services import geopolitics
def _make_feature():
return {
"type": "Feature",
"geometry": {"type": "Point", "coordinates": [0.0, 0.0]},
"properties": {"name": "loc", "_urls_list": ["http://example.test/article-1"]},
}
def test_enrichment_does_not_mutate_published_features(monkeypatch):
feature = _make_feature()
features = [feature]
with _store._data_lock:
_store.latest_data["gdelt"] = features
monkeypatch.setattr(
geopolitics,
"_batch_fetch_titles",
lambda urls: {"http://example.test/article-1": "Real Headline"},
)
geopolitics._enrich_gdelt_titles_background(features, {"http://example.test/article-1"})
# The originally-published feature object must be untouched (no in-place
# mutation of its properties dict — that was the source of the crash).
assert "_headlines_list" not in feature["properties"]
assert "_snippets_list" not in feature["properties"]
# The layer must have been atomically replaced with an enriched COPY.
published = _store.latest_data["gdelt"]
assert published is not features
assert published[0] is not feature
assert published[0]["properties"]["_headlines_list"] == ["Real Headline"]
def test_enrichment_skips_swap_when_layer_replaced(monkeypatch):
feature = _make_feature()
features = [feature]
# Simulate a newer fetch_gdelt() having already replaced the layer while the
# background thread was still resolving titles.
sentinel = [{"properties": {"name": "newer"}}]
with _store._data_lock:
_store.latest_data["gdelt"] = sentinel
monkeypatch.setattr(
geopolitics,
"_batch_fetch_titles",
lambda urls: {"http://example.test/article-1": "Real Headline"},
)
geopolitics._enrich_gdelt_titles_background(features, {"http://example.test/article-1"})
# The identity guard must prevent clobbering the newer layer.
assert _store.latest_data["gdelt"] is sentinel