From 9c5a4054f6a2833b2dc9d16838d20a2be15a1e62 Mon Sep 17 00:00:00 2001
From: TheYellowBeanieGuy <114354481+YellowBeanie@users.noreply.github.com>
Date: Tue, 16 Jun 2026 02:23:25 +0200
Subject: [PATCH] fix(gdelt): stop background thread mutating already-published
 features (dictionary changed size during iteration) (#388)

* fix(gdelt): publish enriched copies instead of mutating live features

_enrich_gdelt_titles_background ran in a daemon thread that mutated the
nested properties dicts of GDELT features already published into
latest_data[gdelt]. HTTP readers hold live references to those dicts and
serialize them outside the data lock, so the in-place mutation raced the
serializer and raised RuntimeError: dictionary changed size during
iteration on /api/live-data/slow and /api/bootstrap/critical.

Enrich deep copies instead and atomically swap the top-level key under
_data_lock, with an identity guard so a newer fetch_gdelt() is not clobbered.
Honors the replace-don't-mutate contract documented in fetchers/_store.py.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

* test(gdelt): regression test for background enrichment isolation

Asserts _enrich_gdelt_titles_background does not mutate already-published features and instead atomically swaps latest_data["gdelt"] with enriched copies (with the identity guard). Locks in the fix for the dictionary-changed-size race.

---------

Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
---
 backend/services/geopolitics.py               | 71 +++++++++++++------
 .../tests/test_gdelt_enrichment_isolation.py  | 71 +++++++++++++++++++
 2 files changed, 120 insertions(+), 22 deletions(-)
 create mode 100644 backend/tests/test_gdelt_enrichment_isolation.py

diff --git a/backend/services/geopolitics.py b/backend/services/geopolitics.py
index ee0fd11..8b18d71 100644
--- a/backend/services/geopolitics.py
+++ b/backend/services/geopolitics.py
@@ -606,8 +606,19 @@ def _build_feature_html(features, fetched_titles=None):
 
 
 def _enrich_gdelt_titles_background(features, all_article_urls):
-    """Background thread: fetch real article titles then update features in-place."""
+    """Background thread: fetch real article titles, then publish enriched COPIES.
+
+    The ``features`` handed to us were already published into
+    ``latest_data["gdelt"]`` by ``fetch_gdelt()``. Per the store's thread-safety
+    contract (see ``get_latest_data_subset_refs`` in fetchers/_store.py), HTTP
+    readers hold live references to these nested ``properties`` dicts and
+    serialize them OUTSIDE the data lock. Mutating the published dicts in place
+    here races that serialization and raises
+    ``RuntimeError: dictionary changed size during iteration``. So we enrich
+    copies and atomically swap the top-level key under the lock instead.
+    """
     import html as html_mod
+    from services.fetchers._store import latest_data, _data_lock, _mark_fresh
 
     try:
         logger.info(f"[BG] Fetching real article titles for {len(all_article_urls)} URLs...")
@@ -615,28 +626,44 @@ def _enrich_gdelt_titles_background(features, all_article_urls):
         fetched_count = sum(1 for v in fetched_titles.values() if v)
         logger.info(f"[BG] Resolved {fetched_count}/{len(all_article_urls)} article titles")
 
-        # Update features in-place with real titles and snippets
+        # Build enriched copies — never touch the already-published objects.
+        enriched = []
         for f in features:
-            urls = f["properties"].get("_urls_list", [])
-            if not urls:
-                continue
-            headlines = []
-            snippets = []
-            for u in urls:
-                real_title = fetched_titles.get(u)
-                headlines.append(real_title if real_title else _url_to_headline(u))
-                snippets.append(_article_snippet_cache.get(u) or "")
-            f["properties"]["_headlines_list"] = headlines
-            f["properties"]["_snippets_list"] = snippets
-            links = []
-            for u, h in zip(urls, headlines):
-                safe_url = u if u.startswith(("http://", "https://")) else "about:blank"
-                safe_h = html_mod.escape(h)
-                links.append(
-                    f'<div style="margin-bottom:6px;"><a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_h}</a></div>'
-                )
-            f["properties"]["html"] = "".join(links)
-        logger.info(f"[BG] GDELT title enrichment complete")
+            nf = dict(f)
+            props = dict(f.get("properties", {}))
+            urls = props.get("_urls_list", [])
+            if urls:
+                headlines = []
+                snippets = []
+                for u in urls:
+                    real_title = fetched_titles.get(u)
+                    headlines.append(real_title if real_title else _url_to_headline(u))
+                    snippets.append(_article_snippet_cache.get(u) or "")
+                props["_headlines_list"] = headlines
+                props["_snippets_list"] = snippets
+                links = []
+                for u, h in zip(urls, headlines):
+                    safe_url = u if u.startswith(("http://", "https://")) else "about:blank"
+                    safe_h = html_mod.escape(h)
+                    links.append(
+                        f'<div style="margin-bottom:6px;"><a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_h}</a></div>'
+                    )
+                props["html"] = "".join(links)
+            nf["properties"] = props
+            enriched.append(nf)
+
+        # Atomically publish — but only if a newer fetch_gdelt() hasn't already
+        # replaced the layer while we were fetching titles (identity guard).
+        published = False
+        with _data_lock:
+            if latest_data.get("gdelt") is features:
+                latest_data["gdelt"] = enriched
+                published = True
+        if published:
+            _mark_fresh("gdelt")
+            logger.info(f"[BG] GDELT title enrichment complete ({len(enriched)} features)")
+        else:
+            logger.info("[BG] GDELT layer changed under us; skipping stale enrichment swap")
     except Exception as e:
         logger.error(f"[BG] GDELT title enrichment failed: {e}")
 
diff --git a/backend/tests/test_gdelt_enrichment_isolation.py b/backend/tests/test_gdelt_enrichment_isolation.py
new file mode 100644
index 0000000..66d16aa
--- /dev/null
+++ b/backend/tests/test_gdelt_enrichment_isolation.py
@@ -0,0 +1,71 @@
+"""Regression tests for the GDELT background title enrichment.
+
+The background enrichment thread used to mutate the nested ``properties`` dicts
+of GDELT features *after* they were already published into
+``latest_data["gdelt"]``. HTTP readers serialize those dicts outside the data
+lock, so the in-place mutation raced the serializer and raised
+``RuntimeError: dictionary changed size during iteration``.
+
+These tests pin the contract: the enrichment must NOT touch the
+already-published feature objects, and must instead publish enriched copies via
+an atomic swap (with an identity guard so a newer fetch is not clobbered).
+"""
+
+from services.fetchers import _store
+from services import geopolitics
+
+
+def _make_feature():
+    return {
+        "type": "Feature",
+        "geometry": {"type": "Point", "coordinates": [0.0, 0.0]},
+        "properties": {"name": "loc", "_urls_list": ["http://example.test/article-1"]},
+    }
+
+
+def test_enrichment_does_not_mutate_published_features(monkeypatch):
+    feature = _make_feature()
+    features = [feature]
+    with _store._data_lock:
+        _store.latest_data["gdelt"] = features
+
+    monkeypatch.setattr(
+        geopolitics,
+        "_batch_fetch_titles",
+        lambda urls: {"http://example.test/article-1": "Real Headline"},
+    )
+
+    geopolitics._enrich_gdelt_titles_background(features, {"http://example.test/article-1"})
+
+    # The originally-published feature object must be untouched (no in-place
+    # mutation of its properties dict — that was the source of the crash).
+    assert "_headlines_list" not in feature["properties"]
+    assert "_snippets_list" not in feature["properties"]
+
+    # The layer must have been atomically replaced with an enriched COPY.
+    published = _store.latest_data["gdelt"]
+    assert published is not features
+    assert published[0] is not feature
+    assert published[0]["properties"]["_headlines_list"] == ["Real Headline"]
+
+
+def test_enrichment_skips_swap_when_layer_replaced(monkeypatch):
+    feature = _make_feature()
+    features = [feature]
+
+    # Simulate a newer fetch_gdelt() having already replaced the layer while the
+    # background thread was still resolving titles.
+    sentinel = [{"properties": {"name": "newer"}}]
+    with _store._data_lock:
+        _store.latest_data["gdelt"] = sentinel
+
+    monkeypatch.setattr(
+        geopolitics,
+        "_batch_fetch_titles",
+        lambda urls: {"http://example.test/article-1": "Real Headline"},
+    )
+
+    geopolitics._enrich_gdelt_titles_background(features, {"http://example.test/article-1"})
+
+    # The identity guard must prevent clobbering the newer layer.
+    assert _store.latest_data["gdelt"] is sentinel