fix(gdelt): stop background thread mutating already-published features (dictionary changed size during iteration) (#388)

* fix(gdelt): publish enriched copies instead of mutating live features _enrich_gdelt_titles_background ran in a daemon thread that mutated the nested properties dicts of GDELT features already published into latest_data[gdelt]. HTTP readers hold live references to those dicts and serialize them outside the data lock, so the in-place mutation raced the serializer and raised RuntimeError: dictionary changed size during iteration on /api/live-data/slow and /api/bootstrap/critical. Enrich deep copies instead and atomically swap the top-level key under _data_lock, with an identity guard so a newer fetch_gdelt() is not clobbered. Honors the replace-don't-mutate contract documented in fetchers/_store.py. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com> * test(gdelt): regression test for background enrichment isolation Asserts _enrich_gdelt_titles_background does not mutate already-published features and instead atomically swaps latest_data["gdelt"] with enriched copies (with the identity guard). Locks in the fix for the dictionary-changed-size race. --------- Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
2026-07-01 10:15:43 +02:00 · 2026-06-16 02:23:25 +02:00
parent 71a2ef4ce7
commit 9c5a4054f6
2 changed files with 120 additions and 22 deletions
@@ -606,8 +606,19 @@ def _build_feature_html(features, fetched_titles=None):


 def _enrich_gdelt_titles_background(features, all_article_urls):
-    """Background thread: fetch real article titles then update features in-place."""
+    """Background thread: fetch real article titles, then publish enriched COPIES.
+
+    The ``features`` handed to us were already published into
+    ``latest_data["gdelt"]`` by ``fetch_gdelt()``. Per the store's thread-safety
+    contract (see ``get_latest_data_subset_refs`` in fetchers/_store.py), HTTP
+    readers hold live references to these nested ``properties`` dicts and
+    serialize them OUTSIDE the data lock. Mutating the published dicts in place
+    here races that serialization and raises
+    ``RuntimeError: dictionary changed size during iteration``. So we enrich
+    copies and atomically swap the top-level key under the lock instead.
+    """
    import html as html_mod
+    from services.fetchers._store import latest_data, _data_lock, _mark_fresh

    try:
        logger.info(f"[BG] Fetching real article titles for {len(all_article_urls)} URLs...")
@@ -615,28 +626,44 @@ def _enrich_gdelt_titles_background(features, all_article_urls):
        fetched_count = sum(1 for v in fetched_titles.values() if v)
        logger.info(f"[BG] Resolved {fetched_count}/{len(all_article_urls)} article titles")

-        # Update features in-place with real titles and snippets
+        # Build enriched copies — never touch the already-published objects.
+        enriched = []
        for f in features:
-            urls = f["properties"].get("_urls_list", [])
-            if not urls:
-                continue
-            headlines = []
-            snippets = []
-            for u in urls:
-                real_title = fetched_titles.get(u)
-                headlines.append(real_title if real_title else _url_to_headline(u))
-                snippets.append(_article_snippet_cache.get(u) or "")
-            f["properties"]["_headlines_list"] = headlines
-            f["properties"]["_snippets_list"] = snippets
-            links = []
-            for u, h in zip(urls, headlines):
-                safe_url = u if u.startswith(("http://", "https://")) else "about:blank"
-                safe_h = html_mod.escape(h)
-                links.append(
-                    f'<div style="margin-bottom:6px;"><a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_h}</a></div>'
-                )
-            f["properties"]["html"] = "".join(links)
-        logger.info(f"[BG] GDELT title enrichment complete")
+            nf = dict(f)
+            props = dict(f.get("properties", {}))
+            urls = props.get("_urls_list", [])
+            if urls:
+                headlines = []
+                snippets = []
+                for u in urls:
+                    real_title = fetched_titles.get(u)
+                    headlines.append(real_title if real_title else _url_to_headline(u))
+                    snippets.append(_article_snippet_cache.get(u) or "")
+                props["_headlines_list"] = headlines
+                props["_snippets_list"] = snippets
+                links = []
+                for u, h in zip(urls, headlines):
+                    safe_url = u if u.startswith(("http://", "https://")) else "about:blank"
+                    safe_h = html_mod.escape(h)
+                    links.append(
+                        f'<div style="margin-bottom:6px;"><a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_h}</a></div>'
+                    )
+                props["html"] = "".join(links)
+            nf["properties"] = props
+            enriched.append(nf)
+
+        # Atomically publish — but only if a newer fetch_gdelt() hasn't already
+        # replaced the layer while we were fetching titles (identity guard).
+        published = False
+        with _data_lock:
+            if latest_data.get("gdelt") is features:
+                latest_data["gdelt"] = enriched
+                published = True
+        if published:
+            _mark_fresh("gdelt")
+            logger.info(f"[BG] GDELT title enrichment complete ({len(enriched)} features)")
+        else:
+            logger.info("[BG] GDELT layer changed under us; skipping stale enrichment swap")
    except Exception as e:
        logger.error(f"[BG] GDELT title enrichment failed: {e}")

@@ -0,0 +1,71 @@
+"""Regression tests for the GDELT background title enrichment.
+
+The background enrichment thread used to mutate the nested ``properties`` dicts
+of GDELT features *after* they were already published into
+``latest_data["gdelt"]``. HTTP readers serialize those dicts outside the data
+lock, so the in-place mutation raced the serializer and raised
+``RuntimeError: dictionary changed size during iteration``.
+
+These tests pin the contract: the enrichment must NOT touch the
+already-published feature objects, and must instead publish enriched copies via
+an atomic swap (with an identity guard so a newer fetch is not clobbered).
+"""
+
+from services.fetchers import _store
+from services import geopolitics
+
+
+def _make_feature():
+    return {
+        "type": "Feature",
+        "geometry": {"type": "Point", "coordinates": [0.0, 0.0]},
+        "properties": {"name": "loc", "_urls_list": ["http://example.test/article-1"]},
+    }
+
+
+def test_enrichment_does_not_mutate_published_features(monkeypatch):
+    feature = _make_feature()
+    features = [feature]
+    with _store._data_lock:
+        _store.latest_data["gdelt"] = features
+
+    monkeypatch.setattr(
+        geopolitics,
+        "_batch_fetch_titles",
+        lambda urls: {"http://example.test/article-1": "Real Headline"},
+    )
+
+    geopolitics._enrich_gdelt_titles_background(features, {"http://example.test/article-1"})
+
+    # The originally-published feature object must be untouched (no in-place
+    # mutation of its properties dict — that was the source of the crash).
+    assert "_headlines_list" not in feature["properties"]
+    assert "_snippets_list" not in feature["properties"]
+
+    # The layer must have been atomically replaced with an enriched COPY.
+    published = _store.latest_data["gdelt"]
+    assert published is not features
+    assert published[0] is not feature
+    assert published[0]["properties"]["_headlines_list"] == ["Real Headline"]
+
+
+def test_enrichment_skips_swap_when_layer_replaced(monkeypatch):
+    feature = _make_feature()
+    features = [feature]
+
+    # Simulate a newer fetch_gdelt() having already replaced the layer while the
+    # background thread was still resolving titles.
+    sentinel = [{"properties": {"name": "newer"}}]
+    with _store._data_lock:
+        _store.latest_data["gdelt"] = sentinel
+
+    monkeypatch.setattr(
+        geopolitics,
+        "_batch_fetch_titles",
+        lambda urls: {"http://example.test/article-1": "Real Headline"},
+    )
+
+    geopolitics._enrich_gdelt_titles_background(features, {"http://example.test/article-1"})
+
+    # The identity guard must prevent clobbering the newer layer.
+    assert _store.latest_data["gdelt"] is sentinel