From 9c5a4054f6a2833b2dc9d16838d20a2be15a1e62 Mon Sep 17 00:00:00 2001 From: TheYellowBeanieGuy <114354481+YellowBeanie@users.noreply.github.com> Date: Tue, 16 Jun 2026 02:23:25 +0200 Subject: [PATCH] fix(gdelt): stop background thread mutating already-published features (dictionary changed size during iteration) (#388) * fix(gdelt): publish enriched copies instead of mutating live features _enrich_gdelt_titles_background ran in a daemon thread that mutated the nested properties dicts of GDELT features already published into latest_data[gdelt]. HTTP readers hold live references to those dicts and serialize them outside the data lock, so the in-place mutation raced the serializer and raised RuntimeError: dictionary changed size during iteration on /api/live-data/slow and /api/bootstrap/critical. Enrich deep copies instead and atomically swap the top-level key under _data_lock, with an identity guard so a newer fetch_gdelt() is not clobbered. Honors the replace-don't-mutate contract documented in fetchers/_store.py. Co-Authored-By: Claude Fable 5 * test(gdelt): regression test for background enrichment isolation Asserts _enrich_gdelt_titles_background does not mutate already-published features and instead atomically swaps latest_data["gdelt"] with enriched copies (with the identity guard). Locks in the fix for the dictionary-changed-size race. --------- Co-authored-by: Claude Fable 5 --- backend/services/geopolitics.py | 71 +++++++++++++------ .../tests/test_gdelt_enrichment_isolation.py | 71 +++++++++++++++++++ 2 files changed, 120 insertions(+), 22 deletions(-) create mode 100644 backend/tests/test_gdelt_enrichment_isolation.py diff --git a/backend/services/geopolitics.py b/backend/services/geopolitics.py index ee0fd11..8b18d71 100644 --- a/backend/services/geopolitics.py +++ b/backend/services/geopolitics.py @@ -606,8 +606,19 @@ def _build_feature_html(features, fetched_titles=None): def _enrich_gdelt_titles_background(features, all_article_urls): - """Background thread: fetch real article titles then update features in-place.""" + """Background thread: fetch real article titles, then publish enriched COPIES. + + The ``features`` handed to us were already published into + ``latest_data["gdelt"]`` by ``fetch_gdelt()``. Per the store's thread-safety + contract (see ``get_latest_data_subset_refs`` in fetchers/_store.py), HTTP + readers hold live references to these nested ``properties`` dicts and + serialize them OUTSIDE the data lock. Mutating the published dicts in place + here races that serialization and raises + ``RuntimeError: dictionary changed size during iteration``. So we enrich + copies and atomically swap the top-level key under the lock instead. + """ import html as html_mod + from services.fetchers._store import latest_data, _data_lock, _mark_fresh try: logger.info(f"[BG] Fetching real article titles for {len(all_article_urls)} URLs...") @@ -615,28 +626,44 @@ def _enrich_gdelt_titles_background(features, all_article_urls): fetched_count = sum(1 for v in fetched_titles.values() if v) logger.info(f"[BG] Resolved {fetched_count}/{len(all_article_urls)} article titles") - # Update features in-place with real titles and snippets + # Build enriched copies — never touch the already-published objects. + enriched = [] for f in features: - urls = f["properties"].get("_urls_list", []) - if not urls: - continue - headlines = [] - snippets = [] - for u in urls: - real_title = fetched_titles.get(u) - headlines.append(real_title if real_title else _url_to_headline(u)) - snippets.append(_article_snippet_cache.get(u) or "") - f["properties"]["_headlines_list"] = headlines - f["properties"]["_snippets_list"] = snippets - links = [] - for u, h in zip(urls, headlines): - safe_url = u if u.startswith(("http://", "https://")) else "about:blank" - safe_h = html_mod.escape(h) - links.append( - f'
{safe_h}
' - ) - f["properties"]["html"] = "".join(links) - logger.info(f"[BG] GDELT title enrichment complete") + nf = dict(f) + props = dict(f.get("properties", {})) + urls = props.get("_urls_list", []) + if urls: + headlines = [] + snippets = [] + for u in urls: + real_title = fetched_titles.get(u) + headlines.append(real_title if real_title else _url_to_headline(u)) + snippets.append(_article_snippet_cache.get(u) or "") + props["_headlines_list"] = headlines + props["_snippets_list"] = snippets + links = [] + for u, h in zip(urls, headlines): + safe_url = u if u.startswith(("http://", "https://")) else "about:blank" + safe_h = html_mod.escape(h) + links.append( + f'
{safe_h}
' + ) + props["html"] = "".join(links) + nf["properties"] = props + enriched.append(nf) + + # Atomically publish — but only if a newer fetch_gdelt() hasn't already + # replaced the layer while we were fetching titles (identity guard). + published = False + with _data_lock: + if latest_data.get("gdelt") is features: + latest_data["gdelt"] = enriched + published = True + if published: + _mark_fresh("gdelt") + logger.info(f"[BG] GDELT title enrichment complete ({len(enriched)} features)") + else: + logger.info("[BG] GDELT layer changed under us; skipping stale enrichment swap") except Exception as e: logger.error(f"[BG] GDELT title enrichment failed: {e}") diff --git a/backend/tests/test_gdelt_enrichment_isolation.py b/backend/tests/test_gdelt_enrichment_isolation.py new file mode 100644 index 0000000..66d16aa --- /dev/null +++ b/backend/tests/test_gdelt_enrichment_isolation.py @@ -0,0 +1,71 @@ +"""Regression tests for the GDELT background title enrichment. + +The background enrichment thread used to mutate the nested ``properties`` dicts +of GDELT features *after* they were already published into +``latest_data["gdelt"]``. HTTP readers serialize those dicts outside the data +lock, so the in-place mutation raced the serializer and raised +``RuntimeError: dictionary changed size during iteration``. + +These tests pin the contract: the enrichment must NOT touch the +already-published feature objects, and must instead publish enriched copies via +an atomic swap (with an identity guard so a newer fetch is not clobbered). +""" + +from services.fetchers import _store +from services import geopolitics + + +def _make_feature(): + return { + "type": "Feature", + "geometry": {"type": "Point", "coordinates": [0.0, 0.0]}, + "properties": {"name": "loc", "_urls_list": ["http://example.test/article-1"]}, + } + + +def test_enrichment_does_not_mutate_published_features(monkeypatch): + feature = _make_feature() + features = [feature] + with _store._data_lock: + _store.latest_data["gdelt"] = features + + monkeypatch.setattr( + geopolitics, + "_batch_fetch_titles", + lambda urls: {"http://example.test/article-1": "Real Headline"}, + ) + + geopolitics._enrich_gdelt_titles_background(features, {"http://example.test/article-1"}) + + # The originally-published feature object must be untouched (no in-place + # mutation of its properties dict — that was the source of the crash). + assert "_headlines_list" not in feature["properties"] + assert "_snippets_list" not in feature["properties"] + + # The layer must have been atomically replaced with an enriched COPY. + published = _store.latest_data["gdelt"] + assert published is not features + assert published[0] is not feature + assert published[0]["properties"]["_headlines_list"] == ["Real Headline"] + + +def test_enrichment_skips_swap_when_layer_replaced(monkeypatch): + feature = _make_feature() + features = [feature] + + # Simulate a newer fetch_gdelt() having already replaced the layer while the + # background thread was still resolving titles. + sentinel = [{"properties": {"name": "newer"}}] + with _store._data_lock: + _store.latest_data["gdelt"] = sentinel + + monkeypatch.setattr( + geopolitics, + "_batch_fetch_titles", + lambda urls: {"http://example.test/article-1": "Real Headline"}, + ) + + geopolitics._enrich_gdelt_titles_background(features, {"http://example.test/article-1"}) + + # The identity guard must prevent clobbering the newer layer. + assert _store.latest_data["gdelt"] is sentinel