mirror of
https://github.com/BigBodyCobain/Shadowbroker.git
synced 2026-07-01 10:15:43 +02:00
fix(gdelt): stop background thread mutating already-published features (dictionary changed size during iteration) (#388)
* fix(gdelt): publish enriched copies instead of mutating live features _enrich_gdelt_titles_background ran in a daemon thread that mutated the nested properties dicts of GDELT features already published into latest_data[gdelt]. HTTP readers hold live references to those dicts and serialize them outside the data lock, so the in-place mutation raced the serializer and raised RuntimeError: dictionary changed size during iteration on /api/live-data/slow and /api/bootstrap/critical. Enrich deep copies instead and atomically swap the top-level key under _data_lock, with an identity guard so a newer fetch_gdelt() is not clobbered. Honors the replace-don't-mutate contract documented in fetchers/_store.py. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com> * test(gdelt): regression test for background enrichment isolation Asserts _enrich_gdelt_titles_background does not mutate already-published features and instead atomically swaps latest_data["gdelt"] with enriched copies (with the identity guard). Locks in the fix for the dictionary-changed-size race. --------- Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
committed by
GitHub
parent
71a2ef4ce7
commit
9c5a4054f6
@@ -606,8 +606,19 @@ def _build_feature_html(features, fetched_titles=None):
|
||||
|
||||
|
||||
def _enrich_gdelt_titles_background(features, all_article_urls):
|
||||
"""Background thread: fetch real article titles then update features in-place."""
|
||||
"""Background thread: fetch real article titles, then publish enriched COPIES.
|
||||
|
||||
The ``features`` handed to us were already published into
|
||||
``latest_data["gdelt"]`` by ``fetch_gdelt()``. Per the store's thread-safety
|
||||
contract (see ``get_latest_data_subset_refs`` in fetchers/_store.py), HTTP
|
||||
readers hold live references to these nested ``properties`` dicts and
|
||||
serialize them OUTSIDE the data lock. Mutating the published dicts in place
|
||||
here races that serialization and raises
|
||||
``RuntimeError: dictionary changed size during iteration``. So we enrich
|
||||
copies and atomically swap the top-level key under the lock instead.
|
||||
"""
|
||||
import html as html_mod
|
||||
from services.fetchers._store import latest_data, _data_lock, _mark_fresh
|
||||
|
||||
try:
|
||||
logger.info(f"[BG] Fetching real article titles for {len(all_article_urls)} URLs...")
|
||||
@@ -615,28 +626,44 @@ def _enrich_gdelt_titles_background(features, all_article_urls):
|
||||
fetched_count = sum(1 for v in fetched_titles.values() if v)
|
||||
logger.info(f"[BG] Resolved {fetched_count}/{len(all_article_urls)} article titles")
|
||||
|
||||
# Update features in-place with real titles and snippets
|
||||
# Build enriched copies — never touch the already-published objects.
|
||||
enriched = []
|
||||
for f in features:
|
||||
urls = f["properties"].get("_urls_list", [])
|
||||
if not urls:
|
||||
continue
|
||||
headlines = []
|
||||
snippets = []
|
||||
for u in urls:
|
||||
real_title = fetched_titles.get(u)
|
||||
headlines.append(real_title if real_title else _url_to_headline(u))
|
||||
snippets.append(_article_snippet_cache.get(u) or "")
|
||||
f["properties"]["_headlines_list"] = headlines
|
||||
f["properties"]["_snippets_list"] = snippets
|
||||
links = []
|
||||
for u, h in zip(urls, headlines):
|
||||
safe_url = u if u.startswith(("http://", "https://")) else "about:blank"
|
||||
safe_h = html_mod.escape(h)
|
||||
links.append(
|
||||
f'<div style="margin-bottom:6px;"><a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_h}</a></div>'
|
||||
)
|
||||
f["properties"]["html"] = "".join(links)
|
||||
logger.info(f"[BG] GDELT title enrichment complete")
|
||||
nf = dict(f)
|
||||
props = dict(f.get("properties", {}))
|
||||
urls = props.get("_urls_list", [])
|
||||
if urls:
|
||||
headlines = []
|
||||
snippets = []
|
||||
for u in urls:
|
||||
real_title = fetched_titles.get(u)
|
||||
headlines.append(real_title if real_title else _url_to_headline(u))
|
||||
snippets.append(_article_snippet_cache.get(u) or "")
|
||||
props["_headlines_list"] = headlines
|
||||
props["_snippets_list"] = snippets
|
||||
links = []
|
||||
for u, h in zip(urls, headlines):
|
||||
safe_url = u if u.startswith(("http://", "https://")) else "about:blank"
|
||||
safe_h = html_mod.escape(h)
|
||||
links.append(
|
||||
f'<div style="margin-bottom:6px;"><a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_h}</a></div>'
|
||||
)
|
||||
props["html"] = "".join(links)
|
||||
nf["properties"] = props
|
||||
enriched.append(nf)
|
||||
|
||||
# Atomically publish — but only if a newer fetch_gdelt() hasn't already
|
||||
# replaced the layer while we were fetching titles (identity guard).
|
||||
published = False
|
||||
with _data_lock:
|
||||
if latest_data.get("gdelt") is features:
|
||||
latest_data["gdelt"] = enriched
|
||||
published = True
|
||||
if published:
|
||||
_mark_fresh("gdelt")
|
||||
logger.info(f"[BG] GDELT title enrichment complete ({len(enriched)} features)")
|
||||
else:
|
||||
logger.info("[BG] GDELT layer changed under us; skipping stale enrichment swap")
|
||||
except Exception as e:
|
||||
logger.error(f"[BG] GDELT title enrichment failed: {e}")
|
||||
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
"""Regression tests for the GDELT background title enrichment.
|
||||
|
||||
The background enrichment thread used to mutate the nested ``properties`` dicts
|
||||
of GDELT features *after* they were already published into
|
||||
``latest_data["gdelt"]``. HTTP readers serialize those dicts outside the data
|
||||
lock, so the in-place mutation raced the serializer and raised
|
||||
``RuntimeError: dictionary changed size during iteration``.
|
||||
|
||||
These tests pin the contract: the enrichment must NOT touch the
|
||||
already-published feature objects, and must instead publish enriched copies via
|
||||
an atomic swap (with an identity guard so a newer fetch is not clobbered).
|
||||
"""
|
||||
|
||||
from services.fetchers import _store
|
||||
from services import geopolitics
|
||||
|
||||
|
||||
def _make_feature():
|
||||
return {
|
||||
"type": "Feature",
|
||||
"geometry": {"type": "Point", "coordinates": [0.0, 0.0]},
|
||||
"properties": {"name": "loc", "_urls_list": ["http://example.test/article-1"]},
|
||||
}
|
||||
|
||||
|
||||
def test_enrichment_does_not_mutate_published_features(monkeypatch):
|
||||
feature = _make_feature()
|
||||
features = [feature]
|
||||
with _store._data_lock:
|
||||
_store.latest_data["gdelt"] = features
|
||||
|
||||
monkeypatch.setattr(
|
||||
geopolitics,
|
||||
"_batch_fetch_titles",
|
||||
lambda urls: {"http://example.test/article-1": "Real Headline"},
|
||||
)
|
||||
|
||||
geopolitics._enrich_gdelt_titles_background(features, {"http://example.test/article-1"})
|
||||
|
||||
# The originally-published feature object must be untouched (no in-place
|
||||
# mutation of its properties dict — that was the source of the crash).
|
||||
assert "_headlines_list" not in feature["properties"]
|
||||
assert "_snippets_list" not in feature["properties"]
|
||||
|
||||
# The layer must have been atomically replaced with an enriched COPY.
|
||||
published = _store.latest_data["gdelt"]
|
||||
assert published is not features
|
||||
assert published[0] is not feature
|
||||
assert published[0]["properties"]["_headlines_list"] == ["Real Headline"]
|
||||
|
||||
|
||||
def test_enrichment_skips_swap_when_layer_replaced(monkeypatch):
|
||||
feature = _make_feature()
|
||||
features = [feature]
|
||||
|
||||
# Simulate a newer fetch_gdelt() having already replaced the layer while the
|
||||
# background thread was still resolving titles.
|
||||
sentinel = [{"properties": {"name": "newer"}}]
|
||||
with _store._data_lock:
|
||||
_store.latest_data["gdelt"] = sentinel
|
||||
|
||||
monkeypatch.setattr(
|
||||
geopolitics,
|
||||
"_batch_fetch_titles",
|
||||
lambda urls: {"http://example.test/article-1": "Real Headline"},
|
||||
)
|
||||
|
||||
geopolitics._enrich_gdelt_titles_background(features, {"http://example.test/article-1"})
|
||||
|
||||
# The identity guard must prevent clobbering the newer layer.
|
||||
assert _store.latest_data["gdelt"] is sentinel
|
||||
Reference in New Issue
Block a user