From 130287bb49ff5995ba3098a86f0ef9c10ceb2d62 Mon Sep 17 00:00:00 2001 From: adust09 Date: Sun, 15 Mar 2026 23:19:55 +0900 Subject: [PATCH] feat: add East Asia news sources and improve geocoding for Taiwan contingency Add 5 East Asia-focused RSS feeds (FocusTaiwan, Kyodo, SCMP, The Diplomat, Stars and Stripes) and 22 geographic keywords (Taiwan Strait, South/East China Sea, Okinawa, Guam, military bases, etc.) to improve coverage of Taiwan contingency scenarios. Refactor keyword matching into a pure _resolve_coords() function with longest-match-first sorting so specific locations like "Taiwan Strait" are not absorbed by generic "Taiwan". Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/config/news_feeds.json | 25 +++++++ backend/services/fetchers/news.py | 60 ++++++++++++---- backend/services/news_feed_config.py | 5 ++ backend/tests/test_news_keywords.py | 102 +++++++++++++++++++++++++++ 4 files changed, 179 insertions(+), 13 deletions(-) create mode 100644 backend/tests/test_news_keywords.py diff --git a/backend/config/news_feeds.json b/backend/config/news_feeds.json index 1af3a0f..4c3a906 100644 --- a/backend/config/news_feeds.json +++ b/backend/config/news_feeds.json @@ -39,6 +39,31 @@ "name": "Mercopress", "url": "https://en.mercopress.com/rss/", "weight": 3 + }, + { + "name": "FocusTaiwan", + "url": "https://focustaiwan.tw/rss", + "weight": 5 + }, + { + "name": "Kyodo", + "url": "https://english.kyodonews.net/rss/news.xml", + "weight": 4 + }, + { + "name": "SCMP", + "url": "https://www.scmp.com/rss/91/feed", + "weight": 4 + }, + { + "name": "The Diplomat", + "url": "https://thediplomat.com/feed/", + "weight": 4 + }, + { + "name": "Stars and Stripes", + "url": "https://www.stripes.com/feeds/pacific.rss", + "weight": 4 } ] } \ No newline at end of file diff --git a/backend/services/fetchers/news.py b/backend/services/fetchers/news.py index ce92a56..aeb7270 100644 --- a/backend/services/fetchers/news.py +++ b/backend/services/fetchers/news.py @@ -33,6 +33,29 @@ _KEYWORD_COORDS = { "lebanon": (33.854, 35.862), "syria": (34.802, 38.996), "yemen": (15.552, 48.516), + # East Asia — specific locations (longer keywords matched first via _SORTED_KEYWORDS) + "taiwan strait": (24.0, 119.5), + "south china sea": (15.0, 115.0), + "east china sea": (28.0, 125.0), + "philippine sea": (20.0, 130.0), + "senkaku": (25.740, 123.474), + "diaoyu": (25.740, 123.474), + "ryukyu": (26.334, 127.800), + "okinawa": (26.334, 127.800), + "kadena": (26.351, 127.767), + "naha": (26.212, 127.679), + "yokosuka": (35.283, 139.671), + "sasebo": (33.159, 129.722), + "misawa": (40.682, 141.368), + "iwakuni": (34.144, 132.236), + "guam": (13.444, 144.793), + "taipei": (25.033, 121.565), + "kaohsiung": (22.616, 120.313), + "xiamen": (24.479, 118.089), + "fujian": (26.074, 119.296), + "guangdong": (23.379, 113.763), + "zhejiang": (29.141, 119.788), + "hainan": (19.200, 109.999), "china": (35.861, 104.195), "beijing": (39.904, 116.407), "taiwan": (23.697, 120.960), @@ -90,6 +113,27 @@ _KEYWORD_COORDS = { "jakarta": (-6.208, 106.845), } +# Immutable after module load — sort by descending keyword length so +# specific locations ("taiwan strait") match before generic ones ("taiwan") +_SORTED_KEYWORDS = sorted(_KEYWORD_COORDS.items(), key=lambda x: len(x[0]), reverse=True) + + +def _resolve_coords(text: str) -> tuple[float, float] | None: + """Return (lat, lng) for the most specific keyword match, or None. + + Longer keywords are tried first. Space-padded keywords (" us ", " uk ") + use substring matching on padded text; all others use word-boundary regex. + """ + padded_text = f" {text} " + for kw, coords in _SORTED_KEYWORDS: + if kw.startswith(" ") or kw.endswith(" "): + if kw in padded_text: + return coords + else: + if re.search(r'\b' + re.escape(kw) + r'\b', text): + return coords + return None + @with_retry(max_retries=1, base_delay=2) def fetch_news(): @@ -140,8 +184,6 @@ def fetch_news(): risk_score += 2 risk_score = min(10, risk_score) - keyword_coords = _KEYWORD_COORDS - lat, lng = None, None if 'georss_point' in entry: @@ -153,18 +195,10 @@ def fetch_news(): lat, lng = coords[1], coords[0] if lat is None: - # text may not be defined yet for GDACS path text = (title + " " + summary).lower() - padded_text = f" {text} " - for kw, coords in keyword_coords.items(): - if kw.startswith(" ") or kw.endswith(" "): - if kw in padded_text: - lat, lng = coords - break - else: - if re.search(r'\b' + re.escape(kw) + r'\b', text): - lat, lng = coords - break + result = _resolve_coords(text) + if result: + lat, lng = result if lat is not None: key = None diff --git a/backend/services/news_feed_config.py b/backend/services/news_feed_config.py index b26ba1e..586ed4d 100644 --- a/backend/services/news_feed_config.py +++ b/backend/services/news_feed_config.py @@ -20,6 +20,11 @@ DEFAULT_FEEDS = [ {"name": "NHK", "url": "https://www3.nhk.or.jp/nhkworld/rss/world.xml", "weight": 3}, {"name": "CNA", "url": "https://www.channelnewsasia.com/rssfeed/8395986", "weight": 3}, {"name": "Mercopress", "url": "https://en.mercopress.com/rss/", "weight": 3}, + {"name": "FocusTaiwan", "url": "https://focustaiwan.tw/rss", "weight": 5}, + {"name": "Kyodo", "url": "https://english.kyodonews.net/rss/news.xml", "weight": 4}, + {"name": "SCMP", "url": "https://www.scmp.com/rss/91/feed", "weight": 4}, + {"name": "The Diplomat", "url": "https://thediplomat.com/feed/", "weight": 4}, + {"name": "Stars and Stripes", "url": "https://www.stripes.com/feeds/pacific.rss", "weight": 4}, ] diff --git a/backend/tests/test_news_keywords.py b/backend/tests/test_news_keywords.py new file mode 100644 index 0000000..bb19823 --- /dev/null +++ b/backend/tests/test_news_keywords.py @@ -0,0 +1,102 @@ +"""Regression tests for news geocoding keywords and feed configuration.""" +import json +from pathlib import Path + +import pytest + +from services.fetchers.news import _resolve_coords +from services.news_feed_config import DEFAULT_FEEDS + + +CONFIG_PATH = Path(__file__).parent.parent / "config" / "news_feeds.json" + + +# -- Keyword resolution: East Asia specific locations -------------------------- + +class TestResolveCoords: + """_resolve_coords should prefer longer (more specific) keywords.""" + + def test_taiwan_strait_not_absorbed_by_taiwan(self): + result = _resolve_coords("tensions in the taiwan strait") + assert result == (24.0, 119.5) + + def test_south_china_sea_not_absorbed_by_china(self): + result = _resolve_coords("south china sea patrol") + assert result == (15.0, 115.0) + + def test_east_china_sea(self): + result = _resolve_coords("east china sea tensions") + assert result == (28.0, 125.0) + + def test_philippine_sea(self): + result = _resolve_coords("philippine sea exercises") + assert result == (20.0, 130.0) + + def test_generic_china_still_works(self): + result = _resolve_coords("china deploys forces") + assert result == (35.861, 104.195) + + def test_generic_taiwan_still_works(self): + result = _resolve_coords("taiwan elections") + assert result == (23.697, 120.960) + + def test_taipei(self): + result = _resolve_coords("protests in taipei") + assert result == (25.033, 121.565) + + def test_okinawa(self): + result = _resolve_coords("okinawa base expansion") + assert result == (26.334, 127.800) + + # -- Existing inclusion-relationship regressions --------------------------- + + def test_new_delhi_not_absorbed_by_delhi(self): + result = _resolve_coords("new delhi summit") + assert result == (28.613, 77.209) + + def test_south_america_not_absorbed_by_america(self): + result = _resolve_coords("south america trade deal") + assert result == (-14.200, -51.900) + + def test_north_korea_not_absorbed_by_south_korea(self): + result = _resolve_coords("north korea missile launch") + assert result == (40.339, 127.510) + + # -- Space-padded keywords ------------------------------------------------- + + def test_us_with_spaces(self): + result = _resolve_coords("the us military") + assert result == (38.907, -77.036) + + def test_uk_with_spaces(self): + result = _resolve_coords("visit the uk soon") + assert result == (55.378, -3.435) + + # -- No match -------------------------------------------------------------- + + def test_no_match_returns_none(self): + result = _resolve_coords("unknown location xyz") + assert result is None + + +# -- Feed configuration consistency ------------------------------------------- + +class TestFeedConfig: + """DEFAULT_FEEDS and news_feeds.json must stay in sync.""" + + def test_default_feeds_match_json(self): + data = json.loads(CONFIG_PATH.read_text(encoding="utf-8")) + json_feeds = data["feeds"] + + def normalize(feeds): + return sorted( + [{"name": f["name"], "url": f["url"], "weight": f["weight"]} for f in feeds], + key=lambda f: f["name"], + ) + + assert normalize(DEFAULT_FEEDS) == normalize(json_feeds) + + def test_new_east_asia_feeds_present(self): + names = {f["name"] for f in DEFAULT_FEEDS} + expected = {"FocusTaiwan", "Kyodo", "SCMP", "The Diplomat", "Stars and Stripes"} + assert expected.issubset(names)