feat: add East Asia news sources and improve geocoding for Taiwan contingency

Add 5 East Asia-focused RSS feeds (FocusTaiwan, Kyodo, SCMP, The Diplomat,
Stars and Stripes) and 22 geographic keywords (Taiwan Strait, South/East
China Sea, Okinawa, Guam, military bases, etc.) to improve coverage of
Taiwan contingency scenarios.

Refactor keyword matching into a pure _resolve_coords() function with
longest-match-first sorting so specific locations like "Taiwan Strait"
are not absorbed by generic "Taiwan".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
adust09
2026-03-15 23:19:55 +09:00
parent 4a33424924
commit 130287bb49
4 changed files with 179 additions and 13 deletions
+25
View File
@@ -39,6 +39,31 @@
"name": "Mercopress",
"url": "https://en.mercopress.com/rss/",
"weight": 3
},
{
"name": "FocusTaiwan",
"url": "https://focustaiwan.tw/rss",
"weight": 5
},
{
"name": "Kyodo",
"url": "https://english.kyodonews.net/rss/news.xml",
"weight": 4
},
{
"name": "SCMP",
"url": "https://www.scmp.com/rss/91/feed",
"weight": 4
},
{
"name": "The Diplomat",
"url": "https://thediplomat.com/feed/",
"weight": 4
},
{
"name": "Stars and Stripes",
"url": "https://www.stripes.com/feeds/pacific.rss",
"weight": 4
}
]
}
+47 -13
View File
@@ -33,6 +33,29 @@ _KEYWORD_COORDS = {
"lebanon": (33.854, 35.862),
"syria": (34.802, 38.996),
"yemen": (15.552, 48.516),
# East Asia — specific locations (longer keywords matched first via _SORTED_KEYWORDS)
"taiwan strait": (24.0, 119.5),
"south china sea": (15.0, 115.0),
"east china sea": (28.0, 125.0),
"philippine sea": (20.0, 130.0),
"senkaku": (25.740, 123.474),
"diaoyu": (25.740, 123.474),
"ryukyu": (26.334, 127.800),
"okinawa": (26.334, 127.800),
"kadena": (26.351, 127.767),
"naha": (26.212, 127.679),
"yokosuka": (35.283, 139.671),
"sasebo": (33.159, 129.722),
"misawa": (40.682, 141.368),
"iwakuni": (34.144, 132.236),
"guam": (13.444, 144.793),
"taipei": (25.033, 121.565),
"kaohsiung": (22.616, 120.313),
"xiamen": (24.479, 118.089),
"fujian": (26.074, 119.296),
"guangdong": (23.379, 113.763),
"zhejiang": (29.141, 119.788),
"hainan": (19.200, 109.999),
"china": (35.861, 104.195),
"beijing": (39.904, 116.407),
"taiwan": (23.697, 120.960),
@@ -90,6 +113,27 @@ _KEYWORD_COORDS = {
"jakarta": (-6.208, 106.845),
}
# Immutable after module load — sort by descending keyword length so
# specific locations ("taiwan strait") match before generic ones ("taiwan").
# sorted() is stable, so equal-length keywords keep dict insertion order.
_SORTED_KEYWORDS = sorted(_KEYWORD_COORDS.items(), key=lambda x: len(x[0]), reverse=True)
def _resolve_coords(text: str) -> tuple[float, float] | None:
    """Return (lat, lng) for the most specific keyword match, or None.

    Keywords are scanned longest-first (via _SORTED_KEYWORDS) so that a
    specific location such as "taiwan strait" wins over the generic
    "taiwan". Keywords carrying a leading/trailing space (" us ", " uk ")
    are matched as substrings of the space-padded text; every other
    keyword is matched with a word-boundary regex.
    """
    padded = " " + text + " "
    for keyword, latlng in _SORTED_KEYWORDS:
        # A keyword that begins or ends with a space opts into padded
        # substring matching; otherwise fall back to \b-delimited regex.
        if keyword[:1] == " " or keyword[-1:] == " ":
            hit = keyword in padded
        else:
            hit = re.search(r"\b" + re.escape(keyword) + r"\b", text) is not None
        if hit:
            return latlng
    return None
@with_retry(max_retries=1, base_delay=2)
def fetch_news():
@@ -140,8 +184,6 @@ def fetch_news():
risk_score += 2
risk_score = min(10, risk_score)
keyword_coords = _KEYWORD_COORDS
lat, lng = None, None
if 'georss_point' in entry:
@@ -153,18 +195,10 @@ def fetch_news():
lat, lng = coords[1], coords[0]
if lat is None:
# text may not be defined yet for GDACS path
text = (title + " " + summary).lower()
padded_text = f" {text} "
for kw, coords in keyword_coords.items():
if kw.startswith(" ") or kw.endswith(" "):
if kw in padded_text:
lat, lng = coords
break
else:
if re.search(r'\b' + re.escape(kw) + r'\b', text):
lat, lng = coords
break
result = _resolve_coords(text)
if result:
lat, lng = result
if lat is not None:
key = None
+5
View File
@@ -20,6 +20,11 @@ DEFAULT_FEEDS = [
{"name": "NHK", "url": "https://www3.nhk.or.jp/nhkworld/rss/world.xml", "weight": 3},
{"name": "CNA", "url": "https://www.channelnewsasia.com/rssfeed/8395986", "weight": 3},
{"name": "Mercopress", "url": "https://en.mercopress.com/rss/", "weight": 3},
{"name": "FocusTaiwan", "url": "https://focustaiwan.tw/rss", "weight": 5},
{"name": "Kyodo", "url": "https://english.kyodonews.net/rss/news.xml", "weight": 4},
{"name": "SCMP", "url": "https://www.scmp.com/rss/91/feed", "weight": 4},
{"name": "The Diplomat", "url": "https://thediplomat.com/feed/", "weight": 4},
{"name": "Stars and Stripes", "url": "https://www.stripes.com/feeds/pacific.rss", "weight": 4},
]
+102
View File
@@ -0,0 +1,102 @@
"""Regression tests for news geocoding keywords and feed configuration."""
import json
from pathlib import Path
import pytest
from services.fetchers.news import _resolve_coords
from services.news_feed_config import DEFAULT_FEEDS
# Canonical feed configuration JSON, resolved relative to this test file
# (repo layout: <root>/config/news_feeds.json, tests one level down).
CONFIG_PATH = Path(__file__).parent.parent / "config" / "news_feeds.json"
# -- Keyword resolution: East Asia specific locations --------------------------
class TestResolveCoords:
    """Regression tests: _resolve_coords must pick the longest keyword hit."""

    # -- East Asia specific locations --------------------------------------

    def test_taiwan_strait_not_absorbed_by_taiwan(self):
        # "taiwan strait" must not fall through to the generic "taiwan".
        assert _resolve_coords("tensions in the taiwan strait") == (24.0, 119.5)

    def test_south_china_sea_not_absorbed_by_china(self):
        assert _resolve_coords("south china sea patrol") == (15.0, 115.0)

    def test_east_china_sea(self):
        assert _resolve_coords("east china sea tensions") == (28.0, 125.0)

    def test_philippine_sea(self):
        assert _resolve_coords("philippine sea exercises") == (20.0, 130.0)

    def test_generic_china_still_works(self):
        assert _resolve_coords("china deploys forces") == (35.861, 104.195)

    def test_generic_taiwan_still_works(self):
        assert _resolve_coords("taiwan elections") == (23.697, 120.960)

    def test_taipei(self):
        assert _resolve_coords("protests in taipei") == (25.033, 121.565)

    def test_okinawa(self):
        assert _resolve_coords("okinawa base expansion") == (26.334, 127.800)

    # -- Existing inclusion-relationship regressions ------------------------

    def test_new_delhi_not_absorbed_by_delhi(self):
        assert _resolve_coords("new delhi summit") == (28.613, 77.209)

    def test_south_america_not_absorbed_by_america(self):
        assert _resolve_coords("south america trade deal") == (-14.200, -51.900)

    def test_north_korea_not_absorbed_by_south_korea(self):
        assert _resolve_coords("north korea missile launch") == (40.339, 127.510)

    # -- Space-padded keywords ----------------------------------------------

    def test_us_with_spaces(self):
        assert _resolve_coords("the us military") == (38.907, -77.036)

    def test_uk_with_spaces(self):
        assert _resolve_coords("visit the uk soon") == (55.378, -3.435)

    # -- No match ------------------------------------------------------------

    def test_no_match_returns_none(self):
        assert _resolve_coords("unknown location xyz") is None
# -- Feed configuration consistency -------------------------------------------
class TestFeedConfig:
    """DEFAULT_FEEDS and config/news_feeds.json must describe the same feeds."""

    def test_default_feeds_match_json(self):
        config = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))

        def canon(feed_list):
            # Reduce each feed to the shared name/url/weight triple and
            # order by name so list ordering differences don't matter.
            trimmed = [
                {"name": f["name"], "url": f["url"], "weight": f["weight"]}
                for f in feed_list
            ]
            return sorted(trimmed, key=lambda f: f["name"])

        assert canon(DEFAULT_FEEDS) == canon(config["feeds"])

    def test_new_east_asia_feeds_present(self):
        expected = {"FocusTaiwan", "Kyodo", "SCMP", "The Diplomat", "Stars and Stripes"}
        assert expected <= {f["name"] for f in DEFAULT_FEEDS}