feat: add East Asia news sources and improve geocoding for Taiwan contingency

Add 5 East Asia-focused RSS feeds (FocusTaiwan, Kyodo, SCMP, The Diplomat,
Stars and Stripes) and 22 geographic keywords (Taiwan Strait, South/East
China Sea, Okinawa, Guam, military bases, etc.) to improve coverage of
Taiwan contingency scenarios.

Refactor keyword matching into a pure _resolve_coords() function with
longest-match-first sorting so specific locations like "Taiwan Strait"
are not absorbed by generic "Taiwan".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
adust09
2026-03-15 23:19:55 +09:00
parent 4a33424924
commit 130287bb49
4 changed files with 179 additions and 13 deletions
+25
View File
@@ -39,6 +39,31 @@
"name": "Mercopress",
"url": "https://en.mercopress.com/rss/",
"weight": 3
},
{
"name": "FocusTaiwan",
"url": "https://focustaiwan.tw/rss",
"weight": 5
},
{
"name": "Kyodo",
"url": "https://english.kyodonews.net/rss/news.xml",
"weight": 4
},
{
"name": "SCMP",
"url": "https://www.scmp.com/rss/91/feed",
"weight": 4
},
{
"name": "The Diplomat",
"url": "https://thediplomat.com/feed/",
"weight": 4
},
{
"name": "Stars and Stripes",
"url": "https://www.stripes.com/feeds/pacific.rss",
"weight": 4
}
]
}
+47 -13
View File
@@ -33,6 +33,29 @@ _KEYWORD_COORDS = {
"lebanon": (33.854, 35.862),
"syria": (34.802, 38.996),
"yemen": (15.552, 48.516),
# East Asia — specific locations (longer keywords matched first via _SORTED_KEYWORDS)
"taiwan strait": (24.0, 119.5),
"south china sea": (15.0, 115.0),
"east china sea": (28.0, 125.0),
"philippine sea": (20.0, 130.0),
"senkaku": (25.740, 123.474),
"diaoyu": (25.740, 123.474),
"ryukyu": (26.334, 127.800),
"okinawa": (26.334, 127.800),
"kadena": (26.351, 127.767),
"naha": (26.212, 127.679),
"yokosuka": (35.283, 139.671),
"sasebo": (33.159, 129.722),
"misawa": (40.682, 141.368),
"iwakuni": (34.144, 132.236),
"guam": (13.444, 144.793),
"taipei": (25.033, 121.565),
"kaohsiung": (22.616, 120.313),
"xiamen": (24.479, 118.089),
"fujian": (26.074, 119.296),
"guangdong": (23.379, 113.763),
"zhejiang": (29.141, 119.788),
"hainan": (19.200, 109.999),
"china": (35.861, 104.195),
"beijing": (39.904, 116.407),
"taiwan": (23.697, 120.960),
@@ -90,6 +113,27 @@ _KEYWORD_COORDS = {
"jakarta": (-6.208, 106.845),
}
# Immutable after module load — sort by descending keyword length so
# specific locations ("taiwan strait") match before generic ones ("taiwan").
# sorted() is stable, so equal-length keywords keep dict insertion order.
_SORTED_KEYWORDS = sorted(_KEYWORD_COORDS.items(), key=lambda x: len(x[0]), reverse=True)
def _resolve_coords(text: str) -> tuple[float, float] | None:
    """Return (lat, lng) for the most specific keyword match, or None.

    Keywords are scanned longest-first (via _SORTED_KEYWORDS) so that a
    specific location such as "taiwan strait" wins over the generic
    "taiwan". Keywords carrying a leading/trailing space (" us ", " uk ")
    are matched as substrings of the space-padded text; every other
    keyword is matched with a word-boundary regex.
    """
    padded = " " + text + " "
    for keyword, latlng in _SORTED_KEYWORDS:
        # A keyword that begins or ends with a space opts into padded
        # substring matching; otherwise fall back to \b-delimited regex.
        if keyword[:1] == " " or keyword[-1:] == " ":
            hit = keyword in padded
        else:
            hit = re.search(r"\b" + re.escape(keyword) + r"\b", text) is not None
        if hit:
            return latlng
    return None
@with_retry(max_retries=1, base_delay=2)
def fetch_news():
@@ -140,8 +184,6 @@ def fetch_news():
risk_score += 2
risk_score = min(10, risk_score)
keyword_coords = _KEYWORD_COORDS
lat, lng = None, None
if 'georss_point' in entry:
@@ -153,18 +195,10 @@ def fetch_news():
lat, lng = coords[1], coords[0]
if lat is None:
# text may not be defined yet for GDACS path
text = (title + " " + summary).lower()
padded_text = f" {text} "
for kw, coords in keyword_coords.items():
if kw.startswith(" ") or kw.endswith(" "):
if kw in padded_text:
lat, lng = coords
break
else:
if re.search(r'\b' + re.escape(kw) + r'\b', text):
lat, lng = coords
break
result = _resolve_coords(text)
if result:
lat, lng = result
if lat is not None:
key = None
+5
View File
@@ -20,6 +20,11 @@ DEFAULT_FEEDS = [
{"name": "NHK", "url": "https://www3.nhk.or.jp/nhkworld/rss/world.xml", "weight": 3},
{"name": "CNA", "url": "https://www.channelnewsasia.com/rssfeed/8395986", "weight": 3},
{"name": "Mercopress", "url": "https://en.mercopress.com/rss/", "weight": 3},
{"name": "FocusTaiwan", "url": "https://focustaiwan.tw/rss", "weight": 5},
{"name": "Kyodo", "url": "https://english.kyodonews.net/rss/news.xml", "weight": 4},
{"name": "SCMP", "url": "https://www.scmp.com/rss/91/feed", "weight": 4},
{"name": "The Diplomat", "url": "https://thediplomat.com/feed/", "weight": 4},
{"name": "Stars and Stripes", "url": "https://www.stripes.com/feeds/pacific.rss", "weight": 4},
]
+102
View File
@@ -0,0 +1,102 @@
"""Regression tests for news geocoding keywords and feed configuration."""
import json
from pathlib import Path
import pytest
from services.fetchers.news import _resolve_coords
from services.news_feed_config import DEFAULT_FEEDS
# Canonical feed configuration JSON, resolved relative to this test file
# (repo layout: <root>/config/news_feeds.json, tests one level down).
CONFIG_PATH = Path(__file__).parent.parent / "config" / "news_feeds.json"
# -- Keyword resolution: East Asia specific locations --------------------------
class TestResolveCoords:
    """Regression tests: _resolve_coords must pick the longest keyword hit."""

    # -- East Asia specific locations --------------------------------------

    def test_taiwan_strait_not_absorbed_by_taiwan(self):
        # "taiwan strait" must not fall through to the generic "taiwan".
        assert _resolve_coords("tensions in the taiwan strait") == (24.0, 119.5)

    def test_south_china_sea_not_absorbed_by_china(self):
        assert _resolve_coords("south china sea patrol") == (15.0, 115.0)

    def test_east_china_sea(self):
        assert _resolve_coords("east china sea tensions") == (28.0, 125.0)

    def test_philippine_sea(self):
        assert _resolve_coords("philippine sea exercises") == (20.0, 130.0)

    def test_generic_china_still_works(self):
        assert _resolve_coords("china deploys forces") == (35.861, 104.195)

    def test_generic_taiwan_still_works(self):
        assert _resolve_coords("taiwan elections") == (23.697, 120.960)

    def test_taipei(self):
        assert _resolve_coords("protests in taipei") == (25.033, 121.565)

    def test_okinawa(self):
        assert _resolve_coords("okinawa base expansion") == (26.334, 127.800)

    # -- Existing inclusion-relationship regressions ------------------------

    def test_new_delhi_not_absorbed_by_delhi(self):
        assert _resolve_coords("new delhi summit") == (28.613, 77.209)

    def test_south_america_not_absorbed_by_america(self):
        assert _resolve_coords("south america trade deal") == (-14.200, -51.900)

    def test_north_korea_not_absorbed_by_south_korea(self):
        assert _resolve_coords("north korea missile launch") == (40.339, 127.510)

    # -- Space-padded keywords ----------------------------------------------

    def test_us_with_spaces(self):
        assert _resolve_coords("the us military") == (38.907, -77.036)

    def test_uk_with_spaces(self):
        assert _resolve_coords("visit the uk soon") == (55.378, -3.435)

    # -- No match ------------------------------------------------------------

    def test_no_match_returns_none(self):
        assert _resolve_coords("unknown location xyz") is None
# -- Feed configuration consistency -------------------------------------------
class TestFeedConfig:
    """DEFAULT_FEEDS and config/news_feeds.json must describe the same feeds."""

    def test_default_feeds_match_json(self):
        config = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))

        def canon(feed_list):
            # Reduce each feed to the shared name/url/weight triple and
            # order by name so list ordering differences don't matter.
            trimmed = [
                {"name": f["name"], "url": f["url"], "weight": f["weight"]}
                for f in feed_list
            ]
            return sorted(trimmed, key=lambda f: f["name"])

        assert canon(DEFAULT_FEEDS) == canon(config["feeds"])

    def test_new_east_asia_feeds_present(self):
        expected = {"FocusTaiwan", "Kyodo", "SCMP", "The Diplomat", "Stars and Stripes"}
        assert expected <= {f["name"] for f in DEFAULT_FEEDS}