mirror of
https://github.com/BigBodyCobain/Shadowbroker.git
synced 2026-04-29 06:26:13 +02:00
feat: add East Asia news sources and improve geocoding for Taiwan contingency
Add 5 East Asia-focused RSS feeds (FocusTaiwan, Kyodo, SCMP, The Diplomat, Stars and Stripes) and 22 geographic keywords (Taiwan Strait, South/East China Sea, Okinawa, Guam, military bases, etc.) to improve coverage of Taiwan contingency scenarios. Refactor keyword matching into a pure _resolve_coords() function with longest-match-first sorting so specific locations like "Taiwan Strait" are not absorbed by generic "Taiwan". Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -39,6 +39,31 @@
|
||||
"name": "Mercopress",
|
||||
"url": "https://en.mercopress.com/rss/",
|
||||
"weight": 3
|
||||
},
|
||||
{
|
||||
"name": "FocusTaiwan",
|
||||
"url": "https://focustaiwan.tw/rss",
|
||||
"weight": 5
|
||||
},
|
||||
{
|
||||
"name": "Kyodo",
|
||||
"url": "https://english.kyodonews.net/rss/news.xml",
|
||||
"weight": 4
|
||||
},
|
||||
{
|
||||
"name": "SCMP",
|
||||
"url": "https://www.scmp.com/rss/91/feed",
|
||||
"weight": 4
|
||||
},
|
||||
{
|
||||
"name": "The Diplomat",
|
||||
"url": "https://thediplomat.com/feed/",
|
||||
"weight": 4
|
||||
},
|
||||
{
|
||||
"name": "Stars and Stripes",
|
||||
"url": "https://www.stripes.com/feeds/pacific.rss",
|
||||
"weight": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -33,6 +33,29 @@ _KEYWORD_COORDS = {
|
||||
"lebanon": (33.854, 35.862),
|
||||
"syria": (34.802, 38.996),
|
||||
"yemen": (15.552, 48.516),
|
||||
# East Asia — specific locations (longer keywords matched first via _SORTED_KEYWORDS)
|
||||
"taiwan strait": (24.0, 119.5),
|
||||
"south china sea": (15.0, 115.0),
|
||||
"east china sea": (28.0, 125.0),
|
||||
"philippine sea": (20.0, 130.0),
|
||||
"senkaku": (25.740, 123.474),
|
||||
"diaoyu": (25.740, 123.474),
|
||||
"ryukyu": (26.334, 127.800),
|
||||
"okinawa": (26.334, 127.800),
|
||||
"kadena": (26.351, 127.767),
|
||||
"naha": (26.212, 127.679),
|
||||
"yokosuka": (35.283, 139.671),
|
||||
"sasebo": (33.159, 129.722),
|
||||
"misawa": (40.682, 141.368),
|
||||
"iwakuni": (34.144, 132.236),
|
||||
"guam": (13.444, 144.793),
|
||||
"taipei": (25.033, 121.565),
|
||||
"kaohsiung": (22.616, 120.313),
|
||||
"xiamen": (24.479, 118.089),
|
||||
"fujian": (26.074, 119.296),
|
||||
"guangdong": (23.379, 113.763),
|
||||
"zhejiang": (29.141, 119.788),
|
||||
"hainan": (19.200, 109.999),
|
||||
"china": (35.861, 104.195),
|
||||
"beijing": (39.904, 116.407),
|
||||
"taiwan": (23.697, 120.960),
|
||||
@@ -90,6 +113,27 @@ _KEYWORD_COORDS = {
|
||||
"jakarta": (-6.208, 106.845),
|
||||
}
|
||||
|
||||
# Immutable after module load — descending keyword length guarantees that
# specific locations ("taiwan strait") are tried before generic ones ("taiwan").
_SORTED_KEYWORDS = sorted(
    _KEYWORD_COORDS.items(),
    key=lambda item: -len(item[0]),
)
|
||||
|
||||
|
||||
def _resolve_coords(text: str) -> tuple[float, float] | None:
    """Return (lat, lng) for the most specific keyword match, or None.

    Keywords are scanned longest-first (via _SORTED_KEYWORDS) so that
    "taiwan strait" wins over "taiwan". Space-padded keywords (" us ",
    " uk ") are matched as substrings of the space-padded text; every
    other keyword is matched with a word-boundary regex.
    """
    haystack = f" {text} "
    for keyword, latlng in _SORTED_KEYWORDS:
        is_padded = keyword.startswith(" ") or keyword.endswith(" ")
        if is_padded:
            hit = keyword in haystack
        else:
            hit = re.search(r'\b' + re.escape(keyword) + r'\b', text) is not None
        if hit:
            return latlng
    return None
|
||||
|
||||
|
||||
@with_retry(max_retries=1, base_delay=2)
|
||||
def fetch_news():
|
||||
@@ -140,8 +184,6 @@ def fetch_news():
|
||||
risk_score += 2
|
||||
risk_score = min(10, risk_score)
|
||||
|
||||
keyword_coords = _KEYWORD_COORDS
|
||||
|
||||
lat, lng = None, None
|
||||
|
||||
if 'georss_point' in entry:
|
||||
@@ -153,18 +195,10 @@ def fetch_news():
|
||||
lat, lng = coords[1], coords[0]
|
||||
|
||||
if lat is None:
|
||||
# text may not be defined yet for GDACS path
|
||||
text = (title + " " + summary).lower()
|
||||
padded_text = f" {text} "
|
||||
for kw, coords in keyword_coords.items():
|
||||
if kw.startswith(" ") or kw.endswith(" "):
|
||||
if kw in padded_text:
|
||||
lat, lng = coords
|
||||
break
|
||||
else:
|
||||
if re.search(r'\b' + re.escape(kw) + r'\b', text):
|
||||
lat, lng = coords
|
||||
break
|
||||
result = _resolve_coords(text)
|
||||
if result:
|
||||
lat, lng = result
|
||||
|
||||
if lat is not None:
|
||||
key = None
|
||||
|
||||
@@ -20,6 +20,11 @@ DEFAULT_FEEDS = [
|
||||
{"name": "NHK", "url": "https://www3.nhk.or.jp/nhkworld/rss/world.xml", "weight": 3},
|
||||
{"name": "CNA", "url": "https://www.channelnewsasia.com/rssfeed/8395986", "weight": 3},
|
||||
{"name": "Mercopress", "url": "https://en.mercopress.com/rss/", "weight": 3},
|
||||
{"name": "FocusTaiwan", "url": "https://focustaiwan.tw/rss", "weight": 5},
|
||||
{"name": "Kyodo", "url": "https://english.kyodonews.net/rss/news.xml", "weight": 4},
|
||||
{"name": "SCMP", "url": "https://www.scmp.com/rss/91/feed", "weight": 4},
|
||||
{"name": "The Diplomat", "url": "https://thediplomat.com/feed/", "weight": 4},
|
||||
{"name": "Stars and Stripes", "url": "https://www.stripes.com/feeds/pacific.rss", "weight": 4},
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
"""Regression tests for news geocoding keywords and feed configuration."""
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from services.fetchers.news import _resolve_coords
|
||||
from services.news_feed_config import DEFAULT_FEEDS
|
||||
|
||||
|
||||
CONFIG_PATH = Path(__file__).parent.parent / "config" / "news_feeds.json"
|
||||
|
||||
|
||||
# -- Keyword resolution: East Asia specific locations --------------------------
|
||||
|
||||
class TestResolveCoords:
    """_resolve_coords should prefer longer (more specific) keywords."""

    def test_taiwan_strait_not_absorbed_by_taiwan(self):
        assert _resolve_coords("tensions in the taiwan strait") == (24.0, 119.5)

    def test_south_china_sea_not_absorbed_by_china(self):
        assert _resolve_coords("south china sea patrol") == (15.0, 115.0)

    def test_east_china_sea(self):
        assert _resolve_coords("east china sea tensions") == (28.0, 125.0)

    def test_philippine_sea(self):
        assert _resolve_coords("philippine sea exercises") == (20.0, 130.0)

    def test_generic_china_still_works(self):
        assert _resolve_coords("china deploys forces") == (35.861, 104.195)

    def test_generic_taiwan_still_works(self):
        assert _resolve_coords("taiwan elections") == (23.697, 120.960)

    def test_taipei(self):
        assert _resolve_coords("protests in taipei") == (25.033, 121.565)

    def test_okinawa(self):
        assert _resolve_coords("okinawa base expansion") == (26.334, 127.800)

    # -- Existing inclusion-relationship regressions ---------------------------

    def test_new_delhi_not_absorbed_by_delhi(self):
        assert _resolve_coords("new delhi summit") == (28.613, 77.209)

    def test_south_america_not_absorbed_by_america(self):
        assert _resolve_coords("south america trade deal") == (-14.200, -51.900)

    def test_north_korea_not_absorbed_by_south_korea(self):
        assert _resolve_coords("north korea missile launch") == (40.339, 127.510)

    # -- Space-padded keywords -------------------------------------------------

    def test_us_with_spaces(self):
        assert _resolve_coords("the us military") == (38.907, -77.036)

    def test_uk_with_spaces(self):
        assert _resolve_coords("visit the uk soon") == (55.378, -3.435)

    # -- No match --------------------------------------------------------------

    def test_no_match_returns_none(self):
        assert _resolve_coords("unknown location xyz") is None
|
||||
|
||||
|
||||
# -- Feed configuration consistency -------------------------------------------
|
||||
|
||||
class TestFeedConfig:
    """DEFAULT_FEEDS and news_feeds.json must stay in sync."""

    def test_default_feeds_match_json(self):
        json_feeds = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))["feeds"]

        def canonical(feeds):
            # Keep only the compared fields, then order by name for a stable diff.
            trimmed = (
                {"name": f["name"], "url": f["url"], "weight": f["weight"]}
                for f in feeds
            )
            return sorted(trimmed, key=lambda f: f["name"])

        assert canonical(DEFAULT_FEEDS) == canonical(json_feeds)

    def test_new_east_asia_feeds_present(self):
        expected = {"FocusTaiwan", "Kyodo", "SCMP", "The Diplomat", "Stars and Stripes"}
        assert expected <= {feed["name"] for feed in DEFAULT_FEEDS}
|
||||
Reference in New Issue
Block a user