Files
Shadowbroker/backend/analytics/feed_adapter.py
T
Shadowbroker cfbeabda1e Feat/gt analytics openclaw (#392)
* feat(telegram): auto-translate OSINT channel posts to English

Cherry-picked from @Bobpick PR #391 (telegram-only slice): server-side translation during fetch, SHOW ORIGINAL toggle in TelegramOsintPopup, and on-demand /api/telegram-feed?lang=.

Co-authored-by: Robert Pickett <bobpickettsr@yahoo.com>
Co-authored-by: Cursor <cursoragent@cursor.com>

* feat(gt): experimental Derived OSINT analytics with lean-node safeguards

Cherry-picked from @Bobpick PR #391 (GT + OpenClaw slice): Bayesian strategic-risk engine, map overlay, OpenClaw commands, and telegram_rhetoric watchdog. Off by default (GT_ANALYTICS_ENABLED=false, gt_risk layer false). 1 vCPU nodes get cgroup detection, UI warning on layer toggle, and lean profile that skips scheduled ingest/Louvain unless GT_ANALYTICS_ACK_LOW_CPU=true. Backtest HUD removed from dashboard (OpenClaw/API regression only).

Co-authored-by: Robert Pickett <bobpickettsr@yahoo.com>
Co-authored-by: Cursor <cursoragent@cursor.com>

---------

Co-authored-by: Robert Pickett <bobpickettsr@yahoo.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-16 17:05:46 -06:00

206 lines
7.0 KiB
Python

"""Normalize Shadowbroker feed records into GT analytics feed items."""
from __future__ import annotations
import re
from typing import Any, Iterable
# Domain labels attached to normalized feed items.
_DOMAIN_CONFLICT = "conflict"
_DOMAIN_UNREST = "unrest"
_DOMAIN_FINANCIAL = "financial"

# Case-insensitive, word-bounded keyword hints used to infer a domain from
# free text. "strike" appears in both the conflict and the unrest pattern.
_CONFLICT_HINTS = re.compile(
    r"\b(war|missile|strike|attack|military|invasion|troop|shelling|drone|bomb|nuclear)\b",
    re.I,
)
# "mobiliz" and "demonstrat" are word stems, not whole words. With a bare
# trailing ``\b`` they could never match "mobilization", "demonstrators",
# etc., so each stem carries a ``\w*`` suffix that consumes the rest of the
# word before the boundary check.
_UNREST_HINTS = re.compile(
    r"\b(protest|rally|strike|riot|unrest|mobiliz\w*|demonstrat\w*|curfew|purge|coup)\b",
    re.I,
)
_FINANCIAL_HINTS = re.compile(
    r"\b(payroll|loan|default|bankruptcy|liquidity|sanction|supply\s+chain|delay|shortage)\b",
    re.I,
)
def _clean_region(value: Any) -> str:
    """Return *value* as a trimmed, lower-cased region key.

    Falsy or whitespace-only input yields the fallback key ``"global"``.
    """
    if not value:
        return "global"
    normalized = str(value).strip().lower()
    if normalized:
        return normalized
    return "global"
def _infer_domain(text: str, explicit: str | None = None) -> str:
    """Choose the analytics domain for a feed item.

    Args:
        text: Ingest text scanned for keyword hints.
        explicit: Domain supplied by the record. It is honoured only when it
            is exactly one of the known domain labels.

    Returns:
        One of the ``_DOMAIN_*`` labels. Conflict hints are checked before
        unrest hints; everything else is classified as financial.
    """
    known_domains = (_DOMAIN_CONFLICT, _DOMAIN_UNREST, _DOMAIN_FINANCIAL)
    # Feed records are untyped: a list/dict "domain" must be ignored rather
    # than raise TypeError (unhashable) from a set-membership test.
    if isinstance(explicit, str) and explicit in known_domains:
        return explicit
    if _CONFLICT_HINTS.search(text):
        return _DOMAIN_CONFLICT
    if _UNREST_HINTS.search(text):
        return _DOMAIN_UNREST
    # Financial is the catch-all domain: a financial keyword hit and no hit
    # at all both end up here, so no further pattern search is needed.
    return _DOMAIN_FINANCIAL
def _text_from_record(
    record: dict[str, Any],
    *,
    prefer_translation: bool = False,
) -> str:
    """Assemble the ingest text for a record.

    With ``prefer_translation`` set, the ``*_translated`` title/description
    fields are used when they contain any text; otherwise (or when they are
    blank) the title, description, text and summary fields are joined with
    newlines. Falsy and whitespace-only fields are dropped.
    """

    def _joined(*keys: str) -> str:
        # Collect the stripped, non-empty string form of each requested field.
        pieces = []
        for key in keys:
            value = record.get(key)
            if not value:
                continue
            stripped = str(value).strip()
            if stripped:
                pieces.append(stripped)
        return "\n".join(pieces)

    if prefer_translation:
        english = _joined("title_translated", "description_translated")
        if english:
            return english
    return _joined("title", "description", "text", "summary")
_HASHTAG_REGION = re.compile(r"#([a-z][a-z0-9_-]{2,})", re.I)
def _region_from_hashtags(text: str) -> str | None:
"""Map common theater hashtags (#Ukraine) to dossier/heatmap region keys."""
for match in _HASHTAG_REGION.finditer(text or ""):
tag = match.group(1).lower()
if tag in {
"ukraine",
"russia",
"israel",
"iran",
"gaza",
"syria",
"taiwan",
"china",
"belfast",
"uk",
"usa",
}:
return tag
return None
def _region_from_record(record: dict[str, Any], *, text: str = "") -> str:
    """Derive a region key for a record.

    Resolution order:
      1. The first informative value among the ``geotag``, ``region``,
         ``country`` and ``location`` fields, normalized by ``_clean_region``.
      2. A known region hashtag found in *text*.
      3. The ``coords`` pair formatted as ``"<lat>,<lng>"`` with two decimals.
      4. ``"global"``.

    Args:
        record: Source feed record.
        text: Ingest text scanned for region hashtags.

    Returns:
        The resolved region key.
    """
    for key in ("geotag", "region", "country", "location"):
        value = record.get(key)
        # A whitespace-only field carries no information; skipping it lets
        # the later keys and the hashtag/coords fallbacks apply instead of
        # collapsing straight to "global".
        if value and str(value).strip():
            return _clean_region(value)

    hashtag_region = _region_from_hashtags(text)
    if hashtag_region:
        return hashtag_region

    coords = record.get("coords")
    if isinstance(coords, (list, tuple)) and len(coords) >= 2:
        try:
            lat = float(coords[0])
            lng = float(coords[1])
        except (TypeError, ValueError):
            # Non-numeric coordinates: fall through to the global default.
            pass
        else:
            return f"{lat:.2f},{lng:.2f}"
    return "global"
def _entities_from_record(record: dict[str, Any]) -> list[str]:
    """Collect entity labels for a record.

    Values come from the ``entities``, ``tags`` and ``keywords`` fields, each
    of which may be a list/tuple of values or a comma-separated string.
    Blank entries are dropped, and ``None`` entries are skipped rather than
    being turned into the literal string ``"None"``. Non-empty ``channel``
    and ``source`` fields are appended as ``channel:<name>`` and
    ``source:<name>``.

    Args:
        record: Source feed record.

    Returns:
        Entity labels in field order; duplicates are kept.
    """
    entities: list[str] = []
    for key in ("entities", "tags", "keywords"):
        raw = record.get(key)
        if isinstance(raw, str):
            candidates = raw.split(",")
        elif isinstance(raw, (list, tuple)):
            candidates = [str(value) for value in raw if value is not None]
        else:
            continue
        entities.extend(part.strip() for part in candidates if part.strip())

    for label in ("channel", "source"):
        value = str(record.get(label) or "").strip()
        if value:
            entities.append(f"{label}:{value}")
    return entities
def normalize_feed_item(record: dict[str, Any], *, source_type: str = "generic") -> dict[str, Any]:
    """Map a news/Telegram/GDELT record into the GT engine schema.

    Args:
        record: Raw feed record.
        source_type: Feed label; ``"telegram_osint"`` makes the translated
            text fields preferred when building the item text.

    Returns:
        A dict with the keys ``id``, ``text``, ``source``, ``source_type``,
        ``region``, ``domain``, ``entities``, ``coords``, ``published`` and
        ``risk_score``. ``coords`` is ``[lat, lng]`` as floats, or ``None``
        when the record has no usable coordinate pair.
    """
    wants_english = source_type == "telegram_osint"
    body = _text_from_record(record, prefer_translation=wants_english)
    if wants_english and not body.strip():
        # Nothing usable came back; retry with the untranslated fields.
        body = _text_from_record(record, prefer_translation=False)

    region_key = _region_from_record(record, text=body)
    domain_key = _infer_domain(body, record.get("domain"))

    # Parse the coordinate pair; anything non-numeric leaves the point unset.
    point = None
    raw_coords = record.get("coords")
    if isinstance(raw_coords, (list, tuple)) and len(raw_coords) >= 2:
        try:
            point = [float(raw_coords[0]), float(raw_coords[1])]
        except (TypeError, ValueError):
            point = None

    item: dict[str, Any] = {}
    item["id"] = record.get("id") or record.get("link")
    item["text"] = body
    item["source"] = str(record.get("source") or source_type)
    item["source_type"] = source_type
    item["region"] = region_key
    item["domain"] = domain_key
    item["entities"] = _entities_from_record(record)
    item["coords"] = point
    item["published"] = record.get("published")
    item["risk_score"] = record.get("risk_score")
    return item
def iter_telegram_posts(payload: dict[str, Any] | None) -> Iterable[dict[str, Any]]:
    """Yield normalized feed items for the posts in a Telegram payload.

    Entries of ``payload["posts"]`` that are not dicts, or that have neither
    a description nor a title, are skipped. When
    ``telegram_translate_enabled()`` returns true, each post is passed
    through ``apply_post_translation`` before normalization.
    """
    # Imported lazily, on first iteration of the generator.
    from services.telegram_translate import apply_post_translation, telegram_translate_enabled

    container = payload if payload else {}
    for entry in list(container.get("posts") or []):
        if not isinstance(entry, dict):
            continue
        has_content = entry.get("description") or entry.get("title")
        if not has_content:
            continue
        if telegram_translate_enabled():
            entry = apply_post_translation(entry)
        yield normalize_feed_item(entry, source_type="telegram_osint")
def iter_news_items(payload: list[dict[str, Any]] | None) -> Iterable[dict[str, Any]]:
    """Yield normalized feed items for news entries and their nested articles.

    Each dict entry is emitted with source type ``"news"``, followed by every
    dict found in its ``articles`` list with source type ``"news_cluster"``.
    Non-dict entries and non-dict articles are ignored.
    """
    for entry in list(payload or []):
        if isinstance(entry, dict):
            yield normalize_feed_item(entry, source_type="news")
            nested = list(entry.get("articles") or [])
            yield from (
                normalize_feed_item(member, source_type="news_cluster")
                for member in nested
                if isinstance(member, dict)
            )
def iter_gdelt_features(payload: list[dict[str, Any]] | None) -> Iterable[dict[str, Any]]:
    """Yield normalized feed items for GDELT GeoJSON-style features.

    Each dict feature is flattened into a record built from its
    ``properties`` (name/title, snippet/description, source, date/published,
    location/country) and, for ``Point`` geometries, its coordinates.
    Features with neither a title nor a description are skipped.

    Args:
        payload: List of feature dicts, or ``None``.

    Yields:
        Items produced by ``normalize_feed_item`` with source type
        ``"gdelt"``.
    """
    for feature in list(payload or []):
        if not isinstance(feature, dict):
            continue
        props = dict(feature.get("properties") or {})
        geometry = dict(feature.get("geometry") or {})

        coords = None
        if geometry.get("type") == "Point":
            raw = geometry.get("coordinates")
            if isinstance(raw, (list, tuple)) and len(raw) >= 2:
                try:
                    # Swap the pair: the geometry stores index 0/1 in the
                    # opposite order to the [lat, lng] layout that
                    # normalize_feed_item reads from "coords".
                    coords = [float(raw[1]), float(raw[0])]
                except (TypeError, ValueError):
                    # One malformed coordinate must not abort the whole
                    # feed; keep the feature and drop only its position.
                    coords = None

        record = {
            "title": props.get("name") or props.get("title"),
            "description": props.get("snippet") or props.get("description"),
            "source": props.get("source") or "gdelt",
            "coords": coords,
            "published": props.get("date") or props.get("published"),
            "region": props.get("location") or props.get("country"),
        }
        if record["title"] or record["description"]:
            yield normalize_feed_item(record, source_type="gdelt")