mirror of
https://github.com/BigBodyCobain/Shadowbroker.git
synced 2026-06-21 05:20:11 +02:00
cfbeabda1e
* feat(telegram): auto-translate OSINT channel posts to English Cherry-picked from @Bobpick PR #391 (telegram-only slice): server-side translation during fetch, SHOW ORIGINAL toggle in TelegramOsintPopup, and on-demand /api/telegram-feed?lang=. Co-authored-by: Robert Pickett <bobpickettsr@yahoo.com> Co-authored-by: Cursor <cursoragent@cursor.com> * feat(gt): experimental Derived OSINT analytics with lean-node safeguards Cherry-picked from @Bobpick PR #391 (GT + OpenClaw slice): Bayesian strategic-risk engine, map overlay, OpenClaw commands, and telegram_rhetoric watchdog. Off by default (GT_ANALYTICS_ENABLED=false, gt_risk layer false). 1 vCPU nodes get cgroup detection, UI warning on layer toggle, and lean profile that skips scheduled ingest/Louvain unless GT_ANALYTICS_ACK_LOW_CPU=true. Backtest HUD removed from dashboard (OpenClaw/API regression only). Co-authored-by: Robert Pickett <bobpickettsr@yahoo.com> Co-authored-by: Cursor <cursoragent@cursor.com> --------- Co-authored-by: Robert Pickett <bobpickettsr@yahoo.com> Co-authored-by: Cursor <cursoragent@cursor.com>
206 lines
7.0 KiB
Python
206 lines
7.0 KiB
Python
"""Normalize Shadowbroker feed records into GT analytics feed items."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Any, Iterable
|
|
|
|
# Domain labels written to the "domain" field of normalized feed items.
_DOMAIN_CONFLICT = "conflict"
_DOMAIN_UNREST = "unrest"
_DOMAIN_FINANCIAL = "financial"

# Keyword patterns used to guess a domain from free text. All of them are
# case-insensitive and anchored on word boundaries.
_CONFLICT_HINTS = re.compile(
    r"\b(war|missile|strike|attack|military|invasion|troop|shelling|drone|bomb|nuclear)\b",
    re.I,
)
# "mobiliz" and "demonstrat" are word stems, not words: with a bare trailing
# \b they could never match "mobilization" or "demonstrations", so they carry
# an explicit \w* suffix.
_UNREST_HINTS = re.compile(
    r"\b(protest|rally|strike|riot|unrest|mobiliz\w*|demonstrat\w*|curfew|purge|coup)\b",
    re.I,
)
_FINANCIAL_HINTS = re.compile(
    r"\b(payroll|loan|default|bankruptcy|liquidity|sanction|supply\s+chain|delay|shortage)\b",
    re.I,
)
|
|
|
|
|
|
def _clean_region(value: Any) -> str:
    """Return *value* as a trimmed, lower-cased region key.

    Falsy input, or input that is empty after trimming, yields the catch-all
    key ``"global"``.
    """
    if not value:
        return "global"
    normalized = str(value).strip().lower()
    return normalized if normalized else "global"
|
|
|
|
|
|
def _infer_domain(text: str, explicit: str | None = None) -> str:
    """Choose the domain label for a feed item.

    Args:
        text: Free text of the item, scanned for domain keywords.
        explicit: Domain supplied by the record itself. It is honoured only
            when it is one of the known domain labels.

    Returns:
        ``explicit`` when it is a known label; otherwise the label of the
        first keyword pattern that matches ``text`` (conflict, then unrest,
        then financial); otherwise the financial label as the default.
    """
    # The isinstance guard keeps an unhashable value (e.g. a list decoded from
    # JSON) from raising TypeError in the set membership test.
    if isinstance(explicit, str) and explicit in {
        _DOMAIN_CONFLICT,
        _DOMAIN_UNREST,
        _DOMAIN_FINANCIAL,
    }:
        return explicit

    # Order matters: "strike" appears in both the conflict and the unrest
    # patterns, and conflict wins.
    ordered_hints = (
        (_CONFLICT_HINTS, _DOMAIN_CONFLICT),
        (_UNREST_HINTS, _DOMAIN_UNREST),
        (_FINANCIAL_HINTS, _DOMAIN_FINANCIAL),
    )
    for pattern, domain in ordered_hints:
        if pattern.search(text):
            return domain

    # Nothing matched: financial is the fallback domain.
    return _DOMAIN_FINANCIAL
|
|
|
|
|
|
def _text_from_record(
    record: dict[str, Any],
    *,
    prefer_translation: bool = False,
) -> str:
    """Assemble the ingest text for a record.

    Args:
        record: Source record; fields are read with ``record.get``.
        prefer_translation: When true, use ``title_translated`` and
            ``description_translated`` if at least one of them is non-blank.

    Returns:
        The selected fields, each stringified and trimmed, joined with
        newlines. Falsy or blank fields are dropped. Without a usable
        translation the text comes from ``title``, ``description``, ``text``
        and ``summary``, in that order.
    """

    def _joined(*keys: str) -> str:
        # Collect the non-blank values of *keys*, preserving key order.
        chunks = []
        for key in keys:
            raw = record.get(key)
            if not raw:
                continue
            cleaned = str(raw).strip()
            if cleaned:
                chunks.append(cleaned)
        return "\n".join(chunks)

    if prefer_translation:
        english = _joined("title_translated", "description_translated")
        if english:
            return english

    return _joined("title", "description", "text", "summary")
|
|
|
|
|
|
# A hashtag body is a letter followed by one or more tag characters. The
# minimum length is two so that short tags such as "#uk" are captured.
_HASHTAG_REGION = re.compile(r"#([a-z][a-z0-9_-]+)", re.I)

# Lower-cased hashtags that are accepted as region keys.
_HASHTAG_REGION_KEYS = frozenset(
    {
        "ukraine",
        "russia",
        "israel",
        "iran",
        "gaza",
        "syria",
        "taiwan",
        "china",
        "belfast",
        "uk",
        "usa",
    }
)


def _region_from_hashtags(text: str) -> str | None:
    """Map common theater hashtags (#Ukraine) to dossier/heatmap region keys.

    Args:
        text: Text to scan; ``None`` or empty text is treated as no hashtags.

    Returns:
        The lower-cased tag of the first hashtag in ``text`` that is a known
        region key, or ``None`` when no hashtag qualifies. Tags must match a
        key exactly, so ``#ukrainian`` does not map to ``ukraine``.
    """
    for match in _HASHTAG_REGION.finditer(text or ""):
        tag = match.group(1).lower()
        if tag in _HASHTAG_REGION_KEYS:
            return tag
    return None
|
|
|
|
|
|
def _region_from_record(record: dict[str, Any], *, text: str = "") -> str:
    """Derive a region key for a record.

    Sources are tried in order:

    1. The first of ``geotag``, ``region``, ``country``, ``location`` that
       holds a non-blank value, normalized by ``_clean_region``.
    2. A recognized region hashtag found in ``text``.
    3. ``coords`` (a list/tuple with at least two numeric-convertible items),
       rendered as ``"<lat>,<lng>"`` with two decimals.

    Args:
        record: Source record.
        text: Item text scanned for region hashtags.

    Returns:
        The region key, or ``"global"`` when no source yields one.
    """
    for key in ("geotag", "region", "country", "location"):
        value = record.get(key)
        # A whitespace-only value carries no region. Skip it so that it does
        # not short-circuit to "global" and hide the later sources.
        if value and str(value).strip():
            return _clean_region(value)

    hashtag_region = _region_from_hashtags(text)
    if hashtag_region:
        return hashtag_region

    coords = record.get("coords")
    if isinstance(coords, (list, tuple)) and len(coords) >= 2:
        try:
            lat = float(coords[0])
            lng = float(coords[1])
        except (TypeError, ValueError):
            # Unparseable coordinates: fall through to the default.
            pass
        else:
            return f"{lat:.2f},{lng:.2f}"

    return "global"
|
|
|
|
|
|
def _entities_from_record(record: dict[str, Any]) -> list[str]:
    """Collect entity labels from a record.

    Labels are gathered from ``entities``, ``tags`` and ``keywords`` (each
    either a list/tuple of values or a comma-separated string), followed by
    ``channel:<name>`` and ``source:<name>`` when those fields are non-blank.

    Args:
        record: Source record.

    Returns:
        Trimmed, non-empty labels in the order they were found. Duplicates
        are kept.
    """
    entities: list[str] = []

    for key in ("entities", "tags", "keywords"):
        raw = record.get(key)
        if isinstance(raw, (list, tuple)):
            for value in raw:
                # A null entry would otherwise be stringified into the bogus
                # label "None".
                if value is None:
                    continue
                label = str(value).strip()
                if label:
                    entities.append(label)
        elif isinstance(raw, str):
            entities.extend(part.strip() for part in raw.split(",") if part.strip())

    # Provenance labels are prefixed with the field they came from.
    for field in ("channel", "source"):
        name = str(record.get(field) or "").strip()
        if name:
            entities.append(f"{field}:{name}")

    return entities
|
|
|
|
|
|
def normalize_feed_item(record: dict[str, Any], *, source_type: str = "generic") -> dict[str, Any]:
    """Map a news/Telegram/GDELT record into the GT engine schema.

    Args:
        record: Raw feed record.
        source_type: Origin tag for the record. ``"telegram_osint"`` makes
            translated title/description fields the preferred text source.

    Returns:
        A dict with the keys ``id``, ``text``, ``source``, ``source_type``,
        ``region``, ``domain``, ``entities``, ``coords``, ``published`` and
        ``risk_score``. ``coords`` is ``[lat, lng]`` as floats, or ``None``
        when the record has no parseable coordinate pair.
    """
    is_telegram = source_type == "telegram_osint"
    text = _text_from_record(record, prefer_translation=is_telegram)
    if is_telegram and not text.strip():
        # No usable text yet: retry from the untranslated fields only.
        text = _text_from_record(record, prefer_translation=False)

    region = _region_from_record(record, text=text)
    domain = _infer_domain(text, record.get("domain"))

    point = None
    raw_coords = record.get("coords")
    if isinstance(raw_coords, (list, tuple)) and len(raw_coords) >= 2:
        try:
            point = [float(raw_coords[0]), float(raw_coords[1])]
        except (TypeError, ValueError):
            point = None

    return {
        "id": record.get("id") or record.get("link"),
        "text": text,
        "source": str(record.get("source") or source_type),
        "source_type": source_type,
        "region": region,
        "domain": domain,
        "entities": _entities_from_record(record),
        "coords": point,
        "published": record.get("published"),
        "risk_score": record.get("risk_score"),
    }
|
|
|
|
|
|
def iter_telegram_posts(payload: dict[str, Any] | None) -> Iterable[dict[str, Any]]:
    """Yield normalized feed items for the posts in a Telegram payload.

    Posts are read from ``payload["posts"]``. Entries that are not dicts, or
    that have neither a ``description`` nor a ``title``, are skipped. When
    ``telegram_translate_enabled()`` is truthy the post is first passed
    through ``apply_post_translation``. Every item is normalized with
    ``source_type="telegram_osint"``.

    Args:
        payload: Telegram feed payload; ``None`` yields nothing.
    """
    # Imported lazily, inside the generator body, as in the original module.
    from services.telegram_translate import apply_post_translation, telegram_translate_enabled

    snapshot = list((payload or {}).get("posts") or [])
    for post in snapshot:
        if not isinstance(post, dict):
            continue
        has_body = post.get("description") or post.get("title")
        if not has_body:
            continue
        if telegram_translate_enabled():
            post = apply_post_translation(post)
        yield normalize_feed_item(post, source_type="telegram_osint")
|
|
|
|
|
|
def iter_news_items(payload: list[dict[str, Any]] | None) -> Iterable[dict[str, Any]]:
    """Yield normalized feed items for news entries and their nested articles.

    Each dict entry is yielded with ``source_type="news"``, immediately
    followed by every dict found in its ``articles`` list, yielded with
    ``source_type="news_cluster"``. Non-dict entries and articles are skipped.

    Args:
        payload: List of news entries; ``None`` yields nothing.
    """
    for item in list(payload or []):
        if isinstance(item, dict):
            yield normalize_feed_item(item, source_type="news")
            nested = list(item.get("articles") or [])
            yield from (
                normalize_feed_item(article, source_type="news_cluster")
                for article in nested
                if isinstance(article, dict)
            )
|
|
|
|
|
|
def iter_gdelt_features(payload: list[dict[str, Any]] | None) -> Iterable[dict[str, Any]]:
    """Yield normalized feed items for GDELT map features.

    Each feature's ``properties`` are mapped onto a record (title,
    description, source, published, region) and normalized with
    ``source_type="gdelt"``. Features with neither a title nor a description
    are skipped, as are entries that are not dicts.

    Malformed parts of a feature degrade gracefully instead of aborting the
    whole iteration: non-dict ``properties``/``geometry`` are treated as
    empty, and unparseable point coordinates leave ``coords`` as ``None``.

    Args:
        payload: List of feature dicts; ``None`` yields nothing.
    """
    for feature in list(payload or []):
        if not isinstance(feature, dict):
            continue

        props = feature.get("properties")
        if not isinstance(props, dict):
            props = {}
        geometry = feature.get("geometry")
        if not isinstance(geometry, dict):
            geometry = {}

        coords = None
        if geometry.get("type") == "Point":
            raw = geometry.get("coordinates")
            if isinstance(raw, (list, tuple)) and len(raw) >= 2:
                try:
                    # Point coordinates are read as [lng, lat] (GeoJSON
                    # order) and swapped into the [lat, lng] order that the
                    # normalizer expects.
                    coords = [float(raw[1]), float(raw[0])]
                except (TypeError, ValueError):
                    coords = None

        record = {
            "title": props.get("name") or props.get("title"),
            "description": props.get("snippet") or props.get("description"),
            "source": props.get("source") or "gdelt",
            "coords": coords,
            "published": props.get("date") or props.get("published"),
            "region": props.get("location") or props.get("country"),
        }
        if record["title"] or record["description"]:
            yield normalize_feed_item(record, source_type="gdelt")