# Shadowbroker/backend/analytics/feed_adapter.py

"""Normalize Shadowbroker feed records into GT analytics feed items."""

from __future__ import annotations

import re
from typing import Any, Iterable

# Domain labels written to the "domain" field of normalized feed items.
_DOMAIN_CONFLICT = "conflict"
_DOMAIN_UNREST = "unrest"
_DOMAIN_FINANCIAL = "financial"

# Keyword heuristics for classifying free text. Every pattern is
# case-insensitive and anchored on word boundaries, so plain alternatives only
# match the exact word (e.g. "troop" does not match "troops").
_CONFLICT_HINTS = re.compile(
    r"\b(war|missile|strike|attack|military|invasion|troop|shelling|drone|bomb|nuclear)\b",
    re.I,
)
# "mobiliz" and "demonstrat" are stems, not words: followed directly by \b they
# could never match "mobilization" or "demonstrators". The \w* suffix lets the
# stem absorb the rest of the word before the closing boundary.
# NOTE: "strike" is listed in both the conflict and unrest patterns, so the
# order in which a caller probes the patterns decides how it is classified.
_UNREST_HINTS = re.compile(
    r"\b(protest|rally|strike|riot|unrest|mobiliz\w*|demonstrat\w*|curfew|purge|coup)\b",
    re.I,
)
_FINANCIAL_HINTS = re.compile(
    r"\b(payroll|loan|default|bankruptcy|liquidity|sanction|supply\s+chain|delay|shortage)\b",
    re.I,
)


def _clean_region(value: Any) -> str:
    """Return *value* as a trimmed, lower-cased region key.

    Falsy inputs, and inputs that are blank once trimmed, map to "global".
    """
    if not value:
        return "global"
    normalized = str(value).strip().lower()
    if normalized:
        return normalized
    return "global"


def _infer_domain(text: str, explicit: str | None = None) -> str:
    """Choose the analytics domain for a feed item.

    Args:
        text: Ingest text scanned for keyword hints.
        explicit: Domain supplied by the record itself. It is honoured only
            when it is exactly one of the known domain labels.

    Returns:
        One of the three domain labels. Conflict hints take precedence over
        unrest hints; text that matches neither is classified as financial.
    """
    # Feed records are untyped, so ``explicit`` may arrive as a list or dict.
    # Testing an unhashable value for set membership raises TypeError, hence
    # the str guard before the lookup.
    known_domains = {_DOMAIN_CONFLICT, _DOMAIN_UNREST, _DOMAIN_FINANCIAL}
    if isinstance(explicit, str) and explicit in known_domains:
        return explicit
    if _CONFLICT_HINTS.search(text):
        return _DOMAIN_CONFLICT
    if _UNREST_HINTS.search(text):
        return _DOMAIN_UNREST
    # Financial is the fallback domain, so probing the financial hints here
    # could not change the outcome.
    return _DOMAIN_FINANCIAL


def _text_from_record(
    record: dict[str, Any],
    *,
    prefer_translation: bool = False,
) -> str:
    """Assemble the ingest text for a record.

    The non-blank values of the selected fields are trimmed and joined with
    newlines. When ``prefer_translation`` is set, the translated title and
    description are used if they yield any text; otherwise the original
    title, description, text and summary fields are used.
    """

    def _joined(*keys: str) -> str:
        # Trim each truthy field and keep only those with visible content.
        pieces = []
        for key in keys:
            value = record.get(key)
            if not value:
                continue
            trimmed = str(value).strip()
            if trimmed:
                pieces.append(trimmed)
        return "\n".join(pieces)

    if prefer_translation:
        translated = _joined("title_translated", "description_translated")
        if translated:
            return translated

    return _joined("title", "description", "text", "summary")


# A hashtag body is a letter followed by at least one more tag character.
# The minimum length is two so that two-letter keys such as "uk" can match;
# requiring three or more characters made that key unreachable.
_HASHTAG_REGION = re.compile(r"#([a-z][a-z0-9_-]+)", re.I)

# Lower-cased hashtags that are accepted verbatim as region keys.
_HASHTAG_REGION_KEYS = frozenset(
    {
        "ukraine",
        "russia",
        "israel",
        "iran",
        "gaza",
        "syria",
        "taiwan",
        "china",
        "belfast",
        "uk",
        "usa",
    }
)


def _region_from_hashtags(text: str) -> str | None:
    """Map common theater hashtags (#Ukraine) to dossier/heatmap region keys.

    Hashtags are examined in order of appearance and compared
    case-insensitively against the known region keys.

    Args:
        text: Text to scan; ``None`` or an empty string is treated as no text.

    Returns:
        The lower-cased key of the first recognised hashtag, or ``None`` when
        no hashtag names a known region.
    """
    for match in _HASHTAG_REGION.finditer(text or ""):
        tag = match.group(1).lower()
        if tag in _HASHTAG_REGION_KEYS:
            return tag
    return None


def _region_from_record(record: dict[str, Any], *, text: str = "") -> str:
    """Derive a region key for a record.

    Sources are tried in this order:

    1. the first of the ``geotag``, ``region``, ``country`` and ``location``
       fields that holds a non-blank value, cleaned via ``_clean_region``;
    2. a recognised region hashtag found in ``text``;
    3. ``coords``, rendered as ``"lat,lng"`` with two decimals;
    4. the literal ``"global"``.

    Args:
        record: Raw feed record.
        text: Ingest text scanned for region hashtags.

    Returns:
        The region key as a string.
    """
    for key in ("geotag", "region", "country", "location"):
        value = record.get(key)
        # A whitespace-only value is truthy but carries no region. Skip it so
        # the remaining fields, hashtags and coords still get a chance instead
        # of short-circuiting to "global".
        if value and str(value).strip():
            return _clean_region(value)
    hashtag_region = _region_from_hashtags(text)
    if hashtag_region:
        return hashtag_region
    coords = record.get("coords")
    if isinstance(coords, (list, tuple)) and len(coords) >= 2:
        try:
            lat = float(coords[0])
            lng = float(coords[1])
            return f"{lat:.2f},{lng:.2f}"
        except (TypeError, ValueError):
            # Unparseable coordinates: fall through to the global default.
            pass
    return "global"


def _entities_from_record(record: dict[str, Any]) -> list[str]:
    """Collect entity labels from a record.

    The ``entities``, ``tags`` and ``keywords`` fields are read in that order.
    A list contributes the trimmed string form of each element; a string is
    split on commas. Blank results are dropped. A non-blank ``channel`` and
    ``source`` are then appended as ``"channel:<value>"`` and
    ``"source:<value>"``.

    Args:
        record: Raw feed record.

    Returns:
        Entity labels in encounter order; duplicates are not removed.
    """
    entities: list[str] = []
    for key in ("entities", "tags", "keywords"):
        raw = record.get(key)
        if isinstance(raw, list):
            # None must be skipped explicitly: str(None) is the truthy string
            # "None", which would otherwise be emitted as an entity.
            labels = (str(value).strip() for value in raw if value is not None)
            entities.extend(label for label in labels if label)
        elif isinstance(raw, str):
            entities.extend(part.strip() for part in raw.split(",") if part.strip())
    channel = str(record.get("channel") or "").strip()
    if channel:
        entities.append(f"channel:{channel}")
    source = str(record.get("source") or "").strip()
    if source:
        entities.append(f"source:{source}")
    return entities


def normalize_feed_item(record: dict[str, Any], *, source_type: str = "generic") -> dict[str, Any]:
    """Convert a news/Telegram/GDELT record into a GT engine feed item.

    Args:
        record: Raw feed record.
        source_type: Label for the originating feed. ``"telegram_osint"``
            additionally makes translated text fields preferred.

    Returns:
        A dict with the keys ``id``, ``text``, ``source``, ``source_type``,
        ``region``, ``domain``, ``entities``, ``coords``, ``published`` and
        ``risk_score``. ``coords`` is ``[lat, lng]`` as floats, or ``None``
        when the record has no parseable coordinate pair.
    """
    use_translation = source_type == "telegram_osint"
    body = _text_from_record(record, prefer_translation=use_translation)
    if use_translation and not body.strip():
        # Nothing usable came back: retry with the untranslated fields only.
        body = _text_from_record(record, prefer_translation=False)

    region_key = _region_from_record(record, text=body)
    domain_label = _infer_domain(body, record.get("domain"))

    # Both values must convert to float, otherwise the pair is discarded.
    point = None
    raw_coords = record.get("coords")
    if isinstance(raw_coords, (list, tuple)) and len(raw_coords) >= 2:
        try:
            point = [float(raw_coords[0]), float(raw_coords[1])]
        except (TypeError, ValueError):
            point = None

    return {
        "id": record.get("id") or record.get("link"),
        "text": body,
        "source": str(record.get("source") or source_type),
        "source_type": source_type,
        "region": region_key,
        "domain": domain_label,
        "entities": _entities_from_record(record),
        "coords": point,
        "published": record.get("published"),
        "risk_score": record.get("risk_score"),
    }


def iter_telegram_posts(payload: dict[str, Any] | None) -> Iterable[dict[str, Any]]:
    """Yield normalized feed items for the posts in a Telegram payload.

    Posts are read from ``payload["posts"]``. Entries that are not dicts, or
    that have neither a ``description`` nor a ``title``, are skipped. Each
    remaining post is passed through ``apply_post_translation`` whenever
    ``telegram_translate_enabled()`` reports true, then normalized with
    source type ``"telegram_osint"``.
    """
    from services.telegram_translate import apply_post_translation, telegram_translate_enabled

    raw_posts = (payload or {}).get("posts") or []
    for entry in list(raw_posts):
        if not isinstance(entry, dict):
            continue
        has_body = entry.get("description") or entry.get("title")
        if not has_body:
            continue
        # The translation toggle is consulted for every post.
        if telegram_translate_enabled():
            entry = apply_post_translation(entry)
        yield normalize_feed_item(entry, source_type="telegram_osint")


def iter_news_items(payload: list[dict[str, Any]] | None) -> Iterable[dict[str, Any]]:
    """Yield normalized feed items for news entries and their clustered articles.

    Each dict entry is emitted with source type ``"news"``, immediately
    followed by every dict found under its ``articles`` key, emitted with
    source type ``"news_cluster"``. Non-dict entries and articles are skipped.
    """
    for entry in list(payload or []):
        if isinstance(entry, dict):
            yield normalize_feed_item(entry, source_type="news")
            clustered = list(entry.get("articles") or [])
            yield from (
                normalize_feed_item(child, source_type="news_cluster")
                for child in clustered
                if isinstance(child, dict)
            )


def iter_gdelt_features(payload: list[dict[str, Any]] | None) -> Iterable[dict[str, Any]]:
    """Yield normalized feed items for GDELT map features.

    Each dict feature is flattened into a record built from its
    ``properties`` (name/title, snippet/description, source, date/published,
    location/country) and, for ``Point`` geometries, its coordinates.
    Features with neither a title nor a description are skipped.

    Malformed features do not abort the stream: non-mapping ``properties`` or
    ``geometry`` are treated as empty, and coordinates that cannot be
    converted to float are dropped, matching how ``normalize_feed_item``
    tolerates bad ``coords``.

    Args:
        payload: Sequence of feature dicts, or ``None``.

    Yields:
        Items normalized with source type ``"gdelt"``.
    """
    for feature in list(payload or []):
        if not isinstance(feature, dict):
            continue
        props = feature.get("properties")
        if not isinstance(props, dict):
            props = {}
        geometry = feature.get("geometry")
        if not isinstance(geometry, dict):
            geometry = {}
        coords = None
        if geometry.get("type") == "Point":
            raw = geometry.get("coordinates")
            if isinstance(raw, (list, tuple)) and len(raw) >= 2:
                try:
                    # Point coordinates arrive as [lng, lat] (GeoJSON order);
                    # the record carries them as [lat, lng].
                    coords = [float(raw[1]), float(raw[0])]
                except (TypeError, ValueError):
                    coords = None
        record = {
            "title": props.get("name") or props.get("title"),
            "description": props.get("snippet") or props.get("description"),
            "source": props.get("source") or "gdelt",
            "coords": coords,
            "published": props.get("date") or props.get("published"),
            "region": props.get("location") or props.get("country"),
        }
        if record["title"] or record["description"]:
            yield normalize_feed_item(record, source_type="gdelt")