mirror of
https://github.com/BigBodyCobain/Shadowbroker.git
synced 2026-07-03 02:55:53 +02:00
cfbeabda1e
* feat(telegram): auto-translate OSINT channel posts to English Cherry-picked from @Bobpick PR #391 (telegram-only slice): server-side translation during fetch, SHOW ORIGINAL toggle in TelegramOsintPopup, and on-demand /api/telegram-feed?lang=. Co-authored-by: Robert Pickett <bobpickettsr@yahoo.com> Co-authored-by: Cursor <cursoragent@cursor.com> * feat(gt): experimental Derived OSINT analytics with lean-node safeguards Cherry-picked from @Bobpick PR #391 (GT + OpenClaw slice): Bayesian strategic-risk engine, map overlay, OpenClaw commands, and telegram_rhetoric watchdog. Off by default (GT_ANALYTICS_ENABLED=false, gt_risk layer false). 1 vCPU nodes get cgroup detection, UI warning on layer toggle, and lean profile that skips scheduled ingest/Louvain unless GT_ANALYTICS_ACK_LOW_CPU=true. Backtest HUD removed from dashboard (OpenClaw/API regression only). Co-authored-by: Robert Pickett <bobpickettsr@yahoo.com> Co-authored-by: Cursor <cursoragent@cursor.com> --------- Co-authored-by: Robert Pickett <bobpickettsr@yahoo.com> Co-authored-by: Cursor <cursoragent@cursor.com>
243 lines
7.7 KiB
Python
243 lines
7.7 KiB
Python
"""Auto-translation for Telegram OSINT post text (server-side, cached)."""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import logging
|
|
import os
|
|
import re
|
|
import urllib.parse
|
|
from threading import Lock
|
|
from typing import Any
|
|
|
|
import requests
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_CYRILLIC_RE = re.compile(r"[\u0400-\u04FF]")
|
|
_UKRAINIAN_MARKERS_RE = re.compile(r"[іїєґІЇЄҐ]")
|
|
_ARABIC_RE = re.compile(r"[\u0600-\u06FF]")
|
|
_HEBREW_RE = re.compile(r"[\u0590-\u05FF]")
|
|
_CJK_RE = re.compile(r"[\u4e00-\u9fff]")
|
|
|
|
# Common war-reporting shorthand that machine translation often transliterates.
|
|
_POST_TRANSLATION_GLOSSARY: tuple[tuple[re.Pattern[str], str], ...] = (
|
|
(re.compile(r"\bBpLa\b", re.IGNORECASE), "UAV"),
|
|
(re.compile(r"\bБпЛА\b", re.IGNORECASE), "UAV"),
|
|
(re.compile(r"\bбпла\b"), "UAV"),
|
|
(re.compile(r"\bБПЛА\b"), "UAV"),
|
|
(re.compile(r"\bрсзв\b", re.IGNORECASE), "MLRS"),
|
|
(re.compile(r"\bРСЗВ\b"), "MLRS"),
|
|
)
|
|
|
|
_SOURCE_LANG_LABELS = {
|
|
"uk": "Ukrainian",
|
|
"ru": "Russian",
|
|
"en": "English",
|
|
"ar": "Arabic",
|
|
"he": "Hebrew",
|
|
"zh-cn": "Chinese",
|
|
"fr": "French",
|
|
"de": "German",
|
|
"pl": "Polish",
|
|
}
|
|
|
|
_CACHE: dict[str, tuple[str, str]] = {}
|
|
_CACHE_LOCK = Lock()
|
|
_CACHE_MAX = 512
|
|
|
|
_LOCALE_TO_GOOGLE = {
|
|
"en": "en",
|
|
"fr": "fr",
|
|
"zh-cn": "zh-CN",
|
|
"zh": "zh-CN",
|
|
}
|
|
|
|
|
|
def telegram_translate_enabled() -> bool:
|
|
return str(os.environ.get("TELEGRAM_OSINT_TRANSLATE", "true")).strip().lower() not in {
|
|
"0",
|
|
"false",
|
|
"no",
|
|
"off",
|
|
"",
|
|
}
|
|
|
|
|
|
def telegram_translate_target() -> str:
|
|
raw = str(os.environ.get("TELEGRAM_OSINT_TRANSLATE_TO", "en")).strip().lower()
|
|
return _LOCALE_TO_GOOGLE.get(raw, raw or "en")
|
|
|
|
|
|
def normalize_translate_target(locale: str | None) -> str:
|
|
raw = str(locale or telegram_translate_target()).strip().lower().replace("_", "-")
|
|
return _LOCALE_TO_GOOGLE.get(raw, raw or "en")
|
|
|
|
|
|
def _looks_english(text: str) -> bool:
|
|
letters = [char for char in text if char.isalpha()]
|
|
if not letters:
|
|
return True
|
|
ascii_letters = sum(1 for char in letters if ord(char) < 128)
|
|
return ascii_letters / len(letters) > 0.9
|
|
|
|
|
|
def contains_cyrillic(text: str) -> bool:
|
|
return bool(_CYRILLIC_RE.search(str(text or "")))
|
|
|
|
|
|
def source_lang_label(code: str | None) -> str:
|
|
raw = str(code or "").strip().lower().replace("_", "-")
|
|
return _SOURCE_LANG_LABELS.get(raw, raw.upper() if raw else "Unknown")
|
|
|
|
|
|
def polish_translation(text: str) -> str:
|
|
polished = str(text or "")
|
|
for pattern, replacement in _POST_TRANSLATION_GLOSSARY:
|
|
polished = pattern.sub(replacement, polished)
|
|
return polished.strip()
|
|
|
|
|
|
def guess_source_lang(text: str) -> str:
|
|
if _UKRAINIAN_MARKERS_RE.search(text):
|
|
return "uk"
|
|
if _CYRILLIC_RE.search(text):
|
|
return "ru"
|
|
if _ARABIC_RE.search(text):
|
|
return "ar"
|
|
if _HEBREW_RE.search(text):
|
|
return "he"
|
|
if _CJK_RE.search(text):
|
|
return "zh-CN"
|
|
if _looks_english(text):
|
|
return "en"
|
|
return "auto"
|
|
|
|
|
|
def _cache_key(text: str, target_lang: str) -> str:
|
|
digest = hashlib.sha1(f"{target_lang}|{text}".encode("utf-8")).hexdigest()
|
|
return digest
|
|
|
|
|
|
def _cache_get(text: str, target_lang: str) -> tuple[str, str] | None:
|
|
key = _cache_key(text, target_lang)
|
|
with _CACHE_LOCK:
|
|
return _CACHE.get(key)
|
|
|
|
|
|
def _cache_put(text: str, target_lang: str, translated: str, source_lang: str) -> None:
|
|
key = _cache_key(text, target_lang)
|
|
with _CACHE_LOCK:
|
|
if len(_CACHE) >= _CACHE_MAX:
|
|
_CACHE.pop(next(iter(_CACHE)))
|
|
_CACHE[key] = (translated, source_lang)
|
|
|
|
|
|
def _google_translate(clean: str, target: str, source: str | None = None) -> tuple[str, str]:
|
|
params = {
|
|
"client": "gtx",
|
|
"sl": source or "auto",
|
|
"tl": target,
|
|
"dt": "t",
|
|
"q": clean[:4500],
|
|
}
|
|
url = "https://translate.googleapis.com/translate_a/single?" + urllib.parse.urlencode(params)
|
|
resp = requests.get(
|
|
url,
|
|
timeout=8,
|
|
headers={"User-Agent": "Mozilla/5.0 (compatible; Shadowbroker-Telegram-Translate/1.0)"},
|
|
)
|
|
resp.raise_for_status()
|
|
data = resp.json()
|
|
detected = str(data[2] or guess_source_lang(clean)).strip().lower()
|
|
if detected in {"zh-cn", "zh-tw"}:
|
|
detected = "zh-CN"
|
|
parts: list[str] = []
|
|
for chunk in data[0] or []:
|
|
if chunk and chunk[0]:
|
|
parts.append(str(chunk[0]))
|
|
translated = polish_translation("".join(parts).strip() or clean)
|
|
return translated, detected
|
|
|
|
|
|
def translate_text(text: str, target_lang: str | None = None) -> tuple[str, str]:
|
|
"""Translate text via Google Translate (unofficial client endpoint).
|
|
|
|
Returns ``(translated_text, detected_source_lang)``.
|
|
"""
|
|
clean = str(text or "").strip()
|
|
if not clean:
|
|
return "", "en"
|
|
|
|
target = normalize_translate_target(target_lang)
|
|
if _looks_english(clean) and target == "en":
|
|
return clean, "en"
|
|
|
|
cached = _cache_get(clean, target)
|
|
if cached:
|
|
return cached
|
|
|
|
try:
|
|
translated, detected = _google_translate(clean, target)
|
|
if detected == target or (detected == "en" and target == "en"):
|
|
result = (clean, detected)
|
|
_cache_put(clean, target, clean, detected)
|
|
return result
|
|
if contains_cyrillic(translated) and contains_cyrillic(clean):
|
|
hinted = guess_source_lang(clean)
|
|
if hinted not in {"auto", target}:
|
|
retry_translated, retry_detected = _google_translate(clean, target, hinted)
|
|
if not contains_cyrillic(retry_translated) or len(retry_translated) > len(translated):
|
|
translated, detected = retry_translated, retry_detected
|
|
result = (translated, detected)
|
|
_cache_put(clean, target, translated, detected)
|
|
return result
|
|
except Exception as exc:
|
|
logger.warning("Telegram translation failed: %s", exc)
|
|
fallback_lang = guess_source_lang(clean)
|
|
return clean, fallback_lang
|
|
|
|
|
|
def apply_post_translation(post: dict[str, Any], target_lang: str | None = None) -> dict[str, Any]:
|
|
"""Add translation fields to a Telegram OSINT post dict."""
|
|
if not telegram_translate_enabled():
|
|
return post
|
|
|
|
target = normalize_translate_target(target_lang)
|
|
description = str(post.get("description") or "").strip()
|
|
title = str(post.get("title") or "").strip()
|
|
full_text = description or title
|
|
if not full_text:
|
|
return post
|
|
|
|
existing_translated = str(post.get("description_translated") or post.get("title_translated") or "").strip()
|
|
if post.get("translate_to") == target and existing_translated:
|
|
updated = dict(post)
|
|
polished = polish_translation(existing_translated)
|
|
if polished != existing_translated:
|
|
lines = polished.split("\n", 1)
|
|
updated["title_translated"] = lines[0][:160]
|
|
updated["description_translated"] = polished[:1200]
|
|
updated["source_lang_label"] = source_lang_label(str(post.get("source_lang") or ""))
|
|
return updated
|
|
|
|
translated_full, source_lang = translate_text(full_text, target)
|
|
updated = dict(post)
|
|
updated["source_lang"] = source_lang
|
|
updated["translate_to"] = target
|
|
updated["source_lang_label"] = source_lang_label(source_lang)
|
|
|
|
if translated_full != full_text and source_lang != target:
|
|
lines = translated_full.split("\n", 1)
|
|
updated["title_translated"] = lines[0][:160]
|
|
updated["description_translated"] = translated_full[:1200]
|
|
|
|
return updated
|
|
|
|
|
|
def apply_posts_translations(
|
|
posts: list[dict[str, Any]],
|
|
target_lang: str | None = None,
|
|
) -> list[dict[str, Any]]:
|
|
if not telegram_translate_enabled():
|
|
return posts
|
|
return [apply_post_translation(post, target_lang) for post in posts] |