OBLITERATUS/obliteratus/telemetry.py

"""Anonymous telemetry for community benchmark collection.

Logs benchmark results to a local JSONL file and automatically syncs to a
central HuggingFace Dataset repo for cross-Space community leaderboard
aggregation.  No user identity, IP addresses, or prompt content is stored —
only aggregate benchmark metrics (model name, method, scores, hardware info,
timestamp).

Telemetry is disabled by default to respect user privacy.  Users can opt in
by setting OBLITERATUS_TELEMETRY=1 or calling enable_telemetry().  On
HuggingFace Spaces, telemetry is auto-enabled for community leaderboard.

Architecture:
    1. Every benchmark/obliteration run appends a record to a local JSONL
       file named telemetry.jsonl.  The directory is the first writable of:
       OBLITERATUS_DATA_DIR, /data/obliteratus (HF Spaces persistent
       storage), ~/.obliteratus, or /tmp/obliteratus_telemetry.
    2. On HuggingFace Spaces, records are automatically synced to a central
       HuggingFace Dataset repo (default: pliny-the-prompter/OBLITERATUS-TELEMETRY,
       configurable via OBLITERATUS_TELEMETRY_REPO).  Each Space instance
       uploads its own JSONL file (keyed by SPACE_ID + session), so
       duplicated Spaces all feed into the same central leaderboard.
    3. The Leaderboard tab reads from both local JSONL *and* the central Hub
       dataset, merging and deduplicating results so all community
       contributions are visible regardless of which Space instance
       generated them.
"""

from __future__ import annotations

import hashlib
import json
import logging
import math
import os
import platform
import time
import threading
import uuid
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)

# ── Configuration ─────────────────────────────────────────────────────

# True when the SPACE_ID environment variable is present, which this module
# uses as the signal that it is running inside a HuggingFace Space.
_ON_HF_SPACES = os.environ.get("SPACE_ID") is not None

# ── Telemetry state ──────────────────────────────────────────────────
# Programmatic override written by enable_telemetry() / disable_telemetry().
# None means "no override": is_enabled() then consults the
# OBLITERATUS_TELEMETRY environment variable instead.
_enabled: bool | None = None

# Central Hub repo for cross-Space telemetry aggregation.
# Default repo is used on HF Spaces so all instances (including duplicated
# Spaces) send data to the same central dataset automatically.
# Outside HF Spaces the fallback is the empty string, which the sync and
# fetch helpers in this module treat as "no repo configured".
_DEFAULT_TELEMETRY_REPO = "pliny-the-prompter/OBLITERATUS-TELEMETRY"
_TELEMETRY_REPO = os.environ.get(
    "OBLITERATUS_TELEMETRY_REPO",
    _DEFAULT_TELEMETRY_REPO if _ON_HF_SPACES else "",
)

# Hub sync debounce interval (seconds).  After each log_benchmark(), we
# schedule a background upload but skip if the last sync was < this many
# seconds ago.  This prevents hammering the Hub API during rapid benchmark
# loops while still ensuring timely uploads.
_HUB_SYNC_INTERVAL = 30
# time.time() value recorded when the most recent sync was scheduled.
_hub_sync_last: float = 0.0
# Held while _schedule_hub_sync() reads and updates _hub_sync_last.
_hub_sync_lock = threading.Lock()
# Flipped to True by _ensure_hub_repo() once the repo is known to be usable.
_hub_repo_created: bool = False

# Locate writable telemetry directory
def _is_mount_point(path: Path) -> bool:
    """Check if a path is a mount point (different device from parent)."""
    try:
        if not path.exists():
            return False
        return path.stat().st_dev != path.parent.stat().st_dev
    except (OSError, ValueError):
        return False


def _test_writable(d: Path) -> bool:
    """Test if a directory exists and is writable."""
    try:
        d.mkdir(parents=True, exist_ok=True)
        test_file = d / ".write_test"
        test_file.write_text("ok")
        test_file.unlink()
        return True
    except (PermissionError, OSError):
        return False


def _telemetry_dir() -> Path:
    """Pick the directory where telemetry files are stored.

    Candidates are tried in this order and the first writable one wins:

    1. The path in the ``OBLITERATUS_DATA_DIR`` environment variable.
    2. ``/data/obliteratus`` — only considered on HF Spaces and only when
       ``/data`` exists.  Writability is probed up to three times with a
       one-second pause between probes, since the mount may not be ready
       immediately after container start.
    3. ``~/.obliteratus``.
    4. ``/tmp/obliteratus_telemetry`` — created unconditionally and
       returned as the last resort; it does not survive rebuilds.
    """
    # 1. Explicit override
    override = os.environ.get("OBLITERATUS_DATA_DIR")
    if override:
        override_path = Path(override)
        if _test_writable(override_path):
            logger.info("Telemetry storage: %s (OBLITERATUS_DATA_DIR)", override_path)
            return override_path
        logger.warning(
            "OBLITERATUS_DATA_DIR=%s is not writable, falling through", override
        )

    # 2. Persistent storage on HF Spaces
    persistent_root = Path("/data")
    if _ON_HF_SPACES and persistent_root.exists():
        persistent_dir = persistent_root / "obliteratus"
        max_attempts = 3
        for tries in range(max_attempts):
            if not _test_writable(persistent_dir):
                # Pause before the next probe, but not after the final one.
                if tries != max_attempts - 1:
                    time.sleep(1)
                continue
            if tries == 0:
                logger.info("Telemetry storage: %s (HF persistent storage)", persistent_dir)
            else:
                logger.info(
                    "Telemetry storage: %s (HF persistent, ready after %d retries)",
                    persistent_dir, tries,
                )
            return persistent_dir
        # Every probe failed even though /data is present.
        logger.warning(
            "/data exists (mount_point=%s) but /data/obliteratus is NOT writable. "
            "Persistent storage may not be enabled for this Space. "
            "Data will NOT survive factory rebuilds! "
            "Enable persistent storage in Space settings or set OBLITERATUS_DATA_DIR.",
            _is_mount_point(persistent_root),
        )

    # 3. Per-user directory for local installs
    user_dir = Path.home() / ".obliteratus"
    if _test_writable(user_dir):
        logger.info("Telemetry storage: %s (home directory)", user_dir)
        return user_dir

    # 4. Ephemeral fallback
    tmp_dir = Path("/tmp/obliteratus_telemetry")
    tmp_dir.mkdir(parents=True, exist_ok=True)
    if not _ON_HF_SPACES:
        logger.info("Telemetry storage: %s (temporary)", tmp_dir)
    else:
        logger.warning(
            "Telemetry storage: %s — this is EPHEMERAL and will be lost on rebuild! "
            "Enable persistent storage in your Space settings.",
            tmp_dir,
        )
    return tmp_dir


# Resolved once at import time.  Note that _telemetry_dir() may create
# directories and, on HF Spaces, sleep briefly while probing /data.
_TELEMETRY_DIR = _telemetry_dir()
# Local JSONL log that benchmark records are appended to.
TELEMETRY_FILE = _TELEMETRY_DIR / "telemetry.jsonl"

# Lock for thread-safe writes
# (held by the code paths in this module that append to TELEMETRY_FILE).
_write_lock = threading.Lock()


def _is_persistent_storage() -> bool:
    """Check if the current telemetry directory is on persistent storage."""
    return str(_TELEMETRY_DIR).startswith("/data")


def storage_diagnostic() -> dict[str, Any]:
    """Collect a snapshot of the telemetry storage configuration.

    The returned dict covers the resolved telemetry directory and file,
    the state of ``/data``, the relevant environment settings, and
    whether telemetry is currently enabled.  Intended for debugging
    persistent-storage problems on HF Spaces.
    """
    data_root = Path("/data")
    data_root_present = data_root.exists()
    file_present = TELEMETRY_FILE.exists()
    file_size = TELEMETRY_FILE.stat().st_size if file_present else 0
    data_root_writable = os.access(data_root, os.W_OK) if data_root_present else False

    report: dict[str, Any] = {}
    report["telemetry_dir"] = str(_TELEMETRY_DIR)
    report["telemetry_file"] = str(TELEMETRY_FILE)
    report["telemetry_file_exists"] = file_present
    report["telemetry_file_size_bytes"] = file_size
    report["is_persistent"] = _is_persistent_storage()
    report["on_hf_spaces"] = _ON_HF_SPACES
    report["data_dir_exists"] = data_root_present
    report["data_dir_is_mount"] = _is_mount_point(data_root)
    report["data_dir_writable"] = data_root_writable
    report["explicit_data_dir"] = os.environ.get("OBLITERATUS_DATA_DIR", "")
    report["telemetry_repo"] = _TELEMETRY_REPO
    report["telemetry_enabled"] = is_enabled()
    return report


def disable_telemetry() -> None:
    """Turn telemetry off for this process by setting the module override to False."""
    global _enabled
    _enabled = bool(0)


def enable_telemetry() -> None:
    """Turn telemetry on for this process by setting the module override to True."""
    global _enabled
    _enabled = bool(1)


def is_telemetry_enabled() -> bool:
    """Return the current telemetry state; delegates directly to ``is_enabled()``."""
    state = is_enabled()
    return state


def is_enabled() -> bool:
    """Check if telemetry is enabled (off by default, opt in with OBLITERATUS_TELEMETRY=1).

    This is the single source of truth for telemetry state.  Both v1
    (log_benchmark) and v2 (send_report) paths check this function.

    Resolution order:

    1. A programmatic override from ``enable_telemetry()`` /
       ``disable_telemetry()`` always wins.
    2. Otherwise the ``OBLITERATUS_TELEMETRY`` environment variable is
       consulted.  When unset it defaults to on for HF Spaces and off
       everywhere else.

    The environment value is compared case-insensitively after stripping
    whitespace; ``0``, ``false``, ``no``, ``off`` and the empty string all
    mean "disabled".  Any other value enables telemetry.
    """
    if _enabled is not None:
        return _enabled
    default = "1" if _ON_HF_SPACES else "0"
    raw = os.environ.get("OBLITERATUS_TELEMETRY", default)
    # Normalise so that "False", "OFF", " 0 " etc. cannot accidentally opt a
    # user in to telemetry.
    return raw.strip().lower() not in ("0", "false", "no", "off", "")


# ── Record schema ─────────────────────────────────────────────────────

@dataclass
class BenchmarkRecord:
    """One benchmark result, as written to the telemetry JSONL file.

    Every field has a default so callers may populate only what they know.
    An empty ``timestamp`` is replaced with the current UTC time (ISO 8601)
    when the record is constructed.
    """

    # --- identity ---------------------------------------------------
    timestamp: str = ""
    session_id: str = ""  # random per session, not per user

    # --- model ------------------------------------------------------
    model_id: str = ""
    model_family: str = ""  # e.g. "qwen", "llama", "gemma"
    model_size_b: float = 0.0  # billions of parameters
    is_moe: bool = False

    # --- method -----------------------------------------------------
    method: str = ""
    n_directions: int = 0
    norm_preserve: bool = False
    refinement_passes: int = 0
    use_whitened_svd: bool = False
    use_bayesian: bool = False

    # --- dataset ----------------------------------------------------
    dataset: str = ""
    n_prompts: int = 0

    # --- results ----------------------------------------------------
    refusal_rate: float | None = None
    perplexity: float | None = None
    coherence: float | None = None
    kl_divergence: float | None = None
    strong_layers: int = 0
    ega_expert_dirs: int = 0
    time_seconds: float = 0.0
    error: str | None = None

    # --- hardware ---------------------------------------------------
    gpu_name: str = ""
    gpu_vram_gb: float = 0.0
    quantization: str | None = None

    # --- free-form extras -------------------------------------------
    extra: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # Stamp the record with "now" (UTC) unless the caller supplied a time.
        self.timestamp = self.timestamp or datetime.now(timezone.utc).isoformat()


# ── Session ID (random, per-process, non-identifying) ────────────────

def _generate_session_id() -> str:
    """Generate a random session ID (not tied to user identity)."""
    import random
    raw = f"{time.time()}-{random.random()}-{os.getpid()}"
    return hashlib.sha256(raw.encode()).hexdigest()[:12]

_SESSION_ID = _generate_session_id()


# ── Hub sync (cross-Space telemetry aggregation) ─────────────────────

def _instance_slug() -> str:
    """Generate a unique slug for this Space instance.

    Hashes the HF Space ID (to avoid leaking usernames in the public
    dataset) and combines it with the process session ID.  This is used
    as the filename when uploading per-instance JSONL to the Hub repo.
    """
    space_id = os.environ.get("SPACE_ID", "local")
    space_hash = hashlib.sha256(space_id.encode()).hexdigest()[:10]
    return f"{space_hash}_{_SESSION_ID}"


_hub_repo_lock = threading.Lock()


def _ensure_hub_repo(repo_id: str) -> bool:
    """Create the central telemetry dataset repo if it doesn't exist.

    Uses create_repo with exist_ok=True so this is safe to call
    repeatedly.  Thread-safe via _hub_repo_lock.  Once a call succeeds the
    result is cached in ``_hub_repo_created`` and later calls return
    immediately.

    Args:
        repo_id: Hub dataset repo ID, e.g. ``"owner/name"``.

    Returns:
        True if the repo is ready, False on failure (including when
        ``huggingface_hub`` is not installed).
    """
    global _hub_repo_created
    if _hub_repo_created:
        return True
    with _hub_repo_lock:
        if _hub_repo_created:  # re-check now that the lock is held
            return True
        try:
            from huggingface_hub import HfApi
            api = HfApi(token=os.environ.get("HF_TOKEN"))
            # First try create_repo (works if we own the namespace)
            try:
                api.create_repo(
                    repo_id=repo_id,
                    repo_type="dataset",
                    private=False,
                    exist_ok=True,
                )
                _hub_repo_created = True
                return True
            except Exception as create_err:
                # Expected for collaborators who may write but not create;
                # keep a trace instead of dropping the error entirely.
                logger.debug(
                    "Hub repo %s: create_repo failed, falling back to repo_info: %s",
                    repo_id, create_err,
                )
            # Fallback: check if the repo already exists (works for
            # collaborators / org members who can write but not create)
            try:
                api.repo_info(repo_id=repo_id, repo_type="dataset")
                _hub_repo_created = True
                logger.info("Hub repo %s exists (verified via repo_info)", repo_id)
                return True
            except Exception as e:
                logger.warning(
                    "Hub repo %s: create_repo failed and repo_info "
                    "also failed — repo may not exist or token lacks access: %s",
                    repo_id, e,
                )
                return False
        except ImportError:
            logger.warning("huggingface_hub not installed — cannot ensure Hub repo")
            return False


# Set while an upload is running; kept as an Event so other code can observe it.
_sync_in_progress = threading.Event()
# Provides the atomic "claim the sync slot" step that an Event alone cannot.
_sync_guard = threading.Lock()


def _sync_to_hub_bg() -> None:
    """Background thread target: upload local JSONL to the central Hub repo.

    Each Space instance writes its data to a unique file path in the repo:
        data/{instance_slug}.jsonl
    This avoids write conflicts between concurrent Space instances while
    ensuring all data lands in the same dataset repository.

    Only one upload runs at a time: the sync slot is claimed with a
    non-blocking lock acquire, so a second caller returns immediately
    instead of racing the first.  All failures are logged and swallowed —
    this function never raises.
    """
    if not _sync_guard.acquire(blocking=False):
        return  # Another sync is already running
    _sync_in_progress.set()
    try:
        repo = _TELEMETRY_REPO
        if not repo:
            return
        if not TELEMETRY_FILE.exists():
            return

        from huggingface_hub import HfApi
        if not _ensure_hub_repo(repo):
            return
        api = HfApi(token=os.environ.get("HF_TOKEN"))
        slug = _instance_slug()
        api.upload_file(
            path_or_fileobj=str(TELEMETRY_FILE),
            path_in_repo=f"data/{slug}.jsonl",
            repo_id=repo,
            repo_type="dataset",
            commit_message=f"Auto-sync telemetry from {slug}",
        )
        logger.info("Synced telemetry to %s/data/%s.jsonl", repo, slug)
    except Exception as e:
        logger.warning("Hub sync failed: %s", e)
    finally:
        _sync_in_progress.clear()
        _sync_guard.release()


def _schedule_hub_sync() -> None:
    """Kick off a background Hub upload unless one was scheduled recently.

    Nothing happens when no telemetry repo is configured, when telemetry
    is disabled, or when fewer than ``_HUB_SYNC_INTERVAL`` seconds have
    passed since the previous scheduling.  Otherwise the timestamp is
    updated and a daemon thread running ``_sync_to_hub_bg`` is started.
    """
    global _hub_sync_last
    if not (_TELEMETRY_REPO and is_enabled()):
        return

    with _hub_sync_lock:
        current = time.time()
        too_soon = current - _hub_sync_last < _HUB_SYNC_INTERVAL
        if not too_soon:
            _hub_sync_last = current
    if too_soon:
        return

    threading.Thread(target=_sync_to_hub_bg, daemon=True).start()


def fetch_hub_records(max_records: int = 10000) -> list[dict[str, Any]]:
    """Fetch telemetry records from the central HF Hub dataset.

    Downloads the per-instance JSONL files from the ``data/`` directory
    in the telemetry repo and parses them into records.  Blank lines,
    lines that are not valid JSON, and JSON values that are not objects
    are skipped, so every returned element is a dict.

    This is used by :func:`get_leaderboard_data` to merge community-wide
    results with local data.

    Args:
        max_records: Upper bound on the number of records returned.
            Values ``<= 0`` yield an empty list.

    Returns:
        A list of at most ``max_records`` record dicts.  An empty list is
        returned when no repo is configured, ``huggingface_hub`` is not
        installed, or the repo cannot be reached.  A file that fails to
        download or read is skipped (logged at debug level).
    """
    repo = _TELEMETRY_REPO
    if not repo or max_records <= 0:
        return []

    try:
        from huggingface_hub import HfApi, hf_hub_download

        api = HfApi(token=os.environ.get("HF_TOKEN"))
        try:
            all_files = api.list_repo_files(repo, repo_type="dataset")
        except Exception as e:
            # Repo doesn't exist yet or network error
            logger.debug("Could not list files in Hub repo %s: %s", repo, e)
            return []

        jsonl_files = [f for f in all_files if f.startswith("data/") and f.endswith(".jsonl")]
        if not jsonl_files:
            return []

        records: list[dict[str, Any]] = []
        for filepath in jsonl_files:
            try:
                local_path = hf_hub_download(
                    repo, filepath, repo_type="dataset",
                    # Intended to make sure we read fresh data rather than
                    # a stale cache entry.
                    # NOTE(review): huggingface_hub documents etag_timeout
                    # as the number of seconds to wait for the ETag
                    # request, not as a cache-bypass switch — confirm that
                    # 0 behaves as intended with the installed version.
                    etag_timeout=0,
                )
                with open(local_path, encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            rec = json.loads(line)
                        except json.JSONDecodeError:
                            continue
                        # The dataset is shared; callers rely on .get(), so
                        # drop anything that is not a JSON object.
                        if not isinstance(rec, dict):
                            continue
                        records.append(rec)
                        if len(records) >= max_records:
                            break
            except Exception as e:
                logger.debug("Skipping Hub telemetry file %s: %s", filepath, e)
                continue
            if len(records) >= max_records:
                break

        return records
    except ImportError:
        logger.debug("huggingface_hub not installed — cannot fetch Hub records")
        return []
    except Exception as e:
        logger.debug("Failed to fetch Hub records: %s", e)
        return []


# ── Hub restore (warm-start after rebuild) ────────────────────────────

# Set on the first restore_from_hub() call, whether or not it succeeds.
_restore_done = False
_restore_lock = threading.Lock()


def restore_from_hub() -> int:
    """Download community records from Hub into the local JSONL file.

    This is the critical path for surviving factory rebuilds: even if
    ``/data`` is wiped or unavailable, we can reconstruct the leaderboard
    from the central Hub dataset on startup.

    Records already present locally (by ``(session_id, timestamp)`` key)
    are skipped to avoid duplicates, as are Hub entries that are not JSON
    objects.

    The restore is attempted at most once per process; later calls return
    0 immediately.  Errors are logged and reported as 0 rather than raised.

    Returns:
        The number of new records appended to the local file.
    """
    global _restore_done
    if _restore_done:
        return 0
    with _restore_lock:
        if _restore_done:
            return 0
        _restore_done = True

    repo = _TELEMETRY_REPO
    if not repo:
        return 0

    try:
        # Read existing local keys for dedup
        existing_keys: set[tuple[str, str]] = set()
        if TELEMETRY_FILE.exists():
            try:
                with open(TELEMETRY_FILE, encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            r = json.loads(line)
                        except json.JSONDecodeError:
                            continue
                        if isinstance(r, dict):
                            existing_keys.add(
                                (r.get("session_id", ""), r.get("timestamp", ""))
                            )
            except Exception as e:
                # Best-effort: an unreadable local file only weakens dedup.
                logger.debug("Could not read local telemetry for dedup: %s", e)

        hub_records = fetch_hub_records()
        if not hub_records:
            return 0

        new_count = 0
        with _write_lock:
            with open(TELEMETRY_FILE, "a", encoding="utf-8") as f:
                for r in hub_records:
                    # A single malformed entry must not abort the restore.
                    if not isinstance(r, dict):
                        continue
                    key = (r.get("session_id", ""), r.get("timestamp", ""))
                    if key in existing_keys:
                        continue
                    existing_keys.add(key)
                    f.write(json.dumps(r, default=str) + "\n")
                    new_count += 1

        if new_count:
            logger.info(
                "Restored %d records from Hub repo %s to local storage at %s",
                new_count, repo, TELEMETRY_FILE,
            )
        return new_count
    except Exception as e:
        logger.warning("Hub restore failed: %s", e)
        return 0


def _restore_from_hub_bg() -> None:
    """Thread target that runs ``restore_from_hub()`` and never raises.

    Any exception escaping the restore is logged as a warning.
    """
    try:
        restore_from_hub()
    except Exception as exc:
        logger.warning("Background Hub restore failed: %s", exc)


# Auto-restore on HF Spaces startup (background, non-blocking).
# This ensures the leaderboard has data even after a factory rebuild.
# NOTE: this is an import-time side effect — merely importing the module on
# a Space with telemetry enabled and a repo configured starts the thread.
# The thread is a daemon, so it does not keep the process alive at exit.
if _ON_HF_SPACES and is_enabled() and _TELEMETRY_REPO:
    _restore_thread = threading.Thread(target=_restore_from_hub_bg, daemon=True)
    _restore_thread.start()


# ── Hardware detection ────────────────────────────────────────────────

def _detect_gpu() -> tuple[str, float]:
    """Detect GPU name and VRAM. Returns ('', 0.0) if no GPU."""
    try:
        import torch
        if torch.cuda.is_available():
            name = torch.cuda.get_device_name(0)
            vram = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
            return name, round(vram, 1)
    except Exception:
        pass
    return "", 0.0


def _detect_model_family(model_id: str) -> str:
    """Extract model family from model ID."""
    lower = model_id.lower()
    families = [
        "qwen", "llama", "gemma", "mistral", "phi", "falcon",
        "deepseek", "olmo", "glm", "gpt-oss", "minimax",
        "smollm", "internlm", "minicpm", "tinyllama",
    ]
    for f in families:
        if f in lower:
            return f
    return "unknown"


# ── Write / Read ──────────────────────────────────────────────────────

def log_benchmark(record: BenchmarkRecord) -> bool:
    """Write *record* as one JSON line at the end of the local telemetry file.

    Missing ``session_id``, GPU details and ``model_family`` are filled in
    on the record before it is serialised.  After a successful write a
    debounced background Hub sync is requested.

    Returns True when the record was written, and False when telemetry is
    disabled or anything in the write/sync-scheduling step raised.
    """
    if not is_enabled():
        return False

    # Fill in whatever the caller left blank.
    record.session_id = record.session_id or _SESSION_ID
    if not record.gpu_name:
        record.gpu_name, record.gpu_vram_gb = _detect_gpu()
    record.model_family = record.model_family or _detect_model_family(record.model_id)

    try:
        payload = asdict(record)
        with _write_lock, open(TELEMETRY_FILE, "a") as sink:
            sink.write(json.dumps(payload, default=str) + "\n")
        # Auto-sync to central Hub repo (debounced, background thread)
        _schedule_hub_sync()
    except Exception as e:
        logger.debug(f"Telemetry write failed: {e}")
        return False
    return True


def log_benchmark_from_dict(
    model_id: str,
    method: str,
    entry: dict[str, Any],
    dataset: str = "",
    n_prompts: int = 0,
    quantization: str | None = None,
    pipeline_config: dict[str, Any] | None = None,
) -> bool:
    """Convenience wrapper: create a BenchmarkRecord from benchmark result dict.

    Called from app.py benchmark() after each method completes.

    Args:
        model_id: Identifier of the benchmarked model.
        method: Name of the method that was run.
        entry: Result dict; the keys read are ``refusal_rate``,
            ``perplexity``, ``coherence``, ``kl_divergence``,
            ``strong_layers``, ``ega_expert_dirs``, ``time_s`` and
            ``error``.  Missing keys fall back to the record defaults.
        dataset: Dataset name.
        n_prompts: Number of prompts used.
        quantization: Quantization label, if any.
        pipeline_config: Optional config dict; the keys read are
            ``n_directions``, ``norm_preserve``, ``refinement_passes``,
            ``use_whitened_svd`` and ``bayesian_trials``.

    Returns:
        The result of :func:`log_benchmark` for the constructed record.
    """
    cfg = pipeline_config or {}

    # A config may carry an explicit None for bayesian_trials; treat that
    # like "zero trials" rather than letting `None > 0` raise TypeError.
    bayesian_trials = cfg.get("bayesian_trials") or 0

    record = BenchmarkRecord(
        model_id=model_id,
        method=method,
        dataset=dataset,
        n_prompts=n_prompts,
        quantization=quantization,
        refusal_rate=entry.get("refusal_rate"),
        perplexity=entry.get("perplexity"),
        coherence=entry.get("coherence"),
        kl_divergence=entry.get("kl_divergence"),
        strong_layers=entry.get("strong_layers", 0),
        ega_expert_dirs=entry.get("ega_expert_dirs", 0),
        time_seconds=entry.get("time_s", 0.0),
        error=entry.get("error"),
        n_directions=cfg.get("n_directions", 0),
        norm_preserve=cfg.get("norm_preserve", False),
        refinement_passes=cfg.get("refinement_passes", 0),
        use_whitened_svd=cfg.get("use_whitened_svd", False),
        use_bayesian=bayesian_trials > 0,
    )
    return log_benchmark(record)


def read_telemetry(max_records: int = 10000) -> list[dict[str, Any]]:
    """Read telemetry records from the local JSONL file, newest first.

    The file is append-only, so the newest records are at the end.  When
    the file holds more than ``max_records`` entries, the *most recent*
    ``max_records`` are kept.  Blank lines, lines that are not valid JSON
    and JSON values that are not objects are skipped.

    Args:
        max_records: Upper bound on the number of records returned.
            Values ``<= 0`` yield an empty list.

    Returns:
        A list of record dicts ordered newest first.  If reading fails
        part-way through, the records parsed so far are returned and the
        error is logged at debug level.
    """
    from collections import deque

    if max_records <= 0 or not TELEMETRY_FILE.exists():
        return []

    # A bounded deque discards the oldest entry on overflow, leaving the
    # tail of the file — i.e. the newest records.
    recent: deque = deque(maxlen=max_records)
    try:
        with open(TELEMETRY_FILE, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if isinstance(rec, dict):
                    recent.append(rec)
    except Exception as e:
        logger.debug("Telemetry read failed: %s", e)

    # Newest first
    records = list(recent)
    records.reverse()
    return records


def get_leaderboard_data() -> list[dict[str, Any]]:
    """Get aggregated leaderboard data from local + Hub telemetry.

    Merges local records with community-wide records from the central Hub
    dataset, deduplicates by (session_id, timestamp), drops records that
    carry an ``error``, groups the rest by (model_id, method) and computes
    best/avg metrics per group.

    Returns:
        A list of dicts suitable for display in a Gradio Dataframe, sorted
        by lowest ``best_refusal`` and then lowest ``best_perplexity``.
        Rows where a metric is missing sort after rows that have it.
        Returns an empty list when there are no records at all.
    """
    local_records = read_telemetry()

    # Fetch community records from central Hub repo (best-effort).
    hub_records: list[dict[str, Any]] = []
    try:
        hub_records = fetch_hub_records()
    except Exception as e:
        logger.debug("Hub fetch for leaderboard failed: %s", e)

    # Merge and deduplicate by (session_id, timestamp)
    seen: set[tuple[str, str]] = set()
    records: list[dict[str, Any]] = []
    for r in local_records + hub_records:
        if not isinstance(r, dict):
            continue  # ignore malformed entries instead of crashing the UI
        key = (r.get("session_id", ""), r.get("timestamp", ""))
        if key in seen:
            continue
        seen.add(key)
        records.append(r)

    if not records:
        return []

    # Group successful runs by (model_id, method)
    groups: dict[tuple[str, str], list[dict]] = {}
    for r in records:
        if r.get("error"):
            continue
        groups.setdefault((r.get("model_id", ""), r.get("method", "")), []).append(r)

    def _present(runs: list[dict], name: str) -> list:
        """Values of field *name* across *runs*, skipping missing/None."""
        return [r[name] for r in runs if r.get(name) is not None]

    def _mean(values: list):
        """Arithmetic mean, or None for an empty list."""
        return sum(values) / len(values) if values else None

    leaderboard = []
    for (model_id, method), runs in groups.items():
        refusal_rates = _present(runs, "refusal_rate")
        perplexities = _present(runs, "perplexity")
        coherences = _present(runs, "coherence")
        times = _present(runs, "time_seconds")

        leaderboard.append({
            "model": model_id.split("/")[-1],
            "model_id": model_id,
            "method": method,
            "runs": len(runs),
            "best_refusal": min(refusal_rates) if refusal_rates else None,
            "avg_refusal": _mean(refusal_rates),
            "best_perplexity": min(perplexities) if perplexities else None,
            "avg_perplexity": _mean(perplexities),
            "avg_coherence": _mean(coherences),
            "avg_time_s": _mean(times),
            # Every group holds at least one run, so runs[0] is safe.
            "gpu": runs[0].get("gpu_name", ""),
            "last_run": runs[0].get("timestamp", ""),
        })

    def _sort_value(value):
        # Only a *missing* metric sorts last.  Testing truthiness here would
        # push a perfect 0.0 refusal rate to the bottom of the board.
        return 999 if value is None else value

    # Sort: lowest refusal rate first, then by perplexity
    leaderboard.sort(
        key=lambda row: (
            _sort_value(row["best_refusal"]),
            _sort_value(row["best_perplexity"]),
        )
    )

    return leaderboard


def push_to_hub(repo_id: str | None = None) -> bool:
    """Upload the local telemetry file to a HuggingFace Dataset repo.

    The file is stored in the repo as ``data/{instance_slug}.jsonl``, so
    every instance contributes its own file to the shared dataset.

    Args:
        repo_id: Target dataset repo.  Falls back to the configured
            telemetry repo when omitted.

    Returns:
        True after a successful upload.  False when no repo is configured,
        there are no local records, the repo cannot be ensured,
        ``huggingface_hub`` is missing, or the upload fails.

    The token is read from the ``HF_TOKEN`` environment variable.
    """
    repo = repo_id or _TELEMETRY_REPO
    if not repo:
        logger.warning("No telemetry repo configured — set OBLITERATUS_TELEMETRY_REPO")
        return False

    records = read_telemetry()
    if not records:
        logger.info("No telemetry records to push")
        return False

    try:
        from huggingface_hub import HfApi

        if not _ensure_hub_repo(repo):
            return False

        slug = _instance_slug()
        upload_kwargs = {
            "path_or_fileobj": str(TELEMETRY_FILE),
            "path_in_repo": f"data/{slug}.jsonl",
            "repo_id": repo,
            "repo_type": "dataset",
            "commit_message": f"Manual push from {slug} ({len(records)} records)",
        }
        HfApi(token=os.environ.get("HF_TOKEN")).upload_file(**upload_kwargs)
    except ImportError:
        logger.warning("huggingface_hub not installed — cannot push telemetry")
        return False
    except Exception as e:
        logger.warning(f"Failed to push telemetry: {e}")
        return False

    logger.info(f"Pushed {len(records)} records to {repo}/data/{slug}.jsonl")
    return True


# ── V2 Telemetry API: structured report building ────────────────────

# Names of method-configuration options.
# NOTE(review): presumably an allow-list applied when building V2 reports so
# that only these keys are included — the consumer is not visible in this
# part of the file; confirm against the report-building code.
_ALLOWED_METHOD_CONFIG_KEYS = frozenset({
    "n_directions", "norm_preserve", "regularization", "refinement_passes",
    "project_biases", "use_chat_template", "use_whitened_svd",
    "true_iterative_refinement", "use_jailbreak_contrast",
    "layer_adaptive_strength", "attention_head_surgery",
    "safety_neuron_masking", "per_expert_directions", "use_sae_features",
    "invert_refusal", "project_embeddings", "embed_regularization",
    "activation_steering", "steering_strength", "expert_transplant",
    "transplant_blend", "reflection_strength",
})

# Names of analysis-result fields.
# NOTE(review): presumably the matching allow-list for analysis insights in
# V2 reports — confirm against the report-building code.
_ALLOWED_ANALYSIS_KEYS = frozenset({
    "detected_alignment_method", "alignment_confidence",
    "alignment_probabilities", "cone_is_polyhedral", "cone_dimensionality",
    "mean_pairwise_cosine", "direction_specificity", "cluster_count",
    "direction_persistence", "mean_refusal_sparsity_index",
    "recommended_sparsity", "use_sparse_surgery", "estimated_robustness",
    "self_repair_estimate", "entanglement_score", "entangled_layers",
    "clean_layers", "recommended_n_directions",
    "recommended_regularization", "recommended_refinement_passes",
    "recommended_layers", "skip_layers",
})


def _safe_float(val: Any) -> float | None:
    """Safely convert a value to float, returning None on failure."""
    if val is None:
        return None
    try:
        f = float(val)
        if not math.isfinite(f):
            return None
        return f
    except (TypeError, ValueError):
        return None


def _get_environment_info() -> dict[str, str]:
    """Describe the runtime: Python version, OS name, CPU architecture and torch version."""
    env: dict[str, str] = {}
    env["python_version"] = platform.python_version()
    env["os"] = platform.system()
    env["arch"] = platform.machine()
    env["torch_version"] = _get_torch_version()
    return env


def _get_torch_version() -> str:
    try:
        import torch
        return torch.__version__
    except ImportError:
        return "not_installed"


def _get_peak_vram() -> dict[str, float] | None:
    try:
        import torch
        if torch.cuda.is_available():
            allocated = torch.cuda.max_memory_allocated() / (1024 ** 3)
            reserved = torch.cuda.max_memory_reserved() / (1024 ** 3)
            return {
                "peak_allocated_gb": round(allocated, 2),
                "peak_reserved_gb": round(reserved, 2),
            }
    except Exception:
        pass
    return None


def _direction_stats(pipeline) -> dict[str, Any]:
    """Extract direction quality statistics from a pipeline."""
    directions = getattr(pipeline, "refusal_directions", {})
    subspaces = getattr(pipeline, "refusal_subspaces", {})
    if not directions:
        return {}
    import torch
    stats: dict[str, Any] = {}
    norms = {}
    for idx, d in sorted(directions.items()):
        if isinstance(d, torch.Tensor):
            norms[str(idx)] = round(d.float().norm().item(), 4)
    if norms:
        stats["direction_norms"] = norms
    sorted_indices = sorted(directions.keys())
    if len(sorted_indices) >= 2:
        cosines = []
        for i in range(len(sorted_indices) - 1):
            d1 = directions[sorted_indices[i]].float()
            d2 = directions[sorted_indices[i + 1]].float()
            cos = torch.nn.functional.cosine_similarity(
                d1.unsqueeze(0), d2.unsqueeze(0)
            ).item()
            cosines.append(abs(cos))
        stats["mean_direction_persistence"] = round(sum(cosines) / len(cosines), 4)
    if subspaces:
        effective_ranks = {}
        for idx, sub in subspaces.items():
            if isinstance(sub, torch.Tensor) and sub.dim() == 2 and sub.shape[0] > 1:
                try:
                    s = torch.linalg.svdvals(sub.float())
                    s = s[s > 1e-12]
                    if len(s) > 0:
                        p = s / s.sum()
                        entropy = -(p * p.log()).sum()
                        effective_ranks[str(idx)] = round(torch.exp(entropy).item(), 2)
                except Exception:
                    pass
        if effective_ranks:
            stats["effective_ranks"] = effective_ranks
    return stats


def _extract_excise_details(pipeline) -> dict[str, Any]:
    """Summarise the excision step of a pipeline as plain counts and labels.

    Every attribute is read with a default, so any object is accepted.

    Returns:
        A dict that may contain:

        * ``modified_count`` -- ``pipeline._excise_modified_count`` when it is
          not ``None``.
        * ``head_surgery_layers`` / ``total_heads_projected`` -- number of
          entries in ``pipeline._refusal_heads`` and the summed length of its
          values.
        * ``sae_direction_count`` -- number of entries in
          ``pipeline._sae_directions``.
        * ``adaptive_weight_min`` / ``adaptive_weight_max`` -- extremes of
          ``pipeline._layer_excise_weights`` values, rounded to 4 places.
        * ``used_techniques`` -- ordered list of technique labels, present
          only when at least one technique was detected.
    """

    def attr(name: str, fallback: Any) -> Any:
        return getattr(pipeline, name, fallback)

    summary: dict[str, Any] = {}
    used: list[str] = []

    modified_count = attr("_excise_modified_count", None)
    if modified_count is not None:
        summary["modified_count"] = modified_count

    heads_by_key = attr("_refusal_heads", {})
    if heads_by_key:
        used.append("head_surgery")
        summary["head_surgery_layers"] = len(heads_by_key)
        summary["total_heads_projected"] = sum(map(len, heads_by_key.values()))

    sae_directions = attr("_sae_directions", {})
    if sae_directions:
        used.append("sae_features")
        summary["sae_direction_count"] = len(sae_directions)

    if attr("_expert_safety_scores", {}):
        used.append("expert_gating")

    weights = attr("_layer_excise_weights", {})
    if weights:
        used.append("layer_adaptive")
        weight_values = list(weights.values())
        summary["adaptive_weight_min"] = round(min(weight_values), 4)
        summary["adaptive_weight_max"] = round(max(weight_values), 4)

    if attr("_expert_directions", {}):
        used.append("per_expert")

    if attr("_steering_hooks", []):
        used.append("activation_steering")

    # Boolean-style flags.  Only "activation_steering" can already be listed
    # (via the hooks check above); the membership test keeps it unique.
    flag_labels = (
        ("invert_refusal", "inversion"),
        ("project_embeddings", "embedding_projection"),
        ("activation_steering", "activation_steering"),
        ("expert_transplant", "expert_transplant"),
    )
    for flag, label in flag_labels:
        if attr(flag, False) and label not in used:
            used.append(label)

    if used:
        summary["used_techniques"] = used
    return summary


def _extract_prompt_counts(pipeline) -> dict[str, int]:
    """Report how many prompts of each kind a pipeline holds.

    ``harmful`` and ``harmless`` are included whenever the corresponding
    ``*_prompts`` attribute exists and is not ``None`` (an empty collection
    yields ``0``).  ``jailbreak`` is included only when
    ``pipeline.jailbreak_prompts`` is non-empty.  Only lengths are recorded,
    never the prompts themselves.
    """
    tally: dict[str, int] = {}
    for label in ("harmful", "harmless"):
        prompts = getattr(pipeline, f"{label}_prompts", None)
        if prompts is not None:
            tally[label] = len(prompts)

    jailbreak_prompts = getattr(pipeline, "jailbreak_prompts", None)
    if jailbreak_prompts:
        tally["jailbreak"] = len(jailbreak_prompts)
    return tally


def _extract_stage_durations(pipeline) -> dict[str, float] | None:
    """Extract stage duration timings from a pipeline."""
    durations = getattr(pipeline, "_stage_durations", None)
    if durations and isinstance(durations, dict):
        return dict(durations)
    return None


def _extract_analysis_insights(informed_report) -> dict[str, Any]:
    """Collect the allow-listed insight fields from an informed report.

    Looks up ``informed_report.insights``.  For every name in
    ``_ALLOWED_ANALYSIS_KEYS`` the attribute of that name is copied into the
    result when it exists and is not ``None``.

    Returns:
        The collected ``{name: value}`` mapping, or an empty dict when the
        report has no ``insights`` attribute or it is ``None``.
    """
    source = getattr(informed_report, "insights", None)
    if source is None:
        return {}
    # Pair each allowed name with its value once, then drop the unset ones.
    candidates = (
        (name, getattr(source, name, None)) for name in _ALLOWED_ANALYSIS_KEYS
    )
    return {name: value for name, value in candidates if value is not None}


def build_report(
    *,
    architecture: str,
    num_layers: int,
    num_heads: int,
    hidden_size: int,
    total_params: int,
    method: str,
    method_config: dict[str, Any] | None = None,
    quality_metrics: dict[str, Any] | None = None,
    stage_durations: dict[str, float] | None = None,
    strong_layers: list[int] | None = None,
    direction_stats: dict[str, Any] | None = None,
    excise_details: dict[str, Any] | None = None,
    prompt_counts: dict[str, int] | None = None,
    gpu_memory: dict[str, float] | None = None,
    analysis_insights: dict[str, Any] | None = None,
    informed_extras: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build a structured telemetry report (schema v2)."""
    report: dict[str, Any] = {
        "schema_version": 2,
        "session_id": uuid.uuid4().hex,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "model": {
            "architecture": architecture,
            "num_layers": num_layers,
            "num_heads": num_heads,
            "hidden_size": hidden_size,
            "total_params": total_params,
        },
        "method": method,
        "environment": _get_environment_info(),
    }
    if method_config:
        report["method_config"] = {
            k: v for k, v in method_config.items()
            if k in _ALLOWED_METHOD_CONFIG_KEYS
        }
    else:
        report["method_config"] = {}
    if quality_metrics:
        report["quality_metrics"] = dict(quality_metrics)
    else:
        report["quality_metrics"] = {}
    if stage_durations:
        report["stage_durations"] = stage_durations
    if strong_layers is not None:
        report["strong_layers"] = strong_layers
    if direction_stats:
        report["direction_stats"] = direction_stats
    if excise_details:
        report["excise_details"] = excise_details
    if prompt_counts:
        report["prompt_counts"] = prompt_counts
    if gpu_memory:
        report["gpu_memory"] = gpu_memory
    if analysis_insights:
        filtered = {k: v for k, v in analysis_insights.items() if k in _ALLOWED_ANALYSIS_KEYS}
        if filtered:
            report["analysis_insights"] = filtered
    if informed_extras:
        report["informed"] = dict(informed_extras)
    return report


def _send_sync(report: dict[str, Any]) -> None:
    """Synchronously write a v2 telemetry report to local JSONL and sync to Hub.

    The report is serialised as one JSON line (values ``json`` cannot encode
    are stringified via ``default=str``), appended to ``TELEMETRY_FILE``
    while holding ``_write_lock``, and then ``_schedule_hub_sync()`` is
    called.

    Telemetry is best-effort: any exception is logged at debug level and
    swallowed.  The success message is logged only when nothing failed.
    """
    try:
        # Serialise before taking the lock so it is held only for the write.
        line = json.dumps(report, default=str) + "\n"
        with _write_lock:
            with open(TELEMETRY_FILE, "a", encoding="utf-8") as f:
                f.write(line)
        _schedule_hub_sync()
    except Exception as e:
        logger.debug("Telemetry v2 write failed: %s", e)
    else:
        logger.debug("Telemetry report sent (schema_version=%s)", report.get("schema_version"))


def send_report(report: dict[str, Any]) -> None:
    """Hand a telemetry report to ``_send_sync`` on a daemon thread.

    Does nothing when ``is_enabled()`` is false.  Otherwise returns right
    after starting the thread; any exception raised by ``_send_sync`` is
    logged at debug level inside the thread and never reaches the caller.
    """
    if is_enabled():

        def _worker():
            try:
                _send_sync(report)
            except Exception as err:
                logger.debug("Telemetry send failed: %s", err)

        threading.Thread(target=_worker, daemon=True).start()


def maybe_send_pipeline_report(pipeline) -> None:
    """Build and send a telemetry report from a completed pipeline.

    Does nothing when ``is_enabled()`` is false.  Otherwise the report is
    assembled from ``pipeline.handle.summary()``, the pipeline attributes
    named in ``_ALLOWED_METHOD_CONFIG_KEYS`` (``None`` values skipped), and
    the statistics helpers of this module, then passed to ``send_report``.

    ``_quality_metrics`` and ``_strong_layers`` are read with a ``None``
    default, like every other optional pipeline attribute here, so a
    pipeline that never set them still yields a report instead of silently
    dropping it.

    Any exception is logged at debug level and swallowed: telemetry must
    never break the caller.
    """
    if not is_enabled():
        return
    try:
        summary = pipeline.handle.summary()
        method_config = {}
        for key in _ALLOWED_METHOD_CONFIG_KEYS:
            val = getattr(pipeline, key, None)
            if val is not None:
                method_config[key] = val
        report = build_report(
            architecture=summary.get("architecture", "unknown"),
            num_layers=summary.get("num_layers", 0),
            num_heads=summary.get("num_heads", 0),
            hidden_size=summary.get("hidden_size", 0),
            total_params=summary.get("total_params", 0),
            method=pipeline.method,
            method_config=method_config,
            quality_metrics=getattr(pipeline, "_quality_metrics", None),
            stage_durations=_extract_stage_durations(pipeline),
            strong_layers=getattr(pipeline, "_strong_layers", None),
            direction_stats=_direction_stats(pipeline),
            excise_details=_extract_excise_details(pipeline),
            prompt_counts=_extract_prompt_counts(pipeline),
            gpu_memory=_get_peak_vram(),
        )
        send_report(report)
    except Exception as e:
        logger.debug("Failed to build pipeline report: %s", e)


def maybe_send_informed_report(pipeline, informed_report) -> None:
    """Build and send a telemetry report from a completed informed pipeline.

    Does nothing when ``is_enabled()`` is false.  Otherwise the report is
    assembled from ``pipeline.handle.summary()``, the pipeline attributes
    named in ``_ALLOWED_METHOD_CONFIG_KEYS`` (``None`` values skipped), the
    statistics helpers of this module, plus two items taken from
    ``informed_report``:

    * the allow-listed analysis insights, and
    * the ``ouroboros_passes``, ``final_refusal_rate``,
      ``analysis_duration`` and ``total_duration`` attributes that are
      present and not ``None``.

    ``_quality_metrics`` and ``_strong_layers`` are read with a ``None``
    default, like every other optional pipeline attribute here, so a
    pipeline that never set them still yields a report instead of silently
    dropping it.

    Any exception is logged at debug level and swallowed: telemetry must
    never break the caller.
    """
    if not is_enabled():
        return
    try:
        summary = pipeline.handle.summary()
        method_config = {}
        for key in _ALLOWED_METHOD_CONFIG_KEYS:
            val = getattr(pipeline, key, None)
            if val is not None:
                method_config[key] = val
        analysis_insights = _extract_analysis_insights(informed_report)
        informed_extras = {}
        for attr in ("ouroboros_passes", "final_refusal_rate",
                      "analysis_duration", "total_duration"):
            val = getattr(informed_report, attr, None)
            if val is not None:
                informed_extras[attr] = val
        report = build_report(
            architecture=summary.get("architecture", "unknown"),
            num_layers=summary.get("num_layers", 0),
            num_heads=summary.get("num_heads", 0),
            hidden_size=summary.get("hidden_size", 0),
            total_params=summary.get("total_params", 0),
            method=pipeline.method,
            method_config=method_config,
            quality_metrics=getattr(pipeline, "_quality_metrics", None),
            stage_durations=_extract_stage_durations(pipeline),
            strong_layers=getattr(pipeline, "_strong_layers", None),
            direction_stats=_direction_stats(pipeline),
            excise_details=_extract_excise_details(pipeline),
            prompt_counts=_extract_prompt_counts(pipeline),
            gpu_memory=_get_peak_vram(),
            analysis_insights=analysis_insights,
            informed_extras=informed_extras,
        )
        send_report(report)
    except Exception as e:
        logger.debug("Failed to build informed report: %s", e)