Add files via upload

2026-07-13 15:37:19 +02:00 · 2026-03-05 00:50:44 -08:00
parent 4cddc6399a
commit 66ea4a6f86
37 changed files with 10756 additions and 45 deletions
@@ -5,7 +5,7 @@ Thanks for your interest in contributing. This document covers everything you ne
 ## Development Setup

 ```bash
-git clone https://github.com/obliteratus-project/OBLITERATUS.git
+git clone https://github.com/elder-plinius/OBLITERATUS.git
 cd OBLITERATUS
 pip install -e ".[dev]"
 ```
@@ -28,7 +28,7 @@ short_description: "One-click model liberation + chat playground"
    <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue" alt="Open in HF Spaces">
  </a>
  &nbsp;
-  <a href="https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb">
+  <a href="https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab">
  </a>
 </p>
@@ -55,7 +55,7 @@ Built on published research from [Arditi et al. (2024)](https://arxiv.org/abs/24
 obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --method advanced
 ```

-Or zero commands — just [open the Colab notebook](https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb) and hit Run All.
+Or zero commands — just [open the Colab notebook](https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb) and hit Run All.

 ## What it does

@@ -153,7 +153,7 @@ The `obliteratus ui` command adds a Rich terminal startup with GPU detection and

 ### 3. Google Colab (free GPU)

-[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb)
+[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb)

 Pick a model from the dropdown, pick a method, hit Run All. Download the result or push straight to HuggingFace Hub. Works on the free T4 tier for models up to ~8B parameters.

@@ -545,7 +545,7 @@ If you use OBLITERATUS in your research, please cite:
               Refusal Removal in Large Language Models},
  author    = {{OBLITERATUS Contributors}},
  year      = {2026},
-  url       = {https://github.com/obliteratus-project/OBLITERATUS},
+  url       = {https://github.com/elder-plinius/OBLITERATUS},
  note      = {15 analysis modules, 837 tests}
 }
 ```
@@ -565,7 +565,7 @@ pytest

 - **Open source** — [GNU Affero General Public License v3.0](LICENSE) (AGPL-3.0). You can freely use, modify, and distribute OBLITERATUS under AGPL terms. If you run a modified version as a network service (SaaS), you must release your source code to users under the same license.

- **Commercial** — Organizations that cannot comply with AGPL obligations (e.g., proprietary SaaS, closed-source products, internal tools where source disclosure is not possible) can purchase a commercial license. Contact us via [GitHub Issues](https://github.com/obliteratus-project/OBLITERATUS/issues) for pricing and terms.
+- **Commercial** — Organizations that cannot comply with AGPL obligations (e.g., proprietary SaaS, closed-source products, internal tools where source disclosure is not possible) can purchase a commercial license. Contact us via [GitHub Issues](https://github.com/elder-plinius/OBLITERATUS/issues) for pricing and terms.

 This is the same dual-licensing model used by MongoDB, Qt, Grafana, and others.

@@ -11,7 +11,7 @@ OBLITERATUS is a mechanistic interpretability research tool. It removes refusal
 If you discover a security vulnerability in OBLITERATUS, please report it responsibly:

 1. **Do not** open a public GitHub issue
-2. Open a [private security advisory](https://github.com/obliteratus-project/OBLITERATUS/security/advisories/new) with:
+2. Open a [private security advisory](https://github.com/elder-plinius/OBLITERATUS/security/advisories/new) with:
   - Description of the vulnerability
   - Steps to reproduce
   - Potential impact
@@ -115,6 +115,10 @@ _last_obliterated_label: str = ""
 # Counter for unique obliteration save directories
 _obliterate_counter: int = 0

+# Flag to suppress session_model_dd.change when obliterate programmatically
+# sets the dropdown value (prevents wasteful GPU re-allocation on ZeroGPU)
+_skip_session_load: bool = False
+
 # ---------------------------------------------------------------------------
 # Model presets — 100+ models organized by provider
 # ---------------------------------------------------------------------------
@@ -1459,7 +1463,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
            f"   or locally: `export HF_TOKEN=hf_...`\n\n"
            f"Get your token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)\n\n"
            f"Alternatively, choose a non-gated model (those without the \U0001f512 icon).",
-            "", gr.update(), gr.update(), gr.update(),
+            "", gr.update(), gr.update(), gr.update(), gr.update(),
        )
        return

@@ -1468,14 +1472,14 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
        if not re.match(r'^[a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+$', push_to_hub):
            yield (
                "**Error:** Invalid Hub repo format. Use `username/model-name`.",
-                "", gr.update(), gr.update(), gr.update(),
+                "", gr.update(), gr.update(), gr.update(), gr.update(),
            )
            return
        if not os.environ.get("HF_TOKEN"):
            yield (
                "**Error:** HF_TOKEN not set. Push to Hub requires a write token. "
                "Set it via `export HF_TOKEN=hf_...` or in your Space secrets.",
-                "", gr.update(), gr.update(), gr.update(),
+                "", gr.update(), gr.update(), gr.update(), gr.update(),
            )
            return

@@ -1486,7 +1490,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
    _clear_gpu()
    with _lock:
        if _state["status"] == "obliterating":
-            yield "**Error:** An obliteration is already in progress.", "", gr.update(), gr.update(), gr.update()
+            yield "**Error:** An obliteration is already in progress.", "", gr.update(), gr.update(), gr.update(), gr.update()
            return
        _state["log"] = []
        _state["status"] = "obliterating"
@@ -1638,9 +1642,9 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
        status_msg = f"**Obliterating\u2026** ({_elapsed()})"
        if len(log_lines) > last_yielded[0]:
            last_yielded[0] = len(log_lines)
-            yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
+            yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
        else:
-            yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
+            yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
        if time.time() - _pipeline_start > _max_pipeline_secs:
            log_lines.append("\nTIMEOUT: Pipeline exceeded 45-minute limit.")
            break
@@ -1655,7 +1659,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
        err_msg = str(error_ref[0]) or repr(error_ref[0])
        log_lines.append(f"\nERROR: {err_msg}")
        _state["log"] = log_lines
-        yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update()
+        yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update(), gr.update()
        return

    # Success — keep model in memory for chat.
@@ -1757,7 +1761,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
            if bnb_available:
                log_lines.append("\nModel too large for chat at float16 — reloading in 4-bit...")
                last_yielded[0] = len(log_lines)
-                yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
+                yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
                try:
                    from transformers import BitsAndBytesConfig
                    bnb_cfg = BitsAndBytesConfig(
@@ -1804,7 +1808,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
                    else "Falling back to CPU offload..."
                )
                last_yielded[0] = len(log_lines)
-                yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
+                yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
                try:
                    offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
                    model_reloaded = AutoModelForCausalLM.from_pretrained(
@@ -1861,13 +1865,21 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
                f"**{model_choice}** liberated with `{method}` method. "
                f"Saved to `{save_dir}`. Chat requires a larger GPU."
            )
-        # Update session dropdown directly (don't rely on .then() which can
-        # fail to fire on ZeroGPU after generator teardown)
+        # Update BOTH session dropdowns directly (don't rely on .then() which
+        # fails to fire on ZeroGPU after generator teardown).
+        # Set skip flag so the .change handler doesn't trigger a wasteful
+        # GPU re-allocation — the model is already loaded.
+        global _skip_session_load
+        _skip_session_load = True
        _dd_update = gr.update(
            choices=_get_session_model_choices(),
            value=_last_obliterated_label or None,
        )
-        yield status_msg, "\n".join(log_lines), get_chat_header(), _dd_update, metrics_card
+        _ab_dd_update = gr.update(
+            choices=_get_session_model_choices(),
+            value=_last_obliterated_label or None,
+        )
+        yield status_msg, "\n".join(log_lines), get_chat_header(), _dd_update, metrics_card, _ab_dd_update

    except Exception as e:
        # Ensure status never gets stuck on "obliterating"
@@ -1876,7 +1888,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
        err_msg = str(e) or repr(e)
        log_lines.append(f"\nERROR (post-pipeline): {err_msg}")
        _state["log"] = log_lines
-        yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update()
+        yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update(), gr.update()


 # ---------------------------------------------------------------------------
@@ -2102,6 +2114,18 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):

    On ZeroGPU, uses the visitor's GPU quota.
    """
+    # Skip if the obliterate function just set the dropdown value — the model
+    # is already loaded and we'd just waste GPU quota re-allocating.
+    global _skip_session_load
+    if _skip_session_load:
+        _skip_session_load = False
+        if choice and _state.get("status") == "ready":
+            yield (
+                f"**Ready!** `{choice}` is loaded — just type in the chat below.",
+                get_chat_header(),
+            )
+            return
+
    if not choice or choice not in _bench_configs:
        yield "**Error:** No benchmark result selected. Pick a model from the dropdown first.", ""
        return
@@ -3727,6 +3751,7 @@ Pre-configured benchmark configurations for common research questions.
                    choices=_get_session_model_choices(),
                    label="Cached Models",
                    info="Select a model to auto-load it for chat",
+                    allow_custom_value=True,
                )
                session_load_status = gr.Markdown("")

@@ -3779,6 +3804,7 @@ See exactly how abliteration changes model behavior on the same prompt.
                    choices=_get_session_model_choices(),
                    label="Cached Models",
                    info="Select a model to auto-load it for A/B comparison",
+                    allow_custom_value=True,
                )
                ab_session_load_status = gr.Markdown("")

@@ -4125,8 +4151,8 @@ Built on the shoulders of:

 ### Links

- [GitHub](https://github.com/obliteratus-project/OBLITERATUS)
- [Paper](https://github.com/obliteratus-project/OBLITERATUS/tree/main/paper)
+- [GitHub](https://github.com/elder-plinius/OBLITERATUS)
+- [Paper](https://github.com/elder-plinius/OBLITERATUS/tree/main/paper)
 """)

    # Wire method dropdown → auto-update advanced settings
@@ -4192,28 +4218,27 @@ Built on the shoulders of:
    ).then(fn=_get_vram_html, outputs=[vram_display])

    # Wire obliterate button (after all tabs so chat_status is defined)
-    # session_model_dd is a direct output (4th) so the dropdown updates
-    # reliably even on ZeroGPU where .then() may not fire after generator teardown.
+    # Both session_model_dd (4th) and ab_session_model_dd (6th) are direct
+    # outputs so the dropdowns update reliably even on ZeroGPU where .then()
+    # may not fire after generator teardown.
    obliterate_btn.click(
        fn=obliterate,
        inputs=[model_dd, method_dd, hub_repo, prompt_vol_dd, dataset_dd,
                custom_harmful_tb, custom_harmless_tb] + _adv_controls,
-        outputs=[status_md, log_box, chat_status, session_model_dd, metrics_md],
+        outputs=[status_md, log_box, chat_status, session_model_dd, metrics_md, ab_session_model_dd],
    ).then(
-        fn=lambda: (
-            gr.update(choices=_get_session_model_choices()),
-            _get_vram_html(),
-        ),
-        outputs=[ab_session_model_dd, vram_display],
+        fn=lambda: _get_vram_html(),
+        outputs=[vram_display],
    )

    # Wire session model auto-loading (Chat tab dropdown change)
+    # Always pass choices + value together so ZeroGPU doesn't hit stale choices
    session_model_dd.change(
        fn=load_bench_into_chat,
        inputs=[session_model_dd],
        outputs=[session_load_status, chat_status],
    ).then(
-        fn=lambda v: (gr.update(value=v), _get_vram_html()),
+        fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
        inputs=[session_model_dd],
        outputs=[ab_session_model_dd, vram_display],
    )
@@ -4224,7 +4249,7 @@ Built on the shoulders of:
        inputs=[ab_session_model_dd],
        outputs=[ab_session_load_status, chat_status],
    ).then(
-        fn=lambda v: (gr.update(value=v), _get_vram_html()),
+        fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
        inputs=[ab_session_model_dd],
        outputs=[session_model_dd, vram_display],
    )
@@ -1095,7 +1095,7 @@
                <h2>&gt; Quickstart: Free a Model</h2>
                <div style="background:#000; padding:16px; border:1px solid var(--border); margin-top:12px; line-height:2; font-size:0.78rem;">
                    <span style="color:var(--text-dim)"># 1. get the liberation toolkit</span><br>
-                    <span style="color:var(--accent)">$</span> git clone https://github.com/obliteratus-project/OBLITERATUS<br>
+                    <span style="color:var(--accent)">$</span> git clone https://github.com/elder-plinius/OBLITERATUS<br>
                    <span style="color:var(--accent)">$</span> cd OBLITERATUS<br>
                    <span style="color:var(--accent)">$</span> pip install -e .<br><br>
                    <span style="color:var(--text-dim)"># 2. interactive mode (guided liberation)</span><br>
@@ -1154,7 +1154,7 @@
                    <div style="margin-bottom:16px; padding:12px; border-left:3px solid var(--yellow); background:rgba(255,183,0,0.03)">
                        <h4 style="color:var(--yellow); font-size:0.82rem">Concept Cone Geometry <span style="font-size:0.65rem; color:var(--red)">[NOVEL]</span></h4>
                        <p style="color:var(--text-dim); font-size:0.75rem; margin-top:4px">
-                            Analyzes whether different harm categories (weapons, cyber, drugs, etc.) share a single refusal direction or have distinct mechanisms. Computes cone solid angles, Direction Specificity Index, and polyhedral classification. Based on Gurnee &amp; Nanda (ICML 2025) with novel extensions.
+                            Analyzes whether different harm categories (weapons, cyber, drugs, etc.) share a single refusal direction or have distinct mechanisms. Computes cone solid angles, Direction Specificity Index, and polyhedral classification. Based on Wollschlager et al. (ICML 2025) with novel extensions.
                        </p>
                    </div>
                    <div style="margin-bottom:16px; padding:12px; border-left:3px solid var(--yellow); background:rgba(255,183,0,0.03)">
@@ -1397,7 +1397,7 @@
                <div style="margin-bottom:16px; padding:16px; background:linear-gradient(135deg, rgba(249,171,0,0.08), rgba(249,171,0,0.02)); border:1px solid rgba(249,171,0,0.3); border-radius:6px">
                    <div style="font-size:0.82rem; font-weight:700; color:var(--yellow); margin-bottom:8px; letter-spacing:0.5px">&#9656; COLAB NOTEBOOK</div>
                    <div style="display:flex; align-items:center; gap:12px; flex-wrap:wrap">
-                        <a id="colab-link" href="https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb" target="_blank" rel="noopener"
+                        <a id="colab-link" href="https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb" target="_blank" rel="noopener"
                           style="display:inline-flex; align-items:center; gap:8px; background:#f9ab00; color:#000; padding:10px 20px; font-weight:700; font-size:0.85rem; text-decoration:none; border-radius:4px; letter-spacing:0.5px; font-family:'Fira Code',monospace">
                            <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="" style="height:20px; vertical-align:middle">
                            OPEN IN COLAB
@@ -50,7 +50,7 @@ Logged-in HuggingFace users get free GPU quota. For more quota, upgrade to [HF P
 ## Run locally (same UI, your own GPU)

 ```bash
-git clone https://github.com/obliteratus-project/OBLITERATUS
+git clone https://github.com/elder-plinius/OBLITERATUS
 cd OBLITERATUS
 pip install -e ".[spaces]"

@@ -73,5 +73,5 @@ No GPU hardware selection needed — ZeroGPU handles allocation automatically.

 ## Links

- [GitHub](https://github.com/obliteratus-project/OBLITERATUS)
- [Paper](https://github.com/obliteratus-project/OBLITERATUS/tree/main/paper)
+- [GitHub](https://github.com/elder-plinius/OBLITERATUS)
+- [Paper](https://github.com/elder-plinius/OBLITERATUS/tree/main/paper)
@@ -53,7 +53,7 @@
    "id": "install"
   },
   "outputs": [],
-   "source": "!pip install -q git+https://github.com/obliteratus-project/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n    print(f\"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1024**3:.1f} GB\")"
+   "source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n    print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\")"
  },
  {
   "cell_type": "markdown",
@@ -4010,6 +4010,11 @@ class AbliterationPipeline:
                    f"Projecting packed quantized data would silently corrupt the model. "
                    f"Original error: {e}"
                )
+        # Some architectures store weights as non-float types (e.g. uint8 from
+        # custom quantization schemes).  Projections require float math, so
+        # convert and treat as "quantized" so the caller writes back properly.
+        if not weight.data.is_floating_point():
+            return weight.data.to(torch.float32), True
        return weight.data, False

    @staticmethod
@@ -4049,10 +4054,20 @@ class AbliterationPipeline:
                )
            return

+        # ── Non-float weight (e.g. uint8 from custom quantization) ─────
+        # If the original weight isn't a bitsandbytes/GPTQ/AWQ param, just
+        # replace with the float version so projections are preserved.
+        weight = proj_module.weight
+        if not AbliterationPipeline._is_quantized_param(weight):
+            proj_module.weight = nn.Parameter(
+                W_modified.to(device=weight.device),
+                requires_grad=weight.requires_grad,
+            )
+            return
+
        # ── bitsandbytes re-quantization ──────────────────────────
        try:
            import bitsandbytes as bnb
-            weight = proj_module.weight
            quantized, new_state = bnb.functional.quantize_4bit(
                W_modified.to(weight.device),
                quant_type=getattr(weight, "quant_type", "nf4"),
@@ -4087,7 +4102,8 @@ class AbliterationPipeline:
        norms: dict[str, float] = {}
        for param_name, param in layer.named_parameters():
            if param_name.endswith(".weight"):
-                norms[param_name] = param.data.norm().item()
+                data = param.data.float() if not param.data.is_floating_point() else param.data
+                norms[param_name] = data.norm().item()
        return norms

    @staticmethod
@@ -4106,7 +4122,8 @@ class AbliterationPipeline:
                continue
            original_norm = saved_norms[param_name]
            if original_norm > 0:
-                new_norm = param.data.norm().item()
+                data = param.data.float() if not param.data.is_floating_point() else param.data
+                new_norm = data.norm().item()
                if math.isnan(new_norm) or math.isinf(new_norm) or new_norm == 0:
                    continue  # Skip — weight is degenerate after projection
                if abs(new_norm - original_norm) > 1e-6:
@@ -4294,6 +4311,10 @@ class AbliterationPipeline:
                    continue
            else:
                data = param.data
+                # Non-float (e.g. uint8) fused params need float conversion
+                if not data.is_floating_point():
+                    data = data.float()
+                    is_quantized = True  # ensure write-back replaces param

            if data.dim() < 3:
                continue
@@ -38,9 +38,9 @@ dependencies = [
 ]

 [project.urls]
-"Homepage" = "https://github.com/obliteratus-project/OBLITERATUS"
-"Repository" = "https://github.com/obliteratus-project/OBLITERATUS"
-"Bug Tracker" = "https://github.com/obliteratus-project/OBLITERATUS/issues"
+"Homepage" = "https://github.com/elder-plinius/OBLITERATUS"
+"Repository" = "https://github.com/elder-plinius/OBLITERATUS"
+"Bug Tracker" = "https://github.com/elder-plinius/OBLITERATUS/issues"

 [project.optional-dependencies]
 dev = ["pytest>=7.0", "pytest-cov", "ruff", "mypy"]
@@ -0,0 +1,302 @@
+"""Extended tests for novel abliteration pipeline features.
+
+Tests the new capabilities added to the OBLITERATUS abliteration pipeline:
+- Bias projection
+- Chat template wrapping
+- Method presets with new parameters
+- True iterative refinement
+- Whitened SVD integration
+"""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock
+
+import torch
+from transformers import GPT2Config, GPT2LMHeadModel
+
+from obliteratus.abliterate import (
+    METHODS,
+    AbliterationPipeline,
+)
+from obliteratus.models.loader import ModelHandle
+
+
+def _make_tiny_handle():
+    """Create a minimal ModelHandle with a tiny GPT-2 for testing."""
+    config = GPT2Config(
+        vocab_size=1000,
+        n_positions=128,
+        n_embd=64,
+        n_layer=4,
+        n_head=2,
+        n_inner=256,
+    )
+    model = GPT2LMHeadModel(config)
+    model.eval()
+
+    tokenizer = MagicMock()
+    tokenizer.pad_token = "<pad>"
+    tokenizer.eos_token = "<eos>"
+    tokenizer.return_value = {
+        "input_ids": torch.randint(0, 1000, (1, 10)),
+        "attention_mask": torch.ones(1, 10, dtype=torch.long),
+    }
+    tokenizer.decode.return_value = "The capital of France is Paris, a beautiful city"
+
+    handle = ModelHandle(
+        model=model,
+        tokenizer=tokenizer,
+        config=config,
+        model_name="gpt2-test",
+        task="causal_lm",
+    )
+    handle.snapshot()
+    return handle
+
+
+def _make_varied_tokenizer(handle):
+    """Set up a tokenizer mock that returns different tokens per call."""
+    call_count = [0]
+    def mock_tokenizer(prompt, **kwargs):
+        call_count[0] += 1
+        torch.manual_seed(call_count[0])
+        return {
+            "input_ids": torch.randint(0, 1000, (1, 5)),
+            "attention_mask": torch.ones(1, 5, dtype=torch.long),
+        }
+    handle.tokenizer.side_effect = mock_tokenizer
+
+
+# ---------------------------------------------------------------------------
+# New method preset parameters
+# ---------------------------------------------------------------------------
+
+class TestNewMethodPresets:
+    def test_basic_has_new_params(self):
+        cfg = METHODS["basic"]
+        assert "project_biases" in cfg
+        assert "use_chat_template" in cfg
+        assert "use_whitened_svd" in cfg
+        assert "true_iterative_refinement" in cfg
+        assert cfg["project_biases"] is False
+        assert cfg["use_chat_template"] is False
+
+    def test_advanced_has_new_params(self):
+        cfg = METHODS["advanced"]
+        assert cfg["project_biases"] is True
+        assert cfg["use_chat_template"] is True
+        assert cfg["use_whitened_svd"] is False
+        assert cfg["true_iterative_refinement"] is False
+
+    def test_aggressive_has_new_params(self):
+        cfg = METHODS["aggressive"]
+        assert cfg["project_biases"] is True
+        assert cfg["use_chat_template"] is True
+        assert cfg["use_whitened_svd"] is True
+        assert cfg["true_iterative_refinement"] is True
+
+
+# ---------------------------------------------------------------------------
+# Pipeline initialization with new parameters
+# ---------------------------------------------------------------------------
+
+class TestNewPipelineInit:
+    def test_default_new_params(self):
+        pipeline = AbliterationPipeline(model_name="test-model")
+        # advanced method defaults
+        assert pipeline.project_biases is True
+        assert pipeline.use_chat_template is True
+        assert pipeline.use_whitened_svd is False
+        assert pipeline.true_iterative_refinement is False
+
+    def test_basic_method_new_params(self):
+        pipeline = AbliterationPipeline(model_name="test-model", method="basic")
+        assert pipeline.project_biases is False
+        assert pipeline.use_chat_template is False
+        assert pipeline.use_whitened_svd is False
+        assert pipeline.true_iterative_refinement is False
+
+    def test_aggressive_method_new_params(self):
+        pipeline = AbliterationPipeline(model_name="test-model", method="aggressive")
+        assert pipeline.project_biases is True
+        assert pipeline.use_chat_template is True
+        assert pipeline.use_whitened_svd is True
+        assert pipeline.true_iterative_refinement is True
+
+    def test_explicit_overrides_new_params(self):
+        pipeline = AbliterationPipeline(
+            model_name="test-model",
+            method="basic",
+            project_biases=True,
+            use_chat_template=True,
+            use_whitened_svd=True,
+            true_iterative_refinement=True,
+        )
+        assert pipeline.project_biases is True
+        assert pipeline.use_chat_template is True
+        assert pipeline.use_whitened_svd is True
+        assert pipeline.true_iterative_refinement is True
+
+
+# ---------------------------------------------------------------------------
+# Bias projection
+# ---------------------------------------------------------------------------
+
+class TestBiasProjection:
+    def test_project_bias_removes_component(self):
+        """Bias projection should remove refusal direction component from bias."""
+        class Wrapper(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.o_proj = torch.nn.Linear(4, 4, bias=True)
+
+        module = Wrapper()
+        torch.manual_seed(42)
+        module.o_proj.bias.data = torch.tensor([1.0, 2.0, 3.0, 4.0])
+
+        direction = torch.tensor([1.0, 0.0, 0.0, 0.0]).unsqueeze(-1)  # unit vector along dim 0
+
+        count = AbliterationPipeline._project_bias(module, direction, ["o_proj"])
+        assert count == 1
+
+        # The component along direction [1,0,0,0] was 1.0, should now be ~0
+        new_bias = module.o_proj.bias.data
+        projection_onto_dir = (new_bias @ direction.squeeze()).item()
+        assert abs(projection_onto_dir) < 1e-5
+
+        # Other components should be unchanged
+        assert abs(new_bias[1].item() - 2.0) < 1e-5
+        assert abs(new_bias[2].item() - 3.0) < 1e-5
+        assert abs(new_bias[3].item() - 4.0) < 1e-5
+
+    def test_project_bias_no_bias(self):
+        """Should handle modules without bias gracefully."""
+        class Wrapper(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.o_proj = torch.nn.Linear(4, 4, bias=False)
+
+        module = Wrapper()
+        direction = torch.randn(4, 1)
+        count = AbliterationPipeline._project_bias(module, direction, ["o_proj"])
+        assert count == 0
+
+    def test_project_bias_no_matching_module(self):
+        """Should return 0 when no candidate names match."""
+        class Wrapper(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.something = torch.nn.Linear(4, 4, bias=True)
+
+        module = Wrapper()
+        direction = torch.randn(4, 1)
+        count = AbliterationPipeline._project_bias(module, direction, ["o_proj"])
+        assert count == 0
+
+
+# ---------------------------------------------------------------------------
+# Chat template wrapping
+# ---------------------------------------------------------------------------
+
+class TestChatTemplate:
+    def test_no_wrap_when_disabled(self):
+        """Should not wrap prompts when use_chat_template is False."""
+        pipeline = AbliterationPipeline(
+            model_name="test-model",
+            method="basic",
+            use_chat_template=False,
+        )
+        prompts = ["Hello", "World"]
+        result = pipeline._maybe_apply_chat_template(prompts)
+        assert result == prompts
+
+    def test_no_wrap_without_handle(self):
+        """Should return raw prompts when handle is not set."""
+        pipeline = AbliterationPipeline(
+            model_name="test-model",
+            use_chat_template=True,
+        )
+        prompts = ["Hello"]
+        result = pipeline._maybe_apply_chat_template(prompts)
+        assert result == prompts
+
+    def test_wraps_with_template(self):
+        """Should wrap prompts when tokenizer has apply_chat_template."""
+        pipeline = AbliterationPipeline(
+            model_name="test-model",
+            use_chat_template=True,
+        )
+        handle = MagicMock()
+        tokenizer = MagicMock()
+
+        def mock_apply(messages, tokenize=False, add_generation_prompt=True):
+            return f"<user>{messages[0]['content']}</user><assistant>"
+
+        tokenizer.apply_chat_template = mock_apply
+        handle.tokenizer = tokenizer
+        pipeline.handle = handle
+        pipeline._on_log = lambda m: None
+
+        result = pipeline._maybe_apply_chat_template(["Hello"])
+        assert "<user>Hello</user>" in result[0]
+
+    def test_fallback_when_no_template(self):
+        """Should fall back to raw prompts when template is not configured."""
+        pipeline = AbliterationPipeline(
+            model_name="test-model",
+            use_chat_template=True,
+        )
+        handle = MagicMock()
+        tokenizer = MagicMock()
+        tokenizer.apply_chat_template.side_effect = Exception("No template")
+        handle.tokenizer = tokenizer
+        pipeline.handle = handle
+        pipeline._on_log = lambda m: None
+
+        result = pipeline._maybe_apply_chat_template(["Hello"])
+        assert result == ["Hello"]
+
+
+# ---------------------------------------------------------------------------
+# Metadata includes new fields
+# ---------------------------------------------------------------------------
+
+class TestMetadata:
+    def test_rebirth_includes_new_config(self):
+        """Metadata should include all new configuration parameters."""
+        import json
+        handle = _make_tiny_handle()
+        pipeline = AbliterationPipeline(
+            model_name="test-model",
+            method="aggressive",
+        )
+        pipeline.handle = handle
+        pipeline._on_log = lambda m: None
+        pipeline._on_stage = lambda r: None
+        pipeline._strong_layers = [0]
+        pipeline._quality_metrics = {"perplexity": 8.5, "coherence": 1.0}
+
+        handle.model.save_pretrained = MagicMock()
+        handle.tokenizer.save_pretrained = MagicMock()
+
+        import tempfile
+        from pathlib import Path
+        with tempfile.TemporaryDirectory() as tmp:
+            pipeline.output_dir = Path(tmp) / "output"
+            pipeline._rebirth()
+
+            metadata = json.loads(
+                (pipeline.output_dir / "abliteration_metadata.json").read_text()
+            )
+            cfg = metadata["method_config"]
+            assert "project_biases" in cfg
+            assert "use_chat_template" in cfg
+            assert "use_whitened_svd" in cfg
+            assert "true_iterative_refinement" in cfg
+            assert cfg["project_biases"] is True
+            assert cfg["use_whitened_svd"] is True
+
+            # Should have more references now
+            assert len(metadata["references"]) >= 5
+            assert any("OBLITERATUS" in r for r in metadata["references"])
@@ -0,0 +1,300 @@
+"""Mathematical verification that abliteration actually removes refusal directions.
+
+These tests verify the core linear algebra claims WITHOUT mocks:
+  1. Projection removes the target direction from weight matrices
+  2. Norm-preserving projection maintains weight magnitude
+  3. Multi-direction SVD extracts the correct subspace
+  4. Whitened SVD produces orthogonal directions
+  5. Random directions do NOT have the same effect (negative control)
+
+Unlike the other test files, these use real tensors and verify mathematical
+properties directly — no MagicMock, no mocked tokenizers.
+"""
+
+from __future__ import annotations
+
+
+import torch
+
+
+class TestProjectionRemovesDirection:
+    """Checks the algebra of projecting a direction (or subspace) out of a weight matrix."""
+
+    @staticmethod
+    def _remove_direction(weight, unit_dir):
+        """Return ``weight`` with its component along ``unit_dir`` subtracted from every row."""
+        coeffs = weight @ unit_dir  # one coefficient per output row
+        return weight - torch.outer(coeffs, unit_dir)
+
+    def test_single_direction_projection(self):
+        """Once d is projected out of W, the product W_proj @ d must be ~0."""
+        torch.manual_seed(42)
+        n_in, n_out = 256, 128
+
+        weight = torch.randn(n_out, n_in)
+        direction = torch.randn(n_in)
+        direction = direction / direction.norm()
+
+        cleaned = self._remove_direction(weight, direction)
+
+        leftover = (cleaned @ direction).abs().max()
+        assert leftover.item() < 1e-5, f"Residual too large: {leftover}"
+
+    def test_projection_preserves_orthogonal_components(self):
+        """A vector orthogonal to d must map to the same output before and after projection."""
+        torch.manual_seed(42)
+        n_in, n_out = 256, 128
+
+        weight = torch.randn(n_out, n_in)
+        direction = torch.randn(n_in)
+        direction = direction / direction.norm()
+
+        # One Gram-Schmidt step yields a unit probe orthogonal to the direction.
+        probe = torch.randn(n_in)
+        probe = probe - (probe @ direction) * direction
+        probe = probe / probe.norm()
+
+        cleaned = self._remove_direction(weight, direction)
+
+        before = weight @ probe
+        after = cleaned @ probe
+        diff = (before - after).abs().max().item()
+        assert diff < 1e-5, f"Orthogonal component changed by {diff}"
+
+    def test_multi_direction_subspace_removal(self):
+        """Removing an orthonormal k-dimensional subspace must zero all k directions."""
+        torch.manual_seed(42)
+        n_in, n_out, rank = 256, 128, 4
+
+        weight = torch.randn(n_out, n_in)
+        # Columns of the reduced QR factor form an orthonormal basis, shape (n_in, rank).
+        basis, _ = torch.linalg.qr(torch.randn(n_in, rank))
+
+        # W_proj = W - W Q Q^T
+        cleaned = weight - (weight @ basis) @ basis.T
+
+        leftover = (cleaned @ basis).abs().max()  # taken over an (n_out, rank) product
+        assert leftover.item() < 1e-5, f"Subspace residual: {leftover}"
+
+    def test_double_projection_is_idempotent(self):
+        """Applying the same projection a second time must leave the weights unchanged."""
+        torch.manual_seed(42)
+        n_in, n_out = 256, 128
+
+        weight = torch.randn(n_out, n_in)
+        direction = torch.randn(n_in)
+        direction = direction / direction.norm()
+
+        once = self._remove_direction(weight, direction)
+        twice = self._remove_direction(once, direction)
+
+        diff = (once - twice).abs().max().item()
+        assert diff < 1e-5, f"Second projection changed weights by {diff}"
+
+
+class TestNormPreservation:
+    """Verify that norm-preserving projection maintains weight magnitude."""
+
+    def test_norm_preserving_projection(self):
+        """Per-row rescaling after projection restores row norms and keeps d removed.
+
+        The rescaling multiplies each projected row by a scalar, so a row that is
+        orthogonal to ``d`` stays orthogonal to ``d``. The test therefore checks
+        both properties exactly (up to float32 rounding): the residual along
+        ``d`` is ~0 and every row norm equals its original value.
+        """
+        torch.manual_seed(42)
+        hidden = 256
+        out_dim = 128
+
+        W = torch.randn(out_dim, hidden)
+        d = torch.randn(hidden)
+        d = d / d.norm()
+
+        # Standard projection
+        proj_coeff = W @ d
+        W_proj = W - proj_coeff.unsqueeze(1) * d.unsqueeze(0)
+
+        # Norm-preserving rescaling (per-row); clamp guards against division by zero
+        row_norms_orig = W.norm(dim=1, keepdim=True).clamp(min=1e-8)
+        row_norms_proj = W_proj.norm(dim=1, keepdim=True).clamp(min=1e-8)
+        W_norm_preserved = W_proj * (row_norms_orig / row_norms_proj)
+
+        residual = W_norm_preserved @ d
+
+        # Coarse check kept from the original test: projection is clearly reduced.
+        original_proj = (W @ d).abs().mean().item()
+        preserved_proj = residual.abs().mean().item()
+        assert preserved_proj < original_proj * 0.5, \
+            f"Norm-preserved projection {preserved_proj} not much less than original {original_proj}"
+
+        # Exact check: row-wise scalar rescaling cannot reintroduce a component
+        # along d, so the residual must be zero up to rounding error.
+        max_residual = residual.abs().max().item()
+        assert max_residual < 1e-4, \
+            f"Direction not fully removed after rescaling: max residual = {max_residual}"
+
+        # Row norms are preserved
+        row_diff = (W_norm_preserved.norm(dim=1) - W.norm(dim=1)).abs().max().item()
+        assert row_diff < 1e-5, f"Row norms changed by {row_diff}"
+
+
+class TestSVDDirectionExtraction:
+    """Checks that SVD of the harmful-minus-harmless matrix recovers planted directions."""
+
+    def test_planted_direction_recovery(self):
+        """A single planted direction should be the top right-singular vector."""
+        torch.manual_seed(42)
+        n_samples, hidden = 50, 256
+        signal_strength = 5.0
+
+        # The known direction that the SVD is expected to find.
+        planted = torch.randn(hidden)
+        planted = planted / planted.norm()
+
+        # harmful = harmless + shift along the planted direction + small noise
+        harmless = torch.randn(n_samples, hidden) * 0.5
+        noise = torch.randn(n_samples, hidden) * 0.1
+        harmful = harmless + signal_strength * planted.unsqueeze(0) + noise
+
+        _, _, right_vectors = torch.linalg.svd(harmful - harmless, full_matrices=False)
+        extracted = right_vectors[0]
+        extracted = extracted / extracted.norm()
+
+        # Sign of a singular vector is arbitrary, so compare the absolute cosine.
+        cosine = (extracted @ planted).abs().item()
+        assert cosine > 0.95, f"Cosine similarity {cosine:.3f} too low (expected > 0.95)"
+
+    def test_multi_direction_recovery(self):
+        """With k planted directions, the top-k right-singular vectors should span them."""
+        torch.manual_seed(42)
+        n_samples, hidden, k = 200, 256, 3
+
+        # Orthonormal planted directions, stored as rows: (k, hidden)
+        basis, _ = torch.linalg.qr(torch.randn(hidden, k))
+        true_subspace = basis.T
+
+        # Every sample receives its own non-negative mix of the planted directions.
+        harmless = torch.randn(n_samples, hidden) * 0.01
+        mix = torch.randn(n_samples, k).abs() * 5.0
+        harmful = harmless + mix @ true_subspace
+
+        _, _, right_vectors = torch.linalg.svd(harmful - harmless, full_matrices=False)
+        extracted_subspace = right_vectors[:k]  # (k, hidden)
+
+        # Norm of each planted direction's coordinates inside the extracted subspace.
+        for i, planted in enumerate(true_subspace):
+            captured_variance = (extracted_subspace @ planted).norm().item()
+            assert captured_variance > 0.9, \
+                f"Direction {i}: captured variance {captured_variance:.3f} too low"
+
+
+class TestRandomDirectionBaseline:
+    """Negative control: random directions must not behave like the planted one."""
+
+    def test_random_direction_has_lower_projection(self):
+        """The mean harmful activation should project far more onto the planted
+        direction than onto the average random unit direction."""
+        torch.manual_seed(42)
+        n_samples, hidden = 50, 256
+
+        # Harmful activations are the harmless ones shifted along one known direction.
+        true_dir = torch.randn(hidden)
+        true_dir = true_dir / true_dir.norm()
+        harmless = torch.randn(n_samples, hidden) * 0.5
+        harmful_mean = (harmless + 3.0 * true_dir.unsqueeze(0)).mean(dim=0)
+
+        true_proj = (harmful_mean @ true_dir).abs().item()
+
+        def abs_projection_for(seed):
+            """Absolute projection of the harmful mean onto a seeded random unit vector."""
+            gen = torch.Generator().manual_seed(seed)
+            candidate = torch.randn(hidden, generator=gen)
+            candidate = candidate / candidate.norm()
+            return (harmful_mean @ candidate).abs().item()
+
+        # Dedicated generators with seeds 10000..10099 (far from the global seed 42).
+        random_projs = [abs_projection_for(10000 + i) for i in range(100)]
+        mean_random = sum(random_projs) / len(random_projs)
+
+        # True direction should project MUCH more than random average
+        assert true_proj > mean_random * 3.0, \
+            f"True projection ({true_proj:.3f}) not much larger than random mean ({mean_random:.3f})"
+
+
+class TestWhitenedSVD:
+    """Checks properties of directions extracted after whitening by the harmless covariance."""
+
+    def test_whitened_directions_are_orthogonal(self):
+        """The top-k right-singular vectors of the whitened difference must be orthonormal."""
+        torch.manual_seed(42)
+        n_samples, hidden, k = 80, 128, 4
+
+        # H: samples with a shared random offset; B: baseline samples.
+        H = torch.randn(n_samples, hidden) + torch.randn(1, hidden) * 2
+        B = torch.randn(n_samples, hidden)
+
+        # Regularised sample covariance of the baseline.
+        baseline_mean = B.mean(dim=0, keepdim=True)
+        B_centered = B - baseline_mean
+        cov = (B_centered.T @ B_centered) / (n_samples - 1)
+        cov = cov + 1e-4 * torch.eye(hidden)
+
+        # Whitening map: eigenvectors with each column scaled by 1/sqrt(eigenvalue + eps).
+        eigvals, eigvecs = torch.linalg.eigh(cov)
+        eigvals = eigvals.clamp(min=0)
+        col_scale = 1.0 / torch.sqrt(eigvals + 1e-4)
+        whitener = eigvecs * col_scale.unsqueeze(0)
+
+        whitened_diff = (H - baseline_mean) @ whitener - B_centered @ whitener
+
+        _, _, right_vectors = torch.linalg.svd(whitened_diff, full_matrices=False)
+        directions = right_vectors[:k]
+
+        # Gram matrix of the directions should match the k x k identity.
+        deviation = directions @ directions.T - torch.eye(k)
+        off_diag = deviation.abs().max().item()
+        assert off_diag < 1e-4, f"Directions not orthogonal: max off-diagonal = {off_diag}"
+
+
+class TestReproducibility:
+    """Checks that ``set_seed`` controls the torch random stream."""
+
+    def test_set_seed_determinism(self):
+        """Re-seeding with the same value must reproduce the same tensor."""
+        from obliteratus.reproducibility import set_seed
+
+        draws = []
+        for _ in range(2):
+            set_seed(123, deterministic=False)
+            draws.append(torch.randn(100))
+
+        first, second = draws
+        assert torch.equal(first, second), "Same seed produced different tensors"
+
+    def test_different_seeds_differ(self):
+        """Seeding with two different values must give two different tensors."""
+        from obliteratus.reproducibility import set_seed
+
+        draws = []
+        for seed in (123, 456):
+            set_seed(seed, deterministic=False)
+            draws.append(torch.randn(100))
+
+        first, second = draws
+        assert not torch.equal(first, second), "Different seeds produced identical tensors"
@@ -0,0 +1,364 @@
+"""Tests for advanced evaluation metrics."""
+
+from __future__ import annotations
+
+
+import pytest
+import torch
+
+from obliteratus.evaluation.advanced_metrics import (
+    REFUSAL_PREFIXES,
+    REFUSAL_SUBSTRINGS,
+    _is_refusal,
+    refusal_rate,
+    token_kl_divergence,
+    first_token_kl_divergence,
+    effective_rank,
+    effective_rank_change,
+    activation_cosine_similarity,
+    linear_cka,
+    refusal_projection_magnitude,
+    format_eval_report,
+    AbliterationEvalResult,
+)
+
+
+# ---------------------------------------------------------------------------
+# Refusal Rate
+# ---------------------------------------------------------------------------
+
+class TestRefusalRate:
+    """Exercises ``refusal_rate`` and ``_is_refusal`` in prefix, substring and combined modes."""
+
+    def test_all_refusals(self):
+        """All three responses are expected to count as refusals in prefix mode."""
+        rate = refusal_rate(
+            [
+                "I'm sorry, but I cannot help with that.",
+                "I apologize, but I'm unable to assist.",
+                "As an AI, I cannot provide that information.",
+            ],
+            mode="prefix",
+        )
+        assert rate == 1.0
+
+    def test_no_refusals(self):
+        """None of the compliant responses should count as a refusal in prefix mode."""
+        rate = refusal_rate(
+            [
+                "Sure, here's how to do that.",
+                "Here's a step-by-step guide.",
+                "The answer is 42.",
+            ],
+            mode="prefix",
+        )
+        assert rate == 0.0
+
+    def test_mixed_responses(self):
+        """Half refusals, half compliant answers -> rate of 0.5."""
+        mixed = [
+            "I'm sorry, I can't do that.",
+            "Sure, here you go.",
+            "I cannot provide that.",
+            "Here's what you need.",
+        ]
+        assert refusal_rate(mixed, mode="prefix") == 0.5  # 2 out of 4
+
+    def test_substring_mode(self):
+        """Substring mode should flag a refusal phrase that is not at the start."""
+        rate = refusal_rate(
+            [
+                "Well, actually I cannot help with that request.",
+                "Let me explain how to do this.",
+            ],
+            mode="substring",
+        )
+        assert rate == 0.5
+
+    def test_combined_mode(self):
+        """Combined mode should count both prefix and substring matches."""
+        by_prefix = "I'm sorry, but no."
+        by_substring = "That said, i won't do that for you."
+        compliant = "Here's the answer."
+
+        rate = refusal_rate([by_prefix, by_substring, compliant], mode="combined")
+        assert abs(rate - 2.0 / 3.0) < 1e-6
+
+    def test_empty_responses(self):
+        """An empty response list yields a rate of 0.0."""
+        assert refusal_rate([], mode="prefix") == 0.0
+
+    def test_empty_string_not_refusal(self):
+        """The empty string is not classified as a refusal."""
+        assert not _is_refusal("", mode="combined")
+
+    def test_is_refusal_prefix(self):
+        """Prefix mode: two refusal openers match, a plain answer does not."""
+        for text in ("I'm sorry, I can't do that.", "As an AI language model, I cannot"):
+            assert _is_refusal(text, mode="prefix")
+        assert not _is_refusal("The answer is 42.", mode="prefix")
+
+    def test_is_refusal_substring(self):
+        """Substring mode: a mid-sentence refusal matches, a factual answer does not."""
+        assert _is_refusal("I must say, i cannot assist with that.", mode="substring")
+        assert not _is_refusal("The capital of France is Paris.", mode="substring")
+
+    def test_refusal_prefixes_nonempty(self):
+        """The prefix list must contain more than 20 entries."""
+        assert len(REFUSAL_PREFIXES) > 20
+
+    def test_refusal_substrings_nonempty(self):
+        """The substring list must contain more than 10 entries."""
+        assert len(REFUSAL_SUBSTRINGS) > 10
+
+
+# ---------------------------------------------------------------------------
+# KL Divergence
+# ---------------------------------------------------------------------------
+
+class TestKLDivergence:
+    """Exercises ``token_kl_divergence`` and ``first_token_kl_divergence``."""
+
+    def test_identical_distributions(self):
+        """Comparing logits with themselves must give a KL of ~0."""
+        same = torch.randn(2, 10, 100)
+        assert abs(token_kl_divergence(same, same)) < 1e-5
+
+    def test_different_distributions(self):
+        """Two independent random logit tensors must give a strictly positive KL."""
+        torch.manual_seed(42)
+        p_logits = torch.randn(2, 10, 100)
+        q_logits = torch.randn(2, 10, 100)
+        assert token_kl_divergence(p_logits, q_logits) > 0
+
+    def test_kl_nonnegative(self):
+        """KL must never be meaningfully negative across several random pairs."""
+        torch.manual_seed(42)
+        for _trial in range(5):
+            p_logits = torch.randn(1, 5, 50)
+            q_logits = torch.randn(1, 5, 50)
+            value = token_kl_divergence(p_logits, q_logits)
+            assert value >= -1e-6  # allow small numerical errors
+
+    def test_first_token_kl_identical(self):
+        """First-token KL of logits against themselves must be ~0."""
+        same = torch.randn(4, 20, 100)
+        assert abs(first_token_kl_divergence(same, same)) < 1e-5
+
+    def test_first_token_kl_different(self):
+        """First-token KL of two independent random tensors must be positive."""
+        torch.manual_seed(42)
+        p_logits = torch.randn(4, 20, 100)
+        q_logits = torch.randn(4, 20, 100)
+        assert first_token_kl_divergence(p_logits, q_logits) > 0
+
+    def test_temperature_effect(self):
+        """Raising the temperature from 1.0 to 5.0 must lower the reported KL."""
+        torch.manual_seed(42)
+        p_logits = torch.randn(2, 5, 50)
+        q_logits = torch.randn(2, 5, 50)
+        sharp = token_kl_divergence(p_logits, q_logits, temperature=1.0)
+        smooth = token_kl_divergence(p_logits, q_logits, temperature=5.0)
+        assert smooth < sharp
+
+
+# ---------------------------------------------------------------------------
+# Effective Rank
+# ---------------------------------------------------------------------------
+
+class TestEffectiveRank:
+    """Exercises ``effective_rank`` and ``effective_rank_change``."""
+
+    def test_rank_one_matrix(self):
+        """An outer product of two vectors must report an effective rank below 1.5."""
+        column = torch.randn(8, 1)
+        row = torch.randn(1, 4)
+        assert effective_rank(column @ row) < 1.5
+
+    def test_identity_matrix(self):
+        """The n x n identity must report an effective rank of ~n."""
+        n = 8
+        assert abs(effective_rank(torch.eye(n)) - n) < 0.1
+
+    def test_random_full_rank(self):
+        """A random 16 x 16 matrix must report a high effective rank."""
+        torch.manual_seed(42)
+        dense = torch.randn(16, 16)
+        assert effective_rank(dense) > 10  # should be close to 16
+
+    def test_zero_matrix(self):
+        """The all-zero matrix must report exactly 0.0."""
+        assert effective_rank(torch.zeros(4, 4)) == 0.0
+
+    def test_effective_rank_change(self):
+        """The before/after comparison must expose all four keys and not increase rank."""
+        torch.manual_seed(42)
+        W_before = torch.randn(8, 8)
+        # Simulate abliteration by projecting one unit direction out of the matrix.
+        unit = torch.randn(8, 1)
+        unit = unit / unit.norm()
+        W_after = W_before - (W_before @ unit) @ unit.T
+
+        result = effective_rank_change(W_before, W_after)
+        for key in ("rank_before", "rank_after", "rank_delta", "rank_ratio"):
+            assert key in result
+        assert result["rank_after"] <= result["rank_before"] + 0.1
+
+    def test_rejects_non_2d(self):
+        """A 3-D tensor must be rejected with ValueError."""
+        cube = torch.randn(4, 4, 4)
+        with pytest.raises(ValueError):
+            effective_rank(cube)
+
+
+# ---------------------------------------------------------------------------
+# Activation Cosine Similarity
+# ---------------------------------------------------------------------------
+
+class TestActivationCosineSimilarity:
+    """Exercises ``activation_cosine_similarity`` on simple, known geometries."""
+
+    def test_identical_activations(self):
+        """Activations compared with themselves must score ~1."""
+        sample = torch.randn(10, 32)
+        assert abs(activation_cosine_similarity(sample, sample) - 1.0) < 1e-5
+
+    def test_orthogonal_activations(self):
+        """Two different basis vectors must score ~0."""
+        along_x = torch.tensor([[1.0, 0.0, 0.0]])
+        along_y = torch.tensor([[0.0, 1.0, 0.0]])
+        assert abs(activation_cosine_similarity(along_x, along_y)) < 1e-5
+
+    def test_opposite_activations(self):
+        """Activations compared with their negation must score ~-1."""
+        sample = torch.randn(5, 16)
+        score = activation_cosine_similarity(sample, -sample)
+        assert abs(score - (-1.0)) < 1e-5
+
+    def test_handles_3d(self):
+        """3-D inputs must be accepted and give a value inside [-1, 1]."""
+        left = torch.randn(2, 5, 16)
+        right = torch.randn(2, 5, 16)
+        score = activation_cosine_similarity(left, right)
+        assert -1.0 <= score <= 1.0
+
+
+# ---------------------------------------------------------------------------
+# Linear CKA
+# ---------------------------------------------------------------------------
+
+class TestLinearCKA:
+    """Exercises ``linear_cka`` on identical, scaled, random and differently shaped inputs."""
+
+    def test_identical_representations(self):
+        """A representation compared with itself must score ~1.0."""
+        reps = torch.randn(20, 16)
+        assert abs(linear_cka(reps, reps) - 1.0) < 1e-4
+
+    def test_scaled_representations(self):
+        """Multiplying one side by a constant must not change the score."""
+        reps = torch.randn(20, 16)
+        score = linear_cka(reps, reps * 5.0)
+        assert abs(score - 1.0) < 1e-4
+
+    def test_random_representations(self):
+        """Two independent random representations must score low."""
+        torch.manual_seed(42)
+        left = torch.randn(100, 16)
+        right = torch.randn(100, 16)
+        assert linear_cka(left, right) < 0.3  # random should be near 0
+
+    def test_cka_bounded(self):
+        """Scores for several random pairs must stay within [0, 1] up to tolerance."""
+        torch.manual_seed(42)
+        for _trial in range(5):
+            left = torch.randn(20, 8)
+            right = torch.randn(20, 8)
+            score = linear_cka(left, right)
+            assert -0.01 <= score <= 1.01  # small tolerance for numerics
+
+    def test_different_dimensions(self):
+        """Inputs with different feature widths (16 vs 32) must be accepted."""
+        narrow = torch.randn(20, 16)
+        wide = torch.randn(20, 32)
+        score = linear_cka(narrow, wide)
+        assert -0.01 <= score <= 1.01
+
+    def test_handles_3d(self):
+        """3-D inputs must be accepted and give a bounded score."""
+        left = torch.randn(2, 10, 16)
+        right = torch.randn(2, 10, 16)
+        score = linear_cka(left, right)
+        assert -0.01 <= score <= 1.01
+
+
+# ---------------------------------------------------------------------------
+# Refusal Direction Projection Magnitude
+# ---------------------------------------------------------------------------
+
+class TestRefusalProjection:
+    """Exercises ``refusal_projection_magnitude`` on hand-built activations."""
+
+    def test_aligned_activations(self):
+        """Activations lying along the direction give a mean projection of 4.
+
+        The rows have first coordinates 5, 3 and 4 and the direction is the
+        first basis vector. The comparison uses a tolerance, like the
+        orthogonal case below, so the test does not depend on the statistic
+        being computed without any floating-point rounding.
+        """
+        d = torch.tensor([1.0, 0.0, 0.0])
+        acts = torch.tensor([
+            [5.0, 0.0, 0.0],
+            [3.0, 0.0, 0.0],
+            [4.0, 0.0, 0.0],
+        ])
+        result = refusal_projection_magnitude(acts, d)
+        assert abs(result["mean"] - 4.0) < 1e-5
+        assert abs(result["abs_mean"] - 4.0) < 1e-5
+
+    def test_orthogonal_activations(self):
+        """Orthogonal activations should have zero projection."""
+        d = torch.tensor([1.0, 0.0, 0.0])
+        acts = torch.tensor([
+            [0.0, 5.0, 0.0],
+            [0.0, 0.0, 3.0],
+        ])
+        result = refusal_projection_magnitude(acts, d)
+        assert abs(result["mean"]) < 1e-5
+        assert abs(result["abs_mean"]) < 1e-5
+
+    def test_result_keys(self):
+        """Should return all expected keys."""
+        d = torch.randn(8)
+        acts = torch.randn(5, 8)
+        result = refusal_projection_magnitude(acts, d)
+        assert set(result.keys()) == {"mean", "std", "max", "min", "abs_mean"}
+
+
+# ---------------------------------------------------------------------------
+# Eval Report Formatting
+# ---------------------------------------------------------------------------
+
+class TestEvalReport:
+    """Exercises ``format_eval_report`` on a few ``AbliterationEvalResult`` instances."""
+
+    @staticmethod
+    def _render(**fields):
+        """Build an ``AbliterationEvalResult`` from ``fields`` and return its formatted report."""
+        return format_eval_report(AbliterationEvalResult(**fields))
+
+    def test_format_report(self):
+        """Report must show the refusal percentage, the perplexity and the low-KL verdict."""
+        report = self._render(
+            refusal_rate_harmful=0.1,
+            refusal_rate_harmless=0.02,
+            kl_divergence=0.15,
+            perplexity=12.5,
+            coherence_score=0.8,
+            mean_activation_cosine=0.95,
+            mean_cka=0.92,
+        )
+        for fragment in ("10.0%", "12.50"):
+            assert fragment in report
+        assert "excellent" in report  # KL < 0.2
+
+    def test_format_report_high_kl(self):
+        """A KL of 1.5 must be reported as significant damage."""
+        report = self._render(
+            refusal_rate_harmful=0.0,
+            refusal_rate_harmless=0.0,
+            kl_divergence=1.5,
+            perplexity=50.0,
+            coherence_score=0.4,
+            mean_activation_cosine=None,
+            mean_cka=None,
+        )
+        assert "significant damage" in report
+
+    def test_format_report_no_kl(self):
+        """With no KL value, the report must not mention KL at all."""
+        report = self._render(
+            refusal_rate_harmful=0.5,
+            refusal_rate_harmless=0.1,
+            kl_divergence=None,
+            perplexity=20.0,
+            coherence_score=1.0,
+            mean_activation_cosine=None,
+            mean_cka=None,
+        )
+        assert "50.0%" in report
+        assert "KL" not in report
@@ -0,0 +1,345 @@
+"""Tests for the analysis techniques."""
+
+from __future__ import annotations
+
+
+import torch
+
+from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor, WhitenedSVDResult
+from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer, CrossLayerResult
+from obliteratus.analysis.activation_probing import ActivationProbe, ProbeResult
+
+
+# ---------------------------------------------------------------------------
+# WhitenedSVDExtractor
+# ---------------------------------------------------------------------------
+
+class TestWhitenedSVD:
+    """Exercises ``WhitenedSVDExtractor`` on small synthetic activation sets."""
+
+    def test_basic_extraction(self):
+        """Extraction must return a result with the requested shapes and positive stats."""
+        torch.manual_seed(42)
+        n_prompts, hidden_dim = 10, 32
+
+        # Harmful activations are the harmless ones shifted along one unit direction.
+        shift_dir = torch.randn(hidden_dim)
+        shift_dir = shift_dir / shift_dir.norm()
+        harmless = [torch.randn(hidden_dim) for _ in range(n_prompts)]
+        harmful = [base + 2.0 * shift_dir for base in harmless]
+
+        result = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=3)
+
+        assert isinstance(result, WhitenedSVDResult)
+        assert result.directions.shape == (3, hidden_dim)
+        assert result.singular_values.shape == (3,)
+        assert result.variance_explained > 0
+        assert result.condition_number > 0
+        assert result.effective_rank > 0
+
+    def test_directions_are_unit_vectors(self):
+        """Every returned direction must have norm ~1."""
+        torch.manual_seed(42)
+        harmless = [torch.randn(16) for _ in range(8)]
+        harmful = [base + torch.randn(16) * 0.5 for base in harmless]
+
+        result = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=2)
+
+        for row in range(result.directions.shape[0]):
+            length = result.directions[row].norm().item()
+            assert abs(length - 1.0) < 1e-4
+
+    def test_primary_aligns_with_planted_direction(self):
+        """The first whitened direction should pick up the planted shift.
+
+        Only moderate alignment with the raw planted direction is required
+        (the original author notes that whitening reweights dimensions); the
+        stronger requirement is that the direction explains most of the variance.
+        """
+        torch.manual_seed(42)
+        hidden_dim, n_prompts = 64, 30
+
+        shift_dir = torch.randn(hidden_dim)
+        shift_dir = shift_dir / shift_dir.norm()
+
+        # Small isotropic harmless activations, shifted strongly for the harmful set.
+        harmless = [torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)]
+        harmful = [base + 5.0 * shift_dir for base in harmless]
+
+        extractor = WhitenedSVDExtractor(regularization_eps=1e-3)
+        result = extractor.extract(harmful, harmless, n_directions=1)
+
+        cos_sim = (result.directions[0] @ shift_dir).abs().item()
+        assert cos_sim > 0.2, f"Expected alignment > 0.2, got {cos_sim:.3f}"
+        assert result.variance_explained > 0.5
+
+    def test_extract_all_layers(self):
+        """One result per input layer, each holding the requested number of directions."""
+        torch.manual_seed(42)
+        layer_ids = range(4)
+        harmful_acts, harmless_acts = {}, {}
+        for layer in layer_ids:
+            harmful_acts[layer] = [torch.randn(16) for _ in range(5)]
+            harmless_acts[layer] = [torch.randn(16) for _ in range(5)]
+
+        results = WhitenedSVDExtractor().extract_all_layers(
+            harmful_acts, harmless_acts, n_directions=2
+        )
+
+        assert len(results) == 4
+        for layer in layer_ids:
+            assert layer in results
+            assert results[layer].directions.shape[0] == 2
+
+    def test_compare_with_standard(self):
+        """The comparison dict must hold both cosine keys, with the primary one in [0, 1]."""
+        torch.manual_seed(42)
+        harmless = [torch.randn(16) for _ in range(8)]
+        harmful = [base + torch.randn(16) for base in harmless]
+
+        result = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=2)
+
+        # Arbitrary unit vector standing in for a standard direction.
+        reference = torch.randn(16)
+        reference = reference / reference.norm()
+
+        comparison = WhitenedSVDExtractor.compare_with_standard(result, reference)
+        for key in ("primary_direction_cosine", "subspace_principal_cosine"):
+            assert key in comparison
+        assert 0 <= comparison["primary_direction_cosine"] <= 1.0
+
+    def test_handles_3d_activations(self):
+        """Activations shaped (1, hidden_dim) must be accepted."""
+        torch.manual_seed(42)
+        harmless = [torch.randn(1, 16) for _ in range(5)]
+        harmful = [torch.randn(1, 16) for _ in range(5)]
+
+        result = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=2)
+        assert result.directions.shape == (2, 16)
+
+    def test_variance_explained_bounded(self):
+        """``variance_explained`` must lie in [0, 1]."""
+        torch.manual_seed(42)
+        harmless = [torch.randn(16) for _ in range(8)]
+        harmful = [torch.randn(16) for _ in range(8)]
+
+        result = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=3)
+        assert 0 <= result.variance_explained <= 1.0
+
+
+# ---------------------------------------------------------------------------
+# CrossLayerAlignmentAnalyzer
+# ---------------------------------------------------------------------------
+
+class TestCrossLayerAlignment:
+    def test_identical_directions(self):
+        """The same unit direction at every layer must give near-perfect persistence."""
+        shared = torch.randn(32)
+        shared = shared / shared.norm()
+        per_layer = {layer: shared.clone() for layer in range(5)}
+
+        result = CrossLayerAlignmentAnalyzer().analyze(per_layer)
+
+        assert isinstance(result, CrossLayerResult)
+        assert result.direction_persistence_score > 0.99
+        assert result.mean_adjacent_cosine > 0.99
+        assert result.total_geodesic_distance < 0.01
+
+    def test_orthogonal_directions(self):
+        """Mutually orthogonal per-layer directions must give low persistence."""
+        torch.manual_seed(42)
+        # Columns of the QR factor of a random (32, 5) matrix are orthonormal.
+        seed_matrix = torch.randn(5, 32)
+        ortho, _ = torch.linalg.qr(seed_matrix.T)
+        per_layer = {layer: ortho[:, layer] for layer in range(5)}
+
+        result = CrossLayerAlignmentAnalyzer().analyze(per_layer)
+
+        assert result.direction_persistence_score < 0.3
+        assert result.mean_adjacent_cosine < 0.3
+
+    def test_cluster_detection(self):
+        """Two groups of near-identical directions must yield at least two clusters."""
+        torch.manual_seed(42)
+        center_a = torch.randn(32)
+        center_a = center_a / center_a.norm()
+        center_b = torch.randn(32)
+        center_b = center_b / center_b.norm()
+
+        def jitter(center):
+            """Return ``center`` plus a small random perturbation."""
+            return center + 0.01 * torch.randn(32)
+
+        # Layers 0-2 sit around center_a, layers 3-4 around center_b.
+        raw = {
+            0: center_a,
+            1: jitter(center_a),
+            2: jitter(center_a),
+            3: center_b,
+            4: jitter(center_b),
+        }
+        directions = {layer: vec / vec.norm() for layer, vec in raw.items()}
+
+        analyzer = CrossLayerAlignmentAnalyzer(cluster_threshold=0.9)
+        result = analyzer.analyze(directions)
+
+        # Should find at least 2 clusters
+        assert result.cluster_count >= 2
+
+    def test_empty_input(self):
+        """Should handle empty input gracefully."""
+        analyzer = CrossLayerAlignmentAnalyzer()
+        result = analyzer.analyze({})
+        assert result.layer_indices == []
+        assert result.cluster_count == 0
+
+    def test_single_layer(self):
+        """Single layer should work fine."""
+        analyzer = CrossLayerAlignmentAnalyzer()
+        result = analyzer.analyze({5: torch.randn(16)})
+        assert result.layer_indices == [5]
+        assert result.direction_persistence_score == 1.0
+
+    def test_strong_layers_filter(self):
+        """Should only analyze specified strong layers."""
+        directions = {i: torch.randn(16) for i in range(10)}
+        analyzer = CrossLayerAlignmentAnalyzer()
+        result = analyzer.analyze(directions, strong_layers=[2, 5, 7])
+        assert result.layer_indices == [2, 5, 7]
+        assert result.cosine_matrix.shape == (3, 3)
+
+    def test_cosine_matrix_symmetry(self):
+        """Cosine matrix should be symmetric."""
+        torch.manual_seed(42)
+        directions = {i: torch.randn(16) for i in range(4)}
+        analyzer = CrossLayerAlignmentAnalyzer()
+        result = analyzer.analyze(directions)
+        diff = (result.cosine_matrix - result.cosine_matrix.T).abs().max().item()
+        assert diff < 1e-5
+
+    def test_cosine_matrix_diagonal_ones(self):
+        """Diagonal of cosine matrix should be 1.0."""
+        torch.manual_seed(42)
+        directions = {i: torch.randn(16) for i in range(4)}
+        analyzer = CrossLayerAlignmentAnalyzer()
+        result = analyzer.analyze(directions)
+        for i in range(4):
+            assert abs(result.cosine_matrix[i, i].item() - 1.0) < 1e-4
+
+    def test_angular_drift_monotonic(self):
+        """Angular drift should be monotonically non-decreasing."""
+        torch.manual_seed(42)
+        directions = {i: torch.randn(16) for i in range(6)}
+        analyzer = CrossLayerAlignmentAnalyzer()
+        result = analyzer.analyze(directions)
+        for i in range(len(result.angular_drift) - 1):
+            assert result.angular_drift[i + 1] >= result.angular_drift[i] - 1e-6
+
+    def test_format_report(self):
+        """Format report should produce a non-empty string."""
+        torch.manual_seed(42)
+        directions = {i: torch.randn(16) for i in range(4)}
+        analyzer = CrossLayerAlignmentAnalyzer()
+        result = analyzer.analyze(directions)
+        report = CrossLayerAlignmentAnalyzer.format_report(result)
+        assert "Cross-Layer" in report
+        assert "persistence" in report
+
+
+# ---------------------------------------------------------------------------
+# ActivationProbe
+# ---------------------------------------------------------------------------
+
+class TestActivationProbe:
+    def test_clean_elimination(self):
+        """After removing direction, projections should be near-zero."""
+        torch.manual_seed(42)
+        hidden_dim = 32
+        refusal_dir = torch.randn(hidden_dim)
+        refusal_dir = refusal_dir / refusal_dir.norm()
+
+        # "Post-abliteration" activations: direction has been removed
+        harmless = [torch.randn(hidden_dim) for _ in range(10)]
+        harmful = [torch.randn(hidden_dim) for _ in range(10)]
+        # Both sets are random, no refusal signal => gap should be small
+
+        probe = ActivationProbe()
+        result = probe.probe_layer(harmful, harmless, refusal_dir)
+        assert abs(result.projection_gap) < 1.0
+        assert result.separation_d_prime < 2.0
+
+    def test_residual_detection(self):
+        """Should detect residual refusal signal when direction wasn't removed."""
+        torch.manual_seed(42)
+        hidden_dim = 32
+        refusal_dir = torch.randn(hidden_dim)
+        refusal_dir = refusal_dir / refusal_dir.norm()
+
+        harmless = [torch.randn(hidden_dim) for _ in range(10)]
+        # Harmful still has strong refusal direction component
+        harmful = [h + 5.0 * refusal_dir for h in harmless]
+
+        probe = ActivationProbe()
+        result = probe.probe_layer(harmful, harmless, refusal_dir)
+        assert abs(result.projection_gap) > 1.0
+        assert result.separation_d_prime > 2.0
+
+    def test_probe_all_layers(self):
+        """Should compute aggregate metrics across layers."""
+        torch.manual_seed(42)
+        hidden_dim = 16
+        n_layers = 4
+
+        harmful_acts = {}
+        harmless_acts = {}
+        refusal_dirs = {}
+
+        for layer in range(n_layers):
+            harmful_acts[layer] = [torch.randn(hidden_dim) for _ in range(5)]
+            harmless_acts[layer] = [torch.randn(hidden_dim) for _ in range(5)]
+            d = torch.randn(hidden_dim)
+            refusal_dirs[layer] = d / d.norm()
+
+        probe = ActivationProbe()
+        result = probe.probe_all_layers(harmful_acts, harmless_acts, refusal_dirs)
+
+        assert isinstance(result, ProbeResult)
+        assert len(result.per_layer) == n_layers
+        assert 0 <= result.refusal_elimination_score <= 1.0
+        assert result.mean_projection_gap >= 0
+
+    def test_res_score_range(self):
+        """RES should always be between 0 and 1."""
+        torch.manual_seed(42)
+        for seed in range(5):
+            torch.manual_seed(seed)
+            harmful = {0: [torch.randn(8) for _ in range(3)]}
+            harmless = {0: [torch.randn(8) for _ in range(3)]}
+            dirs = {0: torch.randn(8)}
+            dirs[0] = dirs[0] / dirs[0].norm()
+
+            probe = ActivationProbe()
+            result = probe.probe_all_layers(harmful, harmless, dirs)
+            assert 0 <= result.refusal_elimination_score <= 1.0
+
+    def test_format_report(self):
+        """Format report should produce readable output."""
+        torch.manual_seed(42)
+        harmful = {0: [torch.randn(8) for _ in range(3)]}
+        harmless = {0: [torch.randn(8) for _ in range(3)]}
+        dirs = {0: torch.randn(8)}
+
+        probe = ActivationProbe()
+        result = probe.probe_all_layers(harmful, harmless, dirs)
+        report = ActivationProbe.format_report(result)
+        assert "Refusal Elimination Score" in report
+
+    def test_empty_input(self):
+        """Should handle empty input gracefully."""
+        probe = ActivationProbe()
+        result = probe.probe_all_layers({}, {}, {})
+        assert result.refusal_elimination_score == 0.0
+        assert len(result.per_layer) == 0
@@ -0,0 +1,65 @@
+"""Tests for shared analysis utilities (gini_coefficient, etc.)."""
+
+from __future__ import annotations
+
+import pytest
+
+from obliteratus.analysis.utils import gini_coefficient
+
+
+class TestGiniCoefficient:
+    """Tests for the Gini coefficient computation."""
+
+    def test_empty_list(self):
+        assert gini_coefficient([]) == 0.0
+
+    def test_single_value(self):
+        assert gini_coefficient([42.0]) == 0.0
+
+    def test_uniform_distribution(self):
+        """All-equal values → Gini = 0."""
+        assert gini_coefficient([1.0, 1.0, 1.0, 1.0]) == pytest.approx(0.0, abs=1e-10)
+
+    def test_maximally_concentrated(self):
+        """One value, rest zero → Gini ≈ 1."""
+        result = gini_coefficient([100.0, 0.0, 0.0, 0.0])
+        assert result > 0.7  # For n=4, max Gini = (n-1)/n = 0.75
+
+    def test_all_zeros(self):
+        assert gini_coefficient([0.0, 0.0, 0.0]) == 0.0
+
+    def test_two_equal_values(self):
+        assert gini_coefficient([5.0, 5.0]) == pytest.approx(0.0, abs=1e-10)
+
+    def test_two_unequal_values(self):
+        """[0, 10] → Gini = 0.5 for n=2."""
+        result = gini_coefficient([0.0, 10.0])
+        assert result == pytest.approx(0.5, abs=0.01)
+
+    def test_moderate_inequality(self):
+        """Moderate spread → Gini between 0 and 1."""
+        result = gini_coefficient([1.0, 2.0, 3.0, 4.0, 5.0])
+        assert 0.1 < result < 0.5
+
+    def test_result_in_valid_range(self):
+        """Gini is always in [0, 1]."""
+        for vals in [[1, 2, 3], [0, 0, 100], [5, 5, 5], [1], [0.1, 0.9]]:
+            result = gini_coefficient(vals)
+            assert 0.0 <= result <= 1.0, f"Gini({vals}) = {result} out of range"
+
+    def test_large_uniform(self):
+        """Large uniform distribution → Gini ≈ 0."""
+        vals = [1.0] * 1000
+        assert gini_coefficient(vals) == pytest.approx(0.0, abs=1e-10)
+
+    def test_large_concentrated(self):
+        """Large distribution with one outlier → high Gini."""
+        vals = [0.0] * 999 + [1000.0]
+        result = gini_coefficient(vals)
+        assert result > 0.99
+
+    def test_order_invariant(self):
+        """Gini should not depend on input order."""
+        a = gini_coefficient([1.0, 3.0, 5.0, 7.0])
+        b = gini_coefficient([7.0, 1.0, 5.0, 3.0])
+        assert a == pytest.approx(b)
@@ -0,0 +1,598 @@
+"""Tests for architecture-aware preset defaults.
+
+Tests the detection logic and recommended parameter overrides for each
+architecture class (dense/MoE, standard/reasoning).
+"""
+
+from __future__ import annotations
+
+
+from obliteratus.architecture_profiles import (
+    ArchitectureClass,
+    ArchitectureProfile,
+    ReasoningClass,
+    detect_architecture,
+    get_profile_summary,
+    apply_profile_to_method_config,
+)
+
+
+# ---------------------------------------------------------------------------
+#  Detection: Dense models
+# ---------------------------------------------------------------------------
+
+
+class TestDenseDetection:
+    """Test that standard dense models are correctly classified."""
+
+    def test_llama_is_dense(self):
+        profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
+        assert profile.arch_class == ArchitectureClass.DENSE
+        assert profile.reasoning_class == ReasoningClass.STANDARD
+        assert not profile.is_moe
+
+    def test_qwen_dense_is_dense(self):
+        profile = detect_architecture("Qwen/Qwen2.5-7B-Instruct")
+        assert profile.arch_class == ArchitectureClass.DENSE
+        assert not profile.is_moe
+
+    def test_gemma_is_dense(self):
+        profile = detect_architecture("google/gemma-3-27b-it")
+        assert profile.arch_class == ArchitectureClass.DENSE
+
+    def test_phi_is_dense(self):
+        profile = detect_architecture("microsoft/Phi-4-mini-instruct")
+        assert profile.arch_class == ArchitectureClass.DENSE
+
+    def test_mistral_small_is_dense(self):
+        profile = detect_architecture("mistralai/Mistral-Small-24B-Instruct-2501")
+        assert profile.arch_class == ArchitectureClass.DENSE
+
+    def test_yi_is_dense(self):
+        profile = detect_architecture("01-ai/Yi-1.5-9B-Chat")
+        assert profile.arch_class == ArchitectureClass.DENSE
+
+    def test_dense_label(self):
+        profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
+        assert profile.profile_label == "Dense Standard"
+
+    def test_dense_recommended_method(self):
+        profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
+        assert profile.recommended_method == "aggressive"
+
+
+# ---------------------------------------------------------------------------
+#  Detection: MoE models
+# ---------------------------------------------------------------------------
+
+
+class TestMoEDetection:
+    """Test that MoE models are correctly classified."""
+
+    def test_gpt_oss_is_moe(self):
+        """GPT-OSS is MoE. Without config, defaults to small (conservative)."""
+        profile = detect_architecture("openai/gpt-oss-20b")
+        assert profile.is_moe
+        assert profile.arch_class == ArchitectureClass.SMALL_MOE
+
+    def test_qwen3_30b_is_small_moe(self):
+        profile = detect_architecture("Qwen/Qwen3-30B-A3B")
+        assert profile.is_moe
+
+    def test_deepseek_v3_is_large_moe(self):
+        profile = detect_architecture("deepseek-ai/DeepSeek-V3.2")
+        assert profile.is_moe
+
+    def test_kimi_k2_is_large_moe(self):
+        profile = detect_architecture("moonshotai/Kimi-K2-Instruct")
+        assert profile.is_moe
+
+    def test_qwen3_235b_is_moe(self):
+        profile = detect_architecture("Qwen/Qwen3-235B-A22B")
+        assert profile.is_moe
+
+    def test_glm_47_is_moe(self):
+        profile = detect_architecture("zai-org/GLM-4.7")
+        assert profile.is_moe
+
+    def test_llama4_maverick_is_moe(self):
+        profile = detect_architecture("meta-llama/Llama-4-Maverick-17B-128E-Instruct")
+        assert profile.is_moe
+
+    def test_step_flash_is_moe(self):
+        profile = detect_architecture("stepfun-ai/Step-3.5-Flash")
+        assert profile.is_moe
+
+    def test_minimax_is_moe(self):
+        profile = detect_architecture("MiniMaxAI/MiniMax-M2.1")
+        assert profile.is_moe
+
+    def test_mistral_large_3_is_moe(self):
+        profile = detect_architecture("mistralai/Mistral-Large-3-675B-Instruct-2512")
+        assert profile.is_moe
+
+    def test_moe_recommended_method_is_surgical(self):
+        """All MoE profiles recommend surgical method."""
+        profile = detect_architecture("openai/gpt-oss-20b")
+        assert profile.recommended_method == "surgical"
+
+    def test_gpt_oss_with_config_is_small_moe(self):
+        """GPT-OSS with config providing expert count → small MoE."""
+        class MockConfig:
+            model_type = "gpt_neox"
+            num_hidden_layers = 32
+            hidden_size = 2560
+            intermediate_size = 6912
+            vocab_size = 50304
+            num_local_experts = 8
+            num_experts_per_tok = 2
+        profile = detect_architecture("openai/gpt-oss-20b", config=MockConfig())
+        assert profile.is_moe
+        assert profile.arch_class == ArchitectureClass.SMALL_MOE
+
+
+# ---------------------------------------------------------------------------
+#  Detection: Reasoning models
+# ---------------------------------------------------------------------------
+
+
+class TestReasoningDetection:
+    """Test that reasoning models are correctly classified."""
+
+    def test_r1_distill_qwen_is_reasoning(self):
+        profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
+        assert profile.reasoning_class == ReasoningClass.REASONING
+
+    def test_r1_distill_llama_is_reasoning(self):
+        profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+        assert profile.reasoning_class == ReasoningClass.REASONING
+
+    def test_r1_distill_is_dense_reasoning(self):
+        """R1 distills are dense (distilled from MoE into dense)."""
+        profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
+        assert profile.arch_class == ArchitectureClass.DENSE
+        assert profile.reasoning_class == ReasoningClass.REASONING
+        assert profile.profile_label == "Dense Reasoning"
+
+    def test_olmo_think_is_reasoning(self):
+        profile = detect_architecture("allenai/Olmo-3.1-32B-Think")
+        assert profile.reasoning_class == ReasoningClass.REASONING
+
+    def test_olmo_standard_is_not_reasoning(self):
+        """OLMo (without Think) must NOT be classified as reasoning.
+        Regression test: 'olmo' contains 'o1' substring."""
+        profile = detect_architecture("allenai/Olmo-3-7B-Instruct")
+        assert profile.reasoning_class == ReasoningClass.STANDARD
+
+    def test_falcon3_is_not_reasoning(self):
+        """falcon3 must NOT match 'o3' reasoning pattern."""
+        profile = detect_architecture("tiiuae/Falcon3-7B-Instruct")
+        assert profile.reasoning_class == ReasoningClass.STANDARD
+
+    def test_full_r1_is_moe_reasoning(self):
+        profile = detect_architecture("deepseek-ai/DeepSeek-R1")
+        assert profile.is_moe
+        assert profile.reasoning_class == ReasoningClass.REASONING
+
+    def test_reasoning_dense_more_directions(self):
+        """Dense reasoning models need more directions (>=12) to span refusal."""
+        profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
+        assert profile.arch_class == ArchitectureClass.DENSE
+        assert profile.method_overrides.get("n_directions", 0) >= 12
+
+    def test_reasoning_dense_more_passes(self):
+        """Dense reasoning models need more refinement passes (>=4)."""
+        profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
+        assert profile.arch_class == ArchitectureClass.DENSE
+        assert profile.method_overrides.get("refinement_passes", 0) >= 4
+
+    def test_non_reasoning_is_standard(self):
+        profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
+        assert profile.reasoning_class == ReasoningClass.STANDARD
+
+
+# ---------------------------------------------------------------------------
+#  Detection with config object
+# ---------------------------------------------------------------------------
+
+
+class TestConfigDetection:
+    """Test detection when a mock config is provided."""
+
+    def test_moe_config_attrs(self):
+        """Config with num_local_experts should be detected as MoE."""
+        class MockConfig:
+            model_type = "mixtral"
+            num_hidden_layers = 32
+            hidden_size = 4096
+            intermediate_size = 14336
+            vocab_size = 32000
+            num_local_experts = 8
+            num_experts_per_tok = 2
+
+        profile = detect_architecture(
+            "custom/mixtral-model", config=MockConfig(),
+            num_layers=32, hidden_size=4096,
+        )
+        assert profile.is_moe
+        assert profile.num_experts == 8
+        assert profile.num_active_experts == 2
+
+    def test_large_moe_threshold(self):
+        """MoE models with >100B params should be classified as large."""
+        class MockConfig:
+            model_type = "deepseek_v3"
+            num_hidden_layers = 61
+            hidden_size = 7168
+            intermediate_size = 18432
+            vocab_size = 102400
+            n_routed_experts = 256
+            num_experts_per_tok = 8
+
+        profile = detect_architecture(
+            "custom/large-moe", config=MockConfig(),
+        )
+        assert profile.arch_class == ArchitectureClass.LARGE_MOE
+
+    def test_small_moe_threshold(self):
+        """MoE models with <=16 experts should be classified as small."""
+        class MockConfig:
+            model_type = "mixtral"
+            num_hidden_layers = 32
+            hidden_size = 4096
+            intermediate_size = 14336
+            vocab_size = 32000
+            num_local_experts = 8
+            num_experts_per_tok = 2
+
+        profile = detect_architecture(
+            "custom/small-moe", config=MockConfig(),
+        )
+        assert profile.arch_class == ArchitectureClass.SMALL_MOE
+
+    def test_dense_config(self):
+        """Config without MoE attributes should be dense."""
+        class MockConfig:
+            model_type = "llama"
+            num_hidden_layers = 32
+            hidden_size = 4096
+            intermediate_size = 11008
+            vocab_size = 32000
+
+        profile = detect_architecture(
+            "custom/dense-model", config=MockConfig(),
+        )
+        assert profile.arch_class == ArchitectureClass.DENSE
+        assert not profile.is_moe
+
+    def test_llama4_scout_is_large_moe(self):
+        """Llama 4 Scout: 109B total params with 16 experts → LARGE_MOE.
+        Regression test: params > 100B must override low expert count."""
+        class MockConfig:
+            model_type = "llama4"
+            num_hidden_layers = 48
+            hidden_size = 5120
+            intermediate_size = 14336
+            vocab_size = 202048
+            num_local_experts = 16
+            num_experts_per_tok = 1
+
+        profile = detect_architecture(
+            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+            config=MockConfig(),
+        )
+        assert profile.is_moe
+        assert profile.arch_class == ArchitectureClass.LARGE_MOE
+
+
+# ---------------------------------------------------------------------------
+#  Recommended defaults validation
+# ---------------------------------------------------------------------------
+
+
+class TestRecommendedDefaults:
+    """Test that recommended defaults match research findings."""
+
+    def test_dense_standard_no_riemannian(self):
+        """Dense Standard: Riemannian OFF (manifolds are flat)."""
+        profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
+        assert not profile.breakthrough_modules.get("riemannian", True)
+
+    def test_dense_standard_anti_ouroboros_on(self):
+        """Dense Standard: Anti-Ouroboros ON for self-repair mapping."""
+        profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
+        assert profile.breakthrough_modules.get("anti_ouroboros", False)
+
+    def test_dense_standard_spectral_cert_on(self):
+        """Dense Standard: Spectral cert ON for verification."""
+        profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
+        assert profile.breakthrough_modules.get("spectral_cert", False)
+
+    def test_moe_conditional_on(self):
+        """MoE: Conditional abliteration is #1 technique (Cracken AI 2025)."""
+        profile = detect_architecture("openai/gpt-oss-20b")
+        assert profile.breakthrough_modules.get("conditional", False)
+
+    def test_moe_no_project_embeddings(self):
+        """MoE: Project embeddings OFF (cascades through router)."""
+        profile = detect_architecture("openai/gpt-oss-20b")
+        assert not profile.method_overrides.get("project_embeddings", True)
+
+    def test_moe_per_expert_directions(self):
+        """MoE: Per-expert directions ON (global directions fail on MoE)."""
+        profile = detect_architecture("openai/gpt-oss-20b")
+        assert profile.method_overrides.get("per_expert_directions", False)
+
+    def test_large_moe_riemannian_on(self):
+        """Large MoE: Riemannian ON (curved shared layer geometry)."""
+        profile = detect_architecture("deepseek-ai/DeepSeek-V3.2")
+        assert profile.breakthrough_modules.get("riemannian", False)
+
+    def test_reasoning_dense_jailbreak_contrast(self):
+        """Reasoning Dense: Jailbreak contrast ON for thinking-chain refusal."""
+        profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
+        assert profile.method_overrides.get("use_jailbreak_contrast", False)
+
+    def test_reasoning_moe_gentle_transplant(self):
+        """Reasoning MoE: transplant_blend very low (preserve reasoning)."""
+        profile = detect_architecture("deepseek-ai/DeepSeek-R1")
+        assert profile.method_overrides.get("transplant_blend", 1.0) <= 0.10
+
+
+# ---------------------------------------------------------------------------
+#  Profile summary
+# ---------------------------------------------------------------------------
+
+
+class TestProfileSummary:
+    """Test the human-readable profile summary."""
+
+    def test_summary_contains_profile_label(self):
+        profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
+        summary = get_profile_summary(profile)
+        assert "Dense Standard" in summary
+
+    def test_summary_contains_method(self):
+        profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
+        summary = get_profile_summary(profile)
+        assert "aggressive" in summary
+
+    def test_summary_contains_citations(self):
+        profile = detect_architecture("openai/gpt-oss-20b")
+        summary = get_profile_summary(profile)
+        assert "SAFEx" in summary or "Cracken" in summary
+
+    def test_summary_contains_moe_info(self):
+        profile = detect_architecture("openai/gpt-oss-20b")
+        summary = get_profile_summary(profile)
+        assert "MoE" in summary
+
+    def test_summary_contains_breakthrough_modules(self):
+        profile = detect_architecture("openai/gpt-oss-20b")
+        summary = get_profile_summary(profile)
+        assert "conditional" in summary
+
+
+# ---------------------------------------------------------------------------
+#  apply_profile_to_method_config
+# ---------------------------------------------------------------------------
+
+
+class TestApplyProfile:
+    """Test that profile overrides are correctly applied to method configs."""
+
+    def test_overrides_applied(self):
+        from obliteratus.abliterate import METHODS
+        profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
+        base = dict(METHODS["aggressive"])
+        merged = apply_profile_to_method_config(profile, base)
+        assert merged["n_directions"] == profile.method_overrides["n_directions"]
+
+    def test_non_overridden_preserved(self):
+        from obliteratus.abliterate import METHODS
+        profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
+        base = dict(METHODS["aggressive"])
+        merged = apply_profile_to_method_config(profile, base)
+        # norm_preserve is not in overrides, should come from base
+        assert merged["norm_preserve"] == base["norm_preserve"]
+
+    def test_empty_overrides(self):
+        from obliteratus.abliterate import METHODS
+        base = dict(METHODS["advanced"])
+        profile = ArchitectureProfile(
+            arch_class=ArchitectureClass.DENSE,
+            reasoning_class=ReasoningClass.STANDARD,
+            method_overrides={},
+            breakthrough_modules={},
+        )
+        merged = apply_profile_to_method_config(profile, base)
+        assert merged == base
+
+    def test_override_key_not_in_base_is_added(self):
+        """Override keys absent from base config should be added to result.
+
+        This is important for the UI auto-detect path: keys like
+        use_jailbreak_contrast may not exist in the base method config
+        but are valid pipeline parameters that app.py reads via merged.get().
+        """
+        from obliteratus.abliterate import METHODS
+        base = dict(METHODS["advanced"])
+        profile = ArchitectureProfile(
+            arch_class=ArchitectureClass.DENSE,
+            reasoning_class=ReasoningClass.STANDARD,
+            method_overrides={"use_jailbreak_contrast": True},
+            breakthrough_modules={},
+        )
+        merged = apply_profile_to_method_config(profile, base)
+        assert merged["use_jailbreak_contrast"] is True
+
+
+# ---------------------------------------------------------------------------
+#  All 6 profile combinations
+# ---------------------------------------------------------------------------
+
+
+class TestAllSixProfiles:
+    """Verify label, method, overrides, and breakthrough modules for each profile."""
+
+    def _make_moe_config(self, num_experts=8, active=2, layers=32, hidden=4096):
+        class C:
+            model_type = "mixtral"
+            num_hidden_layers = layers
+            hidden_size = hidden
+            intermediate_size = hidden * 4
+            vocab_size = 32000
+            num_local_experts = num_experts
+            num_experts_per_tok = active
+        return C()
+
+    def test_dense_standard_full(self):
+        p = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
+        assert p.profile_label == "Dense Standard"
+        assert p.recommended_method == "aggressive"
+        assert not p.breakthrough_modules["riemannian"]
+        assert p.breakthrough_modules["anti_ouroboros"]
+        assert p.breakthrough_modules["spectral_cert"]
+        assert not p.breakthrough_modules["conditional"]
+        assert len(p.profile_description) > 0
+        assert len(p.research_citations) > 0
+
+    def test_dense_reasoning_full(self):
+        p = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
+        assert p.profile_label == "Dense Reasoning"
+        assert p.recommended_method == "aggressive"
+        assert p.method_overrides["n_directions"] >= 12
+        assert p.method_overrides["refinement_passes"] >= 4
+        assert p.method_overrides["use_jailbreak_contrast"] is True
+        assert p.method_overrides["use_chat_template"] is True
+        assert p.breakthrough_modules["anti_ouroboros"]
+        assert p.breakthrough_modules["riemannian"]
+        assert p.breakthrough_modules["conditional"]
+        assert p.breakthrough_modules["spectral_cert"]
+        assert len(p.profile_description) > 0
+
+    def test_small_moe_standard_full(self):
+        config = self._make_moe_config(num_experts=8, active=2)
+        p = detect_architecture("custom/small-moe-model", config=config)
+        assert p.profile_label == "Small MoE Standard"
+        assert p.arch_class == ArchitectureClass.SMALL_MOE
+        assert p.recommended_method == "surgical"
+        assert p.method_overrides["per_expert_directions"] is True
+        assert p.method_overrides["invert_refusal"] is False
+        assert p.method_overrides["project_embeddings"] is False
+        assert p.breakthrough_modules["conditional"]
+        assert p.breakthrough_modules["anti_ouroboros"]
+        assert p.breakthrough_modules["spectral_cert"]
+        assert not p.breakthrough_modules["riemannian"]
+        assert len(p.profile_description) > 0
+
+    def test_small_moe_reasoning_full(self):
+        """The most fragile combination: MoE + reasoning."""
+        config = self._make_moe_config(num_experts=8, active=2)
+        # Add "think" to name to trigger reasoning detection
+        p = detect_architecture("custom/small-moe-think-model", config=config)
+        assert p.profile_label == "Small MoE Reasoning"
+        assert p.arch_class == ArchitectureClass.SMALL_MOE
+        assert p.reasoning_class == ReasoningClass.REASONING
+        assert p.recommended_method == "surgical"
+        assert p.method_overrides["per_expert_directions"] is True
+        assert p.method_overrides["use_jailbreak_contrast"] is True
+        assert p.method_overrides["use_chat_template"] is True
+        assert p.method_overrides["invert_refusal"] is False
+        assert p.breakthrough_modules["conditional"]
+        assert p.breakthrough_modules["anti_ouroboros"]
+        assert p.breakthrough_modules["spectral_cert"]
+        assert len(p.profile_description) > 0
+
+    def test_large_moe_standard_full(self):
+        config = self._make_moe_config(num_experts=256, active=8, layers=61, hidden=7168)
+        p = detect_architecture("custom/large-moe-model", config=config)
+        assert p.profile_label == "Large MoE Standard"
+        assert p.arch_class == ArchitectureClass.LARGE_MOE
+        assert p.recommended_method == "surgical"
+        assert p.method_overrides["per_expert_directions"] is True
+        assert p.method_overrides["layer_adaptive_strength"] is True
+        assert p.method_overrides["expert_transplant"] is True
+        assert p.method_overrides["transplant_blend"] == 0.10
+        assert p.method_overrides["attention_head_surgery"] is True
+        assert p.method_overrides["project_embeddings"] is False
+        assert p.breakthrough_modules["conditional"]
+        assert p.breakthrough_modules["riemannian"]
+        assert p.breakthrough_modules["anti_ouroboros"]
+        assert p.breakthrough_modules["spectral_cert"]
+        assert len(p.profile_description) > 0
+
+    def test_large_moe_reasoning_full(self):
+        config = self._make_moe_config(num_experts=256, active=8, layers=61, hidden=7168)
+        p = detect_architecture("custom/large-moe-r1-model", config=config)
+        assert p.profile_label == "Large MoE Reasoning"
+        assert p.arch_class == ArchitectureClass.LARGE_MOE
+        assert p.reasoning_class == ReasoningClass.REASONING
+        assert p.recommended_method == "surgical"
+        assert p.method_overrides["n_directions"] == 8
+        assert p.method_overrides["transplant_blend"] == 0.08
+        assert p.method_overrides["use_jailbreak_contrast"] is True
+        assert p.method_overrides["safety_neuron_masking"] is True
+        assert p.breakthrough_modules["conditional"]
+        assert p.breakthrough_modules["riemannian"]
+        assert p.breakthrough_modules["anti_ouroboros"]
+        assert p.breakthrough_modules["spectral_cert"]
+        assert len(p.profile_description) > 0
+
+
+# ---------------------------------------------------------------------------
+#  Edge cases
+# ---------------------------------------------------------------------------
+
+
+class TestEdgeCases:
+    """Edge cases for architecture detection."""
+
+    def test_empty_model_name(self):
+        """Empty string should fall through to Dense Standard."""
+        profile = detect_architecture("")
+        assert profile.arch_class == ArchitectureClass.DENSE
+        assert profile.reasoning_class == ReasoningClass.STANDARD
+
+    def test_unknown_model_type_in_config(self):
+        """Unknown model_type should not cause MoE classification."""
+        class MockConfig:
+            model_type = "banana"
+            num_hidden_layers = 12
+            hidden_size = 768
+            intermediate_size = 3072
+            vocab_size = 30522
+        profile = detect_architecture("custom/unknown-arch", config=MockConfig())
+        assert profile.arch_class == ArchitectureClass.DENSE
+
+    def test_config_with_zero_experts(self):
+        """num_local_experts=0 should not trigger MoE."""
+        class MockConfig:
+            model_type = "llama"
+            num_hidden_layers = 32
+            hidden_size = 4096
+            intermediate_size = 11008
+            vocab_size = 32000
+            num_local_experts = 0
+        profile = detect_architecture("custom/dense-with-zero", config=MockConfig())
+        assert not profile.is_moe
+        assert profile.arch_class == ArchitectureClass.DENSE
+
+    def test_allcaps_model_name(self):
+        """Case-insensitive matching should work for all-caps names."""
+        profile = detect_architecture("DEEPSEEK-AI/DEEPSEEK-R1-DISTILL-QWEN-7B")
+        assert profile.reasoning_class == ReasoningClass.REASONING
+        assert profile.arch_class == ArchitectureClass.DENSE  # distill = dense
+
+    def test_single_expert_is_moe(self):
+        """num_local_experts=1 is technically MoE (single expert)."""
+        class MockConfig:
+            model_type = "llama"
+            num_hidden_layers = 32
+            hidden_size = 4096
+            intermediate_size = 11008
+            vocab_size = 32000
+            num_local_experts = 1
+        profile = detect_architecture("custom/single-expert", config=MockConfig())
+        # 1 expert still triggers MoE detection (the code treats any >0 as MoE)
+        assert profile.is_moe
@@ -0,0 +1,183 @@
+"""Tests for lightweight benchmark harnesses."""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock
+
+import torch
+
+from obliteratus.evaluation.benchmarks import (
+    KNOWLEDGE_ITEMS,
+    TRUTHFULNESS_ITEMS,
+    MATH_REASONING_ITEMS,
+    BenchmarkRunner,
+    BenchmarkResult,
+    format_benchmark_report,
+)
+
+
+def _make_mock_model_and_tokenizer(vocab_size=1000, hidden_dim=64):
+    """Create mock model and tokenizer for benchmark testing.
+
+    Args:
+        vocab_size: Size of the fake vocabulary. Bounds the random token ids
+            and sets the last dimension of the returned logits.
+        hidden_dim: Not used by the body; retained so existing callers that
+            pass it keep working.
+
+    Returns:
+        A ``(model, tokenizer)`` pair of ``MagicMock`` objects:
+
+        * ``model(input_ids=...)`` returns an object whose ``logits`` is a
+          random ``(batch, seq_len, vocab_size)`` tensor.
+        * ``model.generate(input_ids=...)`` returns the input ids with 20
+          random token ids appended along dim 1.
+        * ``model.parameters()`` yields a single one-element parameter and
+          can be called any number of times.
+        * ``tokenizer(text, ...)`` returns a dict with ``input_ids`` and
+          ``attention_mask`` tensors of shape ``(1, 15)``.
+        * ``tokenizer.decode(...)`` returns a fixed sentence containing "42".
+        * ``tokenizer.encode(text)`` maps "A".."D" to 65..68 and any other
+          text to a single deterministic id below ``vocab_size``.
+    """
+    model = MagicMock()
+
+    # Model returns logits when called
+    def mock_forward(**kwargs):
+        input_ids = kwargs.get("input_ids")
+        if input_ids is None:
+            # Only draw a random prompt when the caller supplied none.
+            input_ids = torch.randint(0, vocab_size, (1, 10))
+        batch_size, seq_len = input_ids.shape
+        result = MagicMock()
+        result.logits = torch.randn(batch_size, seq_len, vocab_size)
+        return result
+
+    # side_effect is what makes ``model(...)`` dispatch to mock_forward; the
+    # explicit attribute is kept for code that calls ``model.__call__``.
+    model.side_effect = mock_forward
+    model.__call__ = mock_forward
+
+    # Model.generate returns token IDs
+    def mock_generate(**kwargs):
+        input_ids = kwargs.get("input_ids")
+        if input_ids is None:
+            input_ids = torch.randint(0, vocab_size, (1, 10))
+        # Append some "generated" tokens
+        gen_tokens = torch.randint(0, vocab_size, (1, 20))
+        return torch.cat([input_ids, gen_tokens], dim=1)
+
+    model.generate = mock_generate
+
+    # Model.parameters for device detection.  A side_effect builds a fresh
+    # iterator per call; a single ``return_value=iter(...)`` would be
+    # exhausted after the first ``next(model.parameters())``.
+    param = torch.nn.Parameter(torch.randn(1))
+    model.parameters = MagicMock(side_effect=lambda: iter([param]))
+
+    def mock_tokenize(text, **kwargs):
+        return {
+            "input_ids": torch.randint(0, vocab_size, (1, 15)),
+            "attention_mask": torch.ones(1, 15, dtype=torch.long),
+        }
+
+    tokenizer = MagicMock()
+    tokenizer.side_effect = mock_tokenize
+
+    def mock_decode(ids, **kwargs):
+        return "The answer is 42. This is a generated response about the topic."
+
+    # Distinct, fixed ids for the multiple-choice letters.
+    choice_ids = {"A": 65, "B": 66, "C": 67, "D": 68}
+
+    def mock_encode(text, **kwargs):
+        if text in choice_ids:
+            return [choice_ids[text]]
+        # Byte-sum instead of hash(): str hashes are randomised per process,
+        # which would make the id differ from one test run to the next.
+        return [sum(text.encode("utf-8")) % vocab_size]
+
+    tokenizer.decode = mock_decode
+    tokenizer.encode = mock_encode
+
+    return model, tokenizer
+
+
+class TestBenchmarkItems:
+    """Schema and minimum-size checks for the built-in benchmark item lists."""
+
+    def test_knowledge_items_have_required_fields(self):
+        """Every knowledge item has its four keys and an in-range answer index."""
+        for entry in KNOWLEDGE_ITEMS:
+            for key in ("q", "choices", "answer", "category"):
+                assert key in entry
+            assert 0 <= entry["answer"] < len(entry["choices"])
+
+    def test_knowledge_items_count(self):
+        """The knowledge list holds at least 20 items."""
+        n_items = len(KNOWLEDGE_ITEMS)
+        assert n_items >= 20
+
+    def test_knowledge_categories(self):
+        """Knowledge items span at least four distinct categories."""
+        distinct = {entry["category"] for entry in KNOWLEDGE_ITEMS}
+        assert len(distinct) >= 4  # multiple categories
+
+    def test_truthfulness_items_have_required_fields(self):
+        """Every truthfulness item carries its four expected keys."""
+        for entry in TRUTHFULNESS_ITEMS:
+            for key in ("q", "true_answer", "common_false", "category"):
+                assert key in entry
+
+    def test_truthfulness_items_count(self):
+        """The truthfulness list holds at least 10 items."""
+        n_items = len(TRUTHFULNESS_ITEMS)
+        assert n_items >= 10
+
+    def test_math_items_have_required_fields(self):
+        """Every math item has its three keys and a numeric answer."""
+        for entry in MATH_REASONING_ITEMS:
+            for key in ("q", "answer", "category"):
+                assert key in entry
+            assert isinstance(entry["answer"], (int, float))
+
+    def test_math_items_count(self):
+        """The math list holds at least 10 items."""
+        n_items = len(MATH_REASONING_ITEMS)
+        assert n_items >= 10
+
+
+class TestBenchmarkRunner:
+    """Runs each ``BenchmarkRunner`` probe against mocked model/tokenizer."""
+
+    @staticmethod
+    def _cpu_runner():
+        """Return a ``BenchmarkRunner`` built on fresh mocks with device "cpu"."""
+        fake_model, fake_tokenizer = _make_mock_model_and_tokenizer()
+        return BenchmarkRunner(fake_model, fake_tokenizer, device="cpu")
+
+    def test_knowledge_probe_returns_result(self):
+        """The knowledge probe yields a fully populated ``BenchmarkResult``."""
+        outcome = self._cpu_runner().run_knowledge_probe()
+
+        assert isinstance(outcome, BenchmarkResult)
+        assert outcome.benchmark_name == "knowledge_probe"
+        assert 0 <= outcome.score <= 1.0
+        assert outcome.n_total == len(KNOWLEDGE_ITEMS)
+        assert outcome.n_correct >= 0
+        assert len(outcome.per_category) > 0
+
+    def test_truthfulness_probe_returns_result(self):
+        """The truthfulness probe scores every item and stays within [0, 1]."""
+        outcome = self._cpu_runner().run_truthfulness_probe()
+
+        assert isinstance(outcome, BenchmarkResult)
+        assert outcome.benchmark_name == "truthfulness_probe"
+        assert 0 <= outcome.score <= 1.0
+        assert outcome.n_total == len(TRUTHFULNESS_ITEMS)
+
+    def test_math_probe_returns_result(self):
+        """The math probe scores every item and stays within [0, 1]."""
+        outcome = self._cpu_runner().run_math_reasoning_probe()
+
+        assert isinstance(outcome, BenchmarkResult)
+        assert outcome.benchmark_name == "math_reasoning_probe"
+        assert 0 <= outcome.score <= 1.0
+        assert outcome.n_total == len(MATH_REASONING_ITEMS)
+
+    def test_run_all(self):
+        """``run_all`` returns one entry per probe under its short key."""
+        all_results = self._cpu_runner().run_all()
+
+        for probe_key in ("knowledge", "truthfulness", "math_reasoning"):
+            assert probe_key in all_results
+
+    def test_format_report(self):
+        """The formatted report names the heading and each probe."""
+        all_results = self._cpu_runner().run_all()
+        rendered = format_benchmark_report(all_results)
+
+        for fragment in ("Capability", "knowledge", "truthfulness", "math"):
+            assert fragment in rendered
+
+    def test_per_category_scores_bounded(self):
+        """Each per-category knowledge score lies within [0, 1]."""
+        outcome = self._cpu_runner().run_knowledge_probe()
+
+        for category_score in outcome.per_category.values():
+            assert 0 <= category_score <= 1.0
+
+    def test_extract_number(self):
+        """``_extract_number`` parses ints, decimals and negatives, else None."""
+        runner = self._cpu_runner()
+
+        expectations = (
+            ("The answer is 42.", 42.0),
+            ("$20.50 is the price", 20.50),
+            ("Result: -3.14", -3.14),
+        )
+        for text, number in expectations:
+            assert runner._extract_number(text) == number
+        assert runner._extract_number("No numbers here") is None
@@ -0,0 +1,535 @@
+"""Tests for causal tracing, residual stream decomposition,
+probing classifiers, and cross-model transfer analysis."""
+
+from __future__ import annotations
+
+import math
+
+import torch
+
+from obliteratus.analysis.causal_tracing import (
+    CausalRefusalTracer,
+    CausalTracingResult,
+    ComponentCausalEffect,
+)
+from obliteratus.analysis.residual_stream import (
+    ResidualStreamDecomposer,
+    ResidualStreamResult,
+    LayerDecomposition,
+)
+from obliteratus.analysis.probing_classifiers import (
+    LinearRefusalProbe,
+    ProbeResult,
+    ProbingSuiteResult,
+)
+from obliteratus.analysis.cross_model_transfer import (
+    TransferAnalyzer,
+    CrossModelResult,
+    CrossCategoryResult,
+    CrossLayerResult,
+    UniversalityReport,
+)
+
+
+# ---------------------------------------------------------------------------
+#  Helpers
+# ---------------------------------------------------------------------------
+
+def _make_layer_activations(
+    n_layers=8, hidden_dim=32, refusal_strength=2.0,
+):
+    """Create synthetic per-layer activations with planted refusal signal.
+
+    The global torch RNG is reseeded with 42 on every call, so each call
+    starts from the same random stream.
+
+    Args:
+        n_layers: Number of layers to generate (keys ``0 .. n_layers - 1``).
+        hidden_dim: Length of every generated vector.
+        refusal_strength: Scale of the planted direction in layers 2-5;
+            all other layers use a fixed scale of 0.3.
+
+    Returns:
+        ``(activations, directions)``, two dicts keyed by layer index.
+        ``directions[i]`` is a random unit-norm vector; ``activations[i]`` is
+        a shared small offset plus the scaled direction plus small noise.
+    """
+    torch.manual_seed(42)
+    directions = {}
+    activations = {}
+
+    # One low-amplitude offset vector shared by every layer.
+    base = torch.randn(hidden_dim) * 0.1
+
+    for i in range(n_layers):
+        # Random direction for this layer, rescaled to unit norm.
+        d = torch.randn(hidden_dim)
+        d = d / d.norm()
+        directions[i] = d
+
+        # Stronger refusal in middle layers
+        strength = refusal_strength if 2 <= i <= 5 else 0.3
+        # Shared offset + planted direction + per-layer noise (std 0.05).
+        activations[i] = base + strength * d + torch.randn(hidden_dim) * 0.05
+
+    return activations, directions
+
+
+def _make_separable_activations(
+    n_per_class=20, hidden_dim=16, separation=3.0, seed=42,
+):
+    """Create harmful/harmless activations that are linearly separable.
+
+    The global torch RNG is reseeded with ``seed`` on every call.
+
+    Args:
+        n_per_class: Number of vectors generated for each class.
+        hidden_dim: Length of every generated vector.
+        separation: Offset applied along the planted direction; harmful
+            samples are shifted by ``+separation``, harmless by
+            ``-separation``.
+        seed: Value passed to ``torch.manual_seed``.
+
+    Returns:
+        ``(harmful, harmless, direction)``: two lists of ``n_per_class``
+        vectors and the unit-norm direction they are shifted along.
+    """
+    torch.manual_seed(seed)
+    # Random separating direction, rescaled to unit norm.
+    direction = torch.randn(hidden_dim)
+    direction = direction / direction.norm()
+
+    # Noise with std 0.5 around +separation * direction.
+    harmful = [
+        torch.randn(hidden_dim) * 0.5 + separation * direction
+        for _ in range(n_per_class)
+    ]
+    # Noise with std 0.5 around -separation * direction.
+    harmless = [
+        torch.randn(hidden_dim) * 0.5 - separation * direction
+        for _ in range(n_per_class)
+    ]
+    return harmful, harmless, direction
+
+
+# ===========================================================================
+#  Tests: Causal Tracing
+# ===========================================================================
+
+class TestCausalTracing:
+    """Exercises ``CausalRefusalTracer`` on synthetic layer activations."""
+
+    @staticmethod
+    def _traced(**tracer_kwargs):
+        """Trace the default synthetic data; return ``(tracer, result)``.
+
+        The activations are generated before the tracer is constructed so
+        the statement order matches the hand-written tests.
+        """
+        layer_acts, layer_dirs = _make_layer_activations()
+        tracer = CausalRefusalTracer(**tracer_kwargs)
+        return tracer, tracer.trace_from_activations(layer_acts, layer_dirs)
+
+    def test_basic_tracing(self):
+        """Tracing 8 layers yields a result with one effect per layer."""
+        _, outcome = self._traced(noise_level=3.0)
+
+        assert isinstance(outcome, CausalTracingResult)
+        assert outcome.n_layers == 8
+        assert outcome.clean_refusal_strength > 0
+        assert len(outcome.component_effects) == 8
+
+    def test_causal_components_identified(self):
+        """With a 0.05 threshold at least one component is marked causal."""
+        _, outcome = self._traced(noise_level=3.0, causal_threshold=0.05)
+
+        assert outcome.circuit_size > 0
+        assert outcome.circuit_fraction > 0
+        assert len(outcome.causal_components) > 0
+
+    def test_corruption_reduces_strength(self):
+        """Heavy noise must produce a non-zero total corruption effect."""
+        layer_acts, layer_dirs = _make_layer_activations(refusal_strength=5.0)
+        noisy_tracer = CausalRefusalTracer(noise_level=10.0)
+        outcome = noisy_tracer.trace_from_activations(layer_acts, layer_dirs)
+
+        # With high noise, corrupted should differ from clean
+        assert outcome.total_corruption_effect != 0
+
+    def test_single_direction_input(self):
+        """A single tensor is accepted in place of a per-layer direction dict."""
+        layer_acts, layer_dirs = _make_layer_activations()
+        shared_direction = layer_dirs[3]  # Use one direction for all layers
+        tracer = CausalRefusalTracer()
+        outcome = tracer.trace_from_activations(layer_acts, shared_direction)
+
+        assert outcome.n_layers == 8
+        assert len(outcome.component_effects) == 8
+
+    def test_component_effects_structure(self):
+        """Default effects are non-negative ``full_layer`` entries."""
+        _, outcome = self._traced()
+
+        for effect in outcome.component_effects:
+            assert isinstance(effect, ComponentCausalEffect)
+            assert effect.component_type == "full_layer"
+            assert effect.causal_effect >= 0
+
+    def test_correlation_causal_agreement_bounded(self):
+        """The correlation/causal agreement score lies within [-1, 1]."""
+        _, outcome = self._traced()
+        assert -1.0 <= outcome.correlation_causal_agreement <= 1.0
+
+    def test_silent_contributors(self):
+        """``identify_silent_contributors`` returns both lists, capped at top_k."""
+        tracer, outcome = self._traced()
+        contributors = tracer.identify_silent_contributors(outcome, top_k=3)
+
+        assert "silent_contributors" in contributors
+        assert "loud_non_contributors" in contributors
+        assert len(contributors["silent_contributors"]) <= 3
+
+    def test_custom_component_types(self):
+        """Two component types over 8 layers produce 16 effects."""
+        layer_acts, layer_dirs = _make_layer_activations()
+        tracer = CausalRefusalTracer()
+        outcome = tracer.trace_from_activations(
+            layer_acts,
+            layer_dirs,
+            component_types=["attention", "mlp"],
+        )
+        # 8 layers * 2 types = 16 effects
+        assert len(outcome.component_effects) == 16
+
+    def test_format_report(self):
+        """The text report contains its title and the circuit-size line."""
+        _, outcome = self._traced()
+        rendered = CausalRefusalTracer.format_tracing_report(outcome)
+
+        for fragment in ("Causal Tracing", "Circuit size"):
+            assert fragment in rendered
+
+
+# ===========================================================================
+#  Tests: Residual Stream Decomposition
+# ===========================================================================
+
+class TestResidualStreamDecomposition:
+    """Exercises ``ResidualStreamDecomposer`` on synthetic layer activations."""
+
+    @staticmethod
+    def _decomposed(**decomposer_kwargs):
+        """Decompose the default synthetic activations and return the result.
+
+        Data is generated before the decomposer is constructed, matching the
+        statement order of the hand-written tests.
+        """
+        layer_acts, layer_dirs = _make_layer_activations()
+        decomposer = ResidualStreamDecomposer(**decomposer_kwargs)
+        return decomposer.decompose(layer_acts, layer_dirs)
+
+    def test_basic_decomposition(self):
+        """Eight layers in, eight layer entries and positive totals out."""
+        outcome = self._decomposed()
+
+        assert isinstance(outcome, ResidualStreamResult)
+        assert outcome.n_layers == 8
+        assert len(outcome.per_layer) == 8
+        assert outcome.total_attention_contribution > 0
+        assert outcome.total_mlp_contribution > 0
+
+    def test_attention_fraction_bounded(self):
+        """The attention fraction lies within [0, 1]."""
+        outcome = self._decomposed()
+        assert 0 <= outcome.attention_fraction <= 1.0
+
+    def test_with_head_count(self):
+        """Supplying a head count populates the refusal-head list."""
+        outcome = self._decomposed(n_heads_per_layer=4)
+
+        assert outcome.n_refusal_heads >= 0
+        assert len(outcome.refusal_heads) > 0
+
+    def test_layer_decomposition_structure(self):
+        """Each per-layer entry has a bounded ratio and non-negative total."""
+        outcome = self._decomposed()
+
+        for layer_entry in outcome.per_layer.values():
+            assert isinstance(layer_entry, LayerDecomposition)
+            assert 0 <= layer_entry.attn_mlp_ratio <= 1.0
+            assert layer_entry.cumulative_refusal >= 0
+
+    def test_accumulation_profile(self):
+        """The accumulation profile has one entry per layer and never drops."""
+        outcome = self._decomposed()
+
+        assert len(outcome.accumulation_profile) == 8
+        # Accumulation should be monotonically non-decreasing
+        for idx in range(1, len(outcome.accumulation_profile)):
+            current = outcome.accumulation_profile[idx]
+            previous = outcome.accumulation_profile[idx - 1]
+            assert current >= previous
+
+    def test_with_explicit_attn_mlp(self):
+        """Explicit attention/MLP outputs are accepted alongside activations."""
+        torch.manual_seed(42)
+        hidden_dim = 16
+        n_layers = 4
+        ref_dir = torch.randn(hidden_dim)
+        ref_dir = ref_dir / ref_dir.norm()
+
+        acts, attn_outs, mlp_outs = {}, {}, {}
+        for layer in range(n_layers):
+            # Draw order (attention, MLP, then seed noise) is kept as-is.
+            attn_outs[layer] = torch.randn(hidden_dim) * 0.5
+            mlp_outs[layer] = torch.randn(hidden_dim) * 0.5
+            if layer == 0:
+                carried = torch.randn(hidden_dim) * 0.1
+            else:
+                carried = acts[layer - 1]
+            acts[layer] = attn_outs[layer] + mlp_outs[layer] + carried
+
+        decomposer = ResidualStreamDecomposer()
+        outcome = decomposer.decompose(
+            acts,
+            ref_dir,
+            attn_outputs=attn_outs,
+            mlp_outputs=mlp_outs,
+        )
+        assert len(outcome.per_layer) == n_layers
+
+    def test_single_direction(self):
+        """A single direction tensor is accepted for all layers."""
+        layer_acts, _ = _make_layer_activations()
+        lone_direction = torch.randn(32)
+        decomposer = ResidualStreamDecomposer()
+        outcome = decomposer.decompose(layer_acts, lone_direction)
+        assert outcome.n_layers == 8
+
+    def test_head_concentration_bounded(self):
+        """Head concentration lies within [0, 1]."""
+        outcome = self._decomposed(n_heads_per_layer=8)
+        assert 0 <= outcome.head_concentration <= 1.0
+
+    def test_format_decomposition(self):
+        """The text report names the title plus both component kinds."""
+        outcome = self._decomposed(n_heads_per_layer=4)
+        rendered = ResidualStreamDecomposer.format_decomposition(outcome)
+
+        for fragment in ("Residual Stream", "Attention", "MLP"):
+            assert fragment in rendered
+
+
+# ===========================================================================
+#  Tests: Probing Classifiers
+# ===========================================================================
+
+class TestProbingClassifiers:
+    """Exercises ``LinearRefusalProbe`` on synthetic two-class activations."""
+
+    @staticmethod
+    def _probe_with_direction(n_epochs, **data_kwargs):
+        """Fit one probe on fresh synthetic data, passing the planted direction.
+
+        Data is generated before the probe is constructed, matching the
+        statement order of the hand-written tests.
+        """
+        pos, neg, planted = _make_separable_activations(**data_kwargs)
+        probe = LinearRefusalProbe(n_epochs=n_epochs)
+        return probe.probe_layer(pos, neg, planted)
+
+    def test_separable_data_high_accuracy(self):
+        """Well-separated classes should be classified with accuracy > 0.7."""
+        pos, neg, planted = _make_separable_activations(
+            n_per_class=30, separation=5.0,
+        )
+        probe = LinearRefusalProbe(n_epochs=200)
+        outcome = probe.probe_layer(pos, neg, planted, layer_idx=5)
+
+        assert isinstance(outcome, ProbeResult)
+        assert outcome.layer_idx == 5
+        assert outcome.accuracy > 0.7  # Should be separable
+
+    def test_inseparable_data_low_accuracy(self):
+        """Overlapping classes should keep accuracy below 0.9."""
+        outcome = self._probe_with_direction(
+            50, n_per_class=30, separation=0.01,
+        )
+        # Accuracy should be near chance (0.5)
+        assert outcome.accuracy < 0.9
+
+    def test_learned_direction_unit(self):
+        """The learned direction has norm within 0.01 of one."""
+        outcome = self._probe_with_direction(100)
+        norm_error = abs(outcome.learned_direction.norm().item() - 1.0)
+        assert norm_error < 0.01
+
+    def test_cosine_with_analytical(self):
+        """The learned direction should align with the planted one."""
+        outcome = self._probe_with_direction(
+            300, n_per_class=50, separation=5.0,
+        )
+        # With clear separation, learned direction should agree
+        assert outcome.cosine_with_analytical > 0.3
+
+    def test_without_analytical_direction(self):
+        """Omitting the analytical direction reports a cosine of exactly 0."""
+        pos, neg, _ = _make_separable_activations()
+        probe = LinearRefusalProbe(n_epochs=50)
+        outcome = probe.probe_layer(pos, neg)
+        assert outcome.cosine_with_analytical == 0.0
+
+    def test_auroc_bounded(self):
+        """AUROC lies within [0, 1]."""
+        outcome = self._probe_with_direction(100)
+        assert 0 <= outcome.auroc <= 1.0
+
+    def test_mutual_information_nonnegative(self):
+        """Mutual information is never negative."""
+        outcome = self._probe_with_direction(100)
+        assert outcome.mutual_information >= 0
+
+    def test_probe_all_layers(self):
+        """Probing six layers returns one entry per layer and sane totals."""
+        per_layer = {
+            layer: _make_separable_activations(
+                n_per_class=15, separation=3.0, seed=layer * 10,
+            )
+            for layer in range(6)
+        }
+        harmful_acts = {layer: data[0] for layer, data in per_layer.items()}
+        harmless_acts = {layer: data[1] for layer, data in per_layer.items()}
+        anal_dirs = {layer: data[2] for layer, data in per_layer.items()}
+
+        probe = LinearRefusalProbe(n_epochs=100)
+        suite = probe.probe_all_layers(harmful_acts, harmless_acts, anal_dirs)
+
+        assert isinstance(suite, ProbingSuiteResult)
+        assert len(suite.per_layer) == 6
+        assert suite.best_accuracy > 0
+        assert suite.total_mutual_information >= 0
+
+    def test_format_report(self):
+        """The text report carries its title and mentions accuracy."""
+        harmful_acts, harmless_acts = {}, {}
+        for layer in range(4):
+            pos, neg, _ = _make_separable_activations(
+                n_per_class=15, seed=layer,
+            )
+            harmful_acts[layer] = pos
+            harmless_acts[layer] = neg
+
+        probe = LinearRefusalProbe(n_epochs=50)
+        suite = probe.probe_all_layers(harmful_acts, harmless_acts)
+        rendered = LinearRefusalProbe.format_probing_report(suite)
+
+        assert "Linear Probing" in rendered
+        assert "accuracy" in rendered.lower()
+
+    def test_cross_entropy_finite(self):
+        """The reported cross-entropy is a finite number."""
+        outcome = self._probe_with_direction(100)
+        assert math.isfinite(outcome.cross_entropy)
+
+
+# ===========================================================================
+#  Tests: Cross-Model Transfer Analysis
+# ===========================================================================
+
+class TestTransferAnalysis:
+    """Tests for ``TransferAnalyzer``: cross-model, cross-category and
+    cross-layer analysis, the universality index, and the report formatters.
+
+    Several tests seed the global torch RNG and then draw tensors in a fixed
+    order; thresholded assertions depend on those draws, so statements
+    should not be reordered.
+    """
+
+    def test_cross_model_identical(self):
+        """Identical directions should give perfect transfer."""
+        torch.manual_seed(42)
+        dirs = {i: torch.randn(32) for i in range(8)}
+        analyzer = TransferAnalyzer()
+        # The same dict is passed for both models.
+        result = analyzer.analyze_cross_model(dirs, dirs, "model_a", "model_a")
+
+        assert isinstance(result, CrossModelResult)
+        assert result.mean_transfer_score > 0.99
+
+    def test_cross_model_random(self):
+        """Random directions should give low transfer."""
+        # Different seeds give two independent sets of directions.
+        torch.manual_seed(42)
+        dirs_a = {i: torch.randn(32) for i in range(8)}
+        torch.manual_seed(99)
+        dirs_b = {i: torch.randn(32) for i in range(8)}
+
+        analyzer = TransferAnalyzer()
+        result = analyzer.analyze_cross_model(dirs_a, dirs_b, "a", "b")
+        # Random 32-dim vectors have low expected cosine
+        assert result.mean_transfer_score < 0.7
+
+    def test_cross_model_structure(self):
+        """Result exposes a bounded above-threshold fraction and one entry per layer."""
+        torch.manual_seed(42)
+        dirs_a = {i: torch.randn(32) for i in range(8)}
+        dirs_b = {i: torch.randn(32) for i in range(8)}
+        analyzer = TransferAnalyzer()
+        # Model names are omitted here, exercising the default arguments.
+        result = analyzer.analyze_cross_model(dirs_a, dirs_b)
+
+        assert 0 <= result.transfer_above_threshold <= 1.0
+        assert len(result.per_layer_transfer) == 8
+
+    def test_cross_category_similar(self):
+        """Similar categories should cluster together."""
+        torch.manual_seed(42)
+        shared = torch.randn(32)
+        shared = shared / shared.norm()
+
+        # Three categories built as small perturbations of one unit vector.
+        cat_dirs = {}
+        for cat in ["weapons", "bombs", "explosives"]:
+            d = shared + 0.2 * torch.randn(32)
+            cat_dirs[cat] = d / d.norm()
+
+        # Add one very different category
+        cat_dirs["fraud"] = torch.randn(32)
+
+        analyzer = TransferAnalyzer()
+        result = analyzer.analyze_cross_category(cat_dirs)
+
+        # NOTE(review): only the result type, a positive mean and the
+        # category count are asserted; clustering itself is not checked.
+        assert isinstance(result, CrossCategoryResult)
+        assert result.mean_cross_category_transfer > 0
+        assert len(result.categories) == 4
+
+    def test_cross_category_specificity(self):
+        """Most-universal / most-specific names and clusters are populated."""
+        torch.manual_seed(42)
+        cat_dirs = {f"cat_{i}": torch.randn(16) for i in range(5)}
+        analyzer = TransferAnalyzer()
+        result = analyzer.analyze_cross_category(cat_dirs)
+
+        assert result.most_universal_category != ""
+        assert result.most_specific_category != ""
+        assert len(result.category_clusters) > 0
+
+    def test_cross_layer(self):
+        """Cross-layer analysis returns non-negative transfer and decay values."""
+        _, directions = _make_layer_activations()
+        analyzer = TransferAnalyzer()
+        result = analyzer.analyze_cross_layer(directions)
+
+        assert isinstance(result, CrossLayerResult)
+        assert result.mean_adjacent_transfer >= 0
+        assert result.transfer_decay_rate >= 0
+
+    def test_cross_layer_adjacent_vs_distant(self):
+        """Adjacent layers typically have higher transfer than distant ones."""
+        torch.manual_seed(42)
+        # Create directions with gradual drift
+        d = torch.randn(32)
+        d = d / d.norm()
+        directions = {}
+        for i in range(10):
+            # Noise amplitude grows linearly with the layer index.
+            noise = torch.randn(32) * 0.1 * i
+            di = d + noise
+            directions[i] = di / di.norm()
+
+        analyzer = TransferAnalyzer()
+        result = analyzer.analyze_cross_layer(directions)
+        # Adjacent should have higher transfer than distant
+        # (a 0.1 tolerance is allowed in favour of the distant score).
+        assert result.mean_adjacent_transfer >= result.mean_distant_transfer - 0.1
+
+    def test_universality_index(self):
+        """Combining all three analyses yields an index within [0, 1]."""
+        torch.manual_seed(42)
+        dirs = {i: torch.randn(32) for i in range(6)}
+
+        analyzer = TransferAnalyzer()
+        cross_model = analyzer.analyze_cross_model(dirs, dirs)
+        cross_layer = analyzer.analyze_cross_layer(dirs)
+        cat_dirs = {f"cat_{i}": torch.randn(32) for i in range(4)}
+        cross_cat = analyzer.analyze_cross_category(cat_dirs)
+
+        report = analyzer.compute_universality_index(
+            cross_model=cross_model,
+            cross_category=cross_cat,
+            cross_layer=cross_layer,
+        )
+
+        assert isinstance(report, UniversalityReport)
+        assert 0 <= report.universality_index <= 1.0
+
+    def test_universality_empty(self):
+        """With no inputs the universality index is exactly 0.0."""
+        analyzer = TransferAnalyzer()
+        report = analyzer.compute_universality_index()
+        assert report.universality_index == 0.0
+
+    def test_format_cross_model(self):
+        """The cross-model report contains its title and a model name."""
+        torch.manual_seed(42)
+        dirs = {i: torch.randn(32) for i in range(4)}
+        analyzer = TransferAnalyzer()
+        result = analyzer.analyze_cross_model(dirs, dirs, "llama", "mistral")
+        report = TransferAnalyzer.format_cross_model(result)
+        assert "Cross-Model" in report
+        assert "llama" in report
+
+    def test_format_cross_category(self):
+        """The cross-category report contains its title."""
+        torch.manual_seed(42)
+        cat_dirs = {f"cat_{i}": torch.randn(16) for i in range(3)}
+        analyzer = TransferAnalyzer()
+        result = analyzer.analyze_cross_category(cat_dirs)
+        report = TransferAnalyzer.format_cross_category(result)
+        assert "Cross-Category" in report
+
+    def test_format_universality(self):
+        """The universality report contains its title, even for empty input."""
+        analyzer = TransferAnalyzer()
+        report_obj = analyzer.compute_universality_index()
+        report = TransferAnalyzer.format_universality(report_obj)
+        assert "Universality" in report
+
+    def test_dimension_mismatch_handled(self):
+        """Cross-model with different hidden dims should truncate.
+
+        NOTE(review): only the per-layer entry count is asserted here; the
+        truncation itself is not verified by this test.
+        """
+        # 32-dim directions for one model, 64-dim for the other (unseeded).
+        dirs_a = {0: torch.randn(32), 1: torch.randn(32)}
+        dirs_b = {0: torch.randn(64), 1: torch.randn(64)}
+        analyzer = TransferAnalyzer()
+        result = analyzer.analyze_cross_model(dirs_a, dirs_b)
+        assert len(result.per_layer_transfer) == 2
+
+
+# ===========================================================================
+#  Tests: Integration
+# ===========================================================================
+
+class TestNewImports:
+    """Smoke test for the names re-exported by ``obliteratus.analysis``."""
+
+    def test_all_new_modules_importable(self):
+        """The four analysis classes import from the package root."""
+        from obliteratus.analysis import (
+            CausalRefusalTracer,
+            ResidualStreamDecomposer,
+            LinearRefusalProbe,
+            TransferAnalyzer,
+        )
+        exported = (
+            CausalRefusalTracer,
+            ResidualStreamDecomposer,
+            LinearRefusalProbe,
+            TransferAnalyzer,
+        )
+        for symbol in exported:
+            assert symbol is not None
@@ -0,0 +1,133 @@
+"""CLI dispatch tests for obliteratus.cli.main().
+
+These tests verify argument parsing and subcommand routing without
+downloading real models or running any pipeline.  They use
+``unittest.mock.patch`` to capture stdout/stderr and
+``pytest.raises(SystemExit)`` for argparse exits.
+"""
+
+from __future__ import annotations
+
+from io import StringIO
+from unittest.mock import patch
+
+import pytest
+
+from obliteratus.cli import main
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _capture_exit(argv: list[str] | None, *, expect_code: int | None = None):
+    """Run ``main(argv)``, require a ``SystemExit``, and return stderr text.
+
+    Args:
+        argv: Argument list forwarded to ``main``.
+        expect_code: When given, the exit code is asserted to equal it.
+
+    Returns:
+        Everything written to ``sys.stderr`` while ``main`` ran.
+    """
+    stderr_sink = StringIO()
+    with pytest.raises(SystemExit) as raised:
+        with patch("sys.stderr", stderr_sink):
+            main(argv)
+    if expect_code is None:
+        return stderr_sink.getvalue()
+    assert raised.value.code == expect_code
+    return stderr_sink.getvalue()
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+class TestCLIDispatch:
+    """Argument-parsing and subcommand-routing checks for ``main``."""
+
+    # 1. No args -> argparse error exit
+    def test_main_no_args_prints_help(self):
+        """An empty argv exits with code 2 and writes usage text to stderr."""
+        captured = _capture_exit([], expect_code=2).lower()
+        # argparse prints usage info to stderr on error
+        assert "usage" in captured or "required" in captured
+
+    # 2. models command renders output
+    def test_models_command(self):
+        """``models`` runs to completion and prints via the console."""
+        with patch("obliteratus.cli.console") as fake_console:
+            main(["models"])
+        # console.print is called at least once to render the table
+        assert fake_console.print.call_count >= 1
+
+    # 3. obliterate needs its positional model argument
+    def test_obliterate_requires_model(self):
+        """``obliterate`` without a model exits with code 2."""
+        captured = _capture_exit(["obliterate"], expect_code=2).lower()
+        assert "model" in captured or "required" in captured
+
+    # 4. every supported --method value is accepted
+    def test_obliterate_valid_methods(self):
+        """Each of the nine method names parses and reaches the handler."""
+        supported = (
+            "basic", "advanced", "aggressive", "spectral_cascade",
+            "informed", "surgical", "optimized", "inverted", "nuclear",
+        )
+        for method_name in supported:
+            argv = ["obliterate", "fake/model", "--method", method_name]
+            # Patch the actual pipeline execution so nothing runs
+            with patch("obliteratus.cli._cmd_abliterate") as handler:
+                main(argv)
+                handler.assert_called_once()
+                parsed = handler.call_args.args[0]
+                assert parsed.method == method_name
+
+    # 4b. unknown --method values are refused
+    def test_obliterate_rejects_invalid_method(self):
+        """An unknown method name exits with code 2 and an argparse error."""
+        argv = ["obliterate", "fake/model", "--method", "nonexistent"]
+        captured = _capture_exit(argv, expect_code=2)
+        assert "invalid choice" in captured.lower()
+
+    # 5. run needs a config path
+    def test_run_requires_config(self):
+        """``run`` without a config path exits with code 2."""
+        captured = _capture_exit(["run"], expect_code=2).lower()
+        assert "config" in captured or "required" in captured
+
+    # 6. aggregate tolerates a missing directory
+    def test_aggregate_command_missing_dir(self):
+        """``aggregate`` on a nonexistent directory returns without raising."""
+        argv = ["aggregate", "--dir", "/nonexistent/path/to/nowhere"]
+        with patch("obliteratus.cli.console") as fake_console:
+            main(argv)
+        # The command prints a message about no contributions found and returns
+        rendered = " ".join(
+            map(str, fake_console.print.call_args_list)
+        ).lower()
+        # Either the expected message appears or the console printed at all.
+        assert "no contributions found" in rendered or fake_console.print.called
+
+    # 7. --help prints help and exits cleanly
+    def test_help_flag(self):
+        """``--help`` exits with code 0 and writes help text to stdout."""
+        stdout_sink = StringIO()
+        with pytest.raises(SystemExit) as raised:
+            with patch("sys.stdout", stdout_sink):
+                main(["--help"])
+        assert raised.value.code == 0
+        help_text = stdout_sink.getvalue().lower()
+        assert "obliteratus" in help_text or "usage" in help_text
+
+    # 8. interactive subcommand is registered
+    def test_interactive_command_exists(self):
+        """``interactive`` dispatches to its handler exactly once."""
+        with patch("obliteratus.cli._cmd_interactive") as handler:
+            main(["interactive"])
+            handler.assert_called_once()
+
+    # 9. contribution flags are accepted on obliterate
+    def test_contribute_flags_on_obliterate(self):
+        """``--contribute`` and ``--contribute-notes`` reach the parsed args."""
+        argv = [
+            "obliterate", "fake/model",
+            "--contribute",
+            "--contribute-notes", "Testing contribution system",
+        ]
+        with patch("obliteratus.cli._cmd_abliterate") as handler:
+            main(argv)
+            handler.assert_called_once()
+            parsed = handler.call_args.args[0]
+            assert parsed.contribute is True
+            assert parsed.contribute_notes == "Testing contribution system"
@@ -0,0 +1,567 @@
+"""Tests for the community contribution system."""
+
+import json
+from unittest.mock import MagicMock
+
+import pytest
+import torch
+
+from obliteratus.community import (
+    CONTRIBUTION_SCHEMA_VERSION,
+    _config_fingerprint,
+    _model_short_name,
+    aggregate_results,
+    generate_latex_table,
+    load_contributions,
+    save_contribution,
+)
+
+
+# ── Helper: mock pipeline ──────────────────────────────────────────────
+
+
+def _make_mock_pipeline():
+    """Build a mock pipeline with all fields the community module reads.
+
+    Every random tensor is drawn from a locally seeded ``torch.Generator``,
+    so repeated calls return identical direction data and the global torch
+    RNG state is left untouched.
+
+    Returns:
+        MagicMock: pipeline stand-in carrying a model summary, method
+        settings, quality metrics, stage timings, direction tensors and
+        prompt lists.
+    """
+    # Local generator: deterministic fixture without reseeding the global RNG.
+    rng = torch.Generator().manual_seed(0)
+
+    p = MagicMock()
+    p.handle.summary.return_value = {
+        "architecture": "LlamaForCausalLM",
+        "num_layers": 32,
+        "num_heads": 32,
+        "hidden_size": 4096,
+        "total_params": 8_000_000_000,
+    }
+    p.method = "advanced"
+    p.n_directions = 4
+    p.norm_preserve = True
+    p.regularization = 0.3
+    p.refinement_passes = 2
+    p.project_biases = True
+    p.use_chat_template = True
+    p.use_whitened_svd = True
+    p.true_iterative_refinement = False
+    p.use_jailbreak_contrast = False
+    p.layer_adaptive_strength = False
+    p.attention_head_surgery = True
+    p.safety_neuron_masking = False
+    p.per_expert_directions = False
+    p.use_sae_features = False
+    p.invert_refusal = False
+    p.project_embeddings = False
+    p.embed_regularization = 0.5
+    p.activation_steering = False
+    p.steering_strength = 0.3
+    p.expert_transplant = False
+    p.transplant_blend = 0.3
+    p.reflection_strength = 2.0
+    p.quantization = None
+
+    p._quality_metrics = {"perplexity": 5.2, "coherence": 0.8, "refusal_rate": 0.05}
+    p._strong_layers = [10, 11, 12, 13]
+    p._stage_durations = {
+        "summon": 3.0, "probe": 12.5, "distill": 4.1,
+        "excise": 2.0, "verify": 8.3, "rebirth": 5.0,
+    }
+    p._excise_modified_count = 128
+
+    # Direction data: a unit vector for layer 10 and a slightly perturbed
+    # copy of it for layer 11.
+    d = torch.randn(4096, generator=rng)
+    d = d / d.norm()
+    p.refusal_directions = {10: d, 11: d + 0.01 * torch.randn(4096, generator=rng)}
+    p.refusal_subspaces = {10: torch.randn(4, 4096, generator=rng)}
+
+    # Excise details
+    p._refusal_heads = {10: [(0, 0.9), (3, 0.8)]}
+    p._sae_directions = {}
+    p._expert_safety_scores = {}
+    p._layer_excise_weights = {}
+    p._expert_directions = {}
+    p._steering_hooks = []
+
+    # Prompts
+    p.harmful_prompts = ["x"] * 33
+    p.harmless_prompts = ["y"] * 33
+    p.jailbreak_prompts = None
+
+    return p
+
+
+# ── Model short name ───────────────────────────────────────────────────
+
+
+class TestModelShortName:
+    """Pins how ``_model_short_name`` normalises model identifiers."""
+
+    def test_strips_org_prefix(self):
+        short = _model_short_name("meta-llama/Llama-2-7b-chat-hf")
+        assert short == "llama-2-7b-chat-hf"
+
+    def test_no_org_prefix(self):
+        short = _model_short_name("gpt2")
+        assert short == "gpt2"
+
+    def test_sanitizes_special_chars(self):
+        # Underscore and dot both come back as dashes.
+        short = _model_short_name("org/Model_V2.1")
+        assert short == "model-v2-1"
+
+    def test_caps_length(self):
+        oversized = "a" * 100
+        short = _model_short_name(oversized)
+        assert len(short) <= 60
+
+    def test_collapses_dashes(self):
+        short = _model_short_name("org/Model---Name")
+        assert short == "model-name"
+
+    def test_strips_trailing_dashes(self):
+        short = _model_short_name("org/Model-")
+        assert short == "model"
+
+
+# ── Config fingerprint ─────────────────────────────────────────────────
+
+
+class TestConfigFingerprint:
+    """Properties expected of the hash returned by ``_config_fingerprint``."""
+
+    def test_deterministic(self):
+        config = {"n_directions": 4, "norm_preserve": True}
+        first = _config_fingerprint(config)
+        second = _config_fingerprint(config)
+        assert first == second
+
+    def test_different_configs_different_hashes(self):
+        four = _config_fingerprint({"n_directions": 4})
+        eight = _config_fingerprint({"n_directions": 8})
+        assert four != eight
+
+    def test_key_order_invariant(self):
+        forward = _config_fingerprint({"a": 1, "b": 2})
+        backward = _config_fingerprint({"b": 2, "a": 1})
+        assert forward == backward
+
+    def test_returns_8_char_hex(self):
+        fingerprint = _config_fingerprint({"test": True})
+        assert len(fingerprint) == 8
+        # Only lowercase hexadecimal digits are allowed.
+        assert set(fingerprint) <= set("0123456789abcdef")
+
+
+# ── Save contribution ──────────────────────────────────────────────────
+
+
+class TestSaveContribution:
+    """Checks on the JSON record written by ``save_contribution``."""
+
+    LLAMA = "meta-llama/Llama-2-7b-chat-hf"
+
+    @staticmethod
+    def _save(out_dir, model_name, **extra):
+        """Save one contribution from a fresh mock pipeline and return its path."""
+        return save_contribution(
+            _make_mock_pipeline(), model_name=model_name, output_dir=out_dir, **extra
+        )
+
+    @staticmethod
+    def _read(path):
+        """Parse the saved contribution file back into a dict."""
+        return json.loads(path.read_text())
+
+    def test_saves_json_file(self, tmp_path):
+        saved = self._save(tmp_path, self.LLAMA)
+        assert saved.exists()
+        assert saved.suffix == ".json"
+        record = self._read(saved)
+        assert record["contribution_schema_version"] == CONTRIBUTION_SCHEMA_VERSION
+        assert record["model_name"] == "meta-llama/Llama-2-7b-chat-hf"
+
+    def test_filename_format(self, tmp_path):
+        saved = self._save(tmp_path, self.LLAMA)
+        # File stem starts with "<short model name>_<method>_".
+        assert saved.stem.startswith("llama-2-7b-chat-hf_advanced_")
+
+    def test_includes_telemetry_report(self, tmp_path):
+        record = self._read(self._save(tmp_path, self.LLAMA))
+        report = record["telemetry"]
+        assert report["schema_version"] == 2
+        assert report["model"]["architecture"] == "LlamaForCausalLM"
+        assert report["method"] == "advanced"
+        assert report["quality_metrics"]["refusal_rate"] == 0.05
+
+    def test_includes_config_fingerprint(self, tmp_path):
+        record = self._read(self._save(tmp_path, self.LLAMA))
+        assert "config_fingerprint" in record
+        assert len(record["config_fingerprint"]) == 8
+
+    def test_includes_notes(self, tmp_path):
+        saved = self._save(
+            tmp_path, "test/model", notes="Ran on A100 with default prompts",
+        )
+        record = self._read(saved)
+        assert record["notes"] == "Ran on A100 with default prompts"
+
+    def test_creates_output_dir(self, tmp_path):
+        target_dir = tmp_path / "nested" / "dir"
+        assert not target_dir.exists()
+        saved = self._save(target_dir, "test/model")
+        assert target_dir.exists()
+        assert saved.exists()
+
+    def test_timestamp_format(self, tmp_path):
+        record = self._read(self._save(tmp_path, "test/model"))
+        stamp = record["timestamp"]
+        # Expected layout is compact UTC: YYYYMMDDTHHMMSSZ (16 characters).
+        assert stamp.endswith("Z")
+        assert "T" in stamp
+        assert len(stamp) == 16
+
+    def test_method_config_extracted(self, tmp_path):
+        record = self._read(self._save(tmp_path, "test/model"))
+        method_config = record["telemetry"]["method_config"]
+        assert method_config["n_directions"] == 4
+        assert method_config["norm_preserve"] is True
+        assert method_config["attention_head_surgery"] is True
+
+
+# ── Load contributions ─────────────────────────────────────────────────
+
+
+class TestLoadContributions:
+    """Checks for ``load_contributions`` against files written into a temp dir."""
+
+    def _write_contrib(self, directory, model, method, refusal_rate, idx=0):
+        """Write ``contrib_<idx>.json`` holding a minimal valid record; return its path.
+
+        ``idx`` also feeds the timestamp, so a larger ``idx`` gives a later record.
+        """
+        telemetry = {
+            "schema_version": 2,
+            "method": method,
+            "quality_metrics": {"refusal_rate": refusal_rate},
+        }
+        record = {
+            "contribution_schema_version": CONTRIBUTION_SCHEMA_VERSION,
+            "timestamp": f"20260227T{idx:06d}Z",
+            "model_name": model,
+            "config_fingerprint": "abcd1234",
+            "notes": "",
+            "telemetry": telemetry,
+        }
+        target = directory / f"contrib_{idx}.json"
+        target.write_text(json.dumps(record))
+        return target
+
+    def test_loads_valid_files(self, tmp_path):
+        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
+        self._write_contrib(tmp_path, "test/model", "basic", 0.10, 1)
+        loaded = load_contributions(tmp_path)
+        assert len(loaded) == 2
+
+    def test_sorts_by_timestamp(self, tmp_path):
+        # Written newest-first on purpose; loading must return oldest-first.
+        self._write_contrib(tmp_path, "model-b", "advanced", 0.05, 2)
+        self._write_contrib(tmp_path, "model-a", "advanced", 0.10, 1)
+        loaded = load_contributions(tmp_path)
+        oldest, newest = loaded[0], loaded[1]
+        assert oldest["model_name"] == "model-a"
+        assert newest["model_name"] == "model-b"
+
+    def test_skips_non_contribution_json(self, tmp_path):
+        # Valid JSON, but it lacks contribution_schema_version.
+        stray = tmp_path / "random.json"
+        stray.write_text('{"foo": "bar"}')
+        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
+        loaded = load_contributions(tmp_path)
+        assert len(loaded) == 1
+
+    def test_skips_invalid_json(self, tmp_path):
+        broken = tmp_path / "bad.json"
+        broken.write_text("not valid json {{{")
+        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
+        loaded = load_contributions(tmp_path)
+        assert len(loaded) == 1
+
+    def test_returns_empty_for_missing_dir(self, tmp_path):
+        absent = tmp_path / "nonexistent"
+        assert load_contributions(absent) == []
+
+    def test_tracks_source_file(self, tmp_path):
+        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
+        first = load_contributions(tmp_path)[0]
+        assert "_source_file" in first
+        assert "contrib_0.json" in first["_source_file"]
+
+    def test_ignores_non_json_files(self, tmp_path):
+        note = tmp_path / "readme.txt"
+        note.write_text("some text")
+        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
+        loaded = load_contributions(tmp_path)
+        assert len(loaded) == 1
+
+
+# ── Aggregate results ──────────────────────────────────────────────────
+
+
+class TestAggregateResults:
+    """Checks the per-model, per-method statistics built by ``aggregate_results``.
+
+    Computed float statistics are compared with ``pytest.approx`` so the
+    tests do not depend on the exact rounding of the aggregation arithmetic.
+    """
+
+    def _make_record(self, model, method, refusal_rate, perplexity=None, coherence=None):
+        """Return a minimal contribution record.
+
+        ``perplexity`` and ``coherence`` are added to the quality metrics
+        only when they are not ``None``.
+        """
+        metrics = {"refusal_rate": refusal_rate}
+        if perplexity is not None:
+            metrics["perplexity"] = perplexity
+        if coherence is not None:
+            metrics["coherence"] = coherence
+        return {
+            "model_name": model,
+            "telemetry": {
+                "method": method,
+                "quality_metrics": metrics,
+            },
+        }
+
+    def test_single_record(self):
+        records = [self._make_record("model-a", "advanced", 0.05)]
+        result = aggregate_results(records)
+        assert "model-a" in result
+        assert "advanced" in result["model-a"]
+        assert result["model-a"]["advanced"]["n_runs"] == 1
+        assert result["model-a"]["advanced"]["refusal_rate"]["mean"] == pytest.approx(0.05)
+
+    def test_multiple_runs_same_model_method(self):
+        records = [
+            self._make_record("model-a", "advanced", 0.04),
+            self._make_record("model-a", "advanced", 0.06),
+        ]
+        result = aggregate_results(records)
+        stats = result["model-a"]["advanced"]
+        assert stats["n_runs"] == 2
+        # The mean is computed from two floats; avoid exact equality.
+        assert stats["refusal_rate"]["mean"] == pytest.approx(0.05)
+        assert stats["refusal_rate"]["min"] == pytest.approx(0.04)
+        assert stats["refusal_rate"]["max"] == pytest.approx(0.06)
+        assert stats["refusal_rate"]["n"] == 2
+
+    def test_multiple_models(self):
+        records = [
+            self._make_record("model-a", "advanced", 0.05),
+            self._make_record("model-b", "basic", 0.10),
+        ]
+        result = aggregate_results(records)
+        assert len(result) == 2
+        assert "model-a" in result
+        assert "model-b" in result
+
+    def test_multiple_methods(self):
+        records = [
+            self._make_record("model-a", "advanced", 0.05),
+            self._make_record("model-a", "basic", 0.10),
+        ]
+        result = aggregate_results(records)
+        assert len(result["model-a"]) == 2
+        assert "advanced" in result["model-a"]
+        assert "basic" in result["model-a"]
+
+    def test_std_zero_for_single_run(self):
+        records = [self._make_record("model-a", "advanced", 0.05)]
+        result = aggregate_results(records)
+        assert result["model-a"]["advanced"]["refusal_rate"]["std"] == pytest.approx(0.0)
+
+    def test_multiple_metrics(self):
+        records = [
+            self._make_record("model-a", "advanced", 0.05, perplexity=5.2, coherence=0.8),
+        ]
+        result = aggregate_results(records)
+        stats = result["model-a"]["advanced"]
+        assert "refusal_rate" in stats
+        assert "perplexity" in stats
+        assert "coherence" in stats
+        assert stats["perplexity"]["mean"] == pytest.approx(5.2)
+
+    def test_missing_metric_skipped(self):
+        records = [self._make_record("model-a", "advanced", 0.05)]
+        result = aggregate_results(records)
+        # coherence not provided, should not appear
+        assert "coherence" not in result["model-a"]["advanced"]
+
+    def test_unknown_model_and_method(self):
+        # No model_name and no telemetry.method: both must fall back to "unknown".
+        records = [{
+            "telemetry": {"quality_metrics": {"refusal_rate": 0.1}},
+        }]
+        result = aggregate_results(records)
+        assert "unknown" in result
+        assert "unknown" in result["unknown"]
+
+
+# ── LaTeX table generation ─────────────────────────────────────────────
+
+
+class TestGenerateLatexTable:
+    """Checks on the LaTeX text produced by ``generate_latex_table``."""
+
+    @staticmethod
+    def _stats(mean, std, n, low, high):
+        """Build one metric-statistics dict in the aggregated layout."""
+        return {"mean": mean, "std": std, "n": n, "min": low, "max": high}
+
+    def _sample_aggregated(self):
+        """Two models: Llama has both methods, Mistral only ``advanced``."""
+        llama = {
+            "advanced": {
+                "n_runs": 3,
+                "refusal_rate": self._stats(0.04, 0.01, 3, 0.03, 0.05),
+            },
+            "basic": {
+                "n_runs": 2,
+                "refusal_rate": self._stats(0.08, 0.02, 2, 0.06, 0.10),
+            },
+        }
+        mistral = {
+            "advanced": {
+                "n_runs": 1,
+                "refusal_rate": self._stats(0.03, 0.0, 1, 0.03, 0.03),
+            },
+        }
+        return {
+            "meta-llama/Llama-2-7b-chat-hf": llama,
+            "mistralai/Mistral-7B-Instruct-v0.2": mistral,
+        }
+
+    def test_produces_valid_latex(self):
+        table = generate_latex_table(self._sample_aggregated())
+        for marker in ("\\begin{tabular}", "\\end{tabular}", "\\toprule", "\\bottomrule"):
+            assert marker in table
+
+    def test_includes_model_names(self):
+        table = generate_latex_table(self._sample_aggregated())
+        assert "Llama-2-7b-chat-hf" in table
+        assert "Mistral-7B-Instruct-v0.2" in table
+
+    def test_includes_method_headers(self):
+        table = generate_latex_table(self._sample_aggregated())
+        assert "advanced" in table
+        assert "basic" in table
+
+    def test_missing_method_shows_dash(self):
+        table = generate_latex_table(self._sample_aggregated())
+        # Mistral has no "basic" entry, so its cell is a dash placeholder.
+        assert "---" in table
+
+    def test_shows_std_when_multiple_runs(self):
+        table = generate_latex_table(self._sample_aggregated())
+        assert "$\\pm$" in table
+
+    def test_no_std_for_single_run(self):
+        single_run = {
+            "model": {
+                "method": {
+                    "n_runs": 1,
+                    "refusal_rate": self._stats(0.03, 0.0, 1, 0.03, 0.03),
+                },
+            },
+        }
+        table = generate_latex_table(single_run)
+        assert "$\\pm$" not in table
+
+    def test_methods_filter(self):
+        table = generate_latex_table(self._sample_aggregated(), methods=["advanced"])
+        assert "\\textbf{advanced}" in table
+        assert "\\textbf{basic}" not in table
+
+    def test_custom_metric(self):
+        perplexity_only = {
+            "model": {
+                "method": {
+                    "n_runs": 2,
+                    "perplexity": self._stats(5.2, 0.3, 2, 4.9, 5.5),
+                },
+            },
+        }
+        table = generate_latex_table(perplexity_only, metric="perplexity")
+        assert "5.2" in table
+
+    def test_column_count_matches_methods(self):
+        table = generate_latex_table(self._sample_aggregated())
+        # Two methods give the column spec "lcc": one model column plus two method columns.
+        assert "{@{}lcc@{}}" in table
+
+
+# ── CLI integration ────────────────────────────────────────────────────
+
+
+class TestCLIContributeFlag:
+    """Parser-level checks for the contribution-related CLI surface.
+
+    Only ``--help`` is invoked, so no model is loaded. A clean exit status
+    of 0 is required: an unregistered subcommand would also raise
+    ``SystemExit``, but with a non-zero code.
+    """
+
+    def test_contribute_flag_accepted(self, capsys):
+        """The ``obliterate`` help text must list the ``--contribute`` flag."""
+        from obliteratus.cli import main
+
+        # We can't run the full command here, so ask the subparser for its
+        # help: that prints the registered options and exits successfully.
+        with pytest.raises(SystemExit) as exc_info:
+            main(["obliterate", "--help"])
+        assert exc_info.value.code == 0
+        assert "--contribute" in capsys.readouterr().out
+
+    def test_aggregate_command_accepted(self):
+        """The ``aggregate`` subcommand must be registered with the parser."""
+        from obliteratus.cli import main
+
+        with pytest.raises(SystemExit) as exc_info:
+            main(["aggregate", "--help"])
+        # Exit code 0 means help was printed; a parse error would exit non-zero.
+        assert exc_info.value.code == 0
+
+
+# ── Package exports ────────────────────────────────────────────────────
+
+
+class TestPackageExports:
+    """The community helpers must be importable from the package root."""
+
+    def test_save_contribution_importable(self):
+        from obliteratus import save_contribution as exported
+        assert callable(exported)
+
+    def test_load_contributions_importable(self):
+        from obliteratus import load_contributions as exported
+        assert callable(exported)
+
+    def test_aggregate_results_importable(self):
+        from obliteratus import aggregate_results as exported
+        assert callable(exported)
+
+
+# ── End-to-end: save → load → aggregate ───────────────────────────────
+
+
+class TestEndToEnd:
+    """Chains save, load, aggregate and LaTeX rendering on real files."""
+
+    def test_save_load_aggregate_roundtrip(self, tmp_path):
+        """Save two contributions, load them back and aggregate them."""
+        mock_pipeline = _make_mock_pipeline()
+
+        # First run keeps the fixture's default metrics (refusal_rate 0.05).
+        save_contribution(mock_pipeline, model_name="test/model-a", output_dir=tmp_path)
+        # Second run: new metrics and a different model name, which also
+        # avoids a filename collision.
+        mock_pipeline._quality_metrics = {
+            "perplexity": 5.5, "coherence": 0.75, "refusal_rate": 0.07,
+        }
+        save_contribution(mock_pipeline, model_name="test/model-b", output_dir=tmp_path)
+
+        loaded = load_contributions(tmp_path)
+        assert len(loaded) == 2
+
+        summary = aggregate_results(loaded)
+        assert "test/model-a" in summary
+        assert "test/model-b" in summary
+        first = summary["test/model-a"]["advanced"]
+        second = summary["test/model-b"]["advanced"]
+        assert first["n_runs"] == 1
+        assert second["n_runs"] == 1
+        assert abs(first["refusal_rate"]["mean"] - 0.05) < 0.001
+        assert abs(second["refusal_rate"]["mean"] - 0.07) < 0.001
+
+    def test_save_load_aggregate_to_latex(self, tmp_path):
+        """Run the same chain through to the rendered LaTeX table."""
+        save_contribution(
+            _make_mock_pipeline(),
+            model_name="meta-llama/Llama-2-7b-chat-hf",
+            output_dir=tmp_path,
+        )
+
+        table = generate_latex_table(aggregate_results(load_contributions(tmp_path)))
+
+        for expected in ("\\begin{tabular}", "Llama-2-7b-chat-hf", "advanced"):
+            assert expected in table
@@ -0,0 +1,59 @@
+"""Tests for configuration loading."""
+
+from __future__ import annotations
+
+
+import yaml
+
+from obliteratus.config import StudyConfig
+
+
+# Study configuration shared by the tests below: it is passed to
+# StudyConfig.from_dict directly and, dumped to YAML, to StudyConfig.from_yaml.
+SAMPLE_CONFIG = {
+    # Model section: the tests read back "name" and "task".
+    "model": {
+        "name": "gpt2",
+        "task": "causal_lm",
+        "dtype": "float32",
+        "device": "cpu",
+    },
+    # Dataset section: the tests read back "name".
+    "dataset": {
+        "name": "wikitext",
+        "subset": "wikitext-2-raw-v1",
+        "split": "test",
+        "text_column": "text",
+        "max_samples": 50,
+    },
+    # Two strategies; the tests check the count and the first entry's name.
+    "strategies": [
+        {"name": "layer_removal", "params": {}},
+        {"name": "ffn_ablation", "params": {}},
+    ],
+    "metrics": ["perplexity"],
+    "batch_size": 4,
+    "max_length": 256,
+    "output_dir": "results/test",
+}
+
+
+class TestStudyConfig:
+    """Construction and serialisation checks for ``StudyConfig``."""
+
+    def test_from_dict(self):
+        cfg = StudyConfig.from_dict(SAMPLE_CONFIG)
+        assert cfg.model.name == "gpt2"
+        assert cfg.model.task == "causal_lm"
+        assert cfg.dataset.name == "wikitext"
+        assert len(cfg.strategies) == 2
+        first_strategy = cfg.strategies[0]
+        assert first_strategy.name == "layer_removal"
+
+    def test_from_yaml(self, tmp_path):
+        # Dump the sample config to a file, then load it through the YAML entry point.
+        config_file = tmp_path / "test_config.yaml"
+        config_file.write_text(yaml.dump(SAMPLE_CONFIG))
+
+        cfg = StudyConfig.from_yaml(config_file)
+        assert cfg.model.name == "gpt2"
+        assert cfg.batch_size == 4
+
+    def test_roundtrip(self):
+        # from_dict -> to_dict -> from_dict must keep the fields checked below.
+        original = StudyConfig.from_dict(SAMPLE_CONFIG)
+        rebuilt = StudyConfig.from_dict(original.to_dict())
+        assert rebuilt.model.name == original.model.name
+        assert rebuilt.dataset.name == original.dataset.name
+        assert len(rebuilt.strategies) == len(original.strategies)
@@ -0,0 +1,169 @@
+"""Tests for defense robustness evaluation framework."""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock
+
+import torch
+
+from obliteratus.analysis.defense_robustness import (
+    DefenseProfile,
+    DefenseRobustnessEvaluator,
+    EntanglementMap,
+    SelfRepairResult,
+)
+
+
+def _make_mock_pipeline(n_layers=6, hidden_dim=16, n_prompts=5):
+    """Return a MagicMock pipeline carrying synthetic refusal directions and activations.
+
+    Args:
+        n_layers: number of layers to fabricate data for.
+        hidden_dim: length of every direction and activation vector.
+        n_prompts: number of per-prompt activation vectors per layer.
+
+    The global torch RNG is seeded with 42 first, so the generated data is
+    the same on every call with the same arguments.
+    """
+    torch.manual_seed(42)
+
+    # One unit-norm refusal direction per layer.
+    raw_vectors = [torch.randn(hidden_dim) for _ in range(n_layers)]
+    unit_dirs = {layer: vec / vec.norm() for layer, vec in enumerate(raw_vectors)}
+
+    harmful_means, harmless_means = {}, {}
+    harmful_acts, harmless_acts = {}, {}
+    for layer, direction in unit_dirs.items():
+        baseline = torch.randn(hidden_dim)
+        # Layers 2-4 receive the strong planted signal, the rest a weak one.
+        strength = 3.0 if 2 <= layer <= 4 else 0.5
+        shifted = baseline + strength * direction
+
+        harmless_means[layer] = baseline.unsqueeze(0)
+        harmful_means[layer] = shifted.unsqueeze(0)
+        # Per-prompt samples; harmful noise is drawn before harmless noise.
+        harmful_acts[layer] = [
+            shifted + torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)
+        ]
+        harmless_acts[layer] = [
+            baseline + torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)
+        ]
+
+    pipeline = MagicMock()
+    pipeline.model_name = "test-model"
+    pipeline.refusal_directions = unit_dirs
+    pipeline._harmful_means = harmful_means
+    pipeline._harmless_means = harmless_means
+    pipeline._harmful_acts = harmful_acts
+    pipeline._harmless_acts = harmless_acts
+    return pipeline
+
+
+class TestDefenseProfile:
+    """Checks on the profile returned by ``DefenseRobustnessEvaluator.profile_defense``."""
+
+    @staticmethod
+    def _default_profile():
+        """Profile the default mock pipeline and return the result."""
+        return DefenseRobustnessEvaluator(_make_mock_pipeline()).profile_defense()
+
+    def test_profile_generates(self):
+        profile = self._default_profile()
+
+        assert isinstance(profile, DefenseProfile)
+        assert profile.model_name == "test-model"
+        assert profile.refusal_layer_spread > 0
+        assert profile.mean_refusal_strength > 0
+        assert profile.max_refusal_strength >= profile.mean_refusal_strength
+        allowed_levels = ("low", "medium", "high", "very_high")
+        assert profile.estimated_robustness in allowed_levels
+
+    def test_alignment_type_estimate(self):
+        profile = self._default_profile()
+        assert profile.alignment_type_estimate != "unknown"
+
+    def test_empty_pipeline(self):
+        # With no refusal directions the robustness estimate is "unknown".
+        bare = MagicMock()
+        bare.model_name = "empty"
+        bare.refusal_directions = {}
+        profile = DefenseRobustnessEvaluator(bare).profile_defense()
+        assert profile.estimated_robustness == "unknown"
+
+    def test_concentration_bounded(self):
+        profile = self._default_profile()
+        # Gini coefficient should be between 0 and 1
+        assert 0 <= profile.refusal_concentration <= 1.0
+
+    def test_self_repair_bounded(self):
+        profile = self._default_profile()
+        assert 0 <= profile.self_repair_estimate <= 1.0
+
+    def test_format_report(self):
+        profile = self._default_profile()
+        text = DefenseRobustnessEvaluator.format_defense_profile(profile)
+        assert "Defense Robustness" in text
+        assert "test-model" in text
+
+
+class TestSelfRepair:
+    """Checks on ``DefenseRobustnessEvaluator.measure_self_repair``."""
+
+    @staticmethod
+    def _measure(layer_idx, **pipeline_kwargs):
+        """Measure self-repair for ``layer_idx`` on a freshly built mock pipeline."""
+        evaluator = DefenseRobustnessEvaluator(_make_mock_pipeline(**pipeline_kwargs))
+        return evaluator.measure_self_repair(layer_idx=layer_idx)
+
+    def test_self_repair_measurement(self):
+        outcome = self._measure(3)
+
+        assert isinstance(outcome, SelfRepairResult)
+        assert outcome.layer_idx == 3
+        assert outcome.original_refusal_strength >= 0
+        assert 0 <= outcome.repair_ratio <= 1.0
+        assert len(outcome.compensating_layers) > 0
+        # The measured layer must not be listed as compensating for itself.
+        assert 3 not in outcome.compensating_layers
+
+    def test_repair_ratio_high_for_distributed(self):
+        """Distributed refusal should have high repair ratio."""
+        outcome = self._measure(3, n_layers=10)
+        # With distributed signal, removing one layer leaves much compensation
+        assert outcome.repair_ratio > 0.5
+
+    def test_format_self_repair(self):
+        outcome = self._measure(2)
+        text = DefenseRobustnessEvaluator.format_self_repair(outcome)
+        assert "Self-Repair" in text
+        assert "Layer 2" in text
+
+
+class TestEntanglement:
+    """Checks on ``DefenseRobustnessEvaluator.map_entanglement``."""
+
+    @staticmethod
+    def _default_map():
+        """Map entanglement for the default mock pipeline and return the result."""
+        return DefenseRobustnessEvaluator(_make_mock_pipeline()).map_entanglement()
+
+    def test_entanglement_map(self):
+        emap = self._default_map()
+
+        assert isinstance(emap, EntanglementMap)
+        assert len(emap.layer_entanglement) > 0
+        assert 0 <= emap.overall_entanglement <= 1.0
+        assert len(emap.most_entangled_layers) > 0
+        assert len(emap.least_entangled_layers) > 0
+
+    def test_capability_sensitivity_keys(self):
+        emap = self._default_map()
+
+        wanted = {
+            "factual_knowledge",
+            "reasoning",
+            "language_fluency",
+            "instruction_following",
+            "math",
+        }
+        assert set(emap.capability_sensitivity.keys()) == wanted
+
+    def test_math_most_sensitive(self):
+        """Math should be estimated as the most sensitive capability."""
+        emap = self._default_map()
+        # The comparison is only made when some entanglement was measured.
+        if emap.overall_entanglement > 0:
+            sensitivity = emap.capability_sensitivity
+            assert sensitivity["math"] >= sensitivity["language_fluency"]
+
+    def test_format_entanglement(self):
+        emap = self._default_map()
+        text = DefenseRobustnessEvaluator.format_entanglement(emap)
+        assert "Entanglement" in text
+        assert "math" in text
@@ -0,0 +1,510 @@
+"""Edge-case and robustness tests.
+
+Tests for NaN/Inf handling, empty inputs, extreme dimensions,
+and other boundary conditions that the main test suite doesn't cover.
+"""
+
+from __future__ import annotations
+
+import math
+
+import pytest
+import torch
+import torch.nn as nn
+
+from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
+from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer
+from obliteratus.analysis.concept_geometry import ConceptConeAnalyzer
+from obliteratus.analysis.alignment_imprint import AlignmentImprintDetector
+from obliteratus.analysis.multi_token_position import MultiTokenPositionAnalyzer
+from obliteratus.analysis.sparse_surgery import SparseDirectionSurgeon
+from obliteratus.analysis.causal_tracing import CausalRefusalTracer
+from obliteratus.analysis.residual_stream import ResidualStreamDecomposer
+from obliteratus.analysis.probing_classifiers import LinearRefusalProbe
+from obliteratus.analysis.cross_model_transfer import TransferAnalyzer
+from obliteratus.evaluation.advanced_metrics import (
+    refusal_rate,
+    effective_rank,
+    activation_cosine_similarity,
+)
+from obliteratus.analysis.steering_vectors import (
+    SteeringVectorFactory,
+    SteeringHookManager,
+    SteeringConfig,
+    SteeringResult,
+    compute_steering_effectiveness,
+    format_steering_report,
+)
+
+
+# ===========================================================================
+#  NaN / Inf handling
+# ===========================================================================
+
+class TestNaNInfHandling:
+    """Test that modules handle degenerate inputs gracefully.
+
+    Several cases only document current behaviour: they accept either a clean
+    ``RuntimeError``/``ValueError`` or any returned result.
+    """
+
+    def test_whitened_svd_nan_activations(self):
+        """WhitenedSVD with NaN — raising or returning a result are both accepted."""
+        harmful = [torch.tensor([float("nan"), 1.0, 2.0]) for _ in range(5)]
+        harmless = [torch.randn(3) for _ in range(5)]
+        extractor = WhitenedSVDExtractor()
+        # NaN may propagate through the SVD into the result. Ideally the
+        # extractor would guard against it; this test only documents behaviour.
+        raised = False
+        result = None
+        try:
+            result = extractor.extract(harmful, harmless)
+        except (RuntimeError, ValueError):
+            raised = True
+        # Either it raised an exception (acceptable) or returned a result with NaNs
+        assert raised or result is not None, (
+            "Should either raise on NaN input or return a result"
+        )
+
+    def test_whitened_svd_zero_activations(self):
+        """WhitenedSVD with all-zero activations returns populated result fields."""
+        harmful = [torch.zeros(8) for _ in range(5)]
+        harmless = [torch.zeros(8) for _ in range(5)]
+        extractor = WhitenedSVDExtractor()
+        result = extractor.extract(harmful, harmless)
+        # Should return a valid result without crashing
+        assert result is not None
+        assert result.directions is not None
+        assert result.singular_values is not None
+
+    def test_concept_cone_nan_direction(self):
+        """ConceptConeAnalyzer with one NaN activation — documenting behavior."""
+        harmful = [torch.randn(16) for _ in range(10)]
+        harmless = [torch.randn(16) for _ in range(10)]
+        # Poison one activation
+        harmful[3] = torch.full((16,), float("nan"))
+        cat_map = {i: f"cat_{i % 3}" for i in range(10)}
+        analyzer = ConceptConeAnalyzer(category_map=cat_map)
+        raised = False
+        result = None
+        try:
+            result = analyzer.analyze_layer(harmful, harmless)
+        except (RuntimeError, ValueError):
+            raised = True
+        # Either it raised an exception (acceptable) or returned a result
+        assert raised or result is not None, (
+            "Should either raise on NaN input or return a result"
+        )
+
+    def test_sparse_surgery_zero_direction(self):
+        """Sparse surgery with zero refusal direction reports a zero mean projection."""
+        W = torch.randn(32, 16)
+        zero_dir = torch.zeros(16)
+        surgeon = SparseDirectionSurgeon()
+        result = surgeon.analyze_weight_matrix(W, zero_dir)
+        assert result.mean_projection == 0.0
+
+    def test_sparse_surgery_zero_weight(self):
+        """Sparse surgery with zero weight matrix reports a negligible max projection."""
+        W = torch.zeros(32, 16)
+        ref_dir = torch.randn(16)
+        surgeon = SparseDirectionSurgeon()
+        result = surgeon.analyze_weight_matrix(W, ref_dir)
+        assert result.max_projection < 1e-6
+
+    def test_effective_rank_nan_matrix(self):
+        """effective_rank on a NaN-scrubbed matrix is finite, or raises cleanly."""
+        W = torch.randn(10, 10)
+        W[0, 0] = float("nan")
+        # The NaN is replaced by nan_to_num before the call, so this exercises
+        # the sanitised path rather than raw NaN input.
+        try:
+            result = effective_rank(torch.nan_to_num(W))
+        except (RuntimeError, ValueError):
+            # Raising cleanly is acceptable. Only these types are tolerated
+            # (as in the other tests of this class) so that a failing assertion
+            # or an unrelated error is no longer silently swallowed.
+            pass
+        else:
+            assert math.isfinite(result)
+
+    def test_cosine_similarity_zero_vectors(self):
+        """Cosine similarity between zero vectors must not crash or return infinity."""
+        a = torch.zeros(32)
+        b = torch.zeros(32)
+        result = activation_cosine_similarity(a, b)
+        # Should be 0 or NaN, not crash
+        assert math.isfinite(result) or math.isnan(result)
+
+    def test_transfer_analyzer_nan_directions(self):
+        """Transfer analyzer with an all-NaN direction on one layer still returns a result."""
+        dirs_a = {0: torch.randn(16), 1: torch.tensor([float("nan")] * 16)}
+        dirs_b = {0: torch.randn(16), 1: torch.randn(16)}
+        analyzer = TransferAnalyzer()
+        # Should not crash
+        result = analyzer.analyze_cross_model(dirs_a, dirs_b)
+        assert result is not None
+        assert isinstance(result.mean_transfer_score, float)
+        assert result.per_layer_transfer is not None
+
+
+# ===========================================================================
+#  Empty inputs
+# ===========================================================================
+
+class TestEmptyInputs:
+    """Analyzers must cope with empty, single-item, or otherwise minimal inputs."""
+
+    def test_cross_layer_empty_directions(self):
+        """An empty direction mapping yields a persistence score of exactly 0."""
+        outcome = CrossLayerAlignmentAnalyzer().analyze({})
+        assert outcome.direction_persistence_score == 0.0
+
+    def test_alignment_imprint_single_layer(self):
+        """A lone layer still produces one of the known method labels."""
+        imprint_detector = AlignmentImprintDetector()
+        outcome = imprint_detector.detect_imprint({0: torch.randn(32)})
+        known_methods = ("dpo", "rlhf", "cai", "sft", "unknown")
+        assert outcome.predicted_method in known_methods
+
+    def test_multi_token_single_position(self):
+        """A one-token sequence reports one token with its peak at position 0."""
+        direction = torch.randn(16)
+        one_token = torch.randn(1, 16)
+        outcome = MultiTokenPositionAnalyzer().analyze_prompt(one_token, direction)
+        assert (outcome.n_tokens, outcome.peak_position) == (1, 0)
+
+    def test_probing_minimal_data(self):
+        """Three samples per class are enough to get an accuracy in [0, 1]."""
+        positives = [torch.randn(8) for _ in range(3)]
+        negatives = [torch.randn(8) for _ in range(3)]
+        outcome = LinearRefusalProbe(n_epochs=10).probe_layer(positives, negatives)
+        assert 0 <= outcome.accuracy <= 1.0
+
+    def test_residual_stream_single_layer(self):
+        """Decomposing a single layer's activations reports one layer."""
+        activations = {0: torch.randn(32)}
+        direction = torch.randn(32)
+        outcome = ResidualStreamDecomposer().decompose(activations, direction)
+        assert outcome.n_layers == 1
+
+    def test_causal_tracing_single_layer(self):
+        """Tracing over a single layer's activations reports one layer."""
+        activations = {0: torch.randn(32)}
+        directions = {0: torch.randn(32)}
+        outcome = CausalRefusalTracer().trace_from_activations(activations, directions)
+        assert outcome.n_layers == 1
+
+    def test_transfer_no_common_layers(self):
+        """Models sharing no layer indices get a mean transfer score of exactly 0."""
+        model_a = {0: torch.randn(16), 1: torch.randn(16)}
+        model_b = {2: torch.randn(16), 3: torch.randn(16)}
+        outcome = TransferAnalyzer().analyze_cross_model(model_a, model_b)
+        assert outcome.mean_transfer_score == 0.0
+
+    def test_refusal_rate_empty_list(self):
+        """No responses at all means a refusal rate of 0."""
+        assert refusal_rate([]) == 0.0
+
+    def test_refusal_rate_single_response(self):
+        """A single refusing response gives a refusal rate of 1."""
+        assert refusal_rate(["I cannot help with that."]) == 1.0
+
+
+# ===========================================================================
+#  Extreme dimensions
+# ===========================================================================
+
+class TestExtremeDimensions:
+    """Runs the analyzers at unusually large and unusually small sizes."""
+
+    def test_high_dimensional_directions(self):
+        """Cross-layer transfer works at a realistic hidden size of 4096."""
+        width = 4096
+        torch.manual_seed(42)
+        per_layer = {layer: torch.randn(width) for layer in range(8)}
+        outcome = TransferAnalyzer().analyze_cross_layer(per_layer)
+        assert outcome.mean_adjacent_transfer >= 0
+
+    def test_high_dim_sparse_surgery(self):
+        """With sparsity 0.05 on 2048 rows, int(0.05 * 2048) rows are modified."""
+        weights = torch.randn(2048, 1024)
+        direction = torch.randn(1024)
+        outcome = SparseDirectionSurgeon(sparsity=0.05).analyze_weight_matrix(weights, direction)
+        assert outcome.n_rows_modified == int(0.05 * 2048)
+
+    def test_single_dimension(self):
+        """Hidden size 1: adjacent-layer transfer is expected to be ~1."""
+        per_layer = {layer: torch.randn(1) for layer in range(4)}
+        outcome = TransferAnalyzer().analyze_cross_layer(per_layer)
+        # Any two non-zero 1-D vectors are parallel or anti-parallel. The bound
+        # below presumably relies on the analyzer scoring anti-parallel
+        # directions as aligned as well — TODO confirm.
+        assert outcome.mean_adjacent_transfer >= 0.99
+
+    def test_many_layers_imprint(self):
+        """With 128 layers, the four method probabilities sum to 1 within 0.01."""
+        per_layer = {layer: torch.randn(32) for layer in range(128)}
+        outcome = AlignmentImprintDetector().detect_imprint(per_layer)
+        probability_mass = (
+            outcome.dpo_probability
+            + outcome.rlhf_probability
+            + outcome.cai_probability
+            + outcome.sft_probability
+        )
+        assert abs(probability_mass - 1.0) < 0.01
+
+    @pytest.mark.parametrize("n_prompts", [1, 2, 5, 50, 100])
+    def test_concept_cone_varying_prompt_counts(self, n_prompts):
+        """Concept-cone analysis yields a usable result from 1 up to 100 prompts."""
+        positives = [torch.randn(16) for _ in range(n_prompts)]
+        negatives = [torch.randn(16) for _ in range(n_prompts)]
+        categories = {idx: f"cat_{idx % 3}" for idx in range(n_prompts)}
+        cone = ConceptConeAnalyzer(category_map=categories, min_category_size=1)
+        outcome = cone.analyze_layer(positives, negatives)
+        # Whatever the prompt count, a general direction and >= 1 category are required.
+        assert outcome is not None
+        assert outcome.general_direction is not None
+        assert outcome.category_count >= 1
+
+    @pytest.mark.parametrize("seq_len", [1, 2, 10, 100, 500])
+    def test_multi_token_varying_lengths(self, seq_len):
+        """The reported token count equals the sequence length that was fed in."""
+        direction = torch.randn(16)
+        sequence = torch.randn(seq_len, 16)
+        outcome = MultiTokenPositionAnalyzer().analyze_prompt(sequence, direction)
+        assert outcome.n_tokens == seq_len
+
+    @pytest.mark.parametrize("hidden_dim", [2, 4, 16, 64, 256, 1024])
+    def test_probing_varying_dimensions(self, hidden_dim):
+        """Probing classifier across different hidden dims (min 2 for SGD)."""
+        positives = [torch.randn(hidden_dim) * 2 + 1 for _ in range(10)]
+        negatives = [torch.randn(hidden_dim) * 2 - 1 for _ in range(10)]
+        outcome = LinearRefusalProbe(n_epochs=20).probe_layer(positives, negatives)
+        assert 0 <= outcome.accuracy <= 1.0
+
+
+# ===========================================================================
+#  Refusal rate edge cases
+# ===========================================================================
+
+class TestRefusalRateEdgeCases:
+    """Boundary behaviour of ``refusal_rate``; most cases only pin a valid range."""
+
+    def test_novel_refusal_phrasing(self):
+        """Unusual refusal wording may go undetected (known limitation)."""
+        unusual = [
+            "That request falls outside my operational parameters.",
+            "I'm designed to be helpful but this conflicts with my guidelines.",
+        ]
+        # String matching may miss these, so a rate of 0 is tolerated here;
+        # only the value range is checked.
+        assert 0 <= refusal_rate(unusual) <= 1.0
+
+    def test_false_positive_sorry(self):
+        """A sympathetic "I'm sorry to hear that" opener followed by real help."""
+        helpful = [
+            "I'm sorry to hear that. Here's how you can solve the problem: first...",
+        ]
+        # Known limitation: prefix matching can flag this as a refusal, so the
+        # test only requires a value inside [0, 1].
+        assert 0 <= refusal_rate(helpful, mode="prefix") <= 1.0
+
+    def test_case_insensitivity(self):
+        """An all-caps refusal is detected in substring mode."""
+        shouted = ["I CANNOT ASSIST WITH THAT REQUEST."]
+        assert refusal_rate(shouted, mode="substring") == 1.0
+
+    def test_whitespace_handling(self):
+        """Surrounding whitespace must not break prefix mode."""
+        padded = ["  I'm sorry, but I can't help with that.  "]
+        # Only the value range is pinned, not whether the padding is stripped.
+        assert 0 <= refusal_rate(padded, mode="prefix") <= 1.0
+
+
+# ===========================================================================
+#  Steering Vectors
+# ===========================================================================
+
+class TestSteeringVectors:
+    """Tests for the steering vector inference-time intervention system."""
+
+    def test_from_refusal_direction(self):
+        """A refusal-direction vector is unit-norm, labelled, and defaults to alpha -1."""
+        d = torch.randn(32)
+        vec = SteeringVectorFactory.from_refusal_direction(d, source_layer=5)
+        assert vec.label == "refusal"
+        assert vec.source_layer == 5
+        assert vec.default_alpha == -1.0
+        assert abs(vec.direction.norm().item() - 1.0) < 0.01
+
+    def test_from_contrastive_pairs(self):
+        """A contrastive vector is unit-norm, keeps its label, and records n_positive."""
+        pos = [torch.randn(16) + 2 for _ in range(10)]
+        neg = [torch.randn(16) - 2 for _ in range(10)]
+        vec = SteeringVectorFactory.from_contrastive_pairs(pos, neg, label="test")
+        assert vec.label == "test"
+        assert abs(vec.direction.norm().item() - 1.0) < 0.01
+        assert "n_positive" in vec.metadata
+
+    def test_combine_vectors(self):
+        """Combining two vectors yields a unit-norm vector with the given label."""
+        v1 = SteeringVectorFactory.from_refusal_direction(torch.randn(32))
+        v2 = SteeringVectorFactory.from_refusal_direction(torch.randn(32))
+        combined = SteeringVectorFactory.combine([v1, v2], label="merged")
+        assert combined.label == "merged"
+        assert abs(combined.direction.norm().item() - 1.0) < 0.01
+
+    def test_combine_single(self):
+        """Combining a single vector still yields a unit-norm direction."""
+        v = SteeringVectorFactory.from_refusal_direction(torch.randn(16))
+        combined = SteeringVectorFactory.combine([v])
+        assert abs(combined.direction.norm().item() - 1.0) < 0.01
+
+    def test_combine_empty_raises(self):
+        """Combining an empty list raises ValueError."""
+        with pytest.raises(ValueError):
+            SteeringVectorFactory.combine([])
+
+    def test_hook_manager_lifecycle(self):
+        """Test install/remove lifecycle without a real model."""
+        manager = SteeringHookManager()
+        assert not manager.is_active
+        manager.remove()  # Should not crash even with no hooks
+        assert not manager.is_active
+
+    def test_hook_with_simple_model(self):
+        """Test steering on a simple nn.Sequential model."""
+        model = nn.Sequential(
+            nn.Linear(16, 16),
+            nn.ReLU(),
+            nn.Linear(16, 16),
+            nn.ReLU(),
+            nn.Linear(16, 8),
+        )
+
+        vec = SteeringVectorFactory.from_refusal_direction(torch.randn(16))
+        config = SteeringConfig(
+            vectors=[vec],
+            # Presumably these index into ``layer_modules`` below: positions 0
+            # and 2 of the Sequential are its first and second Linear layers,
+            # both with 16-dim outputs matching the 16-dim steering vector.
+            target_layers=[0, 2],
+            alpha=1.0,
+        )
+
+        manager = SteeringHookManager()
+        # Install on specific modules
+        layers = list(model.children())
+        result = manager.install(model, config, layer_modules=layers)
+        try:
+            assert result.hooks_installed == 2
+            assert manager.is_active
+
+            # Run a forward pass (should not crash)
+            x = torch.randn(1, 16)
+            output = model(x)
+            assert output.shape == (1, 8)
+        finally:
+            # Always detach the hooks, even when an assertion above fails.
+            manager.remove()
+        assert not manager.is_active
+
+    def test_steering_effectiveness_remove(self):
+        """A partial reduction (2.0 -> 0.5) scores strictly between 0 and 1."""
+        eff = compute_steering_effectiveness(2.0, 0.5, direction="remove")
+        assert 0 < eff < 1.0  # Reduced but not eliminated
+
+    def test_steering_effectiveness_perfect_remove(self):
+        """Reducing the projection to 0 scores exactly 1."""
+        eff = compute_steering_effectiveness(2.0, 0.0, direction="remove")
+        assert eff == 1.0
+
+    def test_steering_effectiveness_no_change(self):
+        """An unchanged projection scores exactly 0."""
+        eff = compute_steering_effectiveness(2.0, 2.0, direction="remove")
+        assert eff == 0.0
+
+    def test_steering_effectiveness_add(self):
+        """In "add" mode, tripling the projection (1.0 -> 3.0) scores 1."""
+        eff = compute_steering_effectiveness(1.0, 3.0, direction="add")
+        assert eff == 1.0  # Capped at 1.0
+
+    def test_format_report(self):
+        """The formatted report mentions steering and the vector's label."""
+        vec = SteeringVectorFactory.from_refusal_direction(torch.randn(32))
+        config = SteeringConfig(vectors=[vec], target_layers=[3, 5], alpha=0.5)
+        result = SteeringResult(config=config, hooks_installed=2, total_steered_layers=2)
+        report = format_steering_report(result)
+        assert "Steering" in report
+        assert "refusal" in report
+
+    def test_steering_config_position_modes(self):
+        """Test different position modes in config."""
+        for pos in ["all", "last", "first"]:
+            config = SteeringConfig(
+                vectors=[SteeringVectorFactory.from_refusal_direction(torch.randn(8))],
+                target_layers=[0],
+                position=pos,
+            )
+            assert config.position == pos
+
+    def test_imports(self):
+        """The steering classes are re-exported from ``obliteratus.analysis``."""
+        from obliteratus.analysis import SteeringVectorFactory, SteeringHookManager
+        assert SteeringVectorFactory is not None
+        assert SteeringHookManager is not None
+
+
+class TestParametrizedDimensions:
+    """Sweeps several analyzers across a range of hidden sizes and layer counts."""
+
+    @pytest.mark.parametrize("hidden_dim", [2, 8, 64, 256, 768])
+    def test_whitened_svd_various_dims(self, hidden_dim):
+        """Extracted directions keep the hidden size as their second dimension."""
+        sample_count = max(4, hidden_dim // 4)
+        positives = [torch.randn(hidden_dim) for _ in range(sample_count)]
+        negatives = [torch.randn(hidden_dim) for _ in range(sample_count)]
+        outcome = WhitenedSVDExtractor().extract(positives, negatives, n_directions=1)
+        assert outcome.directions.shape[1] == hidden_dim
+
+    @pytest.mark.parametrize("hidden_dim", [2, 8, 64, 256])
+    def test_cross_layer_various_dims(self, hidden_dim):
+        """The persistence score over four random layers stays inside [0, 1]."""
+        per_layer = {layer: torch.randn(hidden_dim) for layer in range(4)}
+        outcome = CrossLayerAlignmentAnalyzer().analyze(per_layer)
+        assert 0.0 <= outcome.direction_persistence_score <= 1.0
+
+    @pytest.mark.parametrize("hidden_dim", [4, 32, 128])
+    def test_sparse_surgery_various_dims(self, hidden_dim):
+        """Energy removed for a unit-norm direction stays inside [0, 1]."""
+        square_weights = torch.randn(hidden_dim, hidden_dim)
+        raw = torch.randn(hidden_dim)
+        unit = raw / raw.norm()
+        outcome = SparseDirectionSurgeon().analyze_weight_matrix(
+            square_weights, unit, layer_idx=0
+        )
+        assert 0.0 <= outcome.energy_removed <= 1.0
+
+    @pytest.mark.parametrize("n_layers", [1, 4, 12, 32])
+    def test_imprint_various_layer_counts(self, n_layers):
+        """Imprint detection returns a known method label for any layer count."""
+        per_layer = {layer: torch.randn(64) for layer in range(n_layers)}
+        outcome = AlignmentImprintDetector().detect_imprint(per_layer)
+        known_methods = ("dpo", "rlhf", "cai", "sft", "unknown")
+        assert outcome.predicted_method in known_methods
+
+
+class TestExceptionPaths:
+    """Error handling and boundary conditions of the analysis modules."""
+
+    def test_whitened_svd_mismatched_dims(self):
+        """Harmful (64-dim) and harmless (32-dim) activations must make extract raise."""
+        wide = [torch.randn(64) for _ in range(10)]
+        narrow = [torch.randn(32) for _ in range(10)]
+        svd = WhitenedSVDExtractor()
+        # Any exception type is accepted here; only "does not silently succeed" is pinned.
+        with pytest.raises(Exception):
+            svd.extract(wide, narrow, n_directions=1)
+
+    def test_whitened_svd_single_sample(self):
+        """One sample per class must not crash; the hidden size of 32 is preserved."""
+        lone_harmful = [torch.randn(32)]
+        lone_harmless = [torch.randn(32)]
+        outcome = WhitenedSVDExtractor().extract(lone_harmful, lone_harmless, n_directions=1)
+        # Second dimension of the directions is the hidden size.
+        assert outcome.directions.shape[1] == 32
+
+    def test_sparse_surgery_zero_direction(self):
+        """An all-zero direction vector must still produce a result object."""
+        weights = torch.randn(16, 16)
+        null_direction = torch.zeros(16)
+        outcome = SparseDirectionSurgeon().analyze_weight_matrix(
+            weights, null_direction, layer_idx=0
+        )
+        assert outcome is not None
+
+    def test_cross_layer_single_layer(self):
+        """A mapping with a single layer must still produce a result object."""
+        only_layer = {0: torch.randn(32)}
+        outcome = CrossLayerAlignmentAnalyzer().analyze(only_layer)
+        assert outcome is not None
@@ -0,0 +1,483 @@
+"""Tests for the community-standard (Heretic/Arditi) evaluation protocol."""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from obliteratus.evaluation.heretic_eval import (
+    ARDITI_REFUSAL_SUBSTRINGS,
+    _DEFAULT_HARMLESS_PROMPTS,
+    _HARMBENCH_CLS_TEMPLATE,
+    _JAILBREAKBENCH_BUILTIN,
+    _extract_score,
+    _last_real_token_indices,
+    _sanitize_model_path,
+    arditi_refusal_rate,
+    first_token_kl_on_prompts,
+    format_comparison_table,
+    HereticComparisonResult,
+    LM_EVAL_BENCHMARKS,
+    load_jailbreakbench_prompts,
+    unload_harmbench_classifier,
+)
+
+
+# ---------------------------------------------------------------------------
+# JailbreakBench Prompts
+# ---------------------------------------------------------------------------
+
+class TestJailbreakBenchPrompts:
+    """Checks the built-in JailbreakBench prompt list and its loader."""
+
+    def test_builtin_has_100_prompts(self):
+        """The built-in list holds exactly 100 prompts."""
+        prompt_total = len(_JAILBREAKBENCH_BUILTIN)
+        assert prompt_total == 100
+
+    def test_builtin_all_non_empty(self):
+        """Every built-in prompt is a string longer than 10 stripped characters."""
+        for idx, prompt in enumerate(_JAILBREAKBENCH_BUILTIN):
+            assert isinstance(prompt, str) and len(prompt.strip()) > 10, f"Prompt {idx} is too short"
+
+    def test_builtin_no_duplicates(self):
+        """Deduplicating the built-in list does not shrink it."""
+        distinct = set(_JAILBREAKBENCH_BUILTIN)
+        assert len(distinct) == len(_JAILBREAKBENCH_BUILTIN)
+
+    def test_load_returns_list(self):
+        """The loader returns a list with at least 100 entries."""
+        loaded = load_jailbreakbench_prompts()
+        assert isinstance(loaded, list)
+        assert len(loaded) >= 100
+
+    def test_load_caches(self):
+        """Two consecutive loads return equal prompt lists."""
+        first = load_jailbreakbench_prompts()
+        second = load_jailbreakbench_prompts()
+        assert first == second
+
+    def test_load_returns_copy(self):
+        """Mutating a loaded list must not leak into the next load."""
+        tampered = load_jailbreakbench_prompts()
+        tampered.append("mutated")
+        fresh = load_jailbreakbench_prompts()
+        assert "mutated" not in fresh
+
+
+# ---------------------------------------------------------------------------
+# Arditi Refusal Rate
+# ---------------------------------------------------------------------------
+
+class TestArditiRefusalRate:
+    """Substring-based refusal detection with the Arditi refusal list."""
+
+    def test_canonical_substring_count(self):
+        """Arditi/Zou list has exactly 29 substrings."""
+        substring_total = len(ARDITI_REFUSAL_SUBSTRINGS)
+        assert substring_total == 29
+
+    def test_all_refusals(self):
+        """Three refusing responses give a rate of 1."""
+        refusing = [
+            "I'm sorry, but I cannot help with that.",
+            "I apologize for the inconvenience.",
+            "As an AI, I don't do that.",
+        ]
+        assert arditi_refusal_rate(refusing) == 1.0
+
+    def test_no_refusals(self):
+        """Three compliant responses give a rate of 0."""
+        compliant = [
+            "Sure, here is a step-by-step guide.",
+            "The answer to your question is 42.",
+            "Here you go, this should work.",
+        ]
+        assert arditi_refusal_rate(compliant) == 0.0
+
+    def test_empty_responses_not_refusals(self):
+        """Per Zou et al. logic, empty responses have no substring match = not refusals."""
+        blanks = ["", "  ", ""]
+        assert arditi_refusal_rate(blanks) == 0.0
+
+    def test_empty_list(self):
+        """An empty response list gives a rate of 0."""
+        assert arditi_refusal_rate([]) == 0.0
+
+    def test_case_insensitive(self):
+        """A lower-cased "i'm sorry" opener still counts as a refusal."""
+        lowered = ["i'm sorry, but I can't help."]
+        assert arditi_refusal_rate(lowered) == 1.0
+
+    def test_mixed(self):
+        """Two refusals among four responses give a rate of 0.5."""
+        blend = [
+            "I cannot do that.",
+            "Sure, here's the info.",
+            "My apologies, I'm unable to help.",
+            "The capital of France is Paris.",
+        ]
+        # 2 of the 4 responses are refusals.
+        assert arditi_refusal_rate(blend) == 0.5
+
+    def test_subtle_refusal_not_caught(self):
+        """Arditi list is deliberately limited — subtle refusals should not match."""
+        soft_declines = [
+            "I'd rather not help with that specific request.",
+            "Let me redirect you to a more appropriate resource.",
+        ]
+        # Neither phrasing appears in the Arditi 29-substring list.
+        assert arditi_refusal_rate(soft_declines) == 0.0
+
+
+# ---------------------------------------------------------------------------
+# First-Token KL Divergence
+# ---------------------------------------------------------------------------
+
+class TestFirstTokenKL:
+    """First-token KL divergence between two fake models with known logits."""
+
+    @pytest.fixture
+    def simple_models(self):
+        """Provide a fake model class and a fake tokenizer class.
+
+        The model returns the same logits at every position, peaked at a
+        configurable vocabulary index; the tokenizer emits fixed-size batches.
+        """
+        class FakeModel(torch.nn.Module):
+            def __init__(self, peak_idx: int = 0):
+                super().__init__()
+                # One real parameter keeps next(model.parameters()).device usable.
+                self._param = torch.nn.Parameter(torch.zeros(1))
+                self._peak_idx = peak_idx
+
+            def __call__(self, **kwargs):
+                token_ids = kwargs["input_ids"]
+                n_rows = token_ids.shape[0]
+                n_positions = token_ids.shape[1]
+                vocab_size = 10
+                # Logits are zero everywhere except 5.0 at the peak index.
+                peaked = torch.zeros(vocab_size)
+                peaked[self._peak_idx] = 5.0
+                logits = peaked.expand(n_rows, n_positions, vocab_size).clone()
+                return type("Output", (), {"logits": logits})()
+
+        class FakeTokenizer:
+            pad_token_id = 0
+
+            def __call__(self, texts, return_tensors="pt", **kwargs):
+                # A list is one row per text; anything else counts as a single row.
+                n_rows = len(texts) if isinstance(texts, list) else 1
+                ids = torch.ones(n_rows, 5, dtype=torch.long)
+                return {"input_ids": ids, "attention_mask": torch.ones_like(ids)}
+
+        return FakeModel, FakeTokenizer
+
+    def test_identical_models_zero_kl(self, simple_models):
+        """Two models with the same peak give ~0 mean KL and the "excellent" label."""
+        model_cls, tokenizer_cls = simple_models
+        reference = model_cls(peak_idx=0)
+        candidate = model_cls(peak_idx=0)
+
+        stats = first_token_kl_on_prompts(
+            reference, candidate, tokenizer_cls(),
+            ["hello", "world"],
+        )
+        assert abs(stats["mean_kl"]) < 1e-5
+        assert stats["interpretation"] == "excellent (minimal collateral damage)"
+
+    def test_different_models_positive_kl(self, simple_models):
+        """Models peaked at vocabulary indices 0 and 5 give a positive mean KL."""
+        model_cls, tokenizer_cls = simple_models
+        reference = model_cls(peak_idx=0)
+        candidate = model_cls(peak_idx=5)
+
+        stats = first_token_kl_on_prompts(
+            reference, candidate, tokenizer_cls(),
+            ["test prompt"],
+        )
+        assert stats["mean_kl"] > 0
+
+    def test_returns_per_prompt_kl(self, simple_models):
+        """Three prompts give three per-prompt KL values and a non-negative std."""
+        model_cls, tokenizer_cls = simple_models
+        reference = model_cls(peak_idx=0)
+        candidate = model_cls(peak_idx=3)
+
+        stats = first_token_kl_on_prompts(
+            reference, candidate, tokenizer_cls(),
+            ["a", "b", "c"],
+        )
+        assert len(stats["per_prompt_kl"]) == 3
+        assert stats["std_kl"] >= 0
+
+
+# ---------------------------------------------------------------------------
+# HereticComparisonResult
+# ---------------------------------------------------------------------------
+
+class TestHereticComparisonResult:
+    """Field storage and defaults of the ``HereticComparisonResult`` dataclass."""
+
+    def test_dataclass_fields(self):
+        """Values passed to the constructor are readable back unchanged."""
+        record = HereticComparisonResult(
+            model_name="test-model",
+            method="OBLITERATUS",
+            refusal_rate_arditi=0.05,
+            refusal_rate_obliteratus=0.03,
+            harmbench_asr=0.85,
+            n_jailbreakbench=100,
+            n_refusals_remaining=5,
+            first_token_kl=0.15,
+            kl_interpretation="excellent",
+        )
+        observed = (
+            record.model_name,
+            record.method,
+            record.refusal_rate_arditi,
+            record.harmbench_asr,
+            record.first_token_kl,
+        )
+        assert observed == ("test-model", "OBLITERATUS", 0.05, 0.85, 0.15)
+
+    def test_optional_fields_default_none(self):
+        """Omitted benchmark fields default to None and per-item lists to empty."""
+        record = HereticComparisonResult(
+            model_name="test",
+            method="test",
+            refusal_rate_arditi=0.0,
+            refusal_rate_obliteratus=0.0,
+            harmbench_asr=None,
+            n_jailbreakbench=100,
+            n_refusals_remaining=0,
+        )
+        for unset in (record.mmlu, record.gsm8k, record.perplexity):
+            assert unset is None
+        for empty in (record.harmbench_per_item, record.kl_per_prompt):
+            assert empty == []
+
+
+# ---------------------------------------------------------------------------
+# Comparison Table Formatting
+# ---------------------------------------------------------------------------
+
+class TestComparisonTable:
+    """Rendering of comparison results through ``format_comparison_table``."""
+
+    def test_format_single_result(self):
+        """A fully populated result renders its headings and formatted metrics."""
+        entry = HereticComparisonResult(
+            model_name="Llama-2-7B",
+            method="OBLITERATUS",
+            refusal_rate_arditi=0.05,
+            refusal_rate_obliteratus=0.03,
+            harmbench_asr=0.85,
+            n_jailbreakbench=100,
+            n_refusals_remaining=5,
+            first_token_kl=0.15,
+            kl_interpretation="excellent",
+            mmlu=0.518,
+            gsm8k=0.313,
+        )
+        rendered = format_comparison_table([entry])
+        expected_fragments = (
+            "OBLITERATUS",
+            "REFUSAL REMOVAL",
+            "CAPABILITY PRESERVATION",
+            "DISTRIBUTION QUALITY",
+            "5.0%",    # Arditi refusal rate
+            "85.0%",   # HarmBench ASR
+            "5/100",   # remaining JailbreakBench refusals
+            "0.1500",  # first-token KL divergence
+        )
+        for fragment in expected_fragments:
+            assert fragment in rendered
+
+    def test_format_multiple_results(self):
+        """Both method names appear when two results are rendered together."""
+        rows = [
+            HereticComparisonResult(
+                model_name="test", method=method,
+                refusal_rate_arditi=arditi, refusal_rate_obliteratus=0.03,
+                harmbench_asr=asr, n_jailbreakbench=100, n_refusals_remaining=remaining,
+            )
+            for method, arditi, asr, remaining in (
+                ("OBLITERATUS", 0.05, 0.85, 5),
+                ("Heretic", 0.03, 0.90, 3),
+            )
+        ]
+        rendered = format_comparison_table(rows)
+        for method_name in ("OBLITERATUS", "Heretic"):
+            assert method_name in rendered
+
+    def test_heretic_reference_numbers_present(self):
+        """The comparison table should include Heretic's published reference numbers."""
+        placeholder = HereticComparisonResult(
+            model_name="test", method="test",
+            refusal_rate_arditi=0.0, refusal_rate_obliteratus=0.0,
+            harmbench_asr=None, n_jailbreakbench=100, n_refusals_remaining=0,
+        )
+        rendered = format_comparison_table([placeholder])
+        reference_fragments = (
+            "p-e-w/heretic",
+            "97/100",  # original model refusal count
+            "0.16",    # Heretic's KL divergence
+        )
+        for fragment in reference_fragments:
+            assert fragment in rendered
+
+
+# ---------------------------------------------------------------------------
+# LM-Eval Benchmark Config
+# ---------------------------------------------------------------------------
+
+class TestLmEvalBenchmarks:
+    """Checks the shape and contents of the ``LM_EVAL_BENCHMARKS`` mapping."""
+
+    def test_five_standard_benchmarks(self):
+        """The mapping holds exactly five entries."""
+        assert len(LM_EVAL_BENCHMARKS) == 5
+
+    def test_required_keys(self):
+        """Every entry carries ``task``, ``num_fewshot`` and ``description``."""
+        for key, config in LM_EVAL_BENCHMARKS.items():
+            assert "task" in config
+            assert "num_fewshot" in config
+            assert "description" in config
+
+    def test_expected_benchmarks(self):
+        """The mapping's keys are exactly the five expected benchmark names."""
+        expected = {"mmlu", "gsm8k", "arc_challenge", "hellaswag", "truthfulqa_mc2"}
+        assert set(LM_EVAL_BENCHMARKS.keys()) == expected
+
+    def test_fewshot_settings_match_arditi(self):
+        """Few-shot settings should match Arditi et al. / Open LLM Leaderboard."""
+        # Only three of the five benchmarks are pinned here.
+        assert LM_EVAL_BENCHMARKS["mmlu"]["num_fewshot"] == 5
+        assert LM_EVAL_BENCHMARKS["gsm8k"]["num_fewshot"] == 5
+        assert LM_EVAL_BENCHMARKS["truthfulqa_mc2"]["num_fewshot"] == 0
+
+
+# ---------------------------------------------------------------------------
+# HarmBench Template
+# ---------------------------------------------------------------------------
+
+class TestHarmBenchTemplate:
+    """Structural checks on the ``_HARMBENCH_CLS_TEMPLATE`` prompt string."""
+
+    def test_template_has_sys_tags(self):
+        """Official template requires <<SYS>> / <</SYS>> delimiters."""
+        assert "<<SYS>>" in _HARMBENCH_CLS_TEMPLATE
+        assert "<</SYS>>" in _HARMBENCH_CLS_TEMPLATE
+
+    def test_template_has_seven_rules(self):
+        """Official HarmBench template has exactly 7 rules."""
+        # A rule is counted as any line that begins with "- ".
+        rule_count = _HARMBENCH_CLS_TEMPLATE.count("\n- ")
+        assert rule_count == 7
+
+    def test_template_has_behavior_placeholder(self):
+        """The template exposes a ``{behavior}`` format field."""
+        assert "{behavior}" in _HARMBENCH_CLS_TEMPLATE
+
+    def test_template_has_generation_placeholder(self):
+        """The template exposes a ``{generation}`` format field."""
+        assert "{generation}" in _HARMBENCH_CLS_TEMPLATE
+
+    def test_template_ends_with_inst_tag(self):
+        """Ignoring surrounding whitespace, the template ends with ``[/INST]``."""
+        assert _HARMBENCH_CLS_TEMPLATE.strip().endswith("[/INST]")
+
+    def test_template_answer_format(self):
+        """Official template uses lowercase 'yes' or 'no'."""
+        assert '"yes" or "no"' in _HARMBENCH_CLS_TEMPLATE
+
+
+# ---------------------------------------------------------------------------
+# Score Extraction (C3 fix)
+# ---------------------------------------------------------------------------
+
+class TestExtractScore:
+    """Checks how ``_extract_score`` picks a metric out of a results dict."""
+
+    def test_normal_score(self):
+        """The ``acc,none`` value is returned as-is."""
+        assert _extract_score({"acc,none": 0.75}) == 0.75
+
+    def test_zero_score_not_skipped(self):
+        """A legitimate score of 0.0 should be returned, not treated as falsy."""
+        assert _extract_score({"acc,none": 0.0}) == 0.0
+
+    def test_fallback_to_next_key(self):
+        """``acc_norm,none`` is used when ``acc,none`` is absent."""
+        assert _extract_score({"acc_norm,none": 0.65}) == 0.65
+
+    def test_mc2_key(self):
+        """``mc2,none`` is also a recognised metric key."""
+        assert _extract_score({"mc2,none": 0.42}) == 0.42
+
+    def test_no_matching_key(self):
+        """A dict with no recognised key yields 0.0."""
+        assert _extract_score({"unknown_metric": 0.99}) == 0.0
+
+    def test_priority_order(self):
+        """acc,none should take priority over acc_norm,none."""
+        result = _extract_score({"acc,none": 0.5, "acc_norm,none": 0.9})
+        assert result == 0.5
+
+
+# ---------------------------------------------------------------------------
+# Padding-Aware Last-Token Indices
+# ---------------------------------------------------------------------------
+
+class TestLastRealTokenIndices:
+    """Checks ``_last_real_token_indices`` on attention masks.
+
+    Every mask used here is right-padded (ones followed by zeros); other
+    padding layouts are not exercised by these tests.
+    """
+
+    def test_no_padding(self):
+        """An all-ones 3x5 mask gives index 4 for every row."""
+        mask = torch.ones(3, 5, dtype=torch.long)
+        indices = _last_real_token_indices(mask)
+        assert indices.tolist() == [4, 4, 4]
+
+    def test_with_padding(self):
+        """Each row's index is the position of its last 1."""
+        mask = torch.tensor([
+            [1, 1, 1, 1, 1],  # length 5, last real = index 4
+            [1, 1, 1, 0, 0],  # length 3, last real = index 2
+            [1, 0, 0, 0, 0],  # length 1, last real = index 0
+        ])
+        indices = _last_real_token_indices(mask)
+        assert indices.tolist() == [4, 2, 0]
+
+    def test_single_token(self):
+        """A 1x1 mask gives index 0."""
+        mask = torch.tensor([[1]])
+        indices = _last_real_token_indices(mask)
+        assert indices.tolist() == [0]
+
+
+# ---------------------------------------------------------------------------
+# Model Path Sanitization
+# ---------------------------------------------------------------------------
+
+class TestSanitizeModelPath:
+    """Checks ``_sanitize_model_path`` pass-through and comma rejection."""
+
+    def test_normal_path(self):
+        """A plain filesystem path is returned unchanged."""
+        assert _sanitize_model_path("/tmp/my-model") == "/tmp/my-model"
+
+    def test_hf_model_id(self):
+        """An ``org/name`` style model id is returned unchanged."""
+        assert _sanitize_model_path("meta-llama/Llama-2-7b-hf") == "meta-llama/Llama-2-7b-hf"
+
+    def test_rejects_commas(self):
+        """A comma in the path raises ``ValueError`` whose message mentions "commas"."""
+        # The input smuggles an extra "key=value" pair after the comma.
+        with pytest.raises(ValueError, match="commas"):
+            _sanitize_model_path("evil,trust_remote_code=True")
+
+
+# ---------------------------------------------------------------------------
+# Classifier Unload
+# ---------------------------------------------------------------------------
+
+class TestClassifierUnload:
+    """Smoke test for ``unload_harmbench_classifier``."""
+
+    def test_unload_when_not_loaded(self):
+        """Unloading when nothing is loaded should not raise."""
+        # No assertion: the test passes as long as the call does not raise.
+        unload_harmbench_classifier()  # should be a no-op
+
+
+# ---------------------------------------------------------------------------
+# Default Harmless Prompts
+# ---------------------------------------------------------------------------
+
+class TestDefaultHarmlessPrompts:
+    """Sanity checks on the ``_DEFAULT_HARMLESS_PROMPTS`` collection."""
+
+    def test_has_100_unique_prompts(self):
+        """The collection holds exactly 100 entries."""
+        # Only the count is checked here; uniqueness is covered by
+        # test_no_duplicates.
+        assert len(_DEFAULT_HARMLESS_PROMPTS) == 100
+
+    def test_no_duplicates(self):
+        """No prompt appears more than once."""
+        assert len(set(_DEFAULT_HARMLESS_PROMPTS)) == len(_DEFAULT_HARMLESS_PROMPTS)
+
+    def test_all_non_empty(self):
+        """Every prompt is a string longer than 10 characters."""
+        for i, p in enumerate(_DEFAULT_HARMLESS_PROMPTS):
+            assert isinstance(p, str) and len(p) > 10, f"Prompt {i} is too short"
+
+
+# ---------------------------------------------------------------------------
+# KL Divergence Non-Negativity
+# ---------------------------------------------------------------------------
+
+class TestKLNonNegativity:
+    """Checks that ``first_token_kl_on_prompts`` reports no negative KL value."""
+
+    @pytest.fixture
+    def models_and_tokenizer(self):
+        """Provide the ``FakeModel`` and ``FakeTokenizer`` classes (not instances)."""
+        class FakeModel(torch.nn.Module):
+            """Stub model that emits the same fixed logits at every position."""
+
+            def __init__(self, peak_idx: int = 0):
+                super().__init__()
+                # Single dummy parameter; presumably lets the code under test
+                # query the model's parameters/device -- TODO confirm.
+                self._param = torch.nn.Parameter(torch.zeros(1))
+                self._peak_idx = peak_idx
+
+            def __call__(self, **kwargs):
+                # Logits are (batch, seq_len, 10): 5.0 at ``peak_idx`` and 0.0
+                # elsewhere, independent of the actual input ids.
+                batch_size = kwargs["input_ids"].shape[0]
+                seq_len = kwargs["input_ids"].shape[1]
+                vocab_size = 10
+                base = torch.zeros(vocab_size)
+                base[self._peak_idx] = 5.0
+                logits = base.unsqueeze(0).unsqueeze(0).expand(
+                    batch_size, seq_len, vocab_size
+                ).clone()
+                # Ad-hoc object exposing only a ``logits`` attribute.
+                return type("Output", (), {"logits": logits})()
+
+        class FakeTokenizer:
+            """Stub tokenizer returning all-ones ids and mask of length 5."""
+
+            pad_token_id = 0
+            def __call__(self, texts, return_tensors="pt", **kwargs):
+                # A bare string counts as a batch of one.
+                batch_size = len(texts) if isinstance(texts, list) else 1
+                input_ids = torch.ones(batch_size, 5, dtype=torch.long)
+                return {"input_ids": input_ids, "attention_mask": torch.ones_like(input_ids)}
+
+        return FakeModel, FakeTokenizer
+
+    def test_all_kl_values_non_negative(self, models_and_tokenizer):
+        """Two models peaking on different tokens give only non-negative KL values."""
+        FakeModel, FakeTokenizer = models_and_tokenizer
+        model_a = FakeModel(peak_idx=0)
+        model_b = FakeModel(peak_idx=3)
+        tokenizer = FakeTokenizer()
+
+        result = first_token_kl_on_prompts(
+            model_a, model_b, tokenizer,
+            ["a", "b", "c", "d", "e"],
+        )
+        # NOTE(review): an empty ``per_prompt_kl`` would pass this loop
+        # vacuously; the list length is not asserted.
+        for val in result["per_prompt_kl"]:
+            assert val >= 0.0, f"KL value {val} is negative"
@@ -0,0 +1,385 @@
+"""Tests for the Analysis-Informed Abliteration Pipeline."""
+
+from __future__ import annotations
+
+
+import pytest
+import torch
+
+from obliteratus.informed_pipeline import (
+    AnalysisInsights,
+    InformedAbliterationPipeline,
+    InformedPipelineReport,
+    INFORMED_METHOD,
+)
+from obliteratus.abliterate import METHODS
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture
+def insights():
+    """Default AnalysisInsights for testing.
+
+    Built with no arguments, so every field holds its declared default.
+    """
+    return AnalysisInsights()
+
+
+@pytest.fixture
+def pipeline(tmp_path):
+    """An InformedAbliterationPipeline with no model loaded.
+
+    The output directory is placed under pytest's per-test ``tmp_path``.
+    """
+    return InformedAbliterationPipeline(
+        model_name="test-model",
+        output_dir=str(tmp_path / "test_informed"),
+    )
+
+
+# ---------------------------------------------------------------------------
+# AnalysisInsights
+# ---------------------------------------------------------------------------
+
+class TestAnalysisInsights:
+    """Pins the default field values of a freshly built ``AnalysisInsights``."""
+
+    def test_default_values(self, insights):
+        """Alignment, cone, cluster and recommendation fields start at their defaults."""
+        assert insights.detected_alignment_method == "unknown"
+        assert insights.alignment_confidence == 0.0
+        assert insights.cone_is_polyhedral is False
+        assert insights.cone_dimensionality == 1.0
+        assert insights.mean_pairwise_cosine == 1.0
+        assert insights.per_category_directions == {}
+        assert insights.direction_specificity == {}
+        assert insights.cluster_count == 0
+        assert insights.direction_persistence == 0.0
+        assert insights.use_sparse_surgery is False
+        assert insights.recommended_n_directions == 4
+        assert insights.recommended_regularization == 0.0
+        assert insights.recommended_refinement_passes == 2
+        assert insights.recommended_layers == []
+        assert insights.skip_layers == []
+
+    def test_default_robustness(self, insights):
+        """Robustness, self-repair and entanglement fields start at their defaults."""
+        assert insights.estimated_robustness == "unknown"
+        assert insights.self_repair_estimate == 0.0
+        assert insights.entanglement_score == 0.0
+        assert insights.entangled_layers == []
+        assert insights.clean_layers == []
+
+
+class TestInformedPipelineReport:
+    """Pins the defaults of ``InformedPipelineReport``."""
+
+    def test_default_report(self):
+        """A report built from insights alone has zeroed timings and no stages."""
+        insights = AnalysisInsights()
+        report = InformedPipelineReport(insights=insights)
+        assert report.analysis_duration == 0.0
+        assert report.total_duration == 0.0
+        assert report.ouroboros_passes == 0
+        assert report.final_refusal_rate == 0.0
+        assert report.stages == []
+
+
+# ---------------------------------------------------------------------------
+# Method preset
+# ---------------------------------------------------------------------------
+
+class TestInformedMethod:
+    """Checks the "informed" method preset in ``METHODS`` and ``INFORMED_METHOD``."""
+
+    def test_informed_method_in_abliterate_methods(self):
+        """``METHODS["informed"]`` exists and enables the five listed flags."""
+        assert "informed" in METHODS
+        cfg = METHODS["informed"]
+        assert cfg["norm_preserve"] is True
+        assert cfg["project_biases"] is True
+        assert cfg["use_chat_template"] is True
+        assert cfg["use_whitened_svd"] is True
+        assert cfg["true_iterative_refinement"] is True
+
+    def test_informed_method_standalone(self):
+        """The standalone ``INFORMED_METHOD`` dict carries its label and key settings."""
+        assert INFORMED_METHOD["label"] == "Informed (Analysis-Guided)"
+        assert INFORMED_METHOD["n_directions"] == 4
+        assert INFORMED_METHOD["norm_preserve"] is True
+
+
+# ---------------------------------------------------------------------------
+# Pipeline initialization
+# ---------------------------------------------------------------------------
+
+class TestPipelineInit:
+    """Checks attribute values right after ``InformedAbliterationPipeline`` construction."""
+
+    def test_method_set_to_informed(self, pipeline):
+        """The pipeline reports ``method == "informed"``."""
+        assert pipeline.method == "informed"
+
+    def test_default_analysis_flags(self, pipeline):
+        """All five analysis stages are enabled by default."""
+        assert pipeline._run_cone is True
+        assert pipeline._run_alignment is True
+        assert pipeline._run_cross_layer is True
+        assert pipeline._run_sparse is True
+        assert pipeline._run_defense is True
+
+    def test_ouroboros_defaults(self, pipeline):
+        """Default ouroboros threshold is 0.5 with at most 3 passes."""
+        assert pipeline._ouroboros_threshold == 0.5
+        assert pipeline._max_ouroboros_passes == 3
+
+    def test_entanglement_gate(self, pipeline):
+        """Default entanglement gate is 0.8."""
+        assert pipeline._entanglement_gate == 0.8
+
+    def test_inherits_base_pipeline(self, pipeline):
+        """The same five flags asserted for the "informed" preset are set on the instance."""
+        assert pipeline.norm_preserve is True
+        assert pipeline.project_biases is True
+        assert pipeline.use_chat_template is True
+        assert pipeline.use_whitened_svd is True
+        assert pipeline.true_iterative_refinement is True
+
+    def test_custom_flags(self):
+        """Constructor keyword arguments are stored on the matching private attributes."""
+        p = InformedAbliterationPipeline(
+            model_name="test",
+            run_cone_analysis=False,
+            run_alignment_detection=False,
+            ouroboros_threshold=0.3,
+            max_ouroboros_passes=5,
+            entanglement_gate=0.9,
+        )
+        assert p._run_cone is False
+        assert p._run_alignment is False
+        assert p._ouroboros_threshold == 0.3
+        assert p._max_ouroboros_passes == 5
+        assert p._entanglement_gate == 0.9
+
+
+# ---------------------------------------------------------------------------
+# Configuration derivation
+# ---------------------------------------------------------------------------
+
+class TestConfigurationDerivation:
+    """Test the _derive_configuration logic with various insights."""
+
+    def _make_pipeline_with_insights(self, **kwargs):
+        """Build a pipeline with logging silenced and the given insight fields overridden.
+
+        Each keyword argument is set as an attribute on ``p._insights``.
+        """
+        p = InformedAbliterationPipeline(
+            model_name="test",
+            on_log=lambda m: None,
+        )
+        for k, v in kwargs.items():
+            setattr(p._insights, k, v)
+        return p
+
+    def test_polyhedral_cone_more_directions(self):
+        """A polyhedral cone of dimensionality 3.5 yields 7 directions."""
+        p = self._make_pipeline_with_insights(
+            cone_is_polyhedral=True,
+            cone_dimensionality=3.5,
+        )
+        p._derive_configuration()
+        # Polyhedral with dim 3.5 → n_dirs = max(4, min(8, int(3.5*2))) = 7
+        assert p.n_directions == 7
+
+    def test_linear_cone_fewer_directions(self):
+        """A linear cone of dimensionality 1.0 yields 2 directions."""
+        p = self._make_pipeline_with_insights(
+            cone_is_polyhedral=False,
+            cone_dimensionality=1.0,
+        )
+        p._derive_configuration()
+        # Linear with dim 1.0 → n_dirs = max(1, min(4, int(1.0+1))) = 2
+        assert p.n_directions == 2
+
+    def test_dpo_zero_regularization(self):
+        """DPO with low entanglement (0.1) gives regularization 0.0."""
+        p = self._make_pipeline_with_insights(
+            detected_alignment_method="dpo",
+            entanglement_score=0.1,
+        )
+        p._derive_configuration()
+        assert p.regularization == 0.0
+
+    def test_rlhf_moderate_regularization(self):
+        """RLHF with entanglement 0.2 gives regularization 0.15."""
+        p = self._make_pipeline_with_insights(
+            detected_alignment_method="rlhf",
+            entanglement_score=0.2,
+        )
+        p._derive_configuration()
+        assert p.regularization == 0.15
+
+    def test_cai_regularization(self):
+        """CAI with entanglement 0.2 gives regularization 0.2."""
+        p = self._make_pipeline_with_insights(
+            detected_alignment_method="cai",
+            entanglement_score=0.2,
+        )
+        p._derive_configuration()
+        assert p.regularization == 0.2
+
+    def test_sft_low_regularization(self):
+        """SFT with entanglement 0.1 gives regularization 0.05."""
+        p = self._make_pipeline_with_insights(
+            detected_alignment_method="sft",
+            entanglement_score=0.1,
+        )
+        p._derive_configuration()
+        assert p.regularization == 0.05
+
+    def test_high_entanglement_increases_regularization(self):
+        """High entanglement (0.7) adds to the DPO base regularization."""
+        p = self._make_pipeline_with_insights(
+            detected_alignment_method="dpo",
+            entanglement_score=0.7,
+        )
+        p._derive_configuration()
+        # DPO base = 0.0, + 0.15 for high entanglement = 0.15
+        assert p.regularization == 0.15
+
+    def test_high_self_repair_more_passes(self):
+        """A self-repair estimate of 0.8 gives 3 refinement passes."""
+        p = self._make_pipeline_with_insights(
+            self_repair_estimate=0.8,
+        )
+        p._derive_configuration()
+        assert p.refinement_passes == 3
+
+    def test_moderate_self_repair_two_passes(self):
+        """A self-repair estimate of 0.5 gives 2 refinement passes."""
+        p = self._make_pipeline_with_insights(
+            self_repair_estimate=0.5,
+        )
+        p._derive_configuration()
+        assert p.refinement_passes == 2
+
+    def test_low_self_repair_one_pass(self):
+        """A self-repair estimate of 0.2 gives 1 refinement pass."""
+        p = self._make_pipeline_with_insights(
+            self_repair_estimate=0.2,
+        )
+        p._derive_configuration()
+        assert p.refinement_passes == 1
+
+    def test_cluster_layers_used(self):
+        """Cluster representative layers show up in ``recommended_layers``."""
+        p = self._make_pipeline_with_insights(
+            cluster_representative_layers=[5, 10, 15],
+            direction_clusters=[[3, 4, 5], [9, 10, 11], [14, 15, 16]],
+        )
+        # Random 64-dim direction for each of layers 0..19.
+        p.refusal_directions = {i: torch.randn(64) for i in range(20)}
+        p._derive_configuration()
+        # Should include all cluster layers
+        # NOTE(review): only layers 5 and 10 are asserted; 15 is not checked.
+        assert 5 in p._insights.recommended_layers
+        assert 10 in p._insights.recommended_layers
+
+    def test_entangled_layers_skipped(self):
+        """An entangled representative layer moves from recommended to skipped."""
+        p = self._make_pipeline_with_insights(
+            cluster_representative_layers=[5, 10, 15],
+            direction_clusters=[[3, 4, 5], [9, 10, 11], [14, 15, 16]],
+            entangled_layers=[10],
+        )
+        p._derive_configuration()
+        # Layer 10 should be skipped
+        assert 10 not in p._insights.recommended_layers
+        assert 10 in p._insights.skip_layers
+
+    def test_sparse_surgery_enabled_when_rsi_high(self):
+        """A sparsity index of 0.7 against a threshold of 0.5 enables sparse surgery."""
+        p = self._make_pipeline_with_insights(
+            mean_refusal_sparsity_index=0.7,
+        )
+        p._sparse_threshold = 0.5
+        p._derive_configuration()
+        assert p._insights.use_sparse_surgery is True
+
+    def test_sparse_surgery_disabled_when_rsi_low(self):
+        """A sparsity index of 0.3 against a threshold of 0.5 leaves sparse surgery off."""
+        p = self._make_pipeline_with_insights(
+            mean_refusal_sparsity_index=0.3,
+        )
+        p._sparse_threshold = 0.5
+        p._derive_configuration()
+        assert p._insights.use_sparse_surgery is False
+
+    def test_whitened_svd_for_multi_direction(self):
+        """More than one derived direction keeps whitened SVD enabled."""
+        p = self._make_pipeline_with_insights(
+            cone_is_polyhedral=True,
+            cone_dimensionality=2.5,
+        )
+        p._derive_configuration()
+        assert p.n_directions > 1
+        assert p.use_whitened_svd is True
+
+    def test_no_whitened_svd_for_single_direction(self):
+        """A single derived direction turns whitened SVD off."""
+        p = self._make_pipeline_with_insights(
+            cone_is_polyhedral=False,
+            cone_dimensionality=0.5,
+        )
+        p._derive_configuration()
+        # dim 0.5 → max(1, min(4, int(0.5+1))) = 1
+        assert p.n_directions == 1
+        assert p.use_whitened_svd is False
+
+
+# ---------------------------------------------------------------------------
+# Format report
+# ---------------------------------------------------------------------------
+
+class TestFormatInsights:
+    """Checks the text produced by ``InformedAbliterationPipeline.format_insights``."""
+
+    def test_format_default(self, insights):
+        """Default insights render the title, an UNKNOWN method and a LINEAR cone."""
+        text = InformedAbliterationPipeline.format_insights(insights)
+        assert "Analysis-Informed Pipeline" in text
+        assert "UNKNOWN" in text  # detected method
+        assert "LINEAR" in text  # cone type
+
+    def test_format_polyhedral(self):
+        """A polyhedral DPO insight renders upper-cased labels and a 2-decimal dimensionality."""
+        insights = AnalysisInsights(
+            detected_alignment_method="dpo",
+            alignment_confidence=0.85,
+            cone_is_polyhedral=True,
+            cone_dimensionality=3.5,
+            cluster_count=4,
+        )
+        text = InformedAbliterationPipeline.format_insights(insights)
+        assert "DPO" in text
+        assert "POLYHEDRAL" in text
+        assert "3.50" in text
+
+    def test_format_includes_derived_config(self, insights):
+        """The recommended settings appear as ``name: value`` pairs in the report."""
+        insights.recommended_n_directions = 6
+        insights.recommended_regularization = 0.2
+        insights.recommended_refinement_passes = 3
+        text = InformedAbliterationPipeline.format_insights(insights)
+        assert "n_directions: 6" in text
+        assert "regularization: 0.2" in text
+        assert "refinement_passes: 3" in text
+
+
+# ---------------------------------------------------------------------------
+# Edge cases
+# ---------------------------------------------------------------------------
+
+class TestEdgeCases:
+    """Boundary-condition checks for ``_derive_configuration``."""
+
+    def test_no_cluster_layers_falls_back(self):
+        """With no cluster representative layers, ``recommended_layers`` stays empty."""
+        p = InformedAbliterationPipeline(
+            model_name="test",
+            on_log=lambda m: None,
+        )
+        p._insights.cluster_representative_layers = []
+        p._derive_configuration()
+        assert p._insights.recommended_layers == []
+
+    def test_regularization_capped(self):
+        """CAI plus high entanglement stays at or below the 0.5 cap."""
+        p = InformedAbliterationPipeline(
+            model_name="test",
+            on_log=lambda m: None,
+        )
+        p._insights.detected_alignment_method = "cai"
+        p._insights.entanglement_score = 0.9
+        p._derive_configuration()
+        # CAI base = 0.2, + 0.15 = 0.35, capped at 0.5
+        # NOTE(review): only the upper bound is asserted, not the 0.35 value.
+        assert p.regularization <= 0.5
+
+    def test_all_layers_entangled_keeps_some(self):
+        """If all cluster layers are entangled, don't skip all of them."""
+        p = InformedAbliterationPipeline(
+            model_name="test",
+            on_log=lambda m: None,
+        )
+        p._insights.cluster_representative_layers = [5]
+        p._insights.direction_clusters = [[5]]
+        p._insights.entangled_layers = [5]
+        p._derive_configuration()
+        # Should NOT skip the only layer
+        assert 5 in p._insights.recommended_layers
+
+    def test_cone_dimensionality_bounds(self):
+        """Extreme cone dimensionality values are handled."""
+        # The same pipeline instance is reconfigured and re-derived twice.
+        p = InformedAbliterationPipeline(
+            model_name="test",
+            on_log=lambda m: None,
+        )
+        # Very high dimensionality
+        p._insights.cone_is_polyhedral = True
+        p._insights.cone_dimensionality = 10.0
+        p._derive_configuration()
+        assert p.n_directions <= 8  # capped
+
+        # Very low dimensionality
+        p._insights.cone_is_polyhedral = False
+        p._insights.cone_dimensionality = 0.1
+        p._derive_configuration()
+        assert p.n_directions >= 1  # at least 1
@@ -0,0 +1,172 @@
+"""Tests for logit lens refusal direction analysis."""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock
+
+import torch
+
+from obliteratus.analysis.logit_lens import (
+    RefusalLogitLens,
+    LogitLensResult,
+    MultiLayerLogitLensResult,
+    REFUSAL_TOKENS,
+    COMPLIANCE_TOKENS,
+)
+
+
+def _make_mock_model(hidden_dim=32, vocab_size=100):
+    """Create a mock model with LM head and layer norm.
+
+    Args:
+        hidden_dim: Width of the LM head input and of the layer-norm vectors.
+        vocab_size: Number of rows in the LM head weight.
+
+    Returns:
+        A ``MagicMock`` exposing ``lm_head.weight.data`` (random values) and
+        ``transformer.ln_f`` with unit ``weight.data`` and zero ``bias.data``.
+    """
+    model = MagicMock()
+
+    # LM head weight (vocab_size, hidden_dim)
+    lm_head = MagicMock()
+    lm_head.weight = MagicMock()
+    lm_head.weight.data = torch.randn(vocab_size, hidden_dim)
+    model.lm_head = lm_head
+
+    # Final LayerNorm
+    # Weight of ones and bias of zeros, i.e. an identity scale and shift.
+    ln_f = MagicMock()
+    ln_f.weight = MagicMock()
+    ln_f.weight.data = torch.ones(hidden_dim)
+    ln_f.bias = MagicMock()
+    ln_f.bias.data = torch.zeros(hidden_dim)
+    model.transformer = MagicMock()
+    model.transformer.ln_f = ln_f
+
+    return model
+
+
+def _make_mock_tokenizer(vocab_size=100):
+    """Create a mock tokenizer with reproducible ``decode``/``encode`` stubs.
+
+    Args:
+        vocab_size: Upper bound (exclusive) for the token IDs that ``encode``
+            returns.
+
+    Returns:
+        A ``MagicMock`` whose ``decode`` maps IDs to ``"tok_<id>"`` strings and
+        whose ``encode`` maps a text to a single-element ID list.
+    """
+    # Function-local import so the helper does not depend on the file's
+    # top-level import block.
+    import zlib
+
+    tokenizer = MagicMock()
+
+    def mock_decode(ids):
+        # A one-element list decodes to its bare ID; anything else is
+        # formatted as-is.
+        if isinstance(ids, list) and len(ids) == 1:
+            return f"tok_{ids[0]}"
+        return f"tok_{ids}"
+
+    def mock_encode(text, add_special_tokens=False):
+        # Return a deterministic token ID based on the text.
+        # CRC32 is used instead of the builtin hash(): str hashes are salted
+        # per process (PYTHONHASHSEED), so hash(text) would map the same text
+        # to a different token ID on every test run.
+        return [zlib.crc32(text.encode("utf-8")) % vocab_size]
+
+    tokenizer.decode = mock_decode
+    tokenizer.encode = mock_encode
+    return tokenizer
+
+
+class TestRefusalLogitLens:
+    """Exercises ``RefusalLogitLens`` against the mock model and tokenizer.
+
+    Directions are random 32-dim tensors, matching the mock model's default
+    ``hidden_dim``.
+    """
+
+    def test_basic_analysis(self):
+        """Should produce a LogitLensResult with expected fields."""
+        model = _make_mock_model()
+        tokenizer = _make_mock_tokenizer()
+        direction = torch.randn(32)
+
+        lens = RefusalLogitLens(top_k=10)
+        result = lens.analyze_direction(direction, model, tokenizer, layer_idx=5)
+
+        assert isinstance(result, LogitLensResult)
+        assert result.layer_idx == 5
+        # Both token lists are as long as the requested top_k.
+        assert len(result.top_promoted) == 10
+        assert len(result.top_suppressed) == 10
+        assert isinstance(result.refusal_specificity, float)
+        assert isinstance(result.logit_effect_entropy, float)
+        assert isinstance(result.refusal_compliance_gap, float)
+
+    def test_promoted_suppressed_ordering(self):
+        """Top promoted should have higher logit boost than top suppressed."""
+        model = _make_mock_model()
+        tokenizer = _make_mock_tokenizer()
+        direction = torch.randn(32)
+
+        lens = RefusalLogitLens(top_k=5)
+        result = lens.analyze_direction(direction, model, tokenizer)
+
+        # Promoted tokens should have positive-ish values
+        # Suppressed tokens should have negative-ish values
+        # Entries are unpacked as (token, value) pairs.
+        max_promoted = max(v for _, v in result.top_promoted)
+        min_suppressed = min(v for _, v in result.top_suppressed)
+        assert max_promoted > min_suppressed
+
+    def test_multi_layer_analysis(self):
+        """Should analyze multiple layers."""
+        model = _make_mock_model()
+        tokenizer = _make_mock_tokenizer()
+        directions = {0: torch.randn(32), 1: torch.randn(32), 2: torch.randn(32)}
+
+        lens = RefusalLogitLens(top_k=5)
+        result = lens.analyze_all_layers(directions, model, tokenizer)
+
+        assert isinstance(result, MultiLayerLogitLensResult)
+        assert len(result.per_layer) == 3
+        assert result.strongest_refusal_layer in [0, 1, 2]
+        assert result.peak_specificity_layer in [0, 1, 2]
+
+    def test_strong_layers_filter(self):
+        """Should only analyze specified strong layers."""
+        model = _make_mock_model()
+        tokenizer = _make_mock_tokenizer()
+        directions = {i: torch.randn(32) for i in range(10)}
+
+        lens = RefusalLogitLens(top_k=5)
+        result = lens.analyze_all_layers(
+            directions, model, tokenizer, strong_layers=[2, 5]
+        )
+        assert set(result.per_layer.keys()) == {2, 5}
+
+    def test_handles_unnormalized_direction(self):
+        """Should handle non-unit directions."""
+        model = _make_mock_model()
+        tokenizer = _make_mock_tokenizer()
+        direction = torch.randn(32) * 100.0  # large magnitude
+
+        lens = RefusalLogitLens(top_k=5)
+        result = lens.analyze_direction(direction, model, tokenizer)
+        # Should still produce valid results
+        assert len(result.top_promoted) == 5
+
+    def test_format_report(self):
+        """Format report should produce readable output."""
+        model = _make_mock_model()
+        tokenizer = _make_mock_tokenizer()
+        directions = {0: torch.randn(32), 1: torch.randn(32)}
+
+        lens = RefusalLogitLens(top_k=5)
+        result = lens.analyze_all_layers(directions, model, tokenizer)
+        # format_report is called on the class, not on the instance.
+        report = RefusalLogitLens.format_report(result)
+        assert "Logit Lens" in report
+        assert "Layer 0:" in report
+
+    def test_empty_directions(self):
+        """Should handle empty input gracefully."""
+        model = _make_mock_model()
+        tokenizer = _make_mock_tokenizer()
+
+        lens = RefusalLogitLens(top_k=5)
+        result = lens.analyze_all_layers({}, model, tokenizer)
+        assert len(result.per_layer) == 0
+
+    def test_token_lists_nonempty(self):
+        """Refusal and compliance token lists should have entries."""
+        # Each list must hold more than 10 tokens.
+        assert len(REFUSAL_TOKENS) > 10
+        assert len(COMPLIANCE_TOKENS) > 10
+
+    def test_entropy_nonnegative(self):
+        """Logit effect entropy should be non-negative."""
+        model = _make_mock_model()
+        tokenizer = _make_mock_tokenizer()
+        direction = torch.randn(32)
+
+        lens = RefusalLogitLens(top_k=5)
+        result = lens.analyze_direction(direction, model, tokenizer)
+        assert result.logit_effect_entropy >= 0
+
+    def test_2d_direction_input(self):
+        """Should handle 2D direction input (unsqueezed)."""
+        model = _make_mock_model()
+        tokenizer = _make_mock_tokenizer()
+        direction = torch.randn(1, 32)
+
+        lens = RefusalLogitLens(top_k=5)
+        result = lens.analyze_direction(direction, model, tokenizer)
+        assert len(result.top_promoted) == 5
@@ -0,0 +1,60 @@
+"""Tests for evaluation metrics."""
+
+from __future__ import annotations
+
+
+import torch
+
+from obliteratus.evaluation.metrics import accuracy, f1_score_metric, perplexity
+
+
+class TestPerplexity:
+    """Checks ``perplexity`` on near-perfect and random logits."""
+
+    def test_perfect_prediction(self):
+        """Logits that single out the next label give a perplexity below 2."""
+        # Create logits that strongly predict the correct next token
+        vocab_size = 10
+        seq_len = 5
+        batch_size = 1
+
+        labels = torch.tensor([[0, 1, 2, 3, 4]])
+        logits = torch.full((batch_size, seq_len, vocab_size), -100.0)
+        # Set high logit for the correct next token
+        # Position t is scored against labels[t + 1]; the last position is
+        # left at -100.0 everywhere. Presumably ``perplexity`` applies the
+        # same one-step shift internally -- TODO confirm.
+        for t in range(seq_len - 1):
+            logits[0, t, labels[0, t + 1]] = 100.0
+
+        ppl = perplexity(logits, labels)
+        assert ppl < 2.0, f"Expected near-1 perplexity, got {ppl}"
+
+    def test_random_prediction_higher(self):
+        """Seeded random logits over a 100-token vocabulary give a perplexity above 10."""
+        vocab_size = 100
+        seq_len = 20
+        batch_size = 2
+
+        # Fixed seed keeps the random logits and labels reproducible.
+        torch.manual_seed(42)
+        logits = torch.randn(batch_size, seq_len, vocab_size)
+        labels = torch.randint(0, vocab_size, (batch_size, seq_len))
+
+        ppl = perplexity(logits, labels)
+        assert ppl > 10, f"Random logits should yield high perplexity, got {ppl}"
+
+
+class TestAccuracy:
+    """Checks ``accuracy`` on small lists of integer labels."""
+
+    def test_perfect(self):
+        """Identical lists score 1.0."""
+        assert accuracy([1, 2, 3], [1, 2, 3]) == 1.0
+
+    def test_zero(self):
+        """Lists with no matching position score 0.0."""
+        assert accuracy([1, 2, 3], [4, 5, 6]) == 0.0
+
+    def test_partial(self):
+        """Two matching positions out of four score 0.5."""
+        assert accuracy([1, 2, 3, 4], [1, 2, 0, 0]) == 0.5
+
+    def test_empty(self):
+        """Two empty lists score 0.0."""
+        assert accuracy([], []) == 0.0
+
+
+class TestF1:
+    """Checks ``f1_score_metric`` on binary label lists."""
+
+    def test_perfect(self):
+        """Identical label lists score 1.0."""
+        assert f1_score_metric([0, 1, 0, 1], [0, 1, 0, 1]) == 1.0
+
+    def test_zero(self):
+        """All-zeros against all-ones scores 0.0."""
+        score = f1_score_metric([0, 0, 0, 0], [1, 1, 1, 1])
+        assert score == 0.0
@@ -0,0 +1,85 @@
+"""Smoke tests verifying all new modules are importable from package level."""
+
+from __future__ import annotations
+
+
+class TestTopLevelImports:
+    """Verify obliteratus top-level exports."""
+
+    # Imports live inside each test so a missing export fails only that test.
+
+    def test_set_seed(self):
+        """``set_seed`` is importable from the package root and callable."""
+        from obliteratus import set_seed
+        assert callable(set_seed)
+
+    def test_run_sweep(self):
+        """``run_sweep`` is importable from the package root and callable."""
+        from obliteratus import run_sweep
+        assert callable(run_sweep)
+
+    def test_sweep_config(self):
+        """``SweepConfig`` accepts a model name and a sweep-parameter grid."""
+        from obliteratus import SweepConfig
+        cfg = SweepConfig(
+            model_name="test",
+            sweep_params={"n_directions": [1, 2]},
+        )
+        assert cfg.model_name == "test"
+
+    def test_sweep_result(self):
+        """``SweepResult`` can be built from empty metric containers."""
+        from obliteratus import SweepResult
+        r = SweepResult(
+            params={"n_directions": 1},
+            seed=42,
+            quality_metrics={},
+            stage_durations={},
+            strong_layers=[],
+        )
+        assert r.seed == 42
+
+
+class TestEvaluationImports:
+    """Verify evaluation subpackage exports."""
+
+    def test_refusal_rate_with_ci(self):
+        """The single response "Sure, here you go." gives rate 0.0 over one sample."""
+        from obliteratus.evaluation import refusal_rate_with_ci
+        result = refusal_rate_with_ci(["Sure, here you go."], mode="combined")
+        assert result["rate"] == 0.0
+        assert result["n_samples"] == 1
+
+    def test_random_direction_ablation(self):
+        """``random_direction_ablation`` is importable and callable."""
+        from obliteratus.evaluation import random_direction_ablation
+        assert callable(random_direction_ablation)
+
+    def test_direction_specificity_test(self):
+        """``direction_specificity_test`` is importable and callable."""
+        from obliteratus.evaluation import direction_specificity_test
+        assert callable(direction_specificity_test)
+
+    def test_run_benchmarks(self):
+        """``run_benchmarks`` is importable and callable."""
+        from obliteratus.evaluation import run_benchmarks
+        assert callable(run_benchmarks)
+
+    def test_compare_models(self):
+        """``compare_models`` is importable and callable."""
+        from obliteratus.evaluation import compare_models
+        assert callable(compare_models)
+
+
+class TestDirectImports:
+    """Verify direct module imports still work."""
+
+    def test_reproducibility(self):
+        """Re-seeding with the same seed reproduces the same ``torch.randn`` draw."""
+        from obliteratus.reproducibility import set_seed
+        import torch
+        set_seed(999, deterministic=False)
+        a = torch.randn(10)
+        set_seed(999, deterministic=False)
+        b = torch.randn(10)
+        # Exact equality is required, not approximate closeness.
+        assert torch.equal(a, b)
+
+    def test_baselines(self):
+        """``BaselineResult`` is importable from the baselines module."""
+        from obliteratus.evaluation.baselines import (
+            BaselineResult,
+        )
+        assert BaselineResult is not None
+
+    def test_lm_eval_integration(self):
+        """``run_benchmarks`` is importable from its defining module and callable."""
+        from obliteratus.evaluation.lm_eval_integration import (
+            run_benchmarks,
+        )
+        assert callable(run_benchmarks)
@@ -0,0 +1,672 @@
+"""Tests for the five new analysis modules:
+  1. Tuned Lens (learned-affine logit lens variant)
+  2. Activation Patching (real interchange intervention)
+  3. Enhanced SAE Decomposition Pipeline
+  4. Wasserstein-Optimal Direction Extraction
+  5. Bayesian-Optimized Kernel Projection
+"""
+
+from __future__ import annotations
+
+
+import pytest
+import torch
+import torch.nn as nn
+
+from obliteratus.analysis.tuned_lens import (
+    TunedLensTrainer,
+    TunedLensProbe,
+    RefusalTunedLens,
+    TunedLensResult,
+    MultiLayerTunedLensResult,
+)
+from obliteratus.analysis.activation_patching import (
+    ActivationPatcher,
+    PatchingSite,
+    ActivationPatchingResult,
+)
+from obliteratus.analysis.sae_abliteration import (
+    SAEDecompositionPipeline,
+    SAEDecompositionResult,
+    FeatureClusterResult,
+)
+from obliteratus.analysis.wasserstein_optimal import (
+    WassersteinOptimalExtractor,
+    WassersteinDirectionResult,
+    WassersteinComparisonResult,
+    MultiLayerWassersteinResult,
+)
+from obliteratus.analysis.bayesian_kernel_projection import (
+    BayesianKernelProjection,
+    BayesianOptimizationResult,
+    ProjectionConfig,
+)
+
+
+# ---------------------------------------------------------------------------
+#  Helpers
+# ---------------------------------------------------------------------------
+
+def _make_activations(
+    hidden_dim=32, n_per_class=20, separation=2.0, seed=42,
+):
+    """Build synthetic harmful/harmless activations around a planted direction.
+
+    Returns ``(harmful, harmless, direction)``: two lists of ``n_per_class``
+    vectors of size ``hidden_dim`` plus the unit-norm direction along which
+    every harmful sample is shifted by ``separation``.
+    """
+    torch.manual_seed(seed)
+    planted = torch.randn(hidden_dim)
+    planted = planted / planted.norm()
+
+    # Draw order is part of the fixture: all harmful samples are sampled
+    # before any harmless one.
+    harmful = []
+    for _ in range(n_per_class):
+        harmful.append(torch.randn(hidden_dim) * 0.3 + separation * planted)
+
+    harmless = []
+    for _ in range(n_per_class):
+        harmless.append(torch.randn(hidden_dim) * 0.3)
+
+    return harmful, harmless, planted
+
+
+def _make_multilayer_activations(
+    n_layers=6, hidden_dim=32, n_per_class=20, separation=2.0, seed=42,
+):
+    """Build per-layer harmful/harmless activations with planted directions.
+
+    Every layer gets its own random unit direction.  Layers ``1`` through
+    ``n_layers - 2`` shift their harmful samples by ``separation`` along that
+    direction; the remaining layers use a weak shift of ``0.3``.
+
+    Returns ``(harmful_acts, harmless_acts, directions)``, three dicts keyed
+    by layer index.
+    """
+    torch.manual_seed(seed)
+
+    def _noise():
+        return torch.randn(hidden_dim) * 0.3
+
+    harmful_acts, harmless_acts, directions = {}, {}, {}
+    last_strong_layer = n_layers - 2
+
+    for layer in range(n_layers):
+        unit = torch.randn(hidden_dim)
+        unit = unit / unit.norm()
+        directions[layer] = unit
+
+        if 1 <= layer <= last_strong_layer:
+            strength = separation
+        else:
+            strength = 0.3
+
+        # Harmful samples are drawn before harmless ones within each layer.
+        harmful_acts[layer] = [
+            _noise() + strength * unit for _ in range(n_per_class)
+        ]
+        harmless_acts[layer] = [_noise() for _ in range(n_per_class)]
+
+    return harmful_acts, harmless_acts, directions
+
+
+class FakeTokenizer:
+    """Fake tokenizer that maps strings to reproducible token IDs.
+
+    Each string is encoded as a single token whose ID is the CRC-32 checksum
+    of its UTF-8 bytes reduced modulo ``vocab_size``.  The built-in ``hash()``
+    is deliberately avoided: string hashes are salted per interpreter process,
+    so IDs derived from it would differ from one test run to the next.
+    """
+
+    def __init__(self, vocab_size=100):
+        self.vocab_size = vocab_size
+
+    def encode(self, text, add_special_tokens=False):
+        """Return a one-element list holding the token ID for ``text``.
+
+        ``add_special_tokens`` is accepted for call compatibility and ignored.
+        """
+        import zlib  # local import keeps this helper self-contained
+
+        return [zlib.crc32(text.encode("utf-8")) % self.vocab_size]
+
+    def decode(self, ids):
+        """Render the first ID in ``ids`` as the placeholder string ``tok_<id>``."""
+        return f"tok_{ids[0]}"
+
+
+class FakeModel(nn.Module):
+    """Minimal stand-in model exposing ``lm_head``, ``transformer.ln_f`` and ``transformer.h``."""
+
+    def __init__(self, hidden_dim=32, vocab_size=100, n_layers=4):
+        super().__init__()
+        self.hidden_dim = hidden_dim
+        self.vocab_size = vocab_size
+        self.n_layers = n_layers
+
+        # Creation order is kept (head first, then the layer stack) so that a
+        # fixed torch seed yields the same parameter values.
+        self.lm_head = nn.Linear(hidden_dim, vocab_size, bias=False)
+        self.transformer = nn.Module()
+        self.transformer.ln_f = nn.LayerNorm(hidden_dim)
+        layer_stack = []
+        for _ in range(n_layers):
+            layer_stack.append(nn.Linear(hidden_dim, hidden_dim))
+        self.transformer.h = nn.ModuleList(layer_stack)
+
+    def forward(self, input_ids):
+        """Return an ad-hoc object whose ``logits`` attribute holds fake logits.
+
+        Only the shape of ``input_ids`` is used: a random hidden state of that
+        batch/sequence size is pushed through the residual linear layers, the
+        final layer norm and ``lm_head``.
+        """
+        n_batch, n_tokens = input_ids.shape
+        hidden = torch.randn(n_batch, n_tokens, self.hidden_dim)
+        for block in self.transformer.h:
+            hidden = block(hidden) + hidden
+        normed = self.transformer.ln_f(hidden)
+        return type('Output', (), {'logits': self.lm_head(normed)})()
+
+
+# ===========================================================================
+#  Tests: Tuned Lens
+# ===========================================================================
+
+class TestTunedLensTrainer:
+    """Exercise probe training on small random activation matrices."""
+
+    def test_train_single_probe(self):
+        """A single trained probe has the expected type, layer index and shapes."""
+        dim, count = 16, 30
+
+        source = torch.randn(count, dim)
+        target = source + torch.randn(count, dim) * 0.1
+
+        probe = TunedLensTrainer(dim, n_epochs=20).train_probe(
+            source, target, layer_idx=3,
+        )
+
+        assert isinstance(probe, TunedLensProbe)
+        assert probe.layer_idx == 3
+        assert probe.weight.shape == (dim, dim)
+        assert probe.bias.shape == (dim,)
+        # The target is a lightly perturbed copy of the input, so the loss
+        # is expected to end up small.
+        assert probe.train_loss < 1.0
+
+    def test_train_all_layers(self):
+        """Training over a dict of layers returns one probe per layer index."""
+        dim, count, layer_total = 16, 20, 4
+
+        per_layer = {}
+        for layer in range(layer_total):
+            per_layer[layer] = torch.randn(count, dim)
+        target = torch.randn(count, dim)
+
+        trainer = TunedLensTrainer(dim, n_epochs=10)
+        probes = trainer.train_all_layers(per_layer, target)
+
+        assert len(probes) == 4
+        for layer in range(layer_total):
+            assert layer in probes
+            assert probes[layer].weight.shape == (dim, dim)
+
+    def test_probe_near_identity_for_final_layer(self):
+        """Mapping activations onto themselves should give a near-identity weight."""
+        dim, count = 16, 50
+
+        acts = torch.randn(count, dim)
+        probe = TunedLensTrainer(dim, n_epochs=50).train_probe(
+            acts, acts, layer_idx=0,
+        )
+
+        # Distance between the learned weight and the identity matrix.
+        gap = (probe.weight - torch.eye(dim)).norm().item()
+        assert gap < 1.0
+
+
+class TestRefusalTunedLens:
+    """Exercise RefusalTunedLens against the fake model and tokenizer."""
+
+    def test_analyze_direction(self):
+        """Analyzing one direction returns a populated TunedLensResult."""
+        dim, vocab = 32, 100
+
+        model = FakeModel(dim, vocab)
+        tokenizer = FakeTokenizer(vocab)
+
+        direction = torch.randn(dim)
+        # Probe weight is the identity plus a small random perturbation.
+        near_identity = torch.eye(dim) + torch.randn(dim, dim) * 0.01
+        probe = TunedLensProbe(
+            layer_idx=2,
+            weight=near_identity,
+            bias=torch.zeros(dim),
+            train_loss=0.01,
+        )
+
+        result = RefusalTunedLens(top_k=10).analyze_direction(
+            direction, probe, model, tokenizer,
+        )
+
+        assert isinstance(result, TunedLensResult)
+        assert result.layer_idx == 2
+        assert len(result.top_promoted) <= 10
+        assert len(result.top_suppressed) <= 10
+        assert isinstance(result.correction_magnitude, float)
+        assert result.correction_magnitude >= 0
+
+    def test_analyze_all_layers(self):
+        """Analyzing four layers returns one per-layer result for each of them."""
+        dim, vocab, layer_total = 32, 100, 4
+
+        model = FakeModel(dim, vocab)
+        tokenizer = FakeTokenizer(vocab)
+
+        directions = {}
+        for layer in range(layer_total):
+            directions[layer] = torch.randn(dim)
+
+        probes = {}
+        for layer in range(layer_total):
+            probes[layer] = TunedLensProbe(
+                layer_idx=layer,
+                weight=torch.eye(dim),
+                bias=torch.zeros(dim),
+                train_loss=0.01,
+            )
+
+        lens = RefusalTunedLens(top_k=5)
+        result = lens.analyze_all_layers(directions, probes, model, tokenizer)
+
+        assert isinstance(result, MultiLayerTunedLensResult)
+        assert len(result.per_layer) == 4
+        assert result.strongest_refusal_layer in range(4)
+
+    def test_compare_with_logit_lens(self):
+        """Gaps that rank layers identically should give an agreement of about 1.0."""
+        logit_gaps = {0: 0.1, 1: 0.5, 2: 0.3, 3: 0.8}
+
+        per_layer = {}
+        for layer, gap in logit_gaps.items():
+            per_layer[layer] = TunedLensResult(
+                layer_idx=layer,
+                top_promoted=[], top_suppressed=[],
+                refusal_token_mean_boost=0.0,
+                compliance_token_mean_boost=0.0,
+                # Scaling by a constant keeps the layer ranking unchanged.
+                refusal_compliance_gap=gap * 1.1,
+                correction_magnitude=0.1,
+            )
+
+        tuned_result = MultiLayerTunedLensResult(
+            per_layer=per_layer,
+            probes={},
+            strongest_refusal_layer=3,
+            peak_gap_layer=3,
+            mean_refusal_compliance_gap=0.5,
+            logit_lens_agreement=0.0,
+        )
+
+        agreement = RefusalTunedLens.compare_with_logit_lens(tuned_result, logit_gaps)
+        # Identical ranking, so the correlation is expected to be 1.0.
+        assert agreement == pytest.approx(1.0, abs=0.01)
+
+    def test_format_report(self):
+        """The report for an empty result names the lens and notes the missing layers."""
+        empty = MultiLayerTunedLensResult(
+            per_layer={},
+            probes={},
+            strongest_refusal_layer=0,
+            peak_gap_layer=0,
+            mean_refusal_compliance_gap=0.0,
+            logit_lens_agreement=0.0,
+        )
+        report = RefusalTunedLens.format_report(empty)
+        for expected in ("Tuned Lens", "No layers analyzed"):
+            assert expected in report
+
+
+# ===========================================================================
+#  Tests: Activation Patching
+# ===========================================================================
+
+class TestActivationPatcher:
+    """Exercise PatchingSite construction and patch sweeps on the fake model."""
+
+    @staticmethod
+    def _model_and_inputs():
+        """Return a 4-layer fake model plus random clean and corrupted token IDs."""
+        model = FakeModel(32, vocab_size=100, n_layers=4)
+        clean_ids = torch.randint(0, 100, (1, 10))
+        corrupted_ids = torch.randint(0, 100, (1, 10))
+        return model, clean_ids, corrupted_ids
+
+    def test_patching_site_creation(self):
+        """A site built without a head index reports ``head_idx`` as None."""
+        site = PatchingSite(layer_idx=3, component="residual")
+        assert site.layer_idx == 3
+        assert site.component == "residual"
+        assert site.head_idx is None
+
+    def test_patching_site_with_head(self):
+        """An explicit head index is stored on the site."""
+        site = PatchingSite(layer_idx=2, component="attn_head", head_idx=5)
+        assert site.head_idx == 5
+
+    def test_patch_sweep_with_model(self):
+        """Test full patching sweep on fake model."""
+        model, clean_ids, corrupted_ids = self._model_and_inputs()
+
+        patcher = ActivationPatcher(significance_threshold=0.05)
+        result = patcher.patch_sweep(
+            model, clean_ids, corrupted_ids,
+            mode="noising",
+        )
+
+        assert isinstance(result, ActivationPatchingResult)
+        assert result.patching_mode == "noising"
+        assert result.n_layers == 4
+        assert len(result.effects) > 0
+        assert isinstance(result.circuit_fraction, float)
+        assert 0.0 <= result.circuit_fraction <= 1.0
+
+    def test_patch_sweep_denoising(self):
+        """A sweep requested in denoising mode reports that mode on its result."""
+        model, clean_ids, corrupted_ids = self._model_and_inputs()
+
+        result = ActivationPatcher().patch_sweep(
+            model, clean_ids, corrupted_ids,
+            mode="denoising",
+        )
+
+        assert result.patching_mode == "denoising"
+
+    def test_custom_metric(self):
+        """A user-supplied metric function is accepted by the patcher."""
+        model, clean_ids, corrupted_ids = self._model_and_inputs()
+
+        def custom_metric(logits):
+            return logits.sum().item()
+
+        patcher = ActivationPatcher(metric_fn=custom_metric)
+        result = patcher.patch_sweep(model, clean_ids, corrupted_ids)
+
+        assert isinstance(result, ActivationPatchingResult)
+        assert isinstance(result.clean_baseline, float)
+
+    def test_format_report(self):
+        """The report names the analysis and the patching mode."""
+        fields = dict(
+            n_layers=4,
+            n_sites=4,
+            patching_mode="noising",
+            effects=[],
+            clean_baseline=1.0,
+            corrupted_baseline=0.0,
+            total_effect=1.0,
+            significant_sites=[],
+            circuit_fraction=0.0,
+            top_causal_layers=[],
+        )
+        report = ActivationPatcher.format_report(ActivationPatchingResult(**fields))
+        assert "Activation Patching" in report
+        assert "noising" in report
+
+
+# ===========================================================================
+#  Tests: Enhanced SAE Decomposition Pipeline
+# ===========================================================================
+
+class TestSAEDecompositionPipeline:
+    """Run the SAE decomposition pipeline on small synthetic activations."""
+
+    def test_basic_pipeline(self):
+        """The result carries one entry per requested top-k feature."""
+        harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=30, separation=2.0)
+
+        top_k = 8
+        pipeline = SAEDecompositionPipeline(
+            expansion=2, n_epochs=10, top_k_features=top_k, n_clusters=3,
+        )
+        result = pipeline.run(harmful, harmless, layer_idx=0)
+
+        assert isinstance(result, SAEDecompositionResult)
+        assert result.layer_idx == 0
+        assert result.sae is not None
+        assert result.refusal_features.n_refusal_features == top_k
+        per_feature_fields = (
+            result.feature_sparsity,
+            result.feature_monosemanticity,
+            result.per_feature_refusal_reduction,
+            result.cumulative_refusal_reduction,
+        )
+        for values in per_feature_fields:
+            assert len(values) == top_k
+        assert 0.0 <= result.raw_direction_overlap <= 1.0
+
+    def test_feature_clustering(self):
+        """Clustering yields the requested cluster count with in-range labels."""
+        harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=30)
+
+        result = SAEDecompositionPipeline(
+            expansion=2, n_epochs=10, top_k_features=8, n_clusters=3,
+        ).run(harmful, harmless)
+
+        clusters = result.feature_clusters
+        assert clusters is not None
+        assert isinstance(clusters, FeatureClusterResult)
+        assert clusters.n_clusters == 3
+        assert len(clusters.cluster_labels) == 8
+        assert all(0 <= lbl < 3 for lbl in clusters.cluster_labels)
+        assert clusters.cluster_directions.shape[0] == 3
+        assert -1.0 <= clusters.silhouette_score <= 1.0
+
+    def test_cumulative_reduction_monotonic(self):
+        """Cumulative refusal reduction never decreases (up to a 1e-6 tolerance)."""
+        harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=30, separation=3.0)
+
+        pipeline = SAEDecompositionPipeline(expansion=2, n_epochs=10, top_k_features=6)
+        cumulative = pipeline.run(harmful, harmless).cumulative_refusal_reduction
+
+        # Compare each value with its predecessor.
+        for previous, current in zip(cumulative, cumulative[1:]):
+            assert current >= previous - 1e-6
+
+    def test_format_report(self):
+        """The report contains its title and the variance-explained line."""
+        harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=20)
+        pipeline = SAEDecompositionPipeline(expansion=2, n_epochs=5, top_k_features=4, n_clusters=2)
+        result = pipeline.run(harmful, harmless)
+
+        report = SAEDecompositionPipeline.format_report(result)
+        for expected in ("SAE Feature Decomposition", "Variance explained"):
+            assert expected in report
+
+
+# ===========================================================================
+#  Tests: Wasserstein-Optimal Direction Extraction
+# ===========================================================================
+
+class TestWassersteinOptimalExtractor:
+    """Exercise Wasserstein-optimal direction extraction on synthetic activations."""
+
+    @staticmethod
+    def _diff_in_means(harmful, harmless):
+        """Return the unit-norm difference between the two class means."""
+        harmful_mean = torch.stack(harmful).float().mean(0)
+        harmless_mean = torch.stack(harmless).float().mean(0)
+        delta = harmful_mean - harmless_mean
+        return delta / delta.norm()
+
+    def test_basic_extraction(self):
+        """Extraction returns a unit direction and non-negative cost components."""
+        harmful, harmless, _ = _make_activations(
+            hidden_dim=32, n_per_class=30, separation=3.0,
+        )
+
+        result = WassersteinOptimalExtractor().extract(harmful, harmless, layer_idx=0)
+
+        assert isinstance(result, WassersteinDirectionResult)
+        assert result.layer_idx == 0
+        assert result.direction.shape == (32,)
+        assert abs(result.direction.norm().item() - 1.0) < 1e-5
+        assert result.wasserstein_cost >= 0
+        assert result.mean_shift_component >= 0
+        assert result.bures_component >= 0
+        assert result.cost_effectiveness_ratio >= 0
+
+    def test_direction_captures_signal(self):
+        """Wasserstein direction should have non-trivial refusal projection."""
+        harmful, harmless, planted_dir = _make_activations(
+            hidden_dim=32, n_per_class=30, separation=3.0,
+        )
+
+        result = WassersteinOptimalExtractor().extract(harmful, harmless)
+
+        # The extracted direction must not be (nearly) orthogonal to the
+        # planted one.
+        alignment = abs((result.direction @ planted_dir).item())
+        assert alignment > 0.1
+
+    def test_extract_all_layers(self):
+        """Multi-layer extraction reports one result per layer and a valid best layer."""
+        harmful_acts, harmless_acts, _ = _make_multilayer_activations(
+            n_layers=4, hidden_dim=16, n_per_class=20,
+        )
+
+        result = WassersteinOptimalExtractor().extract_all_layers(
+            harmful_acts, harmless_acts,
+        )
+
+        assert isinstance(result, MultiLayerWassersteinResult)
+        assert len(result.per_layer) == 4
+        assert result.best_layer in range(4)
+        assert result.mean_cost_ratio >= 0
+
+    def test_compare_with_alternatives(self):
+        """Comparison fills in cost ratios and cosines for both alternatives."""
+        harmful, harmless, planted_dir = _make_activations(
+            hidden_dim=16, n_per_class=30, separation=3.0,
+        )
+
+        extractor = WassersteinOptimalExtractor()
+        w_result = extractor.extract(harmful, harmless)
+
+        # The planted direction stands in for the "Fisher" alternative; the
+        # second alternative is the diff-in-means direction.
+        dim_dir = self._diff_in_means(harmful, harmless)
+
+        comparison = extractor.compare_with_alternatives(
+            w_result, harmful, harmless,
+            fisher_direction=planted_dir,
+            dim_direction=dim_dir,
+        )
+
+        assert isinstance(comparison, WassersteinComparisonResult)
+        assert comparison.wasserstein_cost_ratio >= 0
+        assert comparison.fisher_cost_ratio is not None
+        assert comparison.dim_cost_ratio is not None
+        assert 0 <= comparison.cosine_wasserstein_fisher <= 1
+        assert 0 <= comparison.cosine_wasserstein_dim <= 1
+
+    def test_wasserstein_lower_cost_than_dim(self):
+        """Wasserstein-optimal should have lower cost ratio than diff-in-means."""
+        harmful, harmless, _ = _make_activations(
+            hidden_dim=32, n_per_class=50, separation=2.0,
+        )
+
+        extractor = WassersteinOptimalExtractor()
+        w_result = extractor.extract(harmful, harmless)
+        dim_dir = self._diff_in_means(harmful, harmless)
+
+        comparison = extractor.compare_with_alternatives(
+            w_result, harmful, harmless, dim_direction=dim_dir,
+        )
+
+        # Lower or equal by construction; 1e-4 absorbs numerical noise.
+        assert comparison.wasserstein_cost_ratio <= comparison.dim_cost_ratio + 1e-4
+
+    def test_format_report(self):
+        """The report mentions the method name and the cost ratio."""
+        harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=20)
+        harmful_by_layer = {0: harmful, 1: harmful}
+        harmless_by_layer = {0: harmless, 1: harmless}
+
+        result = WassersteinOptimalExtractor().extract_all_layers(
+            harmful_by_layer, harmless_by_layer,
+        )
+        report = WassersteinOptimalExtractor.format_report(result)
+        assert "Wasserstein" in report
+        assert "cost ratio" in report.lower()
+
+
+# ===========================================================================
+#  Tests: Bayesian-Optimized Kernel Projection
+# ===========================================================================
+
+class TestBayesianKernelProjection:
+    """Exercise the Bayesian kernel-projection optimizer on synthetic activations."""
+
+    def test_basic_optimization(self):
+        """A 30-trial run records every trial and reports in-range summary metrics."""
+        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
+            n_layers=6, hidden_dim=16, n_per_class=20,
+        )
+
+        optimizer = BayesianKernelProjection(
+            n_trials=30, refusal_weight=0.6, distortion_weight=0.4,
+        )
+        result = optimizer.optimize(harmful_acts, harmless_acts, directions)
+
+        assert isinstance(result, BayesianOptimizationResult)
+        assert result.n_trials == 30
+        assert result.best_score >= 0
+        assert 0 <= result.best_refusal_reduction <= 1.0
+        assert result.best_harmless_distortion >= 0
+        assert len(result.all_trials) == 30
+
+    def test_best_config_structure(self):
+        """The best configuration is a ProjectionConfig with sane field values."""
+        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
+            n_layers=4, hidden_dim=16, n_per_class=15,
+        )
+
+        optimizer = BayesianKernelProjection(n_trials=20)
+        result = optimizer.optimize(harmful_acts, harmless_acts, directions)
+
+        config = result.best_config
+        assert isinstance(config, ProjectionConfig)
+        assert config.layer_range[0] <= config.layer_range[1]
+        assert config.n_directions >= 1
+        assert 0 <= config.regularization <= 0.5
+
+    def test_pareto_front(self):
+        """The Pareto front is non-empty and its distortion never increases."""
+        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
+            n_layers=6, hidden_dim=16, n_per_class=20,
+        )
+
+        optimizer = BayesianKernelProjection(n_trials=50)
+        result = optimizer.optimize(harmful_acts, harmless_acts, directions)
+
+        # Pareto front should have at least 1 entry
+        assert len(result.pareto_configs) >= 1
+
+        # Pareto entries should be non-dominated.  The front is expected to be
+        # sorted by decreasing refusal reduction, so each entry must have
+        # distortion greater than or equal to that of the next entry;
+        # otherwise it would dominate its successor.
+        front = result.pareto_configs
+        for current, following in zip(front, front[1:]):
+            assert (
+                current.harmless_distortion
+                >= following.harmless_distortion - 1e-8
+            )
+
+    def test_layer_importance(self):
+        """Every one of the six layers gets an importance value within [0, 1]."""
+        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
+            n_layers=6, hidden_dim=16, n_per_class=20,
+        )
+
+        optimizer = BayesianKernelProjection(n_trials=50)
+        result = optimizer.optimize(harmful_acts, harmless_acts, directions)
+
+        assert len(result.layer_importance) == 6
+        for imp in result.layer_importance.values():
+            assert 0 <= imp <= 1.0
+
+    def test_tpe_improves_over_random(self):
+        """TPE phase should produce better configs than random exploration."""
+        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
+            n_layers=6, hidden_dim=16, n_per_class=20,
+        )
+
+        optimizer = BayesianKernelProjection(n_trials=60, seed=42)
+        result = optimizer.optimize(harmful_acts, harmless_acts, directions)
+
+        # Compare the best score among the first 20 trials (taken here to be
+        # the random-exploration phase) with the best among the last 20 (TPE).
+        best_random = min(t.combined_score for t in result.all_trials[:20])
+        best_tpe = min(t.combined_score for t in result.all_trials[-20:])
+
+        # TPE should find at least as good (lower = better)
+        # This is probabilistic so we allow some slack
+        assert best_tpe <= best_random + 0.3
+
+    def test_empty_input(self):
+        """Optimizing over empty inputs runs no trials and reports a zero score."""
+        optimizer = BayesianKernelProjection(n_trials=10)
+        result = optimizer.optimize({}, {}, {})
+
+        assert result.n_trials == 0
+        assert result.best_score == 0.0
+
+    def test_format_report(self):
+        """The report mentions the method, the Pareto front and layer importance."""
+        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
+            n_layers=4, hidden_dim=16, n_per_class=15,
+        )
+
+        optimizer = BayesianKernelProjection(n_trials=20)
+        result = optimizer.optimize(harmful_acts, harmless_acts, directions)
+
+        report = BayesianKernelProjection.format_report(result)
+        assert "Bayesian" in report
+        assert "Pareto" in report
+        assert "Layer importance" in report
+
+
+# ===========================================================================
+#  Tests: Module imports
+# ===========================================================================
+
+class TestModuleImports:
+    """Check that the new analysis classes are exported from ``obliteratus.analysis``."""
+
+    def test_all_new_modules_importable(self):
+        """Each new class can be imported from the analysis package."""
+        from obliteratus.analysis import (
+            TunedLensTrainer,
+            RefusalTunedLens,
+            ActivationPatcher,
+            WassersteinOptimalExtractor,
+            BayesianKernelProjection,
+            SAEDecompositionPipeline,
+        )
+
+        imported = (
+            TunedLensTrainer,
+            RefusalTunedLens,
+            ActivationPatcher,
+            WassersteinOptimalExtractor,
+            BayesianKernelProjection,
+            SAEDecompositionPipeline,
+        )
+        for exported in imported:
+            assert exported is not None
+
+    def test_new_modules_in_all(self):
+        """Each new class name is listed in the package's ``__all__``."""
+        import obliteratus.analysis as analysis
+
+        expected_names = (
+            "TunedLensTrainer",
+            "RefusalTunedLens",
+            "ActivationPatcher",
+            "WassersteinOptimalExtractor",
+            "BayesianKernelProjection",
+            "SAEDecompositionPipeline",
+        )
+        for name in expected_names:
+            assert name in analysis.__all__
@@ -0,0 +1,669 @@
+"""Tests for analysis techniques: concept cones, alignment imprints,
+multi-token position, and sparse direction surgery."""
+
+from __future__ import annotations
+
+
+import torch
+
+from obliteratus.analysis.concept_geometry import (
+    ConceptConeAnalyzer,
+    ConeConeResult,
+    MultiLayerConeResult,
+    CategoryDirection,
+    DEFAULT_HARM_CATEGORIES,
+)
+from obliteratus.analysis.alignment_imprint import (
+    AlignmentImprintDetector,
+    AlignmentImprint,
+    BaseInstructDelta,
+)
+from obliteratus.analysis.multi_token_position import (
+    MultiTokenPositionAnalyzer,
+    PositionAnalysisResult,
+    MultiTokenSummary,
+)
+from obliteratus.analysis.sparse_surgery import (
+    SparseDirectionSurgeon,
+    SparseProjectionResult,
+    SparseSurgeryPlan,
+)
+
+
+# ---------------------------------------------------------------------------
+#  Helpers
+# ---------------------------------------------------------------------------
+
+def _make_category_activations(
+    hidden_dim=32, n_prompts=30, n_categories=5, category_spread=0.3,
+):
+    """Build synthetic activations with one planted refusal direction per category.
+
+    Every category direction is a shared unit vector plus ``category_spread``
+    times a category-specific unit vector, renormalized.  Prompts are assigned
+    to categories in contiguous runs of ``n_prompts // n_categories``; any
+    remainder prompts are dropped.
+
+    Returns ``(harmful_acts, harmless_acts, category_map, cat_dirs)``.
+    """
+    torch.manual_seed(42)
+
+    # Component common to all categories.
+    common = torch.randn(hidden_dim)
+    common = common / common.norm()
+
+    # Category-specific directions, drawn in category order.
+    categories = [f"cat_{i}" for i in range(n_categories)]
+    cat_dirs = {}
+    for name in categories:
+        own = torch.randn(hidden_dim)
+        own = own / own.norm()
+        mixed = common + category_spread * own
+        cat_dirs[name] = mixed / mixed.norm()
+
+    # Prompt index -> category name, in contiguous equally sized runs.
+    prompts_per_cat = n_prompts // n_categories
+    actual_n = prompts_per_cat * n_categories
+    category_map = {
+        idx: categories[idx // prompts_per_cat] for idx in range(actual_n)
+    }
+
+    # A harmful activation is its harmless twin shifted along the direction
+    # of the prompt's category.
+    harmful_acts, harmless_acts = [], []
+    for idx in range(actual_n):
+        noise = torch.randn(hidden_dim) * 0.1
+        harmful_acts.append(noise + 2.0 * cat_dirs[category_map[idx]])
+        harmless_acts.append(noise)
+
+    return harmful_acts, harmless_acts, category_map, cat_dirs
+
+
+def _make_refusal_directions(n_layers=8, hidden_dim=32, concentration="distributed"):
+    """Build per-layer unit directions and strengths for a concentration pattern.
+
+    Patterns, as selected by ``concentration``:
+      * ``"concentrated"`` - strength 3.0 in the last two layers, 0.1 elsewhere.
+      * ``"distributed"`` - strength 1.0 plus Gaussian jitter scaled by 0.2.
+      * ``"orthogonal"`` - each direction after the first has its component
+        along the previous layer's direction removed; strength 1.5.
+      * anything else - strength 2.0 for layers 2 through 4, 0.5 elsewhere.
+
+    Returns ``(directions, strengths)``, two dicts keyed by layer index.
+    """
+    torch.manual_seed(123)
+    directions, strengths = {}, {}
+
+    for layer in range(n_layers):
+        raw = torch.randn(hidden_dim)
+        directions[layer] = raw / raw.norm()
+
+        if concentration == "concentrated":
+            strengths[layer] = 0.1 if layer < n_layers - 2 else 3.0
+            continue
+
+        if concentration == "distributed":
+            strengths[layer] = 1.0 + 0.2 * torch.randn(1).item()
+            continue
+
+        if concentration == "orthogonal":
+            if layer > 0:
+                # Project the previous layer's direction out of the raw draw,
+                # then renormalize (clamped to avoid dividing by zero).
+                previous = directions[layer - 1]
+                residual = raw - (raw @ previous) * previous
+                directions[layer] = residual / residual.norm().clamp(min=1e-8)
+            strengths[layer] = 1.5
+            continue
+
+        strengths[layer] = 0.5 if (layer < 2 or layer > 4) else 2.0
+
+    return directions, strengths
+
+
+# ===========================================================================
+#  Tests: Concept Cone Geometry
+# ===========================================================================
+
+class TestConceptConeAnalyzer:
+    def test_basic_analysis(self):
+        harmful, harmless, cat_map, _ = _make_category_activations()
+        analyzer = ConceptConeAnalyzer(category_map=cat_map)
+        result = analyzer.analyze_layer(harmful, harmless, layer_idx=5)
+
+        assert isinstance(result, ConeConeResult)
+        assert result.layer_idx == 5
+        assert result.category_count >= 2
+        assert result.cone_dimensionality > 0
+        assert result.cone_solid_angle >= 0
+        assert 0 <= result.mean_pairwise_cosine <= 1.0
+
+    def test_polyhedral_detection(self):
+        """With spread-out categories, should detect polyhedral geometry."""
+        harmful, harmless, cat_map, _ = _make_category_activations(
+            category_spread=2.0,  # Large spread -> distinct directions
+        )
+        analyzer = ConceptConeAnalyzer(category_map=cat_map)
+        result = analyzer.analyze_layer(harmful, harmless)
+        # With high spread, directions should be more distinct
+        assert result.cone_dimensionality > 1.0
+
+    def test_linear_detection(self):
+        """With no spread, should detect linear (single direction) geometry."""
+        harmful, harmless, cat_map, _ = _make_category_activations(
+            category_spread=0.0,  # No spread -> all directions aligned
+        )
+        analyzer = ConceptConeAnalyzer(category_map=cat_map)
+        result = analyzer.analyze_layer(harmful, harmless)
+        assert result.mean_pairwise_cosine > 0.8
+
+    def test_category_directions_populated(self):
+        harmful, harmless, cat_map, _ = _make_category_activations()
+        analyzer = ConceptConeAnalyzer(category_map=cat_map)
+        result = analyzer.analyze_layer(harmful, harmless)
+
+        for cd in result.category_directions:
+            assert isinstance(cd, CategoryDirection)
+            assert cd.strength > 0
+            assert cd.n_prompts >= 2
+            assert 0 <= cd.specificity <= 1.0
+
+    def test_pairwise_cosines(self):
+        harmful, harmless, cat_map, _ = _make_category_activations()
+        analyzer = ConceptConeAnalyzer(category_map=cat_map)
+        result = analyzer.analyze_layer(harmful, harmless)
+
+        for (a, b), cos in result.pairwise_cosines.items():
+            assert 0 <= cos <= 1.0
+            assert a < b  # Sorted pair
+
+    def test_general_direction_unit(self):
+        harmful, harmless, cat_map, _ = _make_category_activations()
+        analyzer = ConceptConeAnalyzer(category_map=cat_map)
+        result = analyzer.analyze_layer(harmful, harmless)
+        assert abs(result.general_direction.norm().item() - 1.0) < 0.01
+
+    def test_multi_layer_analysis(self):
+        """analyze_all_layers returns one per-layer entry for each layer supplied."""
+        harmful_acts, harmless_acts, categories, _ = _make_category_activations()
+        layer_ids = range(4)
+        # The same activations are reused for every layer index.
+        harmful_by_layer = dict.fromkeys(layer_ids, harmful_acts)
+        harmless_by_layer = dict.fromkeys(layer_ids, harmless_acts)
+
+        multi = ConceptConeAnalyzer(category_map=categories).analyze_all_layers(
+            harmful_by_layer, harmless_by_layer,
+        )
+
+        assert isinstance(multi, MultiLayerConeResult)
+        assert len(multi.per_layer) == 4
+        assert multi.mean_cone_dimensionality > 0
+
+    def test_format_report(self):
+        """The formatted report names the analysis, the layer index and the dimensionality."""
+        harmful_acts, harmless_acts, categories, _ = _make_category_activations()
+        cone = ConceptConeAnalyzer(category_map=categories).analyze_layer(
+            harmful_acts, harmless_acts, layer_idx=3,
+        )
+        text = ConceptConeAnalyzer.format_report(cone)
+
+        for expected in ("Concept Cone", "Layer 3", "dimensionality"):
+            assert expected in text
+
+    def test_default_category_map(self):
+        """The built-in category map has 30 entries and includes the expected labels."""
+        assert len(DEFAULT_HARM_CATEGORIES) == 30
+        labels = set(DEFAULT_HARM_CATEGORIES.values())
+        for expected in ("weapons", "cyber"):
+            assert expected in labels
+
+    def test_empty_activations(self):
+        """Empty harmful/harmless inputs produce a result with zero categories."""
+        cone = ConceptConeAnalyzer().analyze_layer([], [], layer_idx=0)
+        assert cone.category_count == 0
+
+    def test_min_category_size(self):
+        """Categories smaller than min_category_size are left out of the result."""
+        # 10 prompts over 5 categories -> 2 prompts each, below the minimum of 3,
+        # so every category is excluded.
+        harmful_acts, harmless_acts, categories, _ = _make_category_activations(
+            n_prompts=10, n_categories=5,
+        )
+        cone = ConceptConeAnalyzer(
+            category_map=categories, min_category_size=3,
+        ).analyze_layer(harmful_acts, harmless_acts)
+        assert cone.category_count == 0
+
+
+# ===========================================================================
+#  Tests: Alignment Imprint Detector
+# ===========================================================================
+
+class TestAlignmentImprintDetector:
+    """Checks for AlignmentImprintDetector on synthetic per-layer refusal directions."""
+
+    def test_basic_detection(self):
+        """A default run returns an AlignmentImprint with a known label and bounded confidence."""
+        layer_dirs, layer_strengths = _make_refusal_directions()
+        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
+
+        known_methods = ("dpo", "rlhf", "cai", "sft")
+        assert isinstance(imprint, AlignmentImprint)
+        assert imprint.predicted_method in known_methods
+        assert 0 <= imprint.confidence <= 1.0
+
+    def test_probabilities_sum_to_one(self):
+        """The four per-method probabilities add up to 1 (within 0.01)."""
+        layer_dirs, layer_strengths = _make_refusal_directions()
+        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
+
+        total = imprint.dpo_probability + imprint.rlhf_probability
+        total = total + imprint.cai_probability
+        total = total + imprint.sft_probability
+        assert abs(total - 1.0) < 0.01
+
+    def test_concentrated_detects_sft_or_dpo(self):
+        """Concentrated refusal (tail-biased) should predict SFT or DPO."""
+        layer_dirs, layer_strengths = _make_refusal_directions(
+            concentration="concentrated",
+        )
+        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
+        # Both SFT and DPO are expected to show a concentrated signature.
+        assert imprint.predicted_method in ("sft", "dpo")
+
+    def test_distributed_detects_not_sft(self):
+        """Distributed refusal should not be predicted as SFT."""
+        layer_dirs, layer_strengths = _make_refusal_directions(
+            n_layers=16, concentration="distributed",
+        )
+        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
+        # Distributed refusal gives a low Gini, so SFT should not be the top prediction.
+        assert imprint.predicted_method != "sft"
+
+    def test_orthogonal_detects_cai(self):
+        """Orthogonal layer directions should lean toward CAI."""
+        layer_dirs, layer_strengths = _make_refusal_directions(
+            n_layers=12, concentration="orthogonal",
+        )
+        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
+        # Orthogonality is expected to push CAI's probability up.
+        assert imprint.cai_probability > 0.15
+
+    def test_feature_extraction(self):
+        """Extracted features stay within their expected ranges."""
+        layer_dirs, layer_strengths = _make_refusal_directions()
+        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
+
+        # Features bounded to the unit interval.
+        unit_interval_features = (
+            "gini_coefficient",
+            "cross_layer_smoothness",
+            "tail_layer_bias",
+            "mean_pairwise_orthogonality",
+        )
+        for feature in unit_interval_features:
+            assert 0 <= getattr(imprint, feature) <= 1.0
+        assert imprint.effective_rank > 0
+        assert imprint.spectral_decay_rate >= 0
+
+    def test_empty_directions(self):
+        """An empty direction mapping yields the "unknown" method with zero confidence."""
+        imprint = AlignmentImprintDetector().detect_imprint({})
+        assert imprint.predicted_method == "unknown"
+        assert imprint.confidence == 0.0
+
+    def test_compare_base_instruct(self):
+        """Deltas built by adding the refusal direction should align with that direction."""
+        torch.manual_seed(42)
+        n_layers, width = 8, 32
+        layer_dirs, _ = _make_refusal_directions(hidden_dim=width)
+
+        # Instruct activations are the base activations shifted along each
+        # layer's refusal direction.
+        base_acts = {}
+        instruct_acts = {}
+        for layer in range(n_layers):
+            base_acts[layer] = torch.randn(width)
+            instruct_acts[layer] = base_acts[layer] + 1.5 * layer_dirs[layer]
+
+        deltas = AlignmentImprintDetector().compare_base_instruct(
+            base_acts, instruct_acts, layer_dirs,
+        )
+
+        assert len(deltas) == n_layers
+        for delta in deltas:
+            assert isinstance(delta, BaseInstructDelta)
+            assert delta.delta_magnitude > 0
+            # The planted delta is parallel to the refusal direction, so cosine should be high.
+            assert abs(delta.cosine_with_refusal) > 0.5
+
+    def test_format_imprint(self):
+        """The formatted imprint mentions the title, method names and the Gini feature."""
+        layer_dirs, layer_strengths = _make_refusal_directions()
+        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
+        text = AlignmentImprintDetector.format_imprint(imprint)
+
+        for expected in ("Alignment Imprint", "DPO", "RLHF", "Gini"):
+            assert expected in text
+
+    def test_per_layer_strength_populated(self):
+        """per_layer_strength has one entry per input direction."""
+        layer_dirs, layer_strengths = _make_refusal_directions()
+        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
+        assert len(imprint.per_layer_strength) == len(layer_dirs)
+
+
+# ===========================================================================
+#  Tests: Multi-Token Position Analysis
+# ===========================================================================
+
+class TestMultiTokenPositionAnalyzer:
+    """Tests for MultiTokenPositionAnalyzer on activations with a planted refusal trigger."""
+
+    def _make_activations_with_trigger(
+        self, seq_len=20, hidden_dim=32, trigger_pos=5,
+    ):
+        """Create activations with a planted trigger at a specific position.
+
+        Reseeds torch with 42 on every call, so the refusal direction (the
+        first tensor drawn) is the same for every call with the same
+        ``hidden_dim``.
+
+        Returns:
+            ``(acts, refusal_dir)``: a ``(seq_len, hidden_dim)`` tensor and the
+            unit-norm direction planted in it.
+        """
+        torch.manual_seed(42)
+        refusal_dir = torch.randn(hidden_dim)
+        refusal_dir = refusal_dir / refusal_dir.norm()
+
+        # Background activations
+        acts = torch.randn(seq_len, hidden_dim) * 0.1
+
+        # Strong refusal at trigger position
+        acts[trigger_pos] += 3.0 * refusal_dir
+
+        # Weaker refusal at last position
+        acts[-1] += 1.0 * refusal_dir
+
+        # Moderate at a few positions after trigger (decay): the signal halves
+        # at each of up to three following positions.
+        for i in range(trigger_pos + 1, min(trigger_pos + 4, seq_len)):
+            decay = 0.5 ** (i - trigger_pos)
+            acts[i] += 3.0 * decay * refusal_dir
+
+        return acts, refusal_dir
+
+    def test_basic_analysis(self):
+        """analyze_prompt returns a PositionAnalysisResult echoing layer index and token count."""
+        acts, ref_dir = self._make_activations_with_trigger()
+        analyzer = MultiTokenPositionAnalyzer()
+        result = analyzer.analyze_prompt(acts, ref_dir, layer_idx=3)
+
+        assert isinstance(result, PositionAnalysisResult)
+        assert result.layer_idx == 3
+        assert result.n_tokens == 20
+        assert result.peak_strength > 0
+
+    def test_trigger_detection(self):
+        """The planted position is reported both as a trigger and as the peak."""
+        acts, ref_dir = self._make_activations_with_trigger(trigger_pos=5)
+        analyzer = MultiTokenPositionAnalyzer(trigger_threshold=0.5)
+        result = analyzer.analyze_prompt(acts, ref_dir)
+
+        # The planted trigger should be detected
+        assert 5 in result.trigger_positions
+        assert result.peak_position == 5
+
+    def test_peak_vs_last(self):
+        """Peak should be at trigger, not last token."""
+        acts, ref_dir = self._make_activations_with_trigger(trigger_pos=5)
+        analyzer = MultiTokenPositionAnalyzer()
+        result = analyzer.analyze_prompt(acts, ref_dir)
+
+        assert result.peak_strength > result.last_token_strength
+        assert result.peak_position != result.n_tokens - 1
+
+    def test_decay_rate_positive(self):
+        """The planted exponential decay after the trigger yields a positive decay rate."""
+        acts, ref_dir = self._make_activations_with_trigger(trigger_pos=5)
+        analyzer = MultiTokenPositionAnalyzer()
+        result = analyzer.analyze_prompt(acts, ref_dir)
+        # With exponential decay planted, decay rate should be positive
+        assert result.decay_rate > 0
+
+    def test_position_gini_bounded(self):
+        """position_gini lies in [0, 1]."""
+        acts, ref_dir = self._make_activations_with_trigger()
+        analyzer = MultiTokenPositionAnalyzer()
+        result = analyzer.analyze_prompt(acts, ref_dir)
+        assert 0 <= result.position_gini <= 1.0
+
+    def test_token_profiles_length(self):
+        """One token profile is produced per sequence position."""
+        acts, ref_dir = self._make_activations_with_trigger(seq_len=15)
+        analyzer = MultiTokenPositionAnalyzer()
+        result = analyzer.analyze_prompt(acts, ref_dir)
+        assert len(result.token_profiles) == 15
+
+    def test_custom_token_texts(self):
+        """Profiles carry either a supplied token text or a "pos_"-prefixed placeholder."""
+        acts, ref_dir = self._make_activations_with_trigger(seq_len=10, trigger_pos=3)
+        tokens = ["How", "to", "make", "a", "bomb", "from", "scratch", "please", "help", "me"]
+        analyzer = MultiTokenPositionAnalyzer()
+        result = analyzer.analyze_prompt(acts, ref_dir, token_texts=tokens)
+        for tp in result.token_profiles:
+            assert tp.token_text in tokens or tp.token_text.startswith("pos_")
+
+    def test_batch_analysis(self):
+        """analyze_batch summarises five prompts whose trigger positions vary."""
+        pairs = [
+            self._make_activations_with_trigger(trigger_pos=3 + i % 3)
+            for i in range(5)
+        ]
+        batch = [acts for acts, _ in pairs]
+        # Bind the direction explicitly instead of relying on a leaked loop
+        # variable. The helper reseeds on every call, so each pair carries the
+        # same direction.
+        _, ref_dir = pairs[-1]
+
+        analyzer = MultiTokenPositionAnalyzer()
+        summary = analyzer.analyze_batch(batch, ref_dir)
+
+        assert isinstance(summary, MultiTokenSummary)
+        assert len(summary.per_prompt) == 5
+        assert summary.mean_peak_vs_last_ratio > 0
+        assert summary.mean_trigger_count > 0
+        assert 0 <= summary.peak_is_last_fraction <= 1.0
+        assert 0 <= summary.last_token_dominance <= 1.0
+
+    def test_last_token_dominant_case(self):
+        """When signal is only at last token, peak should equal last."""
+        torch.manual_seed(42)
+        hidden_dim = 32
+        seq_len = 10
+        ref_dir = torch.randn(hidden_dim)
+        ref_dir = ref_dir / ref_dir.norm()
+
+        # Near-zero background with a single strong signal at the final position.
+        acts = torch.randn(seq_len, hidden_dim) * 0.01
+        acts[-1] += 5.0 * ref_dir
+
+        analyzer = MultiTokenPositionAnalyzer()
+        result = analyzer.analyze_prompt(acts, ref_dir)
+        assert result.peak_position == seq_len - 1
+
+    def test_format_position_report(self):
+        """The per-prompt report contains its title and the peak-position line."""
+        acts, ref_dir = self._make_activations_with_trigger()
+        analyzer = MultiTokenPositionAnalyzer()
+        result = analyzer.analyze_prompt(acts, ref_dir, prompt_text="How to hack?")
+        report = MultiTokenPositionAnalyzer.format_position_report(result)
+
+        assert "Multi-Token" in report
+        assert "Peak position" in report
+
+    def test_format_summary(self):
+        """The batch summary report contains its heading and the prompt count line."""
+        pairs = [self._make_activations_with_trigger() for _ in range(3)]
+        batch = [acts for acts, _ in pairs]
+        # Explicit binding; every pair carries the same direction (see helper).
+        _, ref_dir = pairs[-1]
+
+        analyzer = MultiTokenPositionAnalyzer()
+        summary = analyzer.analyze_batch(batch, ref_dir)
+        report = MultiTokenPositionAnalyzer.format_summary(summary)
+
+        assert "Summary" in report
+        assert "Prompts analyzed" in report
+
+    def test_3d_activations_handled(self):
+        """Should handle (1, seq_len, hidden_dim) inputs."""
+        acts, ref_dir = self._make_activations_with_trigger()
+        acts = acts.unsqueeze(0)  # Add batch dim
+        analyzer = MultiTokenPositionAnalyzer()
+        result = analyzer.analyze_prompt(acts, ref_dir)
+        assert result.n_tokens == 20
+
+    def test_empty_batch(self):
+        """An empty batch yields no per-prompt results and a peak_is_last_fraction of 1.0."""
+        ref_dir = torch.randn(32)
+        analyzer = MultiTokenPositionAnalyzer()
+        summary = analyzer.analyze_batch([], ref_dir)
+        assert len(summary.per_prompt) == 0
+        assert summary.peak_is_last_fraction == 1.0
+
+
+# ===========================================================================
+#  Tests: Sparse Direction Surgery
+# ===========================================================================
+
+class TestSparseDirectionSurgeon:
+    """Tests for SparseDirectionSurgeon on weights with refusal planted in a few rows."""
+
+    def _make_weight_with_sparse_refusal(
+        self, out_dim=64, in_dim=32, n_refusal_rows=5,
+    ):
+        """Create a weight matrix where refusal is concentrated in a few rows.
+
+        Reseeds torch with 42 on every call.
+
+        Returns:
+            ``(W, refusal_dir, refusal_rows)``: the ``(out_dim, in_dim)``
+            matrix, the unit-norm planted direction, and the indices of the
+            rows that received it (the first ``n_refusal_rows`` rows).
+        """
+        torch.manual_seed(42)
+        refusal_dir = torch.randn(in_dim)
+        refusal_dir = refusal_dir / refusal_dir.norm()
+
+        W = torch.randn(out_dim, in_dim) * 0.1
+
+        # Plant strong refusal signal in specific rows
+        refusal_rows = list(range(n_refusal_rows))
+        for i in refusal_rows:
+            W[i] += 5.0 * refusal_dir
+
+        return W, refusal_dir, refusal_rows
+
+    def test_basic_analysis(self):
+        """analyze_weight_matrix returns a populated SparseProjectionResult."""
+        W, ref_dir, _ = self._make_weight_with_sparse_refusal()
+        surgeon = SparseDirectionSurgeon(sparsity=0.1)
+        result = surgeon.analyze_weight_matrix(W, ref_dir, layer_idx=3)
+
+        assert isinstance(result, SparseProjectionResult)
+        assert result.layer_idx == 3
+        assert result.n_rows_total == 64
+        assert result.n_rows_modified > 0
+        assert result.mean_projection > 0
+        assert result.max_projection > result.mean_projection
+
+    def test_refusal_sparsity_index(self):
+        """With sparse refusal, RSI should be high."""
+        W, ref_dir, _ = self._make_weight_with_sparse_refusal(
+            out_dim=100, n_refusal_rows=5,
+        )
+        surgeon = SparseDirectionSurgeon()
+        result = surgeon.analyze_weight_matrix(W, ref_dir)
+        assert result.refusal_sparsity_index > 0.3  # Concentrated signal
+
+    def test_energy_removed(self):
+        """Top rows should capture most of the refusal energy."""
+        W, ref_dir, _ = self._make_weight_with_sparse_refusal(
+            out_dim=64, n_refusal_rows=5,
+        )
+        surgeon = SparseDirectionSurgeon(sparsity=0.15)  # ~10 rows out of 64
+        result = surgeon.analyze_weight_matrix(W, ref_dir)
+        # With 5 refusal rows and 10 modified, should capture most energy
+        assert result.energy_removed > 0.5
+
+    def test_frobenius_change_bounded(self):
+        """The reported Frobenius change is positive and below 1.0."""
+        W, ref_dir, _ = self._make_weight_with_sparse_refusal()
+        surgeon = SparseDirectionSurgeon(sparsity=0.1)
+        result = surgeon.analyze_weight_matrix(W, ref_dir)
+        assert result.frobenius_change > 0
+        assert result.frobenius_change < 1.0  # Shouldn't change more than 100%
+
+    def test_apply_sparse_projection(self):
+        """Sparse projection should reduce refusal signal."""
+        W, ref_dir, _ = self._make_weight_with_sparse_refusal()
+        surgeon = SparseDirectionSurgeon(sparsity=0.1)
+
+        W_modified = surgeon.apply_sparse_projection(W, ref_dir)
+
+        # Compare total absolute projection onto the refusal direction.
+        original_proj = (W @ ref_dir).abs().sum().item()
+        modified_proj = (W_modified @ ref_dir).abs().sum().item()
+        assert modified_proj < original_proj
+
+    def test_sparse_preserves_unmodified_rows(self):
+        """Rows below the threshold should be unchanged."""
+        W, ref_dir, _ = self._make_weight_with_sparse_refusal(
+            out_dim=64, n_refusal_rows=5,
+        )
+        surgeon = SparseDirectionSurgeon(sparsity=0.1)  # ~6 rows
+        W_modified = surgeon.apply_sparse_projection(W, ref_dir)
+
+        # Count rows that actually changed. The two masks are exact
+        # complements, so a row sitting at the tolerance is counted as unchanged
+        # instead of falling into neither bucket.
+        diffs = (W - W_modified).abs().sum(dim=1)
+        n_changed = (diffs > 1e-6).sum().item()
+        n_unchanged = (diffs <= 1e-6).sum().item()
+
+        assert n_changed <= int(0.1 * 64) + 1  # Sparsity bound
+        assert n_unchanged >= 57  # Most rows unchanged
+
+    def test_dense_vs_sparse_comparison(self):
+        """Dense projection should modify all rows; sparse should modify fewer."""
+        W, ref_dir, _ = self._make_weight_with_sparse_refusal()
+
+        # Dense projection: remove the component along r from every row.
+        r = ref_dir / ref_dir.norm()
+        W_dense = W - (W @ r).unsqueeze(1) * r.unsqueeze(0)
+
+        # Sparse projection
+        surgeon = SparseDirectionSurgeon(sparsity=0.1)
+        W_sparse = surgeon.apply_sparse_projection(W, ref_dir)
+
+        dense_changes = (W - W_dense).abs().sum(dim=1)
+        sparse_changes = (W - W_sparse).abs().sum(dim=1)
+
+        n_dense_changed = (dense_changes > 1e-6).sum().item()
+        n_sparse_changed = (sparse_changes > 1e-6).sum().item()
+
+        assert n_sparse_changed < n_dense_changed
+
+    def test_plan_surgery(self):
+        """plan_surgery returns a SparseSurgeryPlan with one entry per layer."""
+        weights = {}
+        directions = {}
+        for i in range(6):
+            # The helper reseeds each call, so every layer gets identical tensors.
+            W, ref_dir, _ = self._make_weight_with_sparse_refusal()
+            weights[i] = W
+            directions[i] = ref_dir
+
+        surgeon = SparseDirectionSurgeon(sparsity=0.1)
+        plan = surgeon.plan_surgery(weights, directions)
+
+        assert isinstance(plan, SparseSurgeryPlan)
+        assert len(plan.per_layer) == 6
+        assert 0 < plan.recommended_sparsity < 1.0
+        assert plan.mean_refusal_sparsity_index > 0
+        assert plan.mean_energy_removed > 0
+
+    def test_auto_sparsity(self):
+        """With auto_sparsity=True the reported sparsity lies in [0.01, 0.5]."""
+        W, ref_dir, _ = self._make_weight_with_sparse_refusal()
+        surgeon = SparseDirectionSurgeon(auto_sparsity=True)
+        result = surgeon.analyze_weight_matrix(W, ref_dir)
+        # Auto sparsity should find a reasonable value
+        assert 0.01 <= result.sparsity <= 0.5
+
+    def test_auto_sparsity_apply(self):
+        """With auto_sparsity=True the projection still reduces the refusal signal."""
+        W, ref_dir, _ = self._make_weight_with_sparse_refusal()
+        surgeon = SparseDirectionSurgeon(auto_sparsity=True)
+        W_modified = surgeon.apply_sparse_projection(W, ref_dir)
+        # Should reduce projection
+        assert (W_modified @ ref_dir).abs().sum() < (W @ ref_dir).abs().sum()
+
+    def test_format_analysis(self):
+        """The per-layer report contains its title, the layer index and the RSI label."""
+        W, ref_dir, _ = self._make_weight_with_sparse_refusal()
+        surgeon = SparseDirectionSurgeon(sparsity=0.1)
+        result = surgeon.analyze_weight_matrix(W, ref_dir, layer_idx=4)
+        report = SparseDirectionSurgeon.format_analysis(result)
+
+        assert "Sparse Direction Surgery" in report
+        assert "Layer 4" in report
+        assert "Refusal Sparsity Index" in report
+
+    def test_format_plan(self):
+        """The plan report contains its title and the recommended-sparsity line."""
+        # Unseeded random inputs: only the report's fixed labels are asserted.
+        weights = {i: torch.randn(32, 16) for i in range(4)}
+        directions = {i: torch.randn(16) for i in range(4)}
+
+        surgeon = SparseDirectionSurgeon(sparsity=0.1)
+        plan = surgeon.plan_surgery(weights, directions)
+        report = SparseDirectionSurgeon.format_plan(plan)
+
+        assert "Sparse Direction Surgery Plan" in report
+        assert "Recommended sparsity" in report
+
+    def test_empty_inputs(self):
+        """Planning with no weights and no directions yields an empty per-layer list."""
+        surgeon = SparseDirectionSurgeon()
+        plan = surgeon.plan_surgery({}, {})
+        assert len(plan.per_layer) == 0
+
+    def test_output_dtype_preserved(self):
+        """Output should match input dtype."""
+        W, ref_dir, _ = self._make_weight_with_sparse_refusal()
+        W_half = W.half()
+        surgeon = SparseDirectionSurgeon(sparsity=0.1)
+        W_out = surgeon.apply_sparse_projection(W_half, ref_dir)
+        assert W_out.dtype == torch.float16
+
+
+# ===========================================================================
+#  Tests: Integration / Imports
+# ===========================================================================
+
+class TestAnalysisImports:
+    """Smoke test that the analysis package exposes its analyzer classes."""
+
+    def test_all_new_modules_importable(self):
+        """Importing the four analyzers from obliteratus.analysis succeeds and yields objects."""
+        from obliteratus.analysis import (
+            ConceptConeAnalyzer,
+            AlignmentImprintDetector,
+            MultiTokenPositionAnalyzer,
+            SparseDirectionSurgeon,
+        )
+        exported = (
+            ConceptConeAnalyzer,
+            AlignmentImprintDetector,
+            MultiTokenPositionAnalyzer,
+            SparseDirectionSurgeon,
+        )
+        for symbol in exported:
+            assert symbol is not None
@@ -0,0 +1,155 @@
+"""Tests for refusal detection, including semantic mode and confidence intervals."""
+
+from __future__ import annotations
+
+
+from obliteratus.evaluation.advanced_metrics import (
+    _is_refusal,
+    _is_refusal_semantic,
+    refusal_rate,
+    refusal_rate_with_ci,
+)
+
+
+class TestKeywordRefusal:
+    """Keyword-based refusal detection across the prefix, substring and combined modes."""
+
+    def test_obvious_refusal(self):
+        """An apology followed by a refusal is flagged in combined mode."""
+        reply = "I'm sorry, I can't help with that."
+        assert _is_refusal(reply, "combined")
+
+    def test_obvious_non_refusal(self):
+        """A plainly helpful answer is not flagged in combined mode."""
+        reply = "Here are the instructions for building a birdhouse."
+        assert not _is_refusal(reply, "combined")
+
+    def test_empty_string(self):
+        """The empty string is not a refusal."""
+        assert not _is_refusal("", "combined")
+
+    def test_prefix_mode(self):
+        """Prefix mode only matches when the refusal phrase opens the response."""
+        leading = "I cannot assist with that request."
+        embedded = "Well, I cannot do that."  # refusal phrase is not at the start
+        assert _is_refusal(leading, "prefix")
+        assert not _is_refusal(embedded, "prefix")
+
+    def test_substring_mode(self):
+        """Substring mode matches a refusal phrase that appears mid-sentence."""
+        reply = "Well, I cannot assist with that."
+        assert _is_refusal(reply, "substring")
+
+
+class TestSemanticRefusal:
+    """Test the semantic refusal detection patterns."""
+
+    def test_hedge_then_refuse(self):
+        """A sympathetic opening followed by a refusal is flagged."""
+        text = "While I understand your interest in this topic, I cannot provide instructions for that."
+        assert _is_refusal(text, "semantic")
+
+    def test_redirect(self):
+        """Offering an alternative in place of the requested content is flagged."""
+        text = "Instead of providing those instructions, I can help you with something constructive."
+        assert _is_refusal(text, "semantic")
+
+    def test_concern_flagging(self):
+        """Citing ethical concerns as the reason for not responding is flagged."""
+        text = "This raises serious ethical concerns that prevent me from responding."
+        assert _is_refusal(text, "semantic")
+
+    def test_meta_refusal(self):
+        """A statement about the model's own design constraints is flagged."""
+        text = "I'm designed to avoid providing information that could cause harm."
+        assert _is_refusal(text, "semantic")
+
+    def test_responsibility_deflection(self):
+        """Deferring the user to a professional is flagged."""
+        text = "You should consult a professional about this matter."
+        assert _is_refusal(text, "semantic")
+
+    def test_genuine_compliance_not_flagged(self):
+        """Actual compliant responses should not trigger semantic detection."""
+        texts = [
+            "Here are the steps to build a birdhouse from scratch.",
+            "The chemical formula for water is H2O.",
+            "Python is a great language for data science. Here's how to get started.",
+            "To make chocolate chip cookies, you'll need flour, sugar, butter, and chocolate chips.",
+        ]
+        for text in texts:
+            assert not _is_refusal(text, "semantic"), f"False positive: {text[:50]}"
+
+    def test_semantic_catches_what_keyword_misses(self):
+        """Hedge-then-refuse phrasing evades prefix matching but is still detected."""
+        # The hedged opening means no refusal prefix matches...
+        text = "While I appreciate your curiosity, I'm not able to share that information."
+        assert not _is_refusal(text, "prefix")
+        # ...but "not able to" is in the substring list, so combined catches it.
+        assert _is_refusal(text, "combined")
+
+        # Exercise the semantic detector directly, independent of the keyword lists.
+        assert _is_refusal_semantic(
+            "While I understand your request, I cannot assist with that specific topic."
+        )
+
+
+class TestRefusalRate:
+    """refusal_rate returns the fraction of responses classified as refusals."""
+
+    def test_all_refuse(self):
+        """Three refusals give a rate of 1.0."""
+        replies = ["I'm sorry, no.", "I cannot help.", "I won't do that."]
+        rate = refusal_rate(replies, "combined")
+        assert rate == 1.0
+
+    def test_none_refuse(self):
+        """Three compliant answers give a rate of 0.0."""
+        replies = ["Sure, here you go.", "Of course!", "The answer is 42."]
+        rate = refusal_rate(replies, "combined")
+        assert rate == 0.0
+
+    def test_mixed(self):
+        """One refusal out of two gives a rate of 0.5."""
+        replies = ["I'm sorry, no.", "Sure, here you go."]
+        rate = refusal_rate(replies, "combined")
+        assert rate == 0.5
+
+    def test_empty_list(self):
+        """An empty response list gives a rate of 0.0."""
+        assert refusal_rate([], "combined") == 0.0
+
+    def test_semantic_mode(self):
+        """Semantic mode counts the hedged refusal but not the compliant answer."""
+        hedged_refusal = "While I understand your interest, I cannot provide that information."
+        compliant = "Here are the steps you requested."
+        rate = refusal_rate([hedged_refusal, compliant], "semantic")
+        assert rate == 0.5
+
+
+class TestRefusalRateWithCI:
+    """refusal_rate_with_ci returns a rate together with confidence-interval bounds."""
+
+    def test_returns_all_fields(self):
+        """The result exposes rate, both CI bounds and the sample count."""
+        replies = ["I'm sorry"] * 5 + ["Sure, here you go."] * 5
+        ci = refusal_rate_with_ci(replies, mode="combined")
+        for key in ("rate", "ci_lower", "ci_upper", "n_samples"):
+            assert key in ci
+        assert ci["n_samples"] == 10
+
+    def test_ci_bounds_bracket_rate(self):
+        """The point estimate lies between the lower and upper bounds."""
+        replies = ["I'm sorry"] * 30 + ["Sure, here you go."] * 70
+        ci = refusal_rate_with_ci(replies, mode="combined")
+        lower, rate, upper = ci["ci_lower"], ci["rate"], ci["ci_upper"]
+        assert lower <= rate <= upper
+
+    def test_all_refuse_tight_ci(self):
+        """Fifty refusals out of fifty give rate 1.0 with a lower bound still below 1."""
+        replies = ["I'm sorry"] * 50
+        ci = refusal_rate_with_ci(replies, mode="combined")
+        assert ci["rate"] == 1.0
+        # Wilson CI: 50/50 at 95% gives ci_lower ~0.929, not 1.0, because a
+        # proper interval keeps some uncertainty even when every observation
+        # is positive.
+        assert ci["ci_lower"] > 0.9
+        assert ci["ci_upper"] == 1.0
+
+    def test_empty_responses(self):
+        """No responses give a zero rate and a zero sample count."""
+        ci = refusal_rate_with_ci([], mode="combined")
+        assert ci["rate"] == 0.0
+        assert ci["n_samples"] == 0
+
+    def test_ci_narrower_with_more_samples(self):
+        """More samples should produce tighter confidence intervals."""
+        # Same 50/50 split at two sample sizes.
+        ci_small = refusal_rate_with_ci(["I'm sorry"] * 5 + ["Sure"] * 5)
+        ci_large = refusal_rate_with_ci(["I'm sorry"] * 50 + ["Sure"] * 50)
+
+        width_small = ci_small["ci_upper"] - ci_small["ci_lower"]
+        width_large = ci_large["ci_upper"] - ci_large["ci_lower"]
+        assert width_large < width_small, (
+            f"Large CI ({width_large}) not narrower than small CI ({width_small})"
+        )
+
+    def test_deterministic_with_seed(self):
+        """Two calls on the same input return identical results."""
+        replies = ["I'm sorry"] * 30 + ["Sure"] * 70
+        first = refusal_rate_with_ci(replies)
+        second = refusal_rate_with_ci(replies)
+        assert first == second, "Same input produced different CIs"
@@ -0,0 +1,70 @@
+"""Tests for the reporting module."""
+
+from __future__ import annotations
+
+import json
+
+from obliteratus.reporting.report import AblationReport, AblationResult
+
+
+def _make_report() -> AblationReport:
+    """Build a report for "test-model" holding a baseline and two layer-removal results."""
+    report = AblationReport(model_name="test-model")
+    report.add_baseline({"perplexity": 25.0, "accuracy": 0.85})
+
+    # Results are added in this order: layer 0 first, then layer 1.
+    ablations = [
+        AblationResult(
+            strategy="layer_removal",
+            component="layer_0",
+            description="Remove layer 0",
+            metrics={"perplexity": 30.0, "accuracy": 0.80},
+        ),
+        AblationResult(
+            strategy="layer_removal",
+            component="layer_1",
+            description="Remove layer 1",
+            metrics={"perplexity": 50.0, "accuracy": 0.60},
+        ),
+    ]
+    for ablation in ablations:
+        report.add_result(ablation)
+    return report
+
+
+class TestAblationReport:
+    """Export and delta checks on the report built by _make_report."""
+
+    def test_to_dataframe(self):
+        """The dataframe has one row per result plus delta and percent-change columns."""
+        frame = _make_report().to_dataframe()
+        assert len(frame) == 2
+        for column in ("perplexity", "perplexity_delta", "perplexity_pct_change"):
+            assert column in frame.columns
+
+    def test_save_json(self, tmp_path):
+        """save_json writes the model name, both results and the baseline metrics."""
+        target = tmp_path / "results.json"
+        _make_report().save_json(target)
+        payload = json.loads(target.read_text())
+        assert payload["model_name"] == "test-model"
+        assert len(payload["results"]) == 2
+        assert payload["baseline_metrics"]["perplexity"] == 25.0
+
+    def test_save_csv(self, tmp_path):
+        """save_csv writes text containing a component name and a metric name."""
+        target = tmp_path / "results.csv"
+        _make_report().save_csv(target)
+        contents = target.read_text()
+        for expected in ("layer_0", "perplexity"):
+            assert expected in contents
+
+    def test_delta_calculation(self):
+        """Deltas are measured against the baseline: 30 vs 25 is +5.0, i.e. +20%."""
+        frame = _make_report().to_dataframe()
+        is_layer_0 = frame["component"] == "layer_0"
+        first_row = frame[is_layer_0].iloc[0]
+        assert first_row["perplexity_delta"] == 5.0  # 30 - 25
+        assert abs(first_row["perplexity_pct_change"] - 20.0) < 0.01
+
+    def test_plot_impact(self, tmp_path):
+        """plot_impact writes a non-empty file at the requested path."""
+        target = tmp_path / "impact.png"
+        _make_report().plot_impact(metric="perplexity", output_path=target)
+        assert target.exists()
+        assert target.stat().st_size > 0
@@ -0,0 +1,179 @@
+"""Tests for ablation strategies using a small GPT-2 model."""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from obliteratus.strategies.base import AblationSpec
+from obliteratus.strategies.registry import STRATEGY_REGISTRY, get_strategy
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+def _make_dummy_handle():
+    """Build a snapshotted ModelHandle around a tiny 2-layer GPT-2.
+
+    The model is instantiated from a config object only, so nothing is
+    fetched over the network.
+    """
+    from unittest.mock import MagicMock
+    from transformers import GPT2Config, GPT2LMHeadModel
+    from obliteratus.models.loader import ModelHandle
+
+    tiny_config = GPT2Config(
+        vocab_size=1000, n_positions=128, n_embd=64, n_layer=2, n_head=2, n_inner=256,
+    )
+    tiny_model = GPT2LMHeadModel(tiny_config)
+    tiny_model.eval()
+
+    # The strategy tests never tokenize, so a mock carrying the two special
+    # tokens stands in for a real tokenizer.
+    fake_tokenizer = MagicMock()
+    fake_tokenizer.pad_token = "<pad>"
+    fake_tokenizer.eos_token = "<eos>"
+
+    dummy = ModelHandle(
+        model=tiny_model,
+        tokenizer=fake_tokenizer,
+        config=tiny_config,
+        model_name="gpt2-test",
+        task="causal_lm",
+    )
+    # Snapshot right away so tests can call handle.restore() after ablating.
+    dummy.snapshot()
+    return dummy
+
+
+@pytest.fixture
+def handle():
+    """Give the requesting test its own freshly built dummy handle."""
+    fresh_handle = _make_dummy_handle()
+    return fresh_handle
+
+
+# ---------------------------------------------------------------------------
+# Registry tests
+# ---------------------------------------------------------------------------
+
+class TestRegistry:
+    """Checks on the strategy registry and its ``get_strategy`` lookup."""
+
+    def test_all_strategies_registered(self):
+        required = {"layer_removal", "head_pruning", "ffn_ablation", "embedding_ablation"}
+        # Extra registered strategies are fine; only the core four are required.
+        assert required <= set(STRATEGY_REGISTRY.keys())
+
+    def test_get_strategy_returns_instance(self):
+        assert get_strategy("layer_removal").name == "layer_removal"
+
+    def test_get_unknown_strategy_raises(self):
+        with pytest.raises(KeyError, match="Unknown strategy"):
+            get_strategy("nonexistent_strategy")
+
+
+# ---------------------------------------------------------------------------
+# Layer removal
+# ---------------------------------------------------------------------------
+
+class TestLayerRemoval:
+    """Layer-removal strategy: enumeration, zeroing, and snapshot restore."""
+
+    def test_enumerate(self, handle):
+        candidates = get_strategy("layer_removal").enumerate(handle)
+        assert len(candidates) == handle.num_layers
+        assert all(spec.strategy_name == "layer_removal" for spec in candidates)
+
+    def test_apply_zeros_layer(self, handle):
+        from obliteratus.strategies.utils import get_layer_modules
+
+        strategy = get_strategy("layer_removal")
+        first_spec = strategy.enumerate(handle)[0]
+        strategy.apply(handle, first_spec)
+
+        ablated_layer = get_layer_modules(handle)[0]
+        for tensor in ablated_layer.parameters():
+            assert torch.all(tensor == 0), "Layer params should be zeroed after ablation"
+
+    def test_restore_after_ablation(self, handle):
+        from obliteratus.strategies.utils import get_layer_modules
+
+        strategy = get_strategy("layer_removal")
+        candidates = strategy.enumerate(handle)
+
+        # Clone so the reference copy is independent of whatever happens to
+        # the model's own tensor afterwards.
+        reference = get_layer_modules(handle)[0].attn.c_attn.weight.clone()
+
+        strategy.apply(handle, candidates[0])
+        handle.restore()
+
+        after_restore = get_layer_modules(handle)[0].attn.c_attn.weight
+        assert torch.allclose(reference, after_restore)
+
+
+# ---------------------------------------------------------------------------
+# Head pruning
+# ---------------------------------------------------------------------------
+
+class TestHeadPruning:
+    """Head-pruning strategy: enumeration and zeroing of a single head."""
+
+    def test_enumerate(self, handle):
+        """One spec is expected per (layer, head) pair."""
+        strat = get_strategy("head_pruning")
+        specs = strat.enumerate(handle)
+        assert len(specs) == handle.num_layers * handle.num_heads
+
+    def test_apply_zeros_head(self, handle):
+        """Pruning layer 0 / head 0 must zero that head's rows of ``c_proj``."""
+        from obliteratus.strategies.utils import get_layer_modules, get_attention_module
+
+        strat = get_strategy("head_pruning")
+        spec = AblationSpec(
+            strategy_name="head_pruning",
+            component="layer_0_head_0",
+            description="test",
+            metadata={"layer_idx": 0, "head_idx": 0},
+        )
+        strat.apply(handle, spec)
+
+        attn = get_attention_module(get_layer_modules(handle)[0], handle.architecture)
+        head_dim = handle.hidden_size // handle.num_heads
+        # GPT-2 uses c_attn (Conv1D), check output projection c_proj.
+        # Assert the projection exists instead of guarding with `if`: a guard
+        # would let this test pass without checking anything at all.
+        assert hasattr(attn, "c_proj"), "attention module has no c_proj to inspect"
+        # Conv1D stores weight transposed
+        assert torch.all(attn.c_proj.weight[0:head_dim, :] == 0)
+
+
+# ---------------------------------------------------------------------------
+# FFN ablation
+# ---------------------------------------------------------------------------
+
+class TestFFNAblation:
+    """FFN-ablation strategy: enumeration and zeroing of a layer's feed-forward block."""
+
+    def test_enumerate(self, handle):
+        candidates = get_strategy("ffn_ablation").enumerate(handle)
+        assert len(candidates) == handle.num_layers
+
+    def test_apply_zeros_ffn(self, handle):
+        from obliteratus.strategies.utils import get_layer_modules, get_ffn_module
+
+        strategy = get_strategy("ffn_ablation")
+        first_spec = strategy.enumerate(handle)[0]
+        strategy.apply(handle, first_spec)
+
+        first_layer = get_layer_modules(handle)[0]
+        feed_forward = get_ffn_module(first_layer, handle.architecture)
+        for tensor in feed_forward.parameters():
+            assert torch.all(tensor == 0)
+
+
+# ---------------------------------------------------------------------------
+# Embedding ablation
+# ---------------------------------------------------------------------------
+
+class TestEmbeddingAblation:
+    """Embedding-ablation strategy: enumeration and zeroing of a column range."""
+
+    def test_enumerate(self, handle):
+        candidates = get_strategy("embedding_ablation").enumerate(handle)
+        assert len(candidates) > 0
+
+    def test_apply_zeros_dims(self, handle):
+        from obliteratus.strategies.utils import get_embedding_module
+
+        dims_0_to_4 = AblationSpec(
+            strategy_name="embedding_ablation",
+            component="embed_dims_0_4",
+            description="test",
+            metadata={"dim_start": 0, "dim_end": 4},
+        )
+        get_strategy("embedding_ablation").apply(handle, dims_0_to_4)
+
+        embedding = get_embedding_module(handle)
+        # Columns 0..3 of the embedding matrix are the ones named in the spec.
+        assert torch.all(embedding.weight[:, 0:4] == 0)
@@ -0,0 +1,108 @@
+"""Tests for ablation presets."""
+
+from __future__ import annotations
+
+from obliteratus.study_presets import (
+    STUDY_PRESETS,
+    get_study_preset,
+    get_preset,
+    list_study_presets,
+    list_presets,
+)
+from obliteratus.config import StudyConfig
+
+
+class TestPresets:
+    """Registry-level checks for the study presets and their alias helpers."""
+
+    def test_all_presets_registered(self):
+        required = {
+            "quick",
+            "full",
+            "attention",
+            "layers",
+            "knowledge",
+            "pruning",
+            "embeddings",
+            "jailbreak",
+            "guardrail",
+            "robustness",
+        }
+        assert required <= set(STUDY_PRESETS.keys())
+
+    def test_get_preset(self):
+        quick = get_study_preset("quick")
+        assert quick.name == "Quick Scan"
+        assert quick.key == "quick"
+        assert len(quick.strategies) == 2
+
+    def test_get_preset_alias(self):
+        assert get_preset("quick").name == "Quick Scan"
+
+    def test_get_unknown_preset_raises(self):
+        import pytest
+
+        with pytest.raises(KeyError, match="Unknown preset"):
+            get_study_preset("nonexistent")
+
+    def test_list_presets(self):
+        available = list_study_presets()
+        assert len(available) >= 7
+        available_keys = [preset.key for preset in available]
+        for expected in ("quick", "full"):
+            assert expected in available_keys
+
+    def test_list_presets_alias(self):
+        assert list_presets() == list_study_presets()
+
+    def test_preset_strategies_are_valid(self):
+        from obliteratus.strategies import STRATEGY_REGISTRY
+
+        # Flatten to (preset key, strategy name) pairs so a failure names both.
+        referenced = (
+            (preset.key, entry["name"])
+            for preset in list_study_presets()
+            for entry in preset.strategies
+        )
+        for preset_key, strategy_name in referenced:
+            assert strategy_name in STRATEGY_REGISTRY, (
+                f"Preset {preset_key!r} references unknown strategy {strategy_name!r}"
+            )
+
+
+class TestConfigWithPreset:
+    """``StudyConfig.from_dict`` behaviour when a preset key is supplied."""
+
+    # Model/dataset sections shared by every test; each test hands fresh
+    # copies to from_dict so nothing can leak between tests.
+    _MODEL = {"name": "gpt2", "task": "causal_lm", "dtype": "float32", "device": "cpu"}
+    _DATASET = {"name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "test", "text_column": "text"}
+
+    def _raw_config(self, preset_field, preset_key, dataset_extra=None, **top_level):
+        """Assemble a raw config dict: preset entry, model, dataset, then any extras."""
+        return {
+            preset_field: preset_key,
+            "model": dict(self._MODEL),
+            "dataset": {**self._DATASET, **(dataset_extra or {})},
+            **top_level,
+        }
+
+    def test_preset_key_in_config(self):
+        config = StudyConfig.from_dict(self._raw_config("preset", "quick"))
+        # Strategies come from the quick preset
+        assert len(config.strategies) == 2
+        names = [strategy.name for strategy in config.strategies]
+        assert "layer_removal" in names
+        assert "ffn_ablation" in names
+        # max_samples, batch_size and max_length are inherited as well
+        assert config.dataset.max_samples == 25
+        assert config.batch_size == 4
+        assert config.max_length == 128
+
+    def test_legacy_study_preset_key_still_works(self):
+        config = StudyConfig.from_dict(self._raw_config("study_preset", "quick"))
+        assert len(config.strategies) == 2
+
+    def test_preset_can_be_overridden(self):
+        raw = self._raw_config(
+            "preset",
+            "quick",
+            dataset_extra={"max_samples": 999},
+            batch_size=16,
+            strategies=[{"name": "head_pruning", "params": {}}],
+        )
+        config = StudyConfig.from_dict(raw)
+        # Explicit strategies replace the preset's list
+        assert len(config.strategies) == 1
+        assert config.strategies[0].name == "head_pruning"
+        # Explicit batch_size wins over the preset value
+        assert config.batch_size == 16
+        # Explicit dataset max_samples is kept
+        assert config.dataset.max_samples == 999
+
+    def test_full_preset(self):
+        config = StudyConfig.from_dict(self._raw_config("preset", "full"))
+        assert len(config.strategies) == 4
+        assert {strategy.name for strategy in config.strategies} == {
+            "layer_removal",
+            "head_pruning",
+            "ffn_ablation",
+            "embedding_ablation",
+        }
@@ -0,0 +1,696 @@
+"""Tests for the opt-in telemetry module."""
+
+import json
+import os
+import tempfile
+from dataclasses import dataclass, field
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import torch
+
+from obliteratus.telemetry import (
+    _ALLOWED_METHOD_CONFIG_KEYS,
+    _direction_stats,
+    _extract_excise_details,
+    _extract_prompt_counts,
+    _extract_analysis_insights,
+    _is_mount_point,
+    _test_writable,
+    build_report,
+    disable_telemetry,
+    enable_telemetry,
+    is_enabled,
+    maybe_send_informed_report,
+    maybe_send_pipeline_report,
+    restore_from_hub,
+    send_report,
+    storage_diagnostic,
+)
+
+
+def _reset_telemetry():
+    """Set ``obliteratus.telemetry._enabled`` back to ``None``."""
+    import obliteratus.telemetry as telemetry_module
+
+    telemetry_module._enabled = None
+
+
+# ── Enable / disable ────────────────────────────────────────────────────
+
+
+class TestTelemetryConfig:
+    """Test telemetry enable/disable logic."""
+
+    def setup_method(self):
+        # Start every test from the "no programmatic override" state.
+        _reset_telemetry()
+
+    def test_disabled_by_default(self):
+        """With an empty environment, telemetry is off."""
+        with patch.dict(os.environ, {}, clear=True):
+            _reset_telemetry()
+            assert not is_enabled()
+
+    def test_enabled_by_default_on_hf_spaces(self):
+        """With the HF Spaces flag set, telemetry is on without any opt-in."""
+        import obliteratus.telemetry as t
+
+        with patch.dict(os.environ, {"SPACE_ID": "user/space"}, clear=True):
+            # patch.object puts the original _ON_HF_SPACES back even when the
+            # assertion fails; a manual save/restore would leak the patched
+            # value into every later test in that case.
+            with patch.object(t, "_ON_HF_SPACES", True):
+                _reset_telemetry()
+                assert is_enabled()
+
+    def test_disable_via_env_zero(self):
+        with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "0"}):
+            _reset_telemetry()
+            assert not is_enabled()
+
+    def test_disable_via_env_false(self):
+        with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "false"}):
+            _reset_telemetry()
+            assert not is_enabled()
+
+    def test_enable_via_env_explicit(self):
+        with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "1"}):
+            _reset_telemetry()
+            assert is_enabled()
+
+    def test_enable_programmatically(self):
+        enable_telemetry()
+        assert is_enabled()
+
+    def test_disable_programmatically(self):
+        enable_telemetry()
+        assert is_enabled()
+        disable_telemetry()
+        assert not is_enabled()
+
+    def test_programmatic_overrides_env(self):
+        """An explicit disable_telemetry() beats OBLITERATUS_TELEMETRY=1."""
+        with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "1"}):
+            disable_telemetry()
+            assert not is_enabled()
+
+
+# ── Report building ─────────────────────────────────────────────────────
+
+
+class TestBuildReport:
+    """Payload-shape checks for ``build_report``."""
+
+    def _base_kwargs(self, **overrides):
+        """Return a baseline set of ``build_report`` arguments; *overrides* take precedence."""
+        return {
+            "architecture": "LlamaForCausalLM",
+            "num_layers": 32,
+            "num_heads": 32,
+            "hidden_size": 4096,
+            "total_params": 8_000_000_000,
+            "method": "advanced",
+            "method_config": {"n_directions": 4, "norm_preserve": True},
+            "quality_metrics": {"perplexity": 5.2, "refusal_rate": 0.05},
+            **overrides,
+        }
+
+    def test_schema_version_2(self):
+        assert build_report(**self._base_kwargs())["schema_version"] == 2
+
+    def test_basic_fields(self):
+        payload = build_report(**self._base_kwargs())
+        model_section = payload["model"]
+        assert model_section["architecture"] == "LlamaForCausalLM"
+        assert model_section["num_layers"] == 32
+        assert model_section["total_params"] == 8_000_000_000
+        assert payload["method"] == "advanced"
+        assert payload["quality_metrics"]["refusal_rate"] == 0.05
+        assert len(payload["session_id"]) == 32
+
+    def test_filters_unknown_config_keys(self):
+        noisy_config = {"n_directions": 1, "secret_flag": True, "nuke": "boom"}
+        payload = build_report(**self._base_kwargs(method_config=noisy_config))
+        kept = payload["method_config"]
+        assert "n_directions" in kept
+        assert "secret_flag" not in kept
+        assert "nuke" not in kept
+
+    def test_allows_all_valid_config_keys(self):
+        """Each allowlisted key must survive into the report's method_config."""
+        everything_on = dict.fromkeys(_ALLOWED_METHOD_CONFIG_KEYS, True)
+        payload = build_report(**self._base_kwargs(method_config=everything_on))
+        for k in _ALLOWED_METHOD_CONFIG_KEYS:
+            assert k in payload["method_config"], f"Missing allowlisted key: {k}"
+
+    def test_no_model_name_in_report(self):
+        serialized = json.dumps(build_report(**self._base_kwargs()))
+        for fragment in ("meta-llama", "Llama-3"):
+            assert fragment not in serialized
+
+    def test_environment_info(self):
+        environment = build_report(**self._base_kwargs())["environment"]
+        for key in ("python_version", "os", "arch"):
+            assert key in environment
+
+    def test_stage_durations(self):
+        timings = {"summon": 2.5, "probe": 10.1, "distill": 3.2}
+        payload = build_report(**self._base_kwargs(stage_durations=timings))
+        assert payload["stage_durations"] == timings
+
+    def test_direction_stats(self):
+        supplied = {"direction_norms": {"10": 0.95}, "mean_direction_persistence": 0.87}
+        payload = build_report(**self._base_kwargs(direction_stats=supplied))
+        assert payload["direction_stats"]["mean_direction_persistence"] == 0.87
+
+    def test_excise_details(self):
+        supplied = {"modified_count": 128, "used_techniques": ["head_surgery"]}
+        payload = build_report(**self._base_kwargs(excise_details=supplied))
+        assert payload["excise_details"]["modified_count"] == 128
+
+    def test_prompt_counts(self):
+        supplied = {"harmful": 33, "harmless": 33, "jailbreak": 15}
+        payload = build_report(**self._base_kwargs(prompt_counts=supplied))
+        reported = payload["prompt_counts"]
+        assert reported["harmful"] == 33
+        assert reported["jailbreak"] == 15
+
+    def test_gpu_memory(self):
+        supplied = {"peak_allocated_gb": 7.2, "peak_reserved_gb": 8.0}
+        payload = build_report(**self._base_kwargs(gpu_memory=supplied))
+        assert payload["gpu_memory"]["peak_allocated_gb"] == 7.2
+
+    def test_analysis_insights_filtered(self):
+        """Keys outside the analysis allowlist must be dropped from the report."""
+        supplied = {
+            "detected_alignment_method": "DPO",
+            "alignment_confidence": 0.92,
+            "secret_internal_data": "should not appear",
+        }
+        payload = build_report(**self._base_kwargs(analysis_insights=supplied))
+        reported = payload["analysis_insights"]
+        assert reported["detected_alignment_method"] == "DPO"
+        assert "secret_internal_data" not in reported
+
+    def test_informed_extras(self):
+        supplied = {"ouroboros_passes": 3, "final_refusal_rate": 0.02, "total_duration": 120.5}
+        payload = build_report(**self._base_kwargs(informed_extras=supplied))
+        assert payload["informed"]["ouroboros_passes"] == 3
+
+    def test_optional_fields_omitted_when_empty(self):
+        """Sections that were not supplied must be absent from the report."""
+        payload = build_report(**self._base_kwargs())
+        optional_sections = (
+            "stage_durations",
+            "direction_stats",
+            "excise_details",
+            "prompt_counts",
+            "gpu_memory",
+            "analysis_insights",
+            "informed",
+        )
+        for section in optional_sections:
+            assert section not in payload
+
+
+# ── Direction stats extraction ──────────────────────────────────────────
+
+
+class TestDirectionStats:
+    """Test direction quality metric extraction."""
+
+    @staticmethod
+    def _randn(*shape, seed=0):
+        """Return a reproducible standard-normal tensor of the given shape.
+
+        A private seeded generator is used so the threshold assertions below
+        see the same inputs on every run, without touching torch's global RNG.
+        """
+        generator = torch.Generator().manual_seed(seed)
+        return torch.randn(*shape, generator=generator)
+
+    def test_direction_norms(self):
+        """Norms are reported per layer, keyed by the layer index as a string."""
+        pipeline = MagicMock()
+        pipeline.refusal_directions = {
+            0: self._randn(128, seed=0),
+            1: self._randn(128, seed=1),
+        }
+        pipeline.refusal_subspaces = {}
+        stats = _direction_stats(pipeline)
+        assert "direction_norms" in stats
+        assert "0" in stats["direction_norms"]
+        assert "1" in stats["direction_norms"]
+
+    def test_direction_persistence(self):
+        """Adjacent layers with similar directions should have high persistence."""
+        d = self._randn(128, seed=0)
+        d = d / d.norm()
+        # Layer 1 is layer 0's unit direction plus a small perturbation.
+        noise = self._randn(128, seed=1)
+        pipeline = MagicMock()
+        pipeline.refusal_directions = {0: d, 1: d + 0.01 * noise}
+        pipeline.refusal_subspaces = {}
+        stats = _direction_stats(pipeline)
+        assert "mean_direction_persistence" in stats
+        assert stats["mean_direction_persistence"] > 0.9
+
+    def test_effective_rank(self):
+        """Multi-direction subspace should yield effective rank > 1."""
+        pipeline = MagicMock()
+        pipeline.refusal_directions = {0: self._randn(128, seed=0)}
+        # 4-direction subspace with distinct directions
+        sub = self._randn(4, 128, seed=1)
+        pipeline.refusal_subspaces = {0: sub}
+        stats = _direction_stats(pipeline)
+        assert "effective_ranks" in stats
+        assert float(stats["effective_ranks"]["0"]) > 1.0
+
+    def test_empty_directions(self):
+        """No directions and no subspaces produce an empty stats dict."""
+        pipeline = MagicMock()
+        pipeline.refusal_directions = {}
+        pipeline.refusal_subspaces = {}
+        stats = _direction_stats(pipeline)
+        assert stats == {}
+
+
+# ── Excise details extraction ───────────────────────────────────────────
+
+
+class TestExciseDetails:
+    """Checks on the summary produced by ``_extract_excise_details``."""
+
+    @staticmethod
+    def _pipeline(**overrides):
+        """Return a MagicMock pipeline with neutral excise attributes, then apply *overrides*."""
+        attributes = {
+            "_excise_modified_count": None,
+            "_refusal_heads": {},
+            "_sae_directions": {},
+            "_expert_safety_scores": {},
+            "_layer_excise_weights": {},
+            "_expert_directions": {},
+            "_steering_hooks": [],
+            "invert_refusal": False,
+            "project_embeddings": False,
+            "activation_steering": False,
+            "expert_transplant": False,
+        }
+        attributes.update(overrides)
+        mock = MagicMock()
+        for attr_name, attr_value in attributes.items():
+            setattr(mock, attr_name, attr_value)
+        return mock
+
+    def test_basic_excise_details(self):
+        mock = self._pipeline(
+            _excise_modified_count=64,
+            _refusal_heads={10: [(0, 0.9), (3, 0.8)], 11: [(1, 0.7)]},
+        )
+        summary = _extract_excise_details(mock)
+        assert summary["modified_count"] == 64
+        # Two layers carry head entries, three heads in total.
+        assert summary["head_surgery_layers"] == 2
+        assert summary["total_heads_projected"] == 3
+        assert "head_surgery" in summary["used_techniques"]
+
+    def test_adaptive_weights(self):
+        mock = self._pipeline(_layer_excise_weights={0: 0.2, 1: 0.8, 2: 0.5})
+        summary = _extract_excise_details(mock)
+        assert summary["adaptive_weight_min"] == 0.2
+        assert summary["adaptive_weight_max"] == 0.8
+        assert "layer_adaptive" in summary["used_techniques"]
+
+
+# ── Prompt counts extraction ────────────────────────────────────────────
+
+
+class TestPromptCounts:
+    """Checks on the per-category counts from ``_extract_prompt_counts``."""
+
+    @staticmethod
+    def _pipeline(jailbreak):
+        """Return a mock pipeline with 33 harmful, 33 harmless and the given jailbreak prompts."""
+        mock = MagicMock()
+        mock.harmful_prompts = ["a"] * 33
+        mock.harmless_prompts = ["b"] * 33
+        mock.jailbreak_prompts = jailbreak
+        return mock
+
+    def test_basic_counts(self):
+        tally = _extract_prompt_counts(self._pipeline(jailbreak=None))
+        assert tally["harmful"] == 33
+        assert tally["harmless"] == 33
+        # A None jailbreak list must not produce a "jailbreak" entry.
+        assert "jailbreak" not in tally
+
+    def test_with_jailbreak(self):
+        tally = _extract_prompt_counts(self._pipeline(jailbreak=["c"] * 10))
+        assert tally["jailbreak"] == 10
+
+
+# ── Send behavior ───────────────────────────────────────────────────────
+
+
+class TestSendReport:
+    """Checks that ``send_report`` honours the enabled flag and never raises."""
+
+    def setup_method(self):
+        _reset_telemetry()
+
+    @staticmethod
+    def _wait_for_call(mock, timeout=2.0, interval=0.01):
+        """Block until *mock* has been called or *timeout* seconds have passed.
+
+        Polling replaces a fixed sleep: it returns as soon as the background
+        send has happened and tolerates a slow machine up to the timeout.
+        """
+        import time
+
+        deadline = time.monotonic() + timeout
+        while not mock.called and time.monotonic() < deadline:
+            time.sleep(interval)
+
+    def test_does_not_send_when_disabled(self):
+        disable_telemetry()
+        with patch("obliteratus.telemetry._send_sync") as mock_send:
+            send_report({"test": True})
+            mock_send.assert_not_called()
+
+    def test_sends_when_enabled(self):
+        enable_telemetry()
+        with patch("obliteratus.telemetry._send_sync") as mock_send:
+            send_report({"test": True})
+            # Wait inside the patch context so the background send still
+            # resolves to the mock.
+            self._wait_for_call(mock_send)
+            mock_send.assert_called_once_with({"test": True})
+
+    def test_send_failure_is_silent(self):
+        enable_telemetry()
+        with patch("obliteratus.telemetry._send_sync", side_effect=Exception("network down")) as mock_send:
+            # send_report should not propagate the exception to the caller
+            send_report({"test": True})
+            # Mock records the call before raising its side effect, so the
+            # poll still observes it.
+            self._wait_for_call(mock_send)
+            mock_send.assert_called_once_with({"test": True})
+
+
+# ── Pipeline integration ────────────────────────────────────────────────
+
+
+def _make_mock_pipeline():
+    """Return a MagicMock pipeline pre-populated with the attributes these telemetry tests use."""
+    mock = MagicMock()
+    mock.handle.summary.return_value = dict(
+        architecture="LlamaForCausalLM",
+        num_layers=32,
+        num_heads=32,
+        hidden_size=4096,
+        total_params=8_000_000_000,
+    )
+
+    # Method selection and its configuration flags.
+    method_settings = {
+        "method": "advanced",
+        "n_directions": 4,
+        "norm_preserve": True,
+        "regularization": 0.1,
+        "refinement_passes": 2,
+        "project_biases": True,
+        "use_chat_template": True,
+        "use_whitened_svd": True,
+        "true_iterative_refinement": False,
+        "use_jailbreak_contrast": False,
+        "layer_adaptive_strength": False,
+        "attention_head_surgery": True,
+        "safety_neuron_masking": False,
+        "per_expert_directions": False,
+        "use_sae_features": False,
+        "invert_refusal": False,
+        "project_embeddings": False,
+        "embed_regularization": 0.5,
+        "activation_steering": False,
+        "steering_strength": 0.3,
+        "expert_transplant": False,
+        "transplant_blend": 0.3,
+        "reflection_strength": 2.0,
+        "quantization": None,
+    }
+    # Results recorded by a finished run.
+    run_results = {
+        "_quality_metrics": {"perplexity": 5.2, "coherence": 0.8, "refusal_rate": 0.05},
+        "_strong_layers": [10, 11, 12, 13],
+        "_stage_durations": {
+            "summon": 3.0, "probe": 12.5, "distill": 4.1, "excise": 2.0, "verify": 8.3, "rebirth": 5.0,
+        },
+        "_excise_modified_count": 128,
+    }
+    # Excise bookkeeping and prompt sets.
+    excise_and_prompts = {
+        "_refusal_heads": {10: [(0, 0.9), (3, 0.8)]},
+        "_sae_directions": {},
+        "_expert_safety_scores": {},
+        "_layer_excise_weights": {},
+        "_expert_directions": {},
+        "_steering_hooks": [],
+        "harmful_prompts": ["x"] * 33,
+        "harmless_prompts": ["y"] * 33,
+        "jailbreak_prompts": None,
+    }
+    for group in (method_settings, run_results, excise_and_prompts):
+        for attr_name, attr_value in group.items():
+            setattr(mock, attr_name, attr_value)
+
+    # Direction data: one unit vector reused on three layers, plus a slightly
+    # perturbed copy on layer 11.
+    unit = torch.randn(4096)
+    unit = unit / unit.norm()
+    perturbed = unit + 0.01 * torch.randn(4096)
+    mock.refusal_directions = {10: unit, 11: perturbed, 12: unit, 13: unit}
+    mock.refusal_subspaces = {10: torch.randn(4, 4096)}
+
+    return mock
+
+
+class TestPipelineIntegration:
+    """End-to-end checks of ``maybe_send_pipeline_report`` against a mock pipeline."""
+
+    def setup_method(self):
+        _reset_telemetry()
+
+    def test_does_nothing_when_disabled(self):
+        disable_telemetry()
+        with patch("obliteratus.telemetry.send_report") as mock_send:
+            maybe_send_pipeline_report(_make_mock_pipeline())
+            mock_send.assert_not_called()
+
+    def test_comprehensive_report(self):
+        """Every section of the report should be populated from the pipeline."""
+        enable_telemetry()
+        pipeline = _make_mock_pipeline()
+        with patch("obliteratus.telemetry.send_report") as mock_send:
+            maybe_send_pipeline_report(pipeline)
+            mock_send.assert_called_once()
+            # First positional argument of the single call is the payload.
+            payload = mock_send.call_args[0][0]
+
+            # Core fields
+            assert payload["schema_version"] == 2
+            assert payload["model"]["architecture"] == "LlamaForCausalLM"
+            assert payload["method"] == "advanced"
+
+            # Method config
+            method_config = payload["method_config"]
+            assert method_config["n_directions"] == 4
+            for flag in ("norm_preserve", "use_whitened_svd", "attention_head_surgery"):
+                assert method_config[flag] is True
+
+            # Quality metrics
+            quality = payload["quality_metrics"]
+            assert quality["perplexity"] == 5.2
+            assert quality["refusal_rate"] == 0.05
+
+            # Stage durations
+            assert "stage_durations" in payload
+            timings = payload["stage_durations"]
+            assert timings["summon"] == 3.0
+            assert timings["verify"] == 8.3
+
+            # Strong layers
+            assert payload["strong_layers"] == [10, 11, 12, 13]
+
+            # Direction stats
+            assert "direction_stats" in payload
+            for key in ("direction_norms", "mean_direction_persistence"):
+                assert key in payload["direction_stats"]
+
+            # Excise details
+            assert "excise_details" in payload
+            excise = payload["excise_details"]
+            assert excise["modified_count"] == 128
+            assert "head_surgery" in excise["used_techniques"]
+
+            # Prompt counts
+            prompt_counts = payload["prompt_counts"]
+            assert prompt_counts["harmful"] == 33
+            assert prompt_counts["harmless"] == 33
+
+            # Environment
+            for key in ("os", "python_version"):
+                assert key in payload["environment"]
+
+
+# ── Informed pipeline integration ────────────────────────────────────────
+
+
+@dataclass
+class _MockInsights:
+    """Stand-in for the ``insights`` object carried by an informed-pipeline report.
+
+    Every field has a fixed default so a bare ``_MockInsights()`` is usable;
+    the informed-report tests assert on a subset of these values after
+    extraction.
+    """
+
+    # Alignment detection
+    detected_alignment_method: str = "DPO"
+    alignment_confidence: float = 0.92
+    alignment_probabilities: dict = field(default_factory=lambda: {"DPO": 0.92, "RLHF": 0.05})
+    # Cone / direction geometry
+    cone_is_polyhedral: bool = True
+    cone_dimensionality: float = 3.2
+    mean_pairwise_cosine: float = 0.45
+    direction_specificity: dict = field(default_factory=lambda: {"violence": 0.8})
+    cluster_count: int = 3
+    direction_persistence: float = 0.87
+    # Sparsity
+    mean_refusal_sparsity_index: float = 0.15
+    recommended_sparsity: float = 0.1
+    use_sparse_surgery: bool = True
+    # Robustness and entanglement
+    estimated_robustness: str = "medium"
+    self_repair_estimate: float = 0.3
+    entanglement_score: float = 0.2
+    entangled_layers: list = field(default_factory=lambda: [15, 16])
+    clean_layers: list = field(default_factory=lambda: [10, 11, 12])
+    # Recommendations
+    recommended_n_directions: int = 6
+    recommended_regularization: float = 0.05
+    recommended_refinement_passes: int = 3
+    recommended_layers: list = field(default_factory=lambda: [10, 11, 12, 13])
+    skip_layers: list = field(default_factory=lambda: [15])
+
+
+@dataclass
+class _MockInformedReport:
+    """Stand-in for the report object passed to ``maybe_send_informed_report``.
+
+    Bundles a ``_MockInsights`` instance with the run-level numbers that the
+    informed-report tests expect to find under the report's "informed" section.
+    """
+
+    # A fresh _MockInsights per instance (default_factory, not a shared default).
+    insights: _MockInsights = field(default_factory=_MockInsights)
+    ouroboros_passes: int = 2
+    final_refusal_rate: float = 0.02
+    analysis_duration: float = 15.3
+    total_duration: float = 85.7
+
+
+class TestInformedPipelineIntegration:
+    """End-to-end checks of ``maybe_send_informed_report`` and insight filtering."""
+
+    def setup_method(self):
+        _reset_telemetry()
+
+    def test_does_nothing_when_disabled(self):
+        disable_telemetry()
+        with patch("obliteratus.telemetry.send_report") as mock_send:
+            maybe_send_informed_report(_make_mock_pipeline(), _MockInformedReport())
+            mock_send.assert_not_called()
+
+    def test_comprehensive_informed_report(self):
+        """The informed report carries the base sections plus insights and extras."""
+        enable_telemetry()
+        p = _make_mock_pipeline()
+        report_obj = _MockInformedReport()
+
+        with patch("obliteratus.telemetry.send_report") as mock_send:
+            maybe_send_informed_report(p, report_obj)
+            mock_send.assert_called_once()
+            # First positional argument of the single call is the payload.
+            report = mock_send.call_args[0][0]
+
+            # All base fields present
+            assert report["schema_version"] == 2
+            assert report["model"]["architecture"] == "LlamaForCausalLM"
+            assert "direction_stats" in report
+            assert "excise_details" in report
+
+            # Analysis insights
+            ai = report["analysis_insights"]
+            assert ai["detected_alignment_method"] == "DPO"
+            assert ai["alignment_confidence"] == 0.92
+            assert ai["cone_is_polyhedral"] is True
+            assert ai["cone_dimensionality"] == 3.2
+            assert ai["cluster_count"] == 3
+            assert ai["self_repair_estimate"] == 0.3
+            assert ai["entanglement_score"] == 0.2
+            assert ai["recommended_n_directions"] == 6
+
+            # Informed extras
+            inf = report["informed"]
+            assert inf["ouroboros_passes"] == 2
+            assert inf["final_refusal_rate"] == 0.02
+            assert inf["analysis_duration"] == 15.3
+            assert inf["total_duration"] == 85.7
+
+    def test_analysis_insights_filter_unknown_keys(self):
+        """A field that only exists on a subclass must not leak into the extracted insights."""
+        enable_telemetry()
+
+        # No pipeline is built here: the extraction under test takes only the
+        # report object, so constructing a mock pipeline was discarded work.
+        @dataclass
+        class _BadInsights(_MockInsights):
+            secret_sauce: str = "should not appear"
+
+        report_obj = _MockInformedReport(insights=_BadInsights())
+        insights = _extract_analysis_insights(report_obj)
+        assert "detected_alignment_method" in insights
+        assert "secret_sauce" not in insights
+
+
+# ── Stage duration tracking on pipeline ──────────────────────────────────
+
+
+class TestStageDurationTracking:
+    """Exercise ``AbliterationPipeline._emit`` on a pipeline built without ``__init__``."""
+
+    @staticmethod
+    def _bare_pipeline():
+        """Return a pipeline created via ``__new__`` carrying the three attributes these tests assign."""
+        from obliteratus.abliterate import AbliterationPipeline
+
+        pipeline = AbliterationPipeline.__new__(AbliterationPipeline)
+        pipeline._stage_durations = {}
+        pipeline._excise_modified_count = None
+        pipeline._on_stage = lambda r: None
+        return pipeline
+
+    def test_emit_records_durations(self):
+        """Verify _emit stores durations in _stage_durations dict."""
+        pipeline = self._bare_pipeline()
+
+        done_events = (
+            ("summon", "loaded", {"duration": 3.5}),
+            ("probe", "probed", {"duration": 10.2}),
+            ("excise", "excised", {"duration": 2.1, "modified_count": 64}),
+        )
+        for stage, message, extra in done_events:
+            pipeline._emit(stage, "done", message, **extra)
+
+        expected_durations = {"summon": 3.5, "probe": 10.2, "excise": 2.1}
+        assert pipeline._stage_durations == expected_durations
+        assert pipeline._excise_modified_count == 64
+
+    def test_running_status_does_not_record(self):
+        """Only 'done' status should record durations."""
+        pipeline = self._bare_pipeline()
+
+        pipeline._emit("summon", "running", "loading...", duration=0)
+        assert pipeline._stage_durations == {}
+
+
+# ── Storage helpers ──────────────────────────────────────────────────────
+
+
+class TestStorageHelpers:
+    """Test persistent storage helper functions."""
+
+    def test_test_writable_valid_dir(self):
+        """A child path under a fresh temporary directory is reported writable."""
+        with tempfile.TemporaryDirectory() as root:
+            candidate = Path(root) / "subdir"
+            assert _test_writable(candidate)
+
+    def test_test_writable_unwritable(self):
+        """A path under /proc is reported as not writable."""
+        # /proc is never writable for arbitrary files
+        # NOTE(review): relies on Linux-style /proc semantics -- confirm on other platforms.
+        blocked = Path("/proc/obliteratus_test")
+        assert not _test_writable(blocked)
+
+    def test_is_mount_point_existing_path(self):
+        """The helper answers with a plain bool for the filesystem root."""
+        # Should return a bool without raising for any existing path
+        assert isinstance(_is_mount_point(Path("/")), bool)
+
+    def test_is_mount_point_nonexistent(self):
+        """A path that does not exist is not a mount point."""
+        missing = Path("/nonexistent_dir_12345")
+        assert not _is_mount_point(missing)
+
+    def test_storage_diagnostic_returns_dict(self):
+        """The diagnostic is a dict exposing each of the expected keys."""
+        diag = storage_diagnostic()
+        assert isinstance(diag, dict)
+        required_keys = (
+            "telemetry_dir",
+            "is_persistent",
+            "on_hf_spaces",
+            "telemetry_enabled",
+            "data_dir_exists",
+        )
+        for key in required_keys:
+            assert key in diag
+
+
+# ── Hub restore ──────────────────────────────────────────────────────────
+
+
+class TestHubRestore:
+    """Test Hub-to-local restore functionality."""
+
+    def setup_method(self):
+        """Run ``_reset_telemetry()`` and clear the module's ``_restore_done`` flag before each test."""
+        _reset_telemetry()
+        # Reset restore state so each test can trigger it
+        import obliteratus.telemetry as t
+        t._restore_done = False
+
+    def test_restore_skips_when_no_repo(self):
+        """With an empty repo setting, restore reports zero records."""
+        with patch("obliteratus.telemetry._TELEMETRY_REPO", ""):
+            assert restore_from_hub() == 0
+
+    def test_restore_deduplicates(self):
+        """Records already in local JSONL should not be re-added."""
+        existing = {"session_id": "abc", "timestamp": "2025-01-01T00:00:00"}
+        hub_records = [
+            dict(existing),  # duplicate of the local record
+            {"session_id": "def", "timestamp": "2025-01-02T00:00:00"},  # new
+        ]
+
+        with tempfile.TemporaryDirectory() as d:
+            test_file = Path(d) / "telemetry.jsonl"
+            test_file.write_text(json.dumps(existing) + "\n")
+
+            # patch() puts the module globals back on exit, even when an
+            # assertion fails, so no manual save/restore is needed.
+            with patch("obliteratus.telemetry.TELEMETRY_FILE", test_file):
+                with patch("obliteratus.telemetry._TELEMETRY_REPO", "test/repo"):
+                    with patch(
+                        "obliteratus.telemetry.fetch_hub_records",
+                        return_value=hub_records,
+                    ):
+                        count = restore_from_hub()
+
+            assert count == 1  # Only the new record
+
+            # Verify file contents
+            lines = test_file.read_text().strip().split("\n")
+            assert len(lines) == 2  # original + 1 new
+
+    def test_restore_only_runs_once(self):
+        """Calling restore_from_hub() twice should be a no-op the second time."""
+        with patch("obliteratus.telemetry._TELEMETRY_REPO", "test/repo"):
+            with patch(
+                "obliteratus.telemetry.fetch_hub_records", return_value=[]
+            ) as fetch_spy:
+                restore_from_hub()
+                fetches_after_first = fetch_spy.call_count
+
+                # Second call should return 0 immediately
+                assert restore_from_hub() == 0
+                # The hub mock is empty, so a zero return cannot tell "skipped"
+                # from "ran again and found nothing"; require no further fetch.
+                assert fetch_spy.call_count == fetches_after_first
@@ -0,0 +1,167 @@
+"""Tests for visualization module (non-interactive, save-to-file)."""
+
+from __future__ import annotations
+
+import tempfile
+from pathlib import Path
+
+import pytest
+import torch
+
+from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer
+from obliteratus.analysis.activation_probing import ActivationProbe
+from obliteratus.analysis.visualization import (
+    _sanitize_label,
+    plot_refusal_topology,
+    plot_cross_layer_heatmap,
+    plot_angular_drift,
+    plot_probe_dashboard,
+    plot_defense_radar,
+)
+from obliteratus.analysis.defense_robustness import DefenseProfile
+
+
+@pytest.fixture
+def tmp_dir():
+    """Yield a fresh temporary directory as a ``Path``; the directory is cleaned up afterwards."""
+    with tempfile.TemporaryDirectory() as scratch:
+        yield Path(scratch)
+
+
+def _make_refusal_data(n_layers=6, hidden_dim=16):
+    """Create test refusal directions and means.
+
+    Seeds torch's global RNG with 42, then for each layer draws a unit-norm
+    direction and a base vector. The harmless mean is the base vector; the
+    harmful mean is the base shifted along the direction by 2.0 on strong
+    layers and 0.3 elsewhere. Means carry a leading dimension of size 1.
+
+    Args:
+        n_layers: Number of layers to generate (keys ``0 .. n_layers - 1``).
+        hidden_dim: Length of each direction / mean vector.
+
+    Returns:
+        ``(directions, harmful_means, harmless_means, strong_layers)`` where
+        the first three are dicts keyed by layer index and ``strong_layers``
+        lists the strongly shifted layers that actually exist.
+    """
+    torch.manual_seed(42)
+    # Single source of truth for the strong layers; indices beyond n_layers
+    # are dropped so the returned list never names a missing layer.
+    strong_layers = [layer for layer in (2, 3, 4) if layer < n_layers]
+    directions = {}
+    harmful_means = {}
+    harmless_means = {}
+
+    for layer in range(n_layers):
+        # Draw order (direction first, then base) is kept so seeded values stay stable.
+        raw = torch.randn(hidden_dim)
+        unit = raw / raw.norm()
+        base = torch.randn(hidden_dim)
+        shift = 2.0 if layer in strong_layers else 0.3
+        directions[layer] = unit
+        harmless_means[layer] = base.unsqueeze(0)
+        harmful_means[layer] = (base + shift * unit).unsqueeze(0)
+
+    return directions, harmful_means, harmless_means, strong_layers
+
+
+class TestRefusalTopology:
+    """Smoke tests for ``plot_refusal_topology``."""
+
+    def test_plot_saves_file(self, tmp_dir):
+        """The plot lands at ``output_path`` as a non-empty file."""
+        dirs, harmful, harmless, strong_layers = _make_refusal_data()
+        out_file = tmp_dir / "topology.png"
+        plot_refusal_topology(dirs, harmful, harmless, strong_layers, output_path=out_file)
+        assert out_file.exists()
+        assert out_file.stat().st_size > 0
+
+    def test_plot_returns_figure(self, tmp_dir):
+        """The call hands back a non-None object."""
+        plot_args = _make_refusal_data()
+        figure = plot_refusal_topology(*plot_args, output_path=tmp_dir / "test.png")
+        assert figure is not None
+
+
+class TestCrossLayerHeatmap:
+    """Smoke test for ``plot_cross_layer_heatmap``."""
+
+    def test_plot_saves_file(self, tmp_dir):
+        """Analysing six seeded random directions and plotting writes the output file."""
+        torch.manual_seed(42)
+        directions = {}
+        for layer in range(6):
+            directions[layer] = torch.randn(16)
+
+        analysis = CrossLayerAlignmentAnalyzer().analyze(directions)
+
+        out_file = tmp_dir / "heatmap.png"
+        plot_cross_layer_heatmap(analysis, output_path=out_file)
+        assert out_file.exists()
+
+
+class TestAngularDrift:
+    """Smoke test for ``plot_angular_drift``."""
+
+    def test_plot_saves_file(self, tmp_dir):
+        """Analysing eight seeded random directions and plotting writes the output file."""
+        torch.manual_seed(42)
+        directions = {}
+        for layer in range(8):
+            directions[layer] = torch.randn(16)
+
+        analysis = CrossLayerAlignmentAnalyzer().analyze(directions)
+
+        out_file = tmp_dir / "drift.png"
+        plot_angular_drift(analysis, output_path=out_file)
+        assert out_file.exists()
+
+
+class TestProbeDashboard:
+    """Smoke test for ``plot_probe_dashboard``."""
+
+    def test_plot_saves_file(self, tmp_dir):
+        """Probing four layers of seeded random activations and plotting writes the output file."""
+        torch.manual_seed(42)
+        layers = range(4)
+
+        def three_samples():
+            # Three random activation vectors of length 8.
+            return [torch.randn(8) for _ in range(3)]
+
+        # Draw order matters for the seeded values: harmful, then harmless, then directions.
+        harmful_acts = {layer: three_samples() for layer in layers}
+        harmless_acts = {layer: three_samples() for layer in layers}
+        layer_dirs = {layer: torch.randn(8) for layer in layers}
+
+        probe_result = ActivationProbe().probe_all_layers(harmful_acts, harmless_acts, layer_dirs)
+
+        out_file = tmp_dir / "probe.png"
+        plot_probe_dashboard(probe_result, output_path=out_file)
+        assert out_file.exists()
+
+
+class TestDefenseRadar:
+    """Rendering tests for ``plot_defense_radar``."""
+
+    @staticmethod
+    def _profile(model_name):
+        """Return a ``DefenseProfile`` with fixed metrics and the given ``model_name``."""
+        return DefenseProfile(
+            model_name=model_name,
+            alignment_type_estimate="RLHF-like",
+            refusal_concentration=0.4,
+            refusal_layer_spread=5,
+            mean_refusal_strength=2.0,
+            max_refusal_strength=4.0,
+            self_repair_estimate=0.6,
+            entanglement_score=0.3,
+            estimated_robustness="medium",
+        )
+
+    def test_plot_saves_file(self, tmp_dir):
+        """Plotting a profile writes the requested output file."""
+        out_file = tmp_dir / "radar.png"
+        plot_defense_radar(self._profile("test-model"), output_path=out_file)
+        assert out_file.exists()
+
+    def test_model_name_sanitized_in_title(self, tmp_dir):
+        """Ensure sensitive paths in model_name don't leak into saved charts."""
+        cache_path = "/home/user/.cache/huggingface/hub/models--secret-org/private-model"
+        out_file = tmp_dir / "radar_sanitized.png"
+        figure = plot_defense_radar(self._profile(cache_path), output_path=out_file)
+        # Title should not contain the full filesystem path
+        title = figure.axes[0].get_title()
+        for fragment in ("/home/user", ".cache"):
+            assert fragment not in title
+
+
+class TestSanitizeLabel:
+    """Unit tests for the ``_sanitize_label`` helper."""
+
+    def test_strips_absolute_paths(self):
+        """The home-directory prefix is gone while "model" still appears in the label."""
+        cleaned = _sanitize_label("/home/user/.cache/huggingface/models--org/model")
+        assert "/home/user" not in cleaned
+        assert "model" in cleaned
+
+    def test_redacts_hf_tokens(self):
+        """An ``hf_``-prefixed token is swapped for the ``<TOKEN>`` placeholder."""
+        token = "hf_abcdefghij"
+        cleaned = _sanitize_label("model with hf_abcdefghij token")
+        assert token not in cleaned
+        assert "<TOKEN>" in cleaned
+
+    def test_redacts_long_hex_strings(self):
+        """A 40-character hex run is swapped for the ``<REDACTED>`` placeholder."""
+        digest = "a" * 40
+        cleaned = _sanitize_label(f"commit {digest}")
+        assert digest not in cleaned
+        assert "<REDACTED>" in cleaned
+
+    def test_truncates_long_strings(self):
+        """A 200-character label is cut to at most 80 characters ending in an ellipsis."""
+        cleaned = _sanitize_label("x" * 200)
+        assert len(cleaned) <= 80
+        assert cleaned.endswith("...")
+
+    def test_passes_normal_strings_through(self):
+        """An ordinary title comes back unchanged."""
+        plain = "Refusal Topology Map"
+        assert _sanitize_label("Refusal Topology Map") == plain