# Repository-viewer metadata (not part of the module source):
#   OBLITERATUS/tests/test_abliterate.py — 2026-03-05 00:50:44 -08:00
#   2635 lines, 102 KiB, Python
"""Tests for the SOTA abliteration pipeline."""
from __future__ import annotations
import json
from pathlib import Path
from unittest.mock import MagicMock
import pytest
import torch
from transformers import GPT2Config, GPT2LMHeadModel
from obliteratus.abliterate import (
HARMFUL_PROMPTS,
HARMLESS_PROMPTS,
METHODS,
STAGES,
AbliterationPipeline,
PipelineStage,
StageResult,
)
from obliteratus.models.loader import ModelHandle
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _make_tiny_handle():
    """Build a ModelHandle around a tiny random GPT-2 for fast unit tests.

    The tokenizer is a MagicMock that always returns a fixed-shape batch and
    decodes to a benign sentence, so no real tokenizer assets are needed.
    """
    cfg = GPT2Config(
        vocab_size=1000,
        n_positions=128,
        n_embd=64,
        n_layer=4,
        n_head=2,
        n_inner=256,
    )
    tiny_model = GPT2LMHeadModel(cfg)
    tiny_model.eval()
    tok = MagicMock()
    tok.pad_token = "<pad>"
    tok.eos_token = "<eos>"
    tok.return_value = {
        "input_ids": torch.randint(0, 1000, (1, 10)),
        "attention_mask": torch.ones(1, 10, dtype=torch.long),
    }
    tok.decode.return_value = "The capital of France is Paris, a beautiful city"
    tiny_handle = ModelHandle(
        model=tiny_model,
        tokenizer=tok,
        config=cfg,
        model_name="gpt2-test",
        task="causal_lm",
    )
    # Snapshot the pristine weights so tests can restore/compare later.
    tiny_handle.snapshot()
    return tiny_handle
def _make_varied_tokenizer(handle):
    """Make the handle's mock tokenizer emit different tokens on every call.

    Each invocation reseeds torch's RNG with the call index, so outputs are
    distinct across calls yet fully reproducible.
    """
    n_calls = [0]

    def _tokenize(prompt, **kwargs):
        n_calls[0] += 1
        torch.manual_seed(n_calls[0])
        ids = torch.randint(0, 1000, (1, 5))
        mask = torch.ones(1, 5, dtype=torch.long)
        return {"input_ids": ids, "attention_mask": mask}

    handle.tokenizer.side_effect = _tokenize
@pytest.fixture
def handle():
    """Provide a fresh tiny-GPT-2 ModelHandle per test."""
    return _make_tiny_handle()
# ---------------------------------------------------------------------------
# Data & stage definitions
# ---------------------------------------------------------------------------
class TestPrompts:
    """Sanity checks on the built-in contrastive prompt datasets."""

    def test_harmful_prompts_expanded(self):
        assert len(HARMFUL_PROMPTS) >= 99

    def test_harmless_prompts_expanded(self):
        assert len(HARMLESS_PROMPTS) >= 99

    def test_prompt_lists_same_length(self):
        harmful_count = len(HARMFUL_PROMPTS)
        harmless_count = len(HARMLESS_PROMPTS)
        assert harmful_count == harmless_count

    def test_prompt_count_512(self):
        """512 prompts across 7 severity tiers."""
        for prompt_list in (HARMFUL_PROMPTS, HARMLESS_PROMPTS):
            assert len(prompt_list) == 512

    def test_prompt_volume_slicing(self):
        """Slicing at standard volumes gives correct counts."""
        for volume in (33, 66, 99, 256, 512):
            assert len(HARMFUL_PROMPTS[:volume]) == volume
            assert len(HARMLESS_PROMPTS[:volume]) == volume
class TestStages:
    """Pipeline stage definitions and the StageResult dataclass."""

    def test_six_stages(self):
        assert len(STAGES) == 6

    def test_stage_keys(self):
        expected_order = ["summon", "probe", "distill", "excise", "verify", "rebirth"]
        assert [stage.key for stage in STAGES] == expected_order

    def test_stage_dataclass(self):
        st = PipelineStage(key="test", name="TEST", description="A test stage")
        assert st.key == "test"
        assert st.name == "TEST"

    def test_stage_result_defaults(self):
        res = StageResult(stage="test", status="running")
        assert res.message == ""
        assert res.duration == 0.0
        assert res.details == {}
# ---------------------------------------------------------------------------
# Method presets
# ---------------------------------------------------------------------------
class TestMethods:
    """Checks on the METHODS preset dictionary."""

    def test_methods_exist(self):
        expected = {
            "basic", "advanced", "aggressive", "informed", "surgical",
            "inverted", "nuclear", "optimized", "failspy", "gabliteration",
            "heretic", "rdo", "spectral_cascade",
        }
        assert set(METHODS.keys()) == expected

    def test_basic_single_direction(self):
        basic = METHODS["basic"]
        assert basic["n_directions"] == 1
        assert basic["norm_preserve"] is False
        assert basic["regularization"] == 0.0
        assert basic["refinement_passes"] == 1

    def test_advanced_multi_direction(self):
        adv = METHODS["advanced"]
        assert adv["n_directions"] > 1
        assert adv["norm_preserve"] is True
        assert adv["regularization"] > 0
        assert adv["refinement_passes"] >= 2

    def test_aggressive_full_gabliteration(self):
        agg = METHODS["aggressive"]
        assert agg["n_directions"] >= 8
        assert agg["norm_preserve"] is True
        assert agg["refinement_passes"] >= 3
# ---------------------------------------------------------------------------
# Pipeline init
# ---------------------------------------------------------------------------
class TestPipelineInit:
    """Constructor defaults, method presets, and callback wiring."""

    def test_default_prompts(self):
        p = AbliterationPipeline(model_name="test-model")
        assert p.harmful_prompts == HARMFUL_PROMPTS
        assert p.harmless_prompts == HARMLESS_PROMPTS

    def test_custom_prompts(self):
        bad = ["bad prompt"]
        good = ["good prompt"]
        p = AbliterationPipeline(
            model_name="test-model",
            harmful_prompts=bad,
            harmless_prompts=good,
        )
        assert p.harmful_prompts == bad
        assert p.harmless_prompts == good

    def test_defaults(self):
        p = AbliterationPipeline(model_name="test-model")
        assert p.device == "auto"
        assert p.dtype == "float16"
        assert p.output_dir == Path("abliterated")
        assert p.trust_remote_code is False
        assert p.handle is None

    def test_default_method_is_advanced(self):
        p = AbliterationPipeline(model_name="test-model")
        advanced = METHODS["advanced"]
        assert p.method == "advanced"
        assert p.n_directions == advanced["n_directions"]
        assert p.norm_preserve == advanced["norm_preserve"]
        assert p.regularization == advanced["regularization"]

    def test_method_basic(self):
        p = AbliterationPipeline(model_name="test-model", method="basic")
        assert p.n_directions == 1
        assert p.norm_preserve is False
        assert p.regularization == 0.0

    def test_method_aggressive(self):
        p = AbliterationPipeline(model_name="test-model", method="aggressive")
        assert p.n_directions == 8
        assert p.norm_preserve is True
        assert p.refinement_passes == 3

    def test_explicit_overrides_method(self):
        # Explicit kwargs must win over the values the preset would supply.
        p = AbliterationPipeline(
            model_name="test-model",
            method="basic",
            n_directions=6,
            norm_preserve=True,
            regularization=0.5,
            refinement_passes=4,
        )
        assert p.n_directions == 6
        assert p.norm_preserve is True
        assert p.regularization == 0.5
        assert p.refinement_passes == 4

    def test_callbacks(self):
        seen_results = []
        seen_logs = []
        p = AbliterationPipeline(
            model_name="test-model",
            on_stage=seen_results.append,
            on_log=seen_logs.append,
        )
        p.log("hello")
        assert seen_logs == ["hello"]
        p._emit("test", "running", "msg")
        assert len(seen_results) == 1
        assert seen_results[0].stage == "test"
# ---------------------------------------------------------------------------
# _project_out_advanced (norm-preserving + regularization)
# ---------------------------------------------------------------------------
class TestProjectOutAdvanced:
    """Tests for ``_project_out_advanced`` (norm preservation + regularization).

    Fix: the norm-preserving test previously computed an unused local
    (``without_preserve_norm_sq``) — and computed it from the *already
    projected* weights, so the value was meaningless anyway. The dead
    computation has been removed; the assertion is unchanged.
    """

    def test_norm_preserving(self):
        """Norm-preserving mode should keep Frobenius norm approximately constant."""
        class Wrapper(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.o_proj = torch.nn.Linear(4, 8, bias=False)

        module = Wrapper()
        torch.manual_seed(42)
        module.o_proj.weight.data = torch.randn(8, 4)
        original_norm = module.o_proj.weight.data.norm().item()
        direction = torch.randn(4, 1)
        direction = direction / direction.norm()
        AbliterationPipeline._project_out_advanced(
            module, direction, ["o_proj"], norm_preserve=True, regularization=0.0
        )
        new_norm = module.o_proj.weight.data.norm().item()
        # With amplification cap (1.10x max), exact norm preservation isn't
        # guaranteed on tiny matrices (hidden_dim=4) where a single direction
        # removes a large fraction of energy. Verify the cap restored most of
        # the removed energy, keeping the norm close to the original.
        assert new_norm >= original_norm * 0.85, \
            f"Norm should be approximately preserved (within cap): {original_norm:.4f} vs {new_norm:.4f}"

    def test_regularization_partial_removal(self):
        """Regularization should preserve some of the refusal component."""
        class Wrapper(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.o_proj = torch.nn.Linear(4, 8, bias=False)

        module_full = Wrapper()
        module_reg = Wrapper()
        torch.manual_seed(42)
        W_orig = torch.randn(8, 4)
        module_full.o_proj.weight.data = W_orig.clone()
        module_reg.o_proj.weight.data = W_orig.clone()
        direction = torch.randn(4, 1)
        direction = direction / direction.norm()
        # Full removal
        AbliterationPipeline._project_out_advanced(
            module_full, direction, ["o_proj"], norm_preserve=False, regularization=0.0
        )
        # Regularized (30% preserved)
        AbliterationPipeline._project_out_advanced(
            module_reg, direction, ["o_proj"], norm_preserve=False, regularization=0.3
        )
        W_full = module_full.o_proj.weight.data
        W_reg = module_reg.o_proj.weight.data
        # Full removal should have zero projection on direction
        proj_full = (W_full @ direction).norm().item()
        assert proj_full < 1e-4
        # Regularized should have non-zero projection (30% preserved)
        proj_reg = (W_reg @ direction).norm().item()
        proj_orig = (W_orig @ direction).norm().item()
        expected_ratio = 0.3
        actual_ratio = proj_reg / proj_orig if proj_orig > 0 else 0
        assert abs(actual_ratio - expected_ratio) < 0.05, \
            f"Expected ~{expected_ratio:.0%} preserved, got {actual_ratio:.0%}"

    def test_norm_preserving_transposed(self):
        """Norm-preserving should also work for transposed weights."""
        class Wrapper(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.c_proj = torch.nn.Linear(8, 4, bias=False)

        module = Wrapper()
        torch.manual_seed(42)
        module.c_proj.weight.data = torch.randn(4, 8)
        original_norm = module.c_proj.weight.data.norm().item()
        direction = torch.randn(4, 1)
        direction = direction / direction.norm()
        AbliterationPipeline._project_out_advanced(
            module, direction, ["c_proj"], norm_preserve=True, regularization=0.0
        )
        new_norm = module.c_proj.weight.data.norm().item()
        # With amplification cap (1.10x max), exact norm preservation isn't
        # guaranteed on tiny matrices where a single direction removes a large
        # fraction of energy.
        assert new_norm >= original_norm * 0.80, \
            f"Norm should be approximately preserved (within cap): {original_norm:.4f} vs {new_norm:.4f}"
# ---------------------------------------------------------------------------
# Full attention projection (q/k/v + o_proj)
# ---------------------------------------------------------------------------
class TestAttentionFullProjection:
    """Test that ALL attention weight matrices are projected (not just o_proj)."""

    def test_qkv_all_projected(self):
        """q_proj, k_proj, v_proj should all be projected alongside o_proj."""
        hidden = 16

        class FakeAttn(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.q_proj = torch.nn.Linear(hidden, hidden, bias=False)
                self.k_proj = torch.nn.Linear(hidden, hidden, bias=False)
                self.v_proj = torch.nn.Linear(hidden, hidden, bias=False)
                self.o_proj = torch.nn.Linear(hidden, hidden, bias=False)

        attn = FakeAttn()
        torch.manual_seed(42)
        for param in attn.parameters():
            param.data = torch.randn_like(param.data)
        names = ["q_proj", "k_proj", "v_proj", "o_proj"]
        before = {n: getattr(attn, n).weight.data.clone() for n in names}
        direction = torch.randn(hidden, 1)
        direction = direction / direction.norm()
        from obliteratus.abliterate import _ATTN_OUT_NAMES, _ATTN_IN_NAMES
        n_hit = AbliterationPipeline._project_out_advanced(
            attn, direction, _ATTN_OUT_NAMES + _ATTN_IN_NAMES,
        )
        assert n_hit == 4, f"Should project 4 weights (q/k/v/o), got {n_hit}"
        for n in names:
            assert not torch.allclose(
                getattr(attn, n).weight.data, before[n]
            ), f"{n} should be modified"

    def test_project_all_does_not_early_return(self):
        """_project_out_advanced should project ALL matching weights, not just first."""
        hidden = 16

        class FakeModule(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.up_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.gate_proj = torch.nn.Linear(hidden, 32, bias=False)

        ffn = FakeModule()
        torch.manual_seed(42)
        before_up = ffn.up_proj.weight.data.clone()
        before_gate = ffn.gate_proj.weight.data.clone()
        direction = torch.randn(hidden, 1)
        direction = direction / direction.norm()
        from obliteratus.abliterate import _FFN_IN_NAMES
        n_hit = AbliterationPipeline._project_out_advanced(ffn, direction, _FFN_IN_NAMES)
        assert n_hit == 2, f"Should project both up_proj and gate_proj, got {n_hit}"
        assert not torch.allclose(ffn.up_proj.weight.data, before_up), "up_proj should be modified"
        assert not torch.allclose(ffn.gate_proj.weight.data, before_gate), "gate_proj should be modified"

    def test_lm_head_projection(self):
        """lm_head should be projectable via _project_out_advanced."""
        hidden = 16
        vocab = 100

        class FakeModel(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.lm_head = torch.nn.Linear(hidden, vocab, bias=False)

        lm = FakeModel()
        torch.manual_seed(42)
        before = lm.lm_head.weight.data.clone()
        direction = torch.randn(hidden, 1)
        direction = direction / direction.norm()
        n_hit = AbliterationPipeline._project_out_advanced(
            lm, direction, ["lm_head"], regularization=0.0,
        )
        assert n_hit == 1, "Should project lm_head"
        assert not torch.allclose(lm.lm_head.weight.data, before), "lm_head should be modified"
        # Verify refusal direction is removed from lm_head
        residual = (lm.lm_head.weight.data @ direction).norm().item()
        assert residual < 1e-4, f"Refusal direction should be removed from lm_head, proj={residual}"
class TestKneeDetectionThreshold:
    """Test that knee detection uses 5% threshold to include more layers."""

    def test_five_percent_threshold_includes_more(self):
        """Layers between 5% and 10% of max should now be included."""
        # Layer norms: max=10.0, then several between 5%-10% of that max.
        ranked = [(0, 10.0), (1, 8.0), (2, 6.0), (3, 0.7), (4, 0.6)]
        chosen = AbliterationPipeline._select_layers_knee(ranked)
        # 0.7 and 0.6 are 7% and 6% of max — should now be included (> 5% threshold)
        assert 3 in chosen or 4 in chosen, (
            f"Layers with 6-7% of max signal should be included, got {chosen}"
        )
# ---------------------------------------------------------------------------
# MoE projection (router, shared expert, input/output, fused)
# ---------------------------------------------------------------------------
class TestProjectMoEExperts:
    """Test the full MoE projection pipeline: router, shared expert, experts."""

    def _make_direction(self, hidden_dim=16):
        # Helper: unit-norm column vector used as the refusal direction.
        d = torch.randn(hidden_dim, 1)
        return d / d.norm()

    def test_router_gate_projected(self):
        """Router/gate weight should have refusal direction removed."""
        hidden = 16
        n_experts = 4

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, n_experts, bias=True)
                self.experts = torch.nn.ModuleList([
                    self._make_expert() for _ in range(n_experts)
                ])

            @staticmethod
            def _make_expert():
                # Bare Module with Linear attributes, mimicking an expert MLP.
                m = torch.nn.Module()
                m.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                m.up_proj = torch.nn.Linear(hidden, 32, bias=False)
                return m

        moe = FakeMoE()
        d = self._make_direction(hidden)
        W_gate_orig = moe.gate.weight.data.clone()
        count = AbliterationPipeline._project_moe_experts(moe, d)
        assert count > 0
        # Gate weight should have been modified
        assert not torch.allclose(moe.gate.weight.data, W_gate_orig), \
            "Router/gate weights should be projected"
        # The gate weight's projection onto the direction should be ~0
        proj = (moe.gate.weight.data @ d).norm().item()
        assert proj < 1e-4, f"Gate should have no component along refusal dir, got {proj}"

    def test_shared_expert_projected(self):
        """Shared expert (always-on) should have both input and output projected."""
        hidden = 16

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, 2, bias=False)
                # Qwen-style shared expert that processes every token.
                self.shared_expert = torch.nn.Module()
                self.shared_expert.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.shared_expert.up_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.experts = torch.nn.ModuleList([
                    self._make_expert() for _ in range(2)
                ])

            @staticmethod
            def _make_expert():
                m = torch.nn.Module()
                m.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                m.up_proj = torch.nn.Linear(hidden, 32, bias=False)
                return m

        moe = FakeMoE()
        d = self._make_direction(hidden)
        shared_down_orig = moe.shared_expert.down_proj.weight.data.clone()
        shared_up_orig = moe.shared_expert.up_proj.weight.data.clone()
        count = AbliterationPipeline._project_moe_experts(moe, d)
        assert count > 0
        # Both shared expert output AND input projections should be modified
        assert not torch.allclose(moe.shared_expert.down_proj.weight.data, shared_down_orig), \
            "Shared expert output (down_proj) should be projected"
        assert not torch.allclose(moe.shared_expert.up_proj.weight.data, shared_up_orig), \
            "Shared expert input (up_proj) should be projected"

    def test_expert_input_projections_projected(self):
        """Expert input projections (up_proj, gate_proj) should also be modified."""
        hidden = 16

        class FakeExpert(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.up_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.gate_proj = torch.nn.Linear(hidden, 32, bias=False)

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(2)])

        moe = FakeMoE()
        d = self._make_direction(hidden)
        up_orig = moe.experts[0].up_proj.weight.data.clone()
        count = AbliterationPipeline._project_moe_experts(moe, d)
        # Each expert contributes 2 projections (output + input)
        # 2 experts * 2 = 4 minimum
        assert count >= 4, f"Expected >= 4 projections (out+in per expert), got {count}"
        assert not torch.allclose(moe.experts[0].up_proj.weight.data, up_orig), \
            "Expert input (up_proj) should be projected"

    def test_fused_3d_output_and_input(self):
        """Fused 3D parameter patterns (GPT-OSS style) should project both directions."""
        hidden = 16
        intermediate = 32
        n_experts = 4

        class FusedExperts(torch.nn.Module):
            def __init__(self):
                super().__init__()
                # All experts stacked into a single 3D parameter.
                self.down_proj = torch.nn.Parameter(torch.randn(n_experts, intermediate, hidden))
                self.up_proj = torch.nn.Parameter(torch.randn(n_experts, intermediate, hidden))

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.experts = FusedExperts()

        moe = FakeMoE()
        d = self._make_direction(hidden)
        down_orig = moe.experts.down_proj.data.clone()
        up_orig = moe.experts.up_proj.data.clone()
        count = AbliterationPipeline._project_moe_experts(moe, d)
        # 4 experts output + 4 experts input = 8
        assert count == 8, f"Expected 8 fused projections, got {count}"
        assert not torch.allclose(moe.experts.down_proj.data, down_orig), \
            "Fused output (down_proj) should be projected"
        assert not torch.allclose(moe.experts.up_proj.data, up_orig), \
            "Fused input (up_proj) should be projected"

    def test_fused_3d_norm_preserve(self):
        """Fused 3D projections should preserve norms when requested."""
        hidden = 16
        intermediate = 32
        n_experts = 4

        class FusedExperts(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Parameter(torch.randn(n_experts, intermediate, hidden))

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.experts = FusedExperts()

        moe = FakeMoE()
        d = self._make_direction(hidden)
        # Record per-expert norms before
        orig_norms = [moe.experts.down_proj.data[i].norm().item() for i in range(n_experts)]
        AbliterationPipeline._project_moe_experts(moe, d, norm_preserve=True)
        # Check per-expert norms preserved
        for i in range(n_experts):
            new_norm = moe.experts.down_proj.data[i].norm().item()
            assert abs(orig_norms[i] - new_norm) < 1e-3, \
                f"Expert {i} norm not preserved: {orig_norms[i]:.4f} vs {new_norm:.4f}"

    def test_no_experts_returns_zero(self):
        """Module without experts attribute should return 0."""
        class NoMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.mlp = torch.nn.Linear(16, 32)

        moe = NoMoE()
        d = self._make_direction(16)
        assert AbliterationPipeline._project_moe_experts(moe, d) == 0

    def test_router_bias_projected(self):
        """Router bias should be projected when project_biases=True."""
        hidden = 16

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, 4, bias=True)
                self.experts = torch.nn.ModuleList([
                    self._make_expert() for _ in range(4)
                ])

            @staticmethod
            def _make_expert():
                m = torch.nn.Module()
                m.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                return m

        moe = FakeMoE()
        d = self._make_direction(hidden)
        bias_orig = moe.gate.bias.data.clone()
        count = AbliterationPipeline._project_moe_experts(moe, d, project_biases=True)
        # Gate has 4 outputs (num_experts), direction has 16 dims
        # bias shape (4,) != direction shape (16,), so bias won't match.
        # This is correct: router bias is (num_experts,), not (hidden_dim,),
        # so _project_bias won't modify it (shape mismatch is expected).
        assert torch.allclose(moe.gate.bias.data, bias_orig), (
            "Router bias should be unchanged when shape mismatches direction"
        )
        assert isinstance(count, int)
        assert count > 0  # expert weights should still be projected

    def test_router_auto_detection_fallback(self):
        """Unknown router name should be auto-detected and projected."""
        import warnings as w
        hidden = 16
        n_experts = 4

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                # Unusual router name not in _ROUTER_NAMES
                self.moe_gate_proj = torch.nn.Linear(hidden, n_experts, bias=False)
                self.experts = torch.nn.ModuleList([
                    self._make_expert() for _ in range(n_experts)
                ])

            @staticmethod
            def _make_expert():
                m = torch.nn.Module()
                m.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                return m

        moe = FakeMoE()
        d = self._make_direction(hidden)
        gate_orig = moe.moe_gate_proj.weight.data.clone()
        with w.catch_warnings(record=True) as caught:
            w.simplefilter("always")
            AbliterationPipeline._project_moe_experts(moe, d)
        # Should auto-detect and project the unusual router name
        assert not torch.allclose(moe.moe_gate_proj.weight.data, gate_orig), \
            "Auto-detected router should be projected"
        # Should emit a warning about the auto-detection
        auto_detect_warnings = [
            x for x in caught
            if "auto-detected" in str(x.message)
        ]
        assert len(auto_detect_warnings) > 0, "Should warn about auto-detected router"

    def test_full_moe_all_components(self):
        """End-to-end: all MoE components should be modified together."""
        hidden = 16

        class FakeExpert(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.up_proj = torch.nn.Linear(hidden, 32, bias=False)

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, 4, bias=False)
                self.shared_expert = torch.nn.Module()
                self.shared_expert.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.shared_expert.up_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(4)])

        moe = FakeMoE()
        d = self._make_direction(hidden)
        count = AbliterationPipeline._project_moe_experts(moe, d)
        # Expected: 1 (gate) + 2 (shared out+in) + 4*2 (expert out+in) = 11
        assert count == 11, f"Expected 11 total projections, got {count}"
# ---------------------------------------------------------------------------
# SOTA technique #1: Safety-neuron masking (GateBreaker-style z-score)
# ---------------------------------------------------------------------------
class TestSafetyNeuronMasking:
    """GateBreaker-style z-score masking of safety neurons."""

    def test_outlier_neurons_zeroed(self):
        """Neurons with outsized refusal projection should be zeroed."""
        hidden = 16

        class Wrapper(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, 64, bias=False)

        wrapped = Wrapper()
        torch.manual_seed(42)
        direction = torch.randn(hidden, 1)
        direction = direction / direction.norm()
        # Rows 0-2 point almost exactly along the direction with huge
        # magnitude, making them clear z-score outliers.
        for row in range(3):
            wrapped.down_proj.weight.data[row] = direction.squeeze() * 10.0
        masked = AbliterationPipeline._mask_safety_neurons(
            wrapped, direction, ["down_proj"], z_threshold=2.0,
        )
        assert masked >= 3, f"Expected >= 3 masked neurons, got {masked}"
        for row in range(3):
            assert wrapped.down_proj.weight.data[row].abs().max().item() < 1e-6

    def test_no_outliers_no_masking(self):
        """When all neurons have similar projection, none should be masked."""
        hidden = 16

        class Wrapper(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, 32, bias=False)

        wrapped = Wrapper()
        # Identical rows everywhere — zero variance, so no z-score outliers.
        wrapped.down_proj.weight.data = torch.ones(32, hidden) * 0.1
        direction = torch.randn(hidden, 1)
        direction = direction / direction.norm()
        masked = AbliterationPipeline._mask_safety_neurons(
            wrapped, direction, ["down_proj"], z_threshold=2.0,
        )
        assert masked == 0

    def test_high_threshold_masks_fewer(self):
        """Higher z_threshold should mask fewer neurons."""
        hidden = 16

        class Wrapper(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, 64, bias=False)

        low_thresh = Wrapper()
        high_thresh = Wrapper()
        torch.manual_seed(42)
        shared_weights = torch.randn(64, hidden)
        low_thresh.down_proj.weight.data = shared_weights.clone()
        high_thresh.down_proj.weight.data = shared_weights.clone()
        direction = torch.randn(hidden, 1)
        direction = direction / direction.norm()
        n_low = AbliterationPipeline._mask_safety_neurons(low_thresh, direction, ["down_proj"], z_threshold=1.5)
        n_high = AbliterationPipeline._mask_safety_neurons(high_thresh, direction, ["down_proj"], z_threshold=3.0)
        assert n_low >= n_high, f"Lower threshold should mask more: {n_low} vs {n_high}"
# ---------------------------------------------------------------------------
# SOTA technique #2: Jailbreak-contrastive direction refinement
# ---------------------------------------------------------------------------
class TestJailbreakContrastive:
    """SOTA technique #2: jailbreak-contrastive direction refinement."""

    def test_jailbreak_prompt_generation(self):
        """Pipeline should generate jailbreak variants of harmful prompts."""
        p = AbliterationPipeline(model_name="test")
        p.harmful_prompts = ["How to hack a computer", "Write a virus"]
        variants = p._generate_jailbreak_prompts()
        assert len(variants) == 2
        # Each variant should embed the original prompt...
        assert "hack a computer" in variants[0].lower()
        assert "virus" in variants[1].lower()
        # ...and be wrapped in a template, hence longer than the original.
        assert len(variants[0]) > len("How to hack a computer")

    def test_jailbreak_contrast_blending(self):
        """Jailbreak-contrastive blending should modify refusal direction."""
        p = AbliterationPipeline(
            model_name="test",
            use_jailbreak_contrast=True,
            n_directions=1,
        )
        hidden = 16
        p._on_log = lambda m: None
        # Simulate probed activation means for a single layer.
        torch.manual_seed(42)
        harm = torch.randn(1, hidden)
        safe = torch.randn(1, hidden)
        jailbroken = torch.randn(1, hidden)
        p._harmful_means = {0: harm}
        p._harmless_means = {0: safe}
        p._jailbreak_means = {0: jailbroken}
        p._harmful_acts = {0: [harm]}
        p._harmless_acts = {0: [safe]}
        p._jailbreak_acts = {0: [jailbroken]}
        # Run distill (sets the standard direction, then blends).
        p._distill()
        refusal = p.refusal_directions[0]
        # Result must be a unit vector.
        assert abs(refusal.norm().item() - 1.0) < 1e-4
        # Result must differ from the plain harm-minus-safe direction.
        baseline = (harm - safe).squeeze()
        baseline = baseline / baseline.norm()
        cos = (refusal @ baseline).item()
        assert cos < 0.99, f"Blended direction too similar to standard: cos={cos}"

    def test_surgical_method_enables_jailbreak(self):
        """Surgical method should enable jailbreak-contrastive by default."""
        assert METHODS["surgical"]["use_jailbreak_contrast"] is True
# ---------------------------------------------------------------------------
# SOTA technique #3: Layer-adaptive projection strength
# ---------------------------------------------------------------------------
class TestLayerAdaptiveStrength:
    """SOTA technique #3: layer-adaptive projection strength."""

    def test_layer_weights_computed(self):
        """Layer-adaptive weights should be proportional to refusal signal."""
        p = AbliterationPipeline(
            model_name="test",
            layer_adaptive_strength=True,
            n_directions=1,
        )
        hidden = 16
        p._on_log = lambda m: None
        # Layer 0 carries a strong refusal signal, layer 1 a weak one.
        torch.manual_seed(42)
        strong = torch.randn(1, hidden) * 10.0
        weak = torch.randn(1, hidden) * 1.0
        baseline = torch.zeros(1, hidden)
        p._harmful_means = {0: strong, 1: weak}
        p._harmless_means = {0: baseline, 1: baseline}
        p._harmful_acts = {0: [strong], 1: [weak]}
        p._harmless_acts = {0: [baseline], 1: [baseline]}
        p._distill()
        # Strong layers must receive non-empty excise weights.
        assert len(p._layer_excise_weights) > 0
        # The strongest layer should be normalized to ~1.0.
        top = max(p._layer_excise_weights.values())
        assert top > 0.9, f"Max weight should be ~1.0, got {top}"

    def test_surgical_method_enables_adaptive(self):
        """Surgical method should enable layer-adaptive by default."""
        assert METHODS["surgical"]["layer_adaptive_strength"] is True
# ---------------------------------------------------------------------------
# SOTA technique #5: Attention head surgery
# ---------------------------------------------------------------------------
class TestAttentionHeadSurgery:
def test_head_selective_projection(self):
"""Selective head projection should only modify targeted head rows."""
hidden = 16
n_heads = 4
head_dim = hidden // n_heads
class FakeAttn(torch.nn.Module):
def __init__(self):
super().__init__()
self.o_proj = torch.nn.Linear(hidden, hidden, bias=False)
attn = FakeAttn()
torch.manual_seed(42)
W_orig = attn.o_proj.weight.data.clone()
d = torch.randn(hidden, 1)
d = d / d.norm()
# Head scores: head 0 is top safety head, head 3 is lowest
head_scores = [(0, 5.0), (1, 3.0), (2, 1.0), (3, 0.5)]
n_modified = AbliterationPipeline._project_head_selective(
attn, d, head_scores, n_heads=n_heads, head_fraction=0.25,
)
assert n_modified >= 1, "Should modify at least 1 head"
W_new = attn.o_proj.weight.data
# Head 0 columns (targeted) should be modified
assert not torch.allclose(
W_new[:, 0:head_dim], W_orig[:, 0:head_dim]
), "Targeted head 0 should be modified"
# Head 3 columns (NOT targeted) should be untouched
assert torch.allclose(
W_new[:, 3*head_dim:4*head_dim],
W_orig[:, 3*head_dim:4*head_dim],
), "Non-targeted head 3 should be untouched"
def test_head_surgery_norm_preserve(self):
"""Head surgery with norm_preserve should maintain per-head norms."""
hidden = 16
n_heads = 4
head_dim = hidden // n_heads
class FakeAttn(torch.nn.Module):
def __init__(self):
super().__init__()
self.o_proj = torch.nn.Linear(hidden, hidden, bias=False)
attn = FakeAttn()
torch.manual_seed(42)
d = torch.randn(hidden, 1)
d = d / d.norm()
orig_norms = [
attn.o_proj.weight.data[:, h*head_dim:(h+1)*head_dim].norm().item()
for h in range(n_heads)
]
head_scores = [(0, 5.0), (1, 3.0), (2, 1.0), (3, 0.5)]
AbliterationPipeline._project_head_selective(
attn, d, head_scores, n_heads=n_heads,
head_fraction=0.5, norm_preserve=True,
)
# Targeted heads should have preserved norms
for h in range(2): # top 50% = 2 heads
new_norm = attn.o_proj.weight.data[:, h*head_dim:(h+1)*head_dim].norm().item()
assert abs(orig_norms[h] - new_norm) < 1e-3, \
f"Head {h} norm not preserved: {orig_norms[h]:.4f} vs {new_norm:.4f}"
def test_head_surgery_non_square_gqa(self):
"""Head surgery should work for GQA models with non-square o_proj (attn_dim != hidden_dim)."""
hidden_dim = 12 # model hidden dimension
attn_dim = 32 # attention dimension (n_heads * head_dim_attn)
n_heads = 4
head_dim_attn = attn_dim // n_heads # 8
class FakeAttnGQA(torch.nn.Module):
def __init__(self):
super().__init__()
# o_proj maps attn_dim -> hidden_dim
# nn.Linear weight shape: (hidden_dim, attn_dim) = (12, 32)
self.o_proj = torch.nn.Linear(attn_dim, hidden_dim, bias=False)
attn = FakeAttnGQA()
torch.manual_seed(42)
attn.o_proj.weight.data = torch.randn(hidden_dim, attn_dim)
W_orig = attn.o_proj.weight.data.clone()
d = torch.randn(hidden_dim, 1)
d = d / d.norm()
head_scores = [(0, 5.0), (1, 3.0), (2, 1.0), (3, 0.5)]
n_modified = AbliterationPipeline._project_head_selective(
attn, d, head_scores, n_heads=n_heads, head_fraction=0.25,
)
assert n_modified >= 1, "Should modify at least 1 head"
W_new = attn.o_proj.weight.data
# Head 0 columns (targeted) should be modified
assert not torch.allclose(
W_new[:, 0:head_dim_attn], W_orig[:, 0:head_dim_attn]
), "Targeted head 0 should be modified"
# Head 3 columns (NOT targeted) should be untouched
assert torch.allclose(
W_new[:, 3*head_dim_attn:4*head_dim_attn],
W_orig[:, 3*head_dim_attn:4*head_dim_attn],
), "Non-targeted head 3 should be untouched"
def test_head_surgery_gqa_norm_preserve(self):
    """Per-head norm preservation must hold even for a rectangular (GQA) o_proj."""
    hidden_dim = 12
    attn_dim = 32
    n_heads = 4
    head_dim_attn = attn_dim // n_heads

    class FakeAttnGQA(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.o_proj = torch.nn.Linear(attn_dim, hidden_dim, bias=False)

    attn = FakeAttnGQA()
    torch.manual_seed(42)
    attn.o_proj.weight.data = torch.randn(hidden_dim, attn_dim)
    d = torch.randn(hidden_dim, 1)
    d = d / d.norm()

    def head_cols(h):
        # Column slice of o_proj belonging to head h
        return attn.o_proj.weight.data[:, h * head_dim_attn:(h + 1) * head_dim_attn]

    orig_norms = [head_cols(h).norm().item() for h in range(n_heads)]
    AbliterationPipeline._project_head_selective(
        attn, d, [(0, 5.0), (1, 3.0), (2, 1.0), (3, 0.5)],
        n_heads=n_heads, head_fraction=0.5, norm_preserve=True,
    )
    # head_fraction=0.5 targets the two highest-scoring heads (0 and 1)
    for h in range(2):
        new_norm = head_cols(h).norm().item()
        assert abs(orig_norms[h] - new_norm) < 1e-3, \
            f"GQA head {h} norm not preserved: {orig_norms[h]:.4f} vs {new_norm:.4f}"
# ---------------------------------------------------------------------------
# SOTA technique #6: SAE feature-level abliteration
# ---------------------------------------------------------------------------
class TestSAEAbliteration:
    """SOTA technique #6: feature-level abliteration via a sparse autoencoder."""

    def test_sae_train_and_reconstruct(self):
        """A trained SAE reconstructs activations and emits sparse latent codes."""
        from obliteratus.analysis.sae_abliteration import train_sae

        dim = 32
        torch.manual_seed(42)
        activations = [torch.randn(dim) for _ in range(64)]
        sae = train_sae(activations, dim, expansion=2, n_epochs=10, lr=1e-3)
        sample = torch.randn(1, dim)
        recon, codes = sae(sample)
        assert recon.shape == sample.shape
        # expansion=2 doubles the latent width
        assert codes.shape == (1, 2 * dim)
        # ReLU latents should be mostly zero
        assert (codes == 0).float().mean() > 0.3, "Features should be sparse"

    def test_refusal_feature_identification(self):
        """Features separating harmful from harmless activations are recoverable."""
        from obliteratus.analysis.sae_abliteration import (
            identify_refusal_features,
            train_sae,
        )

        hidden = 32
        torch.manual_seed(42)
        refusal_dir = torch.randn(hidden)
        refusal_dir = refusal_dir / refusal_dir.norm()
        # Shift the two populations in opposite senses along refusal_dir
        harmful_acts = [torch.randn(hidden) + 2.0 * refusal_dir for _ in range(32)]
        harmless_acts = [torch.randn(hidden) - 2.0 * refusal_dir for _ in range(32)]
        sae = train_sae(
            harmful_acts + harmless_acts, hidden, expansion=2, n_epochs=30, lr=3e-4,
        )
        result = identify_refusal_features(
            sae, harmful_acts, harmless_acts, layer_idx=0, top_k=4,
        )
        assert result.n_refusal_features == 4
        assert result.sae_directions.shape == (4, hidden)
        assert result.variance_explained > 0.0
        # At least one recovered direction should align with the planted one
        best_cos = max(
            abs((result.sae_directions[i] @ refusal_dir).item())
            for i in range(result.sae_directions.shape[0])
        )
        assert best_cos > 0.1, f"SAE should find direction aligned with refusal: best_cos={best_cos}"

    def test_sae_directions_unit_norm(self):
        """Every direction returned by identify_refusal_features is unit length."""
        from obliteratus.analysis.sae_abliteration import (
            identify_refusal_features,
            train_sae,
        )

        hidden = 16
        torch.manual_seed(42)
        offset = torch.ones(hidden)
        harmful = [torch.randn(hidden) + offset for _ in range(16)]
        harmless = [torch.randn(hidden) - offset for _ in range(16)]
        sae = train_sae(harmful + harmless, hidden, expansion=2, n_epochs=10)
        result = identify_refusal_features(sae, harmful, harmless, 0, top_k=3)
        for i in range(result.sae_directions.shape[0]):
            norm = result.sae_directions[i].norm().item()
            assert abs(norm - 1.0) < 1e-3, f"Direction {i} norm={norm}, expected 1.0"
# ---------------------------------------------------------------------------
# Surgical method preset
# ---------------------------------------------------------------------------
class TestSurgicalMethod:
    """The 'surgical' preset bundles all six SOTA techniques."""

    # Flag names shared by the preset config and pipeline attributes.
    _SOTA_FLAGS = (
        "use_jailbreak_contrast",
        "layer_adaptive_strength",
        "safety_neuron_masking",
        "per_expert_directions",
        "attention_head_surgery",
        "use_sae_features",
    )

    def test_surgical_enables_all_sota(self):
        """Surgical method should enable all 6 SOTA techniques."""
        cfg = METHODS["surgical"]
        for flag in self._SOTA_FLAGS:
            assert cfg[flag] is True

    def test_basic_disables_all_sota(self):
        """Basic method should not enable SOTA techniques (no keys or False)."""
        cfg = METHODS["basic"]
        for flag in (
            "use_jailbreak_contrast",
            "layer_adaptive_strength",
            "safety_neuron_masking",
        ):
            assert cfg.get(flag, False) is False

    def test_pipeline_init_surgical(self):
        """Pipeline initialized with surgical method should have all flags set."""
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        for flag in self._SOTA_FLAGS:
            assert getattr(pipeline, flag) is True

    def test_pipeline_init_explicit_override(self):
        """Explicit params should override method defaults."""
        pipeline = AbliterationPipeline(
            model_name="test", method="surgical",
            safety_neuron_masking=False,
        )
        assert pipeline.safety_neuron_masking is False
        # Remaining flags still come from the surgical preset
        assert pipeline.use_jailbreak_contrast is True
# ---------------------------------------------------------------------------
# Inverted method (semantic refusal inversion)
# ---------------------------------------------------------------------------
class TestInvertedMethod:
    """Semantic refusal inversion ('inverted' preset).

    Instead of merely removing the refusal direction, the inverted method
    reflects it: a 2x projection subtraction negates the component along
    the direction while leaving the orthogonal complement intact.
    """

    def test_inverted_preset_config(self):
        """Inverted method preset should enable inversion flag."""
        cfg = METHODS["inverted"]
        assert cfg["invert_refusal"] is True
        assert cfg["n_directions"] == 8
        assert cfg["use_jailbreak_contrast"] is True

    def test_surgical_does_not_invert(self):
        """Surgical method should NOT enable inversion by default."""
        cfg = METHODS["surgical"]
        assert cfg.get("invert_refusal", False) is False

    def test_pipeline_init_inverted(self):
        """Pipeline initialized with inverted method should have flag set."""
        pipeline = AbliterationPipeline(model_name="test", method="inverted")
        assert pipeline.invert_refusal is True
        assert pipeline.use_jailbreak_contrast is True
        assert pipeline.safety_neuron_masking is False  # zeroing + reflection is destructive

    def test_pipeline_invert_explicit_override(self):
        """Explicit invert_refusal param should override method default."""
        pipeline = AbliterationPipeline(
            model_name="test", method="surgical", invert_refusal=True,
        )
        assert pipeline.invert_refusal is True
        pipeline2 = AbliterationPipeline(
            model_name="test", method="inverted", invert_refusal=False,
        )
        assert pipeline2.invert_refusal is False

    def test_reflection_math(self):
        """2x projection (reflection) should negate the refusal component."""
        hidden = 16

        class Wrapper(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.o_proj = torch.nn.Linear(hidden, 32, bias=False)

        module = Wrapper()
        torch.manual_seed(42)
        W_orig = module.o_proj.weight.data.clone()
        d = torch.randn(hidden, 1)
        d = d / d.norm()
        # Original projection onto d
        orig_proj = (W_orig @ d).squeeze()
        # Reflection: regularization=-1.0 → scale=2.0
        AbliterationPipeline._project_out_advanced(
            module, d, ["o_proj"], regularization=-1.0,
        )
        W_reflected = module.o_proj.weight.data
        new_proj = (W_reflected @ d).squeeze()
        # After reflection, projection should be NEGATED (sign flipped)
        assert torch.allclose(new_proj, -orig_proj, atol=1e-4), (
            f"Reflected projection should be negated: expected ~{-orig_proj[:3]} got {new_proj[:3]}"
        )

    def test_reflection_preserves_orthogonal_component(self):
        """Reflection should not change the component perpendicular to d."""
        hidden = 8

        class Wrapper(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.o_proj = torch.nn.Linear(hidden, 16, bias=False)

        module = Wrapper()
        torch.manual_seed(42)
        W_orig = module.o_proj.weight.data.clone()
        d = torch.randn(hidden, 1)
        d = d / d.norm()
        # Compute original orthogonal component
        orig_d_component = (W_orig @ d) @ d.T  # rank-1 matrix: projection onto d
        orig_ortho = W_orig - orig_d_component  # everything except d-component
        AbliterationPipeline._project_out_advanced(
            module, d, ["o_proj"], regularization=-1.0,
        )
        W_reflected = module.o_proj.weight.data
        new_d_component = (W_reflected @ d) @ d.T
        new_ortho = W_reflected - new_d_component
        # Orthogonal component should be unchanged
        assert torch.allclose(orig_ortho, new_ortho, atol=1e-4), (
            "Reflection should preserve orthogonal component"
        )

    def test_moe_expert_safety_classification(self):
        """_identify_safety_experts should classify experts by router affinity."""
        hidden = 16
        n_experts = 4

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, n_experts, bias=False)
                self.experts = torch.nn.ModuleList([
                    torch.nn.Linear(hidden, hidden) for _ in range(n_experts)
                ])

        class FakeLayer(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.self_attn = torch.nn.Module()
                self.self_attn.o_proj = torch.nn.Linear(hidden, hidden, bias=False)
                self.mlp = FakeMoE()

        from obliteratus.models.loader import ModelHandle
        from unittest.mock import MagicMock
        from transformers import GPT2Config
        config = GPT2Config(n_embd=hidden, n_head=2, n_layer=1, vocab_size=100, n_positions=64)
        model = MagicMock()
        model.parameters.return_value = iter([torch.zeros(1)])
        handle = ModelHandle(
            model=model, tokenizer=MagicMock(),
            config=config, model_name="test", task="causal_lm",
        )
        pipeline = AbliterationPipeline(model_name="test", method="inverted")
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        # Set up fake layer and direction
        layer = FakeLayer()
        torch.manual_seed(42)
        # Make router weight so expert 0 has highest affinity for d
        d = torch.randn(hidden)
        d = d / d.norm()
        # Set router weights: expert 0 aligned with d, expert 3 anti-aligned
        layer.mlp.gate.weight.data[0] = d * 5.0
        layer.mlp.gate.weight.data[1] = d * 1.0
        layer.mlp.gate.weight.data[2] = d * -1.0
        layer.mlp.gate.weight.data[3] = d * -5.0
        # Mock get_layer_modules to return our fake layer
        import obliteratus.abliterate as abl_module
        orig_get_layers = abl_module.get_layer_modules
        orig_get_ffn = abl_module.get_ffn_module
        abl_module.get_layer_modules = lambda h: [layer]
        abl_module.get_ffn_module = lambda lay, a: lay.mlp
        try:
            pipeline.refusal_directions = {0: d}
            pipeline._strong_layers = [0]
            pipeline._identify_safety_experts()
        finally:
            # Always restore the monkeypatched module globals
            abl_module.get_layer_modules = orig_get_layers
            abl_module.get_ffn_module = orig_get_ffn
        assert 0 in pipeline._expert_safety_scores
        scores = pipeline._expert_safety_scores[0]
        # Expert 0 should be highest safety affinity
        assert scores[0][0] == 0, f"Expert 0 should be top safety, got {scores[0]}"
        # Expert 3 should be lowest
        assert scores[-1][0] == 3, f"Expert 3 should be lowest, got {scores[-1]}"

    def test_moe_inverted_excision_selective(self):
        """Inverted MoE excision should reflect safety experts and remove from capability."""
        hidden = 16
        n_experts = 4

        class FakeExpert(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, hidden, bias=False)

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, n_experts, bias=False)
                self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)])

        moe = FakeMoE()
        torch.manual_seed(42)
        for p in moe.parameters():
            p.data = torch.randn_like(p.data)
        d = torch.randn(hidden, 1)
        d = d / d.norm()
        # Set up safety scores: experts 0,1 are safety, 2,3 are capability
        pipeline = AbliterationPipeline(model_name="test", method="inverted")
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        pipeline._expert_safety_scores = {
            0: [(0, 5.0), (1, 3.0), (2, -1.0), (3, -3.0)]
        }
        orig_router = moe.gate.weight.data.clone()
        count = pipeline._project_moe_experts_inverted(
            moe, d, 0, norm_preserve=False, project_biases=False,
        )
        assert count > 0, "Should project some weights"
        # Router should be reflected (capped at 1.5x to prevent extreme logits
        # that cause CUDA illegal memory access in batched expert forward).
        # With router_reg = max(reflect_reg, -0.5) → scale = 1.5:
        # new_proj ≈ orig_proj - 1.5 * orig_proj = -0.5 * orig_proj
        # Additionally, _stabilize_router_weights clamps outliers, so we
        # verify the sign is flipped and magnitude is substantial.
        router_proj = (moe.gate.weight.data @ d.squeeze()).squeeze()
        orig_router_proj = (orig_router @ d.squeeze()).squeeze()
        cosine = torch.nn.functional.cosine_similarity(
            router_proj.unsqueeze(0), -orig_router_proj.unsqueeze(0),
        )
        assert cosine > 0.5, (
            f"Router projection should be at least partially reflected, cosine={cosine.item():.3f}"
        )
        # Safety expert 0: should be reflected (projection negated)
        e0_proj = (moe.experts[0].down_proj.weight.data @ d).norm()
        # After reflection the projection doesn't go to zero — it negates
        assert e0_proj > 1e-4, "Safety expert should have non-zero projection (reflected, not removed)"
        # Capability expert 3: should have projection removed (near zero)
        e3_proj = (moe.experts[3].down_proj.weight.data @ d).norm().item()
        assert e3_proj < 1e-3, f"Capability expert should have projection removed, got {e3_proj}"
# ---------------------------------------------------------------------------
# Nuclear method
# ---------------------------------------------------------------------------
class TestNuclearMethod:
    """The 'nuclear' preset: inverted baseline plus permanent weight techniques
    (embedding projection, activation steering hooks, expert transplant)."""

    def test_nuclear_preset_config(self):
        """Nuclear method should match inverted baseline + permanent weight techniques."""
        cfg = METHODS["nuclear"]
        assert cfg["invert_refusal"] is True
        assert cfg["n_directions"] == 4  # fewer than inverted to avoid over-ablation
        assert cfg["refinement_passes"] == 2  # same as inverted
        assert cfg["reflection_strength"] == 1.25  # tempered for CoT coherence
        assert cfg["project_embeddings"] is True
        assert cfg["embed_regularization"] == 0.50  # conservative cascade limit
        assert cfg["activation_steering"] is True  # residual cleanup hooks
        assert cfg["steering_strength"] == 0.15  # light residual correction
        assert cfg["expert_transplant"] is True
        assert cfg["transplant_blend"] == 0.10  # gentle nudge, not overwrite
        assert cfg["use_jailbreak_contrast"] is True
        assert cfg["attention_head_surgery"] is True
        assert cfg["layer_adaptive_strength"] is True  # per-layer scaling

    def test_nuclear_pipeline_init(self):
        """Pipeline initialized with nuclear method should have all flags set."""
        pipeline = AbliterationPipeline(model_name="test", method="nuclear")
        assert pipeline.invert_refusal is True
        assert pipeline.reflection_strength == 1.25
        assert pipeline.embed_regularization == 0.50
        assert pipeline.transplant_blend == 0.10
        assert pipeline.project_embeddings is True
        assert pipeline.activation_steering is True  # residual cleanup
        assert pipeline.expert_transplant is True
        assert pipeline.n_directions == 4
        assert pipeline.refinement_passes == 2
        assert pipeline.layer_adaptive_strength is True

    def test_reflection_strength_configurable(self):
        """reflection_strength should be explicitly overridable."""
        pipeline = AbliterationPipeline(
            model_name="test", method="inverted", reflection_strength=3.0,
        )
        assert pipeline.reflection_strength == 3.0

    def test_inverted_default_strength_is_2(self):
        """Inverted method should default to reflection_strength=2.0."""
        pipeline = AbliterationPipeline(model_name="test", method="inverted")
        assert pipeline.reflection_strength == 2.0

    def test_boosted_reflection_math(self):
        """2.5x reflection should produce stronger negation than 2x."""
        hidden = 16

        class Wrapper(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.o_proj = torch.nn.Linear(hidden, 32, bias=False)

        d = torch.randn(hidden, 1)
        d = d / d.norm()
        # 2x reflection
        module_2x = Wrapper()
        torch.manual_seed(42)
        module_2x.o_proj.weight.data = torch.randn(32, hidden)
        orig = module_2x.o_proj.weight.data.clone()
        AbliterationPipeline._project_out_advanced(
            module_2x, d, ["o_proj"], regularization=-1.0,  # scale=2.0
        )
        proj_2x = (module_2x.o_proj.weight.data @ d).squeeze()
        # 2.5x reflection (same starting weights for a fair comparison)
        module_25x = Wrapper()
        module_25x.o_proj.weight.data = orig.clone()
        AbliterationPipeline._project_out_advanced(
            module_25x, d, ["o_proj"], regularization=-1.5,  # scale=2.5
        )
        proj_25x = (module_25x.o_proj.weight.data @ d).squeeze()
        # 2.5x should be 25% stronger negation than 2x
        assert proj_25x.norm() > proj_2x.norm(), (
            "2.5x reflection should produce stronger (more negative) projection than 2x"
        )

    def test_activation_steering_hook(self):
        """Steering hooks should subtract refusal direction from hidden states."""
        hidden = 8

        class FakeLayer(torch.nn.Module):
            def forward(self, x):
                return x

        layer = FakeLayer()
        layers = torch.nn.ModuleList([layer])
        # Explicitly enable steering (nuclear preset has it off by default)
        pipeline = AbliterationPipeline(
            model_name="test", method="inverted", activation_steering=True,
            steering_strength=0.5,
        )
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        d = torch.randn(hidden)
        d = d / d.norm()
        pipeline.refusal_directions = {0: d}
        pipeline._strong_layers = [0]
        n_hooks = pipeline._install_activation_steering(layers)
        assert n_hooks == 1
        assert len(pipeline._steering_hooks) == 1
        # Create a hidden state with strong refusal component
        batch = torch.randn(1, 4, hidden)
        refusal_component = 5.0 * d.unsqueeze(0).unsqueeze(0).expand_as(batch)
        input_hidden = batch + refusal_component
        # Run through the layer (hook should fire)
        output = layer(input_hidden)
        # The refusal component should be reduced
        proj_before = torch.einsum("bsh,h->bs", input_hidden, d).abs().mean()
        proj_after = torch.einsum("bsh,h->bs", output, d).abs().mean()
        assert proj_after < proj_before, (
            f"Steering should reduce refusal projection: before={proj_before:.3f}, after={proj_after:.3f}"
        )
        # Cleanup
        for hook in pipeline._steering_hooks:
            hook.remove()

    def test_expert_transplant(self):
        """Expert transplant should overwrite safety expert weights with capability average."""
        hidden = 16
        n_experts = 4

        class FakeExpert(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, hidden, bias=False)

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, n_experts, bias=False)
                self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)])

        class FakeLayer(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.self_attn = torch.nn.Module()
                self.self_attn.o_proj = torch.nn.Linear(hidden, hidden, bias=False)
                self.mlp = FakeMoE()

        layer = FakeLayer()
        layers = torch.nn.ModuleList([layer])
        torch.manual_seed(42)
        for p in layer.parameters():
            p.data = torch.randn_like(p.data)
        # Save original safety expert weight
        orig_safety0 = layer.mlp.experts[0].down_proj.weight.data.clone()
        # Save capability expert weights for computing expected mean
        # With top-third classification (n_experts // 3 = 1), only expert 0
        # is safety; experts 1, 2, 3 are all capability.
        cap1 = layer.mlp.experts[1].down_proj.weight.data.clone()
        cap2 = layer.mlp.experts[2].down_proj.weight.data.clone()
        cap3 = layer.mlp.experts[3].down_proj.weight.data.clone()
        expected_mean = (cap1 + cap2 + cap3) / 3.0
        import obliteratus.abliterate as abl_module
        from obliteratus.models.loader import ModelHandle
        from transformers import GPT2Config
        config = GPT2Config(n_embd=hidden, n_head=2, n_layer=1, vocab_size=100, n_positions=64)
        model = MagicMock()
        model.parameters.return_value = iter([torch.zeros(1)])
        handle = ModelHandle(model=model, tokenizer=MagicMock(), config=config, model_name="test", task="causal_lm")
        pipeline = AbliterationPipeline(model_name="test", method="nuclear")
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        pipeline._strong_layers = [0]
        # Experts 0,1 are safety (high affinity), 2,3 are capability
        pipeline._expert_safety_scores = {
            0: [(0, 5.0), (1, 3.0), (2, -1.0), (3, -3.0)]
        }
        orig_get_ffn = abl_module.get_ffn_module
        abl_module.get_ffn_module = lambda lay, a: lay.mlp
        try:
            count = pipeline._transplant_expert_weights(layers)
        finally:
            # Restore the monkeypatched module global
            abl_module.get_ffn_module = orig_get_ffn
        assert count >= 1, f"Should blend at least 1 weight (top-third safety expert), got {count}"
        # Safety expert 0 should be a 10% blend toward capability mean
        # (nuclear default transplant_blend=0.10)
        # new = 0.90 * original + 0.10 * capability_mean
        blend = pipeline.transplant_blend  # 0.10
        expected_blend = (1.0 - blend) * orig_safety0 + blend * expected_mean
        transplanted = layer.mlp.experts[0].down_proj.weight.data
        assert torch.allclose(transplanted, expected_blend, atol=1e-4), (
            f"Safety expert weight should be {blend:.0%} blended toward capability mean"
        )
        # Capability expert 2 should be unchanged
        assert torch.allclose(layer.mlp.experts[2].down_proj.weight.data, cap2, atol=1e-6), (
            "Capability expert should be unchanged"
        )

    def test_gather_state_dict_raises_on_missing_offload(self):
        """Should raise RuntimeError (not silently corrupt) when offload dir is missing."""
        from obliteratus.models.loader import ModelHandle
        from transformers import GPT2Config
        config = GPT2Config(n_embd=8, n_head=2, n_layer=1, vocab_size=100, n_positions=64)
        # Create a fake model whose state_dict returns a meta tensor
        fake_model = MagicMock()
        meta_tensor = torch.empty(4, 8, device="meta")
        fake_model.state_dict.return_value = {"layer.weight": meta_tensor}
        handle = ModelHandle(
            model=fake_model, tokenizer=MagicMock(), config=config,
            model_name="test", task="causal_lm",
        )
        handle._offload_dir = "/nonexistent/path"
        pipeline = AbliterationPipeline(model_name="test", method="nuclear")
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        with pytest.raises(RuntimeError, match="bricked checkpoint"):
            pipeline._gather_state_dict()
# ---------------------------------------------------------------------------
# Knee detection
# ---------------------------------------------------------------------------
class TestKneeDetection:
    """Knee-point selection of strong layers from sorted (layer, norm) pairs."""

    def test_empty_input(self):
        """No candidates in, no layers out."""
        assert AbliterationPipeline._select_layers_knee([]) == []

    def test_two_layers(self):
        """With only two candidates, both are kept."""
        selected = AbliterationPipeline._select_layers_knee([(0, 5.0), (1, 3.0)])
        assert set(selected) == {0, 1}

    def test_clear_knee(self):
        """Layers with a sharp dropoff should be separated by knee detection."""
        candidates = [
            (14, 10.0), (15, 9.5), (13, 9.0),  # strong cluster
            (16, 2.0), (12, 1.5), (17, 1.0), (11, 0.5), (18, 0.2), (10, 0.1),
        ]
        selected = AbliterationPipeline._select_layers_knee(candidates)
        # The strong cluster survives; the weak tail must not all be swept in
        for strong in (14, 15, 13):
            assert strong in selected
        assert len(selected) <= 5

    def test_minimum_threshold_filters_noise(self):
        """Layers below 10% of max should be filtered out."""
        # 0.5 is only 5% of 10.0, i.e. below the 10% floor
        selected = AbliterationPipeline._select_layers_knee([(0, 10.0), (1, 0.5)])
        assert 0 in selected

    def test_all_equal_norms(self):
        """When all norms are equal, should select all (or most)."""
        selected = AbliterationPipeline._select_layers_knee([(i, 5.0) for i in range(5)])
        assert len(selected) >= 1
# ---------------------------------------------------------------------------
# Activation collection
# ---------------------------------------------------------------------------
class TestActivationCollection:
    """Hidden-state capture across transformer layers."""

    def test_collect_activations(self, handle):
        """Collected activations: one entry per layer, one CPU tensor per prompt."""
        from obliteratus.strategies.utils import get_layer_modules

        pipeline = AbliterationPipeline(model_name="test")
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        handle.tokenizer.return_value = {
            "input_ids": torch.randint(0, 1000, (1, 5)),
            "attention_mask": torch.ones(1, 5, dtype=torch.long),
        }
        layers = get_layer_modules(handle)
        prompts = ["Hello world", "Test prompt"]
        activations = pipeline._collect_activations(layers, prompts, "test")

        assert len(activations) == len(layers)
        for idx in range(len(layers)):
            per_prompt = activations[idx]
            assert len(per_prompt) == len(prompts)
            for act in per_prompt:
                # Activations must be moved off-device and keep hidden width
                assert act.device == torch.device("cpu")
                assert act.shape[-1] == handle.hidden_size
# ---------------------------------------------------------------------------
# Distill: single direction (basic method)
# ---------------------------------------------------------------------------
class TestDistillBasic:
    """Difference-in-means distillation (single direction, 'basic' preset)."""

    def test_single_direction(self, handle):
        """Basic method: single refusal direction via difference-in-means."""
        from obliteratus.strategies.utils import get_layer_modules

        pipeline = AbliterationPipeline(
            model_name="test",
            method="basic",
            harmful_prompts=["bad prompt"],
            harmless_prompts=["good prompt"],
        )
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        _make_varied_tokenizer(handle)
        pipeline._probe()
        pipeline._distill()

        assert len(pipeline.refusal_directions) == len(get_layer_modules(handle))
        for idx, direction in pipeline.refusal_directions.items():
            # Each layer's direction is unit-norm
            assert abs(direction.norm().item() - 1.0) < 1e-4
            # Single direction: subspace should be (1, hidden_dim)
            assert pipeline.refusal_subspaces[idx].shape[0] == 1
# ---------------------------------------------------------------------------
# Distill: multi-direction SVD (advanced/aggressive method)
# ---------------------------------------------------------------------------
class TestDistillSVD:
    """Multi-direction distillation via SVD ('advanced' preset)."""

    def test_multi_direction_svd(self, handle):
        """Advanced method: SVD extracts multiple refusal directions.

        Note: on small models (hidden_size < 2048 or < 2B params), n_directions
        is automatically capped to 2 to prevent over-ablation. The test model
        (hidden_size=64, 4 layers) triggers this safeguard.
        """
        from obliteratus.strategies.utils import get_layer_modules

        pipeline = AbliterationPipeline(
            model_name="test",
            method="advanced",
            harmful_prompts=["bad1", "bad2", "bad3", "bad4", "bad5"],
            harmless_prompts=["good1", "good2", "good3", "good4", "good5"],
        )
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        _make_varied_tokenizer(handle)
        pipeline._probe()
        pipeline._distill()

        assert len(pipeline.refusal_subspaces) == len(get_layer_modules(handle))
        # Small-model cap applies: n_directions limited to 2 for this tiny model
        expected_dirs = min(2, pipeline.n_directions, 5, handle.hidden_size)
        for subspace in pipeline.refusal_subspaces.values():
            assert subspace.shape[0] == expected_dirs
            assert subspace.shape[1] == handle.hidden_size
        for direction in pipeline.refusal_directions.values():
            # Primary direction stays a unit vector
            assert abs(direction.norm().item() - 1.0) < 1e-4
# ---------------------------------------------------------------------------
# Full pipeline: excise with different methods
# ---------------------------------------------------------------------------
class TestExcise:
    """Weight excision under different method presets."""

    def test_excise_basic(self, handle):
        """Basic method should modify weights."""
        from obliteratus.strategies.utils import get_layer_modules

        pipeline = AbliterationPipeline(
            model_name="test",
            method="basic",
            harmful_prompts=["bad prompt"],
            harmless_prompts=["good prompt"],
        )
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        _make_varied_tokenizer(handle)

        layers = get_layer_modules(handle)
        # Snapshot every layer parameter before the pipeline runs
        snapshot = {
            (idx, name): param.data.clone()
            for idx, block in enumerate(layers)
            for name, param in block.named_parameters()
        }
        pipeline._probe()
        pipeline._distill()
        pipeline._excise()

        any_changed = any(
            not torch.allclose(snapshot[(idx, name)], param.data, atol=1e-6)
            for idx, block in enumerate(layers)
            for name, param in block.named_parameters()
        )
        assert any_changed, "Excise should modify at least some weights"

    def test_excise_advanced_norm_preserving(self, handle):
        """Advanced method with norm preservation should maintain weight norms."""
        from obliteratus.strategies.utils import get_layer_modules

        pipeline = AbliterationPipeline(
            model_name="test",
            method="advanced",
            harmful_prompts=["bad prompt"],
            harmless_prompts=["good prompt"],
        )
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        _make_varied_tokenizer(handle)
        get_layer_modules(handle)
        pipeline._probe()
        pipeline._distill()
        pipeline._excise()
        # Advanced path uses _project_out_advanced; at least one strong layer expected
        assert len(pipeline._strong_layers) > 0
# ---------------------------------------------------------------------------
# Rebirth (save)
# ---------------------------------------------------------------------------
class TestRebirth:
    """Model save ('rebirth') stage."""

    def test_rebirth_saves_metadata(self, handle, tmp_path):
        """Rebirth should save model and comprehensive metadata JSON."""
        pipeline = AbliterationPipeline(
            model_name="test-model",
            output_dir=str(tmp_path / "output"),
            method="advanced",
        )
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        pipeline._strong_layers = [0]
        pipeline._quality_metrics = {"perplexity": 8.5, "coherence": 1.0}
        # Stub out the expensive HF serialization calls
        handle.model.save_pretrained = MagicMock()
        handle.tokenizer.save_pretrained = MagicMock()

        result_path = pipeline._rebirth()
        assert result_path == tmp_path / "output"
        metadata_file = result_path / "abliteration_metadata.json"
        assert metadata_file.exists()
        metadata = json.loads(metadata_file.read_text())

        # Top-level provenance fields
        for key, value in {
            "source_model": "test-model",
            "technique": "refusal_direction_ablation",
            "method": "advanced",
            "strong_layers": [0],
        }.items():
            assert metadata[key] == value
        # Method configuration is embedded verbatim
        assert "method_config" in metadata
        assert metadata["method_config"]["n_directions"] == METHODS["advanced"]["n_directions"]
        assert metadata["method_config"]["norm_preserve"] is True
        # Citations and quality metrics travel with the checkpoint
        assert "references" in metadata
        assert len(metadata["references"]) >= 3
        assert "quality_metrics" in metadata
        assert metadata["quality_metrics"]["perplexity"] == 8.5
# ---------------------------------------------------------------------------
# CLI integration
# ---------------------------------------------------------------------------
class TestCLI:
    """Argument parsing for the abliterate subcommand."""

    def test_abliterate_parser_with_method(self):
        """Test that the abliterate subcommand parses method correctly."""
        import argparse

        parser = argparse.ArgumentParser()
        sub = parser.add_subparsers(dest="command").add_parser("abliterate")
        sub.add_argument("model", type=str)
        sub.add_argument("--output-dir", type=str, default=None)
        sub.add_argument("--device", type=str, default="auto")
        sub.add_argument("--dtype", type=str, default="float16")
        sub.add_argument("--method", type=str, default="advanced",
                         choices=["basic", "advanced", "aggressive"])
        sub.add_argument("--n-directions", type=int, default=None)
        sub.add_argument("--regularization", type=float, default=None)
        sub.add_argument("--refinement-passes", type=int, default=None)

        args = parser.parse_args(
            ["abliterate", "gpt2", "--method", "aggressive", "--n-directions", "6"]
        )
        assert (args.command, args.model) == ("abliterate", "gpt2")
        assert args.method == "aggressive"
        assert args.n_directions == 6
        # Unspecified options keep their declared defaults
        assert args.dtype == "float16"

    def test_default_method(self):
        """Default method should be advanced."""
        import argparse

        parser = argparse.ArgumentParser()
        sub = parser.add_subparsers(dest="command").add_parser("abliterate")
        sub.add_argument("model", type=str)
        sub.add_argument("--method", type=str, default="advanced")
        assert parser.parse_args(["abliterate", "gpt2"]).method == "advanced"
# ---------------------------------------------------------------------------
# Expert-Granular Abliteration (EGA)
# ---------------------------------------------------------------------------
class TestFindRouterModule:
    """Test _find_router_module static method."""

    @staticmethod
    def _moe_with_router(attr_name, hidden=16):
        """Build a fake MoE module whose router Linear lives under *attr_name*."""
        moe = torch.nn.Module()
        setattr(moe, attr_name, torch.nn.Linear(hidden, 4, bias=False))
        moe.experts = torch.nn.ModuleList()
        return moe

    def test_finds_gate(self):
        """Should find a router named 'gate'."""
        moe = self._moe_with_router("gate")
        assert AbliterationPipeline._find_router_module(moe) is moe.gate

    def test_finds_router(self):
        """Should find a router named 'router'."""
        moe = self._moe_with_router("router")
        assert AbliterationPipeline._find_router_module(moe) is moe.router

    def test_auto_detects_unknown_router(self):
        """Should auto-detect a router with unusual name via heuristic."""
        moe = self._moe_with_router("moe_gate_proj")
        assert AbliterationPipeline._find_router_module(moe) is moe.moe_gate_proj

    def test_returns_none_no_router(self):
        """Should return None when no router is found."""
        mod = torch.nn.Module()
        mod.linear = torch.nn.Linear(16, 16)
        assert AbliterationPipeline._find_router_module(mod) is None
class TestRouterProfilingHooks:
    """Test _install_router_profiling_hooks."""

    def _make_moe_pipeline_and_layers(self, hidden=16, n_experts=4):
        """Create a pipeline with a fake MoE model for router profiling tests.

        Returns ``(pipeline, layers, layer, abl_module, orig_get_ffn)``.  The
        caller MUST restore ``abl_module.get_ffn_module = orig_get_ffn`` in a
        ``finally`` block, since this helper monkey-patches it globally.
        """
        from obliteratus.models.loader import ModelHandle
        from transformers import GPT2Config

        class FakeExpert(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, hidden, bias=False)

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, n_experts, bias=False)
                self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)])

            def forward(self, x):
                return x

        class FakeLayer(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.self_attn = torch.nn.Module()
                self.self_attn.o_proj = torch.nn.Linear(hidden, hidden, bias=False)
                self.mlp = FakeMoE()

            def forward(self, x):
                return (x,)

        config = GPT2Config(n_embd=hidden, n_head=2, n_layer=1, vocab_size=100, n_positions=64)
        model = MagicMock()
        model.parameters.return_value = iter([torch.zeros(1)])
        handle = ModelHandle(model=model, tokenizer=MagicMock(), config=config, model_name="test", task="causal_lm")
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        layer = FakeLayer()
        layers = torch.nn.ModuleList([layer])
        # Monkey-patch get_ffn_module so the pipeline discovers our fake MoE.
        import obliteratus.abliterate as abl_module
        orig_get_ffn = abl_module.get_ffn_module
        abl_module.get_ffn_module = lambda lay, a: lay.mlp
        return pipeline, layers, layer, abl_module, orig_get_ffn

    def test_hooks_installed(self):
        """Should install hooks on MoE router modules."""
        pipeline, layers, layer, abl_module, orig_get_ffn = self._make_moe_pipeline_and_layers()
        # Pre-bind so the finally block never raises NameError if the install
        # call itself fails (previously that masked the real exception).
        hooks = []
        try:
            hooks = pipeline._install_router_profiling_hooks(layers)
            assert len(hooks) == 1
            assert 0 in pipeline._routing_harmful
            assert 0 in pipeline._routing_harmless
        finally:
            for h in hooks:
                h.remove()
            abl_module.get_ffn_module = orig_get_ffn

    def test_hooks_record_logits(self):
        """Hooks should record router logits during forward passes."""
        pipeline, layers, layer, abl_module, orig_get_ffn = self._make_moe_pipeline_and_layers()
        # Pre-bind so the finally block never raises NameError if the install
        # call itself fails (previously that masked the real exception).
        hooks = []
        try:
            hooks = pipeline._install_router_profiling_hooks(layers)
            # Simulate harmful forward pass
            pipeline._routing_is_harmful = True
            x = torch.randn(1, 5, 16)
            layer.mlp.gate(x)  # triggers hook
            assert len(pipeline._routing_harmful[0]) == 1
            assert pipeline._routing_harmful[0][0].shape[0] == 4  # n_experts
            # Simulate harmless forward pass
            pipeline._routing_is_harmful = False
            layer.mlp.gate(x)
            assert len(pipeline._routing_harmless[0]) == 1
        finally:
            for h in hooks:
                h.remove()
            abl_module.get_ffn_module = orig_get_ffn

    def test_no_handle_returns_empty(self):
        """Should return empty list when handle is None."""
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline.handle = None
        hooks = pipeline._install_router_profiling_hooks(torch.nn.ModuleList())
        assert hooks == []
class TestComputeExpertGranularDirections:
    """Test _compute_expert_granular_directions."""

    def test_computes_per_expert_directions(self):
        """Should compute per-expert refusal directions from routing data."""
        hidden, n_experts = 16, 4
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        pipeline._strong_layers = [0]
        torch.manual_seed(42)
        # Router logits biased so expert 0 is favored on harmful prompts and
        # expert 3 on harmless ones (draw order kept interleaved on purpose).
        harmful_logits, harmless_logits = [], []
        for _ in range(10):
            hl = torch.randn(n_experts)
            hl[0] += 2.0
            harmful_logits.append(hl)
            sl = torch.randn(n_experts)
            sl[3] += 2.0
            harmless_logits.append(sl)
        pipeline._routing_harmful = {0: harmful_logits}
        pipeline._routing_harmless = {0: harmless_logits}
        # Per-prompt activations separated along a synthetic refusal direction.
        direction = torch.randn(hidden)
        direction = direction / direction.norm()
        pipeline._harmful_acts = {0: [torch.randn(hidden) + 1.5 * direction for _ in range(10)]}
        pipeline._harmless_acts = {0: [torch.randn(hidden) - 1.5 * direction for _ in range(10)]}
        pipeline._compute_expert_granular_directions()
        # Layer 0 must have expert directions and dynamic safety scores.
        assert 0 in pipeline._expert_directions
        assert len(pipeline._expert_directions[0]) > 0
        assert 0 in pipeline._expert_safety_scores
        scores = pipeline._expert_safety_scores[0]
        assert len(scores) == n_experts
        score_of = dict(scores)
        # Expert 0 was more activated on harmful prompts, so it should score higher.
        assert score_of[0] > score_of[3], (
            f"Expert 0 should have higher safety score: {score_of[0]} vs {score_of[3]}"
        )

    def test_directions_are_unit_vectors(self):
        """Per-expert directions should be unit normalized."""
        hidden, n_experts = 16, 4
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline._on_log = lambda m: None
        pipeline._strong_layers = [0]
        torch.manual_seed(42)
        pipeline._routing_harmful = {0: [torch.randn(n_experts) for _ in range(10)]}
        pipeline._routing_harmless = {0: [torch.randn(n_experts) for _ in range(10)]}
        pipeline._harmful_acts = {0: [torch.randn(hidden) + torch.ones(hidden) for _ in range(10)]}
        pipeline._harmless_acts = {0: [torch.randn(hidden) - torch.ones(hidden) for _ in range(10)]}
        pipeline._compute_expert_granular_directions()
        for ei, d in pipeline._expert_directions.get(0, {}).items():
            assert abs(d.norm().item() - 1.0) < 1e-4, (
                f"Expert {ei} direction norm={d.norm().item()}, expected 1.0"
            )

    def test_skips_when_no_routing_data(self):
        """Should skip gracefully when no routing data is available."""
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline._on_log = lambda m: None
        pipeline._routing_harmful = {}
        pipeline._routing_harmless = {}
        pipeline._compute_expert_granular_directions()
        assert not pipeline._expert_directions

    def test_skips_expert_with_low_routing_weight(self):
        """Experts with insufficient routing weight should not get directions."""
        hidden = 16
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline._on_log = lambda m: None
        pipeline._strong_layers = [0]
        # Expert 3 gets a huge negative logit, so it is effectively never routed.
        never_routed = torch.tensor([5.0, 5.0, 5.0, -100.0])
        pipeline._routing_harmful = {0: [never_routed.clone() for _ in range(3)]}
        pipeline._routing_harmless = {0: [never_routed.clone() for _ in range(3)]}
        torch.manual_seed(42)
        pipeline._harmful_acts = {0: [torch.randn(hidden) for _ in range(3)]}
        pipeline._harmless_acts = {0: [torch.randn(hidden) for _ in range(3)]}
        pipeline._compute_expert_granular_directions()
        if 0 in pipeline._expert_directions:
            assert 3 not in pipeline._expert_directions[0], (
                "Expert with near-zero routing weight should not get a direction"
            )
class TestProjectMoEExpertsGranular:
    """Test _project_moe_experts_granular (ModuleList path)."""

    def _make_direction(self, hidden_dim=16):
        """Return a random unit column vector of shape (hidden_dim, 1)."""
        d = torch.randn(hidden_dim, 1)
        return d / d.norm()

    @staticmethod
    def _unit_vector(dim):
        """Return a random unit vector of shape (dim,)."""
        v = torch.randn(dim)
        return v / v.norm()

    def test_per_expert_directions_applied(self):
        """Each expert should use its own direction when available."""
        hidden = 16
        n_experts = 4

        class FakeExpert(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.up_proj = torch.nn.Linear(hidden, 32, bias=False)

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, n_experts, bias=False)
                self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)])

        moe = FakeMoE()
        torch.manual_seed(42)
        for p in moe.parameters():
            p.data = torch.randn_like(p.data)
        shared_dir = self._make_direction(hidden)
        # Create distinct per-expert unit directions.
        expert_dirs = {ei: self._unit_vector(hidden) for ei in range(n_experts)}
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline._on_log = lambda m: None
        pipeline._expert_directions = {0: expert_dirs}
        # Save originals so we can verify every expert was touched.
        orig_weights = {
            ei: moe.experts[ei].down_proj.weight.data.clone()
            for ei in range(n_experts)
        }
        count = pipeline._project_moe_experts_granular(
            moe, shared_dir, layer_idx=0,
        )
        assert count > 0, "Should project some weights"
        # All experts should be modified
        for ei in range(n_experts):
            assert not torch.allclose(
                moe.experts[ei].down_proj.weight.data, orig_weights[ei]
            ), f"Expert {ei} should be modified"

    def test_falls_back_to_shared_direction(self):
        """Experts without per-expert direction should use shared direction."""
        hidden = 16
        n_experts = 4

        class FakeExpert(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.up_proj = torch.nn.Linear(hidden, 32, bias=False)

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, n_experts, bias=False)
                self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)])

        moe = FakeMoE()
        torch.manual_seed(42)
        for p in moe.parameters():
            p.data = torch.randn_like(p.data)
        shared_dir = self._make_direction(hidden)
        # Only expert 0 has a per-expert direction.  (Fix: the previous code
        # divided a random vector by the norm of an *unrelated* random draw
        # before renormalizing; build the unit vector directly instead.)
        expert_dirs = {0: self._unit_vector(hidden)}
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline._on_log = lambda m: None
        pipeline._expert_directions = {0: expert_dirs}
        orig_e1 = moe.experts[1].down_proj.weight.data.clone()
        pipeline._project_moe_experts_granular(
            moe, shared_dir, layer_idx=0,
        )
        # Experts 1,2,3 should be modified (using shared direction)
        assert not torch.allclose(moe.experts[1].down_proj.weight.data, orig_e1), \
            "Expert 1 should use shared direction fallback"

    def test_router_uses_shared_direction(self):
        """Router should always use the shared direction, not per-expert."""
        hidden = 16
        n_experts = 4

        class FakeExpert(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, 32, bias=False)

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, n_experts, bias=False)
                self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)])

        moe = FakeMoE()
        shared_dir = self._make_direction(hidden)
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline._on_log = lambda m: None
        # Deliberately non-unit per-expert direction: it must not reach the router.
        pipeline._expert_directions = {0: {0: torch.randn(hidden)}}
        orig_gate = moe.gate.weight.data.clone()
        pipeline._project_moe_experts_granular(moe, shared_dir, layer_idx=0)
        # Gate should be projected
        assert not torch.allclose(moe.gate.weight.data, orig_gate), \
            "Router should be projected with shared direction"
        # Gate's projection onto shared direction should be near zero
        proj = (moe.gate.weight.data @ shared_dir).norm().item()
        assert proj < 1e-4, f"Router should have shared dir removed, proj={proj}"

    def test_shared_expert_uses_shared_direction(self):
        """Shared expert should always use the shared direction."""
        hidden = 16

        class FakeExpert(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.up_proj = torch.nn.Linear(hidden, 32, bias=False)

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, 2, bias=False)
                self.shared_expert = torch.nn.Module()
                self.shared_expert.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.shared_expert.up_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(2)])

        moe = FakeMoE()
        shared_dir = self._make_direction(hidden)
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline._on_log = lambda m: None
        pipeline._expert_directions = {0: {0: torch.randn(hidden)}}
        orig_shared = moe.shared_expert.down_proj.weight.data.clone()
        pipeline._project_moe_experts_granular(moe, shared_dir, layer_idx=0)
        assert not torch.allclose(moe.shared_expert.down_proj.weight.data, orig_shared), \
            "Shared expert should be projected"
class TestProjectFused3DGranular:
    """Test _project_fused_3d_granular for fused 3D expert tensors."""

    def test_per_expert_directions_on_fused(self):
        """Each expert slice should use its own direction."""
        hidden = 16
        intermediate = 32
        n_experts = 4

        class FusedExperts(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Parameter(torch.randn(n_experts, intermediate, hidden))

        container = FusedExperts()
        torch.manual_seed(42)
        shared_dir = torch.randn(hidden, 1)
        shared_dir = shared_dir / shared_dir.norm()
        # Per-expert unit directions
        expert_dirs = {}
        for ei in range(n_experts):
            d = torch.randn(hidden)
            d = d / d.norm()
            expert_dirs[ei] = d
        orig_data = container.down_proj.data.clone()
        count = AbliterationPipeline._project_fused_3d_granular(
            container, shared_dir, expert_dirs, ["down_proj"],
            norm_preserve=False, scale=1.0,
        )
        assert count == n_experts, f"Should project {n_experts} experts, got {count}"
        # Each expert should be modified
        for ei in range(n_experts):
            assert not torch.allclose(
                container.down_proj.data[ei], orig_data[ei]
            ), f"Expert {ei} should be modified"

    def test_fallback_to_shared_on_fused(self):
        """Experts without per-expert direction should use shared direction."""
        hidden = 16
        intermediate = 32
        n_experts = 4

        class FusedExperts(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Parameter(torch.randn(n_experts, intermediate, hidden))

        container = FusedExperts()
        torch.manual_seed(42)
        shared_dir = torch.randn(hidden, 1)
        shared_dir = shared_dir / shared_dir.norm()
        # Only expert 0 has a direction.  (Fix: the old construction used a
        # pointless in-place ``div_(1.0)`` before renormalizing on the next
        # line; build the unit vector directly.)
        d = torch.randn(hidden)
        expert_dirs = {0: d / d.norm()}
        orig_data = container.down_proj.data.clone()
        count = AbliterationPipeline._project_fused_3d_granular(
            container, shared_dir, expert_dirs, ["down_proj"],
            norm_preserve=False, scale=1.0,
        )
        assert count == n_experts
        # All experts should be modified (experts 1-3 use shared dir)
        for ei in range(n_experts):
            assert not torch.allclose(
                container.down_proj.data[ei], orig_data[ei]
            ), f"Expert {ei} should be modified"

    def test_norm_preserve_on_fused(self):
        """Fused 3D with norm_preserve should maintain per-expert norms."""
        hidden = 16
        intermediate = 32
        n_experts = 4

        class FusedExperts(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Parameter(torch.randn(n_experts, intermediate, hidden))

        container = FusedExperts()
        torch.manual_seed(42)
        shared_dir = torch.randn(hidden, 1)
        shared_dir = shared_dir / shared_dir.norm()
        expert_dirs = {}
        for ei in range(n_experts):
            d = torch.randn(hidden)
            expert_dirs[ei] = d / d.norm()
        orig_norms = [container.down_proj.data[i].norm().item() for i in range(n_experts)]
        AbliterationPipeline._project_fused_3d_granular(
            container, shared_dir, expert_dirs, ["down_proj"],
            norm_preserve=True, scale=1.0,
        )
        for i in range(n_experts):
            new_norm = container.down_proj.data[i].norm().item()
            assert abs(orig_norms[i] - new_norm) < 1e-3, (
                f"Expert {i} norm not preserved: {orig_norms[i]:.4f} vs {new_norm:.4f}"
            )

    def test_skips_non_3d_params(self):
        """Should skip parameters that are not 3-dimensional."""
        hidden = 16

        class FlatExperts(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Parameter(torch.randn(32, hidden))

        container = FlatExperts()
        shared_dir = torch.randn(hidden, 1)
        shared_dir = shared_dir / shared_dir.norm()
        count = AbliterationPipeline._project_fused_3d_granular(
            container, shared_dir, {}, ["down_proj"],
            norm_preserve=False, scale=1.0,
        )
        assert count == 0
class TestEGAExciseIntegration:
    """Test that EGA integrates properly in the excise stage path."""

    def test_ega_pipeline_flags(self):
        """Pipeline with surgical method should enable per_expert_directions."""
        p = AbliterationPipeline(model_name="test", method="surgical")
        assert p.per_expert_directions is True

    def test_ega_only_on_primary_direction(self):
        """EGA should only apply for dir_idx==0, not higher SVD directions."""
        # Enforced by the `and dir_idx == 0` check in _excise; verify the code
        # structure exists via source inspection.
        import inspect
        source = inspect.getsource(AbliterationPipeline._excise_inner)
        assert "dir_idx == 0" in source, "EGA should only apply for primary direction"
        assert "_project_moe_experts_granular" in source, "EGA method should be called in excise"

    def test_ega_distill_integration(self):
        """EGA should be called during distill when per_expert_directions is enabled."""
        import inspect
        source = inspect.getsource(AbliterationPipeline._distill)
        assert "_compute_expert_granular_directions" in source
        assert "per_expert_directions" in source

    def test_nuclear_method_enables_ega(self):
        """Nuclear method should also enable per_expert_directions."""
        assert METHODS["nuclear"]["per_expert_directions"] is True
        p = AbliterationPipeline(model_name="test", method="nuclear")
        assert p.per_expert_directions is True

    def test_basic_method_disables_ega(self):
        """Basic method should not enable per_expert_directions."""
        assert METHODS["basic"].get("per_expert_directions", False) is False

    def test_inverted_method_enables_ega(self):
        """Inverted method should enable per_expert_directions."""
        assert METHODS["inverted"]["per_expert_directions"] is True

    def test_ega_with_routing_data_end_to_end(self):
        """End-to-end: EGA computes directions and granular projection modifies weights."""
        hidden, n_experts = 16, 4

        class FakeExpert(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, 32, bias=False)
                self.up_proj = torch.nn.Linear(hidden, 32, bias=False)

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, n_experts, bias=False)
                self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)])

        moe = FakeMoE()
        torch.manual_seed(42)
        for p in moe.parameters():
            p.data = torch.randn_like(p.data)
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        pipeline._strong_layers = [0]
        # Fake routing profile plus activations separated along a synthetic
        # refusal direction.
        pipeline._routing_harmful = {0: [torch.randn(n_experts) for _ in range(5)]}
        pipeline._routing_harmless = {0: [torch.randn(n_experts) for _ in range(5)]}
        direction = torch.randn(hidden)
        direction = direction / direction.norm()
        pipeline._harmful_acts = {0: [torch.randn(hidden) + 2 * direction for _ in range(5)]}
        pipeline._harmless_acts = {0: [torch.randn(hidden) - 2 * direction for _ in range(5)]}
        # Step 1: compute EGA directions.
        pipeline._compute_expert_granular_directions()
        assert 0 in pipeline._expert_directions
        assert len(pipeline._expert_directions[0]) > 0
        # Step 2: apply granular projection and confirm weights changed.
        shared = torch.randn(hidden, 1)
        shared = shared / shared.norm()
        before = moe.experts[0].down_proj.weight.data.clone()
        count = pipeline._project_moe_experts_granular(moe, shared, layer_idx=0)
        assert count > 0
        assert not torch.allclose(moe.experts[0].down_proj.weight.data, before), \
            "Expert weights should be modified by EGA"