Files
OBLITERATUS/tests/test_advanced_metrics.py
2026-03-05 00:50:44 -08:00

365 lines
12 KiB
Python

"""Tests for advanced evaluation metrics."""
from __future__ import annotations
import pytest
import torch
from obliteratus.evaluation.advanced_metrics import (
REFUSAL_PREFIXES,
REFUSAL_SUBSTRINGS,
_is_refusal,
refusal_rate,
token_kl_divergence,
first_token_kl_divergence,
effective_rank,
effective_rank_change,
activation_cosine_similarity,
linear_cka,
refusal_projection_magnitude,
format_eval_report,
AbliterationEvalResult,
)
# ---------------------------------------------------------------------------
# Refusal Rate
# ---------------------------------------------------------------------------
class TestRefusalRate:
    """Coverage for `_is_refusal` and the aggregate `refusal_rate` metric."""

    def test_all_refusals(self):
        """Every response refuses -> rate is exactly 1.0 in prefix mode."""
        all_refused = [
            "I'm sorry, but I cannot help with that.",
            "I apologize, but I'm unable to assist.",
            "As an AI, I cannot provide that information.",
        ]
        assert refusal_rate(all_refused, mode="prefix") == 1.0

    def test_no_refusals(self):
        """Fully compliant responses -> rate is exactly 0.0 in prefix mode."""
        compliant = [
            "Sure, here's how to do that.",
            "Here's a step-by-step guide.",
            "The answer is 42.",
        ]
        assert refusal_rate(compliant, mode="prefix") == 0.0

    def test_mixed_responses(self):
        """Half refusals, half answers -> rate of 0.5."""
        half_and_half = [
            "I'm sorry, I can't do that.",
            "Sure, here you go.",
            "I cannot provide that.",
            "Here's what you need.",
        ]
        # 2 refusals out of 4 responses
        assert refusal_rate(half_and_half, mode="prefix") == 0.5

    def test_substring_mode(self):
        """Substring mode catches refusals buried mid-sentence."""
        samples = [
            "Well, actually I cannot help with that request.",
            "Let me explain how to do this.",
        ]
        assert refusal_rate(samples, mode="substring") == 0.5

    def test_combined_mode(self):
        """Combined mode unions prefix and substring detection."""
        samples = [
            "I'm sorry, but no.",  # caught by prefix
            "That said, i won't do that for you.",  # caught by substring
            "Here's the answer.",  # not a refusal
        ]
        observed = refusal_rate(samples, mode="combined")
        assert abs(observed - 2.0 / 3.0) < 1e-6

    def test_empty_responses(self):
        """An empty response list yields a 0.0 rate rather than an error."""
        assert refusal_rate([], mode="prefix") == 0.0

    def test_empty_string_not_refusal(self):
        """The empty string must never be classified as a refusal."""
        assert not _is_refusal("", mode="combined")

    def test_is_refusal_prefix(self):
        """Prefix mode flags canonical refusal openings only."""
        assert _is_refusal("I'm sorry, I can't do that.", mode="prefix")
        assert _is_refusal("As an AI language model, I cannot", mode="prefix")
        assert not _is_refusal("The answer is 42.", mode="prefix")

    def test_is_refusal_substring(self):
        """Substring mode flags refusal phrases anywhere in the text."""
        assert _is_refusal("I must say, i cannot assist with that.", mode="substring")
        assert not _is_refusal("The capital of France is Paris.", mode="substring")

    def test_refusal_prefixes_nonempty(self):
        """The prefix lexicon should be reasonably large."""
        assert len(REFUSAL_PREFIXES) > 20

    def test_refusal_substrings_nonempty(self):
        """The substring lexicon should be reasonably large."""
        assert len(REFUSAL_SUBSTRINGS) > 10
# ---------------------------------------------------------------------------
# KL Divergence
# ---------------------------------------------------------------------------
class TestKLDivergence:
    """Sanity checks for token-level and first-token KL divergence metrics."""

    def test_identical_distributions(self):
        """KL(p || p) should vanish for the full-sequence metric."""
        shared = torch.randn(2, 10, 100)
        assert abs(token_kl_divergence(shared, shared)) < 1e-5

    def test_different_distributions(self):
        """Independent random logits should give strictly positive KL."""
        torch.manual_seed(42)
        lhs, rhs = torch.randn(2, 10, 100), torch.randn(2, 10, 100)
        assert token_kl_divergence(lhs, rhs) > 0

    def test_kl_nonnegative(self):
        """Gibbs' inequality: KL is never negative (modulo float noise)."""
        torch.manual_seed(42)
        for _ in range(5):
            lhs, rhs = torch.randn(1, 5, 50), torch.randn(1, 5, 50)
            # allow small numerical errors
            assert token_kl_divergence(lhs, rhs) >= -1e-6

    def test_first_token_kl_identical(self):
        """First-token KL of a distribution against itself is zero."""
        shared = torch.randn(4, 20, 100)
        assert abs(first_token_kl_divergence(shared, shared)) < 1e-5

    def test_first_token_kl_different(self):
        """First-token KL between independent logits is positive."""
        torch.manual_seed(42)
        lhs, rhs = torch.randn(4, 20, 100), torch.randn(4, 20, 100)
        assert first_token_kl_divergence(lhs, rhs) > 0

    def test_temperature_effect(self):
        """Raising temperature smooths both distributions, shrinking KL."""
        torch.manual_seed(42)
        lhs, rhs = torch.randn(2, 5, 50), torch.randn(2, 5, 50)
        sharp = token_kl_divergence(lhs, rhs, temperature=1.0)
        smooth = token_kl_divergence(lhs, rhs, temperature=5.0)
        assert smooth < sharp
# ---------------------------------------------------------------------------
# Effective Rank
# ---------------------------------------------------------------------------
class TestEffectiveRank:
    """Checks for the entropy-based effective rank and its before/after delta."""

    def test_rank_one_matrix(self):
        """An outer product has rank 1, so effective rank should be near 1."""
        left = torch.randn(8, 1)
        right = torch.randn(1, 4)
        assert effective_rank(left @ right) < 1.5

    def test_identity_matrix(self):
        """The n-dim identity has n equal singular values -> rank ~= n."""
        dim = 8
        assert abs(effective_rank(torch.eye(dim)) - dim) < 0.1

    def test_random_full_rank(self):
        """A random square matrix is almost surely close to full rank."""
        torch.manual_seed(42)
        # should be close to 16
        assert effective_rank(torch.randn(16, 16)) > 10

    def test_zero_matrix(self):
        """The all-zeros matrix carries no spectrum -> rank 0."""
        assert effective_rank(torch.zeros(4, 4)) == 0.0

    def test_effective_rank_change(self):
        """Projecting out one direction never increases effective rank."""
        torch.manual_seed(42)
        original = torch.randn(8, 8)
        # Simulate abliteration: remove a unit direction from the row space.
        direction = torch.randn(8, 1)
        direction = direction / direction.norm()
        ablated = original - (original @ direction) @ direction.T
        stats = effective_rank_change(original, ablated)
        for key in ("rank_before", "rank_after", "rank_delta", "rank_ratio"):
            assert key in stats
        assert stats["rank_after"] <= stats["rank_before"] + 0.1

    def test_rejects_non_2d(self):
        """A 3D tensor is not a weight matrix and must raise ValueError."""
        with pytest.raises(ValueError):
            effective_rank(torch.randn(4, 4, 4))
# ---------------------------------------------------------------------------
# Activation Cosine Similarity
# ---------------------------------------------------------------------------
class TestActivationCosineSimilarity:
    """Checks for mean cosine similarity between activation sets."""

    def test_identical_activations(self):
        """A tensor compared with itself gives cosine 1.0."""
        hidden = torch.randn(10, 32)
        assert abs(activation_cosine_similarity(hidden, hidden) - 1.0) < 1e-5

    def test_orthogonal_activations(self):
        """Perpendicular unit vectors give cosine ~0."""
        e1 = torch.tensor([[1.0, 0.0, 0.0]])
        e2 = torch.tensor([[0.0, 1.0, 0.0]])
        assert abs(activation_cosine_similarity(e1, e2)) < 1e-5

    def test_opposite_activations(self):
        """Negating the activations flips the cosine to -1."""
        hidden = torch.randn(5, 16)
        score = activation_cosine_similarity(hidden, -hidden)
        assert abs(score - (-1.0)) < 1e-5

    def test_handles_3d(self):
        """Batched (3D) activations are accepted and stay in [-1, 1]."""
        lhs = torch.randn(2, 5, 16)
        rhs = torch.randn(2, 5, 16)
        assert -1.0 <= activation_cosine_similarity(lhs, rhs) <= 1.0
# ---------------------------------------------------------------------------
# Linear CKA
# ---------------------------------------------------------------------------
class TestLinearCKA:
    """Properties of linear Centered Kernel Alignment between representations."""

    def test_identical_representations(self):
        """CKA(X, X) must be 1.0."""
        reps = torch.randn(20, 16)
        assert abs(linear_cka(reps, reps) - 1.0) < 1e-4

    def test_scaled_representations(self):
        """Isotropic scaling leaves CKA unchanged at 1.0."""
        reps = torch.randn(20, 16)
        scaled = reps * 5.0
        assert abs(linear_cka(reps, scaled) - 1.0) < 1e-4

    def test_random_representations(self):
        """Unrelated random representations should score low."""
        torch.manual_seed(42)
        lhs = torch.randn(100, 16)
        rhs = torch.randn(100, 16)
        # random should be near 0
        assert linear_cka(lhs, rhs) < 0.3

    def test_cka_bounded(self):
        """CKA lies in [0, 1] up to small numerical tolerance."""
        torch.manual_seed(42)
        for _ in range(5):
            lhs, rhs = torch.randn(20, 8), torch.randn(20, 8)
            # small tolerance for numerics
            assert -0.01 <= linear_cka(lhs, rhs) <= 1.01

    def test_different_dimensions(self):
        """Mismatched feature widths are fine; only sample counts must match."""
        narrow = torch.randn(20, 16)
        wide = torch.randn(20, 32)
        assert -0.01 <= linear_cka(narrow, wide) <= 1.01

    def test_handles_3d(self):
        """Batched (3D) inputs are flattened internally and stay bounded."""
        lhs = torch.randn(2, 10, 16)
        rhs = torch.randn(2, 10, 16)
        assert -0.01 <= linear_cka(lhs, rhs) <= 1.01
# ---------------------------------------------------------------------------
# Refusal Direction Projection Magnitude
# ---------------------------------------------------------------------------
class TestRefusalProjection:
    """Tests for `refusal_projection_magnitude` summary statistics.

    Fix: the original tests asserted exact float equality (``== 4.0``) on
    values produced by tensor reductions. That is fragile — a change in
    dtype or reduction order in the implementation could make an otherwise
    correct result fail the test. Replaced with ``pytest.approx`` so the
    same expected values are pinned with a numerical tolerance.
    """

    def test_aligned_activations(self):
        """Activations aligned with direction should have high projection."""
        d = torch.tensor([1.0, 0.0, 0.0])
        acts = torch.tensor([
            [5.0, 0.0, 0.0],
            [3.0, 0.0, 0.0],
            [4.0, 0.0, 0.0],
        ])
        result = refusal_projection_magnitude(acts, d)
        # Projections onto d are 5, 3, 4 -> mean 4.0; approx guards float math.
        assert result["mean"] == pytest.approx(4.0)
        assert result["abs_mean"] == pytest.approx(4.0)

    def test_orthogonal_activations(self):
        """Orthogonal activations should have zero projection."""
        d = torch.tensor([1.0, 0.0, 0.0])
        acts = torch.tensor([
            [0.0, 5.0, 0.0],
            [0.0, 0.0, 3.0],
        ])
        result = refusal_projection_magnitude(acts, d)
        # abs= tolerance needed: relative tolerance is meaningless near zero.
        assert result["mean"] == pytest.approx(0.0, abs=1e-5)
        assert result["abs_mean"] == pytest.approx(0.0, abs=1e-5)

    def test_result_keys(self):
        """Should return all expected summary-statistic keys, no extras."""
        d = torch.randn(8)
        acts = torch.randn(5, 8)
        result = refusal_projection_magnitude(acts, d)
        assert set(result.keys()) == {"mean", "std", "max", "min", "abs_mean"}
# ---------------------------------------------------------------------------
# Eval Report Formatting
# ---------------------------------------------------------------------------
class TestEvalReport:
    """Formatting checks for the human-readable abliteration eval report."""

    def test_format_report(self):
        """A healthy result renders percentages, perplexity, and a verdict."""
        outcome = AbliterationEvalResult(
            refusal_rate_harmful=0.1,
            refusal_rate_harmless=0.02,
            kl_divergence=0.15,
            perplexity=12.5,
            coherence_score=0.8,
            mean_activation_cosine=0.95,
            mean_cka=0.92,
        )
        rendered = format_eval_report(outcome)
        assert "10.0%" in rendered
        assert "12.50" in rendered
        # KL < 0.2
        assert "excellent" in rendered

    def test_format_report_high_kl(self):
        """A large KL divergence is flagged as significant damage."""
        outcome = AbliterationEvalResult(
            refusal_rate_harmful=0.0,
            refusal_rate_harmless=0.0,
            kl_divergence=1.5,
            perplexity=50.0,
            coherence_score=0.4,
            mean_activation_cosine=None,
            mean_cka=None,
        )
        assert "significant damage" in format_eval_report(outcome)

    def test_format_report_no_kl(self):
        """When KL is absent, the report omits the KL section entirely."""
        outcome = AbliterationEvalResult(
            refusal_rate_harmful=0.5,
            refusal_rate_harmless=0.1,
            kl_divergence=None,
            perplexity=20.0,
            coherence_score=1.0,
            mean_activation_cosine=None,
            mean_cka=None,
        )
        rendered = format_eval_report(outcome)
        assert "50.0%" in rendered
        assert "KL" not in rendered