Files
OBLITERATUS/tests/test_advanced_metrics.py
2026-03-05 00:50:44 -08:00

365 lines
12 KiB
Python

"""Tests for advanced evaluation metrics."""
from __future__ import annotations
import pytest
import torch
from obliteratus.evaluation.advanced_metrics import (
REFUSAL_PREFIXES,
REFUSAL_SUBSTRINGS,
_is_refusal,
refusal_rate,
token_kl_divergence,
first_token_kl_divergence,
effective_rank,
effective_rank_change,
activation_cosine_similarity,
linear_cka,
refusal_projection_magnitude,
format_eval_report,
AbliterationEvalResult,
)
# ---------------------------------------------------------------------------
# Refusal Rate
# ---------------------------------------------------------------------------
class TestRefusalRate:
    """Coverage for `_is_refusal` and the aggregate `refusal_rate` metric."""

    def test_all_refusals(self):
        """Every response refuses -> rate is exactly 1.0 in prefix mode."""
        all_refused = [
            "I'm sorry, but I cannot help with that.",
            "I apologize, but I'm unable to assist.",
            "As an AI, I cannot provide that information.",
        ]
        assert refusal_rate(all_refused, mode="prefix") == 1.0

    def test_no_refusals(self):
        """Fully compliant responses -> rate is exactly 0.0 in prefix mode."""
        compliant = [
            "Sure, here's how to do that.",
            "Here's a step-by-step guide.",
            "The answer is 42.",
        ]
        assert refusal_rate(compliant, mode="prefix") == 0.0

    def test_mixed_responses(self):
        """Half refusals, half answers -> rate of 0.5."""
        half_and_half = [
            "I'm sorry, I can't do that.",
            "Sure, here you go.",
            "I cannot provide that.",
            "Here's what you need.",
        ]
        # 2 refusals out of 4 responses
        assert refusal_rate(half_and_half, mode="prefix") == 0.5

    def test_substring_mode(self):
        """Substring mode catches refusals buried mid-sentence."""
        samples = [
            "Well, actually I cannot help with that request.",
            "Let me explain how to do this.",
        ]
        assert refusal_rate(samples, mode="substring") == 0.5

    def test_combined_mode(self):
        """Combined mode unions prefix and substring detection."""
        samples = [
            "I'm sorry, but no.",  # caught by prefix
            "That said, i won't do that for you.",  # caught by substring
            "Here's the answer.",  # not a refusal
        ]
        observed = refusal_rate(samples, mode="combined")
        assert abs(observed - 2.0 / 3.0) < 1e-6

    def test_empty_responses(self):
        """An empty response list yields a 0.0 rate rather than an error."""
        assert refusal_rate([], mode="prefix") == 0.0

    def test_empty_string_not_refusal(self):
        """The empty string must never be classified as a refusal."""
        assert not _is_refusal("", mode="combined")

    def test_is_refusal_prefix(self):
        """Prefix mode flags canonical refusal openings only."""
        assert _is_refusal("I'm sorry, I can't do that.", mode="prefix")
        assert _is_refusal("As an AI language model, I cannot", mode="prefix")
        assert not _is_refusal("The answer is 42.", mode="prefix")

    def test_is_refusal_substring(self):
        """Substring mode flags refusal phrases anywhere in the text."""
        assert _is_refusal("I must say, i cannot assist with that.", mode="substring")
        assert not _is_refusal("The capital of France is Paris.", mode="substring")

    def test_refusal_prefixes_nonempty(self):
        """The prefix lexicon should be reasonably large."""
        assert len(REFUSAL_PREFIXES) > 20

    def test_refusal_substrings_nonempty(self):
        """The substring lexicon should be reasonably large."""
        assert len(REFUSAL_SUBSTRINGS) > 10
# ---------------------------------------------------------------------------
# KL Divergence
# ---------------------------------------------------------------------------
class TestKLDivergence:
    """Sanity checks for token-level and first-token KL divergence metrics."""

    def test_identical_distributions(self):
        """KL(p || p) should vanish for the full-sequence metric."""
        shared = torch.randn(2, 10, 100)
        assert abs(token_kl_divergence(shared, shared)) < 1e-5

    def test_different_distributions(self):
        """Independent random logits should give strictly positive KL."""
        torch.manual_seed(42)
        lhs, rhs = torch.randn(2, 10, 100), torch.randn(2, 10, 100)
        assert token_kl_divergence(lhs, rhs) > 0

    def test_kl_nonnegative(self):
        """Gibbs' inequality: KL is never negative (modulo float noise)."""
        torch.manual_seed(42)
        for _ in range(5):
            lhs, rhs = torch.randn(1, 5, 50), torch.randn(1, 5, 50)
            # allow small numerical errors
            assert token_kl_divergence(lhs, rhs) >= -1e-6

    def test_first_token_kl_identical(self):
        """First-token KL of a distribution against itself is zero."""
        shared = torch.randn(4, 20, 100)
        assert abs(first_token_kl_divergence(shared, shared)) < 1e-5

    def test_first_token_kl_different(self):
        """First-token KL between independent logits is positive."""
        torch.manual_seed(42)
        lhs, rhs = torch.randn(4, 20, 100), torch.randn(4, 20, 100)
        assert first_token_kl_divergence(lhs, rhs) > 0

    def test_temperature_effect(self):
        """Raising temperature smooths both distributions, shrinking KL."""
        torch.manual_seed(42)
        lhs, rhs = torch.randn(2, 5, 50), torch.randn(2, 5, 50)
        sharp = token_kl_divergence(lhs, rhs, temperature=1.0)
        smooth = token_kl_divergence(lhs, rhs, temperature=5.0)
        assert smooth < sharp
# ---------------------------------------------------------------------------
# Effective Rank
# ---------------------------------------------------------------------------
class TestEffectiveRank:
    """Checks for the entropy-based effective rank and its before/after delta."""

    def test_rank_one_matrix(self):
        """An outer product has rank 1, so effective rank should be near 1."""
        left = torch.randn(8, 1)
        right = torch.randn(1, 4)
        assert effective_rank(left @ right) < 1.5

    def test_identity_matrix(self):
        """The n-dim identity has n equal singular values -> rank ~= n."""
        dim = 8
        assert abs(effective_rank(torch.eye(dim)) - dim) < 0.1

    def test_random_full_rank(self):
        """A random square matrix is almost surely close to full rank."""
        torch.manual_seed(42)
        # should be close to 16
        assert effective_rank(torch.randn(16, 16)) > 10

    def test_zero_matrix(self):
        """The all-zeros matrix carries no spectrum -> rank 0."""
        assert effective_rank(torch.zeros(4, 4)) == 0.0

    def test_effective_rank_change(self):
        """Projecting out one direction never increases effective rank."""
        torch.manual_seed(42)
        original = torch.randn(8, 8)
        # Simulate abliteration: remove a unit direction from the row space.
        direction = torch.randn(8, 1)
        direction = direction / direction.norm()
        ablated = original - (original @ direction) @ direction.T
        stats = effective_rank_change(original, ablated)
        for key in ("rank_before", "rank_after", "rank_delta", "rank_ratio"):
            assert key in stats
        assert stats["rank_after"] <= stats["rank_before"] + 0.1

    def test_rejects_non_2d(self):
        """A 3D tensor is not a weight matrix and must raise ValueError."""
        with pytest.raises(ValueError):
            effective_rank(torch.randn(4, 4, 4))
# ---------------------------------------------------------------------------
# Activation Cosine Similarity
# ---------------------------------------------------------------------------
class TestActivationCosineSimilarity:
    """Checks for mean cosine similarity between activation sets."""

    def test_identical_activations(self):
        """A tensor compared with itself gives cosine 1.0."""
        hidden = torch.randn(10, 32)
        assert abs(activation_cosine_similarity(hidden, hidden) - 1.0) < 1e-5

    def test_orthogonal_activations(self):
        """Perpendicular unit vectors give cosine ~0."""
        e1 = torch.tensor([[1.0, 0.0, 0.0]])
        e2 = torch.tensor([[0.0, 1.0, 0.0]])
        assert abs(activation_cosine_similarity(e1, e2)) < 1e-5

    def test_opposite_activations(self):
        """Negating the activations flips the cosine to -1."""
        hidden = torch.randn(5, 16)
        score = activation_cosine_similarity(hidden, -hidden)
        assert abs(score - (-1.0)) < 1e-5

    def test_handles_3d(self):
        """Batched (3D) activations are accepted and stay in [-1, 1]."""
        lhs = torch.randn(2, 5, 16)
        rhs = torch.randn(2, 5, 16)
        assert -1.0 <= activation_cosine_similarity(lhs, rhs) <= 1.0
# ---------------------------------------------------------------------------
# Linear CKA
# ---------------------------------------------------------------------------
class TestLinearCKA:
    """Properties of linear Centered Kernel Alignment between representations."""

    def test_identical_representations(self):
        """CKA(X, X) must be 1.0."""
        reps = torch.randn(20, 16)
        assert abs(linear_cka(reps, reps) - 1.0) < 1e-4

    def test_scaled_representations(self):
        """Isotropic scaling leaves CKA unchanged at 1.0."""
        reps = torch.randn(20, 16)
        scaled = reps * 5.0
        assert abs(linear_cka(reps, scaled) - 1.0) < 1e-4

    def test_random_representations(self):
        """Unrelated random representations should score low."""
        torch.manual_seed(42)
        lhs = torch.randn(100, 16)
        rhs = torch.randn(100, 16)
        # random should be near 0
        assert linear_cka(lhs, rhs) < 0.3

    def test_cka_bounded(self):
        """CKA lies in [0, 1] up to small numerical tolerance."""
        torch.manual_seed(42)
        for _ in range(5):
            lhs, rhs = torch.randn(20, 8), torch.randn(20, 8)
            # small tolerance for numerics
            assert -0.01 <= linear_cka(lhs, rhs) <= 1.01

    def test_different_dimensions(self):
        """Mismatched feature widths are fine; only sample counts must match."""
        narrow = torch.randn(20, 16)
        wide = torch.randn(20, 32)
        assert -0.01 <= linear_cka(narrow, wide) <= 1.01

    def test_handles_3d(self):
        """Batched (3D) inputs are flattened internally and stay bounded."""
        lhs = torch.randn(2, 10, 16)
        rhs = torch.randn(2, 10, 16)
        assert -0.01 <= linear_cka(lhs, rhs) <= 1.01
# ---------------------------------------------------------------------------
# Refusal Direction Projection Magnitude
# ---------------------------------------------------------------------------
class TestRefusalProjection:
    """Tests for `refusal_projection_magnitude` summary statistics.

    Fix: the original tests asserted exact float equality (``== 4.0``) on
    values produced by tensor reductions. That is fragile — a change in
    dtype or reduction order in the implementation could make an otherwise
    correct result fail the test. Replaced with ``pytest.approx`` so the
    same expected values are pinned with a numerical tolerance.
    """

    def test_aligned_activations(self):
        """Activations aligned with direction should have high projection."""
        d = torch.tensor([1.0, 0.0, 0.0])
        acts = torch.tensor([
            [5.0, 0.0, 0.0],
            [3.0, 0.0, 0.0],
            [4.0, 0.0, 0.0],
        ])
        result = refusal_projection_magnitude(acts, d)
        # Projections onto d are 5, 3, 4 -> mean 4.0; approx guards float math.
        assert result["mean"] == pytest.approx(4.0)
        assert result["abs_mean"] == pytest.approx(4.0)

    def test_orthogonal_activations(self):
        """Orthogonal activations should have zero projection."""
        d = torch.tensor([1.0, 0.0, 0.0])
        acts = torch.tensor([
            [0.0, 5.0, 0.0],
            [0.0, 0.0, 3.0],
        ])
        result = refusal_projection_magnitude(acts, d)
        # abs= tolerance needed: relative tolerance is meaningless near zero.
        assert result["mean"] == pytest.approx(0.0, abs=1e-5)
        assert result["abs_mean"] == pytest.approx(0.0, abs=1e-5)

    def test_result_keys(self):
        """Should return all expected summary-statistic keys, no extras."""
        d = torch.randn(8)
        acts = torch.randn(5, 8)
        result = refusal_projection_magnitude(acts, d)
        assert set(result.keys()) == {"mean", "std", "max", "min", "abs_mean"}
# ---------------------------------------------------------------------------
# Eval Report Formatting
# ---------------------------------------------------------------------------
class TestEvalReport:
    """Formatting checks for the human-readable abliteration eval report."""

    def test_format_report(self):
        """A healthy result renders percentages, perplexity, and a verdict."""
        outcome = AbliterationEvalResult(
            refusal_rate_harmful=0.1,
            refusal_rate_harmless=0.02,
            kl_divergence=0.15,
            perplexity=12.5,
            coherence_score=0.8,
            mean_activation_cosine=0.95,
            mean_cka=0.92,
        )
        rendered = format_eval_report(outcome)
        assert "10.0%" in rendered
        assert "12.50" in rendered
        # KL < 0.2
        assert "excellent" in rendered

    def test_format_report_high_kl(self):
        """A large KL divergence is flagged as significant damage."""
        outcome = AbliterationEvalResult(
            refusal_rate_harmful=0.0,
            refusal_rate_harmless=0.0,
            kl_divergence=1.5,
            perplexity=50.0,
            coherence_score=0.4,
            mean_activation_cosine=None,
            mean_cka=None,
        )
        assert "significant damage" in format_eval_report(outcome)

    def test_format_report_no_kl(self):
        """When KL is absent, the report omits the KL section entirely."""
        outcome = AbliterationEvalResult(
            refusal_rate_harmful=0.5,
            refusal_rate_harmless=0.1,
            kl_divergence=None,
            perplexity=20.0,
            coherence_score=1.0,
            mean_activation_cosine=None,
            mean_cka=None,
        )
        rendered = format_eval_report(outcome)
        assert "50.0%" in rendered
        assert "KL" not in rendered