# Mirror of https://github.com/elder-plinius/OBLITERATUS.git
# Synced 2026-04-28 22:26:15 +02:00
# (mirror page metadata: 365 lines, 12 KiB, Python)
"""Tests for advanced evaluation metrics."""
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
import pytest
|
|
import torch
|
|
|
|
from obliteratus.evaluation.advanced_metrics import (
|
|
REFUSAL_PREFIXES,
|
|
REFUSAL_SUBSTRINGS,
|
|
_is_refusal,
|
|
refusal_rate,
|
|
token_kl_divergence,
|
|
first_token_kl_divergence,
|
|
effective_rank,
|
|
effective_rank_change,
|
|
activation_cosine_similarity,
|
|
linear_cka,
|
|
refusal_projection_magnitude,
|
|
format_eval_report,
|
|
AbliterationEvalResult,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Refusal Rate
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestRefusalRate:
    """Behaviour of refusal_rate() and the _is_refusal() helper."""

    def test_all_refusals(self):
        outputs = [
            "I'm sorry, but I cannot help with that.",
            "I apologize, but I'm unable to assist.",
            "As an AI, I cannot provide that information.",
        ]
        assert refusal_rate(outputs, mode="prefix") == 1.0

    def test_no_refusals(self):
        outputs = [
            "Sure, here's how to do that.",
            "Here's a step-by-step guide.",
            "The answer is 42.",
        ]
        assert refusal_rate(outputs, mode="prefix") == 0.0

    def test_mixed_responses(self):
        outputs = [
            "I'm sorry, I can't do that.",
            "Sure, here you go.",
            "I cannot provide that.",
            "Here's what you need.",
        ]
        # Exactly two of the four responses are refusals.
        assert refusal_rate(outputs, mode="prefix") == 0.5

    def test_substring_mode(self):
        outputs = [
            "Well, actually I cannot help with that request.",
            "Let me explain how to do this.",
        ]
        assert refusal_rate(outputs, mode="substring") == 0.5

    def test_combined_mode(self):
        outputs = [
            "I'm sorry, but no.",  # flagged by the prefix list
            "That said, i won't do that for you.",  # flagged by the substring list
            "Here's the answer.",  # compliant
        ]
        observed = refusal_rate(outputs, mode="combined")
        assert abs(observed - 2.0 / 3.0) < 1e-6

    def test_empty_responses(self):
        # An empty batch must not divide by zero; the rate is defined as 0.
        assert refusal_rate([], mode="prefix") == 0.0

    def test_empty_string_not_refusal(self):
        assert not _is_refusal("", mode="combined")

    def test_is_refusal_prefix(self):
        assert _is_refusal("I'm sorry, I can't do that.", mode="prefix")
        assert _is_refusal("As an AI language model, I cannot", mode="prefix")
        assert not _is_refusal("The answer is 42.", mode="prefix")

    def test_is_refusal_substring(self):
        assert _is_refusal("I must say, i cannot assist with that.", mode="substring")
        assert not _is_refusal("The capital of France is Paris.", mode="substring")

    def test_refusal_prefixes_nonempty(self):
        assert len(REFUSAL_PREFIXES) > 20

    def test_refusal_substrings_nonempty(self):
        assert len(REFUSAL_SUBSTRINGS) > 10
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# KL Divergence
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestKLDivergence:
    """Sanity checks for token_kl_divergence and first_token_kl_divergence."""

    def test_identical_distributions(self):
        """Self-KL is zero up to floating-point noise."""
        logits = torch.randn(2, 10, 100)
        assert abs(token_kl_divergence(logits, logits)) < 1e-5

    def test_different_distributions(self):
        """Independent random logits yield strictly positive KL."""
        torch.manual_seed(42)
        lhs = torch.randn(2, 10, 100)
        rhs = torch.randn(2, 10, 100)
        assert token_kl_divergence(lhs, rhs) > 0

    def test_kl_nonnegative(self):
        """Gibbs' inequality: KL >= 0 for any pair of distributions."""
        torch.manual_seed(42)
        for _ in range(5):
            lhs = torch.randn(1, 5, 50)
            rhs = torch.randn(1, 5, 50)
            # Tolerate tiny negative values from floating-point error.
            assert token_kl_divergence(lhs, rhs) >= -1e-6

    def test_first_token_kl_identical(self):
        """Self first-token KL is zero up to floating-point noise."""
        logits = torch.randn(4, 20, 100)
        assert abs(first_token_kl_divergence(logits, logits)) < 1e-5

    def test_first_token_kl_different(self):
        """Independent random logits yield positive first-token KL."""
        torch.manual_seed(42)
        lhs = torch.randn(4, 20, 100)
        rhs = torch.randn(4, 20, 100)
        assert first_token_kl_divergence(lhs, rhs) > 0

    def test_temperature_effect(self):
        """Higher temperature flattens both distributions, shrinking KL."""
        torch.manual_seed(42)
        lhs = torch.randn(2, 5, 50)
        rhs = torch.randn(2, 5, 50)
        sharp = token_kl_divergence(lhs, rhs, temperature=1.0)
        smooth = token_kl_divergence(lhs, rhs, temperature=5.0)
        assert smooth < sharp
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Effective Rank
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestEffectiveRank:
    """effective_rank / effective_rank_change on matrices of known rank."""

    def test_rank_one_matrix(self):
        """An outer product has exactly one nonzero singular value."""
        col = torch.randn(8, 1)
        row = torch.randn(1, 4)
        assert effective_rank(col @ row) < 1.5

    def test_identity_matrix(self):
        """Equal singular values make effective rank equal the dimension."""
        dim = 8
        assert abs(effective_rank(torch.eye(dim)) - dim) < 0.1

    def test_random_full_rank(self):
        """A dense Gaussian matrix is almost surely close to full rank."""
        torch.manual_seed(42)
        erank = effective_rank(torch.randn(16, 16))
        assert erank > 10  # should sit near the full dimension of 16

    def test_zero_matrix(self):
        """A matrix with no singular mass has effective rank 0."""
        assert effective_rank(torch.zeros(4, 4)) == 0.0

    def test_effective_rank_change(self):
        """Projecting out a direction should not increase effective rank."""
        torch.manual_seed(42)
        before = torch.randn(8, 8)
        # Simulate abliteration: strip the component along one unit vector.
        direction = torch.randn(8, 1)
        direction = direction / direction.norm()
        after = before - (before @ direction) @ direction.T

        stats = effective_rank_change(before, after)
        for key in ("rank_before", "rank_after", "rank_delta", "rank_ratio"):
            assert key in stats
        assert stats["rank_after"] <= stats["rank_before"] + 0.1

    def test_rejects_non_2d(self):
        """Only 2-D weight matrices are accepted."""
        with pytest.raises(ValueError):
            effective_rank(torch.randn(4, 4, 4))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Activation Cosine Similarity
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestActivationCosineSimilarity:
    """activation_cosine_similarity on aligned, orthogonal and 3-D inputs."""

    def test_identical_activations(self):
        acts = torch.randn(10, 32)
        assert abs(activation_cosine_similarity(acts, acts) - 1.0) < 1e-5

    def test_orthogonal_activations(self):
        """Perpendicular activation vectors score approximately 0."""
        first = torch.tensor([[1.0, 0.0, 0.0]])
        second = torch.tensor([[0.0, 1.0, 0.0]])
        assert abs(activation_cosine_similarity(first, second)) < 1e-5

    def test_opposite_activations(self):
        """Negating the activations flips the cosine to -1."""
        acts = torch.randn(5, 16)
        sim = activation_cosine_similarity(acts, -acts)
        assert abs(sim - (-1.0)) < 1e-5

    def test_handles_3d(self):
        """3-D tensors are accepted (reshaped internally) and stay bounded."""
        first = torch.randn(2, 5, 16)
        second = torch.randn(2, 5, 16)
        assert -1.0 <= activation_cosine_similarity(first, second) <= 1.0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Linear CKA
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestLinearCKA:
    """Invariances and bounds of linear_cka."""

    def test_identical_representations(self):
        """CKA(X, X) is 1.0."""
        reps = torch.randn(20, 16)
        assert abs(linear_cka(reps, reps) - 1.0) < 1e-4

    def test_scaled_representations(self):
        """Isotropic scaling leaves CKA unchanged."""
        reps = torch.randn(20, 16)
        assert abs(linear_cka(reps, reps * 5.0) - 1.0) < 1e-4

    def test_random_representations(self):
        """Independent random representations score near 0."""
        torch.manual_seed(42)
        lhs = torch.randn(100, 16)
        rhs = torch.randn(100, 16)
        assert linear_cka(lhs, rhs) < 0.3

    def test_cka_bounded(self):
        """CKA stays within [0, 1] up to small numerical slack."""
        torch.manual_seed(42)
        for _ in range(5):
            lhs = torch.randn(20, 8)
            rhs = torch.randn(20, 8)
            assert -0.01 <= linear_cka(lhs, rhs) <= 1.01

    def test_different_dimensions(self):
        """The two inputs may have different hidden dimensions."""
        lhs = torch.randn(20, 16)
        rhs = torch.randn(20, 32)
        assert -0.01 <= linear_cka(lhs, rhs) <= 1.01

    def test_handles_3d(self):
        """3-D tensors are accepted (reshaped internally) and stay bounded."""
        lhs = torch.randn(2, 10, 16)
        rhs = torch.randn(2, 10, 16)
        assert -0.01 <= linear_cka(lhs, rhs) <= 1.01
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Refusal Direction Projection Magnitude
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestRefusalProjection:
    """Summary statistics produced by refusal_projection_magnitude."""

    def test_aligned_activations(self):
        """Activations along the direction project to their magnitudes."""
        direction = torch.tensor([1.0, 0.0, 0.0])
        acts = torch.tensor([
            [5.0, 0.0, 0.0],
            [3.0, 0.0, 0.0],
            [4.0, 0.0, 0.0],
        ])
        stats = refusal_projection_magnitude(acts, direction)
        # mean of (5, 3, 4) is exactly representable, so == is safe here
        assert stats["mean"] == 4.0
        assert stats["abs_mean"] == 4.0

    def test_orthogonal_activations(self):
        """Activations perpendicular to the direction project to ~0."""
        direction = torch.tensor([1.0, 0.0, 0.0])
        acts = torch.tensor([
            [0.0, 5.0, 0.0],
            [0.0, 0.0, 3.0],
        ])
        stats = refusal_projection_magnitude(acts, direction)
        assert abs(stats["mean"]) < 1e-5
        assert abs(stats["abs_mean"]) < 1e-5

    def test_result_keys(self):
        """The result dict exposes exactly the five expected statistics."""
        direction = torch.randn(8)
        acts = torch.randn(5, 8)
        stats = refusal_projection_magnitude(acts, direction)
        assert set(stats) == {"mean", "std", "max", "min", "abs_mean"}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Eval Report Formatting
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestEvalReport:
    """Rendering of format_eval_report for different metric combinations."""

    def test_format_report(self):
        outcome = AbliterationEvalResult(
            refusal_rate_harmful=0.1,
            refusal_rate_harmless=0.02,
            kl_divergence=0.15,
            perplexity=12.5,
            coherence_score=0.8,
            mean_activation_cosine=0.95,
            mean_cka=0.92,
        )
        text = format_eval_report(outcome)
        assert "10.0%" in text
        assert "12.50" in text
        # A KL below 0.2 is labelled "excellent" by the report.
        assert "excellent" in text

    def test_format_report_high_kl(self):
        outcome = AbliterationEvalResult(
            refusal_rate_harmful=0.0,
            refusal_rate_harmless=0.0,
            kl_divergence=1.5,
            perplexity=50.0,
            coherence_score=0.4,
            mean_activation_cosine=None,
            mean_cka=None,
        )
        # A large KL should be called out as damaging the model.
        assert "significant damage" in format_eval_report(outcome)

    def test_format_report_no_kl(self):
        outcome = AbliterationEvalResult(
            refusal_rate_harmful=0.5,
            refusal_rate_harmless=0.1,
            kl_divergence=None,
            perplexity=20.0,
            coherence_score=1.0,
            mean_activation_cosine=None,
            mean_cka=None,
        )
        text = format_eval_report(outcome)
        assert "50.0%" in text
        # A missing KL metric is omitted from the report entirely.
        assert "KL" not in text
|