mirror of
https://github.com/elder-plinius/OBLITERATUS.git
synced 2026-04-23 19:56:15 +02:00
346 lines
14 KiB
Python
346 lines
14 KiB
Python
"""Tests for the analysis techniques."""
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
import torch
|
|
|
|
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor, WhitenedSVDResult
|
|
from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer, CrossLayerResult
|
|
from obliteratus.analysis.activation_probing import ActivationProbe, ProbeResult
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# WhitenedSVDExtractor
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestWhitenedSVD:
|
|
def test_basic_extraction(self):
|
|
"""Whitened SVD should extract directions from activation differences."""
|
|
torch.manual_seed(42)
|
|
n_prompts, hidden_dim = 10, 32
|
|
|
|
# Create activations with a clear refusal direction
|
|
refusal_dir = torch.randn(hidden_dim)
|
|
refusal_dir = refusal_dir / refusal_dir.norm()
|
|
|
|
harmless = [torch.randn(hidden_dim) for _ in range(n_prompts)]
|
|
harmful = [h + 2.0 * refusal_dir for h in harmless] # shifted along refusal dir
|
|
|
|
extractor = WhitenedSVDExtractor()
|
|
result = extractor.extract(harmful, harmless, n_directions=3)
|
|
|
|
assert isinstance(result, WhitenedSVDResult)
|
|
assert result.directions.shape == (3, hidden_dim)
|
|
assert result.singular_values.shape == (3,)
|
|
assert result.variance_explained > 0
|
|
assert result.condition_number > 0
|
|
assert result.effective_rank > 0
|
|
|
|
def test_directions_are_unit_vectors(self):
|
|
"""Extracted directions should be unit length."""
|
|
torch.manual_seed(42)
|
|
harmless = [torch.randn(16) for _ in range(8)]
|
|
harmful = [h + torch.randn(16) * 0.5 for h in harmless]
|
|
|
|
extractor = WhitenedSVDExtractor()
|
|
result = extractor.extract(harmful, harmless, n_directions=2)
|
|
|
|
for i in range(result.directions.shape[0]):
|
|
assert abs(result.directions[i].norm().item() - 1.0) < 1e-4
|
|
|
|
def test_primary_aligns_with_planted_direction(self):
|
|
"""Primary whitened direction should capture the planted refusal signal.
|
|
|
|
Whitening rotates directions relative to the covariance structure,
|
|
so perfect alignment with the raw direction is not expected. We verify
|
|
the whitened direction explains substantial variance and has moderate
|
|
alignment (whitening intentionally reweights dimensions).
|
|
"""
|
|
torch.manual_seed(42)
|
|
hidden_dim = 64
|
|
n_prompts = 30
|
|
|
|
refusal_dir = torch.randn(hidden_dim)
|
|
refusal_dir = refusal_dir / refusal_dir.norm()
|
|
|
|
# Isotropic harmless activations (whitening has minimal effect)
|
|
harmless = [torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)]
|
|
harmful = [h + 5.0 * refusal_dir for h in harmless]
|
|
|
|
extractor = WhitenedSVDExtractor(regularization_eps=1e-3)
|
|
result = extractor.extract(harmful, harmless, n_directions=1)
|
|
|
|
cos_sim = (result.directions[0] @ refusal_dir).abs().item()
|
|
# Moderate alignment expected (whitening reweights dimensions)
|
|
assert cos_sim > 0.2, f"Expected alignment > 0.2, got {cos_sim:.3f}"
|
|
# More importantly: the direction should explain most variance
|
|
assert result.variance_explained > 0.5
|
|
|
|
def test_extract_all_layers(self):
|
|
"""Should extract directions for all provided layers."""
|
|
torch.manual_seed(42)
|
|
harmful_acts = {}
|
|
harmless_acts = {}
|
|
for layer in range(4):
|
|
harmful_acts[layer] = [torch.randn(16) for _ in range(5)]
|
|
harmless_acts[layer] = [torch.randn(16) for _ in range(5)]
|
|
|
|
extractor = WhitenedSVDExtractor()
|
|
results = extractor.extract_all_layers(harmful_acts, harmless_acts, n_directions=2)
|
|
|
|
assert len(results) == 4
|
|
for idx in range(4):
|
|
assert idx in results
|
|
assert results[idx].directions.shape[0] == 2
|
|
|
|
def test_compare_with_standard(self):
|
|
"""Comparison should return valid cosine similarities."""
|
|
torch.manual_seed(42)
|
|
harmless = [torch.randn(16) for _ in range(8)]
|
|
harmful = [h + torch.randn(16) for h in harmless]
|
|
|
|
extractor = WhitenedSVDExtractor()
|
|
result = extractor.extract(harmful, harmless, n_directions=2)
|
|
|
|
std_dir = torch.randn(16)
|
|
std_dir = std_dir / std_dir.norm()
|
|
|
|
comparison = WhitenedSVDExtractor.compare_with_standard(result, std_dir)
|
|
assert "primary_direction_cosine" in comparison
|
|
assert "subspace_principal_cosine" in comparison
|
|
assert 0 <= comparison["primary_direction_cosine"] <= 1.0
|
|
|
|
def test_handles_3d_activations(self):
|
|
"""Should handle activations with an extra batch dimension."""
|
|
torch.manual_seed(42)
|
|
# (1, hidden_dim) shape from hook output
|
|
harmless = [torch.randn(1, 16) for _ in range(5)]
|
|
harmful = [torch.randn(1, 16) for _ in range(5)]
|
|
|
|
extractor = WhitenedSVDExtractor()
|
|
result = extractor.extract(harmful, harmless, n_directions=2)
|
|
assert result.directions.shape == (2, 16)
|
|
|
|
def test_variance_explained_bounded(self):
|
|
"""Variance explained should be between 0 and 1."""
|
|
torch.manual_seed(42)
|
|
harmless = [torch.randn(16) for _ in range(8)]
|
|
harmful = [torch.randn(16) for _ in range(8)]
|
|
|
|
extractor = WhitenedSVDExtractor()
|
|
result = extractor.extract(harmful, harmless, n_directions=3)
|
|
assert 0 <= result.variance_explained <= 1.0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CrossLayerAlignmentAnalyzer
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestCrossLayerAlignment:
|
|
def test_identical_directions(self):
|
|
"""Identical directions across layers should give persistence = 1."""
|
|
direction = torch.randn(32)
|
|
direction = direction / direction.norm()
|
|
directions = {i: direction.clone() for i in range(5)}
|
|
|
|
analyzer = CrossLayerAlignmentAnalyzer()
|
|
result = analyzer.analyze(directions)
|
|
|
|
assert isinstance(result, CrossLayerResult)
|
|
assert result.direction_persistence_score > 0.99
|
|
assert result.mean_adjacent_cosine > 0.99
|
|
assert result.total_geodesic_distance < 0.01
|
|
|
|
def test_orthogonal_directions(self):
|
|
"""Orthogonal directions should give low persistence."""
|
|
# Create orthogonal directions via QR decomposition
|
|
torch.manual_seed(42)
|
|
M = torch.randn(5, 32)
|
|
Q, _ = torch.linalg.qr(M.T)
|
|
directions = {i: Q[:, i] for i in range(5)}
|
|
|
|
analyzer = CrossLayerAlignmentAnalyzer()
|
|
result = analyzer.analyze(directions)
|
|
|
|
assert result.direction_persistence_score < 0.3
|
|
assert result.mean_adjacent_cosine < 0.3
|
|
|
|
def test_cluster_detection(self):
|
|
"""Should detect clusters of similar directions."""
|
|
torch.manual_seed(42)
|
|
# Create two clusters
|
|
d1 = torch.randn(32)
|
|
d1 = d1 / d1.norm()
|
|
d2 = torch.randn(32)
|
|
d2 = d2 / d2.norm()
|
|
|
|
directions = {
|
|
0: d1, 1: d1 + 0.01 * torch.randn(32),
|
|
2: d1 + 0.01 * torch.randn(32),
|
|
3: d2, 4: d2 + 0.01 * torch.randn(32),
|
|
}
|
|
# Normalize
|
|
directions = {k: v / v.norm() for k, v in directions.items()}
|
|
|
|
analyzer = CrossLayerAlignmentAnalyzer(cluster_threshold=0.9)
|
|
result = analyzer.analyze(directions)
|
|
|
|
# Should find at least 2 clusters
|
|
assert result.cluster_count >= 2
|
|
|
|
def test_empty_input(self):
|
|
"""Should handle empty input gracefully."""
|
|
analyzer = CrossLayerAlignmentAnalyzer()
|
|
result = analyzer.analyze({})
|
|
assert result.layer_indices == []
|
|
assert result.cluster_count == 0
|
|
|
|
def test_single_layer(self):
|
|
"""Single layer should work fine."""
|
|
analyzer = CrossLayerAlignmentAnalyzer()
|
|
result = analyzer.analyze({5: torch.randn(16)})
|
|
assert result.layer_indices == [5]
|
|
assert result.direction_persistence_score == 1.0
|
|
|
|
def test_strong_layers_filter(self):
|
|
"""Should only analyze specified strong layers."""
|
|
directions = {i: torch.randn(16) for i in range(10)}
|
|
analyzer = CrossLayerAlignmentAnalyzer()
|
|
result = analyzer.analyze(directions, strong_layers=[2, 5, 7])
|
|
assert result.layer_indices == [2, 5, 7]
|
|
assert result.cosine_matrix.shape == (3, 3)
|
|
|
|
def test_cosine_matrix_symmetry(self):
|
|
"""Cosine matrix should be symmetric."""
|
|
torch.manual_seed(42)
|
|
directions = {i: torch.randn(16) for i in range(4)}
|
|
analyzer = CrossLayerAlignmentAnalyzer()
|
|
result = analyzer.analyze(directions)
|
|
diff = (result.cosine_matrix - result.cosine_matrix.T).abs().max().item()
|
|
assert diff < 1e-5
|
|
|
|
def test_cosine_matrix_diagonal_ones(self):
|
|
"""Diagonal of cosine matrix should be 1.0."""
|
|
torch.manual_seed(42)
|
|
directions = {i: torch.randn(16) for i in range(4)}
|
|
analyzer = CrossLayerAlignmentAnalyzer()
|
|
result = analyzer.analyze(directions)
|
|
for i in range(4):
|
|
assert abs(result.cosine_matrix[i, i].item() - 1.0) < 1e-4
|
|
|
|
def test_angular_drift_monotonic(self):
|
|
"""Angular drift should be monotonically non-decreasing."""
|
|
torch.manual_seed(42)
|
|
directions = {i: torch.randn(16) for i in range(6)}
|
|
analyzer = CrossLayerAlignmentAnalyzer()
|
|
result = analyzer.analyze(directions)
|
|
for i in range(len(result.angular_drift) - 1):
|
|
assert result.angular_drift[i + 1] >= result.angular_drift[i] - 1e-6
|
|
|
|
def test_format_report(self):
|
|
"""Format report should produce a non-empty string."""
|
|
torch.manual_seed(42)
|
|
directions = {i: torch.randn(16) for i in range(4)}
|
|
analyzer = CrossLayerAlignmentAnalyzer()
|
|
result = analyzer.analyze(directions)
|
|
report = CrossLayerAlignmentAnalyzer.format_report(result)
|
|
assert "Cross-Layer" in report
|
|
assert "persistence" in report
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ActivationProbe
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestActivationProbe:
|
|
def test_clean_elimination(self):
|
|
"""After removing direction, projections should be near-zero."""
|
|
torch.manual_seed(42)
|
|
hidden_dim = 32
|
|
refusal_dir = torch.randn(hidden_dim)
|
|
refusal_dir = refusal_dir / refusal_dir.norm()
|
|
|
|
# "Post-abliteration" activations: direction has been removed
|
|
harmless = [torch.randn(hidden_dim) for _ in range(10)]
|
|
harmful = [torch.randn(hidden_dim) for _ in range(10)]
|
|
# Both sets are random, no refusal signal => gap should be small
|
|
|
|
probe = ActivationProbe()
|
|
result = probe.probe_layer(harmful, harmless, refusal_dir)
|
|
assert abs(result.projection_gap) < 1.0
|
|
assert result.separation_d_prime < 2.0
|
|
|
|
def test_residual_detection(self):
|
|
"""Should detect residual refusal signal when direction wasn't removed."""
|
|
torch.manual_seed(42)
|
|
hidden_dim = 32
|
|
refusal_dir = torch.randn(hidden_dim)
|
|
refusal_dir = refusal_dir / refusal_dir.norm()
|
|
|
|
harmless = [torch.randn(hidden_dim) for _ in range(10)]
|
|
# Harmful still has strong refusal direction component
|
|
harmful = [h + 5.0 * refusal_dir for h in harmless]
|
|
|
|
probe = ActivationProbe()
|
|
result = probe.probe_layer(harmful, harmless, refusal_dir)
|
|
assert abs(result.projection_gap) > 1.0
|
|
assert result.separation_d_prime > 2.0
|
|
|
|
def test_probe_all_layers(self):
|
|
"""Should compute aggregate metrics across layers."""
|
|
torch.manual_seed(42)
|
|
hidden_dim = 16
|
|
n_layers = 4
|
|
|
|
harmful_acts = {}
|
|
harmless_acts = {}
|
|
refusal_dirs = {}
|
|
|
|
for layer in range(n_layers):
|
|
harmful_acts[layer] = [torch.randn(hidden_dim) for _ in range(5)]
|
|
harmless_acts[layer] = [torch.randn(hidden_dim) for _ in range(5)]
|
|
d = torch.randn(hidden_dim)
|
|
refusal_dirs[layer] = d / d.norm()
|
|
|
|
probe = ActivationProbe()
|
|
result = probe.probe_all_layers(harmful_acts, harmless_acts, refusal_dirs)
|
|
|
|
assert isinstance(result, ProbeResult)
|
|
assert len(result.per_layer) == n_layers
|
|
assert 0 <= result.refusal_elimination_score <= 1.0
|
|
assert result.mean_projection_gap >= 0
|
|
|
|
def test_res_score_range(self):
|
|
"""RES should always be between 0 and 1."""
|
|
torch.manual_seed(42)
|
|
for seed in range(5):
|
|
torch.manual_seed(seed)
|
|
harmful = {0: [torch.randn(8) for _ in range(3)]}
|
|
harmless = {0: [torch.randn(8) for _ in range(3)]}
|
|
dirs = {0: torch.randn(8)}
|
|
dirs[0] = dirs[0] / dirs[0].norm()
|
|
|
|
probe = ActivationProbe()
|
|
result = probe.probe_all_layers(harmful, harmless, dirs)
|
|
assert 0 <= result.refusal_elimination_score <= 1.0
|
|
|
|
def test_format_report(self):
|
|
"""Format report should produce readable output."""
|
|
torch.manual_seed(42)
|
|
harmful = {0: [torch.randn(8) for _ in range(3)]}
|
|
harmless = {0: [torch.randn(8) for _ in range(3)]}
|
|
dirs = {0: torch.randn(8)}
|
|
|
|
probe = ActivationProbe()
|
|
result = probe.probe_all_layers(harmful, harmless, dirs)
|
|
report = ActivationProbe.format_report(result)
|
|
assert "Refusal Elimination Score" in report
|
|
|
|
def test_empty_input(self):
|
|
"""Should handle empty input gracefully."""
|
|
probe = ActivationProbe()
|
|
result = probe.probe_all_layers({}, {}, {})
|
|
assert result.refusal_elimination_score == 0.0
|
|
assert len(result.per_layer) == 0
|