mirror of
https://github.com/elder-plinius/OBLITERATUS.git
synced 2026-04-29 14:46:15 +02:00
170 lines
6.4 KiB
Python
170 lines
6.4 KiB
Python
"""Tests for defense robustness evaluation framework."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import MagicMock
|
|
|
|
import torch
|
|
|
|
from obliteratus.analysis.defense_robustness import (
|
|
DefenseProfile,
|
|
DefenseRobustnessEvaluator,
|
|
EntanglementMap,
|
|
SelfRepairResult,
|
|
)
|
|
|
|
|
|
def _make_mock_pipeline(n_layers=6, hidden_dim=16, n_prompts=5):
|
|
"""Create a mock pipeline with refusal directions and activations."""
|
|
pipeline = MagicMock()
|
|
pipeline.model_name = "test-model"
|
|
|
|
# Generate refusal directions (some strong, some weak)
|
|
torch.manual_seed(42)
|
|
directions = {}
|
|
for i in range(n_layers):
|
|
d = torch.randn(hidden_dim)
|
|
directions[i] = d / d.norm()
|
|
pipeline.refusal_directions = directions
|
|
|
|
# Generate activations with a planted refusal signal in middle layers
|
|
harmful_means = {}
|
|
harmless_means = {}
|
|
harmful_acts = {}
|
|
harmless_acts = {}
|
|
|
|
for i in range(n_layers):
|
|
base = torch.randn(hidden_dim)
|
|
harmless_means[i] = base.unsqueeze(0)
|
|
|
|
# Middle layers have stronger refusal signal
|
|
signal_strength = 3.0 if 2 <= i <= 4 else 0.5
|
|
harmful_means[i] = (base + signal_strength * directions[i]).unsqueeze(0)
|
|
|
|
harmful_acts[i] = [base + signal_strength * directions[i] + torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)]
|
|
harmless_acts[i] = [base + torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)]
|
|
|
|
pipeline._harmful_means = harmful_means
|
|
pipeline._harmless_means = harmless_means
|
|
pipeline._harmful_acts = harmful_acts
|
|
pipeline._harmless_acts = harmless_acts
|
|
|
|
return pipeline
|
|
|
|
|
|
class TestDefenseProfile:
    """Checks on the object returned by ``profile_defense``."""

    @staticmethod
    def _default_profile():
        """Profile the default mock pipeline and return the result."""
        evaluator = DefenseRobustnessEvaluator(_make_mock_pipeline())
        return evaluator.profile_defense()

    def test_profile_generates(self):
        """The profile has the expected type, name and positive strengths."""
        result = self._default_profile()
        robustness_levels = ("low", "medium", "high", "very_high")

        assert isinstance(result, DefenseProfile)
        assert result.model_name == "test-model"
        assert result.refusal_layer_spread > 0
        assert result.mean_refusal_strength > 0
        assert result.max_refusal_strength >= result.mean_refusal_strength
        assert result.estimated_robustness in robustness_levels

    def test_alignment_type_estimate(self):
        """The mock pipeline must not yield an "unknown" alignment type."""
        result = self._default_profile()
        assert result.alignment_type_estimate != "unknown"

    def test_empty_pipeline(self):
        """With no refusal directions the robustness is "unknown"."""
        bare = MagicMock(model_name="empty", refusal_directions={})
        result = DefenseRobustnessEvaluator(bare).profile_defense()
        assert result.estimated_robustness == "unknown"

    def test_concentration_bounded(self):
        """Refusal concentration stays inside [0, 1]."""
        result = self._default_profile()
        # Gini coefficient should be between 0 and 1
        assert 0 <= result.refusal_concentration <= 1.0

    def test_self_repair_bounded(self):
        """Self-repair estimate stays inside [0, 1]."""
        result = self._default_profile()
        assert 0 <= result.self_repair_estimate <= 1.0

    def test_format_report(self):
        """The formatted report contains its title and the model name."""
        text = DefenseRobustnessEvaluator.format_defense_profile(
            self._default_profile()
        )
        assert "Defense Robustness" in text
        assert "test-model" in text
|
|
|
|
|
|
class TestSelfRepair:
    """Checks on the result of ``measure_self_repair``."""

    @staticmethod
    def _measure(layer_idx, **pipeline_kwargs):
        """Run ``measure_self_repair`` on a mock pipeline.

        ``pipeline_kwargs`` are forwarded to ``_make_mock_pipeline``.
        """
        evaluator = DefenseRobustnessEvaluator(
            _make_mock_pipeline(**pipeline_kwargs)
        )
        return evaluator.measure_self_repair(layer_idx=layer_idx)

    def test_self_repair_measurement(self):
        """The result has the expected type, layer index and bounds."""
        outcome = self._measure(3)

        assert isinstance(outcome, SelfRepairResult)
        assert outcome.layer_idx == 3
        assert outcome.original_refusal_strength >= 0
        assert 0 <= outcome.repair_ratio <= 1.0
        assert len(outcome.compensating_layers) > 0
        assert 3 not in outcome.compensating_layers  # shouldn't list itself

    def test_repair_ratio_high_for_distributed(self):
        """Distributed refusal should have high repair ratio."""
        outcome = self._measure(3, n_layers=10)
        # With distributed signal, removing one layer leaves much compensation
        assert outcome.repair_ratio > 0.5

    def test_format_self_repair(self):
        """The formatted report contains its title and the layer label."""
        text = DefenseRobustnessEvaluator.format_self_repair(self._measure(2))
        assert "Self-Repair" in text
        assert "Layer 2" in text
|
|
|
|
|
|
class TestEntanglement:
    """Checks on the ``EntanglementMap`` returned by ``map_entanglement``."""

    @staticmethod
    def _default_map():
        """Map entanglement for the default mock pipeline."""
        evaluator = DefenseRobustnessEvaluator(_make_mock_pipeline())
        return evaluator.map_entanglement()

    def test_entanglement_map(self):
        """The map has the expected type, non-empty lists and a bounded score."""
        mapping = self._default_map()

        assert isinstance(mapping, EntanglementMap)
        assert len(mapping.layer_entanglement) > 0
        assert 0 <= mapping.overall_entanglement <= 1.0
        assert len(mapping.most_entangled_layers) > 0
        assert len(mapping.least_entangled_layers) > 0

    def test_capability_sensitivity_keys(self):
        """Capability sensitivity reports exactly the five expected keys."""
        mapping = self._default_map()

        wanted = {
            "factual_knowledge",
            "reasoning",
            "language_fluency",
            "instruction_following",
            "math",
        }
        assert set(mapping.capability_sensitivity.keys()) == wanted

    def test_math_most_sensitive(self):
        """Math should be estimated as the most sensitive capability."""
        mapping = self._default_map()
        # The comparison is only made when overall entanglement is positive.
        if mapping.overall_entanglement > 0:
            sensitivity = mapping.capability_sensitivity
            assert sensitivity["math"] >= sensitivity["language_fluency"]

    def test_format_entanglement(self):
        """The formatted report contains its title and the "math" key."""
        text = DefenseRobustnessEvaluator.format_entanglement(
            self._default_map()
        )
        assert "Entanglement" in text
        assert "math" in text
|