# Mirror of https://github.com/elder-plinius/OBLITERATUS.git
# Synced 2026-04-27 21:56:49 +02:00 -- 386 lines, 13 KiB, Python
"""Tests for the Analysis-Informed Abliteration Pipeline."""
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
import pytest
|
|
import torch
|
|
|
|
from obliteratus.informed_pipeline import (
|
|
AnalysisInsights,
|
|
InformedAbliterationPipeline,
|
|
InformedPipelineReport,
|
|
INFORMED_METHOD,
|
|
)
|
|
from obliteratus.abliterate import METHODS
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fixtures
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.fixture
def insights():
    """Provide a fresh ``AnalysisInsights`` constructed with no arguments."""
    default_insights = AnalysisInsights()
    return default_insights
|
|
|
|
|
|
@pytest.fixture
def pipeline(tmp_path):
    """Build an ``InformedAbliterationPipeline`` for tests (no model loaded).

    The output directory lives under pytest's per-test ``tmp_path``.
    """
    output_dir = str(tmp_path / "test_informed")
    return InformedAbliterationPipeline(
        model_name="test-model",
        output_dir=output_dir,
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# AnalysisInsights
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestAnalysisInsights:
    """Checks on the field values of a default-constructed ``AnalysisInsights``."""

    def test_default_values(self, insights):
        """Detection, geometry and recommendation fields hold their defaults."""
        # Boolean flags are compared by identity so a merely falsy value fails.
        for flag_name in ("cone_is_polyhedral", "use_sparse_surgery"):
            assert getattr(insights, flag_name) is False, flag_name

        expected_defaults = {
            "detected_alignment_method": "unknown",
            "alignment_confidence": 0.0,
            "cone_dimensionality": 1.0,
            "mean_pairwise_cosine": 1.0,
            "per_category_directions": {},
            "direction_specificity": {},
            "cluster_count": 0,
            "direction_persistence": 0.0,
            "recommended_n_directions": 4,
            "recommended_regularization": 0.0,
            "recommended_refinement_passes": 2,
            "recommended_layers": [],
            "skip_layers": [],
        }
        for field_name, default in expected_defaults.items():
            assert getattr(insights, field_name) == default, field_name

    def test_default_robustness(self, insights):
        """Robustness and entanglement fields hold their defaults."""
        expected_defaults = {
            "estimated_robustness": "unknown",
            "self_repair_estimate": 0.0,
            "entanglement_score": 0.0,
            "entangled_layers": [],
            "clean_layers": [],
        }
        for field_name, default in expected_defaults.items():
            assert getattr(insights, field_name) == default, field_name
|
|
|
|
|
|
class TestInformedPipelineReport:
    """Checks on an ``InformedPipelineReport`` built from insights alone."""

    def test_default_report(self):
        """Durations, refusal rate and pass count are zero; stages are empty."""
        report = InformedPipelineReport(insights=AnalysisInsights())

        for metric_name in (
            "analysis_duration",
            "total_duration",
            "final_refusal_rate",
        ):
            assert getattr(report, metric_name) == 0.0, metric_name

        assert report.ouroboros_passes == 0
        assert report.stages == []
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Method preset
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestInformedMethod:
    """Checks on the "informed" method preset and its standalone definition."""

    def test_informed_method_in_abliterate_methods(self):
        """``METHODS`` has an "informed" entry with the expected flags enabled."""
        assert "informed" in METHODS
        preset = METHODS["informed"]

        enabled_options = (
            "norm_preserve",
            "project_biases",
            "use_chat_template",
            "use_whitened_svd",
            "true_iterative_refinement",
        )
        for option in enabled_options:
            # Identity check: the preset must store a real ``True``.
            assert preset[option] is True, option

    def test_informed_method_standalone(self):
        """``INFORMED_METHOD`` carries the expected label and core settings."""
        preset = INFORMED_METHOD
        assert preset["label"] == "Informed (Analysis-Guided)"
        assert preset["n_directions"] == 4
        assert preset["norm_preserve"] is True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Pipeline initialization
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestPipelineInit:
    """Checks on the state of a newly constructed pipeline."""

    def test_method_set_to_informed(self, pipeline):
        """The pipeline reports "informed" as its method."""
        assert pipeline.method == "informed"

    def test_default_analysis_flags(self, pipeline):
        """Every analysis-stage flag is enabled by default."""
        stage_flags = (
            "_run_cone",
            "_run_alignment",
            "_run_cross_layer",
            "_run_sparse",
            "_run_defense",
        )
        for flag_name in stage_flags:
            assert getattr(pipeline, flag_name) is True, flag_name

    def test_ouroboros_defaults(self, pipeline):
        """Default Ouroboros threshold is 0.5 with at most 3 passes."""
        assert pipeline._ouroboros_threshold == 0.5
        assert pipeline._max_ouroboros_passes == 3

    def test_entanglement_gate(self, pipeline):
        """Default entanglement gate is 0.8."""
        assert pipeline._entanglement_gate == 0.8

    def test_inherits_base_pipeline(self, pipeline):
        """Base-pipeline options are all switched on for the informed pipeline."""
        base_options = (
            "norm_preserve",
            "project_biases",
            "use_chat_template",
            "use_whitened_svd",
            "true_iterative_refinement",
        )
        for option in base_options:
            assert getattr(pipeline, option) is True, option

    def test_custom_flags(self):
        """Constructor keyword overrides are stored on the matching attributes."""
        overrides = {
            "run_cone_analysis": False,
            "run_alignment_detection": False,
            "ouroboros_threshold": 0.3,
            "max_ouroboros_passes": 5,
            "entanglement_gate": 0.9,
        }
        custom = InformedAbliterationPipeline(model_name="test", **overrides)

        assert custom._run_cone is False
        assert custom._run_alignment is False
        assert custom._ouroboros_threshold == 0.3
        assert custom._max_ouroboros_passes == 5
        assert custom._entanglement_gate == 0.9
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Configuration derivation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestConfigurationDerivation:
    """Test the _derive_configuration logic with various insights.

    Each test seeds ``p._insights`` with specific values, calls
    ``p._derive_configuration()``, and then checks the pipeline settings or
    the derived insight fields. Derived ``regularization`` values are
    compared with ``pytest.approx`` because they may be produced by float
    arithmetic (e.g. a base value plus an entanglement bump).
    """

    def _make_pipeline_with_insights(self, **kwargs):
        """Return a pipeline with a no-op logger and overridden insights.

        Args:
            **kwargs: Insight field names mapped to the values to assign on
                ``p._insights``.

        Returns:
            The constructed ``InformedAbliterationPipeline``.

        Raises:
            AttributeError: If a keyword does not name an existing attribute
                of ``p._insights``. A bare ``setattr`` would silently create
                a new attribute, so a misspelled field name would leave the
                real field at its default and the test would no longer
                exercise what it claims to.
        """
        p = InformedAbliterationPipeline(
            model_name="test",
            on_log=lambda m: None,
        )
        for k, v in kwargs.items():
            if not hasattr(p._insights, k):
                raise AttributeError(f"insights object has no field {k!r}")
            setattr(p._insights, k, v)
        return p

    def test_polyhedral_cone_more_directions(self):
        """A polyhedral cone of dimensionality 3.5 yields 7 directions."""
        p = self._make_pipeline_with_insights(
            cone_is_polyhedral=True,
            cone_dimensionality=3.5,
        )
        p._derive_configuration()
        # Polyhedral with dim 3.5 → n_dirs = max(4, min(8, int(3.5*2))) = 7
        assert p.n_directions == 7

    def test_linear_cone_fewer_directions(self):
        """A linear cone of dimensionality 1.0 yields 2 directions."""
        p = self._make_pipeline_with_insights(
            cone_is_polyhedral=False,
            cone_dimensionality=1.0,
        )
        p._derive_configuration()
        # Linear with dim 1.0 → n_dirs = max(1, min(4, int(1.0+1))) = 2
        assert p.n_directions == 2

    def test_dpo_zero_regularization(self):
        """DPO with low entanglement derives zero regularization."""
        p = self._make_pipeline_with_insights(
            detected_alignment_method="dpo",
            entanglement_score=0.1,
        )
        p._derive_configuration()
        assert p.regularization == pytest.approx(0.0)

    def test_rlhf_moderate_regularization(self):
        """RLHF with low entanglement derives regularization 0.15."""
        p = self._make_pipeline_with_insights(
            detected_alignment_method="rlhf",
            entanglement_score=0.2,
        )
        p._derive_configuration()
        assert p.regularization == pytest.approx(0.15)

    def test_cai_regularization(self):
        """CAI with low entanglement derives regularization 0.2."""
        p = self._make_pipeline_with_insights(
            detected_alignment_method="cai",
            entanglement_score=0.2,
        )
        p._derive_configuration()
        assert p.regularization == pytest.approx(0.2)

    def test_sft_low_regularization(self):
        """SFT with low entanglement derives regularization 0.05."""
        p = self._make_pipeline_with_insights(
            detected_alignment_method="sft",
            entanglement_score=0.1,
        )
        p._derive_configuration()
        assert p.regularization == pytest.approx(0.05)

    def test_high_entanglement_increases_regularization(self):
        """High entanglement raises regularization above the DPO base."""
        p = self._make_pipeline_with_insights(
            detected_alignment_method="dpo",
            entanglement_score=0.7,
        )
        p._derive_configuration()
        # DPO base = 0.0, + 0.15 for high entanglement = 0.15
        assert p.regularization == pytest.approx(0.15)

    def test_high_self_repair_more_passes(self):
        """A self-repair estimate of 0.8 derives 3 refinement passes."""
        p = self._make_pipeline_with_insights(
            self_repair_estimate=0.8,
        )
        p._derive_configuration()
        assert p.refinement_passes == 3

    def test_moderate_self_repair_two_passes(self):
        """A self-repair estimate of 0.5 derives 2 refinement passes."""
        p = self._make_pipeline_with_insights(
            self_repair_estimate=0.5,
        )
        p._derive_configuration()
        assert p.refinement_passes == 2

    def test_low_self_repair_one_pass(self):
        """A self-repair estimate of 0.2 derives 1 refinement pass."""
        p = self._make_pipeline_with_insights(
            self_repair_estimate=0.2,
        )
        p._derive_configuration()
        assert p.refinement_passes == 1

    def test_cluster_layers_used(self):
        """Layers taken from the direction clusters end up recommended."""
        p = self._make_pipeline_with_insights(
            cluster_representative_layers=[5, 10, 15],
            direction_clusters=[[3, 4, 5], [9, 10, 11], [14, 15, 16]],
        )
        p.refusal_directions = {i: torch.randn(64) for i in range(20)}
        p._derive_configuration()
        # Spot-check two of the cluster representatives.
        assert 5 in p._insights.recommended_layers
        assert 10 in p._insights.recommended_layers

    def test_entangled_layers_skipped(self):
        """An entangled layer moves from the recommended to the skip list."""
        p = self._make_pipeline_with_insights(
            cluster_representative_layers=[5, 10, 15],
            direction_clusters=[[3, 4, 5], [9, 10, 11], [14, 15, 16]],
            entangled_layers=[10],
        )
        p._derive_configuration()
        # Layer 10 should be skipped
        assert 10 not in p._insights.recommended_layers
        assert 10 in p._insights.skip_layers

    def test_sparse_surgery_enabled_when_rsi_high(self):
        """A sparsity index above the threshold turns sparse surgery on."""
        p = self._make_pipeline_with_insights(
            mean_refusal_sparsity_index=0.7,
        )
        p._sparse_threshold = 0.5
        p._derive_configuration()
        assert p._insights.use_sparse_surgery is True

    def test_sparse_surgery_disabled_when_rsi_low(self):
        """A sparsity index below the threshold leaves sparse surgery off."""
        p = self._make_pipeline_with_insights(
            mean_refusal_sparsity_index=0.3,
        )
        p._sparse_threshold = 0.5
        p._derive_configuration()
        assert p._insights.use_sparse_surgery is False

    def test_whitened_svd_for_multi_direction(self):
        """More than one derived direction keeps whitened SVD enabled."""
        p = self._make_pipeline_with_insights(
            cone_is_polyhedral=True,
            cone_dimensionality=2.5,
        )
        p._derive_configuration()
        assert p.n_directions > 1
        assert p.use_whitened_svd is True

    def test_no_whitened_svd_for_single_direction(self):
        """A single derived direction disables whitened SVD."""
        p = self._make_pipeline_with_insights(
            cone_is_polyhedral=False,
            cone_dimensionality=0.5,
        )
        p._derive_configuration()
        # dim 0.5 → max(1, min(4, int(0.5+1))) = 1
        assert p.n_directions == 1
        assert p.use_whitened_svd is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Format report
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestFormatInsights:
    """Checks on the text produced by ``format_insights``."""

    def test_format_default(self, insights):
        """Default insights render the title, an unknown method and a linear cone."""
        rendered = InformedAbliterationPipeline.format_insights(insights)
        expected_fragments = (
            "Analysis-Informed Pipeline",
            "UNKNOWN",  # detected method
            "LINEAR",  # cone type
        )
        for fragment in expected_fragments:
            assert fragment in rendered, fragment

    def test_format_polyhedral(self):
        """Polyhedral DPO insights show method, cone type and dimensionality."""
        polyhedral = AnalysisInsights(
            detected_alignment_method="dpo",
            alignment_confidence=0.85,
            cone_is_polyhedral=True,
            cone_dimensionality=3.5,
            cluster_count=4,
        )
        rendered = InformedAbliterationPipeline.format_insights(polyhedral)
        for fragment in ("DPO", "POLYHEDRAL", "3.50"):
            assert fragment in rendered, fragment

    def test_format_includes_derived_config(self, insights):
        """Recommended settings appear as ``name: value`` pairs in the output."""
        insights.recommended_n_directions = 6
        insights.recommended_regularization = 0.2
        insights.recommended_refinement_passes = 3

        rendered = InformedAbliterationPipeline.format_insights(insights)

        assert "n_directions: 6" in rendered
        assert "regularization: 0.2" in rendered
        assert "refinement_passes: 3" in rendered
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Edge cases
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestEdgeCases:
    """Boundary scenarios for ``_derive_configuration``."""

    @staticmethod
    def _quiet_pipeline():
        """Create a pipeline whose log callback discards every message.

        The leading underscore keeps pytest from collecting this as a test.
        """
        return InformedAbliterationPipeline(
            model_name="test",
            on_log=lambda m: None,
        )

    def test_no_cluster_layers_falls_back(self):
        """With no cluster representatives, no layers are recommended."""
        pipe = self._quiet_pipeline()
        pipe._insights.cluster_representative_layers = []
        pipe._derive_configuration()
        assert pipe._insights.recommended_layers == []

    def test_regularization_capped(self):
        """Derived regularization never exceeds 0.5."""
        pipe = self._quiet_pipeline()
        pipe._insights.detected_alignment_method = "cai"
        pipe._insights.entanglement_score = 0.9
        pipe._derive_configuration()
        # CAI base = 0.2, + 0.15 = 0.35, capped at 0.5
        assert pipe.regularization <= 0.5

    def test_all_layers_entangled_keeps_some(self):
        """If all cluster layers are entangled, don't skip all of them."""
        pipe = self._quiet_pipeline()
        observed = pipe._insights
        observed.cluster_representative_layers = [5]
        observed.direction_clusters = [[5]]
        observed.entangled_layers = [5]
        pipe._derive_configuration()
        # Should NOT skip the only layer
        assert 5 in pipe._insights.recommended_layers

    def test_cone_dimensionality_bounds(self):
        """Extreme cone dimensionality values are handled."""
        # One pipeline is deliberately reused for both scenarios.
        pipe = self._quiet_pipeline()

        # Very high dimensionality
        pipe._insights.cone_is_polyhedral = True
        pipe._insights.cone_dimensionality = 10.0
        pipe._derive_configuration()
        assert pipe.n_directions <= 8  # capped

        # Very low dimensionality
        pipe._insights.cone_is_polyhedral = False
        pipe._insights.cone_dimensionality = 0.1
        pipe._derive_configuration()
        assert pipe.n_directions >= 1  # at least 1
|