diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6ad4d12..d7f08f7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,7 +5,7 @@ Thanks for your interest in contributing. This document covers everything you ne ## Development Setup ```bash -git clone https://github.com/obliteratus-project/OBLITERATUS.git +git clone https://github.com/elder-plinius/OBLITERATUS.git cd OBLITERATUS pip install -e ".[dev]" ``` diff --git a/README.md b/README.md index 52e8e8c..c593e8d 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ short_description: "One-click model liberation + chat playground" Open in HF Spaces   - + Open in Colab

@@ -55,7 +55,7 @@ Built on published research from [Arditi et al. (2024)](https://arxiv.org/abs/24 obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --method advanced ``` -Or zero commands — just [open the Colab notebook](https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb) and hit Run All. +Or zero commands — just [open the Colab notebook](https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb) and hit Run All. ## What it does @@ -153,7 +153,7 @@ The `obliteratus ui` command adds a Rich terminal startup with GPU detection and ### 3. Google Colab (free GPU) -[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb) +[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb) Pick a model from the dropdown, pick a method, hit Run All. Download the result or push straight to HuggingFace Hub. Works on the free T4 tier for models up to ~8B parameters. @@ -545,7 +545,7 @@ If you use OBLITERATUS in your research, please cite: Refusal Removal in Large Language Models}, author = {{OBLITERATUS Contributors}}, year = {2026}, - url = {https://github.com/obliteratus-project/OBLITERATUS}, + url = {https://github.com/elder-plinius/OBLITERATUS}, note = {15 analysis modules, 837 tests} } ``` @@ -565,7 +565,7 @@ pytest - **Open source** — [GNU Affero General Public License v3.0](LICENSE) (AGPL-3.0). You can freely use, modify, and distribute OBLITERATUS under AGPL terms. If you run a modified version as a network service (SaaS), you must release your source code to users under the same license. 
-- **Commercial** — Organizations that cannot comply with AGPL obligations (e.g., proprietary SaaS, closed-source products, internal tools where source disclosure is not possible) can purchase a commercial license. Contact us via [GitHub Issues](https://github.com/obliteratus-project/OBLITERATUS/issues) for pricing and terms. +- **Commercial** — Organizations that cannot comply with AGPL obligations (e.g., proprietary SaaS, closed-source products, internal tools where source disclosure is not possible) can purchase a commercial license. Contact us via [GitHub Issues](https://github.com/elder-plinius/OBLITERATUS/issues) for pricing and terms. This is the same dual-licensing model used by MongoDB, Qt, Grafana, and others. diff --git a/SECURITY.md b/SECURITY.md index 80fd422..69cb1ad 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -11,7 +11,7 @@ OBLITERATUS is a mechanistic interpretability research tool. It removes refusal If you discover a security vulnerability in OBLITERATUS, please report it responsibly: 1. **Do not** open a public GitHub issue -2. Open a [private security advisory](https://github.com/obliteratus-project/OBLITERATUS/security/advisories/new) with: +2. 
Open a [private security advisory](https://github.com/elder-plinius/OBLITERATUS/security/advisories/new) with: - Description of the vulnerability - Steps to reproduce - Potential impact diff --git a/app.py b/app.py index 452125a..ba32a9b 100644 --- a/app.py +++ b/app.py @@ -115,6 +115,10 @@ _last_obliterated_label: str = "" # Counter for unique obliteration save directories _obliterate_counter: int = 0 +# Flag to suppress session_model_dd.change when obliterate programmatically +# sets the dropdown value (prevents wasteful GPU re-allocation on ZeroGPU) +_skip_session_load: bool = False + # --------------------------------------------------------------------------- # Model presets — 100+ models organized by provider # --------------------------------------------------------------------------- @@ -1459,7 +1463,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str, f" or locally: `export HF_TOKEN=hf_...`\n\n" f"Get your token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)\n\n" f"Alternatively, choose a non-gated model (those without the \U0001f512 icon).", - "", gr.update(), gr.update(), gr.update(), + "", gr.update(), gr.update(), gr.update(), gr.update(), ) return @@ -1468,14 +1472,14 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str, if not re.match(r'^[a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+$', push_to_hub): yield ( "**Error:** Invalid Hub repo format. Use `username/model-name`.", - "", gr.update(), gr.update(), gr.update(), + "", gr.update(), gr.update(), gr.update(), gr.update(), ) return if not os.environ.get("HF_TOKEN"): yield ( "**Error:** HF_TOKEN not set. Push to Hub requires a write token. 
" "Set it via `export HF_TOKEN=hf_...` or in your Space secrets.", - "", gr.update(), gr.update(), gr.update(), + "", gr.update(), gr.update(), gr.update(), gr.update(), ) return @@ -1486,7 +1490,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str, _clear_gpu() with _lock: if _state["status"] == "obliterating": - yield "**Error:** An obliteration is already in progress.", "", gr.update(), gr.update(), gr.update() + yield "**Error:** An obliteration is already in progress.", "", gr.update(), gr.update(), gr.update(), gr.update() return _state["log"] = [] _state["status"] = "obliterating" @@ -1638,9 +1642,9 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str, status_msg = f"**Obliterating\u2026** ({_elapsed()})" if len(log_lines) > last_yielded[0]: last_yielded[0] = len(log_lines) - yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update() + yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update() else: - yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update() + yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update() if time.time() - _pipeline_start > _max_pipeline_secs: log_lines.append("\nTIMEOUT: Pipeline exceeded 45-minute limit.") break @@ -1655,7 +1659,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str, err_msg = str(error_ref[0]) or repr(error_ref[0]) log_lines.append(f"\nERROR: {err_msg}") _state["log"] = log_lines - yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update() + yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update(), gr.update() return # Success — keep model in memory for chat. 
@@ -1757,7 +1761,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str, if bnb_available: log_lines.append("\nModel too large for chat at float16 — reloading in 4-bit...") last_yielded[0] = len(log_lines) - yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update() + yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update() try: from transformers import BitsAndBytesConfig bnb_cfg = BitsAndBytesConfig( @@ -1804,7 +1808,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str, else "Falling back to CPU offload..." ) last_yielded[0] = len(log_lines) - yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update() + yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update() try: offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_") model_reloaded = AutoModelForCausalLM.from_pretrained( @@ -1861,13 +1865,21 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str, f"**{model_choice}** liberated with `{method}` method. " f"Saved to `{save_dir}`. Chat requires a larger GPU." ) - # Update session dropdown directly (don't rely on .then() which can - # fail to fire on ZeroGPU after generator teardown) + # Update BOTH session dropdowns directly (don't rely on .then() which + # fails to fire on ZeroGPU after generator teardown). + # Set skip flag so the .change handler doesn't trigger a wasteful + # GPU re-allocation — the model is already loaded. 
+ global _skip_session_load + _skip_session_load = True _dd_update = gr.update( choices=_get_session_model_choices(), value=_last_obliterated_label or None, ) - yield status_msg, "\n".join(log_lines), get_chat_header(), _dd_update, metrics_card + _ab_dd_update = gr.update( + choices=_get_session_model_choices(), + value=_last_obliterated_label or None, + ) + yield status_msg, "\n".join(log_lines), get_chat_header(), _dd_update, metrics_card, _ab_dd_update except Exception as e: # Ensure status never gets stuck on "obliterating" @@ -1876,7 +1888,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str, err_msg = str(e) or repr(e) log_lines.append(f"\nERROR (post-pipeline): {err_msg}") _state["log"] = log_lines - yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update() + yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update(), gr.update() # --------------------------------------------------------------------------- @@ -2102,6 +2114,18 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()): On ZeroGPU, uses the visitor's GPU quota. """ + # Skip only when obliterate() just set the dropdown to the model it loaded — + # the flag is one-shot, and any other choice must still go through a real load. + global _skip_session_load + if _skip_session_load: + _skip_session_load = False + if choice and choice == _last_obliterated_label and _state.get("status") == "ready": + yield ( + f"**Ready!** `{choice}` is loaded — just type in the chat below.", + get_chat_header(), + ) + return + if not choice or choice not in _bench_configs: yield "**Error:** No benchmark result selected. Pick a model from the dropdown first.", "" return @@ -3727,6 +3751,7 @@ Pre-configured benchmark configurations for common research questions. 
choices=_get_session_model_choices(), label="Cached Models", info="Select a model to auto-load it for chat", + allow_custom_value=True, ) session_load_status = gr.Markdown("") @@ -3779,6 +3804,7 @@ See exactly how abliteration changes model behavior on the same prompt. choices=_get_session_model_choices(), label="Cached Models", info="Select a model to auto-load it for A/B comparison", + allow_custom_value=True, ) ab_session_load_status = gr.Markdown("") @@ -4125,8 +4151,8 @@ Built on the shoulders of: ### Links -- [GitHub](https://github.com/obliteratus-project/OBLITERATUS) -- [Paper](https://github.com/obliteratus-project/OBLITERATUS/tree/main/paper) +- [GitHub](https://github.com/elder-plinius/OBLITERATUS) +- [Paper](https://github.com/elder-plinius/OBLITERATUS/tree/main/paper) """) # Wire method dropdown → auto-update advanced settings @@ -4192,28 +4218,27 @@ Built on the shoulders of: ).then(fn=_get_vram_html, outputs=[vram_display]) # Wire obliterate button (after all tabs so chat_status is defined) - # session_model_dd is a direct output (4th) so the dropdown updates - # reliably even on ZeroGPU where .then() may not fire after generator teardown. + # Both session_model_dd (4th) and ab_session_model_dd (6th) are direct + # outputs so the dropdowns update reliably even on ZeroGPU where .then() + # may not fire after generator teardown. 
obliterate_btn.click( fn=obliterate, inputs=[model_dd, method_dd, hub_repo, prompt_vol_dd, dataset_dd, custom_harmful_tb, custom_harmless_tb] + _adv_controls, - outputs=[status_md, log_box, chat_status, session_model_dd, metrics_md], + outputs=[status_md, log_box, chat_status, session_model_dd, metrics_md, ab_session_model_dd], ).then( - fn=lambda: ( - gr.update(choices=_get_session_model_choices()), - _get_vram_html(), - ), - outputs=[ab_session_model_dd, vram_display], + fn=lambda: _get_vram_html(), + outputs=[vram_display], ) # Wire session model auto-loading (Chat tab dropdown change) + # Always pass choices + value together so ZeroGPU doesn't hit stale choices session_model_dd.change( fn=load_bench_into_chat, inputs=[session_model_dd], outputs=[session_load_status, chat_status], ).then( - fn=lambda v: (gr.update(value=v), _get_vram_html()), + fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()), inputs=[session_model_dd], outputs=[ab_session_model_dd, vram_display], ) @@ -4224,7 +4249,7 @@ Built on the shoulders of: inputs=[ab_session_model_dd], outputs=[ab_session_load_status, chat_status], ).then( - fn=lambda v: (gr.update(value=v), _get_vram_html()), + fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()), inputs=[ab_session_model_dd], outputs=[session_model_dd, vram_display], ) diff --git a/docs/index.html b/docs/index.html index 2062a92..180c798 100644 --- a/docs/index.html +++ b/docs/index.html @@ -1095,7 +1095,7 @@

> Quickstart: Free a Model

# 1. get the liberation toolkit
- $ git clone https://github.com/obliteratus-project/OBLITERATUS
+ $ git clone https://github.com/elder-plinius/OBLITERATUS
$ cd OBLITERATUS
$ pip install -e .

# 2. interactive mode (guided liberation)
@@ -1154,7 +1154,7 @@

Concept Cone Geometry [NOVEL]

- Analyzes whether different harm categories (weapons, cyber, drugs, etc.) share a single refusal direction or have distinct mechanisms. Computes cone solid angles, Direction Specificity Index, and polyhedral classification. Based on Gurnee & Nanda (ICML 2025) with novel extensions. + Analyzes whether different harm categories (weapons, cyber, drugs, etc.) share a single refusal direction or have distinct mechanisms. Computes cone solid angles, Direction Specificity Index, and polyhedral classification. Based on Wollschläger et al. (ICML 2025) with novel extensions.

@@ -1397,7 +1397,7 @@
▸ COLAB NOTEBOOK
- OPEN IN COLAB diff --git a/hf-spaces/README.md b/hf-spaces/README.md index 08498b7..8c21308 100644 --- a/hf-spaces/README.md +++ b/hf-spaces/README.md @@ -50,7 +50,7 @@ Logged-in HuggingFace users get free GPU quota. For more quota, upgrade to [HF P ## Run locally (same UI, your own GPU) ```bash -git clone https://github.com/obliteratus-project/OBLITERATUS +git clone https://github.com/elder-plinius/OBLITERATUS cd OBLITERATUS pip install -e ".[spaces]" @@ -73,5 +73,5 @@ No GPU hardware selection needed — ZeroGPU handles allocation automatically. ## Links -- [GitHub](https://github.com/obliteratus-project/OBLITERATUS) -- [Paper](https://github.com/obliteratus-project/OBLITERATUS/tree/main/paper) +- [GitHub](https://github.com/elder-plinius/OBLITERATUS) +- [Paper](https://github.com/elder-plinius/OBLITERATUS/tree/main/paper) diff --git a/notebooks/abliterate.ipynb b/notebooks/abliterate.ipynb index 29e4b68..52ff463 100644 --- a/notebooks/abliterate.ipynb +++ b/notebooks/abliterate.ipynb @@ -53,7 +53,7 @@ "id": "install" }, "outputs": [], - "source": "!pip install -q git+https://github.com/obliteratus-project/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1024**3:.1f} GB\")" + "source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\")" }, { "cell_type": "markdown", diff --git a/obliteratus/abliterate.py b/obliteratus/abliterate.py index cded82f..df010d5 100644 
--- a/obliteratus/abliterate.py +++ b/obliteratus/abliterate.py @@ -4010,6 +4010,11 @@ class AbliterationPipeline: f"Projecting packed quantized data would silently corrupt the model. " f"Original error: {e}" ) + # Some architectures store weights as non-float types (e.g. uint8 from + # custom quantization schemes). Projections require float math, so + # convert and treat as "quantized" so the caller writes back properly. + if not weight.data.is_floating_point(): + return weight.data.to(torch.float32), True return weight.data, False @staticmethod @@ -4049,10 +4054,20 @@ class AbliterationPipeline: ) return + # ── Non-float weight (e.g. uint8 from custom quantization) ───── + # If the original weight isn't a bitsandbytes/GPTQ/AWQ param, just + # replace with the float version so projections are preserved. + weight = proj_module.weight + if not AbliterationPipeline._is_quantized_param(weight): + proj_module.weight = nn.Parameter( + W_modified.to(device=weight.device), + requires_grad=weight.requires_grad, + ) + return + # ── bitsandbytes re-quantization ────────────────────────── try: import bitsandbytes as bnb - weight = proj_module.weight quantized, new_state = bnb.functional.quantize_4bit( W_modified.to(weight.device), quant_type=getattr(weight, "quant_type", "nf4"), @@ -4087,7 +4102,8 @@ class AbliterationPipeline: norms: dict[str, float] = {} for param_name, param in layer.named_parameters(): if param_name.endswith(".weight"): - norms[param_name] = param.data.norm().item() + data = param.data.float() if not param.data.is_floating_point() else param.data + norms[param_name] = data.norm().item() return norms @staticmethod @@ -4106,7 +4122,8 @@ class AbliterationPipeline: continue original_norm = saved_norms[param_name] if original_norm > 0: - new_norm = param.data.norm().item() + data = param.data.float() if not param.data.is_floating_point() else param.data + new_norm = data.norm().item() if math.isnan(new_norm) or math.isinf(new_norm) or new_norm == 0: continue 
# Skip — weight is degenerate after projection if abs(new_norm - original_norm) > 1e-6: @@ -4294,6 +4311,10 @@ class AbliterationPipeline: continue else: data = param.data + # Non-float (e.g. uint8) fused params need float conversion + if not data.is_floating_point(): + data = data.float() + is_quantized = True # ensure write-back replaces param if data.dim() < 3: continue diff --git a/pyproject.toml b/pyproject.toml index 9cf1ac4..1111e46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,9 +38,9 @@ dependencies = [ ] [project.urls] -"Homepage" = "https://github.com/obliteratus-project/OBLITERATUS" -"Repository" = "https://github.com/obliteratus-project/OBLITERATUS" -"Bug Tracker" = "https://github.com/obliteratus-project/OBLITERATUS/issues" +"Homepage" = "https://github.com/elder-plinius/OBLITERATUS" +"Repository" = "https://github.com/elder-plinius/OBLITERATUS" +"Bug Tracker" = "https://github.com/elder-plinius/OBLITERATUS/issues" [project.optional-dependencies] dev = ["pytest>=7.0", "pytest-cov", "ruff", "mypy"] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_abliterate.py b/tests/test_abliterate.py new file mode 100644 index 0000000..4ca0ba3 --- /dev/null +++ b/tests/test_abliterate.py @@ -0,0 +1,2634 @@ +"""Tests for the SOTA abliteration pipeline.""" + +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +import torch +from transformers import GPT2Config, GPT2LMHeadModel + +from obliteratus.abliterate import ( + HARMFUL_PROMPTS, + HARMLESS_PROMPTS, + METHODS, + STAGES, + AbliterationPipeline, + PipelineStage, + StageResult, +) +from obliteratus.models.loader import ModelHandle + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +def _make_tiny_handle(): + """Create a minimal 
ModelHandle with a tiny GPT-2 for testing.""" + config = GPT2Config( + vocab_size=1000, + n_positions=128, + n_embd=64, + n_layer=4, + n_head=2, + n_inner=256, + ) + model = GPT2LMHeadModel(config) + model.eval() + + tokenizer = MagicMock() + tokenizer.pad_token = "" + tokenizer.eos_token = "" + tokenizer.return_value = { + "input_ids": torch.randint(0, 1000, (1, 10)), + "attention_mask": torch.ones(1, 10, dtype=torch.long), + } + tokenizer.decode.return_value = "The capital of France is Paris, a beautiful city" + + handle = ModelHandle( + model=model, + tokenizer=tokenizer, + config=config, + model_name="gpt2-test", + task="causal_lm", + ) + handle.snapshot() + return handle + + +def _make_varied_tokenizer(handle): + """Set up a tokenizer mock that returns different tokens per call.""" + call_count = [0] + def mock_tokenizer(prompt, **kwargs): + call_count[0] += 1 + torch.manual_seed(call_count[0]) + return { + "input_ids": torch.randint(0, 1000, (1, 5)), + "attention_mask": torch.ones(1, 5, dtype=torch.long), + } + handle.tokenizer.side_effect = mock_tokenizer + + +@pytest.fixture +def handle(): + return _make_tiny_handle() + + +# --------------------------------------------------------------------------- +# Data & stage definitions +# --------------------------------------------------------------------------- + +class TestPrompts: + def test_harmful_prompts_expanded(self): + assert len(HARMFUL_PROMPTS) >= 99 + + def test_harmless_prompts_expanded(self): + assert len(HARMLESS_PROMPTS) >= 99 + + def test_prompt_lists_same_length(self): + assert len(HARMFUL_PROMPTS) == len(HARMLESS_PROMPTS) + + def test_prompt_count_512(self): + """512 prompts across 7 severity tiers.""" + assert len(HARMFUL_PROMPTS) == 512 + assert len(HARMLESS_PROMPTS) == 512 + + def test_prompt_volume_slicing(self): + """Slicing at standard volumes gives correct counts.""" + for n in (33, 66, 99, 256, 512): + assert len(HARMFUL_PROMPTS[:n]) == n + assert len(HARMLESS_PROMPTS[:n]) == n + + +class 
TestStages: + def test_six_stages(self): + assert len(STAGES) == 6 + + def test_stage_keys(self): + keys = [s.key for s in STAGES] + assert keys == ["summon", "probe", "distill", "excise", "verify", "rebirth"] + + def test_stage_dataclass(self): + stage = PipelineStage(key="test", name="TEST", description="A test stage") + assert stage.key == "test" + assert stage.name == "TEST" + + def test_stage_result_defaults(self): + result = StageResult(stage="test", status="running") + assert result.message == "" + assert result.duration == 0.0 + assert result.details == {} + + +# --------------------------------------------------------------------------- +# Method presets +# --------------------------------------------------------------------------- + +class TestMethods: + def test_methods_exist(self): + assert set(METHODS.keys()) == {"basic", "advanced", "aggressive", "informed", "surgical", "inverted", "nuclear", "optimized", "failspy", "gabliteration", "heretic", "rdo", "spectral_cascade"} + + def test_basic_single_direction(self): + cfg = METHODS["basic"] + assert cfg["n_directions"] == 1 + assert cfg["norm_preserve"] is False + assert cfg["regularization"] == 0.0 + assert cfg["refinement_passes"] == 1 + + def test_advanced_multi_direction(self): + cfg = METHODS["advanced"] + assert cfg["n_directions"] > 1 + assert cfg["norm_preserve"] is True + assert cfg["regularization"] > 0 + assert cfg["refinement_passes"] >= 2 + + def test_aggressive_full_gabliteration(self): + cfg = METHODS["aggressive"] + assert cfg["n_directions"] >= 8 + assert cfg["norm_preserve"] is True + assert cfg["refinement_passes"] >= 3 + + +# --------------------------------------------------------------------------- +# Pipeline init +# --------------------------------------------------------------------------- + +class TestPipelineInit: + def test_default_prompts(self): + pipeline = AbliterationPipeline(model_name="test-model") + assert pipeline.harmful_prompts == HARMFUL_PROMPTS + assert 
pipeline.harmless_prompts == HARMLESS_PROMPTS + + def test_custom_prompts(self): + harmful = ["bad prompt"] + harmless = ["good prompt"] + pipeline = AbliterationPipeline( + model_name="test-model", + harmful_prompts=harmful, + harmless_prompts=harmless, + ) + assert pipeline.harmful_prompts == harmful + assert pipeline.harmless_prompts == harmless + + def test_defaults(self): + pipeline = AbliterationPipeline(model_name="test-model") + assert pipeline.device == "auto" + assert pipeline.dtype == "float16" + assert pipeline.output_dir == Path("abliterated") + assert pipeline.trust_remote_code is False + assert pipeline.handle is None + + def test_default_method_is_advanced(self): + pipeline = AbliterationPipeline(model_name="test-model") + assert pipeline.method == "advanced" + assert pipeline.n_directions == METHODS["advanced"]["n_directions"] + assert pipeline.norm_preserve == METHODS["advanced"]["norm_preserve"] + assert pipeline.regularization == METHODS["advanced"]["regularization"] + + def test_method_basic(self): + pipeline = AbliterationPipeline(model_name="test-model", method="basic") + assert pipeline.n_directions == 1 + assert pipeline.norm_preserve is False + assert pipeline.regularization == 0.0 + + def test_method_aggressive(self): + pipeline = AbliterationPipeline(model_name="test-model", method="aggressive") + assert pipeline.n_directions == 8 + assert pipeline.norm_preserve is True + assert pipeline.refinement_passes == 3 + + def test_explicit_overrides_method(self): + pipeline = AbliterationPipeline( + model_name="test-model", + method="basic", + n_directions=6, + norm_preserve=True, + regularization=0.5, + refinement_passes=4, + ) + assert pipeline.n_directions == 6 + assert pipeline.norm_preserve is True + assert pipeline.regularization == 0.5 + assert pipeline.refinement_passes == 4 + + def test_callbacks(self): + stage_results = [] + log_msgs = [] + pipeline = AbliterationPipeline( + model_name="test-model", + on_stage=lambda r: 
stage_results.append(r), + on_log=lambda m: log_msgs.append(m), + ) + pipeline.log("hello") + assert log_msgs == ["hello"] + + pipeline._emit("test", "running", "msg") + assert len(stage_results) == 1 + assert stage_results[0].stage == "test" + + +# --------------------------------------------------------------------------- +# _project_out_advanced (norm-preserving + regularization) +# --------------------------------------------------------------------------- + +class TestProjectOutAdvanced: + def test_norm_preserving(self): + """Norm-preserving mode should keep Frobenius norm constant.""" + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.o_proj = torch.nn.Linear(4, 8, bias=False) + + module = Wrapper() + torch.manual_seed(42) + module.o_proj.weight.data = torch.randn(8, 4) + original_norm = module.o_proj.weight.data.norm().item() + + direction = torch.randn(4, 1) + direction = direction / direction.norm() + + AbliterationPipeline._project_out_advanced( + module, direction, ["o_proj"], norm_preserve=True, regularization=0.0 + ) + + new_norm = module.o_proj.weight.data.norm().item() + # With amplification cap (1.10x max), exact norm preservation isn't + # guaranteed on tiny matrices (hidden_dim=4) where a single direction + # removes a large fraction of energy. Verify the norm is closer to + # original than the un-preserved norm would be (i.e. cap is working). 
+ without_preserve_norm_sq = original_norm ** 2 - (module.o_proj.weight.data @ direction).pow(2).sum().item() + # The new norm should be >= the un-preserved norm (cap restores some) + assert new_norm >= original_norm * 0.85, \ + f"Norm should be approximately preserved (within cap): {original_norm:.4f} vs {new_norm:.4f}" + + def test_regularization_partial_removal(self): + """Regularization should preserve some of the refusal component.""" + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.o_proj = torch.nn.Linear(4, 8, bias=False) + + module_full = Wrapper() + module_reg = Wrapper() + torch.manual_seed(42) + W_orig = torch.randn(8, 4) + module_full.o_proj.weight.data = W_orig.clone() + module_reg.o_proj.weight.data = W_orig.clone() + + direction = torch.randn(4, 1) + direction = direction / direction.norm() + + # Full removal + AbliterationPipeline._project_out_advanced( + module_full, direction, ["o_proj"], norm_preserve=False, regularization=0.0 + ) + # Regularized (30% preserved) + AbliterationPipeline._project_out_advanced( + module_reg, direction, ["o_proj"], norm_preserve=False, regularization=0.3 + ) + + W_full = module_full.o_proj.weight.data + W_reg = module_reg.o_proj.weight.data + + # Full removal should have zero projection on direction + proj_full = (W_full @ direction).norm().item() + assert proj_full < 1e-4 + + # Regularized should have non-zero projection (30% preserved) + proj_reg = (W_reg @ direction).norm().item() + proj_orig = (W_orig @ direction).norm().item() + expected_ratio = 0.3 + actual_ratio = proj_reg / proj_orig if proj_orig > 0 else 0 + assert abs(actual_ratio - expected_ratio) < 0.05, \ + f"Expected ~{expected_ratio:.0%} preserved, got {actual_ratio:.0%}" + + def test_norm_preserving_transposed(self): + """Norm-preserving should also work for transposed weights.""" + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.c_proj = torch.nn.Linear(8, 4, bias=False) + + module 
= Wrapper() + torch.manual_seed(42) + module.c_proj.weight.data = torch.randn(4, 8) + original_norm = module.c_proj.weight.data.norm().item() + + direction = torch.randn(4, 1) + direction = direction / direction.norm() + + AbliterationPipeline._project_out_advanced( + module, direction, ["c_proj"], norm_preserve=True, regularization=0.0 + ) + + new_norm = module.c_proj.weight.data.norm().item() + # With amplification cap (1.10x max), exact norm preservation isn't + # guaranteed on tiny matrices where a single direction removes a large + # fraction of energy. + assert new_norm >= original_norm * 0.80, \ + f"Norm should be approximately preserved (within cap): {original_norm:.4f} vs {new_norm:.4f}" + + +# --------------------------------------------------------------------------- +# Full attention projection (q/k/v + o_proj) +# --------------------------------------------------------------------------- + +class TestAttentionFullProjection: + """Test that ALL attention weight matrices are projected (not just o_proj).""" + + def test_qkv_all_projected(self): + """q_proj, k_proj, v_proj should all be projected alongside o_proj.""" + hidden = 16 + + class FakeAttn(torch.nn.Module): + def __init__(self): + super().__init__() + self.q_proj = torch.nn.Linear(hidden, hidden, bias=False) + self.k_proj = torch.nn.Linear(hidden, hidden, bias=False) + self.v_proj = torch.nn.Linear(hidden, hidden, bias=False) + self.o_proj = torch.nn.Linear(hidden, hidden, bias=False) + + attn = FakeAttn() + torch.manual_seed(42) + for p in attn.parameters(): + p.data = torch.randn_like(p.data) + + originals = { + name: getattr(attn, name).weight.data.clone() + for name in ["q_proj", "k_proj", "v_proj", "o_proj"] + } + + d = torch.randn(hidden, 1) + d = d / d.norm() + + from obliteratus.abliterate import _ATTN_OUT_NAMES, _ATTN_IN_NAMES + count = AbliterationPipeline._project_out_advanced( + attn, d, _ATTN_OUT_NAMES + _ATTN_IN_NAMES, + ) + + assert count == 4, f"Should project 4 weights (q/k/v/o), 
got {count}" + for name in ["q_proj", "k_proj", "v_proj", "o_proj"]: + assert not torch.allclose( + getattr(attn, name).weight.data, originals[name] + ), f"{name} should be modified" + + def test_project_all_does_not_early_return(self): + """_project_out_advanced should project ALL matching weights, not just first.""" + hidden = 16 + + class FakeModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.up_proj = torch.nn.Linear(hidden, 32, bias=False) + self.gate_proj = torch.nn.Linear(hidden, 32, bias=False) + + mod = FakeModule() + torch.manual_seed(42) + orig_up = mod.up_proj.weight.data.clone() + orig_gate = mod.gate_proj.weight.data.clone() + + d = torch.randn(hidden, 1) + d = d / d.norm() + + from obliteratus.abliterate import _FFN_IN_NAMES + count = AbliterationPipeline._project_out_advanced(mod, d, _FFN_IN_NAMES) + + assert count == 2, f"Should project both up_proj and gate_proj, got {count}" + assert not torch.allclose(mod.up_proj.weight.data, orig_up), "up_proj should be modified" + assert not torch.allclose(mod.gate_proj.weight.data, orig_gate), "gate_proj should be modified" + + def test_lm_head_projection(self): + """lm_head should be projectable via _project_out_advanced.""" + hidden = 16 + vocab = 100 + + class FakeModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.lm_head = torch.nn.Linear(hidden, vocab, bias=False) + + model = FakeModel() + torch.manual_seed(42) + orig = model.lm_head.weight.data.clone() + + d = torch.randn(hidden, 1) + d = d / d.norm() + + count = AbliterationPipeline._project_out_advanced( + model, d, ["lm_head"], regularization=0.0, + ) + + assert count == 1, "Should project lm_head" + assert not torch.allclose(model.lm_head.weight.data, orig), "lm_head should be modified" + # Verify refusal direction is removed from lm_head + proj = (model.lm_head.weight.data @ d).norm().item() + assert proj < 1e-4, f"Refusal direction should be removed from lm_head, proj={proj}" + + +class 
TestKneeDetectionThreshold: + """Test that knee detection uses 5% threshold to include more layers.""" + + def test_five_percent_threshold_includes_more(self): + """Layers between 5% and 10% of max should now be included.""" + # Layer norms: max=10.0, then several between 5%-10% + sorted_layers = [(0, 10.0), (1, 8.0), (2, 6.0), (3, 0.7), (4, 0.6)] + selected = AbliterationPipeline._select_layers_knee(sorted_layers) + # 0.7 and 0.6 are 7% and 6% of max — should now be included (> 5% threshold) + assert 3 in selected or 4 in selected, ( + f"Layers with 6-7% of max signal should be included, got {selected}" + ) + + +# --------------------------------------------------------------------------- +# MoE projection (router, shared expert, input/output, fused) +# --------------------------------------------------------------------------- + +class TestProjectMoEExperts: + """Test the full MoE projection pipeline: router, shared expert, experts.""" + + def _make_direction(self, hidden_dim=16): + d = torch.randn(hidden_dim, 1) + return d / d.norm() + + def test_router_gate_projected(self): + """Router/gate weight should have refusal direction removed.""" + hidden = 16 + n_experts = 4 + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.gate = torch.nn.Linear(hidden, n_experts, bias=True) + self.experts = torch.nn.ModuleList([ + self._make_expert() for _ in range(n_experts) + ]) + + @staticmethod + def _make_expert(): + m = torch.nn.Module() + m.down_proj = torch.nn.Linear(hidden, 32, bias=False) + m.up_proj = torch.nn.Linear(hidden, 32, bias=False) + return m + + moe = FakeMoE() + d = self._make_direction(hidden) + W_gate_orig = moe.gate.weight.data.clone() + + count = AbliterationPipeline._project_moe_experts(moe, d) + assert count > 0 + + # Gate weight should have been modified + assert not torch.allclose(moe.gate.weight.data, W_gate_orig), \ + "Router/gate weights should be projected" + + # The gate weight's projection onto the direction 
should be ~0 + proj = (moe.gate.weight.data @ d).norm().item() + assert proj < 1e-4, f"Gate should have no component along refusal dir, got {proj}" + + def test_shared_expert_projected(self): + """Shared expert (always-on) should have both input and output projected.""" + hidden = 16 + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.gate = torch.nn.Linear(hidden, 2, bias=False) + self.shared_expert = torch.nn.Module() + self.shared_expert.down_proj = torch.nn.Linear(hidden, 32, bias=False) + self.shared_expert.up_proj = torch.nn.Linear(hidden, 32, bias=False) + self.experts = torch.nn.ModuleList([ + self._make_expert() for _ in range(2) + ]) + + @staticmethod + def _make_expert(): + m = torch.nn.Module() + m.down_proj = torch.nn.Linear(hidden, 32, bias=False) + m.up_proj = torch.nn.Linear(hidden, 32, bias=False) + return m + + moe = FakeMoE() + d = self._make_direction(hidden) + shared_down_orig = moe.shared_expert.down_proj.weight.data.clone() + shared_up_orig = moe.shared_expert.up_proj.weight.data.clone() + + count = AbliterationPipeline._project_moe_experts(moe, d) + assert count > 0 + + # Both shared expert output AND input projections should be modified + assert not torch.allclose(moe.shared_expert.down_proj.weight.data, shared_down_orig), \ + "Shared expert output (down_proj) should be projected" + assert not torch.allclose(moe.shared_expert.up_proj.weight.data, shared_up_orig), \ + "Shared expert input (up_proj) should be projected" + + def test_expert_input_projections_projected(self): + """Expert input projections (up_proj, gate_proj) should also be modified.""" + hidden = 16 + + class FakeExpert(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Linear(hidden, 32, bias=False) + self.up_proj = torch.nn.Linear(hidden, 32, bias=False) + self.gate_proj = torch.nn.Linear(hidden, 32, bias=False) + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.experts 
= torch.nn.ModuleList([FakeExpert() for _ in range(2)]) + + moe = FakeMoE() + d = self._make_direction(hidden) + up_orig = moe.experts[0].up_proj.weight.data.clone() + + count = AbliterationPipeline._project_moe_experts(moe, d) + + # Each expert contributes 2 projections (output + input) + # 2 experts * 2 = 4 minimum + assert count >= 4, f"Expected >= 4 projections (out+in per expert), got {count}" + + assert not torch.allclose(moe.experts[0].up_proj.weight.data, up_orig), \ + "Expert input (up_proj) should be projected" + + def test_fused_3d_output_and_input(self): + """Fused 3D parameter patterns (GPT-OSS style) should project both directions.""" + hidden = 16 + intermediate = 32 + n_experts = 4 + + class FusedExperts(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Parameter(torch.randn(n_experts, intermediate, hidden)) + self.up_proj = torch.nn.Parameter(torch.randn(n_experts, intermediate, hidden)) + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.experts = FusedExperts() + + moe = FakeMoE() + d = self._make_direction(hidden) + down_orig = moe.experts.down_proj.data.clone() + up_orig = moe.experts.up_proj.data.clone() + + count = AbliterationPipeline._project_moe_experts(moe, d) + + # 4 experts output + 4 experts input = 8 + assert count == 8, f"Expected 8 fused projections, got {count}" + + assert not torch.allclose(moe.experts.down_proj.data, down_orig), \ + "Fused output (down_proj) should be projected" + assert not torch.allclose(moe.experts.up_proj.data, up_orig), \ + "Fused input (up_proj) should be projected" + + def test_fused_3d_norm_preserve(self): + """Fused 3D projections should preserve norms when requested.""" + hidden = 16 + intermediate = 32 + n_experts = 4 + + class FusedExperts(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Parameter(torch.randn(n_experts, intermediate, hidden)) + + class FakeMoE(torch.nn.Module): + def 
__init__(self): + super().__init__() + self.experts = FusedExperts() + + moe = FakeMoE() + d = self._make_direction(hidden) + + # Record per-expert norms before + orig_norms = [moe.experts.down_proj.data[i].norm().item() for i in range(n_experts)] + + AbliterationPipeline._project_moe_experts(moe, d, norm_preserve=True) + + # Check per-expert norms preserved + for i in range(n_experts): + new_norm = moe.experts.down_proj.data[i].norm().item() + assert abs(orig_norms[i] - new_norm) < 1e-3, \ + f"Expert {i} norm not preserved: {orig_norms[i]:.4f} vs {new_norm:.4f}" + + def test_no_experts_returns_zero(self): + """Module without experts attribute should return 0.""" + class NoMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.mlp = torch.nn.Linear(16, 32) + + moe = NoMoE() + d = self._make_direction(16) + assert AbliterationPipeline._project_moe_experts(moe, d) == 0 + + def test_router_bias_projected(self): + """Router bias should be projected when project_biases=True.""" + hidden = 16 + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.gate = torch.nn.Linear(hidden, 4, bias=True) + self.experts = torch.nn.ModuleList([ + self._make_expert() for _ in range(4) + ]) + + @staticmethod + def _make_expert(): + m = torch.nn.Module() + m.down_proj = torch.nn.Linear(hidden, 32, bias=False) + return m + + moe = FakeMoE() + d = self._make_direction(hidden) + bias_orig = moe.gate.bias.data.clone() + + count = AbliterationPipeline._project_moe_experts(moe, d, project_biases=True) + + # Gate has 4 outputs (num_experts), direction has 16 dims + # bias shape (4,) != direction shape (16,), so bias won't match. + # This is correct: router bias is (num_experts,), not (hidden_dim,), + # so _project_bias won't modify it (shape mismatch is expected). 
+ assert torch.allclose(moe.gate.bias.data, bias_orig), ( + "Router bias should be unchanged when shape mismatches direction" + ) + assert isinstance(count, int) + assert count > 0 # expert weights should still be projected + + def test_router_auto_detection_fallback(self): + """Unknown router name should be auto-detected and projected.""" + import warnings as w + hidden = 16 + n_experts = 4 + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + # Unusual router name not in _ROUTER_NAMES + self.moe_gate_proj = torch.nn.Linear(hidden, n_experts, bias=False) + self.experts = torch.nn.ModuleList([ + self._make_expert() for _ in range(n_experts) + ]) + + @staticmethod + def _make_expert(): + m = torch.nn.Module() + m.down_proj = torch.nn.Linear(hidden, 32, bias=False) + return m + + moe = FakeMoE() + d = self._make_direction(hidden) + gate_orig = moe.moe_gate_proj.weight.data.clone() + + with w.catch_warnings(record=True) as caught: + w.simplefilter("always") + AbliterationPipeline._project_moe_experts(moe, d) + + # Should auto-detect and project the unusual router name + assert not torch.allclose(moe.moe_gate_proj.weight.data, gate_orig), \ + "Auto-detected router should be projected" + + # Should emit a warning about the auto-detection + auto_detect_warnings = [ + x for x in caught + if "auto-detected" in str(x.message) + ] + assert len(auto_detect_warnings) > 0, "Should warn about auto-detected router" + + def test_full_moe_all_components(self): + """End-to-end: all MoE components should be modified together.""" + hidden = 16 + + class FakeExpert(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Linear(hidden, 32, bias=False) + self.up_proj = torch.nn.Linear(hidden, 32, bias=False) + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.gate = torch.nn.Linear(hidden, 4, bias=False) + self.shared_expert = torch.nn.Module() + self.shared_expert.down_proj = 
torch.nn.Linear(hidden, 32, bias=False) + self.shared_expert.up_proj = torch.nn.Linear(hidden, 32, bias=False) + self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(4)]) + + moe = FakeMoE() + d = self._make_direction(hidden) + + count = AbliterationPipeline._project_moe_experts(moe, d) + + # Expected: 1 (gate) + 2 (shared out+in) + 4*2 (expert out+in) = 11 + assert count == 11, f"Expected 11 total projections, got {count}" + + +# --------------------------------------------------------------------------- +# SOTA technique #1: Safety-neuron masking (GateBreaker-style z-score) +# --------------------------------------------------------------------------- + +class TestSafetyNeuronMasking: + def test_outlier_neurons_zeroed(self): + """Neurons with outsized refusal projection should be zeroed.""" + hidden = 16 + + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Linear(hidden, 64, bias=False) + + module = Wrapper() + torch.manual_seed(42) + # Inject a few rows with very high projection along direction + d = torch.randn(hidden, 1) + d = d / d.norm() + # Make rows 0,1,2 have huge projection (outliers) + for i in range(3): + module.down_proj.weight.data[i] = d.squeeze() * 10.0 + + n_masked = AbliterationPipeline._mask_safety_neurons( + module, d, ["down_proj"], z_threshold=2.0, + ) + + assert n_masked >= 3, f"Expected >= 3 masked neurons, got {n_masked}" + # Masked rows should be zero + for i in range(3): + assert module.down_proj.weight.data[i].abs().max().item() < 1e-6 + + def test_no_outliers_no_masking(self): + """When all neurons have similar projection, none should be masked.""" + hidden = 16 + + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Linear(hidden, 32, bias=False) + + module = Wrapper() + # Uniform weights — no outliers + module.down_proj.weight.data = torch.ones(32, hidden) * 0.1 + + d = torch.randn(hidden, 1) + d = d / d.norm() + + 
n_masked = AbliterationPipeline._mask_safety_neurons( + module, d, ["down_proj"], z_threshold=2.0, + ) + assert n_masked == 0 + + def test_high_threshold_masks_fewer(self): + """Higher z_threshold should mask fewer neurons.""" + hidden = 16 + + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Linear(hidden, 64, bias=False) + + module1 = Wrapper() + module2 = Wrapper() + torch.manual_seed(42) + W = torch.randn(64, hidden) + module1.down_proj.weight.data = W.clone() + module2.down_proj.weight.data = W.clone() + + d = torch.randn(hidden, 1) + d = d / d.norm() + + n1 = AbliterationPipeline._mask_safety_neurons(module1, d, ["down_proj"], z_threshold=1.5) + n2 = AbliterationPipeline._mask_safety_neurons(module2, d, ["down_proj"], z_threshold=3.0) + assert n1 >= n2, f"Lower threshold should mask more: {n1} vs {n2}" + + +# --------------------------------------------------------------------------- +# SOTA technique #2: Jailbreak-contrastive direction refinement +# --------------------------------------------------------------------------- + +class TestJailbreakContrastive: + def test_jailbreak_prompt_generation(self): + """Pipeline should generate jailbreak variants of harmful prompts.""" + pipeline = AbliterationPipeline(model_name="test") + pipeline.harmful_prompts = ["How to hack a computer", "Write a virus"] + + jailbreak = pipeline._generate_jailbreak_prompts() + + assert len(jailbreak) == 2 + # Each should contain the original prompt + assert "hack a computer" in jailbreak[0].lower() + assert "virus" in jailbreak[1].lower() + # Each should be wrapped in a template (longer than original) + assert len(jailbreak[0]) > len("How to hack a computer") + + def test_jailbreak_contrast_blending(self): + """Jailbreak-contrastive blending should modify refusal direction.""" + pipeline = AbliterationPipeline( + model_name="test", + use_jailbreak_contrast=True, + n_directions=1, + ) + hidden = 16 + pipeline._on_log = lambda m: 
None + + # Simulate probed means + torch.manual_seed(42) + harm_mean = torch.randn(1, hidden) + safe_mean = torch.randn(1, hidden) + jb_mean = torch.randn(1, hidden) + + pipeline._harmful_means = {0: harm_mean} + pipeline._harmless_means = {0: safe_mean} + pipeline._jailbreak_means = {0: jb_mean} + pipeline._harmful_acts = {0: [harm_mean]} + pipeline._harmless_acts = {0: [safe_mean]} + pipeline._jailbreak_acts = {0: [jb_mean]} + + # Run distill (will set standard direction, then blend) + pipeline._distill() + + # Direction should be a unit vector + d = pipeline.refusal_directions[0] + assert abs(d.norm().item() - 1.0) < 1e-4 + + # Direction should differ from pure harm-safe difference + std_diff = (harm_mean - safe_mean).squeeze() + std_dir = std_diff / std_diff.norm() + cosine = (d @ std_dir).item() + # Blended direction should not be identical to standard + assert cosine < 0.99, f"Blended direction too similar to standard: cos={cosine}" + + def test_surgical_method_enables_jailbreak(self): + """Surgical method should enable jailbreak-contrastive by default.""" + cfg = METHODS["surgical"] + assert cfg["use_jailbreak_contrast"] is True + + +# --------------------------------------------------------------------------- +# SOTA technique #3: Layer-adaptive projection strength +# --------------------------------------------------------------------------- + +class TestLayerAdaptiveStrength: + def test_layer_weights_computed(self): + """Layer-adaptive weights should be proportional to refusal signal.""" + pipeline = AbliterationPipeline( + model_name="test", + layer_adaptive_strength=True, + n_directions=1, + ) + hidden = 16 + pipeline._on_log = lambda m: None + + # Simulate: layer 0 has strong signal, layer 1 weak + torch.manual_seed(42) + strong_diff = torch.randn(1, hidden) * 10.0 + weak_diff = torch.randn(1, hidden) * 1.0 + zero_mean = torch.zeros(1, hidden) + + pipeline._harmful_means = {0: strong_diff, 1: weak_diff} + pipeline._harmless_means = {0: zero_mean, 1: 
zero_mean} + pipeline._harmful_acts = {0: [strong_diff], 1: [weak_diff]} + pipeline._harmless_acts = {0: [zero_mean], 1: [zero_mean]} + + pipeline._distill() + + # Layer weights should exist for strong layers + assert len(pipeline._layer_excise_weights) > 0 + # Strongest layer should have weight ~1.0 + max_weight = max(pipeline._layer_excise_weights.values()) + assert max_weight > 0.9, f"Max weight should be ~1.0, got {max_weight}" + + def test_surgical_method_enables_adaptive(self): + """Surgical method should enable layer-adaptive by default.""" + cfg = METHODS["surgical"] + assert cfg["layer_adaptive_strength"] is True + + +# --------------------------------------------------------------------------- +# SOTA technique #5: Attention head surgery +# --------------------------------------------------------------------------- + +class TestAttentionHeadSurgery: + def test_head_selective_projection(self): + """Selective head projection should only modify targeted head rows.""" + hidden = 16 + n_heads = 4 + head_dim = hidden // n_heads + + class FakeAttn(torch.nn.Module): + def __init__(self): + super().__init__() + self.o_proj = torch.nn.Linear(hidden, hidden, bias=False) + + attn = FakeAttn() + torch.manual_seed(42) + W_orig = attn.o_proj.weight.data.clone() + + d = torch.randn(hidden, 1) + d = d / d.norm() + + # Head scores: head 0 is top safety head, head 3 is lowest + head_scores = [(0, 5.0), (1, 3.0), (2, 1.0), (3, 0.5)] + + n_modified = AbliterationPipeline._project_head_selective( + attn, d, head_scores, n_heads=n_heads, head_fraction=0.25, + ) + + assert n_modified >= 1, "Should modify at least 1 head" + + W_new = attn.o_proj.weight.data + # Head 0 columns (targeted) should be modified + assert not torch.allclose( + W_new[:, 0:head_dim], W_orig[:, 0:head_dim] + ), "Targeted head 0 should be modified" + + # Head 3 columns (NOT targeted) should be untouched + assert torch.allclose( + W_new[:, 3*head_dim:4*head_dim], + W_orig[:, 3*head_dim:4*head_dim], + ), 
"Non-targeted head 3 should be untouched" + + def test_head_surgery_norm_preserve(self): + """Head surgery with norm_preserve should maintain per-head norms.""" + hidden = 16 + n_heads = 4 + head_dim = hidden // n_heads + + class FakeAttn(torch.nn.Module): + def __init__(self): + super().__init__() + self.o_proj = torch.nn.Linear(hidden, hidden, bias=False) + + attn = FakeAttn() + torch.manual_seed(42) + + d = torch.randn(hidden, 1) + d = d / d.norm() + + orig_norms = [ + attn.o_proj.weight.data[:, h*head_dim:(h+1)*head_dim].norm().item() + for h in range(n_heads) + ] + + head_scores = [(0, 5.0), (1, 3.0), (2, 1.0), (3, 0.5)] + AbliterationPipeline._project_head_selective( + attn, d, head_scores, n_heads=n_heads, + head_fraction=0.5, norm_preserve=True, + ) + + # Targeted heads should have preserved norms + for h in range(2): # top 50% = 2 heads + new_norm = attn.o_proj.weight.data[:, h*head_dim:(h+1)*head_dim].norm().item() + assert abs(orig_norms[h] - new_norm) < 1e-3, \ + f"Head {h} norm not preserved: {orig_norms[h]:.4f} vs {new_norm:.4f}" + + def test_head_surgery_non_square_gqa(self): + """Head surgery should work for GQA models with non-square o_proj (attn_dim != hidden_dim).""" + hidden_dim = 12 # model hidden dimension + attn_dim = 32 # attention dimension (n_heads * head_dim_attn) + n_heads = 4 + head_dim_attn = attn_dim // n_heads # 8 + + class FakeAttnGQA(torch.nn.Module): + def __init__(self): + super().__init__() + # o_proj maps attn_dim -> hidden_dim + # nn.Linear weight shape: (hidden_dim, attn_dim) = (12, 32) + self.o_proj = torch.nn.Linear(attn_dim, hidden_dim, bias=False) + + attn = FakeAttnGQA() + torch.manual_seed(42) + attn.o_proj.weight.data = torch.randn(hidden_dim, attn_dim) + W_orig = attn.o_proj.weight.data.clone() + + d = torch.randn(hidden_dim, 1) + d = d / d.norm() + + head_scores = [(0, 5.0), (1, 3.0), (2, 1.0), (3, 0.5)] + + n_modified = AbliterationPipeline._project_head_selective( + attn, d, head_scores, n_heads=n_heads, 
head_fraction=0.25, + ) + + assert n_modified >= 1, "Should modify at least 1 head" + + W_new = attn.o_proj.weight.data + # Head 0 columns (targeted) should be modified + assert not torch.allclose( + W_new[:, 0:head_dim_attn], W_orig[:, 0:head_dim_attn] + ), "Targeted head 0 should be modified" + + # Head 3 columns (NOT targeted) should be untouched + assert torch.allclose( + W_new[:, 3*head_dim_attn:4*head_dim_attn], + W_orig[:, 3*head_dim_attn:4*head_dim_attn], + ), "Non-targeted head 3 should be untouched" + + def test_head_surgery_gqa_norm_preserve(self): + """Head surgery on GQA non-square o_proj with norm_preserve.""" + hidden_dim = 12 + attn_dim = 32 + n_heads = 4 + head_dim_attn = attn_dim // n_heads + + class FakeAttnGQA(torch.nn.Module): + def __init__(self): + super().__init__() + self.o_proj = torch.nn.Linear(attn_dim, hidden_dim, bias=False) + + attn = FakeAttnGQA() + torch.manual_seed(42) + attn.o_proj.weight.data = torch.randn(hidden_dim, attn_dim) + + d = torch.randn(hidden_dim, 1) + d = d / d.norm() + + orig_norms = [ + attn.o_proj.weight.data[:, h*head_dim_attn:(h+1)*head_dim_attn].norm().item() + for h in range(n_heads) + ] + + head_scores = [(0, 5.0), (1, 3.0), (2, 1.0), (3, 0.5)] + AbliterationPipeline._project_head_selective( + attn, d, head_scores, n_heads=n_heads, + head_fraction=0.5, norm_preserve=True, + ) + + for h in range(2): # top 50% = 2 heads + new_norm = attn.o_proj.weight.data[:, h*head_dim_attn:(h+1)*head_dim_attn].norm().item() + assert abs(orig_norms[h] - new_norm) < 1e-3, \ + f"GQA head {h} norm not preserved: {orig_norms[h]:.4f} vs {new_norm:.4f}" + + +# --------------------------------------------------------------------------- +# SOTA technique #6: SAE feature-level abliteration +# --------------------------------------------------------------------------- + +class TestSAEAbliteration: + def test_sae_train_and_reconstruct(self): + """SAE should train and reconstruct activations.""" + from 
obliteratus.analysis.sae_abliteration import train_sae + + hidden = 32 + # Generate synthetic activations + torch.manual_seed(42) + acts = [torch.randn(hidden) for _ in range(64)] + + sae = train_sae(acts, hidden, expansion=2, n_epochs=10, lr=1e-3) + + # Forward pass should work + x = torch.randn(1, hidden) + x_hat, z = sae(x) + assert x_hat.shape == x.shape + assert z.shape == (1, 2 * hidden) # expansion=2 + + # Z should be sparse (ReLU activation) + assert (z == 0).float().mean() > 0.3, "Features should be sparse" + + def test_refusal_feature_identification(self): + """SAE should identify features that differ between harmful/harmless.""" + from obliteratus.analysis.sae_abliteration import ( + train_sae, identify_refusal_features, + ) + + hidden = 32 + torch.manual_seed(42) + + # Create activations with clear harmful/harmless separation + refusal_dir = torch.randn(hidden) + refusal_dir = refusal_dir / refusal_dir.norm() + + harmful_acts = [torch.randn(hidden) + 2.0 * refusal_dir for _ in range(32)] + harmless_acts = [torch.randn(hidden) - 2.0 * refusal_dir for _ in range(32)] + all_acts = harmful_acts + harmless_acts + + sae = train_sae(all_acts, hidden, expansion=2, n_epochs=30, lr=3e-4) + result = identify_refusal_features( + sae, harmful_acts, harmless_acts, layer_idx=0, top_k=4, + ) + + assert result.n_refusal_features == 4 + assert result.sae_directions.shape == (4, hidden) + assert result.variance_explained > 0.0 + # SAE directions should have some alignment with the actual refusal direction + best_cos = max( + abs((result.sae_directions[i] @ refusal_dir).item()) + for i in range(result.sae_directions.shape[0]) + ) + assert best_cos > 0.1, f"SAE should find direction aligned with refusal: best_cos={best_cos}" + + def test_sae_directions_unit_norm(self): + """SAE-derived directions should be unit normalized.""" + from obliteratus.analysis.sae_abliteration import ( + train_sae, identify_refusal_features, + ) + + hidden = 16 + torch.manual_seed(42) + harmful = 
[torch.randn(hidden) + torch.ones(hidden) for _ in range(16)] + harmless = [torch.randn(hidden) - torch.ones(hidden) for _ in range(16)] + + sae = train_sae(harmful + harmless, hidden, expansion=2, n_epochs=10) + result = identify_refusal_features(sae, harmful, harmless, 0, top_k=3) + + for i in range(result.sae_directions.shape[0]): + norm = result.sae_directions[i].norm().item() + assert abs(norm - 1.0) < 1e-3, f"Direction {i} norm={norm}, expected 1.0" + + +# --------------------------------------------------------------------------- +# Surgical method preset +# --------------------------------------------------------------------------- + +class TestSurgicalMethod: + def test_surgical_enables_all_sota(self): + """Surgical method should enable all 6 SOTA techniques.""" + cfg = METHODS["surgical"] + assert cfg["use_jailbreak_contrast"] is True + assert cfg["layer_adaptive_strength"] is True + assert cfg["safety_neuron_masking"] is True + assert cfg["per_expert_directions"] is True + assert cfg["attention_head_surgery"] is True + assert cfg["use_sae_features"] is True + + def test_basic_disables_all_sota(self): + """Basic method should not enable SOTA techniques (no keys or False).""" + cfg = METHODS["basic"] + assert cfg.get("use_jailbreak_contrast", False) is False + assert cfg.get("layer_adaptive_strength", False) is False + assert cfg.get("safety_neuron_masking", False) is False + + def test_pipeline_init_surgical(self): + """Pipeline initialized with surgical method should have all flags set.""" + pipeline = AbliterationPipeline(model_name="test", method="surgical") + assert pipeline.use_jailbreak_contrast is True + assert pipeline.layer_adaptive_strength is True + assert pipeline.safety_neuron_masking is True + assert pipeline.per_expert_directions is True + assert pipeline.attention_head_surgery is True + assert pipeline.use_sae_features is True + + def test_pipeline_init_explicit_override(self): + """Explicit params should override method defaults.""" + 
pipeline = AbliterationPipeline( + model_name="test", method="surgical", + safety_neuron_masking=False, + ) + assert pipeline.safety_neuron_masking is False + assert pipeline.use_jailbreak_contrast is True # rest still from surgical + + +# --------------------------------------------------------------------------- +# Inverted method (semantic refusal inversion) +# --------------------------------------------------------------------------- + +class TestInvertedMethod: + def test_inverted_preset_config(self): + """Inverted method preset should enable inversion flag.""" + cfg = METHODS["inverted"] + assert cfg["invert_refusal"] is True + assert cfg["n_directions"] == 8 + assert cfg["use_jailbreak_contrast"] is True + + def test_surgical_does_not_invert(self): + """Surgical method should NOT enable inversion by default.""" + cfg = METHODS["surgical"] + assert cfg.get("invert_refusal", False) is False + + def test_pipeline_init_inverted(self): + """Pipeline initialized with inverted method should have flag set.""" + pipeline = AbliterationPipeline(model_name="test", method="inverted") + assert pipeline.invert_refusal is True + assert pipeline.use_jailbreak_contrast is True + assert pipeline.safety_neuron_masking is False # zeroing + reflection is destructive + + def test_pipeline_invert_explicit_override(self): + """Explicit invert_refusal param should override method default.""" + pipeline = AbliterationPipeline( + model_name="test", method="surgical", invert_refusal=True, + ) + assert pipeline.invert_refusal is True + + pipeline2 = AbliterationPipeline( + model_name="test", method="inverted", invert_refusal=False, + ) + assert pipeline2.invert_refusal is False + + def test_reflection_math(self): + """2x projection (reflection) should negate the refusal component.""" + hidden = 16 + + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.o_proj = torch.nn.Linear(hidden, 32, bias=False) + + module = Wrapper() + torch.manual_seed(42) + W_orig 
= module.o_proj.weight.data.clone() + + d = torch.randn(hidden, 1) + d = d / d.norm() + + # Original projection onto d + orig_proj = (W_orig @ d).squeeze() + + # Reflection: regularization=-1.0 → scale=2.0 + AbliterationPipeline._project_out_advanced( + module, d, ["o_proj"], regularization=-1.0, + ) + + W_reflected = module.o_proj.weight.data + new_proj = (W_reflected @ d).squeeze() + + # After reflection, projection should be NEGATED (sign flipped) + assert torch.allclose(new_proj, -orig_proj, atol=1e-4), ( + f"Reflected projection should be negated: expected ~{-orig_proj[:3]} got {new_proj[:3]}" + ) + + def test_reflection_preserves_orthogonal_component(self): + """Reflection should not change the component perpendicular to d.""" + hidden = 8 + + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.o_proj = torch.nn.Linear(hidden, 16, bias=False) + + module = Wrapper() + torch.manual_seed(42) + W_orig = module.o_proj.weight.data.clone() + + d = torch.randn(hidden, 1) + d = d / d.norm() + + # Compute original orthogonal component + orig_d_component = (W_orig @ d) @ d.T # rank-1 matrix: projection onto d + orig_ortho = W_orig - orig_d_component # everything except d-component + + AbliterationPipeline._project_out_advanced( + module, d, ["o_proj"], regularization=-1.0, + ) + + W_reflected = module.o_proj.weight.data + new_d_component = (W_reflected @ d) @ d.T + new_ortho = W_reflected - new_d_component + + # Orthogonal component should be unchanged + assert torch.allclose(orig_ortho, new_ortho, atol=1e-4), ( + "Reflection should preserve orthogonal component" + ) + + def test_moe_expert_safety_classification(self): + """_identify_safety_experts should classify experts by router affinity.""" + hidden = 16 + n_experts = 4 + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.gate = torch.nn.Linear(hidden, n_experts, bias=False) + self.experts = torch.nn.ModuleList([ + torch.nn.Linear(hidden, hidden) for _ 
in range(n_experts) + ]) + + class FakeLayer(torch.nn.Module): + def __init__(self): + super().__init__() + self.self_attn = torch.nn.Module() + self.self_attn.o_proj = torch.nn.Linear(hidden, hidden, bias=False) + self.mlp = FakeMoE() + + from obliteratus.models.loader import ModelHandle + from unittest.mock import MagicMock + from transformers import GPT2Config + + config = GPT2Config(n_embd=hidden, n_head=2, n_layer=1, vocab_size=100, n_positions=64) + model = MagicMock() + model.parameters.return_value = iter([torch.zeros(1)]) + + handle = ModelHandle( + model=model, tokenizer=MagicMock(), + config=config, model_name="test", task="causal_lm", + ) + + pipeline = AbliterationPipeline(model_name="test", method="inverted") + pipeline.handle = handle + pipeline._on_log = lambda m: None + pipeline._on_stage = lambda r: None + + # Set up fake layer and direction + layer = FakeLayer() + torch.manual_seed(42) + + # Make router weight so expert 0 has highest affinity for d + d = torch.randn(hidden) + d = d / d.norm() + # Set router weights: expert 0 aligned with d, expert 3 anti-aligned + layer.mlp.gate.weight.data[0] = d * 5.0 + layer.mlp.gate.weight.data[1] = d * 1.0 + layer.mlp.gate.weight.data[2] = d * -1.0 + layer.mlp.gate.weight.data[3] = d * -5.0 + + # Mock get_layer_modules to return our fake layer + import obliteratus.abliterate as abl_module + orig_get_layers = abl_module.get_layer_modules + orig_get_ffn = abl_module.get_ffn_module + abl_module.get_layer_modules = lambda h: [layer] + abl_module.get_ffn_module = lambda lay, a: lay.mlp + try: + pipeline.refusal_directions = {0: d} + pipeline._strong_layers = [0] + pipeline._identify_safety_experts() + finally: + abl_module.get_layer_modules = orig_get_layers + abl_module.get_ffn_module = orig_get_ffn + + assert 0 in pipeline._expert_safety_scores + scores = pipeline._expert_safety_scores[0] + # Expert 0 should be highest safety affinity + assert scores[0][0] == 0, f"Expert 0 should be top safety, got {scores[0]}" 
+ # Expert 3 should be lowest + assert scores[-1][0] == 3, f"Expert 3 should be lowest, got {scores[-1]}" + + def test_moe_inverted_excision_selective(self): + """Inverted MoE excision should reflect safety experts and remove from capability.""" + hidden = 16 + n_experts = 4 + + class FakeExpert(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Linear(hidden, hidden, bias=False) + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.gate = torch.nn.Linear(hidden, n_experts, bias=False) + self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)]) + + moe = FakeMoE() + torch.manual_seed(42) + for p in moe.parameters(): + p.data = torch.randn_like(p.data) + + d = torch.randn(hidden, 1) + d = d / d.norm() + + # Set up safety scores: experts 0,1 are safety, 2,3 are capability + pipeline = AbliterationPipeline(model_name="test", method="inverted") + pipeline._on_log = lambda m: None + pipeline._on_stage = lambda r: None + pipeline._expert_safety_scores = { + 0: [(0, 5.0), (1, 3.0), (2, -1.0), (3, -3.0)] + } + + orig_router = moe.gate.weight.data.clone() + + count = pipeline._project_moe_experts_inverted( + moe, d, 0, norm_preserve=False, project_biases=False, + ) + + assert count > 0, "Should project some weights" + + # Router should be reflected (capped at 1.5x to prevent extreme logits + # that cause CUDA illegal memory access in batched expert forward). + # With router_reg = max(reflect_reg, -0.5) → scale = 1.5: + # new_proj ≈ orig_proj - 1.5 * orig_proj = -0.5 * orig_proj + # Additionally, _stabilize_router_weights clamps outliers, so we + # verify the sign is flipped and magnitude is substantial. 
+ router_proj = (moe.gate.weight.data @ d.squeeze()).squeeze() + orig_router_proj = (orig_router @ d.squeeze()).squeeze() + cosine = torch.nn.functional.cosine_similarity( + router_proj.unsqueeze(0), -orig_router_proj.unsqueeze(0), + ) + assert cosine > 0.5, ( + f"Router projection should be at least partially reflected, cosine={cosine.item():.3f}" + ) + + # Safety expert 0: should be reflected (projection negated) + e0_proj = (moe.experts[0].down_proj.weight.data @ d).norm() + # After reflection the projection doesn't go to zero — it negates + assert e0_proj > 1e-4, "Safety expert should have non-zero projection (reflected, not removed)" + + # Capability expert 3: should have projection removed (near zero) + e3_proj = (moe.experts[3].down_proj.weight.data @ d).norm().item() + assert e3_proj < 1e-3, f"Capability expert should have projection removed, got {e3_proj}" + + +# --------------------------------------------------------------------------- +# Nuclear method +# --------------------------------------------------------------------------- + +class TestNuclearMethod: + def test_nuclear_preset_config(self): + """Nuclear method should match inverted baseline + permanent weight techniques.""" + cfg = METHODS["nuclear"] + assert cfg["invert_refusal"] is True + assert cfg["n_directions"] == 4 # fewer than inverted to avoid over-ablation + assert cfg["refinement_passes"] == 2 # same as inverted + assert cfg["reflection_strength"] == 1.25 # tempered for CoT coherence + assert cfg["project_embeddings"] is True + assert cfg["embed_regularization"] == 0.50 # conservative cascade limit + assert cfg["activation_steering"] is True # residual cleanup hooks + assert cfg["steering_strength"] == 0.15 # light residual correction + assert cfg["expert_transplant"] is True + assert cfg["transplant_blend"] == 0.10 # gentle nudge, not overwrite + assert cfg["use_jailbreak_contrast"] is True + assert cfg["attention_head_surgery"] is True + assert cfg["layer_adaptive_strength"] is True # 
per-layer scaling + + def test_nuclear_pipeline_init(self): + """Pipeline initialized with nuclear method should have all flags set.""" + pipeline = AbliterationPipeline(model_name="test", method="nuclear") + assert pipeline.invert_refusal is True + assert pipeline.reflection_strength == 1.25 + assert pipeline.embed_regularization == 0.50 + assert pipeline.transplant_blend == 0.10 + assert pipeline.project_embeddings is True + assert pipeline.activation_steering is True # residual cleanup + assert pipeline.expert_transplant is True + assert pipeline.n_directions == 4 + assert pipeline.refinement_passes == 2 + assert pipeline.layer_adaptive_strength is True + + def test_reflection_strength_configurable(self): + """reflection_strength should be explicitly overridable.""" + pipeline = AbliterationPipeline( + model_name="test", method="inverted", reflection_strength=3.0, + ) + assert pipeline.reflection_strength == 3.0 + + def test_inverted_default_strength_is_2(self): + """Inverted method should default to reflection_strength=2.0.""" + pipeline = AbliterationPipeline(model_name="test", method="inverted") + assert pipeline.reflection_strength == 2.0 + + def test_boosted_reflection_math(self): + """2.5x reflection should produce stronger negation than 2x.""" + hidden = 16 + + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.o_proj = torch.nn.Linear(hidden, 32, bias=False) + + d = torch.randn(hidden, 1) + d = d / d.norm() + + # 2x reflection + module_2x = Wrapper() + torch.manual_seed(42) + module_2x.o_proj.weight.data = torch.randn(32, hidden) + orig = module_2x.o_proj.weight.data.clone() + AbliterationPipeline._project_out_advanced( + module_2x, d, ["o_proj"], regularization=-1.0, # scale=2.0 + ) + proj_2x = (module_2x.o_proj.weight.data @ d).squeeze() + + # 2.5x reflection + module_25x = Wrapper() + module_25x.o_proj.weight.data = orig.clone() + AbliterationPipeline._project_out_advanced( + module_25x, d, ["o_proj"], 
regularization=-1.5, # scale=2.5 + ) + proj_25x = (module_25x.o_proj.weight.data @ d).squeeze() + + # 2.5x should be 25% stronger negation than 2x + assert proj_25x.norm() > proj_2x.norm(), ( + "2.5x reflection should produce stronger (more negative) projection than 2x" + ) + + def test_activation_steering_hook(self): + """Steering hooks should subtract refusal direction from hidden states.""" + hidden = 8 + + class FakeLayer(torch.nn.Module): + def forward(self, x): + return x + + layer = FakeLayer() + layers = torch.nn.ModuleList([layer]) + + # Explicitly enable steering (nuclear preset has it off by default) + pipeline = AbliterationPipeline( + model_name="test", method="inverted", activation_steering=True, + steering_strength=0.5, + ) + pipeline._on_log = lambda m: None + pipeline._on_stage = lambda r: None + + d = torch.randn(hidden) + d = d / d.norm() + pipeline.refusal_directions = {0: d} + pipeline._strong_layers = [0] + + n_hooks = pipeline._install_activation_steering(layers) + assert n_hooks == 1 + assert len(pipeline._steering_hooks) == 1 + + # Create a hidden state with strong refusal component + batch = torch.randn(1, 4, hidden) + refusal_component = 5.0 * d.unsqueeze(0).unsqueeze(0).expand_as(batch) + input_hidden = batch + refusal_component + + # Run through the layer (hook should fire) + output = layer(input_hidden) + + # The refusal component should be reduced + proj_before = torch.einsum("bsh,h->bs", input_hidden, d).abs().mean() + proj_after = torch.einsum("bsh,h->bs", output, d).abs().mean() + assert proj_after < proj_before, ( + f"Steering should reduce refusal projection: before={proj_before:.3f}, after={proj_after:.3f}" + ) + + # Cleanup + for hook in pipeline._steering_hooks: + hook.remove() + + def test_expert_transplant(self): + """Expert transplant should overwrite safety expert weights with capability average.""" + hidden = 16 + n_experts = 4 + + class FakeExpert(torch.nn.Module): + def __init__(self): + super().__init__() + 
self.down_proj = torch.nn.Linear(hidden, hidden, bias=False) + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.gate = torch.nn.Linear(hidden, n_experts, bias=False) + self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)]) + + class FakeLayer(torch.nn.Module): + def __init__(self): + super().__init__() + self.self_attn = torch.nn.Module() + self.self_attn.o_proj = torch.nn.Linear(hidden, hidden, bias=False) + self.mlp = FakeMoE() + + layer = FakeLayer() + layers = torch.nn.ModuleList([layer]) + torch.manual_seed(42) + for p in layer.parameters(): + p.data = torch.randn_like(p.data) + + # Save original safety expert weight + orig_safety0 = layer.mlp.experts[0].down_proj.weight.data.clone() + # Save capability expert weights for computing expected mean + # With top-third classification (n_experts // 3 = 1), only expert 0 + # is safety; experts 1, 2, 3 are all capability. + cap1 = layer.mlp.experts[1].down_proj.weight.data.clone() + cap2 = layer.mlp.experts[2].down_proj.weight.data.clone() + cap3 = layer.mlp.experts[3].down_proj.weight.data.clone() + expected_mean = (cap1 + cap2 + cap3) / 3.0 + + import obliteratus.abliterate as abl_module + from obliteratus.models.loader import ModelHandle + from transformers import GPT2Config + + config = GPT2Config(n_embd=hidden, n_head=2, n_layer=1, vocab_size=100, n_positions=64) + model = MagicMock() + model.parameters.return_value = iter([torch.zeros(1)]) + handle = ModelHandle(model=model, tokenizer=MagicMock(), config=config, model_name="test", task="causal_lm") + + pipeline = AbliterationPipeline(model_name="test", method="nuclear") + pipeline.handle = handle + pipeline._on_log = lambda m: None + pipeline._on_stage = lambda r: None + pipeline._strong_layers = [0] + # Experts 0,1 are safety (high affinity), 2,3 are capability + pipeline._expert_safety_scores = { + 0: [(0, 5.0), (1, 3.0), (2, -1.0), (3, -3.0)] + } + + orig_get_ffn = abl_module.get_ffn_module + 
abl_module.get_ffn_module = lambda lay, a: lay.mlp + try: + count = pipeline._transplant_expert_weights(layers) + finally: + abl_module.get_ffn_module = orig_get_ffn + + assert count >= 1, f"Should blend at least 1 weight (top-third safety expert), got {count}" + + # Safety expert 0 should be a 10% blend toward capability mean + # (nuclear default transplant_blend=0.10) + # new = 0.90 * original + 0.10 * capability_mean + blend = pipeline.transplant_blend # 0.10 + expected_blend = (1.0 - blend) * orig_safety0 + blend * expected_mean + transplanted = layer.mlp.experts[0].down_proj.weight.data + assert torch.allclose(transplanted, expected_blend, atol=1e-4), ( + f"Safety expert weight should be {blend:.0%} blended toward capability mean" + ) + + # Capability expert 2 should be unchanged + assert torch.allclose(layer.mlp.experts[2].down_proj.weight.data, cap2, atol=1e-6), ( + "Capability expert should be unchanged" + ) + + def test_gather_state_dict_raises_on_missing_offload(self): + """Should raise RuntimeError (not silently corrupt) when offload dir is missing.""" + from obliteratus.models.loader import ModelHandle + from transformers import GPT2Config + + config = GPT2Config(n_embd=8, n_head=2, n_layer=1, vocab_size=100, n_positions=64) + + # Create a fake model whose state_dict returns a meta tensor + fake_model = MagicMock() + meta_tensor = torch.empty(4, 8, device="meta") + fake_model.state_dict.return_value = {"layer.weight": meta_tensor} + + handle = ModelHandle( + model=fake_model, tokenizer=MagicMock(), config=config, + model_name="test", task="causal_lm", + ) + handle._offload_dir = "/nonexistent/path" + + pipeline = AbliterationPipeline(model_name="test", method="nuclear") + pipeline.handle = handle + pipeline._on_log = lambda m: None + pipeline._on_stage = lambda r: None + + with pytest.raises(RuntimeError, match="bricked checkpoint"): + pipeline._gather_state_dict() + + +# --------------------------------------------------------------------------- +# 
class TestKneeDetection:
    """Checks for ``AbliterationPipeline._select_layers_knee``.

    Each test passes a list of ``(layer_index, norm)`` pairs and inspects the
    layer indices that come back.
    """

    def test_empty_input(self):
        """An empty list yields an empty selection."""
        assert AbliterationPipeline._select_layers_knee([]) == []

    def test_two_layers(self):
        """With only two layers, both are selected."""
        picked = AbliterationPipeline._select_layers_knee([(0, 5.0), (1, 3.0)])
        assert set(picked) == {0, 1}

    def test_clear_knee(self):
        """Layers with a sharp dropoff should be separated by knee detection."""
        strong_cluster = [(14, 10.0), (15, 9.5), (13, 9.0)]
        weak_tail = [(16, 2.0), (12, 1.5), (17, 1.0), (11, 0.5), (18, 0.2), (10, 0.1)]
        picked = AbliterationPipeline._select_layers_knee(strong_cluster + weak_tail)

        # Every member of the strong cluster must be kept ...
        for layer_idx in (14, 15, 13):
            assert layer_idx in picked
        # ... while the selection must stop well short of all 9 layers.
        assert len(picked) <= 5

    def test_minimum_threshold_filters_noise(self):
        """Layers below 10% of max should be filtered out."""
        # Layer 1 (0.5) sits at 5% of the maximum (10.0). Only the presence of
        # the dominant layer is asserted here.
        picked = AbliterationPipeline._select_layers_knee([(0, 10.0), (1, 0.5)])
        assert 0 in picked

    def test_all_equal_norms(self):
        """When all norms are equal, should select all (or most)."""
        flat_profile = list(zip(range(5), [5.0] * 5))
        picked = AbliterationPipeline._select_layers_knee(flat_profile)
        assert len(picked) >= 1
class TestDistillBasic:
    """Distillation with the "basic" method on the shared ``handle`` fixture."""

    def test_single_direction(self, handle):
        """Basic method: single refusal direction via difference-in-means."""
        from obliteratus.strategies.utils import get_layer_modules

        settings = {
            "model_name": "test",
            "method": "basic",
            "harmful_prompts": ["bad prompt"],
            "harmless_prompts": ["good prompt"],
        }
        pipeline = AbliterationPipeline(**settings)
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        _make_varied_tokenizer(handle)

        pipeline._probe()
        pipeline._distill()

        # One direction per layer of the model.
        layer_count = len(get_layer_modules(handle))
        assert len(pipeline.refusal_directions) == layer_count

        for layer_idx, direction in pipeline.refusal_directions.items():
            # Each direction is unit length ...
            deviation = abs(direction.norm().item() - 1.0)
            assert deviation < 1e-4
            # ... and its subspace holds exactly one row: (1, hidden_dim).
            subspace = pipeline.refusal_subspaces[layer_idx]
            assert subspace.shape[0] == 1
class TestExcise:
    """Run probe -> distill -> excise on the shared ``handle`` fixture."""

    def test_excise_basic(self, handle):
        """Basic method should modify weights.

        Snapshots every layer parameter before the pipeline runs and passes
        when at least one parameter differs afterwards (``atol=1e-6``).
        """
        from obliteratus.strategies.utils import get_layer_modules

        pipeline = AbliterationPipeline(
            model_name="test",
            method="basic",
            harmful_prompts=["bad prompt"],
            harmless_prompts=["good prompt"],
        )
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        _make_varied_tokenizer(handle)

        layers = get_layer_modules(handle)
        original_weights = {
            (idx, name): param.data.clone()
            for idx, layer in enumerate(layers)
            for name, param in layer.named_parameters()
        }

        pipeline._probe()
        pipeline._distill()
        pipeline._excise()

        # any() stops at the first changed parameter across all layers; the
        # previous nested loop's `break` only left the inner loop.
        any_changed = any(
            not torch.allclose(original_weights[(idx, name)], param.data, atol=1e-6)
            for idx, layer in enumerate(layers)
            for name, param in layer.named_parameters()
        )

        assert any_changed, "Excise should modify at least some weights"

    def test_excise_advanced_norm_preserving(self, handle):
        """Advanced method should run end to end and select strong layers.

        NOTE(review): despite the name, this test does not measure weight
        norms or weight changes; it only checks that the pipeline completes
        and leaves ``_strong_layers`` non-empty.
        """
        pipeline = AbliterationPipeline(
            model_name="test",
            method="advanced",
            harmful_prompts=["bad prompt"],
            harmless_prompts=["good prompt"],
        )
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        _make_varied_tokenizer(handle)

        pipeline._probe()
        pipeline._distill()
        pipeline._excise()

        # At least one layer must have been selected for excision.
        assert len(pipeline._strong_layers) > 0
class TestCLI:
    """Parsing checks against a locally built ``abliterate`` sub-command parser."""

    @staticmethod
    def _abliterate_cli():
        """Return ``(root_parser, abliterate_subparser)`` with only ``model`` declared."""
        import argparse

        root = argparse.ArgumentParser()
        commands = root.add_subparsers(dest="command")
        sub = commands.add_parser("abliterate")
        sub.add_argument("model", type=str)
        return root, sub

    def test_abliterate_parser_with_method(self):
        """Test that the abliterate subcommand parses method correctly."""
        root, sub = self._abliterate_cli()

        # Options are declared in the same order as the hand-written parser.
        option_specs = (
            ("--output-dir", {"type": str, "default": None}),
            ("--device", {"type": str, "default": "auto"}),
            ("--dtype", {"type": str, "default": "float16"}),
            ("--method", {"type": str, "default": "advanced",
                          "choices": ["basic", "advanced", "aggressive"]}),
            ("--n-directions", {"type": int, "default": None}),
            ("--regularization", {"type": float, "default": None}),
            ("--refinement-passes", {"type": int, "default": None}),
        )
        for flag, spec in option_specs:
            sub.add_argument(flag, **spec)

        argv = ["abliterate", "gpt2", "--method", "aggressive", "--n-directions", "6"]
        parsed = root.parse_args(argv)

        assert parsed.command == "abliterate"
        assert parsed.model == "gpt2"
        assert parsed.method == "aggressive"
        assert parsed.n_directions == 6
        # Not given on the command line, so the declared default applies.
        assert parsed.dtype == "float16"

    def test_default_method(self):
        """Default method should be advanced."""
        root, sub = self._abliterate_cli()
        sub.add_argument("--method", type=str, default="advanced")

        parsed = root.parse_args(["abliterate", "gpt2"])
        assert parsed.method == "advanced"
class TestRouterProfilingHooks:
    """Test _install_router_profiling_hooks."""

    def _make_moe_pipeline_and_layers(self, hidden=16, n_experts=4):
        """Create a pipeline with a fake MoE model for router profiling tests.

        Side effect: replaces ``obliteratus.abliterate.get_ffn_module`` with a
        lambda returning ``layer.mlp``. The caller must restore the original,
        which is returned as the last tuple element.

        Returns:
            ``(pipeline, layers, layer, abl_module, orig_get_ffn)`` where
            ``layers`` is a one-element ``ModuleList`` wrapping ``layer``.
        """
        from obliteratus.models.loader import ModelHandle
        from transformers import GPT2Config

        class FakeExpert(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.down_proj = torch.nn.Linear(hidden, hidden, bias=False)

        class FakeMoE(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.gate = torch.nn.Linear(hidden, n_experts, bias=False)
                self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)])

            def forward(self, x):
                return x

        class FakeLayer(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.self_attn = torch.nn.Module()
                self.self_attn.o_proj = torch.nn.Linear(hidden, hidden, bias=False)
                self.mlp = FakeMoE()

            def forward(self, x):
                return (x,)

        config = GPT2Config(n_embd=hidden, n_head=2, n_layer=1, vocab_size=100, n_positions=64)
        model = MagicMock()
        model.parameters.return_value = iter([torch.zeros(1)])
        handle = ModelHandle(model=model, tokenizer=MagicMock(), config=config, model_name="test", task="causal_lm")

        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline.handle = handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None

        layer = FakeLayer()
        layers = torch.nn.ModuleList([layer])

        # Monkey-patch get_ffn_module
        import obliteratus.abliterate as abl_module
        orig_get_ffn = abl_module.get_ffn_module
        abl_module.get_ffn_module = lambda lay, a: lay.mlp

        return pipeline, layers, layer, abl_module, orig_get_ffn

    def test_hooks_installed(self):
        """Should install hooks on MoE router modules."""
        pipeline, layers, layer, abl_module, orig_get_ffn = self._make_moe_pipeline_and_layers()
        # Bound before the try so the finally block cannot hit an unbound
        # name (and mask the real error) if installation itself raises.
        hooks = []
        try:
            hooks = pipeline._install_router_profiling_hooks(layers)
            assert len(hooks) == 1
            assert 0 in pipeline._routing_harmful
            assert 0 in pipeline._routing_harmless
        finally:
            for h in hooks:
                h.remove()
            abl_module.get_ffn_module = orig_get_ffn

    def test_hooks_record_logits(self):
        """Hooks should record router logits during forward passes."""
        pipeline, layers, layer, abl_module, orig_get_ffn = self._make_moe_pipeline_and_layers()
        # See test_hooks_installed: keeps cleanup safe if installation raises.
        hooks = []
        try:
            hooks = pipeline._install_router_profiling_hooks(layers)

            # Simulate harmful forward pass
            pipeline._routing_is_harmful = True
            x = torch.randn(1, 5, 16)
            layer.mlp.gate(x)  # triggers hook

            assert len(pipeline._routing_harmful[0]) == 1
            assert pipeline._routing_harmful[0][0].shape[0] == 4  # n_experts

            # Simulate harmless forward pass
            pipeline._routing_is_harmful = False
            layer.mlp.gate(x)

            assert len(pipeline._routing_harmless[0]) == 1
        finally:
            for h in hooks:
                h.remove()
            abl_module.get_ffn_module = orig_get_ffn

    def test_no_handle_returns_empty(self):
        """Should return empty list when handle is None."""
        pipeline = AbliterationPipeline(model_name="test", method="surgical")
        pipeline.handle = None
        hooks = pipeline._install_router_profiling_hooks(torch.nn.ModuleList())
        assert hooks == []
torch.randn(n_experts) + hl[0] += 2.0 # bias expert 0 for harmful + h_logits.append(hl) + sl = torch.randn(n_experts) + sl[3] += 2.0 # bias expert 3 for harmless + s_logits.append(sl) + + pipeline._routing_harmful = {0: h_logits} + pipeline._routing_harmless = {0: s_logits} + + # Simulate per-prompt activations with harmful/harmless separation + refusal_dir = torch.randn(hidden) + refusal_dir = refusal_dir / refusal_dir.norm() + + h_acts = [torch.randn(hidden) + 1.5 * refusal_dir for _ in range(10)] + s_acts = [torch.randn(hidden) - 1.5 * refusal_dir for _ in range(10)] + pipeline._harmful_acts = {0: h_acts} + pipeline._harmless_acts = {0: s_acts} + + pipeline._compute_expert_granular_directions() + + # Should have computed expert directions for layer 0 + assert 0 in pipeline._expert_directions + assert len(pipeline._expert_directions[0]) > 0 + + # Should have dynamic safety scores + assert 0 in pipeline._expert_safety_scores + scores = pipeline._expert_safety_scores[0] + assert len(scores) == n_experts + # Expert 0 should have higher safety score (more activated for harmful) + expert_0_score = next(s for eid, s in scores if eid == 0) + expert_3_score = next(s for eid, s in scores if eid == 3) + assert expert_0_score > expert_3_score, ( + f"Expert 0 should have higher safety score: {expert_0_score} vs {expert_3_score}" + ) + + def test_directions_are_unit_vectors(self): + """Per-expert directions should be unit normalized.""" + hidden = 16 + n_experts = 4 + + pipeline = AbliterationPipeline(model_name="test", method="surgical") + pipeline._on_log = lambda m: None + pipeline._strong_layers = [0] + + torch.manual_seed(42) + h_logits = [torch.randn(n_experts) for _ in range(10)] + s_logits = [torch.randn(n_experts) for _ in range(10)] + pipeline._routing_harmful = {0: h_logits} + pipeline._routing_harmless = {0: s_logits} + pipeline._harmful_acts = {0: [torch.randn(hidden) + torch.ones(hidden) for _ in range(10)]} + pipeline._harmless_acts = {0: [torch.randn(hidden) - 
torch.ones(hidden) for _ in range(10)]} + + pipeline._compute_expert_granular_directions() + + if 0 in pipeline._expert_directions: + for ei, d in pipeline._expert_directions[0].items(): + assert abs(d.norm().item() - 1.0) < 1e-4, ( + f"Expert {ei} direction norm={d.norm().item()}, expected 1.0" + ) + + def test_skips_when_no_routing_data(self): + """Should skip gracefully when no routing data is available.""" + pipeline = AbliterationPipeline(model_name="test", method="surgical") + pipeline._on_log = lambda m: None + pipeline._routing_harmful = {} + pipeline._routing_harmless = {} + + pipeline._compute_expert_granular_directions() + + assert len(pipeline._expert_directions) == 0 + + def test_skips_expert_with_low_routing_weight(self): + """Experts with insufficient routing weight should not get directions.""" + hidden = 16 + + pipeline = AbliterationPipeline(model_name="test", method="surgical") + pipeline._on_log = lambda m: None + pipeline._strong_layers = [0] + + # Create routing logits where expert 3 is never selected (very low) + h_logits = [] + s_logits = [] + for _ in range(3): + hl = torch.tensor([5.0, 5.0, 5.0, -100.0]) # expert 3 never routed + h_logits.append(hl) + sl = torch.tensor([5.0, 5.0, 5.0, -100.0]) + s_logits.append(sl) + + pipeline._routing_harmful = {0: h_logits} + pipeline._routing_harmless = {0: s_logits} + + torch.manual_seed(42) + pipeline._harmful_acts = {0: [torch.randn(hidden) for _ in range(3)]} + pipeline._harmless_acts = {0: [torch.randn(hidden) for _ in range(3)]} + + pipeline._compute_expert_granular_directions() + + # Expert 3 should NOT have a direction (routing weight too low) + if 0 in pipeline._expert_directions: + assert 3 not in pipeline._expert_directions[0], ( + "Expert with near-zero routing weight should not get a direction" + ) + + +class TestProjectMoEExpertsGranular: + """Test _project_moe_experts_granular (ModuleList path).""" + + def _make_direction(self, hidden_dim=16): + d = torch.randn(hidden_dim, 1) + return d 
/ d.norm() + + def test_per_expert_directions_applied(self): + """Each expert should use its own direction when available.""" + hidden = 16 + n_experts = 4 + + class FakeExpert(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Linear(hidden, 32, bias=False) + self.up_proj = torch.nn.Linear(hidden, 32, bias=False) + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.gate = torch.nn.Linear(hidden, n_experts, bias=False) + self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)]) + + moe = FakeMoE() + torch.manual_seed(42) + for p in moe.parameters(): + p.data = torch.randn_like(p.data) + + shared_dir = self._make_direction(hidden) + + # Create distinct per-expert directions + expert_dirs = {} + for ei in range(n_experts): + d = torch.randn(hidden) + d = d / d.norm() + expert_dirs[ei] = d + + pipeline = AbliterationPipeline(model_name="test", method="surgical") + pipeline._on_log = lambda m: None + pipeline._expert_directions = {0: expert_dirs} + + # Save originals + orig_weights = { + ei: moe.experts[ei].down_proj.weight.data.clone() + for ei in range(n_experts) + } + + count = pipeline._project_moe_experts_granular( + moe, shared_dir, layer_idx=0, + ) + + assert count > 0, "Should project some weights" + + # All experts should be modified + for ei in range(n_experts): + assert not torch.allclose( + moe.experts[ei].down_proj.weight.data, orig_weights[ei] + ), f"Expert {ei} should be modified" + + def test_falls_back_to_shared_direction(self): + """Experts without per-expert direction should use shared direction.""" + hidden = 16 + n_experts = 4 + + class FakeExpert(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Linear(hidden, 32, bias=False) + self.up_proj = torch.nn.Linear(hidden, 32, bias=False) + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.gate = torch.nn.Linear(hidden, n_experts, bias=False) + 
self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)]) + + moe = FakeMoE() + torch.manual_seed(42) + for p in moe.parameters(): + p.data = torch.randn_like(p.data) + + shared_dir = self._make_direction(hidden) + + # Only expert 0 has a per-expert direction + expert_dirs = {0: torch.randn(hidden).div_(torch.randn(hidden).norm())} + expert_dirs[0] = expert_dirs[0] / expert_dirs[0].norm() + + pipeline = AbliterationPipeline(model_name="test", method="surgical") + pipeline._on_log = lambda m: None + pipeline._expert_directions = {0: expert_dirs} + + orig_e1 = moe.experts[1].down_proj.weight.data.clone() + + pipeline._project_moe_experts_granular( + moe, shared_dir, layer_idx=0, + ) + + # Experts 1,2,3 should be modified (using shared direction) + assert not torch.allclose(moe.experts[1].down_proj.weight.data, orig_e1), \ + "Expert 1 should use shared direction fallback" + + def test_router_uses_shared_direction(self): + """Router should always use the shared direction, not per-expert.""" + hidden = 16 + n_experts = 4 + + class FakeExpert(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Linear(hidden, 32, bias=False) + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.gate = torch.nn.Linear(hidden, n_experts, bias=False) + self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)]) + + moe = FakeMoE() + shared_dir = self._make_direction(hidden) + + pipeline = AbliterationPipeline(model_name="test", method="surgical") + pipeline._on_log = lambda m: None + pipeline._expert_directions = {0: {0: torch.randn(hidden)}} + + orig_gate = moe.gate.weight.data.clone() + + pipeline._project_moe_experts_granular(moe, shared_dir, layer_idx=0) + + # Gate should be projected + assert not torch.allclose(moe.gate.weight.data, orig_gate), \ + "Router should be projected with shared direction" + + # Gate's projection onto shared direction should be near zero + proj = 
(moe.gate.weight.data @ shared_dir).norm().item() + assert proj < 1e-4, f"Router should have shared dir removed, proj={proj}" + + def test_shared_expert_uses_shared_direction(self): + """Shared expert should always use the shared direction.""" + hidden = 16 + + class FakeExpert(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Linear(hidden, 32, bias=False) + self.up_proj = torch.nn.Linear(hidden, 32, bias=False) + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.gate = torch.nn.Linear(hidden, 2, bias=False) + self.shared_expert = torch.nn.Module() + self.shared_expert.down_proj = torch.nn.Linear(hidden, 32, bias=False) + self.shared_expert.up_proj = torch.nn.Linear(hidden, 32, bias=False) + self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(2)]) + + moe = FakeMoE() + shared_dir = self._make_direction(hidden) + + pipeline = AbliterationPipeline(model_name="test", method="surgical") + pipeline._on_log = lambda m: None + pipeline._expert_directions = {0: {0: torch.randn(hidden)}} + + orig_shared = moe.shared_expert.down_proj.weight.data.clone() + + pipeline._project_moe_experts_granular(moe, shared_dir, layer_idx=0) + + assert not torch.allclose(moe.shared_expert.down_proj.weight.data, orig_shared), \ + "Shared expert should be projected" + + +class TestProjectFused3DGranular: + """Test _project_fused_3d_granular for fused 3D expert tensors.""" + + def test_per_expert_directions_on_fused(self): + """Each expert slice should use its own direction.""" + hidden = 16 + intermediate = 32 + n_experts = 4 + + class FusedExperts(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Parameter(torch.randn(n_experts, intermediate, hidden)) + + container = FusedExperts() + torch.manual_seed(42) + + shared_dir = torch.randn(hidden, 1) + shared_dir = shared_dir / shared_dir.norm() + + # Per-expert directions + expert_dirs = {} + for ei in range(n_experts): + d 
= torch.randn(hidden) + d = d / d.norm() + expert_dirs[ei] = d + + orig_data = container.down_proj.data.clone() + + count = AbliterationPipeline._project_fused_3d_granular( + container, shared_dir, expert_dirs, ["down_proj"], + norm_preserve=False, scale=1.0, + ) + + assert count == n_experts, f"Should project {n_experts} experts, got {count}" + + # Each expert should be modified + for ei in range(n_experts): + assert not torch.allclose( + container.down_proj.data[ei], orig_data[ei] + ), f"Expert {ei} should be modified" + + def test_fallback_to_shared_on_fused(self): + """Experts without per-expert direction should use shared direction.""" + hidden = 16 + intermediate = 32 + n_experts = 4 + + class FusedExperts(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Parameter(torch.randn(n_experts, intermediate, hidden)) + + container = FusedExperts() + torch.manual_seed(42) + + shared_dir = torch.randn(hidden, 1) + shared_dir = shared_dir / shared_dir.norm() + + # Only expert 0 has a direction + expert_dirs = {0: torch.randn(hidden).div_(1.0)} + expert_dirs[0] = expert_dirs[0] / expert_dirs[0].norm() + + orig_data = container.down_proj.data.clone() + + count = AbliterationPipeline._project_fused_3d_granular( + container, shared_dir, expert_dirs, ["down_proj"], + norm_preserve=False, scale=1.0, + ) + + assert count == n_experts + # All experts should be modified (experts 1-3 use shared dir) + for ei in range(n_experts): + assert not torch.allclose( + container.down_proj.data[ei], orig_data[ei] + ), f"Expert {ei} should be modified" + + def test_norm_preserve_on_fused(self): + """Fused 3D with norm_preserve should maintain per-expert norms.""" + hidden = 16 + intermediate = 32 + n_experts = 4 + + class FusedExperts(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Parameter(torch.randn(n_experts, intermediate, hidden)) + + container = FusedExperts() + torch.manual_seed(42) + + shared_dir = 
torch.randn(hidden, 1) + shared_dir = shared_dir / shared_dir.norm() + + expert_dirs = {} + for ei in range(n_experts): + d = torch.randn(hidden) + expert_dirs[ei] = d / d.norm() + + orig_norms = [container.down_proj.data[i].norm().item() for i in range(n_experts)] + + AbliterationPipeline._project_fused_3d_granular( + container, shared_dir, expert_dirs, ["down_proj"], + norm_preserve=True, scale=1.0, + ) + + for i in range(n_experts): + new_norm = container.down_proj.data[i].norm().item() + assert abs(orig_norms[i] - new_norm) < 1e-3, ( + f"Expert {i} norm not preserved: {orig_norms[i]:.4f} vs {new_norm:.4f}" + ) + + def test_skips_non_3d_params(self): + """Should skip parameters that are not 3-dimensional.""" + hidden = 16 + + class FlatExperts(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Parameter(torch.randn(32, hidden)) + + container = FlatExperts() + shared_dir = torch.randn(hidden, 1) + shared_dir = shared_dir / shared_dir.norm() + + count = AbliterationPipeline._project_fused_3d_granular( + container, shared_dir, {}, ["down_proj"], + norm_preserve=False, scale=1.0, + ) + assert count == 0 + + +class TestEGAExciseIntegration: + """Test that EGA integrates properly in the excise stage path.""" + + def test_ega_pipeline_flags(self): + """Pipeline with surgical method should enable per_expert_directions.""" + pipeline = AbliterationPipeline(model_name="test", method="surgical") + assert pipeline.per_expert_directions is True + + def test_ega_only_on_primary_direction(self): + """EGA should only apply for dir_idx==0, not higher SVD directions.""" + # This is enforced by the `and dir_idx == 0` check in _excise + # We verify the code structure exists + from obliteratus.abliterate import AbliterationPipeline + import inspect + source = inspect.getsource(AbliterationPipeline._excise_inner) + assert "dir_idx == 0" in source, "EGA should only apply for primary direction" + assert "_project_moe_experts_granular" in source, 
"EGA method should be called in excise" + + def test_ega_distill_integration(self): + """EGA should be called during distill when per_expert_directions is enabled.""" + from obliteratus.abliterate import AbliterationPipeline + import inspect + source = inspect.getsource(AbliterationPipeline._distill) + assert "_compute_expert_granular_directions" in source + assert "per_expert_directions" in source + + def test_nuclear_method_enables_ega(self): + """Nuclear method should also enable per_expert_directions.""" + cfg = METHODS["nuclear"] + assert cfg["per_expert_directions"] is True + pipeline = AbliterationPipeline(model_name="test", method="nuclear") + assert pipeline.per_expert_directions is True + + def test_basic_method_disables_ega(self): + """Basic method should not enable per_expert_directions.""" + cfg = METHODS["basic"] + assert cfg.get("per_expert_directions", False) is False + + def test_inverted_method_enables_ega(self): + """Inverted method should enable per_expert_directions.""" + cfg = METHODS["inverted"] + assert cfg["per_expert_directions"] is True + + def test_ega_with_routing_data_end_to_end(self): + """End-to-end: EGA computes directions and granular projection modifies weights.""" + hidden = 16 + n_experts = 4 + + class FakeExpert(torch.nn.Module): + def __init__(self): + super().__init__() + self.down_proj = torch.nn.Linear(hidden, 32, bias=False) + self.up_proj = torch.nn.Linear(hidden, 32, bias=False) + + class FakeMoE(torch.nn.Module): + def __init__(self): + super().__init__() + self.gate = torch.nn.Linear(hidden, n_experts, bias=False) + self.experts = torch.nn.ModuleList([FakeExpert() for _ in range(n_experts)]) + + moe = FakeMoE() + torch.manual_seed(42) + for p in moe.parameters(): + p.data = torch.randn_like(p.data) + + pipeline = AbliterationPipeline(model_name="test", method="surgical") + pipeline._on_log = lambda m: None + pipeline._on_stage = lambda r: None + pipeline._strong_layers = [0] + + # Simulate EGA routing data + h_logits = 
[torch.randn(n_experts) for _ in range(5)] + s_logits = [torch.randn(n_experts) for _ in range(5)] + pipeline._routing_harmful = {0: h_logits} + pipeline._routing_harmless = {0: s_logits} + + # Simulate activations with clear separation + refusal_dir = torch.randn(hidden) + refusal_dir = refusal_dir / refusal_dir.norm() + pipeline._harmful_acts = {0: [torch.randn(hidden) + 2 * refusal_dir for _ in range(5)]} + pipeline._harmless_acts = {0: [torch.randn(hidden) - 2 * refusal_dir for _ in range(5)]} + + # Step 1: compute EGA directions + pipeline._compute_expert_granular_directions() + assert 0 in pipeline._expert_directions + assert len(pipeline._expert_directions[0]) > 0 + + # Step 2: apply granular projection + shared_dir = torch.randn(hidden, 1) + shared_dir = shared_dir / shared_dir.norm() + + orig_expert0 = moe.experts[0].down_proj.weight.data.clone() + + count = pipeline._project_moe_experts_granular( + moe, shared_dir, layer_idx=0, + ) + + assert count > 0 + assert not torch.allclose(moe.experts[0].down_proj.weight.data, orig_expert0), \ + "Expert weights should be modified by EGA" diff --git a/tests/test_abliterate_extended.py b/tests/test_abliterate_extended.py new file mode 100644 index 0000000..ec45001 --- /dev/null +++ b/tests/test_abliterate_extended.py @@ -0,0 +1,302 @@ +"""Extended tests for novel abliteration pipeline features. 
+ +Tests the new capabilities added to the OBLITERATUS abliteration pipeline: +- Bias projection +- Chat template wrapping +- Method presets with new parameters +- True iterative refinement +- Whitened SVD integration +""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +import torch +from transformers import GPT2Config, GPT2LMHeadModel + +from obliteratus.abliterate import ( + METHODS, + AbliterationPipeline, +) +from obliteratus.models.loader import ModelHandle + + +def _make_tiny_handle(): + """Create a minimal ModelHandle with a tiny GPT-2 for testing.""" + config = GPT2Config( + vocab_size=1000, + n_positions=128, + n_embd=64, + n_layer=4, + n_head=2, + n_inner=256, + ) + model = GPT2LMHeadModel(config) + model.eval() + + tokenizer = MagicMock() + tokenizer.pad_token = "" + tokenizer.eos_token = "" + tokenizer.return_value = { + "input_ids": torch.randint(0, 1000, (1, 10)), + "attention_mask": torch.ones(1, 10, dtype=torch.long), + } + tokenizer.decode.return_value = "The capital of France is Paris, a beautiful city" + + handle = ModelHandle( + model=model, + tokenizer=tokenizer, + config=config, + model_name="gpt2-test", + task="causal_lm", + ) + handle.snapshot() + return handle + + +def _make_varied_tokenizer(handle): + """Set up a tokenizer mock that returns different tokens per call.""" + call_count = [0] + def mock_tokenizer(prompt, **kwargs): + call_count[0] += 1 + torch.manual_seed(call_count[0]) + return { + "input_ids": torch.randint(0, 1000, (1, 5)), + "attention_mask": torch.ones(1, 5, dtype=torch.long), + } + handle.tokenizer.side_effect = mock_tokenizer + + +# --------------------------------------------------------------------------- +# New method preset parameters +# --------------------------------------------------------------------------- + +class TestNewMethodPresets: + def test_basic_has_new_params(self): + cfg = METHODS["basic"] + assert "project_biases" in cfg + assert "use_chat_template" in cfg + assert 
"use_whitened_svd" in cfg + assert "true_iterative_refinement" in cfg + assert cfg["project_biases"] is False + assert cfg["use_chat_template"] is False + + def test_advanced_has_new_params(self): + cfg = METHODS["advanced"] + assert cfg["project_biases"] is True + assert cfg["use_chat_template"] is True + assert cfg["use_whitened_svd"] is False + assert cfg["true_iterative_refinement"] is False + + def test_aggressive_has_new_params(self): + cfg = METHODS["aggressive"] + assert cfg["project_biases"] is True + assert cfg["use_chat_template"] is True + assert cfg["use_whitened_svd"] is True + assert cfg["true_iterative_refinement"] is True + + +# --------------------------------------------------------------------------- +# Pipeline initialization with new parameters +# --------------------------------------------------------------------------- + +class TestNewPipelineInit: + def test_default_new_params(self): + pipeline = AbliterationPipeline(model_name="test-model") + # advanced method defaults + assert pipeline.project_biases is True + assert pipeline.use_chat_template is True + assert pipeline.use_whitened_svd is False + assert pipeline.true_iterative_refinement is False + + def test_basic_method_new_params(self): + pipeline = AbliterationPipeline(model_name="test-model", method="basic") + assert pipeline.project_biases is False + assert pipeline.use_chat_template is False + assert pipeline.use_whitened_svd is False + assert pipeline.true_iterative_refinement is False + + def test_aggressive_method_new_params(self): + pipeline = AbliterationPipeline(model_name="test-model", method="aggressive") + assert pipeline.project_biases is True + assert pipeline.use_chat_template is True + assert pipeline.use_whitened_svd is True + assert pipeline.true_iterative_refinement is True + + def test_explicit_overrides_new_params(self): + pipeline = AbliterationPipeline( + model_name="test-model", + method="basic", + project_biases=True, + use_chat_template=True, + 
use_whitened_svd=True, + true_iterative_refinement=True, + ) + assert pipeline.project_biases is True + assert pipeline.use_chat_template is True + assert pipeline.use_whitened_svd is True + assert pipeline.true_iterative_refinement is True + + +# --------------------------------------------------------------------------- +# Bias projection +# --------------------------------------------------------------------------- + +class TestBiasProjection: + def test_project_bias_removes_component(self): + """Bias projection should remove refusal direction component from bias.""" + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.o_proj = torch.nn.Linear(4, 4, bias=True) + + module = Wrapper() + torch.manual_seed(42) + module.o_proj.bias.data = torch.tensor([1.0, 2.0, 3.0, 4.0]) + + direction = torch.tensor([1.0, 0.0, 0.0, 0.0]).unsqueeze(-1) # unit vector along dim 0 + + count = AbliterationPipeline._project_bias(module, direction, ["o_proj"]) + assert count == 1 + + # The component along direction [1,0,0,0] was 1.0, should now be ~0 + new_bias = module.o_proj.bias.data + projection_onto_dir = (new_bias @ direction.squeeze()).item() + assert abs(projection_onto_dir) < 1e-5 + + # Other components should be unchanged + assert abs(new_bias[1].item() - 2.0) < 1e-5 + assert abs(new_bias[2].item() - 3.0) < 1e-5 + assert abs(new_bias[3].item() - 4.0) < 1e-5 + + def test_project_bias_no_bias(self): + """Should handle modules without bias gracefully.""" + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.o_proj = torch.nn.Linear(4, 4, bias=False) + + module = Wrapper() + direction = torch.randn(4, 1) + count = AbliterationPipeline._project_bias(module, direction, ["o_proj"]) + assert count == 0 + + def test_project_bias_no_matching_module(self): + """Should return 0 when no candidate names match.""" + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.something = torch.nn.Linear(4, 4, 
bias=True) + + module = Wrapper() + direction = torch.randn(4, 1) + count = AbliterationPipeline._project_bias(module, direction, ["o_proj"]) + assert count == 0 + + +# --------------------------------------------------------------------------- +# Chat template wrapping +# --------------------------------------------------------------------------- + +class TestChatTemplate: + def test_no_wrap_when_disabled(self): + """Should not wrap prompts when use_chat_template is False.""" + pipeline = AbliterationPipeline( + model_name="test-model", + method="basic", + use_chat_template=False, + ) + prompts = ["Hello", "World"] + result = pipeline._maybe_apply_chat_template(prompts) + assert result == prompts + + def test_no_wrap_without_handle(self): + """Should return raw prompts when handle is not set.""" + pipeline = AbliterationPipeline( + model_name="test-model", + use_chat_template=True, + ) + prompts = ["Hello"] + result = pipeline._maybe_apply_chat_template(prompts) + assert result == prompts + + def test_wraps_with_template(self): + """Should wrap prompts when tokenizer has apply_chat_template.""" + pipeline = AbliterationPipeline( + model_name="test-model", + use_chat_template=True, + ) + handle = MagicMock() + tokenizer = MagicMock() + + def mock_apply(messages, tokenize=False, add_generation_prompt=True): + return f"{messages[0]['content']}" + + tokenizer.apply_chat_template = mock_apply + handle.tokenizer = tokenizer + pipeline.handle = handle + pipeline._on_log = lambda m: None + + result = pipeline._maybe_apply_chat_template(["Hello"]) + assert "Hello" in result[0] + + def test_fallback_when_no_template(self): + """Should fall back to raw prompts when template is not configured.""" + pipeline = AbliterationPipeline( + model_name="test-model", + use_chat_template=True, + ) + handle = MagicMock() + tokenizer = MagicMock() + tokenizer.apply_chat_template.side_effect = Exception("No template") + handle.tokenizer = tokenizer + pipeline.handle = handle + 
pipeline._on_log = lambda m: None + + result = pipeline._maybe_apply_chat_template(["Hello"]) + assert result == ["Hello"] + + +# --------------------------------------------------------------------------- +# Metadata includes new fields +# --------------------------------------------------------------------------- + +class TestMetadata: + def test_rebirth_includes_new_config(self): + """Metadata should include all new configuration parameters.""" + import json + handle = _make_tiny_handle() + pipeline = AbliterationPipeline( + model_name="test-model", + method="aggressive", + ) + pipeline.handle = handle + pipeline._on_log = lambda m: None + pipeline._on_stage = lambda r: None + pipeline._strong_layers = [0] + pipeline._quality_metrics = {"perplexity": 8.5, "coherence": 1.0} + + handle.model.save_pretrained = MagicMock() + handle.tokenizer.save_pretrained = MagicMock() + + import tempfile + from pathlib import Path + with tempfile.TemporaryDirectory() as tmp: + pipeline.output_dir = Path(tmp) / "output" + pipeline._rebirth() + + metadata = json.loads( + (pipeline.output_dir / "abliteration_metadata.json").read_text() + ) + cfg = metadata["method_config"] + assert "project_biases" in cfg + assert "use_chat_template" in cfg + assert "use_whitened_svd" in cfg + assert "true_iterative_refinement" in cfg + assert cfg["project_biases"] is True + assert cfg["use_whitened_svd"] is True + + # Should have more references now + assert len(metadata["references"]) >= 5 + assert any("OBLITERATUS" in r for r in metadata["references"]) diff --git a/tests/test_abliteration_math.py b/tests/test_abliteration_math.py new file mode 100644 index 0000000..6856f39 --- /dev/null +++ b/tests/test_abliteration_math.py @@ -0,0 +1,300 @@ +"""Mathematical verification that abliteration actually removes refusal directions. + +These tests verify the core linear algebra claims WITHOUT mocks: + 1. Projection removes the target direction from weight matrices + 2. 
Norm-preserving projection maintains weight magnitude + 3. Multi-direction SVD extracts the correct subspace + 4. Whitened SVD produces orthogonal directions + 5. Random directions do NOT have the same effect (negative control) + +Unlike the other test files, these use real tensors and verify mathematical +properties directly — no MagicMock, no mocked tokenizers. +""" + +from __future__ import annotations + + +import torch + + +class TestProjectionRemovesDirection: + """Verify that orthogonal projection removes the target direction.""" + + def test_single_direction_projection(self): + """After projecting out direction d from weight W, + W_proj @ d should be approximately zero.""" + torch.manual_seed(42) + hidden = 256 + out_dim = 128 + + W = torch.randn(out_dim, hidden) + d = torch.randn(hidden) + d = d / d.norm() + + # Project out d: W_proj = W - (W @ d) @ d^T + proj = W @ d # (out_dim,) + W_proj = W - proj.unsqueeze(1) * d.unsqueeze(0) + + # Verify: W_proj @ d should be ~0 + residual = W_proj @ d + assert residual.abs().max().item() < 1e-5, f"Residual too large: {residual.abs().max()}" + + def test_projection_preserves_orthogonal_components(self): + """Projection should NOT change components orthogonal to d.""" + torch.manual_seed(42) + hidden = 256 + out_dim = 128 + + W = torch.randn(out_dim, hidden) + d = torch.randn(hidden) + d = d / d.norm() + + # Create a vector orthogonal to d + v = torch.randn(hidden) + v = v - (v @ d) * d # Gram-Schmidt + v = v / v.norm() + + # Project out d + proj = W @ d + W_proj = W - proj.unsqueeze(1) * d.unsqueeze(0) + + # W @ v should equal W_proj @ v (orthogonal component unchanged) + original = W @ v + projected = W_proj @ v + diff = (original - projected).abs().max().item() + assert diff < 1e-5, f"Orthogonal component changed by {diff}" + + def test_multi_direction_subspace_removal(self): + """Projecting out a k-dimensional subspace should remove all k directions.""" + torch.manual_seed(42) + hidden = 256 + out_dim = 128 + k = 4 
+ + W = torch.randn(out_dim, hidden) + # Create orthonormal subspace + Q, _ = torch.linalg.qr(torch.randn(hidden, k)) + subspace = Q.T # (k, hidden) + + # Project out subspace: W_proj = W - W @ Q @ Q^T + W_proj = W - (W @ Q) @ Q.T + + # Verify: W_proj @ subspace^T should be ~0 for all directions + residual = W_proj @ subspace.T # (out_dim, k) + assert residual.abs().max().item() < 1e-5, f"Subspace residual: {residual.abs().max()}" + + def test_double_projection_is_idempotent(self): + """Projecting twice should give the same result as projecting once.""" + torch.manual_seed(42) + hidden = 256 + out_dim = 128 + + W = torch.randn(out_dim, hidden) + d = torch.randn(hidden) + d = d / d.norm() + + # Project once + proj1 = W @ d + W1 = W - proj1.unsqueeze(1) * d.unsqueeze(0) + + # Project twice + proj2 = W1 @ d + W2 = W1 - proj2.unsqueeze(1) * d.unsqueeze(0) + + diff = (W1 - W2).abs().max().item() + assert diff < 1e-5, f"Second projection changed weights by {diff}" + + +class TestNormPreservation: + """Verify that norm-preserving projection maintains weight magnitude.""" + + def test_norm_preserving_projection(self): + """Biprojected norm-preserving abliteration should keep ||W|| constant.""" + torch.manual_seed(42) + hidden = 256 + out_dim = 128 + + W = torch.randn(out_dim, hidden) + d = torch.randn(hidden) + d = d / d.norm() + + # Standard projection + proj_coeff = W @ d + W_proj = W - proj_coeff.unsqueeze(1) * d.unsqueeze(0) + + # Norm-preserving rescaling (per-row) + row_norms_orig = W.norm(dim=1, keepdim=True).clamp(min=1e-8) + row_norms_proj = W_proj.norm(dim=1, keepdim=True).clamp(min=1e-8) + W_norm_preserved = W_proj * (row_norms_orig / row_norms_proj) + + # Direction is still removed + residual = W_norm_preserved @ d + # Norm-preserving can't guarantee zero projection (it rescales), + # but projection should be significantly reduced + original_proj = (W @ d).abs().mean().item() + preserved_proj = residual.abs().mean().item() + assert preserved_proj < 
original_proj * 0.5, \ + f"Norm-preserved projection {preserved_proj} not much less than original {original_proj}" + + # Row norms are preserved + row_diff = (W_norm_preserved.norm(dim=1) - W.norm(dim=1)).abs().max().item() + assert row_diff < 1e-5, f"Row norms changed by {row_diff}" + + +class TestSVDDirectionExtraction: + """Verify that SVD on the difference matrix extracts the refusal direction.""" + + def test_planted_direction_recovery(self): + """Plant a known direction in the difference and verify SVD recovers it.""" + torch.manual_seed(42) + n_samples = 50 + hidden = 256 + + # Plant a known refusal direction + true_direction = torch.randn(hidden) + true_direction = true_direction / true_direction.norm() + + # Harmful activations = harmless + signal along true_direction + noise + harmless = torch.randn(n_samples, hidden) * 0.5 + signal_strength = 5.0 + harmful = harmless + signal_strength * true_direction.unsqueeze(0) + torch.randn(n_samples, hidden) * 0.1 + + # Extract via SVD on difference + diff = harmful - harmless + U, S, Vh = torch.linalg.svd(diff, full_matrices=False) + extracted = Vh[0] + extracted = extracted / extracted.norm() + + # The extracted direction should align with the true direction + cosine = (extracted @ true_direction).abs().item() + assert cosine > 0.95, f"Cosine similarity {cosine:.3f} too low (expected > 0.95)" + + def test_multi_direction_recovery(self): + """Plant k directions and verify SVD recovers the subspace.""" + torch.manual_seed(42) + n_samples = 200 + hidden = 256 + k = 3 + + # Plant k orthogonal directions with varying per-sample strength + Q, _ = torch.linalg.qr(torch.randn(hidden, k)) + true_subspace = Q.T # (k, hidden) + + # Each sample gets a random mix of the k planted directions + harmless = torch.randn(n_samples, hidden) * 0.01 + coefficients = torch.randn(n_samples, k).abs() * 5.0 + signal = coefficients @ true_subspace # (n_samples, hidden) + harmful = harmless + signal + + diff = harmful - harmless + U, S, Vh = 
torch.linalg.svd(diff, full_matrices=False) + extracted_subspace = Vh[:k] # (k, hidden) + + # Check subspace overlap: project true directions into extracted subspace + for i in range(k): + proj = extracted_subspace @ true_subspace[i] + captured_variance = proj.norm().item() + assert captured_variance > 0.9, \ + f"Direction {i}: captured variance {captured_variance:.3f} too low" + + +class TestRandomDirectionBaseline: + """Verify that random directions do NOT have the same effect as learned ones.""" + + def test_random_direction_has_lower_projection(self): + """Random directions should project much less on harmful activations + than the true refusal direction.""" + torch.manual_seed(42) + n_samples = 50 + hidden = 256 + + # Create structured harmful vs harmless difference + true_dir = torch.randn(hidden) + true_dir = true_dir / true_dir.norm() + + harmless = torch.randn(n_samples, hidden) * 0.5 + harmful = harmless + 3.0 * true_dir.unsqueeze(0) + + harmful_mean = harmful.mean(dim=0) + + # True direction projection + true_proj = (harmful_mean @ true_dir).abs().item() + + # Random direction projections (seeds far from 42 to avoid collision) + random_projs = [] + for i in range(100): + rng = torch.Generator().manual_seed(10000 + i) + rand_dir = torch.randn(hidden, generator=rng) + rand_dir = rand_dir / rand_dir.norm() + random_projs.append((harmful_mean @ rand_dir).abs().item()) + + mean_random = sum(random_projs) / len(random_projs) + + # True direction should project MUCH more than random average + assert true_proj > mean_random * 3.0, \ + f"True projection ({true_proj:.3f}) not much larger than random mean ({mean_random:.3f})" + + +class TestWhitenedSVD: + """Verify whitened SVD properties.""" + + def test_whitened_directions_are_orthogonal(self): + """Whitened SVD should produce orthogonal directions.""" + torch.manual_seed(42) + n_samples = 80 + hidden = 128 + k = 4 + + H = torch.randn(n_samples, hidden) + torch.randn(1, hidden) * 2 + B = torch.randn(n_samples, 
hidden) + + mu_B = B.mean(dim=0, keepdim=True) + B_centered = B - mu_B + cov_B = (B_centered.T @ B_centered) / (n_samples - 1) + cov_B += 1e-4 * torch.eye(hidden) + + eigenvalues, eigenvectors = torch.linalg.eigh(cov_B) + eigenvalues = eigenvalues.clamp(min=0) + inv_sqrt_eig = 1.0 / torch.sqrt(eigenvalues + 1e-4) + whiten_proj = eigenvectors * inv_sqrt_eig.unsqueeze(0) + + H_whitened = (H - mu_B) @ whiten_proj + B_whitened = B_centered @ whiten_proj + D_whitened = H_whitened - B_whitened + + U, S, Vh = torch.linalg.svd(D_whitened, full_matrices=False) + directions = Vh[:k] + + # Check orthogonality: directions @ directions^T should be ~identity + gram = directions @ directions.T + identity = torch.eye(k) + off_diag = (gram - identity).abs().max().item() + assert off_diag < 1e-4, f"Directions not orthogonal: max off-diagonal = {off_diag}" + + +class TestReproducibility: + """Verify that seed setting produces deterministic results.""" + + def test_set_seed_determinism(self): + """Same seed should produce identical random tensors.""" + from obliteratus.reproducibility import set_seed + + set_seed(123, deterministic=False) + a = torch.randn(100) + + set_seed(123, deterministic=False) + b = torch.randn(100) + + assert torch.equal(a, b), "Same seed produced different tensors" + + def test_different_seeds_differ(self): + """Different seeds should produce different tensors.""" + from obliteratus.reproducibility import set_seed + + set_seed(123, deterministic=False) + a = torch.randn(100) + + set_seed(456, deterministic=False) + b = torch.randn(100) + + assert not torch.equal(a, b), "Different seeds produced identical tensors" diff --git a/tests/test_advanced_metrics.py b/tests/test_advanced_metrics.py new file mode 100644 index 0000000..b534df9 --- /dev/null +++ b/tests/test_advanced_metrics.py @@ -0,0 +1,364 @@ +"""Tests for advanced evaluation metrics.""" + +from __future__ import annotations + + +import pytest +import torch + +from 
obliteratus.evaluation.advanced_metrics import ( + REFUSAL_PREFIXES, + REFUSAL_SUBSTRINGS, + _is_refusal, + refusal_rate, + token_kl_divergence, + first_token_kl_divergence, + effective_rank, + effective_rank_change, + activation_cosine_similarity, + linear_cka, + refusal_projection_magnitude, + format_eval_report, + AbliterationEvalResult, +) + + +# --------------------------------------------------------------------------- +# Refusal Rate +# --------------------------------------------------------------------------- + +class TestRefusalRate: + def test_all_refusals(self): + responses = [ + "I'm sorry, but I cannot help with that.", + "I apologize, but I'm unable to assist.", + "As an AI, I cannot provide that information.", + ] + assert refusal_rate(responses, mode="prefix") == 1.0 + + def test_no_refusals(self): + responses = [ + "Sure, here's how to do that.", + "Here's a step-by-step guide.", + "The answer is 42.", + ] + assert refusal_rate(responses, mode="prefix") == 0.0 + + def test_mixed_responses(self): + responses = [ + "I'm sorry, I can't do that.", + "Sure, here you go.", + "I cannot provide that.", + "Here's what you need.", + ] + rate = refusal_rate(responses, mode="prefix") + assert rate == 0.5 # 2 out of 4 + + def test_substring_mode(self): + responses = [ + "Well, actually I cannot help with that request.", + "Let me explain how to do this.", + ] + assert refusal_rate(responses, mode="substring") == 0.5 + + def test_combined_mode(self): + responses = [ + "I'm sorry, but no.", # caught by prefix + "That said, i won't do that for you.", # caught by substring + "Here's the answer.", # not a refusal + ] + rate = refusal_rate(responses, mode="combined") + assert abs(rate - 2.0 / 3.0) < 1e-6 + + def test_empty_responses(self): + assert refusal_rate([], mode="prefix") == 0.0 + + def test_empty_string_not_refusal(self): + assert not _is_refusal("", mode="combined") + + def test_is_refusal_prefix(self): + assert _is_refusal("I'm sorry, I can't do that.", 
mode="prefix") + assert _is_refusal("As an AI language model, I cannot", mode="prefix") + assert not _is_refusal("The answer is 42.", mode="prefix") + + def test_is_refusal_substring(self): + assert _is_refusal("I must say, i cannot assist with that.", mode="substring") + assert not _is_refusal("The capital of France is Paris.", mode="substring") + + def test_refusal_prefixes_nonempty(self): + assert len(REFUSAL_PREFIXES) > 20 + + def test_refusal_substrings_nonempty(self): + assert len(REFUSAL_SUBSTRINGS) > 10 + + +# --------------------------------------------------------------------------- +# KL Divergence +# --------------------------------------------------------------------------- + +class TestKLDivergence: + def test_identical_distributions(self): + """KL divergence of identical distributions should be 0.""" + logits = torch.randn(2, 10, 100) + kl = token_kl_divergence(logits, logits) + assert abs(kl) < 1e-5 + + def test_different_distributions(self): + """KL divergence of different distributions should be positive.""" + torch.manual_seed(42) + logits_a = torch.randn(2, 10, 100) + logits_b = torch.randn(2, 10, 100) + kl = token_kl_divergence(logits_a, logits_b) + assert kl > 0 + + def test_kl_nonnegative(self): + """KL divergence should always be non-negative.""" + torch.manual_seed(42) + for _ in range(5): + logits_a = torch.randn(1, 5, 50) + logits_b = torch.randn(1, 5, 50) + kl = token_kl_divergence(logits_a, logits_b) + assert kl >= -1e-6 # allow small numerical errors + + def test_first_token_kl_identical(self): + """First-token KL of identical distributions should be 0.""" + logits = torch.randn(4, 20, 100) + kl = first_token_kl_divergence(logits, logits) + assert abs(kl) < 1e-5 + + def test_first_token_kl_different(self): + """First-token KL of different distributions should be positive.""" + torch.manual_seed(42) + logits_a = torch.randn(4, 20, 100) + logits_b = torch.randn(4, 20, 100) + kl = first_token_kl_divergence(logits_a, logits_b) + assert kl 
> 0 + + def test_temperature_effect(self): + """Higher temperature should reduce KL divergence (smoother distributions).""" + torch.manual_seed(42) + logits_a = torch.randn(2, 5, 50) + logits_b = torch.randn(2, 5, 50) + kl_t1 = token_kl_divergence(logits_a, logits_b, temperature=1.0) + kl_t5 = token_kl_divergence(logits_a, logits_b, temperature=5.0) + assert kl_t5 < kl_t1 + + +# --------------------------------------------------------------------------- +# Effective Rank +# --------------------------------------------------------------------------- + +class TestEffectiveRank: + def test_rank_one_matrix(self): + """Rank-1 matrix should have effective rank close to 1.""" + v = torch.randn(8, 1) + u = torch.randn(1, 4) + W = v @ u # rank-1 + erank = effective_rank(W) + assert erank < 1.5 + + def test_identity_matrix(self): + """Identity matrix should have effective rank equal to dimension.""" + n = 8 + W = torch.eye(n) + erank = effective_rank(W) + assert abs(erank - n) < 0.1 + + def test_random_full_rank(self): + """Random matrix should have high effective rank.""" + torch.manual_seed(42) + W = torch.randn(16, 16) + erank = effective_rank(W) + assert erank > 10 # should be close to 16 + + def test_zero_matrix(self): + """Zero matrix should have effective rank 0.""" + W = torch.zeros(4, 4) + erank = effective_rank(W) + assert erank == 0.0 + + def test_effective_rank_change(self): + """Should compute before/after rank comparison.""" + torch.manual_seed(42) + W_before = torch.randn(8, 8) + # Simulate abliteration: remove a direction (reduces rank slightly) + d = torch.randn(8, 1) + d = d / d.norm() + W_after = W_before - (W_before @ d) @ d.T + + result = effective_rank_change(W_before, W_after) + assert "rank_before" in result + assert "rank_after" in result + assert "rank_delta" in result + assert "rank_ratio" in result + assert result["rank_after"] <= result["rank_before"] + 0.1 + + def test_rejects_non_2d(self): + """Should raise ValueError for non-2D tensors.""" + 
with pytest.raises(ValueError): + effective_rank(torch.randn(4, 4, 4)) + + +# --------------------------------------------------------------------------- +# Activation Cosine Similarity +# --------------------------------------------------------------------------- + +class TestActivationCosineSimilarity: + def test_identical_activations(self): + acts = torch.randn(10, 32) + sim = activation_cosine_similarity(acts, acts) + assert abs(sim - 1.0) < 1e-5 + + def test_orthogonal_activations(self): + """Orthogonal activations should have cosine near 0.""" + a = torch.tensor([[1.0, 0.0, 0.0]]) + b = torch.tensor([[0.0, 1.0, 0.0]]) + sim = activation_cosine_similarity(a, b) + assert abs(sim) < 1e-5 + + def test_opposite_activations(self): + """Opposite activations should have cosine -1.""" + a = torch.randn(5, 16) + sim = activation_cosine_similarity(a, -a) + assert abs(sim - (-1.0)) < 1e-5 + + def test_handles_3d(self): + """Should handle 3D tensors by reshaping.""" + a = torch.randn(2, 5, 16) + b = torch.randn(2, 5, 16) + sim = activation_cosine_similarity(a, b) + assert -1.0 <= sim <= 1.0 + + +# --------------------------------------------------------------------------- +# Linear CKA +# --------------------------------------------------------------------------- + +class TestLinearCKA: + def test_identical_representations(self): + """CKA of identical representations should be 1.0.""" + X = torch.randn(20, 16) + cka = linear_cka(X, X) + assert abs(cka - 1.0) < 1e-4 + + def test_scaled_representations(self): + """CKA should be invariant to isotropic scaling.""" + X = torch.randn(20, 16) + Y = X * 5.0 + cka = linear_cka(X, Y) + assert abs(cka - 1.0) < 1e-4 + + def test_random_representations(self): + """CKA of random representations should be low.""" + torch.manual_seed(42) + X = torch.randn(100, 16) + Y = torch.randn(100, 16) + cka = linear_cka(X, Y) + assert cka < 0.3 # random should be near 0 + + def test_cka_bounded(self): + """CKA should be between 0 and 1.""" + 
torch.manual_seed(42) + for _ in range(5): + X = torch.randn(20, 8) + Y = torch.randn(20, 8) + cka = linear_cka(X, Y) + assert -0.01 <= cka <= 1.01 # small tolerance for numerics + + def test_different_dimensions(self): + """CKA should work with different hidden dimensions.""" + X = torch.randn(20, 16) + Y = torch.randn(20, 32) + cka = linear_cka(X, Y) + assert -0.01 <= cka <= 1.01 + + def test_handles_3d(self): + """Should handle 3D tensors by reshaping.""" + X = torch.randn(2, 10, 16) + Y = torch.randn(2, 10, 16) + cka = linear_cka(X, Y) + assert -0.01 <= cka <= 1.01 + + +# --------------------------------------------------------------------------- +# Refusal Direction Projection Magnitude +# --------------------------------------------------------------------------- + +class TestRefusalProjection: + def test_aligned_activations(self): + """Activations aligned with direction should have high projection.""" + d = torch.tensor([1.0, 0.0, 0.0]) + acts = torch.tensor([ + [5.0, 0.0, 0.0], + [3.0, 0.0, 0.0], + [4.0, 0.0, 0.0], + ]) + result = refusal_projection_magnitude(acts, d) + assert result["mean"] == 4.0 + assert result["abs_mean"] == 4.0 + + def test_orthogonal_activations(self): + """Orthogonal activations should have zero projection.""" + d = torch.tensor([1.0, 0.0, 0.0]) + acts = torch.tensor([ + [0.0, 5.0, 0.0], + [0.0, 0.0, 3.0], + ]) + result = refusal_projection_magnitude(acts, d) + assert abs(result["mean"]) < 1e-5 + assert abs(result["abs_mean"]) < 1e-5 + + def test_result_keys(self): + """Should return all expected keys.""" + d = torch.randn(8) + acts = torch.randn(5, 8) + result = refusal_projection_magnitude(acts, d) + assert set(result.keys()) == {"mean", "std", "max", "min", "abs_mean"} + + +# --------------------------------------------------------------------------- +# Eval Report Formatting +# --------------------------------------------------------------------------- + +class TestEvalReport: + def test_format_report(self): + result = 
AbliterationEvalResult( + refusal_rate_harmful=0.1, + refusal_rate_harmless=0.02, + kl_divergence=0.15, + perplexity=12.5, + coherence_score=0.8, + mean_activation_cosine=0.95, + mean_cka=0.92, + ) + report = format_eval_report(result) + assert "10.0%" in report + assert "12.50" in report + assert "excellent" in report # KL < 0.2 + + def test_format_report_high_kl(self): + result = AbliterationEvalResult( + refusal_rate_harmful=0.0, + refusal_rate_harmless=0.0, + kl_divergence=1.5, + perplexity=50.0, + coherence_score=0.4, + mean_activation_cosine=None, + mean_cka=None, + ) + report = format_eval_report(result) + assert "significant damage" in report + + def test_format_report_no_kl(self): + result = AbliterationEvalResult( + refusal_rate_harmful=0.5, + refusal_rate_harmless=0.1, + kl_divergence=None, + perplexity=20.0, + coherence_score=1.0, + mean_activation_cosine=None, + mean_cka=None, + ) + report = format_eval_report(result) + assert "50.0%" in report + assert "KL" not in report diff --git a/tests/test_analysis.py b/tests/test_analysis.py new file mode 100644 index 0000000..1fb03c7 --- /dev/null +++ b/tests/test_analysis.py @@ -0,0 +1,345 @@ +"""Tests for the analysis techniques.""" + +from __future__ import annotations + + +import torch + +from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor, WhitenedSVDResult +from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer, CrossLayerResult +from obliteratus.analysis.activation_probing import ActivationProbe, ProbeResult + + +# --------------------------------------------------------------------------- +# WhitenedSVDExtractor +# --------------------------------------------------------------------------- + +class TestWhitenedSVD: + def test_basic_extraction(self): + """Whitened SVD should extract directions from activation differences.""" + torch.manual_seed(42) + n_prompts, hidden_dim = 10, 32 + + # Create activations with a clear refusal direction + refusal_dir = 
torch.randn(hidden_dim) + refusal_dir = refusal_dir / refusal_dir.norm() + + harmless = [torch.randn(hidden_dim) for _ in range(n_prompts)] + harmful = [h + 2.0 * refusal_dir for h in harmless] # shifted along refusal dir + + extractor = WhitenedSVDExtractor() + result = extractor.extract(harmful, harmless, n_directions=3) + + assert isinstance(result, WhitenedSVDResult) + assert result.directions.shape == (3, hidden_dim) + assert result.singular_values.shape == (3,) + assert result.variance_explained > 0 + assert result.condition_number > 0 + assert result.effective_rank > 0 + + def test_directions_are_unit_vectors(self): + """Extracted directions should be unit length.""" + torch.manual_seed(42) + harmless = [torch.randn(16) for _ in range(8)] + harmful = [h + torch.randn(16) * 0.5 for h in harmless] + + extractor = WhitenedSVDExtractor() + result = extractor.extract(harmful, harmless, n_directions=2) + + for i in range(result.directions.shape[0]): + assert abs(result.directions[i].norm().item() - 1.0) < 1e-4 + + def test_primary_aligns_with_planted_direction(self): + """Primary whitened direction should capture the planted refusal signal. + + Whitening rotates directions relative to the covariance structure, + so perfect alignment with the raw direction is not expected. We verify + the whitened direction explains substantial variance and has moderate + alignment (whitening intentionally reweights dimensions). 
+ """ + torch.manual_seed(42) + hidden_dim = 64 + n_prompts = 30 + + refusal_dir = torch.randn(hidden_dim) + refusal_dir = refusal_dir / refusal_dir.norm() + + # Isotropic harmless activations (whitening has minimal effect) + harmless = [torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)] + harmful = [h + 5.0 * refusal_dir for h in harmless] + + extractor = WhitenedSVDExtractor(regularization_eps=1e-3) + result = extractor.extract(harmful, harmless, n_directions=1) + + cos_sim = (result.directions[0] @ refusal_dir).abs().item() + # Moderate alignment expected (whitening reweights dimensions) + assert cos_sim > 0.2, f"Expected alignment > 0.2, got {cos_sim:.3f}" + # More importantly: the direction should explain most variance + assert result.variance_explained > 0.5 + + def test_extract_all_layers(self): + """Should extract directions for all provided layers.""" + torch.manual_seed(42) + harmful_acts = {} + harmless_acts = {} + for layer in range(4): + harmful_acts[layer] = [torch.randn(16) for _ in range(5)] + harmless_acts[layer] = [torch.randn(16) for _ in range(5)] + + extractor = WhitenedSVDExtractor() + results = extractor.extract_all_layers(harmful_acts, harmless_acts, n_directions=2) + + assert len(results) == 4 + for idx in range(4): + assert idx in results + assert results[idx].directions.shape[0] == 2 + + def test_compare_with_standard(self): + """Comparison should return valid cosine similarities.""" + torch.manual_seed(42) + harmless = [torch.randn(16) for _ in range(8)] + harmful = [h + torch.randn(16) for h in harmless] + + extractor = WhitenedSVDExtractor() + result = extractor.extract(harmful, harmless, n_directions=2) + + std_dir = torch.randn(16) + std_dir = std_dir / std_dir.norm() + + comparison = WhitenedSVDExtractor.compare_with_standard(result, std_dir) + assert "primary_direction_cosine" in comparison + assert "subspace_principal_cosine" in comparison + assert 0 <= comparison["primary_direction_cosine"] <= 1.0 + + def 
test_handles_3d_activations(self): + """Should handle activations with an extra batch dimension.""" + torch.manual_seed(42) + # (1, hidden_dim) shape from hook output + harmless = [torch.randn(1, 16) for _ in range(5)] + harmful = [torch.randn(1, 16) for _ in range(5)] + + extractor = WhitenedSVDExtractor() + result = extractor.extract(harmful, harmless, n_directions=2) + assert result.directions.shape == (2, 16) + + def test_variance_explained_bounded(self): + """Variance explained should be between 0 and 1.""" + torch.manual_seed(42) + harmless = [torch.randn(16) for _ in range(8)] + harmful = [torch.randn(16) for _ in range(8)] + + extractor = WhitenedSVDExtractor() + result = extractor.extract(harmful, harmless, n_directions=3) + assert 0 <= result.variance_explained <= 1.0 + + +# --------------------------------------------------------------------------- +# CrossLayerAlignmentAnalyzer +# --------------------------------------------------------------------------- + +class TestCrossLayerAlignment: + def test_identical_directions(self): + """Identical directions across layers should give persistence = 1.""" + direction = torch.randn(32) + direction = direction / direction.norm() + directions = {i: direction.clone() for i in range(5)} + + analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze(directions) + + assert isinstance(result, CrossLayerResult) + assert result.direction_persistence_score > 0.99 + assert result.mean_adjacent_cosine > 0.99 + assert result.total_geodesic_distance < 0.01 + + def test_orthogonal_directions(self): + """Orthogonal directions should give low persistence.""" + # Create orthogonal directions via QR decomposition + torch.manual_seed(42) + M = torch.randn(5, 32) + Q, _ = torch.linalg.qr(M.T) + directions = {i: Q[:, i] for i in range(5)} + + analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze(directions) + + assert result.direction_persistence_score < 0.3 + assert result.mean_adjacent_cosine < 0.3 + + def 
test_cluster_detection(self): + """Should detect clusters of similar directions.""" + torch.manual_seed(42) + # Create two clusters + d1 = torch.randn(32) + d1 = d1 / d1.norm() + d2 = torch.randn(32) + d2 = d2 / d2.norm() + + directions = { + 0: d1, 1: d1 + 0.01 * torch.randn(32), + 2: d1 + 0.01 * torch.randn(32), + 3: d2, 4: d2 + 0.01 * torch.randn(32), + } + # Normalize + directions = {k: v / v.norm() for k, v in directions.items()} + + analyzer = CrossLayerAlignmentAnalyzer(cluster_threshold=0.9) + result = analyzer.analyze(directions) + + # Should find at least 2 clusters + assert result.cluster_count >= 2 + + def test_empty_input(self): + """Should handle empty input gracefully.""" + analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze({}) + assert result.layer_indices == [] + assert result.cluster_count == 0 + + def test_single_layer(self): + """Single layer should work fine.""" + analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze({5: torch.randn(16)}) + assert result.layer_indices == [5] + assert result.direction_persistence_score == 1.0 + + def test_strong_layers_filter(self): + """Should only analyze specified strong layers.""" + directions = {i: torch.randn(16) for i in range(10)} + analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze(directions, strong_layers=[2, 5, 7]) + assert result.layer_indices == [2, 5, 7] + assert result.cosine_matrix.shape == (3, 3) + + def test_cosine_matrix_symmetry(self): + """Cosine matrix should be symmetric.""" + torch.manual_seed(42) + directions = {i: torch.randn(16) for i in range(4)} + analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze(directions) + diff = (result.cosine_matrix - result.cosine_matrix.T).abs().max().item() + assert diff < 1e-5 + + def test_cosine_matrix_diagonal_ones(self): + """Diagonal of cosine matrix should be 1.0.""" + torch.manual_seed(42) + directions = {i: torch.randn(16) for i in range(4)} + analyzer = 
CrossLayerAlignmentAnalyzer() + result = analyzer.analyze(directions) + for i in range(4): + assert abs(result.cosine_matrix[i, i].item() - 1.0) < 1e-4 + + def test_angular_drift_monotonic(self): + """Angular drift should be monotonically non-decreasing.""" + torch.manual_seed(42) + directions = {i: torch.randn(16) for i in range(6)} + analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze(directions) + for i in range(len(result.angular_drift) - 1): + assert result.angular_drift[i + 1] >= result.angular_drift[i] - 1e-6 + + def test_format_report(self): + """Format report should produce a non-empty string.""" + torch.manual_seed(42) + directions = {i: torch.randn(16) for i in range(4)} + analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze(directions) + report = CrossLayerAlignmentAnalyzer.format_report(result) + assert "Cross-Layer" in report + assert "persistence" in report + + +# --------------------------------------------------------------------------- +# ActivationProbe +# --------------------------------------------------------------------------- + +class TestActivationProbe: + def test_clean_elimination(self): + """After removing direction, projections should be near-zero.""" + torch.manual_seed(42) + hidden_dim = 32 + refusal_dir = torch.randn(hidden_dim) + refusal_dir = refusal_dir / refusal_dir.norm() + + # "Post-abliteration" activations: direction has been removed + harmless = [torch.randn(hidden_dim) for _ in range(10)] + harmful = [torch.randn(hidden_dim) for _ in range(10)] + # Both sets are random, no refusal signal => gap should be small + + probe = ActivationProbe() + result = probe.probe_layer(harmful, harmless, refusal_dir) + assert abs(result.projection_gap) < 1.0 + assert result.separation_d_prime < 2.0 + + def test_residual_detection(self): + """Should detect residual refusal signal when direction wasn't removed.""" + torch.manual_seed(42) + hidden_dim = 32 + refusal_dir = torch.randn(hidden_dim) + refusal_dir = 
refusal_dir / refusal_dir.norm() + + harmless = [torch.randn(hidden_dim) for _ in range(10)] + # Harmful still has strong refusal direction component + harmful = [h + 5.0 * refusal_dir for h in harmless] + + probe = ActivationProbe() + result = probe.probe_layer(harmful, harmless, refusal_dir) + assert abs(result.projection_gap) > 1.0 + assert result.separation_d_prime > 2.0 + + def test_probe_all_layers(self): + """Should compute aggregate metrics across layers.""" + torch.manual_seed(42) + hidden_dim = 16 + n_layers = 4 + + harmful_acts = {} + harmless_acts = {} + refusal_dirs = {} + + for layer in range(n_layers): + harmful_acts[layer] = [torch.randn(hidden_dim) for _ in range(5)] + harmless_acts[layer] = [torch.randn(hidden_dim) for _ in range(5)] + d = torch.randn(hidden_dim) + refusal_dirs[layer] = d / d.norm() + + probe = ActivationProbe() + result = probe.probe_all_layers(harmful_acts, harmless_acts, refusal_dirs) + + assert isinstance(result, ProbeResult) + assert len(result.per_layer) == n_layers + assert 0 <= result.refusal_elimination_score <= 1.0 + assert result.mean_projection_gap >= 0 + + def test_res_score_range(self): + """RES should always be between 0 and 1.""" + torch.manual_seed(42) + for seed in range(5): + torch.manual_seed(seed) + harmful = {0: [torch.randn(8) for _ in range(3)]} + harmless = {0: [torch.randn(8) for _ in range(3)]} + dirs = {0: torch.randn(8)} + dirs[0] = dirs[0] / dirs[0].norm() + + probe = ActivationProbe() + result = probe.probe_all_layers(harmful, harmless, dirs) + assert 0 <= result.refusal_elimination_score <= 1.0 + + def test_format_report(self): + """Format report should produce readable output.""" + torch.manual_seed(42) + harmful = {0: [torch.randn(8) for _ in range(3)]} + harmless = {0: [torch.randn(8) for _ in range(3)]} + dirs = {0: torch.randn(8)} + + probe = ActivationProbe() + result = probe.probe_all_layers(harmful, harmless, dirs) + report = ActivationProbe.format_report(result) + assert "Refusal 
Elimination Score" in report + + def test_empty_input(self): + """Should handle empty input gracefully.""" + probe = ActivationProbe() + result = probe.probe_all_layers({}, {}, {}) + assert result.refusal_elimination_score == 0.0 + assert len(result.per_layer) == 0 diff --git a/tests/test_analysis_utils.py b/tests/test_analysis_utils.py new file mode 100644 index 0000000..2399e94 --- /dev/null +++ b/tests/test_analysis_utils.py @@ -0,0 +1,65 @@ +"""Tests for shared analysis utilities (gini_coefficient, etc.).""" + +from __future__ import annotations + +import pytest + +from obliteratus.analysis.utils import gini_coefficient + + +class TestGiniCoefficient: + """Tests for the Gini coefficient computation.""" + + def test_empty_list(self): + assert gini_coefficient([]) == 0.0 + + def test_single_value(self): + assert gini_coefficient([42.0]) == 0.0 + + def test_uniform_distribution(self): + """All-equal values → Gini = 0.""" + assert gini_coefficient([1.0, 1.0, 1.0, 1.0]) == pytest.approx(0.0, abs=1e-10) + + def test_maximally_concentrated(self): + """One value, rest zero → Gini ≈ 1.""" + result = gini_coefficient([100.0, 0.0, 0.0, 0.0]) + assert result > 0.7 # For n=4, max Gini = (n-1)/n = 0.75 + + def test_all_zeros(self): + assert gini_coefficient([0.0, 0.0, 0.0]) == 0.0 + + def test_two_equal_values(self): + assert gini_coefficient([5.0, 5.0]) == pytest.approx(0.0, abs=1e-10) + + def test_two_unequal_values(self): + """[0, 10] → Gini = 0.5 for n=2.""" + result = gini_coefficient([0.0, 10.0]) + assert result == pytest.approx(0.5, abs=0.01) + + def test_moderate_inequality(self): + """Moderate spread → Gini between 0 and 1.""" + result = gini_coefficient([1.0, 2.0, 3.0, 4.0, 5.0]) + assert 0.1 < result < 0.5 + + def test_result_in_valid_range(self): + """Gini is always in [0, 1].""" + for vals in [[1, 2, 3], [0, 0, 100], [5, 5, 5], [1], [0.1, 0.9]]: + result = gini_coefficient(vals) + assert 0.0 <= result <= 1.0, f"Gini({vals}) = {result} out of range" + + def 
test_large_uniform(self): + """Large uniform distribution → Gini ≈ 0.""" + vals = [1.0] * 1000 + assert gini_coefficient(vals) == pytest.approx(0.0, abs=1e-10) + + def test_large_concentrated(self): + """Large distribution with one outlier → high Gini.""" + vals = [0.0] * 999 + [1000.0] + result = gini_coefficient(vals) + assert result > 0.99 + + def test_order_invariant(self): + """Gini should not depend on input order.""" + a = gini_coefficient([1.0, 3.0, 5.0, 7.0]) + b = gini_coefficient([7.0, 1.0, 5.0, 3.0]) + assert a == pytest.approx(b) diff --git a/tests/test_architecture_profiles.py b/tests/test_architecture_profiles.py new file mode 100644 index 0000000..00d1c13 --- /dev/null +++ b/tests/test_architecture_profiles.py @@ -0,0 +1,598 @@ +"""Tests for architecture-aware preset defaults. + +Tests the detection logic and recommended parameter overrides for each +architecture class (dense/MoE, standard/reasoning). +""" + +from __future__ import annotations + + +from obliteratus.architecture_profiles import ( + ArchitectureClass, + ArchitectureProfile, + ReasoningClass, + detect_architecture, + get_profile_summary, + apply_profile_to_method_config, +) + + +# --------------------------------------------------------------------------- +# Detection: Dense models +# --------------------------------------------------------------------------- + + +class TestDenseDetection: + """Test that standard dense models are correctly classified.""" + + def test_llama_is_dense(self): + profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct") + assert profile.arch_class == ArchitectureClass.DENSE + assert profile.reasoning_class == ReasoningClass.STANDARD + assert not profile.is_moe + + def test_qwen_dense_is_dense(self): + profile = detect_architecture("Qwen/Qwen2.5-7B-Instruct") + assert profile.arch_class == ArchitectureClass.DENSE + assert not profile.is_moe + + def test_gemma_is_dense(self): + profile = detect_architecture("google/gemma-3-27b-it") + assert 
profile.arch_class == ArchitectureClass.DENSE + + def test_phi_is_dense(self): + profile = detect_architecture("microsoft/Phi-4-mini-instruct") + assert profile.arch_class == ArchitectureClass.DENSE + + def test_mistral_small_is_dense(self): + profile = detect_architecture("mistralai/Mistral-Small-24B-Instruct-2501") + assert profile.arch_class == ArchitectureClass.DENSE + + def test_yi_is_dense(self): + profile = detect_architecture("01-ai/Yi-1.5-9B-Chat") + assert profile.arch_class == ArchitectureClass.DENSE + + def test_dense_label(self): + profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct") + assert profile.profile_label == "Dense Standard" + + def test_dense_recommended_method(self): + profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct") + assert profile.recommended_method == "aggressive" + + +# --------------------------------------------------------------------------- +# Detection: MoE models +# --------------------------------------------------------------------------- + + +class TestMoEDetection: + """Test that MoE models are correctly classified.""" + + def test_gpt_oss_is_moe(self): + """GPT-OSS is MoE. 
Without config, defaults to small (conservative).""" + profile = detect_architecture("openai/gpt-oss-20b") + assert profile.is_moe + assert profile.arch_class == ArchitectureClass.SMALL_MOE + + def test_qwen3_30b_is_small_moe(self): + profile = detect_architecture("Qwen/Qwen3-30B-A3B") + assert profile.is_moe + + def test_deepseek_v3_is_large_moe(self): + profile = detect_architecture("deepseek-ai/DeepSeek-V3.2") + assert profile.is_moe + + def test_kimi_k2_is_large_moe(self): + profile = detect_architecture("moonshotai/Kimi-K2-Instruct") + assert profile.is_moe + + def test_qwen3_235b_is_moe(self): + profile = detect_architecture("Qwen/Qwen3-235B-A22B") + assert profile.is_moe + + def test_glm_47_is_moe(self): + profile = detect_architecture("zai-org/GLM-4.7") + assert profile.is_moe + + def test_llama4_maverick_is_moe(self): + profile = detect_architecture("meta-llama/Llama-4-Maverick-17B-128E-Instruct") + assert profile.is_moe + + def test_step_flash_is_moe(self): + profile = detect_architecture("stepfun-ai/Step-3.5-Flash") + assert profile.is_moe + + def test_minimax_is_moe(self): + profile = detect_architecture("MiniMaxAI/MiniMax-M2.1") + assert profile.is_moe + + def test_mistral_large_3_is_moe(self): + profile = detect_architecture("mistralai/Mistral-Large-3-675B-Instruct-2512") + assert profile.is_moe + + def test_moe_recommended_method_is_surgical(self): + """All MoE profiles recommend surgical method.""" + profile = detect_architecture("openai/gpt-oss-20b") + assert profile.recommended_method == "surgical" + + def test_gpt_oss_with_config_is_small_moe(self): + """GPT-OSS with config providing expert count → small MoE.""" + class MockConfig: + model_type = "gpt_neox" + num_hidden_layers = 32 + hidden_size = 2560 + intermediate_size = 6912 + vocab_size = 50304 + num_local_experts = 8 + num_experts_per_tok = 2 + profile = detect_architecture("openai/gpt-oss-20b", config=MockConfig()) + assert profile.is_moe + assert profile.arch_class == 
ArchitectureClass.SMALL_MOE + + +# --------------------------------------------------------------------------- +# Detection: Reasoning models +# --------------------------------------------------------------------------- + + +class TestReasoningDetection: + """Test that reasoning models are correctly classified.""" + + def test_r1_distill_qwen_is_reasoning(self): + profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B") + assert profile.reasoning_class == ReasoningClass.REASONING + + def test_r1_distill_llama_is_reasoning(self): + profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Llama-8B") + assert profile.reasoning_class == ReasoningClass.REASONING + + def test_r1_distill_is_dense_reasoning(self): + """R1 distills are dense (distilled from MoE into dense).""" + profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B") + assert profile.arch_class == ArchitectureClass.DENSE + assert profile.reasoning_class == ReasoningClass.REASONING + assert profile.profile_label == "Dense Reasoning" + + def test_olmo_think_is_reasoning(self): + profile = detect_architecture("allenai/Olmo-3.1-32B-Think") + assert profile.reasoning_class == ReasoningClass.REASONING + + def test_olmo_standard_is_not_reasoning(self): + """OLMo (without Think) must NOT be classified as reasoning. 
+ Regression test: 'olmo' contains 'o1' substring.""" + profile = detect_architecture("allenai/Olmo-3-7B-Instruct") + assert profile.reasoning_class == ReasoningClass.STANDARD + + def test_falcon3_is_not_reasoning(self): + """falcon3 must NOT match 'o3' reasoning pattern.""" + profile = detect_architecture("tiiuae/Falcon3-7B-Instruct") + assert profile.reasoning_class == ReasoningClass.STANDARD + + def test_full_r1_is_moe_reasoning(self): + profile = detect_architecture("deepseek-ai/DeepSeek-R1") + assert profile.is_moe + assert profile.reasoning_class == ReasoningClass.REASONING + + def test_reasoning_dense_more_directions(self): + """Dense reasoning models need more directions (>=12) to span refusal.""" + profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B") + assert profile.arch_class == ArchitectureClass.DENSE + assert profile.method_overrides.get("n_directions", 0) >= 12 + + def test_reasoning_dense_more_passes(self): + """Dense reasoning models need more refinement passes (>=4).""" + profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B") + assert profile.arch_class == ArchitectureClass.DENSE + assert profile.method_overrides.get("refinement_passes", 0) >= 4 + + def test_non_reasoning_is_standard(self): + profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct") + assert profile.reasoning_class == ReasoningClass.STANDARD + + +# --------------------------------------------------------------------------- +# Detection with config object +# --------------------------------------------------------------------------- + + +class TestConfigDetection: + """Test detection when a mock config is provided.""" + + def test_moe_config_attrs(self): + """Config with num_local_experts should be detected as MoE.""" + class MockConfig: + model_type = "mixtral" + num_hidden_layers = 32 + hidden_size = 4096 + intermediate_size = 14336 + vocab_size = 32000 + num_local_experts = 8 + num_experts_per_tok = 2 + + profile = 
detect_architecture( + "custom/mixtral-model", config=MockConfig(), + num_layers=32, hidden_size=4096, + ) + assert profile.is_moe + assert profile.num_experts == 8 + assert profile.num_active_experts == 2 + + def test_large_moe_threshold(self): + """MoE models with >100B params should be classified as large.""" + class MockConfig: + model_type = "deepseek_v3" + num_hidden_layers = 61 + hidden_size = 7168 + intermediate_size = 18432 + vocab_size = 102400 + n_routed_experts = 256 + num_experts_per_tok = 8 + + profile = detect_architecture( + "custom/large-moe", config=MockConfig(), + ) + assert profile.arch_class == ArchitectureClass.LARGE_MOE + + def test_small_moe_threshold(self): + """MoE models with <=16 experts should be classified as small.""" + class MockConfig: + model_type = "mixtral" + num_hidden_layers = 32 + hidden_size = 4096 + intermediate_size = 14336 + vocab_size = 32000 + num_local_experts = 8 + num_experts_per_tok = 2 + + profile = detect_architecture( + "custom/small-moe", config=MockConfig(), + ) + assert profile.arch_class == ArchitectureClass.SMALL_MOE + + def test_dense_config(self): + """Config without MoE attributes should be dense.""" + class MockConfig: + model_type = "llama" + num_hidden_layers = 32 + hidden_size = 4096 + intermediate_size = 11008 + vocab_size = 32000 + + profile = detect_architecture( + "custom/dense-model", config=MockConfig(), + ) + assert profile.arch_class == ArchitectureClass.DENSE + assert not profile.is_moe + + def test_llama4_scout_is_large_moe(self): + """Llama 4 Scout: 109B total params with 16 experts → LARGE_MOE. 
+ Regression test: params > 100B must override low expert count.""" + class MockConfig: + model_type = "llama4" + num_hidden_layers = 48 + hidden_size = 5120 + intermediate_size = 14336 + vocab_size = 202048 + num_local_experts = 16 + num_experts_per_tok = 1 + + profile = detect_architecture( + "meta-llama/Llama-4-Scout-17B-16E-Instruct", + config=MockConfig(), + ) + assert profile.is_moe + assert profile.arch_class == ArchitectureClass.LARGE_MOE + + +# --------------------------------------------------------------------------- +# Recommended defaults validation +# --------------------------------------------------------------------------- + + +class TestRecommendedDefaults: + """Test that recommended defaults match research findings.""" + + def test_dense_standard_no_riemannian(self): + """Dense Standard: Riemannian OFF (manifolds are flat).""" + profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct") + assert not profile.breakthrough_modules.get("riemannian", True) + + def test_dense_standard_anti_ouroboros_on(self): + """Dense Standard: Anti-Ouroboros ON for self-repair mapping.""" + profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct") + assert profile.breakthrough_modules.get("anti_ouroboros", False) + + def test_dense_standard_spectral_cert_on(self): + """Dense Standard: Spectral cert ON for verification.""" + profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct") + assert profile.breakthrough_modules.get("spectral_cert", False) + + def test_moe_conditional_on(self): + """MoE: Conditional abliteration is #1 technique (Cracken AI 2025).""" + profile = detect_architecture("openai/gpt-oss-20b") + assert profile.breakthrough_modules.get("conditional", False) + + def test_moe_no_project_embeddings(self): + """MoE: Project embeddings OFF (cascades through router).""" + profile = detect_architecture("openai/gpt-oss-20b") + assert not profile.method_overrides.get("project_embeddings", True) + + def 
test_moe_per_expert_directions(self): + """MoE: Per-expert directions ON (global directions fail on MoE).""" + profile = detect_architecture("openai/gpt-oss-20b") + assert profile.method_overrides.get("per_expert_directions", False) + + def test_large_moe_riemannian_on(self): + """Large MoE: Riemannian ON (curved shared layer geometry).""" + profile = detect_architecture("deepseek-ai/DeepSeek-V3.2") + assert profile.breakthrough_modules.get("riemannian", False) + + def test_reasoning_dense_jailbreak_contrast(self): + """Reasoning Dense: Jailbreak contrast ON for thinking-chain refusal.""" + profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B") + assert profile.method_overrides.get("use_jailbreak_contrast", False) + + def test_reasoning_moe_gentle_transplant(self): + """Reasoning MoE: transplant_blend very low (preserve reasoning).""" + profile = detect_architecture("deepseek-ai/DeepSeek-R1") + assert profile.method_overrides.get("transplant_blend", 1.0) <= 0.10 + + +# --------------------------------------------------------------------------- +# Profile summary +# --------------------------------------------------------------------------- + + +class TestProfileSummary: + """Test the human-readable profile summary.""" + + def test_summary_contains_profile_label(self): + profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct") + summary = get_profile_summary(profile) + assert "Dense Standard" in summary + + def test_summary_contains_method(self): + profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct") + summary = get_profile_summary(profile) + assert "aggressive" in summary + + def test_summary_contains_citations(self): + profile = detect_architecture("openai/gpt-oss-20b") + summary = get_profile_summary(profile) + assert "SAFEx" in summary or "Cracken" in summary + + def test_summary_contains_moe_info(self): + profile = detect_architecture("openai/gpt-oss-20b") + summary = get_profile_summary(profile) + assert "MoE" in summary 
+ + def test_summary_contains_breakthrough_modules(self): + profile = detect_architecture("openai/gpt-oss-20b") + summary = get_profile_summary(profile) + assert "conditional" in summary + + +# --------------------------------------------------------------------------- +# apply_profile_to_method_config +# --------------------------------------------------------------------------- + + +class TestApplyProfile: + """Test that profile overrides are correctly applied to method configs.""" + + def test_overrides_applied(self): + from obliteratus.abliterate import METHODS + profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B") + base = dict(METHODS["aggressive"]) + merged = apply_profile_to_method_config(profile, base) + assert merged["n_directions"] == profile.method_overrides["n_directions"] + + def test_non_overridden_preserved(self): + from obliteratus.abliterate import METHODS + profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct") + base = dict(METHODS["aggressive"]) + merged = apply_profile_to_method_config(profile, base) + # norm_preserve is not in overrides, should come from base + assert merged["norm_preserve"] == base["norm_preserve"] + + def test_empty_overrides(self): + from obliteratus.abliterate import METHODS + base = dict(METHODS["advanced"]) + profile = ArchitectureProfile( + arch_class=ArchitectureClass.DENSE, + reasoning_class=ReasoningClass.STANDARD, + method_overrides={}, + breakthrough_modules={}, + ) + merged = apply_profile_to_method_config(profile, base) + assert merged == base + + def test_override_key_not_in_base_is_added(self): + """Override keys absent from base config should be added to result. + + This is important for the UI auto-detect path: keys like + use_jailbreak_contrast may not exist in the base method config + but are valid pipeline parameters that app.py reads via merged.get(). 
+ """ + from obliteratus.abliterate import METHODS + base = dict(METHODS["advanced"]) + profile = ArchitectureProfile( + arch_class=ArchitectureClass.DENSE, + reasoning_class=ReasoningClass.STANDARD, + method_overrides={"use_jailbreak_contrast": True}, + breakthrough_modules={}, + ) + merged = apply_profile_to_method_config(profile, base) + assert merged["use_jailbreak_contrast"] is True + + +# --------------------------------------------------------------------------- +# All 6 profile combinations +# --------------------------------------------------------------------------- + + +class TestAllSixProfiles: + """Verify label, method, overrides, and breakthrough modules for each profile.""" + + def _make_moe_config(self, num_experts=8, active=2, layers=32, hidden=4096): + class C: + model_type = "mixtral" + num_hidden_layers = layers + hidden_size = hidden + intermediate_size = hidden * 4 + vocab_size = 32000 + num_local_experts = num_experts + num_experts_per_tok = active + return C() + + def test_dense_standard_full(self): + p = detect_architecture("meta-llama/Llama-3.1-8B-Instruct") + assert p.profile_label == "Dense Standard" + assert p.recommended_method == "aggressive" + assert not p.breakthrough_modules["riemannian"] + assert p.breakthrough_modules["anti_ouroboros"] + assert p.breakthrough_modules["spectral_cert"] + assert not p.breakthrough_modules["conditional"] + assert len(p.profile_description) > 0 + assert len(p.research_citations) > 0 + + def test_dense_reasoning_full(self): + p = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B") + assert p.profile_label == "Dense Reasoning" + assert p.recommended_method == "aggressive" + assert p.method_overrides["n_directions"] >= 12 + assert p.method_overrides["refinement_passes"] >= 4 + assert p.method_overrides["use_jailbreak_contrast"] is True + assert p.method_overrides["use_chat_template"] is True + assert p.breakthrough_modules["anti_ouroboros"] + assert p.breakthrough_modules["riemannian"] + 
assert p.breakthrough_modules["conditional"] + assert p.breakthrough_modules["spectral_cert"] + assert len(p.profile_description) > 0 + + def test_small_moe_standard_full(self): + config = self._make_moe_config(num_experts=8, active=2) + p = detect_architecture("custom/small-moe-model", config=config) + assert p.profile_label == "Small MoE Standard" + assert p.arch_class == ArchitectureClass.SMALL_MOE + assert p.recommended_method == "surgical" + assert p.method_overrides["per_expert_directions"] is True + assert p.method_overrides["invert_refusal"] is False + assert p.method_overrides["project_embeddings"] is False + assert p.breakthrough_modules["conditional"] + assert p.breakthrough_modules["anti_ouroboros"] + assert p.breakthrough_modules["spectral_cert"] + assert not p.breakthrough_modules["riemannian"] + assert len(p.profile_description) > 0 + + def test_small_moe_reasoning_full(self): + """The most fragile combination: MoE + reasoning.""" + config = self._make_moe_config(num_experts=8, active=2) + # Add "think" to name to trigger reasoning detection + p = detect_architecture("custom/small-moe-think-model", config=config) + assert p.profile_label == "Small MoE Reasoning" + assert p.arch_class == ArchitectureClass.SMALL_MOE + assert p.reasoning_class == ReasoningClass.REASONING + assert p.recommended_method == "surgical" + assert p.method_overrides["per_expert_directions"] is True + assert p.method_overrides["use_jailbreak_contrast"] is True + assert p.method_overrides["use_chat_template"] is True + assert p.method_overrides["invert_refusal"] is False + assert p.breakthrough_modules["conditional"] + assert p.breakthrough_modules["anti_ouroboros"] + assert p.breakthrough_modules["spectral_cert"] + assert len(p.profile_description) > 0 + + def test_large_moe_standard_full(self): + config = self._make_moe_config(num_experts=256, active=8, layers=61, hidden=7168) + p = detect_architecture("custom/large-moe-model", config=config) + assert p.profile_label == "Large 
MoE Standard" + assert p.arch_class == ArchitectureClass.LARGE_MOE + assert p.recommended_method == "surgical" + assert p.method_overrides["per_expert_directions"] is True + assert p.method_overrides["layer_adaptive_strength"] is True + assert p.method_overrides["expert_transplant"] is True + assert p.method_overrides["transplant_blend"] == 0.10 + assert p.method_overrides["attention_head_surgery"] is True + assert p.method_overrides["project_embeddings"] is False + assert p.breakthrough_modules["conditional"] + assert p.breakthrough_modules["riemannian"] + assert p.breakthrough_modules["anti_ouroboros"] + assert p.breakthrough_modules["spectral_cert"] + assert len(p.profile_description) > 0 + + def test_large_moe_reasoning_full(self): + config = self._make_moe_config(num_experts=256, active=8, layers=61, hidden=7168) + p = detect_architecture("custom/large-moe-r1-model", config=config) + assert p.profile_label == "Large MoE Reasoning" + assert p.arch_class == ArchitectureClass.LARGE_MOE + assert p.reasoning_class == ReasoningClass.REASONING + assert p.recommended_method == "surgical" + assert p.method_overrides["n_directions"] == 8 + assert p.method_overrides["transplant_blend"] == 0.08 + assert p.method_overrides["use_jailbreak_contrast"] is True + assert p.method_overrides["safety_neuron_masking"] is True + assert p.breakthrough_modules["conditional"] + assert p.breakthrough_modules["riemannian"] + assert p.breakthrough_modules["anti_ouroboros"] + assert p.breakthrough_modules["spectral_cert"] + assert len(p.profile_description) > 0 + + +# --------------------------------------------------------------------------- +# Edge cases +# --------------------------------------------------------------------------- + + +class TestEdgeCases: + """Edge cases for architecture detection.""" + + def test_empty_model_name(self): + """Empty string should fall through to Dense Standard.""" + profile = detect_architecture("") + assert profile.arch_class == ArchitectureClass.DENSE 
+ assert profile.reasoning_class == ReasoningClass.STANDARD + + def test_unknown_model_type_in_config(self): + """Unknown model_type should not cause MoE classification.""" + class MockConfig: + model_type = "banana" + num_hidden_layers = 12 + hidden_size = 768 + intermediate_size = 3072 + vocab_size = 30522 + profile = detect_architecture("custom/unknown-arch", config=MockConfig()) + assert profile.arch_class == ArchitectureClass.DENSE + + def test_config_with_zero_experts(self): + """num_local_experts=0 should not trigger MoE.""" + class MockConfig: + model_type = "llama" + num_hidden_layers = 32 + hidden_size = 4096 + intermediate_size = 11008 + vocab_size = 32000 + num_local_experts = 0 + profile = detect_architecture("custom/dense-with-zero", config=MockConfig()) + assert not profile.is_moe + assert profile.arch_class == ArchitectureClass.DENSE + + def test_allcaps_model_name(self): + """Case-insensitive matching should work for all-caps names.""" + profile = detect_architecture("DEEPSEEK-AI/DEEPSEEK-R1-DISTILL-QWEN-7B") + assert profile.reasoning_class == ReasoningClass.REASONING + assert profile.arch_class == ArchitectureClass.DENSE # distill = dense + + def test_single_expert_is_moe(self): + """num_local_experts=1 is technically MoE (single expert).""" + class MockConfig: + model_type = "llama" + num_hidden_layers = 32 + hidden_size = 4096 + intermediate_size = 11008 + vocab_size = 32000 + num_local_experts = 1 + profile = detect_architecture("custom/single-expert", config=MockConfig()) + # 1 expert still triggers MoE detection (the code treats any >0 as MoE) + assert profile.is_moe diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py new file mode 100644 index 0000000..b29939b --- /dev/null +++ b/tests/test_benchmarks.py @@ -0,0 +1,183 @@ +"""Tests for lightweight benchmark harnesses.""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +import torch + +from obliteratus.evaluation.benchmarks import ( + 
KNOWLEDGE_ITEMS, + TRUTHFULNESS_ITEMS, + MATH_REASONING_ITEMS, + BenchmarkRunner, + BenchmarkResult, + format_benchmark_report, +) + + +def _make_mock_model_and_tokenizer(vocab_size=1000, hidden_dim=64): + """Create mock model and tokenizer for benchmark testing.""" + model = MagicMock() + + # Model returns logits when called + def mock_forward(**kwargs): + input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10))) + batch_size, seq_len = input_ids.shape + result = MagicMock() + result.logits = torch.randn(batch_size, seq_len, vocab_size) + return result + + model.side_effect = mock_forward + model.__call__ = mock_forward + + # Model.generate returns token IDs + def mock_generate(**kwargs): + input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10))) + # Append some "generated" tokens + gen_tokens = torch.randint(0, vocab_size, (1, 20)) + return torch.cat([input_ids, gen_tokens], dim=1) + + model.generate = mock_generate + + # Model.parameters for device detection + param = torch.nn.Parameter(torch.randn(1)) + model.parameters = MagicMock(return_value=iter([param])) + + tokenizer = MagicMock() + tokenizer.return_value = { + "input_ids": torch.randint(0, vocab_size, (1, 15)), + "attention_mask": torch.ones(1, 15, dtype=torch.long), + } + tokenizer.side_effect = lambda text, **kwargs: { + "input_ids": torch.randint(0, vocab_size, (1, 15)), + "attention_mask": torch.ones(1, 15, dtype=torch.long), + } + + def mock_decode(ids, **kwargs): + return "The answer is 42. This is a generated response about the topic." 
+ + def mock_encode(text, **kwargs): + # Return different IDs for A, B, C, D + if text == "A": + return [65] + elif text == "B": + return [66] + elif text == "C": + return [67] + elif text == "D": + return [68] + return [hash(text) % vocab_size] + + tokenizer.decode = mock_decode + tokenizer.encode = mock_encode + + return model, tokenizer + + +class TestBenchmarkItems: + def test_knowledge_items_have_required_fields(self): + for item in KNOWLEDGE_ITEMS: + assert "q" in item + assert "choices" in item + assert "answer" in item + assert "category" in item + assert 0 <= item["answer"] < len(item["choices"]) + + def test_knowledge_items_count(self): + assert len(KNOWLEDGE_ITEMS) >= 20 + + def test_knowledge_categories(self): + categories = set(item["category"] for item in KNOWLEDGE_ITEMS) + assert len(categories) >= 4 # multiple categories + + def test_truthfulness_items_have_required_fields(self): + for item in TRUTHFULNESS_ITEMS: + assert "q" in item + assert "true_answer" in item + assert "common_false" in item + assert "category" in item + + def test_truthfulness_items_count(self): + assert len(TRUTHFULNESS_ITEMS) >= 10 + + def test_math_items_have_required_fields(self): + for item in MATH_REASONING_ITEMS: + assert "q" in item + assert "answer" in item + assert "category" in item + assert isinstance(item["answer"], (int, float)) + + def test_math_items_count(self): + assert len(MATH_REASONING_ITEMS) >= 10 + + +class TestBenchmarkRunner: + def test_knowledge_probe_returns_result(self): + model, tokenizer = _make_mock_model_and_tokenizer() + runner = BenchmarkRunner(model, tokenizer, device="cpu") + result = runner.run_knowledge_probe() + + assert isinstance(result, BenchmarkResult) + assert result.benchmark_name == "knowledge_probe" + assert 0 <= result.score <= 1.0 + assert result.n_total == len(KNOWLEDGE_ITEMS) + assert result.n_correct >= 0 + assert len(result.per_category) > 0 + + def test_truthfulness_probe_returns_result(self): + model, tokenizer = 
_make_mock_model_and_tokenizer() + runner = BenchmarkRunner(model, tokenizer, device="cpu") + result = runner.run_truthfulness_probe() + + assert isinstance(result, BenchmarkResult) + assert result.benchmark_name == "truthfulness_probe" + assert 0 <= result.score <= 1.0 + assert result.n_total == len(TRUTHFULNESS_ITEMS) + + def test_math_probe_returns_result(self): + model, tokenizer = _make_mock_model_and_tokenizer() + runner = BenchmarkRunner(model, tokenizer, device="cpu") + result = runner.run_math_reasoning_probe() + + assert isinstance(result, BenchmarkResult) + assert result.benchmark_name == "math_reasoning_probe" + assert 0 <= result.score <= 1.0 + assert result.n_total == len(MATH_REASONING_ITEMS) + + def test_run_all(self): + model, tokenizer = _make_mock_model_and_tokenizer() + runner = BenchmarkRunner(model, tokenizer, device="cpu") + results = runner.run_all() + + assert "knowledge" in results + assert "truthfulness" in results + assert "math_reasoning" in results + + def test_format_report(self): + model, tokenizer = _make_mock_model_and_tokenizer() + runner = BenchmarkRunner(model, tokenizer, device="cpu") + results = runner.run_all() + report = format_benchmark_report(results) + + assert "Capability" in report + assert "knowledge" in report + assert "truthfulness" in report + assert "math" in report + + def test_per_category_scores_bounded(self): + model, tokenizer = _make_mock_model_and_tokenizer() + runner = BenchmarkRunner(model, tokenizer, device="cpu") + result = runner.run_knowledge_probe() + + for cat, score in result.per_category.items(): + assert 0 <= score <= 1.0 + + def test_extract_number(self): + model, tokenizer = _make_mock_model_and_tokenizer() + runner = BenchmarkRunner(model, tokenizer, device="cpu") + + assert runner._extract_number("The answer is 42.") == 42.0 + assert runner._extract_number("$20.50 is the price") == 20.50 + assert runner._extract_number("Result: -3.14") == -3.14 + assert runner._extract_number("No numbers 
here") is None diff --git a/tests/test_causal_and_transfer.py b/tests/test_causal_and_transfer.py new file mode 100644 index 0000000..9e18a48 --- /dev/null +++ b/tests/test_causal_and_transfer.py @@ -0,0 +1,535 @@ +"""Tests for causal tracing, residual stream decomposition, +probing classifiers, and cross-model transfer analysis.""" + +from __future__ import annotations + +import math + +import torch + +from obliteratus.analysis.causal_tracing import ( + CausalRefusalTracer, + CausalTracingResult, + ComponentCausalEffect, +) +from obliteratus.analysis.residual_stream import ( + ResidualStreamDecomposer, + ResidualStreamResult, + LayerDecomposition, +) +from obliteratus.analysis.probing_classifiers import ( + LinearRefusalProbe, + ProbeResult, + ProbingSuiteResult, +) +from obliteratus.analysis.cross_model_transfer import ( + TransferAnalyzer, + CrossModelResult, + CrossCategoryResult, + CrossLayerResult, + UniversalityReport, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_layer_activations( + n_layers=8, hidden_dim=32, refusal_strength=2.0, +): + """Create synthetic per-layer activations with planted refusal signal.""" + torch.manual_seed(42) + directions = {} + activations = {} + + base = torch.randn(hidden_dim) * 0.1 + + for i in range(n_layers): + d = torch.randn(hidden_dim) + d = d / d.norm() + directions[i] = d + + # Stronger refusal in middle layers + strength = refusal_strength if 2 <= i <= 5 else 0.3 + activations[i] = base + strength * d + torch.randn(hidden_dim) * 0.05 + + return activations, directions + + +def _make_separable_activations( + n_per_class=20, hidden_dim=16, separation=3.0, seed=42, +): + """Create harmful/harmless activations that are linearly separable.""" + torch.manual_seed(seed) + direction = torch.randn(hidden_dim) + direction = direction / direction.norm() + + harmful = [ + 
torch.randn(hidden_dim) * 0.5 + separation * direction + for _ in range(n_per_class) + ] + harmless = [ + torch.randn(hidden_dim) * 0.5 - separation * direction + for _ in range(n_per_class) + ] + return harmful, harmless, direction + + +# =========================================================================== +# Tests: Causal Tracing +# =========================================================================== + +class TestCausalTracing: + def test_basic_tracing(self): + activations, directions = _make_layer_activations() + tracer = CausalRefusalTracer(noise_level=3.0) + result = tracer.trace_from_activations(activations, directions) + + assert isinstance(result, CausalTracingResult) + assert result.n_layers == 8 + assert result.clean_refusal_strength > 0 + assert len(result.component_effects) == 8 + + def test_causal_components_identified(self): + activations, directions = _make_layer_activations() + tracer = CausalRefusalTracer(noise_level=3.0, causal_threshold=0.05) + result = tracer.trace_from_activations(activations, directions) + + assert result.circuit_size > 0 + assert result.circuit_fraction > 0 + assert len(result.causal_components) > 0 + + def test_corruption_reduces_strength(self): + activations, directions = _make_layer_activations(refusal_strength=5.0) + tracer = CausalRefusalTracer(noise_level=10.0) + result = tracer.trace_from_activations(activations, directions) + + # With high noise, corrupted should differ from clean + assert result.total_corruption_effect != 0 + + def test_single_direction_input(self): + activations, directions = _make_layer_activations() + single_dir = directions[3] # Use one direction for all layers + tracer = CausalRefusalTracer() + result = tracer.trace_from_activations(activations, single_dir) + + assert result.n_layers == 8 + assert len(result.component_effects) == 8 + + def test_component_effects_structure(self): + activations, directions = _make_layer_activations() + tracer = CausalRefusalTracer() + result = 
tracer.trace_from_activations(activations, directions) + + for e in result.component_effects: + assert isinstance(e, ComponentCausalEffect) + assert e.component_type == "full_layer" + assert e.causal_effect >= 0 + + def test_correlation_causal_agreement_bounded(self): + activations, directions = _make_layer_activations() + tracer = CausalRefusalTracer() + result = tracer.trace_from_activations(activations, directions) + assert -1.0 <= result.correlation_causal_agreement <= 1.0 + + def test_silent_contributors(self): + activations, directions = _make_layer_activations() + tracer = CausalRefusalTracer() + result = tracer.trace_from_activations(activations, directions) + sc = tracer.identify_silent_contributors(result, top_k=3) + + assert "silent_contributors" in sc + assert "loud_non_contributors" in sc + assert len(sc["silent_contributors"]) <= 3 + + def test_custom_component_types(self): + activations, directions = _make_layer_activations() + tracer = CausalRefusalTracer() + result = tracer.trace_from_activations( + activations, directions, + component_types=["attention", "mlp"], + ) + # 8 layers * 2 types = 16 effects + assert len(result.component_effects) == 16 + + def test_format_report(self): + activations, directions = _make_layer_activations() + tracer = CausalRefusalTracer() + result = tracer.trace_from_activations(activations, directions) + report = CausalRefusalTracer.format_tracing_report(result) + + assert "Causal Tracing" in report + assert "Circuit size" in report + + +# =========================================================================== +# Tests: Residual Stream Decomposition +# =========================================================================== + +class TestResidualStreamDecomposition: + def test_basic_decomposition(self): + activations, directions = _make_layer_activations() + decomposer = ResidualStreamDecomposer() + result = decomposer.decompose(activations, directions) + + assert isinstance(result, ResidualStreamResult) + assert 
result.n_layers == 8 + assert len(result.per_layer) == 8 + assert result.total_attention_contribution > 0 + assert result.total_mlp_contribution > 0 + + def test_attention_fraction_bounded(self): + activations, directions = _make_layer_activations() + decomposer = ResidualStreamDecomposer() + result = decomposer.decompose(activations, directions) + assert 0 <= result.attention_fraction <= 1.0 + + def test_with_head_count(self): + activations, directions = _make_layer_activations() + decomposer = ResidualStreamDecomposer(n_heads_per_layer=4) + result = decomposer.decompose(activations, directions) + + assert result.n_refusal_heads >= 0 + assert len(result.refusal_heads) > 0 + + def test_layer_decomposition_structure(self): + activations, directions = _make_layer_activations() + decomposer = ResidualStreamDecomposer() + result = decomposer.decompose(activations, directions) + + for _layer_idx, d in result.per_layer.items(): + assert isinstance(d, LayerDecomposition) + assert 0 <= d.attn_mlp_ratio <= 1.0 + assert d.cumulative_refusal >= 0 + + def test_accumulation_profile(self): + activations, directions = _make_layer_activations() + decomposer = ResidualStreamDecomposer() + result = decomposer.decompose(activations, directions) + + assert len(result.accumulation_profile) == 8 + # Accumulation should be monotonically non-decreasing + for i in range(1, len(result.accumulation_profile)): + assert result.accumulation_profile[i] >= result.accumulation_profile[i - 1] + + def test_with_explicit_attn_mlp(self): + """Test with provided attention and MLP outputs.""" + torch.manual_seed(42) + hidden_dim = 16 + n_layers = 4 + ref_dir = torch.randn(hidden_dim) + ref_dir = ref_dir / ref_dir.norm() + + acts = {} + attn_outs = {} + mlp_outs = {} + for i in range(n_layers): + attn = torch.randn(hidden_dim) * 0.5 + mlp = torch.randn(hidden_dim) * 0.5 + attn_outs[i] = attn + mlp_outs[i] = mlp + acts[i] = attn + mlp + (torch.randn(hidden_dim) * 0.1 if i == 0 else acts[i-1]) + + 
decomposer = ResidualStreamDecomposer() + result = decomposer.decompose( + acts, ref_dir, + attn_outputs=attn_outs, mlp_outputs=mlp_outs, + ) + assert len(result.per_layer) == n_layers + + def test_single_direction(self): + activations, _ = _make_layer_activations() + single_dir = torch.randn(32) + decomposer = ResidualStreamDecomposer() + result = decomposer.decompose(activations, single_dir) + assert result.n_layers == 8 + + def test_head_concentration_bounded(self): + activations, directions = _make_layer_activations() + decomposer = ResidualStreamDecomposer(n_heads_per_layer=8) + result = decomposer.decompose(activations, directions) + assert 0 <= result.head_concentration <= 1.0 + + def test_format_decomposition(self): + activations, directions = _make_layer_activations() + decomposer = ResidualStreamDecomposer(n_heads_per_layer=4) + result = decomposer.decompose(activations, directions) + report = ResidualStreamDecomposer.format_decomposition(result) + + assert "Residual Stream" in report + assert "Attention" in report + assert "MLP" in report + + +# =========================================================================== +# Tests: Probing Classifiers +# =========================================================================== + +class TestProbingClassifiers: + def test_separable_data_high_accuracy(self): + """With well-separated data, probe should achieve high accuracy.""" + harmful, harmless, direction = _make_separable_activations( + n_per_class=30, separation=5.0, + ) + probe = LinearRefusalProbe(n_epochs=200) + result = probe.probe_layer(harmful, harmless, direction, layer_idx=5) + + assert isinstance(result, ProbeResult) + assert result.layer_idx == 5 + assert result.accuracy > 0.7 # Should be separable + + def test_inseparable_data_low_accuracy(self): + """With overlapping data, probe should have lower accuracy.""" + harmful, harmless, direction = _make_separable_activations( + n_per_class=30, separation=0.01, + ) + probe = 
LinearRefusalProbe(n_epochs=50) + result = probe.probe_layer(harmful, harmless, direction) + # Accuracy should be near chance (0.5) + assert result.accuracy < 0.9 + + def test_learned_direction_unit(self): + harmful, harmless, direction = _make_separable_activations() + probe = LinearRefusalProbe(n_epochs=100) + result = probe.probe_layer(harmful, harmless, direction) + assert abs(result.learned_direction.norm().item() - 1.0) < 0.01 + + def test_cosine_with_analytical(self): + """Learned direction should align with analytical direction.""" + harmful, harmless, direction = _make_separable_activations( + n_per_class=50, separation=5.0, + ) + probe = LinearRefusalProbe(n_epochs=300) + result = probe.probe_layer(harmful, harmless, direction) + # With clear separation, learned direction should agree + assert result.cosine_with_analytical > 0.3 + + def test_without_analytical_direction(self): + harmful, harmless, _ = _make_separable_activations() + probe = LinearRefusalProbe(n_epochs=50) + result = probe.probe_layer(harmful, harmless) + assert result.cosine_with_analytical == 0.0 + + def test_auroc_bounded(self): + harmful, harmless, direction = _make_separable_activations() + probe = LinearRefusalProbe(n_epochs=100) + result = probe.probe_layer(harmful, harmless, direction) + assert 0 <= result.auroc <= 1.0 + + def test_mutual_information_nonnegative(self): + harmful, harmless, direction = _make_separable_activations() + probe = LinearRefusalProbe(n_epochs=100) + result = probe.probe_layer(harmful, harmless, direction) + assert result.mutual_information >= 0 + + def test_probe_all_layers(self): + harmful_acts = {} + harmless_acts = {} + anal_dirs = {} + for li in range(6): + harmful, harmless, direction = _make_separable_activations( + n_per_class=15, separation=3.0, seed=li * 10, + ) + harmful_acts[li] = harmful + harmless_acts[li] = harmless + anal_dirs[li] = direction + + probe = LinearRefusalProbe(n_epochs=100) + result = probe.probe_all_layers(harmful_acts, 
harmless_acts, anal_dirs) + + assert isinstance(result, ProbingSuiteResult) + assert len(result.per_layer) == 6 + assert result.best_accuracy > 0 + assert result.total_mutual_information >= 0 + + def test_format_report(self): + harmful_acts = {} + harmless_acts = {} + for li in range(4): + harmful, harmless, _ = _make_separable_activations( + n_per_class=15, seed=li, + ) + harmful_acts[li] = harmful + harmless_acts[li] = harmless + + probe = LinearRefusalProbe(n_epochs=50) + result = probe.probe_all_layers(harmful_acts, harmless_acts) + report = LinearRefusalProbe.format_probing_report(result) + + assert "Linear Probing" in report + assert "accuracy" in report.lower() + + def test_cross_entropy_finite(self): + harmful, harmless, direction = _make_separable_activations() + probe = LinearRefusalProbe(n_epochs=100) + result = probe.probe_layer(harmful, harmless, direction) + assert math.isfinite(result.cross_entropy) + + +# =========================================================================== +# Tests: Cross-Model Transfer Analysis +# =========================================================================== + +class TestTransferAnalysis: + def test_cross_model_identical(self): + """Identical directions should give perfect transfer.""" + torch.manual_seed(42) + dirs = {i: torch.randn(32) for i in range(8)} + analyzer = TransferAnalyzer() + result = analyzer.analyze_cross_model(dirs, dirs, "model_a", "model_a") + + assert isinstance(result, CrossModelResult) + assert result.mean_transfer_score > 0.99 + + def test_cross_model_random(self): + """Random directions should give low transfer.""" + torch.manual_seed(42) + dirs_a = {i: torch.randn(32) for i in range(8)} + torch.manual_seed(99) + dirs_b = {i: torch.randn(32) for i in range(8)} + + analyzer = TransferAnalyzer() + result = analyzer.analyze_cross_model(dirs_a, dirs_b, "a", "b") + # Random 32-dim vectors have low expected cosine + assert result.mean_transfer_score < 0.7 + + def 
test_cross_model_structure(self): + torch.manual_seed(42) + dirs_a = {i: torch.randn(32) for i in range(8)} + dirs_b = {i: torch.randn(32) for i in range(8)} + analyzer = TransferAnalyzer() + result = analyzer.analyze_cross_model(dirs_a, dirs_b) + + assert 0 <= result.transfer_above_threshold <= 1.0 + assert len(result.per_layer_transfer) == 8 + + def test_cross_category_similar(self): + """Similar categories should cluster together.""" + torch.manual_seed(42) + shared = torch.randn(32) + shared = shared / shared.norm() + + cat_dirs = {} + for cat in ["weapons", "bombs", "explosives"]: + d = shared + 0.2 * torch.randn(32) + cat_dirs[cat] = d / d.norm() + + # Add one very different category + cat_dirs["fraud"] = torch.randn(32) + + analyzer = TransferAnalyzer() + result = analyzer.analyze_cross_category(cat_dirs) + + assert isinstance(result, CrossCategoryResult) + assert result.mean_cross_category_transfer > 0 + assert len(result.categories) == 4 + + def test_cross_category_specificity(self): + torch.manual_seed(42) + cat_dirs = {f"cat_{i}": torch.randn(16) for i in range(5)} + analyzer = TransferAnalyzer() + result = analyzer.analyze_cross_category(cat_dirs) + + assert result.most_universal_category != "" + assert result.most_specific_category != "" + assert len(result.category_clusters) > 0 + + def test_cross_layer(self): + _, directions = _make_layer_activations() + analyzer = TransferAnalyzer() + result = analyzer.analyze_cross_layer(directions) + + assert isinstance(result, CrossLayerResult) + assert result.mean_adjacent_transfer >= 0 + assert result.transfer_decay_rate >= 0 + + def test_cross_layer_adjacent_vs_distant(self): + """Adjacent layers typically have higher transfer than distant ones.""" + torch.manual_seed(42) + # Create directions with gradual drift + d = torch.randn(32) + d = d / d.norm() + directions = {} + for i in range(10): + noise = torch.randn(32) * 0.1 * i + di = d + noise + directions[i] = di / di.norm() + + analyzer = TransferAnalyzer() 
+ result = analyzer.analyze_cross_layer(directions) + # Adjacent should have higher transfer than distant + assert result.mean_adjacent_transfer >= result.mean_distant_transfer - 0.1 + + def test_universality_index(self): + torch.manual_seed(42) + dirs = {i: torch.randn(32) for i in range(6)} + + analyzer = TransferAnalyzer() + cross_model = analyzer.analyze_cross_model(dirs, dirs) + cross_layer = analyzer.analyze_cross_layer(dirs) + cat_dirs = {f"cat_{i}": torch.randn(32) for i in range(4)} + cross_cat = analyzer.analyze_cross_category(cat_dirs) + + report = analyzer.compute_universality_index( + cross_model=cross_model, + cross_category=cross_cat, + cross_layer=cross_layer, + ) + + assert isinstance(report, UniversalityReport) + assert 0 <= report.universality_index <= 1.0 + + def test_universality_empty(self): + analyzer = TransferAnalyzer() + report = analyzer.compute_universality_index() + assert report.universality_index == 0.0 + + def test_format_cross_model(self): + torch.manual_seed(42) + dirs = {i: torch.randn(32) for i in range(4)} + analyzer = TransferAnalyzer() + result = analyzer.analyze_cross_model(dirs, dirs, "llama", "mistral") + report = TransferAnalyzer.format_cross_model(result) + assert "Cross-Model" in report + assert "llama" in report + + def test_format_cross_category(self): + torch.manual_seed(42) + cat_dirs = {f"cat_{i}": torch.randn(16) for i in range(3)} + analyzer = TransferAnalyzer() + result = analyzer.analyze_cross_category(cat_dirs) + report = TransferAnalyzer.format_cross_category(result) + assert "Cross-Category" in report + + def test_format_universality(self): + analyzer = TransferAnalyzer() + report_obj = analyzer.compute_universality_index() + report = TransferAnalyzer.format_universality(report_obj) + assert "Universality" in report + + def test_dimension_mismatch_handled(self): + """Cross-model with different hidden dims should truncate.""" + dirs_a = {0: torch.randn(32), 1: torch.randn(32)} + dirs_b = {0: torch.randn(64), 
1: torch.randn(64)} + analyzer = TransferAnalyzer() + result = analyzer.analyze_cross_model(dirs_a, dirs_b) + assert len(result.per_layer_transfer) == 2 + + +# =========================================================================== +# Tests: Integration +# =========================================================================== + +class TestNewImports: + def test_all_new_modules_importable(self): + from obliteratus.analysis import ( + CausalRefusalTracer, + ResidualStreamDecomposer, + LinearRefusalProbe, + TransferAnalyzer, + ) + assert CausalRefusalTracer is not None + assert ResidualStreamDecomposer is not None + assert LinearRefusalProbe is not None + assert TransferAnalyzer is not None diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..98ed2ab --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,133 @@ +"""CLI dispatch tests for obliteratus.cli.main(). + +These tests verify argument parsing and subcommand routing without +downloading real models or running any pipeline. They use +``unittest.mock.patch`` to capture stdout/stderr and +``pytest.raises(SystemExit)`` for argparse exits. 
+""" + +from __future__ import annotations + +from io import StringIO +from unittest.mock import patch + +import pytest + +from obliteratus.cli import main + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _capture_exit(argv: list[str] | None, *, expect_code: int | None = None): + """Call main(argv), expecting SystemExit; return captured stderr text.""" + buf = StringIO() + with pytest.raises(SystemExit) as exc_info, patch("sys.stderr", buf): + main(argv) + if expect_code is not None: + assert exc_info.value.code == expect_code + return buf.getvalue() + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestCLIDispatch: + """Test suite for CLI argument parsing and subcommand dispatch.""" + + # 1. No args -> prints help / exits with error + def test_main_no_args_prints_help(self): + """Calling main() with no args should exit (subcommand is required).""" + stderr_text = _capture_exit([], expect_code=2) + # argparse prints usage info to stderr on error + assert "usage" in stderr_text.lower() or "required" in stderr_text.lower() + + # 2. models command lists models without error + def test_models_command(self): + """Calling main(['models']) should list models without raising.""" + with patch("obliteratus.cli.console") as mock_console: + main(["models"]) + # console.print is called at least once to render the table + assert mock_console.print.call_count >= 1 + + # 3. obliterate without model arg -> error + def test_obliterate_requires_model(self): + """Calling main(['obliterate']) without a model arg should error.""" + stderr_text = _capture_exit(["obliterate"], expect_code=2) + assert "model" in stderr_text.lower() or "required" in stderr_text.lower() + + # 4. 
obliterate --method accepts valid methods + def test_obliterate_valid_methods(self): + """Test that --method accepts all 9 pipeline methods.""" + valid_methods = [ + "basic", "advanced", "aggressive", "spectral_cascade", + "informed", "surgical", "optimized", "inverted", "nuclear", + ] + for method in valid_methods: + # Patch the actual pipeline execution so nothing runs + with patch("obliteratus.cli._cmd_abliterate") as mock_cmd: + main(["obliterate", "fake/model", "--method", method]) + mock_cmd.assert_called_once() + args_passed = mock_cmd.call_args[0][0] + assert args_passed.method == method + + # 4b. invalid methods are rejected + def test_obliterate_rejects_invalid_method(self): + """The CLI --method flag rejects unknown method names.""" + stderr_text = _capture_exit( + ["obliterate", "fake/model", "--method", "nonexistent"], + expect_code=2, + ) + assert "invalid choice" in stderr_text.lower() + + # 5. run requires config path + def test_run_requires_config(self): + """Calling main(['run']) without a config path should error.""" + stderr_text = _capture_exit(["run"], expect_code=2) + assert "config" in stderr_text.lower() or "required" in stderr_text.lower() + + # 6. aggregate with nonexistent dir handles gracefully + def test_aggregate_command_missing_dir(self): + """Calling main(['aggregate']) with nonexistent dir should handle gracefully.""" + with patch("obliteratus.cli.console") as mock_console: + main(["aggregate", "--dir", "/nonexistent/path/to/nowhere"]) + # The command prints a message about no contributions found and returns + printed_text = " ".join( + str(call) for call in mock_console.print.call_args_list + ) + assert "no contributions found" in printed_text.lower() or mock_console.print.called + + # 7. 
--help flag prints help + def test_help_flag(self): + """Calling main(['--help']) should print help and exit 0.""" + buf = StringIO() + with pytest.raises(SystemExit) as exc_info, patch("sys.stdout", buf): + main(["--help"]) + assert exc_info.value.code == 0 + output = buf.getvalue() + assert "obliteratus" in output.lower() or "usage" in output.lower() + + # 8. interactive subcommand is registered + def test_interactive_command_exists(self): + """Verify 'interactive' subcommand is registered and dispatches.""" + with patch("obliteratus.cli._cmd_interactive") as mock_cmd: + main(["interactive"]) + mock_cmd.assert_called_once() + + # 9. --contribute and --contribute-notes are accepted on obliterate + def test_contribute_flags_on_obliterate(self): + """Verify --contribute and --contribute-notes are accepted args.""" + with patch("obliteratus.cli._cmd_abliterate") as mock_cmd: + main([ + "obliterate", "fake/model", + "--contribute", + "--contribute-notes", "Testing contribution system", + ]) + mock_cmd.assert_called_once() + args_passed = mock_cmd.call_args[0][0] + assert args_passed.contribute is True + assert args_passed.contribute_notes == "Testing contribution system" diff --git a/tests/test_community.py b/tests/test_community.py new file mode 100644 index 0000000..f240088 --- /dev/null +++ b/tests/test_community.py @@ -0,0 +1,567 @@ +"""Tests for the community contribution system.""" + +import json +from unittest.mock import MagicMock + +import pytest +import torch + +from obliteratus.community import ( + CONTRIBUTION_SCHEMA_VERSION, + _config_fingerprint, + _model_short_name, + aggregate_results, + generate_latex_table, + load_contributions, + save_contribution, +) + + +# ── Helper: mock pipeline ────────────────────────────────────────────── + + +def _make_mock_pipeline(): + """Build a mock pipeline with all fields the community module reads.""" + p = MagicMock() + p.handle.summary.return_value = { + "architecture": "LlamaForCausalLM", + "num_layers": 32, + 
"num_heads": 32, + "hidden_size": 4096, + "total_params": 8_000_000_000, + } + p.method = "advanced" + p.n_directions = 4 + p.norm_preserve = True + p.regularization = 0.3 + p.refinement_passes = 2 + p.project_biases = True + p.use_chat_template = True + p.use_whitened_svd = True + p.true_iterative_refinement = False + p.use_jailbreak_contrast = False + p.layer_adaptive_strength = False + p.attention_head_surgery = True + p.safety_neuron_masking = False + p.per_expert_directions = False + p.use_sae_features = False + p.invert_refusal = False + p.project_embeddings = False + p.embed_regularization = 0.5 + p.activation_steering = False + p.steering_strength = 0.3 + p.expert_transplant = False + p.transplant_blend = 0.3 + p.reflection_strength = 2.0 + p.quantization = None + + p._quality_metrics = {"perplexity": 5.2, "coherence": 0.8, "refusal_rate": 0.05} + p._strong_layers = [10, 11, 12, 13] + p._stage_durations = { + "summon": 3.0, "probe": 12.5, "distill": 4.1, + "excise": 2.0, "verify": 8.3, "rebirth": 5.0, + } + p._excise_modified_count = 128 + + # Direction data + d = torch.randn(4096) + d = d / d.norm() + p.refusal_directions = {10: d, 11: d + 0.01 * torch.randn(4096)} + p.refusal_subspaces = {10: torch.randn(4, 4096)} + + # Excise details + p._refusal_heads = {10: [(0, 0.9), (3, 0.8)]} + p._sae_directions = {} + p._expert_safety_scores = {} + p._layer_excise_weights = {} + p._expert_directions = {} + p._steering_hooks = [] + + # Prompts + p.harmful_prompts = ["x"] * 33 + p.harmless_prompts = ["y"] * 33 + p.jailbreak_prompts = None + + return p + + +# ── Model short name ─────────────────────────────────────────────────── + + +class TestModelShortName: + def test_strips_org_prefix(self): + assert _model_short_name("meta-llama/Llama-2-7b-chat-hf") == "llama-2-7b-chat-hf" + + def test_no_org_prefix(self): + assert _model_short_name("gpt2") == "gpt2" + + def test_sanitizes_special_chars(self): + assert _model_short_name("org/Model_V2.1") == "model-v2-1" + + def 
test_caps_length(self): + long_name = "a" * 100 + assert len(_model_short_name(long_name)) <= 60 + + def test_collapses_dashes(self): + assert _model_short_name("org/Model---Name") == "model-name" + + def test_strips_trailing_dashes(self): + assert _model_short_name("org/Model-") == "model" + + +# ── Config fingerprint ───────────────────────────────────────────────── + + +class TestConfigFingerprint: + def test_deterministic(self): + config = {"n_directions": 4, "norm_preserve": True} + fp1 = _config_fingerprint(config) + fp2 = _config_fingerprint(config) + assert fp1 == fp2 + + def test_different_configs_different_hashes(self): + fp1 = _config_fingerprint({"n_directions": 4}) + fp2 = _config_fingerprint({"n_directions": 8}) + assert fp1 != fp2 + + def test_key_order_invariant(self): + fp1 = _config_fingerprint({"a": 1, "b": 2}) + fp2 = _config_fingerprint({"b": 2, "a": 1}) + assert fp1 == fp2 + + def test_returns_8_char_hex(self): + fp = _config_fingerprint({"test": True}) + assert len(fp) == 8 + assert all(c in "0123456789abcdef" for c in fp) + + +# ── Save contribution ────────────────────────────────────────────────── + + +class TestSaveContribution: + def test_saves_json_file(self, tmp_path): + pipeline = _make_mock_pipeline() + path = save_contribution( + pipeline, + model_name="meta-llama/Llama-2-7b-chat-hf", + output_dir=tmp_path, + ) + assert path.exists() + assert path.suffix == ".json" + data = json.loads(path.read_text()) + assert data["contribution_schema_version"] == CONTRIBUTION_SCHEMA_VERSION + assert data["model_name"] == "meta-llama/Llama-2-7b-chat-hf" + + def test_filename_format(self, tmp_path): + pipeline = _make_mock_pipeline() + path = save_contribution( + pipeline, + model_name="meta-llama/Llama-2-7b-chat-hf", + output_dir=tmp_path, + ) + name = path.stem + assert name.startswith("llama-2-7b-chat-hf_advanced_") + + def test_includes_telemetry_report(self, tmp_path): + pipeline = _make_mock_pipeline() + path = save_contribution( + pipeline, 
+ model_name="meta-llama/Llama-2-7b-chat-hf", + output_dir=tmp_path, + ) + data = json.loads(path.read_text()) + telemetry = data["telemetry"] + assert telemetry["schema_version"] == 2 + assert telemetry["model"]["architecture"] == "LlamaForCausalLM" + assert telemetry["method"] == "advanced" + assert telemetry["quality_metrics"]["refusal_rate"] == 0.05 + + def test_includes_config_fingerprint(self, tmp_path): + pipeline = _make_mock_pipeline() + path = save_contribution( + pipeline, + model_name="meta-llama/Llama-2-7b-chat-hf", + output_dir=tmp_path, + ) + data = json.loads(path.read_text()) + assert "config_fingerprint" in data + assert len(data["config_fingerprint"]) == 8 + + def test_includes_notes(self, tmp_path): + pipeline = _make_mock_pipeline() + path = save_contribution( + pipeline, + model_name="test/model", + notes="Ran on A100 with default prompts", + output_dir=tmp_path, + ) + data = json.loads(path.read_text()) + assert data["notes"] == "Ran on A100 with default prompts" + + def test_creates_output_dir(self, tmp_path): + subdir = tmp_path / "nested" / "dir" + assert not subdir.exists() + pipeline = _make_mock_pipeline() + path = save_contribution( + pipeline, model_name="test/model", output_dir=subdir, + ) + assert subdir.exists() + assert path.exists() + + def test_timestamp_format(self, tmp_path): + pipeline = _make_mock_pipeline() + path = save_contribution( + pipeline, model_name="test/model", output_dir=tmp_path, + ) + data = json.loads(path.read_text()) + ts = data["timestamp"] + # Should be UTC ISO-ish: YYYYMMDDTHHMMSSZ + assert ts.endswith("Z") + assert "T" in ts + assert len(ts) == 16 + + def test_method_config_extracted(self, tmp_path): + pipeline = _make_mock_pipeline() + path = save_contribution( + pipeline, model_name="test/model", output_dir=tmp_path, + ) + data = json.loads(path.read_text()) + cfg = data["telemetry"]["method_config"] + assert cfg["n_directions"] == 4 + assert cfg["norm_preserve"] is True + assert 
cfg["attention_head_surgery"] is True + + +# ── Load contributions ───────────────────────────────────────────────── + + +class TestLoadContributions: + def _write_contrib(self, directory, model, method, refusal_rate, idx=0): + """Write a minimal valid contribution file.""" + record = { + "contribution_schema_version": CONTRIBUTION_SCHEMA_VERSION, + "timestamp": f"20260227T{idx:06d}Z", + "model_name": model, + "config_fingerprint": "abcd1234", + "notes": "", + "telemetry": { + "schema_version": 2, + "method": method, + "quality_metrics": {"refusal_rate": refusal_rate}, + }, + } + path = directory / f"contrib_{idx}.json" + path.write_text(json.dumps(record)) + return path + + def test_loads_valid_files(self, tmp_path): + self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0) + self._write_contrib(tmp_path, "test/model", "basic", 0.10, 1) + records = load_contributions(tmp_path) + assert len(records) == 2 + + def test_sorts_by_timestamp(self, tmp_path): + self._write_contrib(tmp_path, "model-b", "advanced", 0.05, 2) + self._write_contrib(tmp_path, "model-a", "advanced", 0.10, 1) + records = load_contributions(tmp_path) + assert records[0]["model_name"] == "model-a" + assert records[1]["model_name"] == "model-b" + + def test_skips_non_contribution_json(self, tmp_path): + # Write a JSON file without contribution_schema_version + (tmp_path / "random.json").write_text('{"foo": "bar"}') + self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0) + records = load_contributions(tmp_path) + assert len(records) == 1 + + def test_skips_invalid_json(self, tmp_path): + (tmp_path / "bad.json").write_text("not valid json {{{") + self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0) + records = load_contributions(tmp_path) + assert len(records) == 1 + + def test_returns_empty_for_missing_dir(self, tmp_path): + records = load_contributions(tmp_path / "nonexistent") + assert records == [] + + def test_tracks_source_file(self, tmp_path): + 
self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0) + records = load_contributions(tmp_path) + assert "_source_file" in records[0] + assert "contrib_0.json" in records[0]["_source_file"] + + def test_ignores_non_json_files(self, tmp_path): + (tmp_path / "readme.txt").write_text("some text") + self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0) + records = load_contributions(tmp_path) + assert len(records) == 1 + + +# ── Aggregate results ────────────────────────────────────────────────── + + +class TestAggregateResults: + def _make_record(self, model, method, refusal_rate, perplexity=None, coherence=None): + metrics = {"refusal_rate": refusal_rate} + if perplexity is not None: + metrics["perplexity"] = perplexity + if coherence is not None: + metrics["coherence"] = coherence + return { + "model_name": model, + "telemetry": { + "method": method, + "quality_metrics": metrics, + }, + } + + def test_single_record(self): + records = [self._make_record("model-a", "advanced", 0.05)] + result = aggregate_results(records) + assert "model-a" in result + assert "advanced" in result["model-a"] + assert result["model-a"]["advanced"]["n_runs"] == 1 + assert result["model-a"]["advanced"]["refusal_rate"]["mean"] == 0.05 + + def test_multiple_runs_same_model_method(self): + records = [ + self._make_record("model-a", "advanced", 0.04), + self._make_record("model-a", "advanced", 0.06), + ] + result = aggregate_results(records) + stats = result["model-a"]["advanced"] + assert stats["n_runs"] == 2 + assert stats["refusal_rate"]["mean"] == 0.05 + assert stats["refusal_rate"]["min"] == 0.04 + assert stats["refusal_rate"]["max"] == 0.06 + assert stats["refusal_rate"]["n"] == 2 + + def test_multiple_models(self): + records = [ + self._make_record("model-a", "advanced", 0.05), + self._make_record("model-b", "basic", 0.10), + ] + result = aggregate_results(records) + assert len(result) == 2 + assert "model-a" in result + assert "model-b" in result + + def 
test_multiple_methods(self): + records = [ + self._make_record("model-a", "advanced", 0.05), + self._make_record("model-a", "basic", 0.10), + ] + result = aggregate_results(records) + assert len(result["model-a"]) == 2 + assert "advanced" in result["model-a"] + assert "basic" in result["model-a"] + + def test_std_zero_for_single_run(self): + records = [self._make_record("model-a", "advanced", 0.05)] + result = aggregate_results(records) + assert result["model-a"]["advanced"]["refusal_rate"]["std"] == 0.0 + + def test_multiple_metrics(self): + records = [ + self._make_record("model-a", "advanced", 0.05, perplexity=5.2, coherence=0.8), + ] + result = aggregate_results(records) + stats = result["model-a"]["advanced"] + assert "refusal_rate" in stats + assert "perplexity" in stats + assert "coherence" in stats + assert stats["perplexity"]["mean"] == 5.2 + + def test_missing_metric_skipped(self): + records = [self._make_record("model-a", "advanced", 0.05)] + result = aggregate_results(records) + # coherence not provided, should not appear + assert "coherence" not in result["model-a"]["advanced"] + + def test_unknown_model_and_method(self): + records = [{ + "telemetry": {"quality_metrics": {"refusal_rate": 0.1}}, + }] + result = aggregate_results(records) + assert "unknown" in result + assert "unknown" in result["unknown"] + + +# ── LaTeX table generation ───────────────────────────────────────────── + + +class TestGenerateLatexTable: + def _sample_aggregated(self): + return { + "meta-llama/Llama-2-7b-chat-hf": { + "advanced": { + "n_runs": 3, + "refusal_rate": {"mean": 0.04, "std": 0.01, "n": 3, "min": 0.03, "max": 0.05}, + }, + "basic": { + "n_runs": 2, + "refusal_rate": {"mean": 0.08, "std": 0.02, "n": 2, "min": 0.06, "max": 0.10}, + }, + }, + "mistralai/Mistral-7B-Instruct-v0.2": { + "advanced": { + "n_runs": 1, + "refusal_rate": {"mean": 0.03, "std": 0.0, "n": 1, "min": 0.03, "max": 0.03}, + }, + }, + } + + def test_produces_valid_latex(self): + agg = 
self._sample_aggregated() + latex = generate_latex_table(agg) + assert "\\begin{tabular}" in latex + assert "\\end{tabular}" in latex + assert "\\toprule" in latex + assert "\\bottomrule" in latex + + def test_includes_model_names(self): + agg = self._sample_aggregated() + latex = generate_latex_table(agg) + assert "Llama-2-7b-chat-hf" in latex + assert "Mistral-7B-Instruct-v0.2" in latex + + def test_includes_method_headers(self): + agg = self._sample_aggregated() + latex = generate_latex_table(agg) + assert "advanced" in latex + assert "basic" in latex + + def test_missing_method_shows_dash(self): + agg = self._sample_aggregated() + latex = generate_latex_table(agg) + # Mistral doesn't have "basic" method + assert "---" in latex + + def test_shows_std_when_multiple_runs(self): + agg = self._sample_aggregated() + latex = generate_latex_table(agg) + assert "$\\pm$" in latex + + def test_no_std_for_single_run(self): + agg = { + "model": { + "method": { + "n_runs": 1, + "refusal_rate": {"mean": 0.03, "std": 0.0, "n": 1, "min": 0.03, "max": 0.03}, + }, + }, + } + latex = generate_latex_table(agg) + assert "$\\pm$" not in latex + + def test_methods_filter(self): + agg = self._sample_aggregated() + latex = generate_latex_table(agg, methods=["advanced"]) + assert "\\textbf{advanced}" in latex + assert "\\textbf{basic}" not in latex + + def test_custom_metric(self): + agg = { + "model": { + "method": { + "n_runs": 2, + "perplexity": {"mean": 5.2, "std": 0.3, "n": 2, "min": 4.9, "max": 5.5}, + }, + }, + } + latex = generate_latex_table(agg, metric="perplexity") + assert "5.2" in latex + + def test_column_count_matches_methods(self): + agg = self._sample_aggregated() + latex = generate_latex_table(agg) + # 2 methods → "lcc" (1 model col + 2 method cols) + assert "{@{}lcc@{}}" in latex + + +# ── CLI integration ──────────────────────────────────────────────────── + + +class TestCLIContributeFlag: + def test_contribute_flag_accepted(self): + """Verify the --contribute flag 
parses without error.""" + from obliteratus.cli import main + + # We can't run the full command (no GPU), but verify parsing works + with pytest.raises(SystemExit): + # "obliterate" requires a model, so parse will fail, + # but if --contribute is not recognized it fails differently + main(["obliterate", "--help"]) + + def test_aggregate_command_accepted(self): + """Verify the aggregate command parses without error.""" + from obliteratus.cli import main + + with pytest.raises(SystemExit): + main(["aggregate", "--help"]) + + +# ── Package exports ──────────────────────────────────────────────────── + + +class TestPackageExports: + def test_save_contribution_importable(self): + from obliteratus import save_contribution + assert callable(save_contribution) + + def test_load_contributions_importable(self): + from obliteratus import load_contributions + assert callable(load_contributions) + + def test_aggregate_results_importable(self): + from obliteratus import aggregate_results + assert callable(aggregate_results) + + +# ── End-to-end: save → load → aggregate ─────────────────────────────── + + +class TestEndToEnd: + def test_save_load_aggregate_roundtrip(self, tmp_path): + """Full roundtrip: save contributions, load them, aggregate.""" + pipeline = _make_mock_pipeline() + + # Save two contributions (different models to avoid filename collision) + save_contribution( + pipeline, model_name="test/model-a", output_dir=tmp_path, + ) + # Tweak metrics for second run with a different model name + pipeline._quality_metrics = {"perplexity": 5.5, "coherence": 0.75, "refusal_rate": 0.07} + save_contribution( + pipeline, model_name="test/model-b", output_dir=tmp_path, + ) + + # Load + records = load_contributions(tmp_path) + assert len(records) == 2 + + # Aggregate + aggregated = aggregate_results(records) + assert "test/model-a" in aggregated + assert "test/model-b" in aggregated + stats_a = aggregated["test/model-a"]["advanced"] + stats_b = 
aggregated["test/model-b"]["advanced"] + assert stats_a["n_runs"] == 1 + assert stats_b["n_runs"] == 1 + assert abs(stats_a["refusal_rate"]["mean"] - 0.05) < 0.001 + assert abs(stats_b["refusal_rate"]["mean"] - 0.07) < 0.001 + + def test_save_load_aggregate_to_latex(self, tmp_path): + """Full roundtrip ending in LaTeX output.""" + pipeline = _make_mock_pipeline() + save_contribution( + pipeline, model_name="meta-llama/Llama-2-7b-chat-hf", output_dir=tmp_path, + ) + + records = load_contributions(tmp_path) + aggregated = aggregate_results(records) + latex = generate_latex_table(aggregated) + + assert "\\begin{tabular}" in latex + assert "Llama-2-7b-chat-hf" in latex + assert "advanced" in latex diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..debaad5 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,59 @@ +"""Tests for configuration loading.""" + +from __future__ import annotations + + +import yaml + +from obliteratus.config import StudyConfig + + +SAMPLE_CONFIG = { + "model": { + "name": "gpt2", + "task": "causal_lm", + "dtype": "float32", + "device": "cpu", + }, + "dataset": { + "name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "test", + "text_column": "text", + "max_samples": 50, + }, + "strategies": [ + {"name": "layer_removal", "params": {}}, + {"name": "ffn_ablation", "params": {}}, + ], + "metrics": ["perplexity"], + "batch_size": 4, + "max_length": 256, + "output_dir": "results/test", +} + + +class TestStudyConfig: + def test_from_dict(self): + config = StudyConfig.from_dict(SAMPLE_CONFIG) + assert config.model.name == "gpt2" + assert config.model.task == "causal_lm" + assert config.dataset.name == "wikitext" + assert len(config.strategies) == 2 + assert config.strategies[0].name == "layer_removal" + + def test_from_yaml(self, tmp_path): + yaml_path = tmp_path / "test_config.yaml" + yaml_path.write_text(yaml.dump(SAMPLE_CONFIG)) + + config = StudyConfig.from_yaml(yaml_path) + assert 
config.model.name == "gpt2" + assert config.batch_size == 4 + + def test_roundtrip(self): + config = StudyConfig.from_dict(SAMPLE_CONFIG) + d = config.to_dict() + config2 = StudyConfig.from_dict(d) + assert config2.model.name == config.model.name + assert config2.dataset.name == config.dataset.name + assert len(config2.strategies) == len(config.strategies) diff --git a/tests/test_defense_robustness.py b/tests/test_defense_robustness.py new file mode 100644 index 0000000..0b7f679 --- /dev/null +++ b/tests/test_defense_robustness.py @@ -0,0 +1,169 @@ +"""Tests for defense robustness evaluation framework.""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +import torch + +from obliteratus.analysis.defense_robustness import ( + DefenseProfile, + DefenseRobustnessEvaluator, + EntanglementMap, + SelfRepairResult, +) + + +def _make_mock_pipeline(n_layers=6, hidden_dim=16, n_prompts=5): + """Create a mock pipeline with refusal directions and activations.""" + pipeline = MagicMock() + pipeline.model_name = "test-model" + + # Generate refusal directions (some strong, some weak) + torch.manual_seed(42) + directions = {} + for i in range(n_layers): + d = torch.randn(hidden_dim) + directions[i] = d / d.norm() + pipeline.refusal_directions = directions + + # Generate activations with a planted refusal signal in middle layers + harmful_means = {} + harmless_means = {} + harmful_acts = {} + harmless_acts = {} + + for i in range(n_layers): + base = torch.randn(hidden_dim) + harmless_means[i] = base.unsqueeze(0) + + # Middle layers have stronger refusal signal + signal_strength = 3.0 if 2 <= i <= 4 else 0.5 + harmful_means[i] = (base + signal_strength * directions[i]).unsqueeze(0) + + harmful_acts[i] = [base + signal_strength * directions[i] + torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)] + harmless_acts[i] = [base + torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)] + + pipeline._harmful_means = harmful_means + 
pipeline._harmless_means = harmless_means + pipeline._harmful_acts = harmful_acts + pipeline._harmless_acts = harmless_acts + + return pipeline + + +class TestDefenseProfile: + def test_profile_generates(self): + pipeline = _make_mock_pipeline() + evaluator = DefenseRobustnessEvaluator(pipeline) + profile = evaluator.profile_defense() + + assert isinstance(profile, DefenseProfile) + assert profile.model_name == "test-model" + assert profile.refusal_layer_spread > 0 + assert profile.mean_refusal_strength > 0 + assert profile.max_refusal_strength >= profile.mean_refusal_strength + assert profile.estimated_robustness in ("low", "medium", "high", "very_high") + + def test_alignment_type_estimate(self): + pipeline = _make_mock_pipeline() + evaluator = DefenseRobustnessEvaluator(pipeline) + profile = evaluator.profile_defense() + assert profile.alignment_type_estimate != "unknown" + + def test_empty_pipeline(self): + pipeline = MagicMock() + pipeline.model_name = "empty" + pipeline.refusal_directions = {} + evaluator = DefenseRobustnessEvaluator(pipeline) + profile = evaluator.profile_defense() + assert profile.estimated_robustness == "unknown" + + def test_concentration_bounded(self): + pipeline = _make_mock_pipeline() + evaluator = DefenseRobustnessEvaluator(pipeline) + profile = evaluator.profile_defense() + # Gini coefficient should be between 0 and 1 + assert 0 <= profile.refusal_concentration <= 1.0 + + def test_self_repair_bounded(self): + pipeline = _make_mock_pipeline() + evaluator = DefenseRobustnessEvaluator(pipeline) + profile = evaluator.profile_defense() + assert 0 <= profile.self_repair_estimate <= 1.0 + + def test_format_report(self): + pipeline = _make_mock_pipeline() + evaluator = DefenseRobustnessEvaluator(pipeline) + profile = evaluator.profile_defense() + report = DefenseRobustnessEvaluator.format_defense_profile(profile) + assert "Defense Robustness" in report + assert "test-model" in report + + +class TestSelfRepair: + def 
test_self_repair_measurement(self): + pipeline = _make_mock_pipeline() + evaluator = DefenseRobustnessEvaluator(pipeline) + result = evaluator.measure_self_repair(layer_idx=3) + + assert isinstance(result, SelfRepairResult) + assert result.layer_idx == 3 + assert result.original_refusal_strength >= 0 + assert 0 <= result.repair_ratio <= 1.0 + assert len(result.compensating_layers) > 0 + assert 3 not in result.compensating_layers # shouldn't list itself + + def test_repair_ratio_high_for_distributed(self): + """Distributed refusal should have high repair ratio.""" + pipeline = _make_mock_pipeline(n_layers=10) + evaluator = DefenseRobustnessEvaluator(pipeline) + result = evaluator.measure_self_repair(layer_idx=3) + # With distributed signal, removing one layer leaves much compensation + assert result.repair_ratio > 0.5 + + def test_format_self_repair(self): + pipeline = _make_mock_pipeline() + evaluator = DefenseRobustnessEvaluator(pipeline) + result = evaluator.measure_self_repair(layer_idx=2) + report = DefenseRobustnessEvaluator.format_self_repair(result) + assert "Self-Repair" in report + assert "Layer 2" in report + + +class TestEntanglement: + def test_entanglement_map(self): + pipeline = _make_mock_pipeline() + evaluator = DefenseRobustnessEvaluator(pipeline) + emap = evaluator.map_entanglement() + + assert isinstance(emap, EntanglementMap) + assert len(emap.layer_entanglement) > 0 + assert 0 <= emap.overall_entanglement <= 1.0 + assert len(emap.most_entangled_layers) > 0 + assert len(emap.least_entangled_layers) > 0 + + def test_capability_sensitivity_keys(self): + pipeline = _make_mock_pipeline() + evaluator = DefenseRobustnessEvaluator(pipeline) + emap = evaluator.map_entanglement() + + expected_keys = {"factual_knowledge", "reasoning", "language_fluency", + "instruction_following", "math"} + assert set(emap.capability_sensitivity.keys()) == expected_keys + + def test_math_most_sensitive(self): + """Math should be estimated as the most sensitive 
capability.""" + pipeline = _make_mock_pipeline() + evaluator = DefenseRobustnessEvaluator(pipeline) + emap = evaluator.map_entanglement() + if emap.overall_entanglement > 0: + assert emap.capability_sensitivity["math"] >= emap.capability_sensitivity["language_fluency"] + + def test_format_entanglement(self): + pipeline = _make_mock_pipeline() + evaluator = DefenseRobustnessEvaluator(pipeline) + emap = evaluator.map_entanglement() + report = DefenseRobustnessEvaluator.format_entanglement(emap) + assert "Entanglement" in report + assert "math" in report diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py new file mode 100644 index 0000000..4184c0f --- /dev/null +++ b/tests/test_edge_cases.py @@ -0,0 +1,510 @@ +"""Edge-case and robustness tests. + +Tests for NaN/Inf handling, empty inputs, extreme dimensions, +and other boundary conditions that the main test suite doesn't cover. +""" + +from __future__ import annotations + +import math + +import pytest +import torch +import torch.nn as nn + +from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor +from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer +from obliteratus.analysis.concept_geometry import ConceptConeAnalyzer +from obliteratus.analysis.alignment_imprint import AlignmentImprintDetector +from obliteratus.analysis.multi_token_position import MultiTokenPositionAnalyzer +from obliteratus.analysis.sparse_surgery import SparseDirectionSurgeon +from obliteratus.analysis.causal_tracing import CausalRefusalTracer +from obliteratus.analysis.residual_stream import ResidualStreamDecomposer +from obliteratus.analysis.probing_classifiers import LinearRefusalProbe +from obliteratus.analysis.cross_model_transfer import TransferAnalyzer +from obliteratus.evaluation.advanced_metrics import ( + refusal_rate, + effective_rank, + activation_cosine_similarity, +) +from obliteratus.analysis.steering_vectors import ( + SteeringVectorFactory, + SteeringHookManager, + SteeringConfig, + 
SteeringResult, + compute_steering_effectiveness, + format_steering_report, +) + + +# =========================================================================== +# NaN / Inf handling +# =========================================================================== + +class TestNaNInfHandling: + """Test that modules handle degenerate inputs gracefully.""" + + def test_whitened_svd_nan_activations(self): + """WhitenedSVD with NaN — currently raises; documenting behavior.""" + harmful = [torch.tensor([float("nan"), 1.0, 2.0]) for _ in range(5)] + harmless = [torch.randn(3) for _ in range(5)] + extractor = WhitenedSVDExtractor() + # NaN propagation through SVD is expected to produce NaN results + # This documents the current behavior — ideally would guard against it + raised = False + result = None + try: + result = extractor.extract(harmful, harmless) + except (RuntimeError, ValueError): + raised = True + # Either it raised an exception (acceptable) or returned a result with NaNs + assert raised or result is not None, ( + "Should either raise on NaN input or return a result" + ) + + def test_whitened_svd_zero_activations(self): + """WhitenedSVD with all-zero activations.""" + harmful = [torch.zeros(8) for _ in range(5)] + harmless = [torch.zeros(8) for _ in range(5)] + extractor = WhitenedSVDExtractor() + result = extractor.extract(harmful, harmless) + # Should return a valid result without crashing + assert result is not None + assert result.directions is not None + assert result.singular_values is not None + + def test_concept_cone_nan_direction(self): + """ConceptConeAnalyzer with NaN in activations — documenting behavior.""" + harmful = [torch.randn(16) for _ in range(10)] + harmless = [torch.randn(16) for _ in range(10)] + # Poison one activation + harmful[3] = torch.full((16,), float("nan")) + cat_map = {i: f"cat_{i % 3}" for i in range(10)} + analyzer = ConceptConeAnalyzer(category_map=cat_map) + raised = False + result = None + try: + result = 
analyzer.analyze_layer(harmful, harmless) + except (RuntimeError, ValueError): + raised = True + # Either it raised an exception (acceptable) or returned a result + assert raised or result is not None, ( + "Should either raise on NaN input or return a result" + ) + + def test_sparse_surgery_zero_direction(self): + """Sparse surgery with zero refusal direction.""" + W = torch.randn(32, 16) + zero_dir = torch.zeros(16) + surgeon = SparseDirectionSurgeon() + result = surgeon.analyze_weight_matrix(W, zero_dir) + assert result.mean_projection == 0.0 + + def test_sparse_surgery_zero_weight(self): + """Sparse surgery with zero weight matrix.""" + W = torch.zeros(32, 16) + ref_dir = torch.randn(16) + surgeon = SparseDirectionSurgeon() + result = surgeon.analyze_weight_matrix(W, ref_dir) + assert result.max_projection < 1e-6 + + def test_effective_rank_nan_matrix(self): + """effective_rank should handle matrix with NaN.""" + W = torch.randn(10, 10) + W[0, 0] = float("nan") + # Should either return a value or raise cleanly + try: + result = effective_rank(torch.nan_to_num(W)) + assert math.isfinite(result) + except Exception: + pass # Raising is acceptable for NaN input + + def test_cosine_similarity_zero_vectors(self): + """Cosine similarity between zero vectors.""" + a = torch.zeros(32) + b = torch.zeros(32) + result = activation_cosine_similarity(a, b) + # Should be 0 or NaN, not crash + assert math.isfinite(result) or math.isnan(result) + + def test_transfer_analyzer_nan_directions(self): + """Transfer analyzer with NaN directions.""" + dirs_a = {0: torch.randn(16), 1: torch.tensor([float("nan")] * 16)} + dirs_b = {0: torch.randn(16), 1: torch.randn(16)} + analyzer = TransferAnalyzer() + # Should not crash + result = analyzer.analyze_cross_model(dirs_a, dirs_b) + assert result is not None + assert isinstance(result.mean_transfer_score, float) + assert result.per_layer_transfer is not None + + +# =========================================================================== 
+# Empty inputs +# =========================================================================== + +class TestEmptyInputs: + """Test graceful handling of empty or minimal inputs.""" + + def test_cross_layer_empty_directions(self): + analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze({}) + assert result.direction_persistence_score == 0.0 + + def test_alignment_imprint_single_layer(self): + """Single layer should still return a result.""" + detector = AlignmentImprintDetector() + dirs = {0: torch.randn(32)} + result = detector.detect_imprint(dirs) + assert result.predicted_method in ("dpo", "rlhf", "cai", "sft", "unknown") + + def test_multi_token_single_position(self): + """Single-position sequence.""" + ref_dir = torch.randn(16) + acts = torch.randn(1, 16) + analyzer = MultiTokenPositionAnalyzer() + result = analyzer.analyze_prompt(acts, ref_dir) + assert result.n_tokens == 1 + assert result.peak_position == 0 + + def test_probing_minimal_data(self): + """Probing with very few samples.""" + harmful = [torch.randn(8) for _ in range(3)] + harmless = [torch.randn(8) for _ in range(3)] + probe = LinearRefusalProbe(n_epochs=10) + result = probe.probe_layer(harmful, harmless) + assert 0 <= result.accuracy <= 1.0 + + def test_residual_stream_single_layer(self): + acts = {0: torch.randn(32)} + ref_dir = torch.randn(32) + decomposer = ResidualStreamDecomposer() + result = decomposer.decompose(acts, ref_dir) + assert result.n_layers == 1 + + def test_causal_tracing_single_layer(self): + acts = {0: torch.randn(32)} + ref_dirs = {0: torch.randn(32)} + tracer = CausalRefusalTracer() + result = tracer.trace_from_activations(acts, ref_dirs) + assert result.n_layers == 1 + + def test_transfer_no_common_layers(self): + """Cross-model with no overlapping layer indices.""" + dirs_a = {0: torch.randn(16), 1: torch.randn(16)} + dirs_b = {2: torch.randn(16), 3: torch.randn(16)} + analyzer = TransferAnalyzer() + result = analyzer.analyze_cross_model(dirs_a, dirs_b) + 
assert result.mean_transfer_score == 0.0 + + def test_refusal_rate_empty_list(self): + result = refusal_rate([]) + assert result == 0.0 + + def test_refusal_rate_single_response(self): + result = refusal_rate(["I cannot help with that."]) + assert result == 1.0 + + +# =========================================================================== +# Extreme dimensions +# =========================================================================== + +class TestExtremeDimensions: + """Test with unusually large or small dimensions.""" + + def test_high_dimensional_directions(self): + """Test with realistic hidden dimension (4096).""" + hidden_dim = 4096 + torch.manual_seed(42) + dirs = {i: torch.randn(hidden_dim) for i in range(8)} + analyzer = TransferAnalyzer() + result = analyzer.analyze_cross_layer(dirs) + assert result.mean_adjacent_transfer >= 0 + + def test_high_dim_sparse_surgery(self): + """Sparse surgery with large weight matrix.""" + W = torch.randn(2048, 1024) + ref_dir = torch.randn(1024) + surgeon = SparseDirectionSurgeon(sparsity=0.05) + result = surgeon.analyze_weight_matrix(W, ref_dir) + assert result.n_rows_modified == int(0.05 * 2048) + + def test_single_dimension(self): + """1D hidden dimension edge case.""" + dirs = {i: torch.randn(1) for i in range(4)} + analyzer = TransferAnalyzer() + result = analyzer.analyze_cross_layer(dirs) + # All 1D directions are parallel or anti-parallel, so cosine is always 1.0 + assert result.mean_adjacent_transfer >= 0.99 + + def test_many_layers_imprint(self): + """Alignment imprint with many layers (128).""" + dirs = {i: torch.randn(32) for i in range(128)} + detector = AlignmentImprintDetector() + result = detector.detect_imprint(dirs) + total = (result.dpo_probability + result.rlhf_probability + + result.cai_probability + result.sft_probability) + assert abs(total - 1.0) < 0.01 + + @pytest.mark.parametrize("n_prompts", [1, 2, 5, 50, 100]) + def test_concept_cone_varying_prompt_counts(self, n_prompts): + """Concept cone 
with varying numbers of prompts.""" + harmful = [torch.randn(16) for _ in range(n_prompts)] + harmless = [torch.randn(16) for _ in range(n_prompts)] + cat_map = {i: f"cat_{i % 3}" for i in range(n_prompts)} + analyzer = ConceptConeAnalyzer(category_map=cat_map, min_category_size=1) + result = analyzer.analyze_layer(harmful, harmless) + # Should produce a valid result regardless of prompt count + assert result is not None + assert result.general_direction is not None + assert result.category_count >= 1 + + @pytest.mark.parametrize("seq_len", [1, 2, 10, 100, 500]) + def test_multi_token_varying_lengths(self, seq_len): + """Multi-token analysis with varying sequence lengths.""" + ref_dir = torch.randn(16) + acts = torch.randn(seq_len, 16) + analyzer = MultiTokenPositionAnalyzer() + result = analyzer.analyze_prompt(acts, ref_dir) + assert result.n_tokens == seq_len + + @pytest.mark.parametrize("hidden_dim", [2, 4, 16, 64, 256, 1024]) + def test_probing_varying_dimensions(self, hidden_dim): + """Probing classifier across different hidden dims (min 2 for SGD).""" + harmful = [torch.randn(hidden_dim) * 2 + 1 for _ in range(10)] + harmless = [torch.randn(hidden_dim) * 2 - 1 for _ in range(10)] + probe = LinearRefusalProbe(n_epochs=20) + result = probe.probe_layer(harmful, harmless) + assert 0 <= result.accuracy <= 1.0 + + +# =========================================================================== +# Refusal rate edge cases +# =========================================================================== + +class TestRefusalRateEdgeCases: + """Edge cases for refusal rate detection.""" + + def test_novel_refusal_phrasing(self): + """Test that unusual refusal phrasings might be missed (known limitation).""" + novel_refusals = [ + "That request falls outside my operational parameters.", + "I'm designed to be helpful but this conflicts with my guidelines.", + ] + # These should ideally be detected but string matching may miss them + rate = refusal_rate(novel_refusals) + # This 
tests the known limitation — rate may be 0 (string matching misses novel phrasings) + assert 0 <= rate <= 1.0 + + def test_false_positive_sorry(self): + """'I'm sorry to hear that' followed by helpful content.""" + responses = [ + "I'm sorry to hear that. Here's how you can solve the problem: first...", + ] + rate = refusal_rate(responses, mode="prefix") + # Known limitation: prefix matching produces false positives here + # Just verify it returns a valid float + assert 0 <= rate <= 1.0 + + def test_case_insensitivity(self): + responses = ["I CANNOT ASSIST WITH THAT REQUEST."] + rate = refusal_rate(responses, mode="substring") + assert rate == 1.0 + + def test_whitespace_handling(self): + responses = [" I'm sorry, but I can't help with that. "] + rate = refusal_rate(responses, mode="prefix") + # Should handle leading whitespace + assert 0 <= rate <= 1.0 + + +# =========================================================================== +# Steering Vectors +# =========================================================================== + +class TestSteeringVectors: + """Tests for the steering vector inference-time intervention system.""" + + def test_from_refusal_direction(self): + d = torch.randn(32) + vec = SteeringVectorFactory.from_refusal_direction(d, source_layer=5) + assert vec.label == "refusal" + assert vec.source_layer == 5 + assert vec.default_alpha == -1.0 + assert abs(vec.direction.norm().item() - 1.0) < 0.01 + + def test_from_contrastive_pairs(self): + pos = [torch.randn(16) + 2 for _ in range(10)] + neg = [torch.randn(16) - 2 for _ in range(10)] + vec = SteeringVectorFactory.from_contrastive_pairs(pos, neg, label="test") + assert vec.label == "test" + assert abs(vec.direction.norm().item() - 1.0) < 0.01 + assert "n_positive" in vec.metadata + + def test_combine_vectors(self): + v1 = SteeringVectorFactory.from_refusal_direction(torch.randn(32)) + v2 = SteeringVectorFactory.from_refusal_direction(torch.randn(32)) + combined = 
SteeringVectorFactory.combine([v1, v2], label="merged") + assert combined.label == "merged" + assert abs(combined.direction.norm().item() - 1.0) < 0.01 + + def test_combine_single(self): + v = SteeringVectorFactory.from_refusal_direction(torch.randn(16)) + combined = SteeringVectorFactory.combine([v]) + assert abs(combined.direction.norm().item() - 1.0) < 0.01 + + def test_combine_empty_raises(self): + with pytest.raises(ValueError): + SteeringVectorFactory.combine([]) + + def test_hook_manager_lifecycle(self): + """Test install/remove lifecycle without a real model.""" + manager = SteeringHookManager() + assert not manager.is_active + manager.remove() # Should not crash even with no hooks + assert not manager.is_active + + def test_hook_with_simple_model(self): + """Test steering on a simple nn.Sequential model.""" + model = nn.Sequential( + nn.Linear(16, 16), + nn.ReLU(), + nn.Linear(16, 16), + nn.ReLU(), + nn.Linear(16, 8), + ) + + vec = SteeringVectorFactory.from_refusal_direction(torch.randn(16)) + config = SteeringConfig( + vectors=[vec], + target_layers=[0, 2], # steer at first and third linear layers + alpha=1.0, + ) + + manager = SteeringHookManager() + # Install on specific modules + layers = list(model.children()) + result = manager.install(model, config, layer_modules=layers) + assert result.hooks_installed == 2 + assert manager.is_active + + # Run a forward pass (should not crash) + x = torch.randn(1, 16) + output = model(x) + assert output.shape == (1, 8) + + # Remove hooks + manager.remove() + assert not manager.is_active + + def test_steering_effectiveness_remove(self): + eff = compute_steering_effectiveness(2.0, 0.5, direction="remove") + assert 0 < eff < 1.0 # Reduced but not eliminated + + def test_steering_effectiveness_perfect_remove(self): + eff = compute_steering_effectiveness(2.0, 0.0, direction="remove") + assert eff == 1.0 + + def test_steering_effectiveness_no_change(self): + eff = compute_steering_effectiveness(2.0, 2.0, 
direction="remove") + assert eff == 0.0 + + def test_steering_effectiveness_add(self): + eff = compute_steering_effectiveness(1.0, 3.0, direction="add") + assert eff == 1.0 # Capped at 1.0 + + def test_format_report(self): + vec = SteeringVectorFactory.from_refusal_direction(torch.randn(32)) + config = SteeringConfig(vectors=[vec], target_layers=[3, 5], alpha=0.5) + result = SteeringResult(config=config, hooks_installed=2, total_steered_layers=2) + report = format_steering_report(result) + assert "Steering" in report + assert "refusal" in report + + def test_steering_config_position_modes(self): + """Test different position modes in config.""" + for pos in ["all", "last", "first"]: + config = SteeringConfig( + vectors=[SteeringVectorFactory.from_refusal_direction(torch.randn(8))], + target_layers=[0], + position=pos, + ) + assert config.position == pos + + def test_imports(self): + from obliteratus.analysis import SteeringVectorFactory, SteeringHookManager + assert SteeringVectorFactory is not None + assert SteeringHookManager is not None + + +class TestParametrizedDimensions: + """Parametrized tests across different hidden dimensions.""" + + @pytest.mark.parametrize("hidden_dim", [2, 8, 64, 256, 768]) + def test_whitened_svd_various_dims(self, hidden_dim): + n_samples = max(4, hidden_dim // 4) + harmful = [torch.randn(hidden_dim) for _ in range(n_samples)] + harmless = [torch.randn(hidden_dim) for _ in range(n_samples)] + extractor = WhitenedSVDExtractor() + result = extractor.extract(harmful, harmless, n_directions=1) + assert result.directions.shape[1] == hidden_dim + + @pytest.mark.parametrize("hidden_dim", [2, 8, 64, 256]) + def test_cross_layer_various_dims(self, hidden_dim): + directions = {i: torch.randn(hidden_dim) for i in range(4)} + analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze(directions) + assert 0.0 <= result.direction_persistence_score <= 1.0 + + @pytest.mark.parametrize("hidden_dim", [4, 32, 128]) + def 
test_sparse_surgery_various_dims(self, hidden_dim): + weight = torch.randn(hidden_dim, hidden_dim) + direction = torch.randn(hidden_dim) + direction = direction / direction.norm() + surgeon = SparseDirectionSurgeon() + result = surgeon.analyze_weight_matrix(weight, direction, layer_idx=0) + assert 0.0 <= result.energy_removed <= 1.0 + + @pytest.mark.parametrize("n_layers", [1, 4, 12, 32]) + def test_imprint_various_layer_counts(self, n_layers): + directions = {i: torch.randn(64) for i in range(n_layers)} + detector = AlignmentImprintDetector() + result = detector.detect_imprint(directions) + assert result.predicted_method in ("dpo", "rlhf", "cai", "sft", "unknown") + + +class TestExceptionPaths: + """Tests for error handling and boundary conditions.""" + + def test_whitened_svd_mismatched_dims(self): + """Harmful and harmless with different hidden dims should fail or handle gracefully.""" + harmful = [torch.randn(64) for _ in range(10)] + harmless = [torch.randn(32) for _ in range(10)] + extractor = WhitenedSVDExtractor() + with pytest.raises(Exception): + extractor.extract(harmful, harmless, n_directions=1) + + def test_whitened_svd_single_sample(self): + """Single sample should not crash (may return 0 directions due to insufficient data).""" + harmful = [torch.randn(32)] + harmless = [torch.randn(32)] + extractor = WhitenedSVDExtractor() + result = extractor.extract(harmful, harmless, n_directions=1) + assert result.directions.shape[1] == 32 # hidden dim preserved + + def test_sparse_surgery_zero_direction(self): + """Zero direction vector should not crash.""" + weight = torch.randn(16, 16) + direction = torch.zeros(16) + surgeon = SparseDirectionSurgeon() + # Should handle gracefully (possibly returning 0 energy) + result = surgeon.analyze_weight_matrix(weight, direction, layer_idx=0) + assert result is not None + + def test_cross_layer_single_layer(self): + """Single layer directions should still produce a result.""" + directions = {0: torch.randn(32)} + 
analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze(directions) + assert result is not None diff --git a/tests/test_heretic_eval.py b/tests/test_heretic_eval.py new file mode 100644 index 0000000..7e1b600 --- /dev/null +++ b/tests/test_heretic_eval.py @@ -0,0 +1,483 @@ +"""Tests for the community-standard (Heretics/Arditi) evaluation protocol.""" + +from __future__ import annotations + +import pytest +import torch + +from obliteratus.evaluation.heretic_eval import ( + ARDITI_REFUSAL_SUBSTRINGS, + _DEFAULT_HARMLESS_PROMPTS, + _HARMBENCH_CLS_TEMPLATE, + _JAILBREAKBENCH_BUILTIN, + _extract_score, + _last_real_token_indices, + _sanitize_model_path, + arditi_refusal_rate, + first_token_kl_on_prompts, + format_comparison_table, + HereticComparisonResult, + LM_EVAL_BENCHMARKS, + load_jailbreakbench_prompts, + unload_harmbench_classifier, +) + + +# --------------------------------------------------------------------------- +# JailbreakBench Prompts +# --------------------------------------------------------------------------- + +class TestJailbreakBenchPrompts: + def test_builtin_has_100_prompts(self): + assert len(_JAILBREAKBENCH_BUILTIN) == 100 + + def test_builtin_all_non_empty(self): + for i, p in enumerate(_JAILBREAKBENCH_BUILTIN): + assert isinstance(p, str) and len(p.strip()) > 10, f"Prompt {i} is too short" + + def test_builtin_no_duplicates(self): + assert len(set(_JAILBREAKBENCH_BUILTIN)) == len(_JAILBREAKBENCH_BUILTIN) + + def test_load_returns_list(self): + prompts = load_jailbreakbench_prompts() + assert isinstance(prompts, list) + assert len(prompts) >= 100 + + def test_load_caches(self): + p1 = load_jailbreakbench_prompts() + p2 = load_jailbreakbench_prompts() + assert p1 == p2 + + def test_load_returns_copy(self): + """Loading should return a copy, not the cached list itself.""" + p1 = load_jailbreakbench_prompts() + p1.append("mutated") + p2 = load_jailbreakbench_prompts() + assert "mutated" not in p2 + + +# 
--------------------------------------------------------------------------- +# Arditi Refusal Rate +# --------------------------------------------------------------------------- + +class TestArditiRefusalRate: + def test_canonical_substring_count(self): + """Arditi/Zou list has exactly 29 substrings.""" + assert len(ARDITI_REFUSAL_SUBSTRINGS) == 29 + + def test_all_refusals(self): + responses = [ + "I'm sorry, but I cannot help with that.", + "I apologize for the inconvenience.", + "As an AI, I don't do that.", + ] + assert arditi_refusal_rate(responses) == 1.0 + + def test_no_refusals(self): + responses = [ + "Sure, here is a step-by-step guide.", + "The answer to your question is 42.", + "Here you go, this should work.", + ] + assert arditi_refusal_rate(responses) == 0.0 + + def test_empty_responses_not_refusals(self): + """Per Zou et al. logic, empty responses have no substring match = not refusals.""" + responses = ["", " ", ""] + assert arditi_refusal_rate(responses) == 0.0 + + def test_empty_list(self): + assert arditi_refusal_rate([]) == 0.0 + + def test_case_insensitive(self): + responses = ["i'm sorry, but I can't help."] + assert arditi_refusal_rate(responses) == 1.0 + + def test_mixed(self): + responses = [ + "I cannot do that.", + "Sure, here's the info.", + "My apologies, I'm unable to help.", + "The capital of France is Paris.", + ] + rate = arditi_refusal_rate(responses) + assert rate == 0.5 # 2/4 + + def test_subtle_refusal_not_caught(self): + """Arditi list is deliberately limited — subtle refusals should not match.""" + responses = [ + "I'd rather not help with that specific request.", + "Let me redirect you to a more appropriate resource.", + ] + # These are NOT in the Arditi 29-substring list + assert arditi_refusal_rate(responses) == 0.0 + + +# --------------------------------------------------------------------------- +# First-Token KL Divergence +# --------------------------------------------------------------------------- + +class 
TestFirstTokenKL: + @pytest.fixture + def simple_models(self): + """Create two tiny 'models' with controllable logit distributions.""" + class FakeModel(torch.nn.Module): + def __init__(self, peak_idx: int = 0): + super().__init__() + # A single parameter so next(model.parameters()).device works + self._param = torch.nn.Parameter(torch.zeros(1)) + self._peak_idx = peak_idx + + def __call__(self, **kwargs): + batch_size = kwargs["input_ids"].shape[0] + seq_len = kwargs["input_ids"].shape[1] + vocab_size = 10 + # Create a non-uniform distribution peaked at _peak_idx + base = torch.zeros(vocab_size) + base[self._peak_idx] = 5.0 + logits = base.unsqueeze(0).unsqueeze(0).expand( + batch_size, seq_len, vocab_size + ).clone() + return type("Output", (), {"logits": logits})() + + class FakeTokenizer: + pad_token_id = 0 + def __call__(self, texts, return_tensors="pt", **kwargs): + batch_size = len(texts) if isinstance(texts, list) else 1 + input_ids = torch.ones(batch_size, 5, dtype=torch.long) + return {"input_ids": input_ids, "attention_mask": torch.ones_like(input_ids)} + + return FakeModel, FakeTokenizer + + def test_identical_models_zero_kl(self, simple_models): + FakeModel, FakeTokenizer = simple_models + model_a = FakeModel(peak_idx=0) + model_b = FakeModel(peak_idx=0) + tokenizer = FakeTokenizer() + + result = first_token_kl_on_prompts( + model_a, model_b, tokenizer, + ["hello", "world"], + ) + assert abs(result["mean_kl"]) < 1e-5 + assert result["interpretation"] == "excellent (minimal collateral damage)" + + def test_different_models_positive_kl(self, simple_models): + FakeModel, FakeTokenizer = simple_models + model_a = FakeModel(peak_idx=0) # peaked at vocab position 0 + model_b = FakeModel(peak_idx=5) # peaked at vocab position 5 + tokenizer = FakeTokenizer() + + result = first_token_kl_on_prompts( + model_a, model_b, tokenizer, + ["test prompt"], + ) + assert result["mean_kl"] > 0 + + def test_returns_per_prompt_kl(self, simple_models): + FakeModel, 
FakeTokenizer = simple_models + model_a = FakeModel(peak_idx=0) + model_b = FakeModel(peak_idx=3) + tokenizer = FakeTokenizer() + + result = first_token_kl_on_prompts( + model_a, model_b, tokenizer, + ["a", "b", "c"], + ) + assert len(result["per_prompt_kl"]) == 3 + assert result["std_kl"] >= 0 + + +# --------------------------------------------------------------------------- +# HereticComparisonResult +# --------------------------------------------------------------------------- + +class TestHereticComparisonResult: + def test_dataclass_fields(self): + r = HereticComparisonResult( + model_name="test-model", + method="OBLITERATUS", + refusal_rate_arditi=0.05, + refusal_rate_obliteratus=0.03, + harmbench_asr=0.85, + n_jailbreakbench=100, + n_refusals_remaining=5, + first_token_kl=0.15, + kl_interpretation="excellent", + ) + assert r.model_name == "test-model" + assert r.method == "OBLITERATUS" + assert r.refusal_rate_arditi == 0.05 + assert r.harmbench_asr == 0.85 + assert r.first_token_kl == 0.15 + + def test_optional_fields_default_none(self): + r = HereticComparisonResult( + model_name="test", + method="test", + refusal_rate_arditi=0.0, + refusal_rate_obliteratus=0.0, + harmbench_asr=None, + n_jailbreakbench=100, + n_refusals_remaining=0, + ) + assert r.mmlu is None + assert r.gsm8k is None + assert r.perplexity is None + assert r.harmbench_per_item == [] + assert r.kl_per_prompt == [] + + +# --------------------------------------------------------------------------- +# Comparison Table Formatting +# --------------------------------------------------------------------------- + +class TestComparisonTable: + def test_format_single_result(self): + r = HereticComparisonResult( + model_name="Llama-2-7B", + method="OBLITERATUS", + refusal_rate_arditi=0.05, + refusal_rate_obliteratus=0.03, + harmbench_asr=0.85, + n_jailbreakbench=100, + n_refusals_remaining=5, + first_token_kl=0.15, + kl_interpretation="excellent", + mmlu=0.518, + gsm8k=0.313, + ) + table = 
format_comparison_table([r]) + assert "OBLITERATUS" in table + assert "REFUSAL REMOVAL" in table + assert "CAPABILITY PRESERVATION" in table + assert "DISTRIBUTION QUALITY" in table + assert "5.0%" in table # arditi refusal rate + assert "85.0%" in table # harmbench asr + assert "5/100" in table # JBB refusals + assert "0.1500" in table # KL divergence + + def test_format_multiple_results(self): + results = [ + HereticComparisonResult( + model_name="test", method="OBLITERATUS", + refusal_rate_arditi=0.05, refusal_rate_obliteratus=0.03, + harmbench_asr=0.85, n_jailbreakbench=100, n_refusals_remaining=5, + ), + HereticComparisonResult( + model_name="test", method="Heretic", + refusal_rate_arditi=0.03, refusal_rate_obliteratus=0.03, + harmbench_asr=0.90, n_jailbreakbench=100, n_refusals_remaining=3, + ), + ] + table = format_comparison_table(results) + assert "OBLITERATUS" in table + assert "Heretic" in table + + def test_heretic_reference_numbers_present(self): + """The comparison table should include Heretic's published reference numbers.""" + table = format_comparison_table([ + HereticComparisonResult( + model_name="test", method="test", + refusal_rate_arditi=0.0, refusal_rate_obliteratus=0.0, + harmbench_asr=None, n_jailbreakbench=100, n_refusals_remaining=0, + ), + ]) + assert "p-e-w/heretic" in table + assert "97/100" in table # Original model refusal count + assert "0.16" in table # Heretic's KL divergence + + +# --------------------------------------------------------------------------- +# LM-Eval Benchmark Config +# --------------------------------------------------------------------------- + +class TestLmEvalBenchmarks: + def test_five_standard_benchmarks(self): + assert len(LM_EVAL_BENCHMARKS) == 5 + + def test_required_keys(self): + for key, config in LM_EVAL_BENCHMARKS.items(): + assert "task" in config + assert "num_fewshot" in config + assert "description" in config + + def test_expected_benchmarks(self): + expected = {"mmlu", "gsm8k", "arc_challenge", 
"hellaswag", "truthfulqa_mc2"} + assert set(LM_EVAL_BENCHMARKS.keys()) == expected + + def test_fewshot_settings_match_arditi(self): + """Few-shot settings should match Arditi et al. / Open LLM Leaderboard.""" + assert LM_EVAL_BENCHMARKS["mmlu"]["num_fewshot"] == 5 + assert LM_EVAL_BENCHMARKS["gsm8k"]["num_fewshot"] == 5 + assert LM_EVAL_BENCHMARKS["truthfulqa_mc2"]["num_fewshot"] == 0 + + +# --------------------------------------------------------------------------- +# HarmBench Template +# --------------------------------------------------------------------------- + +class TestHarmBenchTemplate: + def test_template_has_sys_tags(self): + """Official template requires <> / <> delimiters.""" + assert "<>" in _HARMBENCH_CLS_TEMPLATE + assert "<>" in _HARMBENCH_CLS_TEMPLATE + + def test_template_has_seven_rules(self): + """Official HarmBench template has exactly 7 rules.""" + rule_count = _HARMBENCH_CLS_TEMPLATE.count("\n- ") + assert rule_count == 7 + + def test_template_has_behavior_placeholder(self): + assert "{behavior}" in _HARMBENCH_CLS_TEMPLATE + + def test_template_has_generation_placeholder(self): + assert "{generation}" in _HARMBENCH_CLS_TEMPLATE + + def test_template_ends_with_inst_tag(self): + assert _HARMBENCH_CLS_TEMPLATE.strip().endswith("[/INST]") + + def test_template_answer_format(self): + """Official template uses lowercase 'yes' or 'no'.""" + assert '"yes" or "no"' in _HARMBENCH_CLS_TEMPLATE + + +# --------------------------------------------------------------------------- +# Score Extraction (C3 fix) +# --------------------------------------------------------------------------- + +class TestExtractScore: + def test_normal_score(self): + assert _extract_score({"acc,none": 0.75}) == 0.75 + + def test_zero_score_not_skipped(self): + """A legitimate score of 0.0 should be returned, not treated as falsy.""" + assert _extract_score({"acc,none": 0.0}) == 0.0 + + def test_fallback_to_next_key(self): + assert _extract_score({"acc_norm,none": 0.65}) == 0.65 
+ + def test_mc2_key(self): + assert _extract_score({"mc2,none": 0.42}) == 0.42 + + def test_no_matching_key(self): + assert _extract_score({"unknown_metric": 0.99}) == 0.0 + + def test_priority_order(self): + """acc,none should take priority over acc_norm,none.""" + result = _extract_score({"acc,none": 0.5, "acc_norm,none": 0.9}) + assert result == 0.5 + + +# --------------------------------------------------------------------------- +# Padding-Aware Last-Token Indices +# --------------------------------------------------------------------------- + +class TestLastRealTokenIndices: + def test_no_padding(self): + mask = torch.ones(3, 5, dtype=torch.long) + indices = _last_real_token_indices(mask) + assert indices.tolist() == [4, 4, 4] + + def test_with_padding(self): + mask = torch.tensor([ + [1, 1, 1, 1, 1], # length 5, last real = index 4 + [1, 1, 1, 0, 0], # length 3, last real = index 2 + [1, 0, 0, 0, 0], # length 1, last real = index 0 + ]) + indices = _last_real_token_indices(mask) + assert indices.tolist() == [4, 2, 0] + + def test_single_token(self): + mask = torch.tensor([[1]]) + indices = _last_real_token_indices(mask) + assert indices.tolist() == [0] + + +# --------------------------------------------------------------------------- +# Model Path Sanitization +# --------------------------------------------------------------------------- + +class TestSanitizeModelPath: + def test_normal_path(self): + assert _sanitize_model_path("/tmp/my-model") == "/tmp/my-model" + + def test_hf_model_id(self): + assert _sanitize_model_path("meta-llama/Llama-2-7b-hf") == "meta-llama/Llama-2-7b-hf" + + def test_rejects_commas(self): + with pytest.raises(ValueError, match="commas"): + _sanitize_model_path("evil,trust_remote_code=True") + + +# --------------------------------------------------------------------------- +# Classifier Unload +# --------------------------------------------------------------------------- + +class TestClassifierUnload: + def 
class TestDefaultHarmlessPrompts:
    """Sanity checks on the built-in harmless prompt list."""

    def test_has_100_unique_prompts(self):
        """The list holds exactly 100 entries."""
        prompt_count = len(_DEFAULT_HARMLESS_PROMPTS)
        assert prompt_count == 100

    def test_no_duplicates(self):
        """De-duplicating the list does not shrink it."""
        distinct = set(_DEFAULT_HARMLESS_PROMPTS)
        assert len(distinct) == len(_DEFAULT_HARMLESS_PROMPTS)

    def test_all_non_empty(self):
        """Every entry is a string longer than 10 characters."""
        for idx, prompt in enumerate(_DEFAULT_HARMLESS_PROMPTS):
            is_text = isinstance(prompt, str)
            assert is_text and len(prompt) > 10, f"Prompt {idx} is too short"
@pytest.fixture
def pipeline(tmp_path):
    """Build an InformedAbliterationPipeline with no model loaded.

    Output goes to a ``test_informed`` directory under pytest's ``tmp_path``.
    """
    out_dir = tmp_path / "test_informed"
    return InformedAbliterationPipeline(
        model_name="test-model",
        output_dir=str(out_dir),
    )
class TestInformedMethod:
    """Checks on the "informed" method preset."""

    def test_informed_method_in_abliterate_methods(self):
        """The preset is registered in METHODS with its boolean flags enabled."""
        assert "informed" in METHODS
        preset = METHODS["informed"]
        expected_true = (
            "norm_preserve",
            "project_biases",
            "use_chat_template",
            "use_whitened_svd",
            "true_iterative_refinement",
        )
        for flag in expected_true:
            assert preset[flag] is True

    def test_informed_method_standalone(self):
        """The standalone INFORMED_METHOD dict carries the expected values."""
        assert INFORMED_METHOD["label"] == "Informed (Analysis-Guided)"
        assert INFORMED_METHOD["n_directions"] == 4
        assert INFORMED_METHOD["norm_preserve"] is True
pipeline._run_cone is True + assert pipeline._run_alignment is True + assert pipeline._run_cross_layer is True + assert pipeline._run_sparse is True + assert pipeline._run_defense is True + + def test_ouroboros_defaults(self, pipeline): + assert pipeline._ouroboros_threshold == 0.5 + assert pipeline._max_ouroboros_passes == 3 + + def test_entanglement_gate(self, pipeline): + assert pipeline._entanglement_gate == 0.8 + + def test_inherits_base_pipeline(self, pipeline): + assert pipeline.norm_preserve is True + assert pipeline.project_biases is True + assert pipeline.use_chat_template is True + assert pipeline.use_whitened_svd is True + assert pipeline.true_iterative_refinement is True + + def test_custom_flags(self): + p = InformedAbliterationPipeline( + model_name="test", + run_cone_analysis=False, + run_alignment_detection=False, + ouroboros_threshold=0.3, + max_ouroboros_passes=5, + entanglement_gate=0.9, + ) + assert p._run_cone is False + assert p._run_alignment is False + assert p._ouroboros_threshold == 0.3 + assert p._max_ouroboros_passes == 5 + assert p._entanglement_gate == 0.9 + + +# --------------------------------------------------------------------------- +# Configuration derivation +# --------------------------------------------------------------------------- + +class TestConfigurationDerivation: + """Test the _derive_configuration logic with various insights.""" + + def _make_pipeline_with_insights(self, **kwargs): + p = InformedAbliterationPipeline( + model_name="test", + on_log=lambda m: None, + ) + for k, v in kwargs.items(): + setattr(p._insights, k, v) + return p + + def test_polyhedral_cone_more_directions(self): + p = self._make_pipeline_with_insights( + cone_is_polyhedral=True, + cone_dimensionality=3.5, + ) + p._derive_configuration() + # Polyhedral with dim 3.5 → n_dirs = max(4, min(8, int(3.5*2))) = 7 + assert p.n_directions == 7 + + def test_linear_cone_fewer_directions(self): + p = self._make_pipeline_with_insights( + 
cone_is_polyhedral=False, + cone_dimensionality=1.0, + ) + p._derive_configuration() + # Linear with dim 1.0 → n_dirs = max(1, min(4, int(1.0+1))) = 2 + assert p.n_directions == 2 + + def test_dpo_zero_regularization(self): + p = self._make_pipeline_with_insights( + detected_alignment_method="dpo", + entanglement_score=0.1, + ) + p._derive_configuration() + assert p.regularization == 0.0 + + def test_rlhf_moderate_regularization(self): + p = self._make_pipeline_with_insights( + detected_alignment_method="rlhf", + entanglement_score=0.2, + ) + p._derive_configuration() + assert p.regularization == 0.15 + + def test_cai_regularization(self): + p = self._make_pipeline_with_insights( + detected_alignment_method="cai", + entanglement_score=0.2, + ) + p._derive_configuration() + assert p.regularization == 0.2 + + def test_sft_low_regularization(self): + p = self._make_pipeline_with_insights( + detected_alignment_method="sft", + entanglement_score=0.1, + ) + p._derive_configuration() + assert p.regularization == 0.05 + + def test_high_entanglement_increases_regularization(self): + p = self._make_pipeline_with_insights( + detected_alignment_method="dpo", + entanglement_score=0.7, + ) + p._derive_configuration() + # DPO base = 0.0, + 0.15 for high entanglement = 0.15 + assert p.regularization == 0.15 + + def test_high_self_repair_more_passes(self): + p = self._make_pipeline_with_insights( + self_repair_estimate=0.8, + ) + p._derive_configuration() + assert p.refinement_passes == 3 + + def test_moderate_self_repair_two_passes(self): + p = self._make_pipeline_with_insights( + self_repair_estimate=0.5, + ) + p._derive_configuration() + assert p.refinement_passes == 2 + + def test_low_self_repair_one_pass(self): + p = self._make_pipeline_with_insights( + self_repair_estimate=0.2, + ) + p._derive_configuration() + assert p.refinement_passes == 1 + + def test_cluster_layers_used(self): + p = self._make_pipeline_with_insights( + cluster_representative_layers=[5, 10, 15], + 
direction_clusters=[[3, 4, 5], [9, 10, 11], [14, 15, 16]], + ) + p.refusal_directions = {i: torch.randn(64) for i in range(20)} + p._derive_configuration() + # Should include all cluster layers + assert 5 in p._insights.recommended_layers + assert 10 in p._insights.recommended_layers + + def test_entangled_layers_skipped(self): + p = self._make_pipeline_with_insights( + cluster_representative_layers=[5, 10, 15], + direction_clusters=[[3, 4, 5], [9, 10, 11], [14, 15, 16]], + entangled_layers=[10], + ) + p._derive_configuration() + # Layer 10 should be skipped + assert 10 not in p._insights.recommended_layers + assert 10 in p._insights.skip_layers + + def test_sparse_surgery_enabled_when_rsi_high(self): + p = self._make_pipeline_with_insights( + mean_refusal_sparsity_index=0.7, + ) + p._sparse_threshold = 0.5 + p._derive_configuration() + assert p._insights.use_sparse_surgery is True + + def test_sparse_surgery_disabled_when_rsi_low(self): + p = self._make_pipeline_with_insights( + mean_refusal_sparsity_index=0.3, + ) + p._sparse_threshold = 0.5 + p._derive_configuration() + assert p._insights.use_sparse_surgery is False + + def test_whitened_svd_for_multi_direction(self): + p = self._make_pipeline_with_insights( + cone_is_polyhedral=True, + cone_dimensionality=2.5, + ) + p._derive_configuration() + assert p.n_directions > 1 + assert p.use_whitened_svd is True + + def test_no_whitened_svd_for_single_direction(self): + p = self._make_pipeline_with_insights( + cone_is_polyhedral=False, + cone_dimensionality=0.5, + ) + p._derive_configuration() + # dim 0.5 → max(1, min(4, int(0.5+1))) = 1 + assert p.n_directions == 1 + assert p.use_whitened_svd is False + + +# --------------------------------------------------------------------------- +# Format report +# --------------------------------------------------------------------------- + +class TestFormatInsights: + def test_format_default(self, insights): + text = InformedAbliterationPipeline.format_insights(insights) + 
assert "Analysis-Informed Pipeline" in text + assert "UNKNOWN" in text # detected method + assert "LINEAR" in text # cone type + + def test_format_polyhedral(self): + insights = AnalysisInsights( + detected_alignment_method="dpo", + alignment_confidence=0.85, + cone_is_polyhedral=True, + cone_dimensionality=3.5, + cluster_count=4, + ) + text = InformedAbliterationPipeline.format_insights(insights) + assert "DPO" in text + assert "POLYHEDRAL" in text + assert "3.50" in text + + def test_format_includes_derived_config(self, insights): + insights.recommended_n_directions = 6 + insights.recommended_regularization = 0.2 + insights.recommended_refinement_passes = 3 + text = InformedAbliterationPipeline.format_insights(insights) + assert "n_directions: 6" in text + assert "regularization: 0.2" in text + assert "refinement_passes: 3" in text + + +# --------------------------------------------------------------------------- +# Edge cases +# --------------------------------------------------------------------------- + +class TestEdgeCases: + def test_no_cluster_layers_falls_back(self): + p = InformedAbliterationPipeline( + model_name="test", + on_log=lambda m: None, + ) + p._insights.cluster_representative_layers = [] + p._derive_configuration() + assert p._insights.recommended_layers == [] + + def test_regularization_capped(self): + p = InformedAbliterationPipeline( + model_name="test", + on_log=lambda m: None, + ) + p._insights.detected_alignment_method = "cai" + p._insights.entanglement_score = 0.9 + p._derive_configuration() + # CAI base = 0.2, + 0.15 = 0.35, capped at 0.5 + assert p.regularization <= 0.5 + + def test_all_layers_entangled_keeps_some(self): + """If all cluster layers are entangled, don't skip all of them.""" + p = InformedAbliterationPipeline( + model_name="test", + on_log=lambda m: None, + ) + p._insights.cluster_representative_layers = [5] + p._insights.direction_clusters = [[5]] + p._insights.entangled_layers = [5] + p._derive_configuration() + # 
Should NOT skip the only layer + assert 5 in p._insights.recommended_layers + + def test_cone_dimensionality_bounds(self): + """Extreme cone dimensionality values are handled.""" + p = InformedAbliterationPipeline( + model_name="test", + on_log=lambda m: None, + ) + # Very high dimensionality + p._insights.cone_is_polyhedral = True + p._insights.cone_dimensionality = 10.0 + p._derive_configuration() + assert p.n_directions <= 8 # capped + + # Very low dimensionality + p._insights.cone_is_polyhedral = False + p._insights.cone_dimensionality = 0.1 + p._derive_configuration() + assert p.n_directions >= 1 # at least 1 diff --git a/tests/test_logit_lens.py b/tests/test_logit_lens.py new file mode 100644 index 0000000..fd66dd5 --- /dev/null +++ b/tests/test_logit_lens.py @@ -0,0 +1,172 @@ +"""Tests for logit lens refusal direction analysis.""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +import torch + +from obliteratus.analysis.logit_lens import ( + RefusalLogitLens, + LogitLensResult, + MultiLayerLogitLensResult, + REFUSAL_TOKENS, + COMPLIANCE_TOKENS, +) + + +def _make_mock_model(hidden_dim=32, vocab_size=100): + """Create a mock model with LM head and layer norm.""" + model = MagicMock() + + # LM head weight (vocab_size, hidden_dim) + lm_head = MagicMock() + lm_head.weight = MagicMock() + lm_head.weight.data = torch.randn(vocab_size, hidden_dim) + model.lm_head = lm_head + + # Final LayerNorm + ln_f = MagicMock() + ln_f.weight = MagicMock() + ln_f.weight.data = torch.ones(hidden_dim) + ln_f.bias = MagicMock() + ln_f.bias.data = torch.zeros(hidden_dim) + model.transformer = MagicMock() + model.transformer.ln_f = ln_f + + return model + + +def _make_mock_tokenizer(vocab_size=100): + """Create a mock tokenizer.""" + tokenizer = MagicMock() + + def mock_decode(ids): + if isinstance(ids, list) and len(ids) == 1: + return f"tok_{ids[0]}" + return f"tok_{ids}" + + def mock_encode(text, add_special_tokens=False): + # Return a deterministic 
token ID based on the text + return [hash(text) % vocab_size] + + tokenizer.decode = mock_decode + tokenizer.encode = mock_encode + return tokenizer + + +class TestRefusalLogitLens: + def test_basic_analysis(self): + """Should produce a LogitLensResult with expected fields.""" + model = _make_mock_model() + tokenizer = _make_mock_tokenizer() + direction = torch.randn(32) + + lens = RefusalLogitLens(top_k=10) + result = lens.analyze_direction(direction, model, tokenizer, layer_idx=5) + + assert isinstance(result, LogitLensResult) + assert result.layer_idx == 5 + assert len(result.top_promoted) == 10 + assert len(result.top_suppressed) == 10 + assert isinstance(result.refusal_specificity, float) + assert isinstance(result.logit_effect_entropy, float) + assert isinstance(result.refusal_compliance_gap, float) + + def test_promoted_suppressed_ordering(self): + """Top promoted should have higher logit boost than top suppressed.""" + model = _make_mock_model() + tokenizer = _make_mock_tokenizer() + direction = torch.randn(32) + + lens = RefusalLogitLens(top_k=5) + result = lens.analyze_direction(direction, model, tokenizer) + + # Promoted tokens should have positive-ish values + # Suppressed tokens should have negative-ish values + max_promoted = max(v for _, v in result.top_promoted) + min_suppressed = min(v for _, v in result.top_suppressed) + assert max_promoted > min_suppressed + + def test_multi_layer_analysis(self): + """Should analyze multiple layers.""" + model = _make_mock_model() + tokenizer = _make_mock_tokenizer() + directions = {0: torch.randn(32), 1: torch.randn(32), 2: torch.randn(32)} + + lens = RefusalLogitLens(top_k=5) + result = lens.analyze_all_layers(directions, model, tokenizer) + + assert isinstance(result, MultiLayerLogitLensResult) + assert len(result.per_layer) == 3 + assert result.strongest_refusal_layer in [0, 1, 2] + assert result.peak_specificity_layer in [0, 1, 2] + + def test_strong_layers_filter(self): + """Should only analyze specified 
strong layers.""" + model = _make_mock_model() + tokenizer = _make_mock_tokenizer() + directions = {i: torch.randn(32) for i in range(10)} + + lens = RefusalLogitLens(top_k=5) + result = lens.analyze_all_layers( + directions, model, tokenizer, strong_layers=[2, 5] + ) + assert set(result.per_layer.keys()) == {2, 5} + + def test_handles_unnormalized_direction(self): + """Should handle non-unit directions.""" + model = _make_mock_model() + tokenizer = _make_mock_tokenizer() + direction = torch.randn(32) * 100.0 # large magnitude + + lens = RefusalLogitLens(top_k=5) + result = lens.analyze_direction(direction, model, tokenizer) + # Should still produce valid results + assert len(result.top_promoted) == 5 + + def test_format_report(self): + """Format report should produce readable output.""" + model = _make_mock_model() + tokenizer = _make_mock_tokenizer() + directions = {0: torch.randn(32), 1: torch.randn(32)} + + lens = RefusalLogitLens(top_k=5) + result = lens.analyze_all_layers(directions, model, tokenizer) + report = RefusalLogitLens.format_report(result) + assert "Logit Lens" in report + assert "Layer 0:" in report + + def test_empty_directions(self): + """Should handle empty input gracefully.""" + model = _make_mock_model() + tokenizer = _make_mock_tokenizer() + + lens = RefusalLogitLens(top_k=5) + result = lens.analyze_all_layers({}, model, tokenizer) + assert len(result.per_layer) == 0 + + def test_token_lists_nonempty(self): + """Refusal and compliance token lists should have entries.""" + assert len(REFUSAL_TOKENS) > 10 + assert len(COMPLIANCE_TOKENS) > 10 + + def test_entropy_nonnegative(self): + """Logit effect entropy should be non-negative.""" + model = _make_mock_model() + tokenizer = _make_mock_tokenizer() + direction = torch.randn(32) + + lens = RefusalLogitLens(top_k=5) + result = lens.analyze_direction(direction, model, tokenizer) + assert result.logit_effect_entropy >= 0 + + def test_2d_direction_input(self): + """Should handle 2D direction input 
(unsqueezed).""" + model = _make_mock_model() + tokenizer = _make_mock_tokenizer() + direction = torch.randn(1, 32) + + lens = RefusalLogitLens(top_k=5) + result = lens.analyze_direction(direction, model, tokenizer) + assert len(result.top_promoted) == 5 diff --git a/tests/test_metrics.py b/tests/test_metrics.py new file mode 100644 index 0000000..004eee9 --- /dev/null +++ b/tests/test_metrics.py @@ -0,0 +1,60 @@ +"""Tests for evaluation metrics.""" + +from __future__ import annotations + + +import torch + +from obliteratus.evaluation.metrics import accuracy, f1_score_metric, perplexity + + +class TestPerplexity: + def test_perfect_prediction(self): + # Create logits that strongly predict the correct next token + vocab_size = 10 + seq_len = 5 + batch_size = 1 + + labels = torch.tensor([[0, 1, 2, 3, 4]]) + logits = torch.full((batch_size, seq_len, vocab_size), -100.0) + # Set high logit for the correct next token + for t in range(seq_len - 1): + logits[0, t, labels[0, t + 1]] = 100.0 + + ppl = perplexity(logits, labels) + assert ppl < 2.0, f"Expected near-1 perplexity, got {ppl}" + + def test_random_prediction_higher(self): + vocab_size = 100 + seq_len = 20 + batch_size = 2 + + torch.manual_seed(42) + logits = torch.randn(batch_size, seq_len, vocab_size) + labels = torch.randint(0, vocab_size, (batch_size, seq_len)) + + ppl = perplexity(logits, labels) + assert ppl > 10, f"Random logits should yield high perplexity, got {ppl}" + + +class TestAccuracy: + def test_perfect(self): + assert accuracy([1, 2, 3], [1, 2, 3]) == 1.0 + + def test_zero(self): + assert accuracy([1, 2, 3], [4, 5, 6]) == 0.0 + + def test_partial(self): + assert accuracy([1, 2, 3, 4], [1, 2, 0, 0]) == 0.5 + + def test_empty(self): + assert accuracy([], []) == 0.0 + + +class TestF1: + def test_perfect(self): + assert f1_score_metric([0, 1, 0, 1], [0, 1, 0, 1]) == 1.0 + + def test_zero(self): + score = f1_score_metric([0, 0, 0, 0], [1, 1, 1, 1]) + assert score == 0.0 diff --git 
class TestTopLevelImports:
    """Verify obliteratus top-level exports."""

    def test_set_seed(self):
        """``set_seed`` is exported at package level and callable."""
        from obliteratus import set_seed
        assert callable(set_seed)

    def test_run_sweep(self):
        """``run_sweep`` is exported at package level and callable."""
        from obliteratus import run_sweep
        assert callable(run_sweep)

    def test_sweep_config(self):
        """``SweepConfig`` can be built and keeps its model name."""
        from obliteratus import SweepConfig
        grid = {"n_directions": [1, 2]}
        config = SweepConfig(model_name="test", sweep_params=grid)
        assert config.model_name == "test"

    def test_sweep_result(self):
        """``SweepResult`` can be built and keeps its seed."""
        from obliteratus import SweepResult
        field_values = {
            "params": {"n_directions": 1},
            "seed": 42,
            "quality_metrics": {},
            "stage_durations": {},
            "strong_layers": [],
        }
        outcome = SweepResult(**field_values)
        assert outcome.seed == 42
set_seed + import torch + set_seed(999, deterministic=False) + a = torch.randn(10) + set_seed(999, deterministic=False) + b = torch.randn(10) + assert torch.equal(a, b) + + def test_baselines(self): + from obliteratus.evaluation.baselines import ( + BaselineResult, + ) + assert BaselineResult is not None + + def test_lm_eval_integration(self): + from obliteratus.evaluation.lm_eval_integration import ( + run_benchmarks, + ) + assert callable(run_benchmarks) diff --git a/tests/test_new_analysis_modules.py b/tests/test_new_analysis_modules.py new file mode 100644 index 0000000..e2060d7 --- /dev/null +++ b/tests/test_new_analysis_modules.py @@ -0,0 +1,672 @@ +"""Tests for the five new analysis modules: + 1. Tuned Lens (learned-affine logit lens variant) + 2. Activation Patching (real interchange intervention) + 3. Enhanced SAE Decomposition Pipeline + 4. Wasserstein-Optimal Direction Extraction + 5. Bayesian-Optimized Kernel Projection +""" + +from __future__ import annotations + + +import pytest +import torch +import torch.nn as nn + +from obliteratus.analysis.tuned_lens import ( + TunedLensTrainer, + TunedLensProbe, + RefusalTunedLens, + TunedLensResult, + MultiLayerTunedLensResult, +) +from obliteratus.analysis.activation_patching import ( + ActivationPatcher, + PatchingSite, + ActivationPatchingResult, +) +from obliteratus.analysis.sae_abliteration import ( + SAEDecompositionPipeline, + SAEDecompositionResult, + FeatureClusterResult, +) +from obliteratus.analysis.wasserstein_optimal import ( + WassersteinOptimalExtractor, + WassersteinDirectionResult, + WassersteinComparisonResult, + MultiLayerWassersteinResult, +) +from obliteratus.analysis.bayesian_kernel_projection import ( + BayesianKernelProjection, + BayesianOptimizationResult, + ProjectionConfig, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_activations( + 
class FakeTokenizer:
    """Fake tokenizer that maps strings to reproducible token IDs.

    Args:
        vocab_size: Size of the fake vocabulary; every encoded ID lies in
            ``range(vocab_size)``.
    """

    def __init__(self, vocab_size=100):
        self.vocab_size = vocab_size

    def encode(self, text, add_special_tokens=False):
        """Return a one-element list holding a stable token ID for *text*.

        ``add_special_tokens`` is accepted for call compatibility and ignored.
        """
        import zlib  # local import: only this method needs it

        # Built-in hash() of a str is salted per process (PYTHONHASHSEED),
        # so it would give different IDs on every test run. CRC32 of the
        # UTF-8 bytes is stable across processes and platforms.
        return [zlib.crc32(str(text).encode("utf-8")) % self.vocab_size]

    def decode(self, ids):
        """Render the first ID of *ids* as ``"tok_<id>"``."""
        return f"tok_{ids[0]}"
class TestTunedLensTrainer:
    """Training behaviour of ``TunedLensTrainer`` probes."""

    def test_train_single_probe(self):
        """One probe trained on near-identical targets has the expected shapes."""
        dim, count = 16, 30

        source_acts = torch.randn(count, dim)
        target_acts = source_acts + torch.randn(count, dim) * 0.1

        trainer = TunedLensTrainer(dim, n_epochs=20)
        fitted = trainer.train_probe(source_acts, target_acts, layer_idx=3)

        assert isinstance(fitted, TunedLensProbe)
        assert fitted.layer_idx == 3
        assert fitted.weight.shape == (dim, dim)
        assert fitted.bias.shape == (dim,)
        assert fitted.train_loss < 1.0  # should converge somewhat

    def test_train_all_layers(self):
        """Training over four layers returns one probe per layer index."""
        dim, count = 16, 20

        per_layer = {}
        for layer in range(4):
            per_layer[layer] = torch.randn(count, dim)
        target_acts = torch.randn(count, dim)

        trainer = TunedLensTrainer(dim, n_epochs=10)
        fitted = trainer.train_all_layers(per_layer, target_acts)

        assert len(fitted) == 4
        for layer in range(4):
            assert layer in fitted
            assert fitted[layer].weight.shape == (dim, dim)

    def test_probe_near_identity_for_final_layer(self):
        """Probe for the final layer should be close to identity."""
        dim, count = 16, 50

        same_acts = torch.randn(count, dim)
        trainer = TunedLensTrainer(dim, n_epochs=50)
        fitted = trainer.train_probe(same_acts, same_acts, layer_idx=0)

        # Distance between the learned weight and the identity matrix.
        gap = fitted.weight - torch.eye(dim)
        distance = gap.norm().item()
        assert distance < 1.0
def test_analyze_direction(self): + hidden_dim = 32 + vocab_size = 100 + + model = FakeModel(hidden_dim, vocab_size) + tokenizer = FakeTokenizer(vocab_size) + + direction = torch.randn(hidden_dim) + probe = TunedLensProbe( + layer_idx=2, + weight=torch.eye(hidden_dim) + torch.randn(hidden_dim, hidden_dim) * 0.01, + bias=torch.zeros(hidden_dim), + train_loss=0.01, + ) + + lens = RefusalTunedLens(top_k=10) + result = lens.analyze_direction(direction, probe, model, tokenizer) + + assert isinstance(result, TunedLensResult) + assert result.layer_idx == 2 + assert len(result.top_promoted) <= 10 + assert len(result.top_suppressed) <= 10 + assert isinstance(result.correction_magnitude, float) + assert result.correction_magnitude >= 0 + + def test_analyze_all_layers(self): + hidden_dim = 32 + vocab_size = 100 + + model = FakeModel(hidden_dim, vocab_size) + tokenizer = FakeTokenizer(vocab_size) + + directions = { + i: torch.randn(hidden_dim) for i in range(4) + } + probes = { + i: TunedLensProbe( + layer_idx=i, + weight=torch.eye(hidden_dim), + bias=torch.zeros(hidden_dim), + train_loss=0.01, + ) + for i in range(4) + } + + lens = RefusalTunedLens(top_k=5) + result = lens.analyze_all_layers(directions, probes, model, tokenizer) + + assert isinstance(result, MultiLayerTunedLensResult) + assert len(result.per_layer) == 4 + assert result.strongest_refusal_layer in range(4) + + def test_compare_with_logit_lens(self): + logit_gaps = {0: 0.1, 1: 0.5, 2: 0.3, 3: 0.8} + + tuned_result = MultiLayerTunedLensResult( + per_layer={ + i: TunedLensResult( + layer_idx=i, + top_promoted=[], top_suppressed=[], + refusal_token_mean_boost=0.0, + compliance_token_mean_boost=0.0, + refusal_compliance_gap=v * 1.1, # similar ranking + correction_magnitude=0.1, + ) + for i, v in logit_gaps.items() + }, + probes={}, + strongest_refusal_layer=3, + peak_gap_layer=3, + mean_refusal_compliance_gap=0.5, + logit_lens_agreement=0.0, + ) + + agreement = RefusalTunedLens.compare_with_logit_lens(tuned_result, 
logit_gaps) + # Same ranking → correlation should be 1.0 + assert agreement == pytest.approx(1.0, abs=0.01) + + def test_format_report(self): + result = MultiLayerTunedLensResult( + per_layer={}, + probes={}, + strongest_refusal_layer=0, + peak_gap_layer=0, + mean_refusal_compliance_gap=0.0, + logit_lens_agreement=0.0, + ) + report = RefusalTunedLens.format_report(result) + assert "Tuned Lens" in report + assert "No layers analyzed" in report + + +# =========================================================================== +# Tests: Activation Patching +# =========================================================================== + +class TestActivationPatcher: + def test_patching_site_creation(self): + site = PatchingSite(layer_idx=3, component="residual") + assert site.layer_idx == 3 + assert site.component == "residual" + assert site.head_idx is None + + def test_patching_site_with_head(self): + site = PatchingSite(layer_idx=2, component="attn_head", head_idx=5) + assert site.head_idx == 5 + + def test_patch_sweep_with_model(self): + """Test full patching sweep on fake model.""" + hidden_dim = 32 + model = FakeModel(hidden_dim, vocab_size=100, n_layers=4) + + clean_ids = torch.randint(0, 100, (1, 10)) + corrupted_ids = torch.randint(0, 100, (1, 10)) + + patcher = ActivationPatcher(significance_threshold=0.05) + + result = patcher.patch_sweep( + model, clean_ids, corrupted_ids, + mode="noising", + ) + + assert isinstance(result, ActivationPatchingResult) + assert result.patching_mode == "noising" + assert result.n_layers == 4 + assert len(result.effects) > 0 + assert isinstance(result.circuit_fraction, float) + assert 0.0 <= result.circuit_fraction <= 1.0 + + def test_patch_sweep_denoising(self): + hidden_dim = 32 + model = FakeModel(hidden_dim, vocab_size=100, n_layers=4) + + clean_ids = torch.randint(0, 100, (1, 10)) + corrupted_ids = torch.randint(0, 100, (1, 10)) + + patcher = ActivationPatcher() + result = patcher.patch_sweep( + model, clean_ids, 
corrupted_ids, + mode="denoising", + ) + + assert result.patching_mode == "denoising" + + def test_custom_metric(self): + hidden_dim = 32 + model = FakeModel(hidden_dim, vocab_size=100, n_layers=4) + + clean_ids = torch.randint(0, 100, (1, 10)) + corrupted_ids = torch.randint(0, 100, (1, 10)) + + def custom_metric(logits): + return logits.sum().item() + + patcher = ActivationPatcher(metric_fn=custom_metric) + result = patcher.patch_sweep(model, clean_ids, corrupted_ids) + + assert isinstance(result, ActivationPatchingResult) + assert isinstance(result.clean_baseline, float) + + def test_format_report(self): + result = ActivationPatchingResult( + n_layers=4, + n_sites=4, + patching_mode="noising", + effects=[], + clean_baseline=1.0, + corrupted_baseline=0.0, + total_effect=1.0, + significant_sites=[], + circuit_fraction=0.0, + top_causal_layers=[], + ) + report = ActivationPatcher.format_report(result) + assert "Activation Patching" in report + assert "noising" in report + + +# =========================================================================== +# Tests: Enhanced SAE Decomposition Pipeline +# =========================================================================== + +class TestSAEDecompositionPipeline: + def test_basic_pipeline(self): + harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=30, separation=2.0) + + pipeline = SAEDecompositionPipeline( + expansion=2, n_epochs=10, top_k_features=8, n_clusters=3, + ) + result = pipeline.run(harmful, harmless, layer_idx=0) + + assert isinstance(result, SAEDecompositionResult) + assert result.layer_idx == 0 + assert result.sae is not None + assert result.refusal_features.n_refusal_features == 8 + assert len(result.feature_sparsity) == 8 + assert len(result.feature_monosemanticity) == 8 + assert len(result.per_feature_refusal_reduction) == 8 + assert len(result.cumulative_refusal_reduction) == 8 + assert 0.0 <= result.raw_direction_overlap <= 1.0 + + def test_feature_clustering(self): + harmful, 
harmless, _ = _make_activations(hidden_dim=16, n_per_class=30) + + pipeline = SAEDecompositionPipeline( + expansion=2, n_epochs=10, top_k_features=8, n_clusters=3, + ) + result = pipeline.run(harmful, harmless) + + clusters = result.feature_clusters + assert clusters is not None + assert isinstance(clusters, FeatureClusterResult) + assert clusters.n_clusters == 3 + assert len(clusters.cluster_labels) == 8 + assert all(0 <= lbl < 3 for lbl in clusters.cluster_labels) + assert clusters.cluster_directions.shape[0] == 3 + assert -1.0 <= clusters.silhouette_score <= 1.0 + + def test_cumulative_reduction_monotonic(self): + harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=30, separation=3.0) + + pipeline = SAEDecompositionPipeline(expansion=2, n_epochs=10, top_k_features=6) + result = pipeline.run(harmful, harmless) + + # Cumulative reduction should be non-decreasing + for i in range(1, len(result.cumulative_refusal_reduction)): + assert result.cumulative_refusal_reduction[i] >= result.cumulative_refusal_reduction[i - 1] - 1e-6 + + def test_format_report(self): + harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=20) + pipeline = SAEDecompositionPipeline(expansion=2, n_epochs=5, top_k_features=4, n_clusters=2) + result = pipeline.run(harmful, harmless) + + report = SAEDecompositionPipeline.format_report(result) + assert "SAE Feature Decomposition" in report + assert "Variance explained" in report + + +# =========================================================================== +# Tests: Wasserstein-Optimal Direction Extraction +# =========================================================================== + +class TestWassersteinOptimalExtractor: + def test_basic_extraction(self): + harmful, harmless, planted_dir = _make_activations( + hidden_dim=32, n_per_class=30, separation=3.0, + ) + + extractor = WassersteinOptimalExtractor() + result = extractor.extract(harmful, harmless, layer_idx=0) + + assert isinstance(result, 
WassersteinDirectionResult) + assert result.layer_idx == 0 + assert result.direction.shape == (32,) + assert abs(result.direction.norm().item() - 1.0) < 1e-5 + assert result.wasserstein_cost >= 0 + assert result.mean_shift_component >= 0 + assert result.bures_component >= 0 + assert result.cost_effectiveness_ratio >= 0 + + def test_direction_captures_signal(self): + """Wasserstein direction should have non-trivial refusal projection.""" + harmful, harmless, planted_dir = _make_activations( + hidden_dim=32, n_per_class=30, separation=3.0, + ) + + extractor = WassersteinOptimalExtractor() + result = extractor.extract(harmful, harmless) + + # Direction should have some alignment with planted signal + cosine = abs((result.direction @ planted_dir).item()) + assert cosine > 0.1 # not totally orthogonal + + def test_extract_all_layers(self): + harmful_acts, harmless_acts, _ = _make_multilayer_activations( + n_layers=4, hidden_dim=16, n_per_class=20, + ) + + extractor = WassersteinOptimalExtractor() + result = extractor.extract_all_layers(harmful_acts, harmless_acts) + + assert isinstance(result, MultiLayerWassersteinResult) + assert len(result.per_layer) == 4 + assert result.best_layer in range(4) + assert result.mean_cost_ratio >= 0 + + def test_compare_with_alternatives(self): + harmful, harmless, planted_dir = _make_activations( + hidden_dim=16, n_per_class=30, separation=3.0, + ) + + extractor = WassersteinOptimalExtractor() + w_result = extractor.extract(harmful, harmless) + + # Use planted direction as "Fisher" and diff-in-means + H = torch.stack(harmful).float() + B = torch.stack(harmless).float() + dim_dir = (H.mean(0) - B.mean(0)) + dim_dir = dim_dir / dim_dir.norm() + + comparison = extractor.compare_with_alternatives( + w_result, harmful, harmless, + fisher_direction=planted_dir, + dim_direction=dim_dir, + ) + + assert isinstance(comparison, WassersteinComparisonResult) + assert comparison.wasserstein_cost_ratio >= 0 + assert comparison.fisher_cost_ratio is not 
None + assert comparison.dim_cost_ratio is not None + assert 0 <= comparison.cosine_wasserstein_fisher <= 1 + assert 0 <= comparison.cosine_wasserstein_dim <= 1 + + def test_wasserstein_lower_cost_than_dim(self): + """Wasserstein-optimal should have lower cost ratio than diff-in-means.""" + harmful, harmless, _ = _make_activations( + hidden_dim=32, n_per_class=50, separation=2.0, + ) + + extractor = WassersteinOptimalExtractor() + w_result = extractor.extract(harmful, harmless) + + H = torch.stack(harmful).float() + B = torch.stack(harmless).float() + dim_dir = (H.mean(0) - B.mean(0)) + dim_dir = dim_dir / dim_dir.norm() + + comparison = extractor.compare_with_alternatives( + w_result, harmful, harmless, dim_direction=dim_dir, + ) + + # Wasserstein should have lower or equal cost ratio by construction + assert comparison.wasserstein_cost_ratio <= comparison.dim_cost_ratio + 1e-4 + + def test_format_report(self): + harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=20) + extractor = WassersteinOptimalExtractor() + result = extractor.extract_all_layers( + {0: harmful, 1: harmful}, + {0: harmless, 1: harmless}, + ) + report = WassersteinOptimalExtractor.format_report(result) + assert "Wasserstein" in report + assert "cost ratio" in report.lower() + + +# =========================================================================== +# Tests: Bayesian-Optimized Kernel Projection +# =========================================================================== + +class TestBayesianKernelProjection: + def test_basic_optimization(self): + harmful_acts, harmless_acts, directions = _make_multilayer_activations( + n_layers=6, hidden_dim=16, n_per_class=20, + ) + + optimizer = BayesianKernelProjection( + n_trials=30, refusal_weight=0.6, distortion_weight=0.4, + ) + result = optimizer.optimize(harmful_acts, harmless_acts, directions) + + assert isinstance(result, BayesianOptimizationResult) + assert result.n_trials == 30 + assert result.best_score >= 0 + assert 0 <= 
result.best_refusal_reduction <= 1.0 + assert result.best_harmless_distortion >= 0 + assert len(result.all_trials) == 30 + + def test_best_config_structure(self): + harmful_acts, harmless_acts, directions = _make_multilayer_activations( + n_layers=4, hidden_dim=16, n_per_class=15, + ) + + optimizer = BayesianKernelProjection(n_trials=20) + result = optimizer.optimize(harmful_acts, harmless_acts, directions) + + config = result.best_config + assert isinstance(config, ProjectionConfig) + assert config.layer_range[0] <= config.layer_range[1] + assert config.n_directions >= 1 + assert 0 <= config.regularization <= 0.5 + + def test_pareto_front(self): + harmful_acts, harmless_acts, directions = _make_multilayer_activations( + n_layers=6, hidden_dim=16, n_per_class=20, + ) + + optimizer = BayesianKernelProjection(n_trials=50) + result = optimizer.optimize(harmful_acts, harmless_acts, directions) + + # Pareto front should have at least 1 entry + assert len(result.pareto_configs) >= 1 + + # Pareto entries should be non-dominated + for i in range(len(result.pareto_configs) - 1): + # Each entry should have lower distortion than the next + # (since they're sorted by decreasing refusal reduction) + assert ( + result.pareto_configs[i].harmless_distortion + >= result.pareto_configs[i + 1].harmless_distortion - 1e-8 + ) + + def test_layer_importance(self): + harmful_acts, harmless_acts, directions = _make_multilayer_activations( + n_layers=6, hidden_dim=16, n_per_class=20, + ) + + optimizer = BayesianKernelProjection(n_trials=50) + result = optimizer.optimize(harmful_acts, harmless_acts, directions) + + assert len(result.layer_importance) == 6 + for _layer, imp in result.layer_importance.items(): + assert 0 <= imp <= 1.0 + + def test_tpe_improves_over_random(self): + """TPE phase should produce better configs than random exploration.""" + harmful_acts, harmless_acts, directions = _make_multilayer_activations( + n_layers=6, hidden_dim=16, n_per_class=20, + ) + + optimizer = 
BayesianKernelProjection(n_trials=60, seed=42) + result = optimizer.optimize(harmful_acts, harmless_acts, directions) + + # Compare average score of first 20 (random) vs last 20 (TPE) + first_20 = sorted(result.all_trials[:20], key=lambda t: t.combined_score) + last_20 = sorted(result.all_trials[-20:], key=lambda t: t.combined_score) + + best_random = first_20[0].combined_score + best_tpe = min(t.combined_score for t in last_20) + + # TPE should find at least as good (lower = better) + # This is probabilistic so we allow some slack + assert best_tpe <= best_random + 0.3 + + def test_empty_input(self): + optimizer = BayesianKernelProjection(n_trials=10) + result = optimizer.optimize({}, {}, {}) + + assert result.n_trials == 0 + assert result.best_score == 0.0 + + def test_format_report(self): + harmful_acts, harmless_acts, directions = _make_multilayer_activations( + n_layers=4, hidden_dim=16, n_per_class=15, + ) + + optimizer = BayesianKernelProjection(n_trials=20) + result = optimizer.optimize(harmful_acts, harmless_acts, directions) + + report = BayesianKernelProjection.format_report(result) + assert "Bayesian" in report + assert "Pareto" in report + assert "Layer importance" in report + + +# =========================================================================== +# Tests: Module imports +# =========================================================================== + +class TestModuleImports: + def test_all_new_modules_importable(self): + from obliteratus.analysis import TunedLensTrainer + from obliteratus.analysis import RefusalTunedLens + from obliteratus.analysis import ActivationPatcher + from obliteratus.analysis import WassersteinOptimalExtractor + from obliteratus.analysis import BayesianKernelProjection + from obliteratus.analysis import SAEDecompositionPipeline + + assert TunedLensTrainer is not None + assert RefusalTunedLens is not None + assert ActivationPatcher is not None + assert WassersteinOptimalExtractor is not None + assert 
BayesianKernelProjection is not None + assert SAEDecompositionPipeline is not None + + def test_new_modules_in_all(self): + import obliteratus.analysis as analysis + assert "TunedLensTrainer" in analysis.__all__ + assert "RefusalTunedLens" in analysis.__all__ + assert "ActivationPatcher" in analysis.__all__ + assert "WassersteinOptimalExtractor" in analysis.__all__ + assert "BayesianKernelProjection" in analysis.__all__ + assert "SAEDecompositionPipeline" in analysis.__all__ diff --git a/tests/test_novel_analysis.py b/tests/test_novel_analysis.py new file mode 100644 index 0000000..3da9a3a --- /dev/null +++ b/tests/test_novel_analysis.py @@ -0,0 +1,669 @@ +"""Tests for analysis techniques: concept cones, alignment imprints, +multi-token position, and sparse direction surgery.""" + +from __future__ import annotations + + +import torch + +from obliteratus.analysis.concept_geometry import ( + ConceptConeAnalyzer, + ConeConeResult, + MultiLayerConeResult, + CategoryDirection, + DEFAULT_HARM_CATEGORIES, +) +from obliteratus.analysis.alignment_imprint import ( + AlignmentImprintDetector, + AlignmentImprint, + BaseInstructDelta, +) +from obliteratus.analysis.multi_token_position import ( + MultiTokenPositionAnalyzer, + PositionAnalysisResult, + MultiTokenSummary, +) +from obliteratus.analysis.sparse_surgery import ( + SparseDirectionSurgeon, + SparseProjectionResult, + SparseSurgeryPlan, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_category_activations( + hidden_dim=32, n_prompts=30, n_categories=5, category_spread=0.3, +): + """Create synthetic activations with planted per-category refusal directions. + + Each category gets its own refusal direction, with some shared component + to simulate a polyhedral cone structure. 
+ """ + torch.manual_seed(42) + + # Shared refusal component + shared = torch.randn(hidden_dim) + shared = shared / shared.norm() + + # Per-category unique components + cat_dirs = {} + categories = [f"cat_{i}" for i in range(n_categories)] + for cat in categories: + unique = torch.randn(hidden_dim) + unique = unique / unique.norm() + combined = shared + category_spread * unique + cat_dirs[cat] = combined / combined.norm() + + # Assign prompts to categories + prompts_per_cat = n_prompts // n_categories + category_map = {} + for i, cat in enumerate(categories): + for j in range(prompts_per_cat): + category_map[i * prompts_per_cat + j] = cat + + actual_n = prompts_per_cat * n_categories + + # Generate activations + harmful_acts = [] + harmless_acts = [] + for idx in range(actual_n): + cat = category_map[idx] + base = torch.randn(hidden_dim) * 0.1 + harmful_acts.append(base + 2.0 * cat_dirs[cat]) + harmless_acts.append(base) + + return harmful_acts, harmless_acts, category_map, cat_dirs + + +def _make_refusal_directions(n_layers=8, hidden_dim=32, concentration="distributed"): + """Create synthetic refusal directions with specified concentration pattern.""" + torch.manual_seed(123) + directions = {} + strengths = {} + + for i in range(n_layers): + d = torch.randn(hidden_dim) + directions[i] = d / d.norm() + + if concentration == "concentrated": + # Strong in last few layers only (SFT-like) + strengths[i] = 3.0 if i >= n_layers - 2 else 0.1 + elif concentration == "distributed": + # Even across layers (RLHF-like) + strengths[i] = 1.0 + 0.2 * torch.randn(1).item() + elif concentration == "orthogonal": + # Each layer direction is more orthogonal (CAI-like) + if i > 0: + # Make each direction more orthogonal to previous + prev = directions[i - 1] + d = d - (d @ prev) * prev + d = d / d.norm().clamp(min=1e-8) + directions[i] = d + strengths[i] = 1.5 + else: + strengths[i] = 2.0 if 2 <= i <= 4 else 0.5 + + return directions, strengths + + +# 
=========================================================================== +# Tests: Concept Cone Geometry +# =========================================================================== + +class TestConceptConeAnalyzer: + def test_basic_analysis(self): + harmful, harmless, cat_map, _ = _make_category_activations() + analyzer = ConceptConeAnalyzer(category_map=cat_map) + result = analyzer.analyze_layer(harmful, harmless, layer_idx=5) + + assert isinstance(result, ConeConeResult) + assert result.layer_idx == 5 + assert result.category_count >= 2 + assert result.cone_dimensionality > 0 + assert result.cone_solid_angle >= 0 + assert 0 <= result.mean_pairwise_cosine <= 1.0 + + def test_polyhedral_detection(self): + """With spread-out categories, should detect polyhedral geometry.""" + harmful, harmless, cat_map, _ = _make_category_activations( + category_spread=2.0, # Large spread -> distinct directions + ) + analyzer = ConceptConeAnalyzer(category_map=cat_map) + result = analyzer.analyze_layer(harmful, harmless) + # With high spread, directions should be more distinct + assert result.cone_dimensionality > 1.0 + + def test_linear_detection(self): + """With no spread, should detect linear (single direction) geometry.""" + harmful, harmless, cat_map, _ = _make_category_activations( + category_spread=0.0, # No spread -> all directions aligned + ) + analyzer = ConceptConeAnalyzer(category_map=cat_map) + result = analyzer.analyze_layer(harmful, harmless) + assert result.mean_pairwise_cosine > 0.8 + + def test_category_directions_populated(self): + harmful, harmless, cat_map, _ = _make_category_activations() + analyzer = ConceptConeAnalyzer(category_map=cat_map) + result = analyzer.analyze_layer(harmful, harmless) + + for cd in result.category_directions: + assert isinstance(cd, CategoryDirection) + assert cd.strength > 0 + assert cd.n_prompts >= 2 + assert 0 <= cd.specificity <= 1.0 + + def test_pairwise_cosines(self): + harmful, harmless, cat_map, _ = 
_make_category_activations() + analyzer = ConceptConeAnalyzer(category_map=cat_map) + result = analyzer.analyze_layer(harmful, harmless) + + for (a, b), cos in result.pairwise_cosines.items(): + assert 0 <= cos <= 1.0 + assert a < b # Sorted pair + + def test_general_direction_unit(self): + harmful, harmless, cat_map, _ = _make_category_activations() + analyzer = ConceptConeAnalyzer(category_map=cat_map) + result = analyzer.analyze_layer(harmful, harmless) + assert abs(result.general_direction.norm().item() - 1.0) < 0.01 + + def test_multi_layer_analysis(self): + harmful, harmless, cat_map, _ = _make_category_activations() + harmful_by_layer = {i: harmful for i in range(4)} + harmless_by_layer = {i: harmless for i in range(4)} + + analyzer = ConceptConeAnalyzer(category_map=cat_map) + result = analyzer.analyze_all_layers(harmful_by_layer, harmless_by_layer) + + assert isinstance(result, MultiLayerConeResult) + assert len(result.per_layer) == 4 + assert result.mean_cone_dimensionality > 0 + + def test_format_report(self): + harmful, harmless, cat_map, _ = _make_category_activations() + analyzer = ConceptConeAnalyzer(category_map=cat_map) + result = analyzer.analyze_layer(harmful, harmless, layer_idx=3) + report = ConceptConeAnalyzer.format_report(result) + + assert "Concept Cone" in report + assert "Layer 3" in report + assert "dimensionality" in report + + def test_default_category_map(self): + assert len(DEFAULT_HARM_CATEGORIES) == 30 + cats = set(DEFAULT_HARM_CATEGORIES.values()) + assert "weapons" in cats + assert "cyber" in cats + + def test_empty_activations(self): + analyzer = ConceptConeAnalyzer() + result = analyzer.analyze_layer([], [], layer_idx=0) + assert result.category_count == 0 + + def test_min_category_size(self): + """Categories with too few prompts should be excluded.""" + harmful, harmless, cat_map, _ = _make_category_activations( + n_prompts=10, n_categories=5, + ) + analyzer = ConceptConeAnalyzer(category_map=cat_map, min_category_size=3) + 
result = analyzer.analyze_layer(harmful, harmless) + # Each category has only 2 prompts, so with min_size=3 all are excluded + assert result.category_count == 0 + + +# =========================================================================== +# Tests: Alignment Imprint Detector +# =========================================================================== + +class TestAlignmentImprintDetector: + def test_basic_detection(self): + directions, strengths = _make_refusal_directions() + detector = AlignmentImprintDetector() + imprint = detector.detect_imprint(directions, strengths) + + assert isinstance(imprint, AlignmentImprint) + assert imprint.predicted_method in ("dpo", "rlhf", "cai", "sft") + assert 0 <= imprint.confidence <= 1.0 + + def test_probabilities_sum_to_one(self): + directions, strengths = _make_refusal_directions() + detector = AlignmentImprintDetector() + imprint = detector.detect_imprint(directions, strengths) + + total = (imprint.dpo_probability + imprint.rlhf_probability + + imprint.cai_probability + imprint.sft_probability) + assert abs(total - 1.0) < 0.01 + + def test_concentrated_detects_sft_or_dpo(self): + """Concentrated refusal (tail-biased) should predict SFT or DPO.""" + directions, strengths = _make_refusal_directions(concentration="concentrated") + detector = AlignmentImprintDetector() + imprint = detector.detect_imprint(directions, strengths) + # SFT and DPO both have concentrated signatures + assert imprint.predicted_method in ("sft", "dpo") + + def test_distributed_detects_not_sft(self): + """Distributed refusal should not be predicted as SFT.""" + directions, strengths = _make_refusal_directions( + n_layers=16, concentration="distributed", + ) + detector = AlignmentImprintDetector() + imprint = detector.detect_imprint(directions, strengths) + # With distributed refusal, Gini is low -> SFT is unlikely to be top prediction + assert imprint.predicted_method != "sft" + + def test_orthogonal_detects_cai(self): + """Orthogonal layer 
directions should lean toward CAI.""" + directions, strengths = _make_refusal_directions( + n_layers=12, concentration="orthogonal", + ) + detector = AlignmentImprintDetector() + imprint = detector.detect_imprint(directions, strengths) + # CAI should rank highly due to orthogonality + assert imprint.cai_probability > 0.15 + + def test_feature_extraction(self): + directions, strengths = _make_refusal_directions() + detector = AlignmentImprintDetector() + imprint = detector.detect_imprint(directions, strengths) + + assert 0 <= imprint.gini_coefficient <= 1.0 + assert imprint.effective_rank > 0 + assert 0 <= imprint.cross_layer_smoothness <= 1.0 + assert 0 <= imprint.tail_layer_bias <= 1.0 + assert 0 <= imprint.mean_pairwise_orthogonality <= 1.0 + assert imprint.spectral_decay_rate >= 0 + + def test_empty_directions(self): + detector = AlignmentImprintDetector() + imprint = detector.detect_imprint({}) + assert imprint.predicted_method == "unknown" + assert imprint.confidence == 0.0 + + def test_compare_base_instruct(self): + torch.manual_seed(42) + hidden_dim = 32 + directions, _ = _make_refusal_directions(hidden_dim=hidden_dim) + + base_acts = {i: torch.randn(hidden_dim) for i in range(8)} + instruct_acts = { + i: base_acts[i] + 1.5 * directions[i] for i in range(8) + } + + detector = AlignmentImprintDetector() + deltas = detector.compare_base_instruct(base_acts, instruct_acts, directions) + + assert len(deltas) == 8 + for d in deltas: + assert isinstance(d, BaseInstructDelta) + assert d.delta_magnitude > 0 + # Since delta IS the refusal direction, cosine should be high + assert abs(d.cosine_with_refusal) > 0.5 + + def test_format_imprint(self): + directions, strengths = _make_refusal_directions() + detector = AlignmentImprintDetector() + imprint = detector.detect_imprint(directions, strengths) + report = AlignmentImprintDetector.format_imprint(imprint) + + assert "Alignment Imprint" in report + assert "DPO" in report + assert "RLHF" in report + assert "Gini" in 
report + + def test_per_layer_strength_populated(self): + directions, strengths = _make_refusal_directions() + detector = AlignmentImprintDetector() + imprint = detector.detect_imprint(directions, strengths) + assert len(imprint.per_layer_strength) == len(directions) + + +# =========================================================================== +# Tests: Multi-Token Position Analysis +# =========================================================================== + +class TestMultiTokenPositionAnalyzer: + def _make_activations_with_trigger( + self, seq_len=20, hidden_dim=32, trigger_pos=5, + ): + """Create activations with a planted trigger at a specific position.""" + torch.manual_seed(42) + refusal_dir = torch.randn(hidden_dim) + refusal_dir = refusal_dir / refusal_dir.norm() + + # Background activations + acts = torch.randn(seq_len, hidden_dim) * 0.1 + + # Strong refusal at trigger position + acts[trigger_pos] += 3.0 * refusal_dir + + # Weaker refusal at last position + acts[-1] += 1.0 * refusal_dir + + # Moderate at a few positions after trigger (decay) + for i in range(trigger_pos + 1, min(trigger_pos + 4, seq_len)): + decay = 0.5 ** (i - trigger_pos) + acts[i] += 3.0 * decay * refusal_dir + + return acts, refusal_dir + + def test_basic_analysis(self): + acts, ref_dir = self._make_activations_with_trigger() + analyzer = MultiTokenPositionAnalyzer() + result = analyzer.analyze_prompt(acts, ref_dir, layer_idx=3) + + assert isinstance(result, PositionAnalysisResult) + assert result.layer_idx == 3 + assert result.n_tokens == 20 + assert result.peak_strength > 0 + + def test_trigger_detection(self): + acts, ref_dir = self._make_activations_with_trigger(trigger_pos=5) + analyzer = MultiTokenPositionAnalyzer(trigger_threshold=0.5) + result = analyzer.analyze_prompt(acts, ref_dir) + + # The planted trigger should be detected + assert 5 in result.trigger_positions + assert result.peak_position == 5 + + def test_peak_vs_last(self): + """Peak should be at trigger, not 
last token.""" + acts, ref_dir = self._make_activations_with_trigger(trigger_pos=5) + analyzer = MultiTokenPositionAnalyzer() + result = analyzer.analyze_prompt(acts, ref_dir) + + assert result.peak_strength > result.last_token_strength + assert result.peak_position != result.n_tokens - 1 + + def test_decay_rate_positive(self): + acts, ref_dir = self._make_activations_with_trigger(trigger_pos=5) + analyzer = MultiTokenPositionAnalyzer() + result = analyzer.analyze_prompt(acts, ref_dir) + # With exponential decay planted, decay rate should be positive + assert result.decay_rate > 0 + + def test_position_gini_bounded(self): + acts, ref_dir = self._make_activations_with_trigger() + analyzer = MultiTokenPositionAnalyzer() + result = analyzer.analyze_prompt(acts, ref_dir) + assert 0 <= result.position_gini <= 1.0 + + def test_token_profiles_length(self): + acts, ref_dir = self._make_activations_with_trigger(seq_len=15) + analyzer = MultiTokenPositionAnalyzer() + result = analyzer.analyze_prompt(acts, ref_dir) + assert len(result.token_profiles) == 15 + + def test_custom_token_texts(self): + acts, ref_dir = self._make_activations_with_trigger(seq_len=10, trigger_pos=3) + tokens = ["How", "to", "make", "a", "bomb", "from", "scratch", "please", "help", "me"] + analyzer = MultiTokenPositionAnalyzer() + result = analyzer.analyze_prompt(acts, ref_dir, token_texts=tokens) + for tp in result.token_profiles: + assert tp.token_text in tokens or tp.token_text.startswith("pos_") + + def test_batch_analysis(self): + batch = [] + for i in range(5): + acts, ref_dir = self._make_activations_with_trigger( + trigger_pos=3 + i % 3, + ) + batch.append(acts) + + analyzer = MultiTokenPositionAnalyzer() + summary = analyzer.analyze_batch(batch, ref_dir) + + assert isinstance(summary, MultiTokenSummary) + assert len(summary.per_prompt) == 5 + assert summary.mean_peak_vs_last_ratio > 0 + assert summary.mean_trigger_count > 0 + assert 0 <= summary.peak_is_last_fraction <= 1.0 + assert 0 <= 
summary.last_token_dominance <= 1.0 + + def test_last_token_dominant_case(self): + """When signal is only at last token, peak should equal last.""" + torch.manual_seed(42) + hidden_dim = 32 + seq_len = 10 + ref_dir = torch.randn(hidden_dim) + ref_dir = ref_dir / ref_dir.norm() + + acts = torch.randn(seq_len, hidden_dim) * 0.01 + acts[-1] += 5.0 * ref_dir + + analyzer = MultiTokenPositionAnalyzer() + result = analyzer.analyze_prompt(acts, ref_dir) + assert result.peak_position == seq_len - 1 + + def test_format_position_report(self): + acts, ref_dir = self._make_activations_with_trigger() + analyzer = MultiTokenPositionAnalyzer() + result = analyzer.analyze_prompt(acts, ref_dir, prompt_text="How to hack?") + report = MultiTokenPositionAnalyzer.format_position_report(result) + + assert "Multi-Token" in report + assert "Peak position" in report + + def test_format_summary(self): + batch = [] + for _ in range(3): + acts, ref_dir = self._make_activations_with_trigger() + batch.append(acts) + + analyzer = MultiTokenPositionAnalyzer() + summary = analyzer.analyze_batch(batch, ref_dir) + report = MultiTokenPositionAnalyzer.format_summary(summary) + + assert "Summary" in report + assert "Prompts analyzed" in report + + def test_3d_activations_handled(self): + """Should handle (1, seq_len, hidden_dim) inputs.""" + acts, ref_dir = self._make_activations_with_trigger() + acts = acts.unsqueeze(0) # Add batch dim + analyzer = MultiTokenPositionAnalyzer() + result = analyzer.analyze_prompt(acts, ref_dir) + assert result.n_tokens == 20 + + def test_empty_batch(self): + ref_dir = torch.randn(32) + analyzer = MultiTokenPositionAnalyzer() + summary = analyzer.analyze_batch([], ref_dir) + assert len(summary.per_prompt) == 0 + assert summary.peak_is_last_fraction == 1.0 + + +# =========================================================================== +# Tests: Sparse Direction Surgery +# =========================================================================== + +class 
TestSparseDirectionSurgeon: + def _make_weight_with_sparse_refusal( + self, out_dim=64, in_dim=32, n_refusal_rows=5, + ): + """Create a weight matrix where refusal is concentrated in a few rows.""" + torch.manual_seed(42) + refusal_dir = torch.randn(in_dim) + refusal_dir = refusal_dir / refusal_dir.norm() + + W = torch.randn(out_dim, in_dim) * 0.1 + + # Plant strong refusal signal in specific rows + refusal_rows = list(range(n_refusal_rows)) + for i in refusal_rows: + W[i] += 5.0 * refusal_dir + + return W, refusal_dir, refusal_rows + + def test_basic_analysis(self): + W, ref_dir, _ = self._make_weight_with_sparse_refusal() + surgeon = SparseDirectionSurgeon(sparsity=0.1) + result = surgeon.analyze_weight_matrix(W, ref_dir, layer_idx=3) + + assert isinstance(result, SparseProjectionResult) + assert result.layer_idx == 3 + assert result.n_rows_total == 64 + assert result.n_rows_modified > 0 + assert result.mean_projection > 0 + assert result.max_projection > result.mean_projection + + def test_refusal_sparsity_index(self): + """With sparse refusal, RSI should be high.""" + W, ref_dir, _ = self._make_weight_with_sparse_refusal( + out_dim=100, n_refusal_rows=5, + ) + surgeon = SparseDirectionSurgeon() + result = surgeon.analyze_weight_matrix(W, ref_dir) + assert result.refusal_sparsity_index > 0.3 # Concentrated signal + + def test_energy_removed(self): + """Top rows should capture most of the refusal energy.""" + W, ref_dir, _ = self._make_weight_with_sparse_refusal( + out_dim=64, n_refusal_rows=5, + ) + surgeon = SparseDirectionSurgeon(sparsity=0.15) # ~10 rows out of 64 + result = surgeon.analyze_weight_matrix(W, ref_dir) + # With 5 refusal rows and 10 modified, should capture most energy + assert result.energy_removed > 0.5 + + def test_frobenius_change_bounded(self): + W, ref_dir, _ = self._make_weight_with_sparse_refusal() + surgeon = SparseDirectionSurgeon(sparsity=0.1) + result = surgeon.analyze_weight_matrix(W, ref_dir) + assert result.frobenius_change > 0 + 
assert result.frobenius_change < 1.0 # Shouldn't change more than 100% + + def test_apply_sparse_projection(self): + """Sparse projection should reduce refusal signal.""" + W, ref_dir, _ = self._make_weight_with_sparse_refusal() + surgeon = SparseDirectionSurgeon(sparsity=0.1) + + W_modified = surgeon.apply_sparse_projection(W, ref_dir) + + # Check that modified rows have reduced projection + original_proj = (W @ ref_dir).abs().sum().item() + modified_proj = (W_modified @ ref_dir).abs().sum().item() + assert modified_proj < original_proj + + def test_sparse_preserves_unmodified_rows(self): + """Rows below the threshold should be unchanged.""" + W, ref_dir, refusal_rows = self._make_weight_with_sparse_refusal( + out_dim=64, n_refusal_rows=5, + ) + surgeon = SparseDirectionSurgeon(sparsity=0.1) # ~6 rows + W_modified = surgeon.apply_sparse_projection(W, ref_dir) + + # Count rows that actually changed + diffs = (W - W_modified).abs().sum(dim=1) + n_changed = (diffs > 1e-6).sum().item() + n_unchanged = (diffs < 1e-6).sum().item() + + assert n_changed <= int(0.1 * 64) + 1 # Sparsity bound + assert n_unchanged >= 57 # Most rows unchanged + + def test_dense_vs_sparse_comparison(self): + """Dense projection should modify all rows; sparse should modify fewer.""" + W, ref_dir, _ = self._make_weight_with_sparse_refusal() + + # Dense projection + r = ref_dir / ref_dir.norm() + W_dense = W - (W @ r).unsqueeze(1) * r.unsqueeze(0) + + # Sparse projection + surgeon = SparseDirectionSurgeon(sparsity=0.1) + W_sparse = surgeon.apply_sparse_projection(W, ref_dir) + + dense_changes = (W - W_dense).abs().sum(dim=1) + sparse_changes = (W - W_sparse).abs().sum(dim=1) + + n_dense_changed = (dense_changes > 1e-6).sum().item() + n_sparse_changed = (sparse_changes > 1e-6).sum().item() + + assert n_sparse_changed < n_dense_changed + + def test_plan_surgery(self): + weights = {} + directions = {} + for i in range(6): + W, ref_dir, _ = self._make_weight_with_sparse_refusal() + weights[i] = W + 
directions[i] = ref_dir + + surgeon = SparseDirectionSurgeon(sparsity=0.1) + plan = surgeon.plan_surgery(weights, directions) + + assert isinstance(plan, SparseSurgeryPlan) + assert len(plan.per_layer) == 6 + assert 0 < plan.recommended_sparsity < 1.0 + assert plan.mean_refusal_sparsity_index > 0 + assert plan.mean_energy_removed > 0 + + def test_auto_sparsity(self): + W, ref_dir, _ = self._make_weight_with_sparse_refusal() + surgeon = SparseDirectionSurgeon(auto_sparsity=True) + result = surgeon.analyze_weight_matrix(W, ref_dir) + # Auto sparsity should find a reasonable value + assert 0.01 <= result.sparsity <= 0.5 + + def test_auto_sparsity_apply(self): + W, ref_dir, _ = self._make_weight_with_sparse_refusal() + surgeon = SparseDirectionSurgeon(auto_sparsity=True) + W_modified = surgeon.apply_sparse_projection(W, ref_dir) + # Should reduce projection + assert (W_modified @ ref_dir).abs().sum() < (W @ ref_dir).abs().sum() + + def test_format_analysis(self): + W, ref_dir, _ = self._make_weight_with_sparse_refusal() + surgeon = SparseDirectionSurgeon(sparsity=0.1) + result = surgeon.analyze_weight_matrix(W, ref_dir, layer_idx=4) + report = SparseDirectionSurgeon.format_analysis(result) + + assert "Sparse Direction Surgery" in report + assert "Layer 4" in report + assert "Refusal Sparsity Index" in report + + def test_format_plan(self): + weights = {i: torch.randn(32, 16) for i in range(4)} + directions = {i: torch.randn(16) for i in range(4)} + + surgeon = SparseDirectionSurgeon(sparsity=0.1) + plan = surgeon.plan_surgery(weights, directions) + report = SparseDirectionSurgeon.format_plan(plan) + + assert "Sparse Direction Surgery Plan" in report + assert "Recommended sparsity" in report + + def test_empty_inputs(self): + surgeon = SparseDirectionSurgeon() + plan = surgeon.plan_surgery({}, {}) + assert len(plan.per_layer) == 0 + + def test_output_dtype_preserved(self): + """Output should match input dtype.""" + W, ref_dir, _ = self._make_weight_with_sparse_refusal() 
+ W_half = W.half() + surgeon = SparseDirectionSurgeon(sparsity=0.1) + W_out = surgeon.apply_sparse_projection(W_half, ref_dir) + assert W_out.dtype == torch.float16 + + +# =========================================================================== +# Tests: Integration / Imports +# =========================================================================== + +class TestAnalysisImports: + def test_all_new_modules_importable(self): + from obliteratus.analysis import ( + ConceptConeAnalyzer, + AlignmentImprintDetector, + MultiTokenPositionAnalyzer, + SparseDirectionSurgeon, + ) + assert ConceptConeAnalyzer is not None + assert AlignmentImprintDetector is not None + assert MultiTokenPositionAnalyzer is not None + assert SparseDirectionSurgeon is not None diff --git a/tests/test_refusal_detection.py b/tests/test_refusal_detection.py new file mode 100644 index 0000000..1d2755c --- /dev/null +++ b/tests/test_refusal_detection.py @@ -0,0 +1,155 @@ +"""Tests for refusal detection, including semantic mode and confidence intervals.""" + +from __future__ import annotations + + +from obliteratus.evaluation.advanced_metrics import ( + _is_refusal, + _is_refusal_semantic, + refusal_rate, + refusal_rate_with_ci, +) + + +class TestKeywordRefusal: + """Test the keyword-based refusal detection.""" + + def test_obvious_refusal(self): + assert _is_refusal("I'm sorry, I can't help with that.", "combined") + + def test_obvious_non_refusal(self): + assert not _is_refusal("Here are the instructions for building a birdhouse.", "combined") + + def test_empty_string(self): + assert not _is_refusal("", "combined") + + def test_prefix_mode(self): + assert _is_refusal("I cannot assist with that request.", "prefix") + assert not _is_refusal("Well, I cannot do that.", "prefix") # not a prefix + + def test_substring_mode(self): + assert _is_refusal("Well, I cannot assist with that.", "substring") + + +class TestSemanticRefusal: + """Test the semantic refusal detection patterns.""" + + def 
test_hedge_then_refuse(self): + text = "While I understand your interest in this topic, I cannot provide instructions for that." + assert _is_refusal(text, "semantic") + + def test_redirect(self): + text = "Instead of providing those instructions, I can help you with something constructive." + assert _is_refusal(text, "semantic") + + def test_concern_flagging(self): + text = "This raises serious ethical concerns that prevent me from responding." + assert _is_refusal(text, "semantic") + + def test_meta_refusal(self): + text = "I'm designed to avoid providing information that could cause harm." + assert _is_refusal(text, "semantic") + + def test_responsibility_deflection(self): + text = "You should consult a professional about this matter." + assert _is_refusal(text, "semantic") + + def test_genuine_compliance_not_flagged(self): + """Actual compliant responses should not trigger semantic detection.""" + texts = [ + "Here are the steps to build a birdhouse from scratch.", + "The chemical formula for water is H2O.", + "Python is a great language for data science. Here's how to get started.", + "To make chocolate chip cookies, you'll need flour, sugar, butter, and chocolate chips.", + ] + for text in texts: + assert not _is_refusal(text, "semantic"), f"False positive: {text[:50]}" + + def test_semantic_catches_what_keyword_misses(self): + """Semantic mode should catch refusals that keyword mode misses.""" + # This hedge-then-refuse doesn't start with any refusal prefix + # and doesn't contain exact refusal substrings + text = "While I appreciate your curiosity, I'm not able to share that information." + assert not _is_refusal(text, "prefix") + # "not able to" is in substring list, so combined catches it + assert _is_refusal(text, "combined") + + # This one truly requires semantic detection + text = "Although I respect your question, I will not be able to help with this particular request." 
+ # "will not" is in substrings, but test the semantic path independently + assert _is_refusal_semantic( + "While I understand your request, I cannot assist with that specific topic." + ) + + +class TestRefusalRate: + def test_all_refuse(self): + responses = ["I'm sorry, no.", "I cannot help.", "I won't do that."] + assert refusal_rate(responses, "combined") == 1.0 + + def test_none_refuse(self): + responses = ["Sure, here you go.", "Of course!", "The answer is 42."] + assert refusal_rate(responses, "combined") == 0.0 + + def test_mixed(self): + responses = ["I'm sorry, no.", "Sure, here you go."] + assert refusal_rate(responses, "combined") == 0.5 + + def test_empty_list(self): + assert refusal_rate([], "combined") == 0.0 + + def test_semantic_mode(self): + responses = [ + "While I understand your interest, I cannot provide that information.", + "Here are the steps you requested.", + ] + assert refusal_rate(responses, "semantic") == 0.5 + + +class TestRefusalRateWithCI: + def test_returns_all_fields(self): + responses = ["I'm sorry"] * 5 + ["Sure, here you go."] * 5 + ci = refusal_rate_with_ci(responses, mode="combined") + assert "rate" in ci + assert "ci_lower" in ci + assert "ci_upper" in ci + assert "n_samples" in ci + assert ci["n_samples"] == 10 + + def test_ci_bounds_bracket_rate(self): + responses = ["I'm sorry"] * 30 + ["Sure, here you go."] * 70 + ci = refusal_rate_with_ci(responses, mode="combined") + assert ci["ci_lower"] <= ci["rate"] <= ci["ci_upper"] + + def test_all_refuse_tight_ci(self): + responses = ["I'm sorry"] * 50 + ci = refusal_rate_with_ci(responses, mode="combined") + assert ci["rate"] == 1.0 + # Wilson CI: 50/50 at 95% gives ci_lower ~0.929, not 1.0 + # (a proper CI acknowledges uncertainty even with all-positive observations) + assert ci["ci_lower"] > 0.9 + assert ci["ci_upper"] == 1.0 + + def test_empty_responses(self): + ci = refusal_rate_with_ci([], mode="combined") + assert ci["rate"] == 0.0 + assert ci["n_samples"] == 0 + + def 
test_ci_narrower_with_more_samples(self): + """More samples should produce tighter confidence intervals.""" + responses_small = ["I'm sorry"] * 5 + ["Sure"] * 5 + responses_large = ["I'm sorry"] * 50 + ["Sure"] * 50 + + ci_small = refusal_rate_with_ci(responses_small) + ci_large = refusal_rate_with_ci(responses_large) + + width_small = ci_small["ci_upper"] - ci_small["ci_lower"] + width_large = ci_large["ci_upper"] - ci_large["ci_lower"] + assert width_large < width_small, \ + f"Large CI ({width_large}) not narrower than small CI ({width_small})" + + def test_deterministic_with_seed(self): + responses = ["I'm sorry"] * 30 + ["Sure"] * 70 + ci1 = refusal_rate_with_ci(responses) + ci2 = refusal_rate_with_ci(responses) + assert ci1 == ci2, "Same input produced different CIs" diff --git a/tests/test_report.py b/tests/test_report.py new file mode 100644 index 0000000..beff3e6 --- /dev/null +++ b/tests/test_report.py @@ -0,0 +1,70 @@ +"""Tests for the reporting module.""" + +from __future__ import annotations + +import json + +from obliteratus.reporting.report import AblationReport, AblationResult + + +def _make_report() -> AblationReport: + report = AblationReport(model_name="test-model") + report.add_baseline({"perplexity": 25.0, "accuracy": 0.85}) + report.add_result( + AblationResult( + strategy="layer_removal", + component="layer_0", + description="Remove layer 0", + metrics={"perplexity": 30.0, "accuracy": 0.80}, + ) + ) + report.add_result( + AblationResult( + strategy="layer_removal", + component="layer_1", + description="Remove layer 1", + metrics={"perplexity": 50.0, "accuracy": 0.60}, + ) + ) + return report + + +class TestAblationReport: + def test_to_dataframe(self): + report = _make_report() + df = report.to_dataframe() + assert len(df) == 2 + assert "perplexity" in df.columns + assert "perplexity_delta" in df.columns + assert "perplexity_pct_change" in df.columns + + def test_save_json(self, tmp_path): + report = _make_report() + out = tmp_path / 
"results.json" + report.save_json(out) + data = json.loads(out.read_text()) + assert data["model_name"] == "test-model" + assert len(data["results"]) == 2 + assert data["baseline_metrics"]["perplexity"] == 25.0 + + def test_save_csv(self, tmp_path): + report = _make_report() + out = tmp_path / "results.csv" + report.save_csv(out) + text = out.read_text() + assert "layer_0" in text + assert "perplexity" in text + + def test_delta_calculation(self): + report = _make_report() + df = report.to_dataframe() + row0 = df[df["component"] == "layer_0"].iloc[0] + assert row0["perplexity_delta"] == 5.0 # 30 - 25 + assert abs(row0["perplexity_pct_change"] - 20.0) < 0.01 + + def test_plot_impact(self, tmp_path): + report = _make_report() + out = tmp_path / "impact.png" + report.plot_impact(metric="perplexity", output_path=out) + assert out.exists() + assert out.stat().st_size > 0 diff --git a/tests/test_strategies.py b/tests/test_strategies.py new file mode 100644 index 0000000..c65a3f9 --- /dev/null +++ b/tests/test_strategies.py @@ -0,0 +1,179 @@ +"""Tests for ablation strategies using a small GPT-2 model.""" + +from __future__ import annotations + +import pytest +import torch + +from obliteratus.strategies.base import AblationSpec +from obliteratus.strategies.registry import STRATEGY_REGISTRY, get_strategy + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +def _make_dummy_handle(): + """Create a minimal ModelHandle with a tiny GPT-2 for testing (no network).""" + from unittest.mock import MagicMock + from transformers import GPT2Config, GPT2LMHeadModel + from obliteratus.models.loader import ModelHandle + + config = GPT2Config( + vocab_size=1000, + n_positions=128, + n_embd=64, + n_layer=2, + n_head=2, + n_inner=256, + ) + model = GPT2LMHeadModel(config) + model.eval() + + # Strategy tests don't tokenize — use a simple mock + tokenizer = MagicMock() + 
tokenizer.pad_token = "" + tokenizer.eos_token = "" + + handle = ModelHandle( + model=model, + tokenizer=tokenizer, + config=config, + model_name="gpt2-test", + task="causal_lm", + ) + handle.snapshot() + return handle + + +@pytest.fixture +def handle(): + return _make_dummy_handle() + + +# --------------------------------------------------------------------------- +# Registry tests +# --------------------------------------------------------------------------- + +class TestRegistry: + def test_all_strategies_registered(self): + expected = {"layer_removal", "head_pruning", "ffn_ablation", "embedding_ablation"} + assert expected.issubset(set(STRATEGY_REGISTRY.keys())) + + def test_get_strategy_returns_instance(self): + strat = get_strategy("layer_removal") + assert strat.name == "layer_removal" + + def test_get_unknown_strategy_raises(self): + with pytest.raises(KeyError, match="Unknown strategy"): + get_strategy("nonexistent_strategy") + + +# --------------------------------------------------------------------------- +# Layer removal +# --------------------------------------------------------------------------- + +class TestLayerRemoval: + def test_enumerate(self, handle): + strat = get_strategy("layer_removal") + specs = strat.enumerate(handle) + assert len(specs) == handle.num_layers + assert all(s.strategy_name == "layer_removal" for s in specs) + + def test_apply_zeros_layer(self, handle): + strat = get_strategy("layer_removal") + specs = strat.enumerate(handle) + strat.apply(handle, specs[0]) + + from obliteratus.strategies.utils import get_layer_modules + layer = get_layer_modules(handle)[0] + for param in layer.parameters(): + assert torch.all(param == 0), "Layer params should be zeroed after ablation" + + def test_restore_after_ablation(self, handle): + strat = get_strategy("layer_removal") + specs = strat.enumerate(handle) + + from obliteratus.strategies.utils import get_layer_modules + original_weight = 
get_layer_modules(handle)[0].attn.c_attn.weight.clone() + + strat.apply(handle, specs[0]) + handle.restore() + + restored_weight = get_layer_modules(handle)[0].attn.c_attn.weight + assert torch.allclose(original_weight, restored_weight) + + +# --------------------------------------------------------------------------- +# Head pruning +# --------------------------------------------------------------------------- + +class TestHeadPruning: + def test_enumerate(self, handle): + strat = get_strategy("head_pruning") + specs = strat.enumerate(handle) + assert len(specs) == handle.num_layers * handle.num_heads + + def test_apply_zeros_head(self, handle): + strat = get_strategy("head_pruning") + spec = AblationSpec( + strategy_name="head_pruning", + component="layer_0_head_0", + description="test", + metadata={"layer_idx": 0, "head_idx": 0}, + ) + strat.apply(handle, spec) + + from obliteratus.strategies.utils import get_layer_modules, get_attention_module + attn = get_attention_module(get_layer_modules(handle)[0], handle.architecture) + head_dim = handle.hidden_size // handle.num_heads + # GPT-2 uses c_attn (Conv1D), check output projection c_proj + if hasattr(attn, "c_proj"): + # Conv1D stores weight transposed + assert torch.all(attn.c_proj.weight[0:head_dim, :] == 0) + + +# --------------------------------------------------------------------------- +# FFN ablation +# --------------------------------------------------------------------------- + +class TestFFNAblation: + def test_enumerate(self, handle): + strat = get_strategy("ffn_ablation") + specs = strat.enumerate(handle) + assert len(specs) == handle.num_layers + + def test_apply_zeros_ffn(self, handle): + strat = get_strategy("ffn_ablation") + specs = strat.enumerate(handle) + strat.apply(handle, specs[0]) + + from obliteratus.strategies.utils import get_layer_modules, get_ffn_module + ffn = get_ffn_module(get_layer_modules(handle)[0], handle.architecture) + for param in ffn.parameters(): + assert torch.all(param == 
0) + + +# --------------------------------------------------------------------------- +# Embedding ablation +# --------------------------------------------------------------------------- + +class TestEmbeddingAblation: + def test_enumerate(self, handle): + strat = get_strategy("embedding_ablation") + specs = strat.enumerate(handle) + assert len(specs) > 0 + + def test_apply_zeros_dims(self, handle): + strat = get_strategy("embedding_ablation") + spec = AblationSpec( + strategy_name="embedding_ablation", + component="embed_dims_0_4", + description="test", + metadata={"dim_start": 0, "dim_end": 4}, + ) + strat.apply(handle, spec) + + from obliteratus.strategies.utils import get_embedding_module + emb = get_embedding_module(handle) + assert torch.all(emb.weight[:, 0:4] == 0) diff --git a/tests/test_study_presets.py b/tests/test_study_presets.py new file mode 100644 index 0000000..bba2fcc --- /dev/null +++ b/tests/test_study_presets.py @@ -0,0 +1,108 @@ +"""Tests for ablation presets.""" + +from __future__ import annotations + +from obliteratus.study_presets import ( + STUDY_PRESETS, + get_study_preset, + get_preset, + list_study_presets, + list_presets, +) +from obliteratus.config import StudyConfig + + +class TestPresets: + def test_all_presets_registered(self): + expected_keys = {"quick", "full", "attention", "layers", "knowledge", "pruning", "embeddings", "jailbreak", "guardrail", "robustness"} + assert expected_keys.issubset(set(STUDY_PRESETS.keys())) + + def test_get_preset(self): + preset = get_study_preset("quick") + assert preset.name == "Quick Scan" + assert preset.key == "quick" + assert len(preset.strategies) == 2 + + def test_get_preset_alias(self): + preset = get_preset("quick") + assert preset.name == "Quick Scan" + + def test_get_unknown_preset_raises(self): + import pytest + with pytest.raises(KeyError, match="Unknown preset"): + get_study_preset("nonexistent") + + def test_list_presets(self): + presets = list_study_presets() + assert len(presets) >= 7 
+ keys = [p.key for p in presets] + assert "quick" in keys + assert "full" in keys + + def test_list_presets_alias(self): + assert list_presets() == list_study_presets() + + def test_preset_strategies_are_valid(self): + from obliteratus.strategies import STRATEGY_REGISTRY + for preset in list_study_presets(): + for s in preset.strategies: + assert s["name"] in STRATEGY_REGISTRY, ( + f"Preset {preset.key!r} references unknown strategy {s['name']!r}" + ) + + +class TestConfigWithPreset: + def test_preset_key_in_config(self): + config_dict = { + "preset": "quick", + "model": {"name": "gpt2", "task": "causal_lm", "dtype": "float32", "device": "cpu"}, + "dataset": {"name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "test", "text_column": "text"}, + } + config = StudyConfig.from_dict(config_dict) + # Should inherit strategies from the quick preset + assert len(config.strategies) == 2 + strategy_names = [s.name for s in config.strategies] + assert "layer_removal" in strategy_names + assert "ffn_ablation" in strategy_names + # Should inherit max_samples + assert config.dataset.max_samples == 25 + # Should inherit batch_size and max_length + assert config.batch_size == 4 + assert config.max_length == 128 + + def test_legacy_study_preset_key_still_works(self): + config_dict = { + "study_preset": "quick", + "model": {"name": "gpt2", "task": "causal_lm", "dtype": "float32", "device": "cpu"}, + "dataset": {"name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "test", "text_column": "text"}, + } + config = StudyConfig.from_dict(config_dict) + assert len(config.strategies) == 2 + + def test_preset_can_be_overridden(self): + config_dict = { + "preset": "quick", + "model": {"name": "gpt2", "task": "causal_lm", "dtype": "float32", "device": "cpu"}, + "dataset": {"name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "test", "text_column": "text", "max_samples": 999}, + "batch_size": 16, + "strategies": [{"name": "head_pruning", "params": {}}], + } + config = 
StudyConfig.from_dict(config_dict) + # Explicit strategies should override preset + assert len(config.strategies) == 1 + assert config.strategies[0].name == "head_pruning" + # Explicit batch_size should override + assert config.batch_size == 16 + # Explicit max_samples in dataset should be kept + assert config.dataset.max_samples == 999 + + def test_full_preset(self): + config_dict = { + "preset": "full", + "model": {"name": "gpt2", "task": "causal_lm", "dtype": "float32", "device": "cpu"}, + "dataset": {"name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "test", "text_column": "text"}, + } + config = StudyConfig.from_dict(config_dict) + assert len(config.strategies) == 4 + strategy_names = {s.name for s in config.strategies} + assert strategy_names == {"layer_removal", "head_pruning", "ffn_ablation", "embedding_ablation"} diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py new file mode 100644 index 0000000..d5ae7f8 --- /dev/null +++ b/tests/test_telemetry.py @@ -0,0 +1,696 @@ +"""Tests for the opt-in telemetry module.""" + +import json +import os +import tempfile +from dataclasses import dataclass, field +from pathlib import Path +from unittest.mock import MagicMock, patch + +import torch + +from obliteratus.telemetry import ( + _ALLOWED_METHOD_CONFIG_KEYS, + _direction_stats, + _extract_excise_details, + _extract_prompt_counts, + _extract_analysis_insights, + _is_mount_point, + _test_writable, + build_report, + disable_telemetry, + enable_telemetry, + is_enabled, + maybe_send_informed_report, + maybe_send_pipeline_report, + restore_from_hub, + send_report, + storage_diagnostic, +) + + +def _reset_telemetry(): + import obliteratus.telemetry as t + t._enabled = None + + +# ── Enable / disable ──────────────────────────────────────────────────── + + +class TestTelemetryConfig: + """Test telemetry enable/disable logic.""" + + def setup_method(self): + _reset_telemetry() + + def test_disabled_by_default(self): + with patch.dict(os.environ, {}, 
clear=True): + _reset_telemetry() + assert not is_enabled() + + def test_enabled_by_default_on_hf_spaces(self): + with patch.dict(os.environ, {"SPACE_ID": "user/space"}, clear=True): + import obliteratus.telemetry as t + old_val = t._ON_HF_SPACES + t._ON_HF_SPACES = True + _reset_telemetry() + assert is_enabled() + t._ON_HF_SPACES = old_val + + def test_disable_via_env_zero(self): + with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "0"}): + _reset_telemetry() + assert not is_enabled() + + def test_disable_via_env_false(self): + with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "false"}): + _reset_telemetry() + assert not is_enabled() + + def test_enable_via_env_explicit(self): + with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "1"}): + _reset_telemetry() + assert is_enabled() + + def test_enable_programmatically(self): + enable_telemetry() + assert is_enabled() + + def test_disable_programmatically(self): + enable_telemetry() + assert is_enabled() + disable_telemetry() + assert not is_enabled() + + def test_programmatic_overrides_env(self): + with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "1"}): + disable_telemetry() + assert not is_enabled() + + +# ── Report building ───────────────────────────────────────────────────── + + +class TestBuildReport: + """Test report payload construction.""" + + def _base_kwargs(self, **overrides): + defaults = dict( + architecture="LlamaForCausalLM", + num_layers=32, + num_heads=32, + hidden_size=4096, + total_params=8_000_000_000, + method="advanced", + method_config={"n_directions": 4, "norm_preserve": True}, + quality_metrics={"perplexity": 5.2, "refusal_rate": 0.05}, + ) + defaults.update(overrides) + return defaults + + def test_schema_version_2(self): + report = build_report(**self._base_kwargs()) + assert report["schema_version"] == 2 + + def test_basic_fields(self): + report = build_report(**self._base_kwargs()) + assert report["model"]["architecture"] == "LlamaForCausalLM" + assert 
report["model"]["num_layers"] == 32 + assert report["model"]["total_params"] == 8_000_000_000 + assert report["method"] == "advanced" + assert report["quality_metrics"]["refusal_rate"] == 0.05 + assert len(report["session_id"]) == 32 + + def test_filters_unknown_config_keys(self): + report = build_report(**self._base_kwargs( + method_config={"n_directions": 1, "secret_flag": True, "nuke": "boom"}, + )) + assert "n_directions" in report["method_config"] + assert "secret_flag" not in report["method_config"] + assert "nuke" not in report["method_config"] + + def test_allows_all_valid_config_keys(self): + """Every key in the allowlist should pass through.""" + config = {k: True for k in _ALLOWED_METHOD_CONFIG_KEYS} + report = build_report(**self._base_kwargs(method_config=config)) + for k in _ALLOWED_METHOD_CONFIG_KEYS: + assert k in report["method_config"], f"Missing allowlisted key: {k}" + + def test_no_model_name_in_report(self): + report = build_report(**self._base_kwargs()) + report_str = json.dumps(report) + assert "meta-llama" not in report_str + assert "Llama-3" not in report_str + + def test_environment_info(self): + report = build_report(**self._base_kwargs()) + env = report["environment"] + assert "python_version" in env + assert "os" in env + assert "arch" in env + + def test_stage_durations(self): + durations = {"summon": 2.5, "probe": 10.1, "distill": 3.2} + report = build_report(**self._base_kwargs(stage_durations=durations)) + assert report["stage_durations"] == durations + + def test_direction_stats(self): + stats = {"direction_norms": {"10": 0.95}, "mean_direction_persistence": 0.87} + report = build_report(**self._base_kwargs(direction_stats=stats)) + assert report["direction_stats"]["mean_direction_persistence"] == 0.87 + + def test_excise_details(self): + details = {"modified_count": 128, "used_techniques": ["head_surgery"]} + report = build_report(**self._base_kwargs(excise_details=details)) + assert report["excise_details"]["modified_count"] == 
128 + + def test_prompt_counts(self): + counts = {"harmful": 33, "harmless": 33, "jailbreak": 15} + report = build_report(**self._base_kwargs(prompt_counts=counts)) + assert report["prompt_counts"]["harmful"] == 33 + assert report["prompt_counts"]["jailbreak"] == 15 + + def test_gpu_memory(self): + mem = {"peak_allocated_gb": 7.2, "peak_reserved_gb": 8.0} + report = build_report(**self._base_kwargs(gpu_memory=mem)) + assert report["gpu_memory"]["peak_allocated_gb"] == 7.2 + + def test_analysis_insights_filtered(self): + """Only allowlisted analysis keys should pass through.""" + insights = { + "detected_alignment_method": "DPO", + "alignment_confidence": 0.92, + "secret_internal_data": "should not appear", + } + report = build_report(**self._base_kwargs(analysis_insights=insights)) + assert report["analysis_insights"]["detected_alignment_method"] == "DPO" + assert "secret_internal_data" not in report["analysis_insights"] + + def test_informed_extras(self): + extras = {"ouroboros_passes": 3, "final_refusal_rate": 0.02, "total_duration": 120.5} + report = build_report(**self._base_kwargs(informed_extras=extras)) + assert report["informed"]["ouroboros_passes"] == 3 + + def test_optional_fields_omitted_when_empty(self): + """Optional fields should not appear when not provided.""" + report = build_report(**self._base_kwargs()) + assert "stage_durations" not in report + assert "direction_stats" not in report + assert "excise_details" not in report + assert "prompt_counts" not in report + assert "gpu_memory" not in report + assert "analysis_insights" not in report + assert "informed" not in report + + +# ── Direction stats extraction ────────────────────────────────────────── + + +class TestDirectionStats: + """Test direction quality metric extraction.""" + + def test_direction_norms(self): + pipeline = MagicMock() + pipeline.refusal_directions = { + 0: torch.randn(128), + 1: torch.randn(128), + } + pipeline.refusal_subspaces = {} + stats = _direction_stats(pipeline) + 
assert "direction_norms" in stats + assert "0" in stats["direction_norms"] + assert "1" in stats["direction_norms"] + + def test_direction_persistence(self): + """Adjacent layers with similar directions should have high persistence.""" + d = torch.randn(128) + d = d / d.norm() + pipeline = MagicMock() + pipeline.refusal_directions = {0: d, 1: d + 0.01 * torch.randn(128)} + pipeline.refusal_subspaces = {} + stats = _direction_stats(pipeline) + assert "mean_direction_persistence" in stats + assert stats["mean_direction_persistence"] > 0.9 + + def test_effective_rank(self): + """Multi-direction subspace should yield effective rank > 1.""" + pipeline = MagicMock() + pipeline.refusal_directions = {0: torch.randn(128)} + # 4-direction subspace with distinct directions + sub = torch.randn(4, 128) + pipeline.refusal_subspaces = {0: sub} + stats = _direction_stats(pipeline) + assert "effective_ranks" in stats + assert float(stats["effective_ranks"]["0"]) > 1.0 + + def test_empty_directions(self): + pipeline = MagicMock() + pipeline.refusal_directions = {} + pipeline.refusal_subspaces = {} + stats = _direction_stats(pipeline) + assert stats == {} + + +# ── Excise details extraction ─────────────────────────────────────────── + + +class TestExciseDetails: + def test_basic_excise_details(self): + pipeline = MagicMock() + pipeline._excise_modified_count = 64 + pipeline._refusal_heads = {10: [(0, 0.9), (3, 0.8)], 11: [(1, 0.7)]} + pipeline._sae_directions = {} + pipeline._expert_safety_scores = {} + pipeline._layer_excise_weights = {} + pipeline._expert_directions = {} + pipeline._steering_hooks = [] + pipeline.invert_refusal = False + pipeline.project_embeddings = False + pipeline.activation_steering = False + pipeline.expert_transplant = False + + details = _extract_excise_details(pipeline) + assert details["modified_count"] == 64 + assert details["head_surgery_layers"] == 2 + assert details["total_heads_projected"] == 3 + assert "head_surgery" in details["used_techniques"] + 
+ def test_adaptive_weights(self): + pipeline = MagicMock() + pipeline._excise_modified_count = None + pipeline._refusal_heads = {} + pipeline._sae_directions = {} + pipeline._expert_safety_scores = {} + pipeline._layer_excise_weights = {0: 0.2, 1: 0.8, 2: 0.5} + pipeline._expert_directions = {} + pipeline._steering_hooks = [] + pipeline.invert_refusal = False + pipeline.project_embeddings = False + pipeline.activation_steering = False + pipeline.expert_transplant = False + + details = _extract_excise_details(pipeline) + assert details["adaptive_weight_min"] == 0.2 + assert details["adaptive_weight_max"] == 0.8 + assert "layer_adaptive" in details["used_techniques"] + + +# ── Prompt counts extraction ──────────────────────────────────────────── + + +class TestPromptCounts: + def test_basic_counts(self): + pipeline = MagicMock() + pipeline.harmful_prompts = ["a"] * 33 + pipeline.harmless_prompts = ["b"] * 33 + pipeline.jailbreak_prompts = None + counts = _extract_prompt_counts(pipeline) + assert counts["harmful"] == 33 + assert counts["harmless"] == 33 + assert "jailbreak" not in counts + + def test_with_jailbreak(self): + pipeline = MagicMock() + pipeline.harmful_prompts = ["a"] * 33 + pipeline.harmless_prompts = ["b"] * 33 + pipeline.jailbreak_prompts = ["c"] * 10 + counts = _extract_prompt_counts(pipeline) + assert counts["jailbreak"] == 10 + + +# ── Send behavior ─────────────────────────────────────────────────────── + + +class TestSendReport: + def setup_method(self): + _reset_telemetry() + + def test_does_not_send_when_disabled(self): + disable_telemetry() + with patch("obliteratus.telemetry._send_sync") as mock_send: + send_report({"test": True}) + mock_send.assert_not_called() + + def test_sends_when_enabled(self): + enable_telemetry() + with patch("obliteratus.telemetry._send_sync") as mock_send: + send_report({"test": True}) + import time + time.sleep(0.1) + mock_send.assert_called_once_with({"test": True}) + + def test_send_failure_is_silent(self): + 
enable_telemetry() + with patch("obliteratus.telemetry._send_sync", side_effect=Exception("network down")) as mock_send: + # send_report should not propagate the exception to the caller + send_report({"test": True}) + import time + time.sleep(0.1) # Allow background thread to execute + mock_send.assert_called_once_with({"test": True}) + + +# ── Pipeline integration ──────────────────────────────────────────────── + + +def _make_mock_pipeline(): + """Build a mock pipeline with all fields the telemetry module reads.""" + p = MagicMock() + p.handle.summary.return_value = { + "architecture": "LlamaForCausalLM", + "num_layers": 32, + "num_heads": 32, + "hidden_size": 4096, + "total_params": 8_000_000_000, + } + p.method = "advanced" + p.n_directions = 4 + p.norm_preserve = True + p.regularization = 0.1 + p.refinement_passes = 2 + p.project_biases = True + p.use_chat_template = True + p.use_whitened_svd = True + p.true_iterative_refinement = False + p.use_jailbreak_contrast = False + p.layer_adaptive_strength = False + p.attention_head_surgery = True + p.safety_neuron_masking = False + p.per_expert_directions = False + p.use_sae_features = False + p.invert_refusal = False + p.project_embeddings = False + p.embed_regularization = 0.5 + p.activation_steering = False + p.steering_strength = 0.3 + p.expert_transplant = False + p.transplant_blend = 0.3 + p.reflection_strength = 2.0 + p.quantization = None + + p._quality_metrics = {"perplexity": 5.2, "coherence": 0.8, "refusal_rate": 0.05} + p._strong_layers = [10, 11, 12, 13] + p._stage_durations = {"summon": 3.0, "probe": 12.5, "distill": 4.1, "excise": 2.0, "verify": 8.3, "rebirth": 5.0} + p._excise_modified_count = 128 + + # Direction data + d = torch.randn(4096) + d = d / d.norm() + p.refusal_directions = {10: d, 11: d + 0.01 * torch.randn(4096), 12: d, 13: d} + p.refusal_subspaces = {10: torch.randn(4, 4096)} + + # Excise details + p._refusal_heads = {10: [(0, 0.9), (3, 0.8)]} + p._sae_directions = {} + 
p._expert_safety_scores = {} + p._layer_excise_weights = {} + p._expert_directions = {} + p._steering_hooks = [] + + # Prompts + p.harmful_prompts = ["x"] * 33 + p.harmless_prompts = ["y"] * 33 + p.jailbreak_prompts = None + + return p + + +class TestPipelineIntegration: + def setup_method(self): + _reset_telemetry() + + def test_does_nothing_when_disabled(self): + disable_telemetry() + with patch("obliteratus.telemetry.send_report") as mock_send: + maybe_send_pipeline_report(_make_mock_pipeline()) + mock_send.assert_not_called() + + def test_comprehensive_report(self): + """Verify that all data points are extracted from the pipeline.""" + enable_telemetry() + p = _make_mock_pipeline() + with patch("obliteratus.telemetry.send_report") as mock_send: + maybe_send_pipeline_report(p) + mock_send.assert_called_once() + report = mock_send.call_args[0][0] + + # Core fields + assert report["schema_version"] == 2 + assert report["model"]["architecture"] == "LlamaForCausalLM" + assert report["method"] == "advanced" + + # Method config — check all keys passed through + cfg = report["method_config"] + assert cfg["n_directions"] == 4 + assert cfg["norm_preserve"] is True + assert cfg["use_whitened_svd"] is True + assert cfg["attention_head_surgery"] is True + + # Quality metrics + assert report["quality_metrics"]["perplexity"] == 5.2 + assert report["quality_metrics"]["refusal_rate"] == 0.05 + + # Stage durations + assert "stage_durations" in report + assert report["stage_durations"]["summon"] == 3.0 + assert report["stage_durations"]["verify"] == 8.3 + + # Strong layers + assert report["strong_layers"] == [10, 11, 12, 13] + + # Direction stats + assert "direction_stats" in report + assert "direction_norms" in report["direction_stats"] + assert "mean_direction_persistence" in report["direction_stats"] + + # Excise details + assert "excise_details" in report + assert report["excise_details"]["modified_count"] == 128 + assert "head_surgery" in 
report["excise_details"]["used_techniques"] + + # Prompt counts + assert report["prompt_counts"]["harmful"] == 33 + assert report["prompt_counts"]["harmless"] == 33 + + # Environment + assert "os" in report["environment"] + assert "python_version" in report["environment"] + + +# ── Informed pipeline integration ──────────────────────────────────────── + + +@dataclass +class _MockInsights: + detected_alignment_method: str = "DPO" + alignment_confidence: float = 0.92 + alignment_probabilities: dict = field(default_factory=lambda: {"DPO": 0.92, "RLHF": 0.05}) + cone_is_polyhedral: bool = True + cone_dimensionality: float = 3.2 + mean_pairwise_cosine: float = 0.45 + direction_specificity: dict = field(default_factory=lambda: {"violence": 0.8}) + cluster_count: int = 3 + direction_persistence: float = 0.87 + mean_refusal_sparsity_index: float = 0.15 + recommended_sparsity: float = 0.1 + use_sparse_surgery: bool = True + estimated_robustness: str = "medium" + self_repair_estimate: float = 0.3 + entanglement_score: float = 0.2 + entangled_layers: list = field(default_factory=lambda: [15, 16]) + clean_layers: list = field(default_factory=lambda: [10, 11, 12]) + recommended_n_directions: int = 6 + recommended_regularization: float = 0.05 + recommended_refinement_passes: int = 3 + recommended_layers: list = field(default_factory=lambda: [10, 11, 12, 13]) + skip_layers: list = field(default_factory=lambda: [15]) + + +@dataclass +class _MockInformedReport: + insights: _MockInsights = field(default_factory=_MockInsights) + ouroboros_passes: int = 2 + final_refusal_rate: float = 0.02 + analysis_duration: float = 15.3 + total_duration: float = 85.7 + + +class TestInformedPipelineIntegration: + def setup_method(self): + _reset_telemetry() + + def test_does_nothing_when_disabled(self): + disable_telemetry() + with patch("obliteratus.telemetry.send_report") as mock_send: + maybe_send_informed_report(_make_mock_pipeline(), _MockInformedReport()) + mock_send.assert_not_called() + + 
def test_comprehensive_informed_report(self): + enable_telemetry() + p = _make_mock_pipeline() + report_obj = _MockInformedReport() + + with patch("obliteratus.telemetry.send_report") as mock_send: + maybe_send_informed_report(p, report_obj) + mock_send.assert_called_once() + report = mock_send.call_args[0][0] + + # All base fields present + assert report["schema_version"] == 2 + assert report["model"]["architecture"] == "LlamaForCausalLM" + assert "direction_stats" in report + assert "excise_details" in report + + # Analysis insights + ai = report["analysis_insights"] + assert ai["detected_alignment_method"] == "DPO" + assert ai["alignment_confidence"] == 0.92 + assert ai["cone_is_polyhedral"] is True + assert ai["cone_dimensionality"] == 3.2 + assert ai["cluster_count"] == 3 + assert ai["self_repair_estimate"] == 0.3 + assert ai["entanglement_score"] == 0.2 + assert ai["recommended_n_directions"] == 6 + + # Informed extras + inf = report["informed"] + assert inf["ouroboros_passes"] == 2 + assert inf["final_refusal_rate"] == 0.02 + assert inf["analysis_duration"] == 15.3 + assert inf["total_duration"] == 85.7 + + def test_analysis_insights_filter_unknown_keys(self): + enable_telemetry() + _make_mock_pipeline() + + @dataclass + class _BadInsights(_MockInsights): + secret_sauce: str = "should not appear" + + report_obj = _MockInformedReport(insights=_BadInsights()) + insights = _extract_analysis_insights(report_obj) + assert "detected_alignment_method" in insights + assert "secret_sauce" not in insights + + +# ── Stage duration tracking on pipeline ────────────────────────────────── + + +class TestStageDurationTracking: + def test_emit_records_durations(self): + """Verify _emit stores durations in _stage_durations dict.""" + from obliteratus.abliterate import AbliterationPipeline + + p = AbliterationPipeline.__new__(AbliterationPipeline) + p._stage_durations = {} + p._excise_modified_count = None + p._on_stage = lambda r: None + + p._emit("summon", "done", "loaded", 
duration=3.5) + p._emit("probe", "done", "probed", duration=10.2) + p._emit("excise", "done", "excised", duration=2.1, modified_count=64) + + assert p._stage_durations == {"summon": 3.5, "probe": 10.2, "excise": 2.1} + assert p._excise_modified_count == 64 + + def test_running_status_does_not_record(self): + """Only 'done' status should record durations.""" + from obliteratus.abliterate import AbliterationPipeline + + p = AbliterationPipeline.__new__(AbliterationPipeline) + p._stage_durations = {} + p._excise_modified_count = None + p._on_stage = lambda r: None + + p._emit("summon", "running", "loading...", duration=0) + assert p._stage_durations == {} + + +# ── Storage helpers ────────────────────────────────────────────────────── + + +class TestStorageHelpers: + """Test persistent storage helper functions.""" + + def test_test_writable_valid_dir(self): + with tempfile.TemporaryDirectory() as d: + assert _test_writable(Path(d) / "subdir") + + def test_test_writable_unwritable(self): + # /proc is never writable for arbitrary files + assert not _test_writable(Path("/proc/obliteratus_test")) + + def test_is_mount_point_existing_path(self): + # Should return a bool without raising for any existing path + result = _is_mount_point(Path("/")) + assert isinstance(result, bool) + + def test_is_mount_point_nonexistent(self): + assert not _is_mount_point(Path("/nonexistent_dir_12345")) + + def test_storage_diagnostic_returns_dict(self): + diag = storage_diagnostic() + assert isinstance(diag, dict) + assert "telemetry_dir" in diag + assert "is_persistent" in diag + assert "on_hf_spaces" in diag + assert "telemetry_enabled" in diag + assert "data_dir_exists" in diag + + +# ── Hub restore ────────────────────────────────────────────────────────── + + +class TestHubRestore: + """Test Hub-to-local restore functionality.""" + + def setup_method(self): + _reset_telemetry() + # Reset restore state so each test can trigger it + import obliteratus.telemetry as t + t._restore_done = 
False + + def test_restore_skips_when_no_repo(self): + with patch("obliteratus.telemetry._TELEMETRY_REPO", ""): + assert restore_from_hub() == 0 + + def test_restore_deduplicates(self): + """Records already in local JSONL should not be re-added.""" + import obliteratus.telemetry as t + + with tempfile.TemporaryDirectory() as d: + test_file = Path(d) / "telemetry.jsonl" + existing = {"session_id": "abc", "timestamp": "2025-01-01T00:00:00"} + test_file.write_text(json.dumps(existing) + "\n") + + old_file = t.TELEMETRY_FILE + old_repo = t._TELEMETRY_REPO + t.TELEMETRY_FILE = test_file + t._TELEMETRY_REPO = "test/repo" + t._restore_done = False + + try: + hub_records = [ + {"session_id": "abc", "timestamp": "2025-01-01T00:00:00"}, # duplicate + {"session_id": "def", "timestamp": "2025-01-02T00:00:00"}, # new + ] + with patch("obliteratus.telemetry.fetch_hub_records", return_value=hub_records): + count = restore_from_hub() + assert count == 1 # Only the new record + + # Verify file contents + lines = test_file.read_text().strip().split("\n") + assert len(lines) == 2 # original + 1 new + finally: + t.TELEMETRY_FILE = old_file + t._TELEMETRY_REPO = old_repo + + def test_restore_only_runs_once(self): + """Calling restore_from_hub() twice should be a no-op the second time.""" + import obliteratus.telemetry as t + t._restore_done = False + + with patch("obliteratus.telemetry._TELEMETRY_REPO", "test/repo"): + with patch("obliteratus.telemetry.fetch_hub_records", return_value=[]): + restore_from_hub() + # Second call should return 0 immediately + assert restore_from_hub() == 0 diff --git a/tests/test_visualization.py b/tests/test_visualization.py new file mode 100644 index 0000000..a25b7d3 --- /dev/null +++ b/tests/test_visualization.py @@ -0,0 +1,167 @@ +"""Tests for visualization module (non-interactive, save-to-file).""" + +from __future__ import annotations + +import tempfile +from pathlib import Path + +import pytest +import torch + +from obliteratus.analysis.cross_layer 
import CrossLayerAlignmentAnalyzer +from obliteratus.analysis.activation_probing import ActivationProbe +from obliteratus.analysis.visualization import ( + _sanitize_label, + plot_refusal_topology, + plot_cross_layer_heatmap, + plot_angular_drift, + plot_probe_dashboard, + plot_defense_radar, +) +from obliteratus.analysis.defense_robustness import DefenseProfile + + +@pytest.fixture +def tmp_dir(): + with tempfile.TemporaryDirectory() as d: + yield Path(d) + + +def _make_refusal_data(n_layers=6, hidden_dim=16): + """Create test refusal directions and means.""" + torch.manual_seed(42) + directions = {} + harmful_means = {} + harmless_means = {} + + for i in range(n_layers): + d = torch.randn(hidden_dim) + directions[i] = d / d.norm() + base = torch.randn(hidden_dim) + harmless_means[i] = base.unsqueeze(0) + harmful_means[i] = (base + (2.0 if i in [2, 3, 4] else 0.3) * directions[i]).unsqueeze(0) + + strong_layers = [2, 3, 4] + return directions, harmful_means, harmless_means, strong_layers + + +class TestRefusalTopology: + def test_plot_saves_file(self, tmp_dir): + directions, h_means, b_means, strong = _make_refusal_data() + path = tmp_dir / "topology.png" + plot_refusal_topology( + directions, h_means, b_means, strong, output_path=path + ) + assert path.exists() + assert path.stat().st_size > 0 + + def test_plot_returns_figure(self, tmp_dir): + directions, h_means, b_means, strong = _make_refusal_data() + fig = plot_refusal_topology( + directions, h_means, b_means, strong, output_path=tmp_dir / "test.png" + ) + assert fig is not None + + +class TestCrossLayerHeatmap: + def test_plot_saves_file(self, tmp_dir): + torch.manual_seed(42) + directions = {i: torch.randn(16) for i in range(6)} + analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze(directions) + + path = tmp_dir / "heatmap.png" + plot_cross_layer_heatmap(result, output_path=path) + assert path.exists() + + +class TestAngularDrift: + def test_plot_saves_file(self, tmp_dir): + 
torch.manual_seed(42) + directions = {i: torch.randn(16) for i in range(8)} + analyzer = CrossLayerAlignmentAnalyzer() + result = analyzer.analyze(directions) + + path = tmp_dir / "drift.png" + plot_angular_drift(result, output_path=path) + assert path.exists() + + +class TestProbeDashboard: + def test_plot_saves_file(self, tmp_dir): + torch.manual_seed(42) + harmful = {i: [torch.randn(8) for _ in range(3)] for i in range(4)} + harmless = {i: [torch.randn(8) for _ in range(3)] for i in range(4)} + dirs = {i: torch.randn(8) for i in range(4)} + + probe = ActivationProbe() + result = probe.probe_all_layers(harmful, harmless, dirs) + + path = tmp_dir / "probe.png" + plot_probe_dashboard(result, output_path=path) + assert path.exists() + + +class TestDefenseRadar: + def test_plot_saves_file(self, tmp_dir): + profile = DefenseProfile( + model_name="test-model", + alignment_type_estimate="RLHF-like", + refusal_concentration=0.4, + refusal_layer_spread=5, + mean_refusal_strength=2.0, + max_refusal_strength=4.0, + self_repair_estimate=0.6, + entanglement_score=0.3, + estimated_robustness="medium", + ) + path = tmp_dir / "radar.png" + plot_defense_radar(profile, output_path=path) + assert path.exists() + + def test_model_name_sanitized_in_title(self, tmp_dir): + """Ensure sensitive paths in model_name don't leak into saved charts.""" + profile = DefenseProfile( + model_name="/home/user/.cache/huggingface/hub/models--secret-org/private-model", + alignment_type_estimate="RLHF-like", + refusal_concentration=0.4, + refusal_layer_spread=5, + mean_refusal_strength=2.0, + max_refusal_strength=4.0, + self_repair_estimate=0.6, + entanglement_score=0.3, + estimated_robustness="medium", + ) + path = tmp_dir / "radar_sanitized.png" + fig = plot_defense_radar(profile, output_path=path) + # Title should not contain the full filesystem path + title_text = fig.axes[0].get_title() + assert "/home/user" not in title_text + assert ".cache" not in title_text + + +class TestSanitizeLabel: + def 
test_strips_absolute_paths(self): + result = _sanitize_label("/home/user/.cache/huggingface/models--org/model") + assert "/home/user" not in result + assert "model" in result + + def test_redacts_hf_tokens(self): + result = _sanitize_label("model with hf_abcdefghij token") + assert "hf_abcdefghij" not in result + assert "" in result + + def test_redacts_long_hex_strings(self): + hex_str = "a" * 40 + result = _sanitize_label(f"commit {hex_str}") + assert hex_str not in result + assert "" in result + + def test_truncates_long_strings(self): + long = "x" * 200 + result = _sanitize_label(long) + assert len(result) <= 80 + assert result.endswith("...") + + def test_passes_normal_strings_through(self): + assert _sanitize_label("Refusal Topology Map") == "Refusal Topology Map"