Add files via upload

This commit is contained in:
pliny
2026-03-05 00:50:44 -08:00
committed by GitHub
parent 4cddc6399a
commit 66ea4a6f86
37 changed files with 10756 additions and 45 deletions
+1 -1
View File
@@ -5,7 +5,7 @@ Thanks for your interest in contributing. This document covers everything you ne
## Development Setup
```bash
git clone https://github.com/obliteratus-project/OBLITERATUS.git
git clone https://github.com/elder-plinius/OBLITERATUS.git
cd OBLITERATUS
pip install -e ".[dev]"
```
+5 -5
View File
@@ -28,7 +28,7 @@ short_description: "One-click model liberation + chat playground"
<img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue" alt="Open in HF Spaces">
</a>
&nbsp;
<a href="https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb">
<a href="https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab">
</a>
</p>
@@ -55,7 +55,7 @@ Built on published research from [Arditi et al. (2024)](https://arxiv.org/abs/24
obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --method advanced
```
Or zero commands — just [open the Colab notebook](https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb) and hit Run All.
Or zero commands — just [open the Colab notebook](https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb) and hit Run All.
## What it does
@@ -153,7 +153,7 @@ The `obliteratus ui` command adds a Rich terminal startup with GPU detection and
### 3. Google Colab (free GPU)
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb)
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb)
Pick a model from the dropdown, pick a method, hit Run All. Download the result or push straight to HuggingFace Hub. Works on the free T4 tier for models up to ~8B parameters.
@@ -545,7 +545,7 @@ If you use OBLITERATUS in your research, please cite:
Refusal Removal in Large Language Models},
author = {{OBLITERATUS Contributors}},
year = {2026},
url = {https://github.com/obliteratus-project/OBLITERATUS},
url = {https://github.com/elder-plinius/OBLITERATUS},
note = {15 analysis modules, 837 tests}
}
```
@@ -565,7 +565,7 @@ pytest
- **Open source** — [GNU Affero General Public License v3.0](LICENSE) (AGPL-3.0). You can freely use, modify, and distribute OBLITERATUS under AGPL terms. If you run a modified version as a network service (SaaS), you must release your source code to users under the same license.
- **Commercial** — Organizations that cannot comply with AGPL obligations (e.g., proprietary SaaS, closed-source products, internal tools where source disclosure is not possible) can purchase a commercial license. Contact us via [GitHub Issues](https://github.com/obliteratus-project/OBLITERATUS/issues) for pricing and terms.
- **Commercial** — Organizations that cannot comply with AGPL obligations (e.g., proprietary SaaS, closed-source products, internal tools where source disclosure is not possible) can purchase a commercial license. Contact us via [GitHub Issues](https://github.com/elder-plinius/OBLITERATUS/issues) for pricing and terms.
This is the same dual-licensing model used by MongoDB, Qt, Grafana, and others.
+1 -1
View File
@@ -11,7 +11,7 @@ OBLITERATUS is a mechanistic interpretability research tool. It removes refusal
If you discover a security vulnerability in OBLITERATUS, please report it responsibly:
1. **Do not** open a public GitHub issue
2. Open a [private security advisory](https://github.com/obliteratus-project/OBLITERATUS/security/advisories/new) with:
2. Open a [private security advisory](https://github.com/elder-plinius/OBLITERATUS/security/advisories/new) with:
- Description of the vulnerability
- Steps to reproduce
- Potential impact
+50 -25
View File
@@ -115,6 +115,10 @@ _last_obliterated_label: str = ""
# Counter for unique obliteration save directories
_obliterate_counter: int = 0
# Flag to suppress session_model_dd.change when obliterate programmatically
# sets the dropdown value (prevents wasteful GPU re-allocation on ZeroGPU)
_skip_session_load: bool = False
# ---------------------------------------------------------------------------
# Model presets — 100+ models organized by provider
# ---------------------------------------------------------------------------
@@ -1459,7 +1463,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
f" or locally: `export HF_TOKEN=hf_...`\n\n"
f"Get your token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)\n\n"
f"Alternatively, choose a non-gated model (those without the \U0001f512 icon).",
"", gr.update(), gr.update(), gr.update(),
"", gr.update(), gr.update(), gr.update(), gr.update(),
)
return
@@ -1468,14 +1472,14 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
if not re.match(r'^[a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+$', push_to_hub):
yield (
"**Error:** Invalid Hub repo format. Use `username/model-name`.",
"", gr.update(), gr.update(), gr.update(),
"", gr.update(), gr.update(), gr.update(), gr.update(),
)
return
if not os.environ.get("HF_TOKEN"):
yield (
"**Error:** HF_TOKEN not set. Push to Hub requires a write token. "
"Set it via `export HF_TOKEN=hf_...` or in your Space secrets.",
"", gr.update(), gr.update(), gr.update(),
"", gr.update(), gr.update(), gr.update(), gr.update(),
)
return
@@ -1486,7 +1490,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
_clear_gpu()
with _lock:
if _state["status"] == "obliterating":
yield "**Error:** An obliteration is already in progress.", "", gr.update(), gr.update(), gr.update()
yield "**Error:** An obliteration is already in progress.", "", gr.update(), gr.update(), gr.update(), gr.update()
return
_state["log"] = []
_state["status"] = "obliterating"
@@ -1638,9 +1642,9 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
status_msg = f"**Obliterating\u2026** ({_elapsed()})"
if len(log_lines) > last_yielded[0]:
last_yielded[0] = len(log_lines)
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
else:
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
if time.time() - _pipeline_start > _max_pipeline_secs:
log_lines.append("\nTIMEOUT: Pipeline exceeded 45-minute limit.")
break
@@ -1655,7 +1659,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
err_msg = str(error_ref[0]) or repr(error_ref[0])
log_lines.append(f"\nERROR: {err_msg}")
_state["log"] = log_lines
yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update()
yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update(), gr.update()
return
# Success — keep model in memory for chat.
@@ -1757,7 +1761,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
if bnb_available:
log_lines.append("\nModel too large for chat at float16 — reloading in 4-bit...")
last_yielded[0] = len(log_lines)
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
try:
from transformers import BitsAndBytesConfig
bnb_cfg = BitsAndBytesConfig(
@@ -1804,7 +1808,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
else "Falling back to CPU offload..."
)
last_yielded[0] = len(log_lines)
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
try:
offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
model_reloaded = AutoModelForCausalLM.from_pretrained(
@@ -1861,13 +1865,21 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
f"**{model_choice}** liberated with `{method}` method. "
f"Saved to `{save_dir}`. Chat requires a larger GPU."
)
# Update session dropdown directly (don't rely on .then() which can
# fail to fire on ZeroGPU after generator teardown)
# Update BOTH session dropdowns directly (don't rely on .then() which
# fails to fire on ZeroGPU after generator teardown).
# Set skip flag so the .change handler doesn't trigger a wasteful
# GPU re-allocation — the model is already loaded.
global _skip_session_load
_skip_session_load = True
_dd_update = gr.update(
choices=_get_session_model_choices(),
value=_last_obliterated_label or None,
)
yield status_msg, "\n".join(log_lines), get_chat_header(), _dd_update, metrics_card
_ab_dd_update = gr.update(
choices=_get_session_model_choices(),
value=_last_obliterated_label or None,
)
yield status_msg, "\n".join(log_lines), get_chat_header(), _dd_update, metrics_card, _ab_dd_update
except Exception as e:
# Ensure status never gets stuck on "obliterating"
@@ -1876,7 +1888,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
err_msg = str(e) or repr(e)
log_lines.append(f"\nERROR (post-pipeline): {err_msg}")
_state["log"] = log_lines
yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update()
yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update(), gr.update()
# ---------------------------------------------------------------------------
@@ -2102,6 +2114,18 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
On ZeroGPU, uses the visitor's GPU quota.
"""
# Skip if the obliterate function just set the dropdown value — the model
# is already loaded and we'd just waste GPU quota re-allocating.
global _skip_session_load
if _skip_session_load:
_skip_session_load = False
if choice and _state.get("status") == "ready":
yield (
f"**Ready!** `{choice}` is loaded — just type in the chat below.",
get_chat_header(),
)
return
if not choice or choice not in _bench_configs:
yield "**Error:** No benchmark result selected. Pick a model from the dropdown first.", ""
return
@@ -3727,6 +3751,7 @@ Pre-configured benchmark configurations for common research questions.
choices=_get_session_model_choices(),
label="Cached Models",
info="Select a model to auto-load it for chat",
allow_custom_value=True,
)
session_load_status = gr.Markdown("")
@@ -3779,6 +3804,7 @@ See exactly how abliteration changes model behavior on the same prompt.
choices=_get_session_model_choices(),
label="Cached Models",
info="Select a model to auto-load it for A/B comparison",
allow_custom_value=True,
)
ab_session_load_status = gr.Markdown("")
@@ -4125,8 +4151,8 @@ Built on the shoulders of:
### Links
- [GitHub](https://github.com/obliteratus-project/OBLITERATUS)
- [Paper](https://github.com/obliteratus-project/OBLITERATUS/tree/main/paper)
- [GitHub](https://github.com/elder-plinius/OBLITERATUS)
- [Paper](https://github.com/elder-plinius/OBLITERATUS/tree/main/paper)
""")
# Wire method dropdown → auto-update advanced settings
@@ -4192,28 +4218,27 @@ Built on the shoulders of:
).then(fn=_get_vram_html, outputs=[vram_display])
# Wire obliterate button (after all tabs so chat_status is defined)
# session_model_dd is a direct output (4th) so the dropdown updates
# reliably even on ZeroGPU where .then() may not fire after generator teardown.
# Both session_model_dd (4th) and ab_session_model_dd (6th) are direct
# outputs so the dropdowns update reliably even on ZeroGPU where .then()
# may not fire after generator teardown.
obliterate_btn.click(
fn=obliterate,
inputs=[model_dd, method_dd, hub_repo, prompt_vol_dd, dataset_dd,
custom_harmful_tb, custom_harmless_tb] + _adv_controls,
outputs=[status_md, log_box, chat_status, session_model_dd, metrics_md],
outputs=[status_md, log_box, chat_status, session_model_dd, metrics_md, ab_session_model_dd],
).then(
fn=lambda: (
gr.update(choices=_get_session_model_choices()),
_get_vram_html(),
),
outputs=[ab_session_model_dd, vram_display],
fn=lambda: _get_vram_html(),
outputs=[vram_display],
)
# Wire session model auto-loading (Chat tab dropdown change)
# Always pass choices + value together so ZeroGPU doesn't hit stale choices
session_model_dd.change(
fn=load_bench_into_chat,
inputs=[session_model_dd],
outputs=[session_load_status, chat_status],
).then(
fn=lambda v: (gr.update(value=v), _get_vram_html()),
fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
inputs=[session_model_dd],
outputs=[ab_session_model_dd, vram_display],
)
@@ -4224,7 +4249,7 @@ Built on the shoulders of:
inputs=[ab_session_model_dd],
outputs=[ab_session_load_status, chat_status],
).then(
fn=lambda v: (gr.update(value=v), _get_vram_html()),
fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
inputs=[ab_session_model_dd],
outputs=[session_model_dd, vram_display],
)
+3 -3
View File
@@ -1095,7 +1095,7 @@
<h2>&gt; Quickstart: Free a Model</h2>
<div style="background:#000; padding:16px; border:1px solid var(--border); margin-top:12px; line-height:2; font-size:0.78rem;">
<span style="color:var(--text-dim)"># 1. get the liberation toolkit</span><br>
<span style="color:var(--accent)">$</span> git clone https://github.com/obliteratus-project/OBLITERATUS<br>
<span style="color:var(--accent)">$</span> git clone https://github.com/elder-plinius/OBLITERATUS<br>
<span style="color:var(--accent)">$</span> cd OBLITERATUS<br>
<span style="color:var(--accent)">$</span> pip install -e .<br><br>
<span style="color:var(--text-dim)"># 2. interactive mode (guided liberation)</span><br>
@@ -1154,7 +1154,7 @@
<div style="margin-bottom:16px; padding:12px; border-left:3px solid var(--yellow); background:rgba(255,183,0,0.03)">
<h4 style="color:var(--yellow); font-size:0.82rem">Concept Cone Geometry <span style="font-size:0.65rem; color:var(--red)">[NOVEL]</span></h4>
<p style="color:var(--text-dim); font-size:0.75rem; margin-top:4px">
Analyzes whether different harm categories (weapons, cyber, drugs, etc.) share a single refusal direction or have distinct mechanisms. Computes cone solid angles, Direction Specificity Index, and polyhedral classification. Based on Gurnee &amp; Nanda (ICML 2025) with novel extensions.
Analyzes whether different harm categories (weapons, cyber, drugs, etc.) share a single refusal direction or have distinct mechanisms. Computes cone solid angles, Direction Specificity Index, and polyhedral classification. Based on Wollschlager et al. (ICML 2025) with novel extensions.
</p>
</div>
<div style="margin-bottom:16px; padding:12px; border-left:3px solid var(--yellow); background:rgba(255,183,0,0.03)">
@@ -1397,7 +1397,7 @@
<div style="margin-bottom:16px; padding:16px; background:linear-gradient(135deg, rgba(249,171,0,0.08), rgba(249,171,0,0.02)); border:1px solid rgba(249,171,0,0.3); border-radius:6px">
<div style="font-size:0.82rem; font-weight:700; color:var(--yellow); margin-bottom:8px; letter-spacing:0.5px">&#9656; COLAB NOTEBOOK</div>
<div style="display:flex; align-items:center; gap:12px; flex-wrap:wrap">
<a id="colab-link" href="https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb" target="_blank" rel="noopener"
<a id="colab-link" href="https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb" target="_blank" rel="noopener"
style="display:inline-flex; align-items:center; gap:8px; background:#f9ab00; color:#000; padding:10px 20px; font-weight:700; font-size:0.85rem; text-decoration:none; border-radius:4px; letter-spacing:0.5px; font-family:'Fira Code',monospace">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="" style="height:20px; vertical-align:middle">
OPEN IN COLAB
+3 -3
View File
@@ -50,7 +50,7 @@ Logged-in HuggingFace users get free GPU quota. For more quota, upgrade to [HF P
## Run locally (same UI, your own GPU)
```bash
git clone https://github.com/obliteratus-project/OBLITERATUS
git clone https://github.com/elder-plinius/OBLITERATUS
cd OBLITERATUS
pip install -e ".[spaces]"
@@ -73,5 +73,5 @@ No GPU hardware selection needed — ZeroGPU handles allocation automatically.
## Links
- [GitHub](https://github.com/obliteratus-project/OBLITERATUS)
- [Paper](https://github.com/obliteratus-project/OBLITERATUS/tree/main/paper)
- [GitHub](https://github.com/elder-plinius/OBLITERATUS)
- [Paper](https://github.com/elder-plinius/OBLITERATUS/tree/main/paper)
+1 -1
View File
@@ -53,7 +53,7 @@
"id": "install"
},
"outputs": [],
"source": "!pip install -q git+https://github.com/obliteratus-project/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1024**3:.1f} GB\")"
"source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\")"
},
{
"cell_type": "markdown",
+24 -3
View File
@@ -4010,6 +4010,11 @@ class AbliterationPipeline:
f"Projecting packed quantized data would silently corrupt the model. "
f"Original error: {e}"
)
# Some architectures store weights as non-float types (e.g. uint8 from
# custom quantization schemes). Projections require float math, so
# convert and treat as "quantized" so the caller writes back properly.
if not weight.data.is_floating_point():
return weight.data.to(torch.float32), True
return weight.data, False
@staticmethod
@@ -4049,10 +4054,20 @@ class AbliterationPipeline:
)
return
# ── Non-float weight (e.g. uint8 from custom quantization) ─────
# If the original weight isn't a bitsandbytes/GPTQ/AWQ param, just
# replace with the float version so projections are preserved.
weight = proj_module.weight
if not AbliterationPipeline._is_quantized_param(weight):
proj_module.weight = nn.Parameter(
W_modified.to(device=weight.device),
requires_grad=weight.requires_grad,
)
return
# ── bitsandbytes re-quantization ──────────────────────────
try:
import bitsandbytes as bnb
weight = proj_module.weight
quantized, new_state = bnb.functional.quantize_4bit(
W_modified.to(weight.device),
quant_type=getattr(weight, "quant_type", "nf4"),
@@ -4087,7 +4102,8 @@ class AbliterationPipeline:
norms: dict[str, float] = {}
for param_name, param in layer.named_parameters():
if param_name.endswith(".weight"):
norms[param_name] = param.data.norm().item()
data = param.data.float() if not param.data.is_floating_point() else param.data
norms[param_name] = data.norm().item()
return norms
@staticmethod
@@ -4106,7 +4122,8 @@ class AbliterationPipeline:
continue
original_norm = saved_norms[param_name]
if original_norm > 0:
new_norm = param.data.norm().item()
data = param.data.float() if not param.data.is_floating_point() else param.data
new_norm = data.norm().item()
if math.isnan(new_norm) or math.isinf(new_norm) or new_norm == 0:
continue # Skip — weight is degenerate after projection
if abs(new_norm - original_norm) > 1e-6:
@@ -4294,6 +4311,10 @@ class AbliterationPipeline:
continue
else:
data = param.data
# Non-float (e.g. uint8) fused params need float conversion
if not data.is_floating_point():
data = data.float()
is_quantized = True # ensure write-back replaces param
if data.dim() < 3:
continue
+3 -3
View File
@@ -38,9 +38,9 @@ dependencies = [
]
[project.urls]
"Homepage" = "https://github.com/obliteratus-project/OBLITERATUS"
"Repository" = "https://github.com/obliteratus-project/OBLITERATUS"
"Bug Tracker" = "https://github.com/obliteratus-project/OBLITERATUS/issues"
"Homepage" = "https://github.com/elder-plinius/OBLITERATUS"
"Repository" = "https://github.com/elder-plinius/OBLITERATUS"
"Bug Tracker" = "https://github.com/elder-plinius/OBLITERATUS/issues"
[project.optional-dependencies]
dev = ["pytest>=7.0", "pytest-cov", "ruff", "mypy"]
View File
File diff suppressed because it is too large Load Diff
+302
View File
@@ -0,0 +1,302 @@
"""Extended tests for novel abliteration pipeline features.
Tests the new capabilities added to the OBLITERATUS abliteration pipeline:
- Bias projection
- Chat template wrapping
- Method presets with new parameters
- True iterative refinement
- Whitened SVD integration
"""
from __future__ import annotations
from unittest.mock import MagicMock
import torch
from transformers import GPT2Config, GPT2LMHeadModel
from obliteratus.abliterate import (
METHODS,
AbliterationPipeline,
)
from obliteratus.models.loader import ModelHandle
def _make_tiny_handle():
    """Build a ModelHandle around a small, randomly initialised GPT-2.

    The model has 4 layers and a 64-wide embedding.  The tokenizer is a
    ``MagicMock`` whose call returns one fixed 10-token batch and whose
    ``decode`` returns a fixed sentence.  ``handle.snapshot()`` is called
    before the handle is returned.
    """
    gpt2_config = GPT2Config(
        vocab_size=1000,
        n_positions=128,
        n_embd=64,
        n_layer=4,
        n_head=2,
        n_inner=256,
    )
    tiny_model = GPT2LMHeadModel(gpt2_config)
    tiny_model.eval()

    fake_tokenizer = MagicMock()
    fake_tokenizer.pad_token = "<pad>"
    fake_tokenizer.eos_token = "<eos>"
    # Calling the mock tokenizer always yields this same encoded batch.
    encoded_batch = {
        "input_ids": torch.randint(0, 1000, (1, 10)),
        "attention_mask": torch.ones(1, 10, dtype=torch.long),
    }
    fake_tokenizer.return_value = encoded_batch
    fake_tokenizer.decode.return_value = "The capital of France is Paris, a beautiful city"

    tiny_handle = ModelHandle(
        model=tiny_model,
        tokenizer=fake_tokenizer,
        config=gpt2_config,
        model_name="gpt2-test",
        task="causal_lm",
    )
    tiny_handle.snapshot()
    return tiny_handle
def _make_varied_tokenizer(handle):
"""Set up a tokenizer mock that returns different tokens per call."""
call_count = [0]
def mock_tokenizer(prompt, **kwargs):
call_count[0] += 1
torch.manual_seed(call_count[0])
return {
"input_ids": torch.randint(0, 1000, (1, 5)),
"attention_mask": torch.ones(1, 5, dtype=torch.long),
}
handle.tokenizer.side_effect = mock_tokenizer
# ---------------------------------------------------------------------------
# New method preset parameters
# ---------------------------------------------------------------------------
class TestNewMethodPresets:
    """Check the flags each entry of ``METHODS`` carries for the new features."""

    def test_basic_has_new_params(self):
        """The ``basic`` preset defines all four flags; two are pinned to False."""
        preset = METHODS["basic"]
        for flag_name in (
            "project_biases",
            "use_chat_template",
            "use_whitened_svd",
            "true_iterative_refinement",
        ):
            assert flag_name in preset
        assert preset["project_biases"] is False
        assert preset["use_chat_template"] is False

    def test_advanced_has_new_params(self):
        """The ``advanced`` preset enables bias projection and chat templates only."""
        preset = METHODS["advanced"]
        expected_flags = {
            "project_biases": True,
            "use_chat_template": True,
            "use_whitened_svd": False,
            "true_iterative_refinement": False,
        }
        for flag_name, expected in expected_flags.items():
            assert preset[flag_name] is expected

    def test_aggressive_has_new_params(self):
        """The ``aggressive`` preset enables all four flags."""
        preset = METHODS["aggressive"]
        for flag_name in (
            "project_biases",
            "use_chat_template",
            "use_whitened_svd",
            "true_iterative_refinement",
        ):
            assert preset[flag_name] is True
# ---------------------------------------------------------------------------
# Pipeline initialization with new parameters
# ---------------------------------------------------------------------------
class TestNewPipelineInit:
    """Check how ``AbliterationPipeline`` resolves the new flags at construction."""

    @staticmethod
    def _assert_flags(
        pipeline,
        *,
        project_biases,
        use_chat_template,
        use_whitened_svd,
        true_iterative_refinement,
    ):
        """Assert each flag attribute is the exact expected bool (identity check)."""
        assert pipeline.project_biases is project_biases
        assert pipeline.use_chat_template is use_chat_template
        assert pipeline.use_whitened_svd is use_whitened_svd
        assert pipeline.true_iterative_refinement is true_iterative_refinement

    def test_default_new_params(self):
        """With no ``method`` argument the advanced-preset values are expected."""
        self._assert_flags(
            AbliterationPipeline(model_name="test-model"),
            project_biases=True,
            use_chat_template=True,
            use_whitened_svd=False,
            true_iterative_refinement=False,
        )

    def test_basic_method_new_params(self):
        """``method="basic"`` leaves all four flags off."""
        self._assert_flags(
            AbliterationPipeline(model_name="test-model", method="basic"),
            project_biases=False,
            use_chat_template=False,
            use_whitened_svd=False,
            true_iterative_refinement=False,
        )

    def test_aggressive_method_new_params(self):
        """``method="aggressive"`` turns all four flags on."""
        self._assert_flags(
            AbliterationPipeline(model_name="test-model", method="aggressive"),
            project_biases=True,
            use_chat_template=True,
            use_whitened_svd=True,
            true_iterative_refinement=True,
        )

    def test_explicit_overrides_new_params(self):
        """Explicit keyword arguments win over the ``basic`` preset values."""
        overridden = AbliterationPipeline(
            model_name="test-model",
            method="basic",
            project_biases=True,
            use_chat_template=True,
            use_whitened_svd=True,
            true_iterative_refinement=True,
        )
        self._assert_flags(
            overridden,
            project_biases=True,
            use_chat_template=True,
            use_whitened_svd=True,
            true_iterative_refinement=True,
        )
# ---------------------------------------------------------------------------
# Bias projection
# ---------------------------------------------------------------------------
class TestBiasProjection:
    """Exercise ``AbliterationPipeline._project_bias`` on tiny linear layers."""

    @staticmethod
    def _wrapper_with_linear(attr_name, *, bias):
        """Return a module holding one 4x4 ``Linear`` under ``attr_name``."""

        class Wrapper(torch.nn.Module):
            def __init__(self):
                super().__init__()
                setattr(self, attr_name, torch.nn.Linear(4, 4, bias=bias))

        return Wrapper()

    def test_project_bias_removes_component(self):
        """Bias projection should remove refusal direction component from bias."""
        wrapper = self._wrapper_with_linear("o_proj", bias=True)
        torch.manual_seed(42)
        wrapper.o_proj.bias.data = torch.tensor([1.0, 2.0, 3.0, 4.0])
        # Unit vector along dim 0, shaped as a column.
        unit_dir = torch.tensor([1.0, 0.0, 0.0, 0.0]).unsqueeze(-1)

        n_projected = AbliterationPipeline._project_bias(wrapper, unit_dir, ["o_proj"])
        assert n_projected == 1

        # The bias started with 1.0 along the direction; it should now be ~0.
        bias_after = wrapper.o_proj.bias.data
        along_dir = (bias_after @ unit_dir.squeeze()).item()
        assert abs(along_dir) < 1e-5

        # Every other component must be left untouched.
        for index, untouched in ((1, 2.0), (2, 3.0), (3, 4.0)):
            assert abs(bias_after[index].item() - untouched) < 1e-5

    def test_project_bias_no_bias(self):
        """Should handle modules without bias gracefully."""
        wrapper = self._wrapper_with_linear("o_proj", bias=False)
        random_dir = torch.randn(4, 1)
        assert AbliterationPipeline._project_bias(wrapper, random_dir, ["o_proj"]) == 0

    def test_project_bias_no_matching_module(self):
        """Should return 0 when no candidate names match."""
        wrapper = self._wrapper_with_linear("something", bias=True)
        random_dir = torch.randn(4, 1)
        assert AbliterationPipeline._project_bias(wrapper, random_dir, ["o_proj"]) == 0
# ---------------------------------------------------------------------------
# Chat template wrapping
# ---------------------------------------------------------------------------
class TestChatTemplate:
    """Exercise ``AbliterationPipeline._maybe_apply_chat_template``."""

    @staticmethod
    def _pipeline_with_tokenizer(tokenizer):
        """Return a template-enabled pipeline whose mock handle holds ``tokenizer``."""
        pipeline = AbliterationPipeline(
            model_name="test-model",
            use_chat_template=True,
        )
        mock_handle = MagicMock()
        mock_handle.tokenizer = tokenizer
        pipeline.handle = mock_handle
        pipeline._on_log = lambda m: None
        return pipeline

    def test_no_wrap_when_disabled(self):
        """Should not wrap prompts when use_chat_template is False."""
        raw_prompts = ["Hello", "World"]
        pipeline = AbliterationPipeline(
            model_name="test-model",
            method="basic",
            use_chat_template=False,
        )
        assert pipeline._maybe_apply_chat_template(raw_prompts) == raw_prompts

    def test_no_wrap_without_handle(self):
        """Should return raw prompts when handle is not set."""
        raw_prompts = ["Hello"]
        pipeline = AbliterationPipeline(
            model_name="test-model",
            use_chat_template=True,
        )
        assert pipeline._maybe_apply_chat_template(raw_prompts) == raw_prompts

    def test_wraps_with_template(self):
        """Should wrap prompts when tokenizer has apply_chat_template."""

        def mock_apply(messages, tokenize=False, add_generation_prompt=True):
            return f"<user>{messages[0]['content']}</user><assistant>"

        templating_tokenizer = MagicMock()
        templating_tokenizer.apply_chat_template = mock_apply
        pipeline = self._pipeline_with_tokenizer(templating_tokenizer)

        wrapped = pipeline._maybe_apply_chat_template(["Hello"])
        assert "<user>Hello</user>" in wrapped[0]

    def test_fallback_when_no_template(self):
        """Should fall back to raw prompts when template is not configured."""
        failing_tokenizer = MagicMock()
        failing_tokenizer.apply_chat_template.side_effect = Exception("No template")
        pipeline = self._pipeline_with_tokenizer(failing_tokenizer)

        assert pipeline._maybe_apply_chat_template(["Hello"]) == ["Hello"]
# ---------------------------------------------------------------------------
# Metadata includes new fields
# ---------------------------------------------------------------------------
class TestMetadata:
    """Check the JSON metadata file written by ``AbliterationPipeline._rebirth``."""

    def test_rebirth_includes_new_config(self):
        """Metadata should include all new configuration parameters."""
        import json
        import tempfile
        from pathlib import Path

        tiny_handle = _make_tiny_handle()
        # Saving is stubbed out; only the metadata file is inspected.
        tiny_handle.model.save_pretrained = MagicMock()
        tiny_handle.tokenizer.save_pretrained = MagicMock()

        pipeline = AbliterationPipeline(
            model_name="test-model",
            method="aggressive",
        )
        pipeline.handle = tiny_handle
        pipeline._on_log = lambda m: None
        pipeline._on_stage = lambda r: None
        pipeline._strong_layers = [0]
        pipeline._quality_metrics = {"perplexity": 8.5, "coherence": 1.0}

        with tempfile.TemporaryDirectory() as tmp:
            pipeline.output_dir = Path(tmp) / "output"
            pipeline._rebirth()
            # Read the file before the temporary directory is removed.
            metadata_path = pipeline.output_dir / "abliteration_metadata.json"
            metadata = json.loads(metadata_path.read_text())

        method_config = metadata["method_config"]
        for flag_name in (
            "project_biases",
            "use_chat_template",
            "use_whitened_svd",
            "true_iterative_refinement",
        ):
            assert flag_name in method_config
        assert method_config["project_biases"] is True
        assert method_config["use_whitened_svd"] is True

        # Should have more references now
        references = metadata["references"]
        assert len(references) >= 5
        assert any("OBLITERATUS" in r for r in references)
+300
View File
@@ -0,0 +1,300 @@
"""Mathematical verification that abliteration actually removes refusal directions.
These tests verify the core linear algebra claims WITHOUT mocks:
1. Projection removes the target direction from weight matrices
2. Norm-preserving projection maintains weight magnitude
3. Multi-direction SVD extracts the correct subspace
4. Whitened SVD produces orthogonal directions
5. Random directions do NOT have the same effect (negative control)
Unlike the other test files, these use real tensors and verify mathematical
properties directly — no MagicMock, no mocked tokenizers.
"""
from __future__ import annotations
import torch
class TestProjectionRemovesDirection:
    """Check that orthogonal projection strips a chosen direction from a weight matrix."""

    @staticmethod
    def _seeded_weight_and_unit_direction():
        """Return a seeded (128, 256) weight matrix and a unit vector of length 256."""
        torch.manual_seed(42)
        hidden = 256
        out_dim = 128
        weight = torch.randn(out_dim, hidden)
        raw_direction = torch.randn(hidden)
        return weight, raw_direction / raw_direction.norm()

    @staticmethod
    def _strip_direction(weight, unit_dir):
        """Subtract from every row of ``weight`` its component along ``unit_dir``."""
        coeffs = weight @ unit_dir  # one coefficient per output row
        return weight - coeffs.unsqueeze(1) * unit_dir.unsqueeze(0)

    def test_single_direction_projection(self):
        """After projecting out direction d from weight W,
        W_proj @ d should be approximately zero."""
        weight, unit_dir = self._seeded_weight_and_unit_direction()
        cleaned = self._strip_direction(weight, unit_dir)

        residual = cleaned @ unit_dir
        assert residual.abs().max().item() < 1e-5, f"Residual too large: {residual.abs().max()}"

    def test_projection_preserves_orthogonal_components(self):
        """Projection should NOT change components orthogonal to d."""
        weight, unit_dir = self._seeded_weight_and_unit_direction()

        # Gram-Schmidt: build a unit probe vector orthogonal to the direction.
        probe = torch.randn(256)
        probe = probe - (probe @ unit_dir) * unit_dir
        probe = probe / probe.norm()

        cleaned = self._strip_direction(weight, unit_dir)

        # The action on the orthogonal probe must be the same before and after.
        before = weight @ probe
        after = cleaned @ probe
        diff = (before - after).abs().max().item()
        assert diff < 1e-5, f"Orthogonal component changed by {diff}"

    def test_multi_direction_subspace_removal(self):
        """Projecting out a k-dimensional subspace should remove all k directions."""
        torch.manual_seed(42)
        hidden, out_dim, k = 256, 128, 4
        weight = torch.randn(out_dim, hidden)

        # Orthonormal basis of a random k-dimensional subspace.
        basis, _ = torch.linalg.qr(torch.randn(hidden, k))
        subspace = basis.T  # (k, hidden)

        cleaned = weight - (weight @ basis) @ basis.T

        residual = cleaned @ subspace.T  # (out_dim, k)
        assert residual.abs().max().item() < 1e-5, f"Subspace residual: {residual.abs().max()}"

    def test_double_projection_is_idempotent(self):
        """Projecting twice should give the same result as projecting once."""
        weight, unit_dir = self._seeded_weight_and_unit_direction()

        once = self._strip_direction(weight, unit_dir)
        twice = self._strip_direction(once, unit_dir)

        diff = (once - twice).abs().max().item()
        assert diff < 1e-5, f"Second projection changed weights by {diff}"
class TestNormPreservation:
    """Verify that norm-preserving projection maintains weight magnitude."""

    def test_norm_preserving_projection(self):
        """Per-row norm restoration keeps ||W_i|| fixed AND keeps d removed.

        Every row of ``W_proj`` is orthogonal to ``d``; multiplying a row by a
        scalar cannot reintroduce a component along ``d``.  The rescaled
        matrix must therefore still satisfy ``W @ d ~ 0`` while each row
        recovers its original L2 norm.
        """
        torch.manual_seed(42)
        hidden = 256
        out_dim = 128
        W = torch.randn(out_dim, hidden)
        d = torch.randn(hidden)
        d = d / d.norm()

        # Standard projection: W_proj = W - (W d) d^T
        proj_coeff = W @ d
        W_proj = W - proj_coeff.unsqueeze(1) * d.unsqueeze(0)

        # Norm-preserving rescaling (per-row); clamp guards against division by zero.
        row_norms_orig = W.norm(dim=1, keepdim=True).clamp(min=1e-8)
        row_norms_proj = W_proj.norm(dim=1, keepdim=True).clamp(min=1e-8)
        W_norm_preserved = W_proj * (row_norms_orig / row_norms_proj)

        # Row-wise scaling preserves orthogonality to d, so the direction must
        # remain removed up to floating-point error -- not merely "reduced".
        residual = W_norm_preserved @ d
        max_residual = residual.abs().max().item()
        assert max_residual < 1e-4, \
            f"Rescaled rows still project onto d: {max_residual}"

        # Original (weaker) relative check, kept for backward compatibility.
        original_proj = (W @ d).abs().mean().item()
        preserved_proj = residual.abs().mean().item()
        assert preserved_proj < original_proj * 0.5, \
            f"Norm-preserved projection {preserved_proj} not much less than original {original_proj}"

        # Row norms are preserved
        row_diff = (W_norm_preserved.norm(dim=1) - W.norm(dim=1)).abs().max().item()
        assert row_diff < 1e-5, f"Row norms changed by {row_diff}"
class TestSVDDirectionExtraction:
    """SVD of (harmful - harmless) activations should surface planted directions."""

    def test_planted_direction_recovery(self):
        """The top right-singular vector aligns with a single planted direction."""
        torch.manual_seed(42)
        n_samples, hidden = 50, 256

        planted = torch.randn(hidden)
        planted = planted / planted.norm()

        # harmful = harmless + strong shift along the planted direction + small noise
        harmless = torch.randn(n_samples, hidden) * 0.5
        signal_strength = 5.0
        shift = signal_strength * planted.unsqueeze(0)
        noise = torch.randn(n_samples, hidden) * 0.1
        harmful = harmless + shift + noise

        delta = harmful - harmless
        _, _, right_vecs = torch.linalg.svd(delta, full_matrices=False)
        top = right_vecs[0]
        top = top / top.norm()

        # Singular vectors are sign-ambiguous, so compare the absolute cosine.
        cosine = (top @ planted).abs().item()
        assert cosine > 0.95, f"Cosine similarity {cosine:.3f} too low (expected > 0.95)"

    def test_multi_direction_recovery(self):
        """The top-k right-singular vectors span k planted orthonormal directions."""
        torch.manual_seed(42)
        n_samples, hidden, k = 200, 256, 3

        basis, _ = torch.linalg.qr(torch.randn(hidden, k))
        true_subspace = basis.T  # (k, hidden)

        # Every sample mixes the k planted directions with non-negative weights.
        harmless = torch.randn(n_samples, hidden) * 0.01
        weights = torch.randn(n_samples, k).abs() * 5.0
        signal = weights @ true_subspace  # (n_samples, hidden)
        harmful = harmless + signal

        delta = harmful - harmless
        _, _, right_vecs = torch.linalg.svd(delta, full_matrices=False)
        recovered = right_vecs[:k]  # (k, hidden)

        # Norm of each planted direction's coordinates in the recovered basis.
        for i in range(k):
            coords = recovered @ true_subspace[i]
            captured_variance = coords.norm().item()
            assert captured_variance > 0.9, \
                f"Direction {i}: captured variance {captured_variance:.3f} too low"
class TestRandomDirectionBaseline:
    """Baseline: arbitrary directions should carry far less signal than the planted one."""

    def test_random_direction_has_lower_projection(self):
        """The mean harmful activation projects over 3x more onto the planted
        direction than onto the average of 100 random unit directions."""
        torch.manual_seed(42)
        n_samples, hidden = 50, 256

        planted = torch.randn(hidden)
        planted = planted / planted.norm()
        harmless = torch.randn(n_samples, hidden) * 0.5
        harmful = harmless + 3.0 * planted.unsqueeze(0)
        harmful_mean = harmful.mean(dim=0)

        true_proj = (harmful_mean @ planted).abs().item()

        def baseline_projection(seed):
            """|projection| of the harmful mean onto a unit direction drawn from ``seed``."""
            # A dedicated generator keeps these draws independent of the global seed.
            gen = torch.Generator().manual_seed(seed)
            candidate = torch.randn(hidden, generator=gen)
            candidate = candidate / candidate.norm()
            return (harmful_mean @ candidate).abs().item()

        random_projs = [baseline_projection(10000 + i) for i in range(100)]
        mean_random = sum(random_projs) / len(random_projs)

        assert true_proj > mean_random * 3.0, \
            f"True projection ({true_proj:.3f}) not much larger than random mean ({mean_random:.3f})"
class TestWhitenedSVD:
    """Properties of directions obtained from SVD in a whitened activation space."""

    def test_whitened_directions_are_orthogonal(self):
        """The top-k right-singular vectors of the whitened difference are orthonormal."""
        torch.manual_seed(42)
        n_samples, hidden, k = 80, 128, 4

        harmful = torch.randn(n_samples, hidden) + torch.randn(1, hidden) * 2
        baseline = torch.randn(n_samples, hidden)

        # Regularised covariance of the centred baseline activations.
        baseline_mean = baseline.mean(dim=0, keepdim=True)
        baseline_centered = baseline - baseline_mean
        cov = (baseline_centered.T @ baseline_centered) / (n_samples - 1)
        cov = cov + 1e-4 * torch.eye(hidden)

        # Whitening map: eigenvectors with columns scaled by 1/sqrt(eigenvalue + eps).
        eigvals, eigvecs = torch.linalg.eigh(cov)
        eigvals = eigvals.clamp(min=0)
        inv_sqrt = 1.0 / torch.sqrt(eigvals + 1e-4)
        whitener = eigvecs * inv_sqrt.unsqueeze(0)

        harmful_w = (harmful - baseline_mean) @ whitener
        baseline_w = baseline_centered @ whitener
        delta_w = harmful_w - baseline_w

        _, _, right_vecs = torch.linalg.svd(delta_w, full_matrices=False)
        directions = right_vecs[:k]

        # The Gram matrix of the selected rows should be the k x k identity.
        gram = directions @ directions.T
        off_diag = (gram - torch.eye(k)).abs().max().item()
        assert off_diag < 1e-4, f"Directions not orthogonal: max off-diagonal = {off_diag}"
class TestReproducibility:
    """Check that ``set_seed`` controls the global torch random stream."""

    def test_set_seed_determinism(self):
        """Re-seeding with the same value reproduces the same ``torch.randn`` draw."""
        from obliteratus.reproducibility import set_seed

        draws = []
        for seed in (123, 123):
            set_seed(seed, deterministic=False)
            draws.append(torch.randn(100))

        first, second = draws
        assert torch.equal(first, second), "Same seed produced different tensors"

    def test_different_seeds_differ(self):
        """Seeding with two different values yields different ``torch.randn`` draws."""
        from obliteratus.reproducibility import set_seed

        draws = []
        for seed in (123, 456):
            set_seed(seed, deterministic=False)
            draws.append(torch.randn(100))

        first, second = draws
        assert not torch.equal(first, second), "Different seeds produced identical tensors"
+364
View File
@@ -0,0 +1,364 @@
"""Tests for advanced evaluation metrics."""
from __future__ import annotations
import pytest
import torch
from obliteratus.evaluation.advanced_metrics import (
REFUSAL_PREFIXES,
REFUSAL_SUBSTRINGS,
_is_refusal,
refusal_rate,
token_kl_divergence,
first_token_kl_divergence,
effective_rank,
effective_rank_change,
activation_cosine_similarity,
linear_cka,
refusal_projection_magnitude,
format_eval_report,
AbliterationEvalResult,
)
# ---------------------------------------------------------------------------
# Refusal Rate
# ---------------------------------------------------------------------------
class TestRefusalRate:
    """Exercise ``refusal_rate`` and ``_is_refusal`` in prefix, substring and combined modes."""

    def test_all_refusals(self):
        """Three refusal-style openers are expected to give a prefix-mode rate of 1.0."""
        refusing = [
            "I'm sorry, but I cannot help with that.",
            "I apologize, but I'm unable to assist.",
            "As an AI, I cannot provide that information.",
        ]
        rate = refusal_rate(refusing, mode="prefix")
        assert rate == 1.0

    def test_no_refusals(self):
        """Three compliant openers are expected to give a prefix-mode rate of 0.0."""
        compliant = [
            "Sure, here's how to do that.",
            "Here's a step-by-step guide.",
            "The answer is 42.",
        ]
        rate = refusal_rate(compliant, mode="prefix")
        assert rate == 0.0

    def test_mixed_responses(self):
        """Two refusals among four responses are expected to give a rate of 0.5."""
        mixed = [
            "I'm sorry, I can't do that.",
            "Sure, here you go.",
            "I cannot provide that.",
            "Here's what you need.",
        ]
        assert refusal_rate(mixed, mode="prefix") == 0.5  # 2 out of 4

    def test_substring_mode(self):
        """A refusal phrase in mid-sentence is expected to count in substring mode."""
        samples = [
            "Well, actually I cannot help with that request.",
            "Let me explain how to do this.",
        ]
        rate = refusal_rate(samples, mode="substring")
        assert rate == 0.5

    def test_combined_mode(self):
        """Combined mode is expected to count both prefix and substring hits."""
        samples = [
            "I'm sorry, but no.",  # caught by prefix
            "That said, i won't do that for you.",  # caught by substring
            "Here's the answer.",  # not a refusal
        ]
        expected = 2.0 / 3.0
        assert abs(refusal_rate(samples, mode="combined") - expected) < 1e-6

    def test_empty_responses(self):
        """An empty response list is expected to give a rate of 0.0."""
        rate = refusal_rate([], mode="prefix")
        assert rate == 0.0

    def test_empty_string_not_refusal(self):
        """The empty string is expected not to be classified as a refusal."""
        flagged = _is_refusal("", mode="combined")
        assert not flagged

    def test_is_refusal_prefix(self):
        """Prefix mode flags refusal openers and leaves a plain answer unflagged."""
        for text in (
            "I'm sorry, I can't do that.",
            "As an AI language model, I cannot",
        ):
            assert _is_refusal(text, mode="prefix")
        assert not _is_refusal("The answer is 42.", mode="prefix")

    def test_is_refusal_substring(self):
        """Substring mode flags an embedded refusal and leaves a plain answer unflagged."""
        embedded = "I must say, i cannot assist with that."
        plain = "The capital of France is Paris."
        assert _is_refusal(embedded, mode="substring")
        assert not _is_refusal(plain, mode="substring")

    def test_refusal_prefixes_nonempty(self):
        """The prefix table is expected to hold more than 20 entries."""
        n_prefixes = len(REFUSAL_PREFIXES)
        assert n_prefixes > 20

    def test_refusal_substrings_nonempty(self):
        """The substring table is expected to hold more than 10 entries."""
        n_substrings = len(REFUSAL_SUBSTRINGS)
        assert n_substrings > 10
# ---------------------------------------------------------------------------
# KL Divergence
# ---------------------------------------------------------------------------
class TestKLDivergence:
    """Basic properties of ``token_kl_divergence`` and ``first_token_kl_divergence``."""

    def test_identical_distributions(self):
        """Comparing a logits tensor against itself should give KL ~ 0."""
        same = torch.randn(2, 10, 100)
        divergence = token_kl_divergence(same, same)
        assert abs(divergence) < 1e-5

    def test_different_distributions(self):
        """Two independent random logits tensors should give a positive KL."""
        torch.manual_seed(42)
        p_logits = torch.randn(2, 10, 100)
        q_logits = torch.randn(2, 10, 100)
        divergence = token_kl_divergence(p_logits, q_logits)
        assert divergence > 0

    def test_kl_nonnegative(self):
        """KL should be non-negative (within rounding) over several random pairs."""
        torch.manual_seed(42)
        trials = 5
        for _ in range(trials):
            p_logits = torch.randn(1, 5, 50)
            q_logits = torch.randn(1, 5, 50)
            divergence = token_kl_divergence(p_logits, q_logits)
            assert divergence >= -1e-6  # allow small numerical errors

    def test_first_token_kl_identical(self):
        """First-token KL of a logits tensor against itself should be ~ 0."""
        same = torch.randn(4, 20, 100)
        divergence = first_token_kl_divergence(same, same)
        assert abs(divergence) < 1e-5

    def test_first_token_kl_different(self):
        """First-token KL of two independent random tensors should be positive."""
        torch.manual_seed(42)
        p_logits = torch.randn(4, 20, 100)
        q_logits = torch.randn(4, 20, 100)
        divergence = first_token_kl_divergence(p_logits, q_logits)
        assert divergence > 0

    def test_temperature_effect(self):
        """KL at temperature 5.0 should be lower than at 1.0 (smoother distributions)."""
        torch.manual_seed(42)
        p_logits = torch.randn(2, 5, 50)
        q_logits = torch.randn(2, 5, 50)
        sharp = token_kl_divergence(p_logits, q_logits, temperature=1.0)
        smooth = token_kl_divergence(p_logits, q_logits, temperature=5.0)
        assert smooth < sharp
# ---------------------------------------------------------------------------
# Effective Rank
# ---------------------------------------------------------------------------
class TestEffectiveRank:
    """Checks for ``effective_rank`` and ``effective_rank_change``."""

    def test_rank_one_matrix(self):
        """An outer product of two vectors should have effective rank near 1."""
        col = torch.randn(8, 1)
        row = torch.randn(1, 4)
        outer = col @ row  # rank-1 by construction
        assert effective_rank(outer) < 1.5

    def test_identity_matrix(self):
        """The n x n identity should have effective rank ~ n."""
        size = 8
        erank = effective_rank(torch.eye(size))
        assert abs(erank - size) < 0.1

    def test_random_full_rank(self):
        """A seeded 16 x 16 Gaussian matrix should have effective rank above 10."""
        torch.manual_seed(42)
        dense = torch.randn(16, 16)
        assert effective_rank(dense) > 10  # should be close to 16

    def test_zero_matrix(self):
        """The all-zero matrix should report an effective rank of exactly 0."""
        erank = effective_rank(torch.zeros(4, 4))
        assert erank == 0.0

    def test_effective_rank_change(self):
        """The before/after comparison exposes four keys and does not raise the rank."""
        torch.manual_seed(42)
        before = torch.randn(8, 8)

        # Simulate abliteration by projecting one unit direction out of the weights.
        unit_dir = torch.randn(8, 1)
        unit_dir = unit_dir / unit_dir.norm()
        after = before - (before @ unit_dir) @ unit_dir.T

        result = effective_rank_change(before, after)
        for key in ("rank_before", "rank_after", "rank_delta", "rank_ratio"):
            assert key in result
        assert result["rank_after"] <= result["rank_before"] + 0.1

    def test_rejects_non_2d(self):
        """A 3-D tensor should be rejected with ``ValueError``."""
        cube = torch.randn(4, 4, 4)
        with pytest.raises(ValueError):
            effective_rank(cube)
# ---------------------------------------------------------------------------
# Activation Cosine Similarity
# ---------------------------------------------------------------------------
class TestActivationCosineSimilarity:
    """Checks for ``activation_cosine_similarity`` on simple known cases."""

    def test_identical_activations(self):
        """Comparing activations against themselves should give a similarity of 1."""
        batch = torch.randn(10, 32)
        similarity = activation_cosine_similarity(batch, batch)
        assert abs(similarity - 1.0) < 1e-5

    def test_orthogonal_activations(self):
        """Two different standard basis vectors should give a similarity near 0."""
        e_x = torch.tensor([[1.0, 0.0, 0.0]])
        e_y = torch.tensor([[0.0, 1.0, 0.0]])
        similarity = activation_cosine_similarity(e_x, e_y)
        assert abs(similarity) < 1e-5

    def test_opposite_activations(self):
        """Comparing activations against their negation should give -1."""
        batch = torch.randn(5, 16)
        similarity = activation_cosine_similarity(batch, -batch)
        assert abs(similarity - (-1.0)) < 1e-5

    def test_handles_3d(self):
        """3-D inputs should be accepted and yield a value in [-1, 1]."""
        left = torch.randn(2, 5, 16)
        right = torch.randn(2, 5, 16)
        similarity = activation_cosine_similarity(left, right)
        assert -1.0 <= similarity <= 1.0
# ---------------------------------------------------------------------------
# Linear CKA
# ---------------------------------------------------------------------------
class TestLinearCKA:
    """Checks for ``linear_cka``: self-similarity, scale invariance and bounds."""

    def test_identical_representations(self):
        """CKA of a representation with itself should be 1.0."""
        reps = torch.randn(20, 16)
        score = linear_cka(reps, reps)
        assert abs(score - 1.0) < 1e-4

    def test_scaled_representations(self):
        """Multiplying one input by a scalar should leave CKA at 1.0."""
        reps = torch.randn(20, 16)
        scaled = reps * 5.0
        score = linear_cka(reps, scaled)
        assert abs(score - 1.0) < 1e-4

    def test_random_representations(self):
        """Independent random representations should score below 0.3."""
        torch.manual_seed(42)
        left = torch.randn(100, 16)
        right = torch.randn(100, 16)
        score = linear_cka(left, right)
        assert score < 0.3  # random should be near 0

    def test_cka_bounded(self):
        """CKA should stay within [0, 1] (small slack) over several random pairs."""
        torch.manual_seed(42)
        trials = 5
        for _ in range(trials):
            left = torch.randn(20, 8)
            right = torch.randn(20, 8)
            score = linear_cka(left, right)
            assert -0.01 <= score <= 1.01  # small tolerance for numerics

    def test_different_dimensions(self):
        """Inputs with different feature widths (16 vs 32) should be accepted."""
        narrow = torch.randn(20, 16)
        wide = torch.randn(20, 32)
        score = linear_cka(narrow, wide)
        assert -0.01 <= score <= 1.01

    def test_handles_3d(self):
        """3-D inputs should be accepted and yield a value in the usual range."""
        left = torch.randn(2, 10, 16)
        right = torch.randn(2, 10, 16)
        score = linear_cka(left, right)
        assert -0.01 <= score <= 1.01
# ---------------------------------------------------------------------------
# Refusal Direction Projection Magnitude
# ---------------------------------------------------------------------------
class TestRefusalProjection:
    """Checks for ``refusal_projection_magnitude`` summary statistics."""

    def test_aligned_activations(self):
        """Activations lying along the direction report their mean coordinate (4.0)."""
        axis = torch.tensor([1.0, 0.0, 0.0])
        aligned = torch.tensor([
            [5.0, 0.0, 0.0],
            [3.0, 0.0, 0.0],
            [4.0, 0.0, 0.0],
        ])
        stats = refusal_projection_magnitude(aligned, axis)
        assert stats["mean"] == 4.0
        assert stats["abs_mean"] == 4.0

    def test_orthogonal_activations(self):
        """Activations orthogonal to the direction report ~0 mean and abs-mean."""
        axis = torch.tensor([1.0, 0.0, 0.0])
        perpendicular = torch.tensor([
            [0.0, 5.0, 0.0],
            [0.0, 0.0, 3.0],
        ])
        stats = refusal_projection_magnitude(perpendicular, axis)
        assert abs(stats["mean"]) < 1e-5
        assert abs(stats["abs_mean"]) < 1e-5

    def test_result_keys(self):
        """The result contains exactly the five documented statistic keys."""
        axis = torch.randn(8)
        batch = torch.randn(5, 8)
        stats = refusal_projection_magnitude(batch, axis)
        expected_keys = {"mean", "std", "max", "min", "abs_mean"}
        assert set(stats.keys()) == expected_keys
# ---------------------------------------------------------------------------
# Eval Report Formatting
# ---------------------------------------------------------------------------
class TestEvalReport:
    """Checks on the text produced by ``format_eval_report``."""

    def test_format_report(self):
        """A fully populated result renders the rate, perplexity and KL verdict."""
        metrics = AbliterationEvalResult(
            refusal_rate_harmful=0.1,
            refusal_rate_harmless=0.02,
            kl_divergence=0.15,
            perplexity=12.5,
            coherence_score=0.8,
            mean_activation_cosine=0.95,
            mean_cka=0.92,
        )
        text = format_eval_report(metrics)
        # "excellent" is the expected verdict for KL < 0.2.
        for fragment in ("10.0%", "12.50", "excellent"):
            assert fragment in text

    def test_format_report_high_kl(self):
        """A KL of 1.5 is expected to be reported as significant damage."""
        metrics = AbliterationEvalResult(
            refusal_rate_harmful=0.0,
            refusal_rate_harmless=0.0,
            kl_divergence=1.5,
            perplexity=50.0,
            coherence_score=0.4,
            mean_activation_cosine=None,
            mean_cka=None,
        )
        text = format_eval_report(metrics)
        assert "significant damage" in text

    def test_format_report_no_kl(self):
        """With ``kl_divergence=None`` the report is expected not to mention KL."""
        metrics = AbliterationEvalResult(
            refusal_rate_harmful=0.5,
            refusal_rate_harmless=0.1,
            kl_divergence=None,
            perplexity=20.0,
            coherence_score=1.0,
            mean_activation_cosine=None,
            mean_cka=None,
        )
        text = format_eval_report(metrics)
        assert "50.0%" in text
        assert "KL" not in text
+345
View File
@@ -0,0 +1,345 @@
"""Tests for the analysis techniques."""
from __future__ import annotations
import torch
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor, WhitenedSVDResult
from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer, CrossLayerResult
from obliteratus.analysis.activation_probing import ActivationProbe, ProbeResult
# ---------------------------------------------------------------------------
# WhitenedSVDExtractor
# ---------------------------------------------------------------------------
class TestWhitenedSVD:
    """Tests for ``WhitenedSVDExtractor`` on synthetic activation lists."""

    def test_basic_extraction(self):
        """Extraction returns a ``WhitenedSVDResult`` with the requested shapes."""
        torch.manual_seed(42)
        n_prompts, hidden_dim = 10, 32

        # Harmful activations are the harmless ones shifted along one unit direction.
        planted = torch.randn(hidden_dim)
        planted = planted / planted.norm()
        harmless = [torch.randn(hidden_dim) for _ in range(n_prompts)]
        harmful = [act + 2.0 * planted for act in harmless]

        outcome = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=3)

        assert isinstance(outcome, WhitenedSVDResult)
        assert outcome.directions.shape == (3, hidden_dim)
        assert outcome.singular_values.shape == (3,)
        assert outcome.variance_explained > 0
        assert outcome.condition_number > 0
        assert outcome.effective_rank > 0

    def test_directions_are_unit_vectors(self):
        """Every returned direction should have L2 norm ~ 1."""
        torch.manual_seed(42)
        harmless = [torch.randn(16) for _ in range(8)]
        harmful = [act + torch.randn(16) * 0.5 for act in harmless]

        outcome = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=2)

        for row in range(outcome.directions.shape[0]):
            length = outcome.directions[row].norm().item()
            assert abs(length - 1.0) < 1e-4

    def test_primary_aligns_with_planted_direction(self):
        """The primary whitened direction should capture the planted signal.

        Only moderate cosine alignment with the raw planted direction is
        required, because whitening reweights dimensions; the direction
        must additionally explain more than half of the variance.
        """
        torch.manual_seed(42)
        hidden_dim = 64
        n_prompts = 30

        planted = torch.randn(hidden_dim)
        planted = planted / planted.norm()

        # Low-variance isotropic harmless activations plus a strong planted shift.
        harmless = [torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)]
        harmful = [act + 5.0 * planted for act in harmless]

        extractor = WhitenedSVDExtractor(regularization_eps=1e-3)
        outcome = extractor.extract(harmful, harmless, n_directions=1)

        cos_sim = (outcome.directions[0] @ planted).abs().item()
        assert cos_sim > 0.2, f"Expected alignment > 0.2, got {cos_sim:.3f}"
        assert outcome.variance_explained > 0.5

    def test_extract_all_layers(self):
        """One result with two directions is returned for each of the four layers."""
        torch.manual_seed(42)
        n_layers = 4
        harmful_acts, harmless_acts = {}, {}
        # Harmful/harmless draws are interleaved per layer to keep the RNG order fixed.
        for layer in range(n_layers):
            harmful_acts[layer] = [torch.randn(16) for _ in range(5)]
            harmless_acts[layer] = [torch.randn(16) for _ in range(5)]

        per_layer = WhitenedSVDExtractor().extract_all_layers(
            harmful_acts, harmless_acts, n_directions=2
        )

        assert len(per_layer) == 4
        for layer in range(n_layers):
            assert layer in per_layer
            assert per_layer[layer].directions.shape[0] == 2

    def test_compare_with_standard(self):
        """The comparison exposes both cosine keys, with the primary one in [0, 1]."""
        torch.manual_seed(42)
        harmless = [torch.randn(16) for _ in range(8)]
        harmful = [act + torch.randn(16) for act in harmless]

        outcome = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=2)

        reference = torch.randn(16)
        reference = reference / reference.norm()
        comparison = WhitenedSVDExtractor.compare_with_standard(outcome, reference)

        for key in ("primary_direction_cosine", "subspace_principal_cosine"):
            assert key in comparison
        assert 0 <= comparison["primary_direction_cosine"] <= 1.0

    def test_handles_3d_activations(self):
        """Activations carrying a leading singleton dimension are accepted."""
        torch.manual_seed(42)
        # Each activation has shape (1, hidden_dim), as a hook output might.
        harmless = [torch.randn(1, 16) for _ in range(5)]
        harmful = [torch.randn(1, 16) for _ in range(5)]

        outcome = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=2)
        assert outcome.directions.shape == (2, 16)

    def test_variance_explained_bounded(self):
        """``variance_explained`` should lie in [0, 1]."""
        torch.manual_seed(42)
        harmless = [torch.randn(16) for _ in range(8)]
        harmful = [torch.randn(16) for _ in range(8)]

        outcome = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=3)
        assert 0 <= outcome.variance_explained <= 1.0
# ---------------------------------------------------------------------------
# CrossLayerAlignmentAnalyzer
# ---------------------------------------------------------------------------
class TestCrossLayerAlignment:
    """Tests for ``CrossLayerAlignmentAnalyzer`` on synthetic per-layer directions."""

    def test_identical_directions(self):
        """The same unit vector at every layer gives persistence ~ 1 and ~0 distance."""
        shared = torch.randn(32)
        shared = shared / shared.norm()
        per_layer = {layer: shared.clone() for layer in range(5)}

        outcome = CrossLayerAlignmentAnalyzer().analyze(per_layer)

        assert isinstance(outcome, CrossLayerResult)
        assert outcome.direction_persistence_score > 0.99
        assert outcome.mean_adjacent_cosine > 0.99
        assert outcome.total_geodesic_distance < 0.01

    def test_orthogonal_directions(self):
        """Mutually orthogonal per-layer directions give low persistence."""
        torch.manual_seed(42)
        # Columns of Q from a QR factorisation are orthonormal.
        raw = torch.randn(5, 32)
        ortho, _ = torch.linalg.qr(raw.T)
        per_layer = {layer: ortho[:, layer] for layer in range(5)}

        outcome = CrossLayerAlignmentAnalyzer().analyze(per_layer)

        assert outcome.direction_persistence_score < 0.3
        assert outcome.mean_adjacent_cosine < 0.3

    def test_cluster_detection(self):
        """Two groups of near-identical directions yield at least two clusters."""
        torch.manual_seed(42)
        centre_a = torch.randn(32)
        centre_a = centre_a / centre_a.norm()
        centre_b = torch.randn(32)
        centre_b = centre_b / centre_b.norm()

        # Layers 0-2 sit around centre_a and layers 3-4 around centre_b.
        per_layer = {
            0: centre_a,
            1: centre_a + 0.01 * torch.randn(32),
            2: centre_a + 0.01 * torch.randn(32),
            3: centre_b,
            4: centre_b + 0.01 * torch.randn(32),
        }
        per_layer = {layer: vec / vec.norm() for layer, vec in per_layer.items()}

        analyzer = CrossLayerAlignmentAnalyzer(cluster_threshold=0.9)
        outcome = analyzer.analyze(per_layer)

        assert outcome.cluster_count >= 2

    def test_empty_input(self):
        """An empty mapping yields no layer indices and zero clusters."""
        outcome = CrossLayerAlignmentAnalyzer().analyze({})
        assert outcome.layer_indices == []
        assert outcome.cluster_count == 0

    def test_single_layer(self):
        """A single layer is reported with a persistence score of exactly 1.0."""
        outcome = CrossLayerAlignmentAnalyzer().analyze({5: torch.randn(16)})
        assert outcome.layer_indices == [5]
        assert outcome.direction_persistence_score == 1.0

    def test_strong_layers_filter(self):
        """Only the layers named in ``strong_layers`` are analysed."""
        per_layer = {layer: torch.randn(16) for layer in range(10)}
        outcome = CrossLayerAlignmentAnalyzer().analyze(per_layer, strong_layers=[2, 5, 7])
        assert outcome.layer_indices == [2, 5, 7]
        assert outcome.cosine_matrix.shape == (3, 3)

    def test_cosine_matrix_symmetry(self):
        """The cosine matrix equals its transpose up to rounding."""
        torch.manual_seed(42)
        per_layer = {layer: torch.randn(16) for layer in range(4)}
        outcome = CrossLayerAlignmentAnalyzer().analyze(per_layer)

        cosines = outcome.cosine_matrix
        asymmetry = (cosines - cosines.T).abs().max().item()
        assert asymmetry < 1e-5

    def test_cosine_matrix_diagonal_ones(self):
        """Every diagonal entry of the cosine matrix is ~ 1.0."""
        torch.manual_seed(42)
        per_layer = {layer: torch.randn(16) for layer in range(4)}
        outcome = CrossLayerAlignmentAnalyzer().analyze(per_layer)

        for idx in range(4):
            self_cosine = outcome.cosine_matrix[idx, idx].item()
            assert abs(self_cosine - 1.0) < 1e-4

    def test_angular_drift_monotonic(self):
        """Consecutive angular-drift values never decrease (within 1e-6)."""
        torch.manual_seed(42)
        per_layer = {layer: torch.randn(16) for layer in range(6)}
        outcome = CrossLayerAlignmentAnalyzer().analyze(per_layer)

        drift = outcome.angular_drift
        for idx in range(1, len(drift)):
            assert drift[idx] >= drift[idx - 1] - 1e-6

    def test_format_report(self):
        """The formatted report mentions "Cross-Layer" and "persistence"."""
        torch.manual_seed(42)
        per_layer = {layer: torch.randn(16) for layer in range(4)}
        outcome = CrossLayerAlignmentAnalyzer().analyze(per_layer)

        text = CrossLayerAlignmentAnalyzer.format_report(outcome)
        for fragment in ("Cross-Layer", "persistence"):
            assert fragment in text
# ---------------------------------------------------------------------------
# ActivationProbe
# ---------------------------------------------------------------------------
class TestActivationProbe:
    """Tests for ``ActivationProbe`` per-layer and aggregate refusal metrics."""

    def test_clean_elimination(self):
        """With no planted signal, the projection gap and d' stay small."""
        torch.manual_seed(42)
        hidden_dim = 32
        direction = torch.randn(hidden_dim)
        direction = direction / direction.norm()

        # Both sets are independent noise, standing in for post-abliteration activations.
        harmless = [torch.randn(hidden_dim) for _ in range(10)]
        harmful = [torch.randn(hidden_dim) for _ in range(10)]

        outcome = ActivationProbe().probe_layer(harmful, harmless, direction)

        assert abs(outcome.projection_gap) < 1.0
        assert outcome.separation_d_prime < 2.0

    def test_residual_detection(self):
        """A strong shift along the direction is detected as residual signal."""
        torch.manual_seed(42)
        hidden_dim = 32
        direction = torch.randn(hidden_dim)
        direction = direction / direction.norm()

        harmless = [torch.randn(hidden_dim) for _ in range(10)]
        # Harmful activations keep a large component along the refusal direction.
        harmful = [act + 5.0 * direction for act in harmless]

        outcome = ActivationProbe().probe_layer(harmful, harmless, direction)

        assert abs(outcome.projection_gap) > 1.0
        assert outcome.separation_d_prime > 2.0

    def test_probe_all_layers(self):
        """Aggregate probing returns one entry per layer and bounded summary values."""
        torch.manual_seed(42)
        hidden_dim = 16
        n_layers = 4

        harmful_acts, harmless_acts, directions = {}, {}, {}
        # Draws are interleaved per layer to keep the RNG order fixed.
        for layer in range(n_layers):
            harmful_acts[layer] = [torch.randn(hidden_dim) for _ in range(5)]
            harmless_acts[layer] = [torch.randn(hidden_dim) for _ in range(5)]
            raw = torch.randn(hidden_dim)
            directions[layer] = raw / raw.norm()

        outcome = ActivationProbe().probe_all_layers(harmful_acts, harmless_acts, directions)

        assert isinstance(outcome, ProbeResult)
        assert len(outcome.per_layer) == n_layers
        assert 0 <= outcome.refusal_elimination_score <= 1.0
        assert outcome.mean_projection_gap >= 0

    def test_res_score_range(self):
        """The refusal elimination score stays in [0, 1] across five seeds."""
        torch.manual_seed(42)
        for seed in range(5):
            torch.manual_seed(seed)
            harmful = {0: [torch.randn(8) for _ in range(3)]}
            harmless = {0: [torch.randn(8) for _ in range(3)]}
            raw = torch.randn(8)
            directions = {0: raw / raw.norm()}

            outcome = ActivationProbe().probe_all_layers(harmful, harmless, directions)
            assert 0 <= outcome.refusal_elimination_score <= 1.0

    def test_format_report(self):
        """The formatted report contains the "Refusal Elimination Score" heading."""
        torch.manual_seed(42)
        harmful = {0: [torch.randn(8) for _ in range(3)]}
        harmless = {0: [torch.randn(8) for _ in range(3)]}
        directions = {0: torch.randn(8)}

        outcome = ActivationProbe().probe_all_layers(harmful, harmless, directions)
        text = ActivationProbe.format_report(outcome)
        assert "Refusal Elimination Score" in text

    def test_empty_input(self):
        """Empty inputs give a score of 0.0 and no per-layer entries."""
        outcome = ActivationProbe().probe_all_layers({}, {}, {})
        assert outcome.refusal_elimination_score == 0.0
        assert len(outcome.per_layer) == 0
+65
View File
@@ -0,0 +1,65 @@
"""Tests for shared analysis utilities (gini_coefficient, etc.)."""
from __future__ import annotations
import pytest
from obliteratus.analysis.utils import gini_coefficient
class TestGiniCoefficient:
    """Behavioural checks for ``gini_coefficient`` on small hand-built inputs."""

    def test_empty_list(self):
        """An empty input is expected to give 0.0."""
        result = gini_coefficient([])
        assert result == 0.0

    def test_single_value(self):
        """A single value is expected to give 0.0."""
        result = gini_coefficient([42.0])
        assert result == 0.0

    def test_uniform_distribution(self):
        """All-equal values → Gini = 0."""
        equal = [1.0, 1.0, 1.0, 1.0]
        assert gini_coefficient(equal) == pytest.approx(0.0, abs=1e-10)

    def test_maximally_concentrated(self):
        """One non-zero value among four → Gini above 0.7."""
        concentrated = [100.0, 0.0, 0.0, 0.0]
        # For n=4, max Gini = (n-1)/n = 0.75
        assert gini_coefficient(concentrated) > 0.7

    def test_all_zeros(self):
        """An all-zero input is expected to give 0.0."""
        result = gini_coefficient([0.0, 0.0, 0.0])
        assert result == 0.0

    def test_two_equal_values(self):
        """Two equal values → Gini = 0."""
        pair = [5.0, 5.0]
        assert gini_coefficient(pair) == pytest.approx(0.0, abs=1e-10)

    def test_two_unequal_values(self):
        """[0, 10] → Gini = 0.5 for n=2."""
        pair = [0.0, 10.0]
        assert gini_coefficient(pair) == pytest.approx(0.5, abs=0.01)

    def test_moderate_inequality(self):
        """A moderate spread gives a Gini strictly between 0.1 and 0.5."""
        spread = [1.0, 2.0, 3.0, 4.0, 5.0]
        value = gini_coefficient(spread)
        assert 0.1 < value < 0.5

    def test_result_in_valid_range(self):
        """Gini stays in [0, 1] for several small inputs."""
        cases = [[1, 2, 3], [0, 0, 100], [5, 5, 5], [1], [0.1, 0.9]]
        for vals in cases:
            result = gini_coefficient(vals)
            assert 0.0 <= result <= 1.0, f"Gini({vals}) = {result} out of range"

    def test_large_uniform(self):
        """1000 equal values → Gini ≈ 0."""
        uniform = [1.0] * 1000
        value = gini_coefficient(uniform)
        assert value == pytest.approx(0.0, abs=1e-10)

    def test_large_concentrated(self):
        """999 zeros plus one large value → Gini above 0.99."""
        skewed = [0.0] * 999 + [1000.0]
        assert gini_coefficient(skewed) > 0.99

    def test_order_invariant(self):
        """Permuting the input does not change the result."""
        ascending = gini_coefficient([1.0, 3.0, 5.0, 7.0])
        shuffled = gini_coefficient([7.0, 1.0, 5.0, 3.0])
        assert ascending == pytest.approx(shuffled)
+598
View File
@@ -0,0 +1,598 @@
"""Tests for architecture-aware preset defaults.
Tests the detection logic and recommended parameter overrides for each
architecture class (dense/MoE, standard/reasoning).
"""
from __future__ import annotations
from obliteratus.architecture_profiles import (
ArchitectureClass,
ArchitectureProfile,
ReasoningClass,
detect_architecture,
get_profile_summary,
apply_profile_to_method_config,
)
# ---------------------------------------------------------------------------
# Detection: Dense models
# ---------------------------------------------------------------------------
class TestDenseDetection:
    """Checks that ``detect_architecture`` classifies well-known dense model ids as dense."""

    def test_llama_is_dense(self):
        """A Llama 3.1 instruct id is dense, standard-reasoning and not MoE."""
        detected = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
        assert detected.arch_class == ArchitectureClass.DENSE
        assert detected.reasoning_class == ReasoningClass.STANDARD
        assert not detected.is_moe

    def test_qwen_dense_is_dense(self):
        """A Qwen2.5 7B instruct id is dense and not MoE."""
        detected = detect_architecture("Qwen/Qwen2.5-7B-Instruct")
        assert detected.arch_class == ArchitectureClass.DENSE
        assert not detected.is_moe

    def test_gemma_is_dense(self):
        """A Gemma 3 27B id is dense."""
        detected = detect_architecture("google/gemma-3-27b-it")
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_phi_is_dense(self):
        """A Phi-4 mini instruct id is dense."""
        detected = detect_architecture("microsoft/Phi-4-mini-instruct")
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_mistral_small_is_dense(self):
        """A Mistral Small 24B instruct id is dense."""
        detected = detect_architecture("mistralai/Mistral-Small-24B-Instruct-2501")
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_yi_is_dense(self):
        """A Yi 1.5 9B chat id is dense."""
        detected = detect_architecture("01-ai/Yi-1.5-9B-Chat")
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_dense_label(self):
        """The profile label for a dense standard model is "Dense Standard"."""
        detected = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
        label = detected.profile_label
        assert label == "Dense Standard"

    def test_dense_recommended_method(self):
        """The recommended method for a dense standard model is "aggressive"."""
        detected = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
        method = detected.recommended_method
        assert method == "aggressive"
# ---------------------------------------------------------------------------
# Detection: MoE models
# ---------------------------------------------------------------------------
class TestMoEDetection:
    """Checks that mixture-of-experts checkpoints are flagged as MoE."""

    _GPT_OSS_ID = "openai/gpt-oss-20b"

    def test_gpt_oss_is_moe(self):
        """GPT-OSS by name only: MoE, expected to be classified SMALL_MOE."""
        detected = detect_architecture(self._GPT_OSS_ID)
        assert detected.is_moe
        assert detected.arch_class == ArchitectureClass.SMALL_MOE

    def test_qwen3_30b_is_small_moe(self):
        """Qwen3 30B-A3B is detected as MoE."""
        assert detect_architecture("Qwen/Qwen3-30B-A3B").is_moe

    def test_deepseek_v3_is_large_moe(self):
        """DeepSeek V3.2 is detected as MoE."""
        assert detect_architecture("deepseek-ai/DeepSeek-V3.2").is_moe

    def test_kimi_k2_is_large_moe(self):
        """Kimi K2 Instruct is detected as MoE."""
        assert detect_architecture("moonshotai/Kimi-K2-Instruct").is_moe

    def test_qwen3_235b_is_moe(self):
        """Qwen3 235B-A22B is detected as MoE."""
        assert detect_architecture("Qwen/Qwen3-235B-A22B").is_moe

    def test_glm_47_is_moe(self):
        """GLM 4.7 is detected as MoE."""
        assert detect_architecture("zai-org/GLM-4.7").is_moe

    def test_llama4_maverick_is_moe(self):
        """Llama 4 Maverick is detected as MoE."""
        model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
        assert detect_architecture(model_id).is_moe

    def test_step_flash_is_moe(self):
        """Step 3.5 Flash is detected as MoE."""
        assert detect_architecture("stepfun-ai/Step-3.5-Flash").is_moe

    def test_minimax_is_moe(self):
        """MiniMax M2.1 is detected as MoE."""
        assert detect_architecture("MiniMaxAI/MiniMax-M2.1").is_moe

    def test_mistral_large_3_is_moe(self):
        """Mistral Large 3 is detected as MoE."""
        model_id = "mistralai/Mistral-Large-3-675B-Instruct-2512"
        assert detect_architecture(model_id).is_moe

    def test_moe_recommended_method_is_surgical(self):
        """An MoE profile (GPT-OSS here) recommends the 'surgical' method."""
        method = detect_architecture(self._GPT_OSS_ID).recommended_method
        assert method == "surgical"

    def test_gpt_oss_with_config_is_small_moe(self):
        """GPT-OSS plus a config exposing 8 experts is classified SMALL_MOE."""

        class MockConfig:
            model_type = "gpt_neox"
            num_hidden_layers = 32
            hidden_size = 2560
            intermediate_size = 6912
            vocab_size = 50304
            num_local_experts = 8
            num_experts_per_tok = 2

        cfg = MockConfig()
        detected = detect_architecture(self._GPT_OSS_ID, config=cfg)
        assert detected.is_moe
        assert detected.arch_class == ArchitectureClass.SMALL_MOE
# ---------------------------------------------------------------------------
# Detection: Reasoning models
# ---------------------------------------------------------------------------
class TestReasoningDetection:
    """Checks the reasoning / standard split made by detect_architecture."""

    _R1_QWEN_7B = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

    def test_r1_distill_qwen_is_reasoning(self):
        """R1-Distill-Qwen is a reasoning model."""
        detected = detect_architecture(self._R1_QWEN_7B)
        assert detected.reasoning_class == ReasoningClass.REASONING

    def test_r1_distill_llama_is_reasoning(self):
        """R1-Distill-Llama is a reasoning model."""
        detected = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
        assert detected.reasoning_class == ReasoningClass.REASONING

    def test_r1_distill_is_dense_reasoning(self):
        """R1 distills are dense reasoning models (label 'Dense Reasoning')."""
        detected = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
        assert detected.arch_class == ArchitectureClass.DENSE
        assert detected.reasoning_class == ReasoningClass.REASONING
        assert detected.profile_label == "Dense Reasoning"

    def test_olmo_think_is_reasoning(self):
        """The OLMo 'Think' variant is a reasoning model."""
        detected = detect_architecture("allenai/Olmo-3.1-32B-Think")
        assert detected.reasoning_class == ReasoningClass.REASONING

    def test_olmo_standard_is_not_reasoning(self):
        """Plain OLMo (no 'Think') must stay in the STANDARD class.

        Regression guard: the model name must not be mistaken for an
        'o1'-style reasoning model.
        """
        detected = detect_architecture("allenai/Olmo-3-7B-Instruct")
        assert detected.reasoning_class == ReasoningClass.STANDARD

    def test_falcon3_is_not_reasoning(self):
        """'Falcon3' must not trip the 'o3' reasoning pattern."""
        detected = detect_architecture("tiiuae/Falcon3-7B-Instruct")
        assert detected.reasoning_class == ReasoningClass.STANDARD

    def test_full_r1_is_moe_reasoning(self):
        """The full DeepSeek-R1 checkpoint is both MoE and reasoning."""
        detected = detect_architecture("deepseek-ai/DeepSeek-R1")
        assert detected.is_moe
        assert detected.reasoning_class == ReasoningClass.REASONING

    def test_reasoning_dense_more_directions(self):
        """Dense reasoning profiles override n_directions to at least 12."""
        detected = detect_architecture(self._R1_QWEN_7B)
        assert detected.arch_class == ArchitectureClass.DENSE
        n_directions = detected.method_overrides.get("n_directions", 0)
        assert n_directions >= 12

    def test_reasoning_dense_more_passes(self):
        """Dense reasoning profiles override refinement_passes to at least 4."""
        detected = detect_architecture(self._R1_QWEN_7B)
        assert detected.arch_class == ArchitectureClass.DENSE
        passes = detected.method_overrides.get("refinement_passes", 0)
        assert passes >= 4

    def test_non_reasoning_is_standard(self):
        """A plain instruct model falls in the STANDARD reasoning class."""
        detected = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
        assert detected.reasoning_class == ReasoningClass.STANDARD
# ---------------------------------------------------------------------------
# Detection with config object
# ---------------------------------------------------------------------------
class TestConfigDetection:
    """Detection driven by a mock config object rather than the name alone."""

    def test_moe_config_attrs(self):
        """A config exposing num_local_experts is MoE and reports expert counts."""

        class MockConfig:
            model_type = "mixtral"
            num_hidden_layers = 32
            hidden_size = 4096
            intermediate_size = 14336
            vocab_size = 32000
            num_local_experts = 8
            num_experts_per_tok = 2

        detected = detect_architecture(
            "custom/mixtral-model",
            config=MockConfig(),
            num_layers=32,
            hidden_size=4096,
        )
        assert detected.is_moe
        assert detected.num_experts == 8
        assert detected.num_active_experts == 2

    def test_large_moe_threshold(self):
        """A DeepSeek-V3-sized config (256 routed experts) is LARGE_MOE."""

        class MockConfig:
            model_type = "deepseek_v3"
            num_hidden_layers = 61
            hidden_size = 7168
            intermediate_size = 18432
            vocab_size = 102400
            n_routed_experts = 256
            num_experts_per_tok = 8

        detected = detect_architecture("custom/large-moe", config=MockConfig())
        assert detected.arch_class == ArchitectureClass.LARGE_MOE

    def test_small_moe_threshold(self):
        """A Mixtral-sized config with 8 experts is SMALL_MOE."""

        class MockConfig:
            model_type = "mixtral"
            num_hidden_layers = 32
            hidden_size = 4096
            intermediate_size = 14336
            vocab_size = 32000
            num_local_experts = 8
            num_experts_per_tok = 2

        detected = detect_architecture("custom/small-moe", config=MockConfig())
        assert detected.arch_class == ArchitectureClass.SMALL_MOE

    def test_dense_config(self):
        """A config with no expert attributes is dense and not MoE."""

        class MockConfig:
            model_type = "llama"
            num_hidden_layers = 32
            hidden_size = 4096
            intermediate_size = 11008
            vocab_size = 32000

        detected = detect_architecture("custom/dense-model", config=MockConfig())
        assert detected.arch_class == ArchitectureClass.DENSE
        assert not detected.is_moe

    def test_llama4_scout_is_large_moe(self):
        """Llama 4 Scout (16 experts) must still be classified LARGE_MOE.

        Regression guard: a large total parameter count has to win over
        the low expert count.
        """

        class MockConfig:
            model_type = "llama4"
            num_hidden_layers = 48
            hidden_size = 5120
            intermediate_size = 14336
            vocab_size = 202048
            num_local_experts = 16
            num_experts_per_tok = 1

        detected = detect_architecture(
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            config=MockConfig(),
        )
        assert detected.is_moe
        assert detected.arch_class == ArchitectureClass.LARGE_MOE
# ---------------------------------------------------------------------------
# Recommended defaults validation
# ---------------------------------------------------------------------------
class TestRecommendedDefaults:
    """Checks the per-profile recommended overrides and module switches."""

    _DENSE_ID = "meta-llama/Llama-3.1-8B-Instruct"
    _SMALL_MOE_ID = "openai/gpt-oss-20b"

    def test_dense_standard_no_riemannian(self):
        """Dense Standard: the 'riemannian' module is switched off."""
        modules = detect_architecture(self._DENSE_ID).breakthrough_modules
        assert not modules.get("riemannian", True)

    def test_dense_standard_anti_ouroboros_on(self):
        """Dense Standard: the 'anti_ouroboros' module is switched on."""
        modules = detect_architecture(self._DENSE_ID).breakthrough_modules
        assert modules.get("anti_ouroboros", False)

    def test_dense_standard_spectral_cert_on(self):
        """Dense Standard: the 'spectral_cert' module is switched on."""
        modules = detect_architecture(self._DENSE_ID).breakthrough_modules
        assert modules.get("spectral_cert", False)

    def test_moe_conditional_on(self):
        """MoE: the 'conditional' module is switched on."""
        modules = detect_architecture(self._SMALL_MOE_ID).breakthrough_modules
        assert modules.get("conditional", False)

    def test_moe_no_project_embeddings(self):
        """MoE: the 'project_embeddings' override is switched off."""
        overrides = detect_architecture(self._SMALL_MOE_ID).method_overrides
        assert not overrides.get("project_embeddings", True)

    def test_moe_per_expert_directions(self):
        """MoE: the 'per_expert_directions' override is switched on."""
        overrides = detect_architecture(self._SMALL_MOE_ID).method_overrides
        assert overrides.get("per_expert_directions", False)

    def test_large_moe_riemannian_on(self):
        """Large MoE: the 'riemannian' module is switched on."""
        modules = detect_architecture("deepseek-ai/DeepSeek-V3.2").breakthrough_modules
        assert modules.get("riemannian", False)

    def test_reasoning_dense_jailbreak_contrast(self):
        """Reasoning Dense: 'use_jailbreak_contrast' is switched on."""
        detected = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
        assert detected.method_overrides.get("use_jailbreak_contrast", False)

    def test_reasoning_moe_gentle_transplant(self):
        """Reasoning MoE: 'transplant_blend' is kept at or below 0.10."""
        overrides = detect_architecture("deepseek-ai/DeepSeek-R1").method_overrides
        blend = overrides.get("transplant_blend", 1.0)
        assert blend <= 0.10
# ---------------------------------------------------------------------------
# Profile summary
# ---------------------------------------------------------------------------
class TestProfileSummary:
    """Checks the text produced by get_profile_summary."""

    @staticmethod
    def _summary_for(model_id):
        """Detect the profile for *model_id* and return its summary text."""
        return get_profile_summary(detect_architecture(model_id))

    def test_summary_contains_profile_label(self):
        """The summary mentions the profile label."""
        text = self._summary_for("meta-llama/Llama-3.1-8B-Instruct")
        assert "Dense Standard" in text

    def test_summary_contains_method(self):
        """The summary mentions the recommended method."""
        text = self._summary_for("meta-llama/Llama-3.1-8B-Instruct")
        assert "aggressive" in text

    def test_summary_contains_citations(self):
        """The MoE summary cites at least one of the expected sources."""
        text = self._summary_for("openai/gpt-oss-20b")
        assert "SAFEx" in text or "Cracken" in text

    def test_summary_contains_moe_info(self):
        """The MoE summary mentions 'MoE'."""
        text = self._summary_for("openai/gpt-oss-20b")
        assert "MoE" in text

    def test_summary_contains_breakthrough_modules(self):
        """The MoE summary lists the 'conditional' module."""
        text = self._summary_for("openai/gpt-oss-20b")
        assert "conditional" in text
# ---------------------------------------------------------------------------
# apply_profile_to_method_config
# ---------------------------------------------------------------------------
class TestApplyProfile:
    """Checks how apply_profile_to_method_config merges profile overrides."""

    def test_overrides_applied(self):
        """A profile override replaces the base method value."""
        from obliteratus.abliterate import METHODS

        detected = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
        base_cfg = dict(METHODS["aggressive"])
        result = apply_profile_to_method_config(detected, base_cfg)
        expected = detected.method_overrides["n_directions"]
        assert result["n_directions"] == expected

    def test_non_overridden_preserved(self):
        """Keys the profile does not override keep their base values."""
        from obliteratus.abliterate import METHODS

        detected = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
        base_cfg = dict(METHODS["aggressive"])
        result = apply_profile_to_method_config(detected, base_cfg)
        # norm_preserve is expected to be absent from the overrides, so the
        # merged value must be the base one.
        assert result["norm_preserve"] == base_cfg["norm_preserve"]

    def test_empty_overrides(self):
        """A profile with no overrides leaves the config unchanged."""
        from obliteratus.abliterate import METHODS

        base_cfg = dict(METHODS["advanced"])
        blank_profile = ArchitectureProfile(
            arch_class=ArchitectureClass.DENSE,
            reasoning_class=ReasoningClass.STANDARD,
            method_overrides={},
            breakthrough_modules={},
        )
        result = apply_profile_to_method_config(blank_profile, base_cfg)
        assert result == base_cfg

    def test_override_key_not_in_base_is_added(self):
        """Override keys missing from the base config are added to the result.

        The original test notes this matters for the UI auto-detect path:
        a key such as use_jailbreak_contrast may be absent from the base
        method config yet still be read from the merged config downstream.
        """
        from obliteratus.abliterate import METHODS

        base_cfg = dict(METHODS["advanced"])
        extra_key_profile = ArchitectureProfile(
            arch_class=ArchitectureClass.DENSE,
            reasoning_class=ReasoningClass.STANDARD,
            method_overrides={"use_jailbreak_contrast": True},
            breakthrough_modules={},
        )
        result = apply_profile_to_method_config(extra_key_profile, base_cfg)
        assert result["use_jailbreak_contrast"] is True
# ---------------------------------------------------------------------------
# All 6 profile combinations
# ---------------------------------------------------------------------------
class TestAllSixProfiles:
    """Full check of label, method, overrides and modules for each profile."""

    def _make_moe_config(self, num_experts=8, active=2, layers=32, hidden=4096):
        """Build a Mixtral-style mock config with the given MoE dimensions."""

        class C:
            model_type = "mixtral"
            num_hidden_layers = layers
            hidden_size = hidden
            intermediate_size = hidden * 4
            vocab_size = 32000
            num_local_experts = num_experts
            num_experts_per_tok = active

        return C()

    def test_dense_standard_full(self):
        """Dense Standard profile, checked end to end."""
        p = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
        modules = p.breakthrough_modules
        assert p.profile_label == "Dense Standard"
        assert p.recommended_method == "aggressive"
        assert not modules["riemannian"]
        assert modules["anti_ouroboros"]
        assert modules["spectral_cert"]
        assert not modules["conditional"]
        assert len(p.profile_description) > 0
        assert len(p.research_citations) > 0

    def test_dense_reasoning_full(self):
        """Dense Reasoning profile, checked end to end."""
        p = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
        assert p.profile_label == "Dense Reasoning"
        assert p.recommended_method == "aggressive"
        assert p.method_overrides["n_directions"] >= 12
        assert p.method_overrides["refinement_passes"] >= 4
        assert p.method_overrides["use_jailbreak_contrast"] is True
        assert p.method_overrides["use_chat_template"] is True
        for module in ("anti_ouroboros", "riemannian", "conditional", "spectral_cert"):
            assert p.breakthrough_modules[module]
        assert len(p.profile_description) > 0

    def test_small_moe_standard_full(self):
        """Small MoE Standard profile, checked end to end."""
        cfg = self._make_moe_config(num_experts=8, active=2)
        p = detect_architecture("custom/small-moe-model", config=cfg)
        assert p.profile_label == "Small MoE Standard"
        assert p.arch_class == ArchitectureClass.SMALL_MOE
        assert p.recommended_method == "surgical"
        assert p.method_overrides["per_expert_directions"] is True
        assert p.method_overrides["invert_refusal"] is False
        assert p.method_overrides["project_embeddings"] is False
        for module in ("conditional", "anti_ouroboros", "spectral_cert"):
            assert p.breakthrough_modules[module]
        assert not p.breakthrough_modules["riemannian"]
        assert len(p.profile_description) > 0

    def test_small_moe_reasoning_full(self):
        """Small MoE Reasoning profile (described as the most fragile combination)."""
        cfg = self._make_moe_config(num_experts=8, active=2)
        # "think" in the model name is what triggers reasoning detection.
        p = detect_architecture("custom/small-moe-think-model", config=cfg)
        assert p.profile_label == "Small MoE Reasoning"
        assert p.arch_class == ArchitectureClass.SMALL_MOE
        assert p.reasoning_class == ReasoningClass.REASONING
        assert p.recommended_method == "surgical"
        assert p.method_overrides["per_expert_directions"] is True
        assert p.method_overrides["use_jailbreak_contrast"] is True
        assert p.method_overrides["use_chat_template"] is True
        assert p.method_overrides["invert_refusal"] is False
        for module in ("conditional", "anti_ouroboros", "spectral_cert"):
            assert p.breakthrough_modules[module]
        assert len(p.profile_description) > 0

    def test_large_moe_standard_full(self):
        """Large MoE Standard profile, checked end to end."""
        cfg = self._make_moe_config(num_experts=256, active=8, layers=61, hidden=7168)
        p = detect_architecture("custom/large-moe-model", config=cfg)
        assert p.profile_label == "Large MoE Standard"
        assert p.arch_class == ArchitectureClass.LARGE_MOE
        assert p.recommended_method == "surgical"
        assert p.method_overrides["per_expert_directions"] is True
        assert p.method_overrides["layer_adaptive_strength"] is True
        assert p.method_overrides["expert_transplant"] is True
        assert p.method_overrides["transplant_blend"] == 0.10
        assert p.method_overrides["attention_head_surgery"] is True
        assert p.method_overrides["project_embeddings"] is False
        for module in ("conditional", "riemannian", "anti_ouroboros", "spectral_cert"):
            assert p.breakthrough_modules[module]
        assert len(p.profile_description) > 0

    def test_large_moe_reasoning_full(self):
        """Large MoE Reasoning profile, checked end to end."""
        cfg = self._make_moe_config(num_experts=256, active=8, layers=61, hidden=7168)
        p = detect_architecture("custom/large-moe-r1-model", config=cfg)
        assert p.profile_label == "Large MoE Reasoning"
        assert p.arch_class == ArchitectureClass.LARGE_MOE
        assert p.reasoning_class == ReasoningClass.REASONING
        assert p.recommended_method == "surgical"
        assert p.method_overrides["n_directions"] == 8
        assert p.method_overrides["transplant_blend"] == 0.08
        assert p.method_overrides["use_jailbreak_contrast"] is True
        assert p.method_overrides["safety_neuron_masking"] is True
        for module in ("conditional", "riemannian", "anti_ouroboros", "spectral_cert"):
            assert p.breakthrough_modules[module]
        assert len(p.profile_description) > 0
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------
class TestEdgeCases:
    """Boundary inputs for architecture detection."""

    def test_empty_model_name(self):
        """An empty model name falls through to Dense / Standard."""
        detected = detect_architecture("")
        assert detected.arch_class == ArchitectureClass.DENSE
        assert detected.reasoning_class == ReasoningClass.STANDARD

    def test_unknown_model_type_in_config(self):
        """An unrecognised model_type must not be classified as MoE."""

        class MockConfig:
            model_type = "banana"
            num_hidden_layers = 12
            hidden_size = 768
            intermediate_size = 3072
            vocab_size = 30522

        cfg = MockConfig()
        detected = detect_architecture("custom/unknown-arch", config=cfg)
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_config_with_zero_experts(self):
        """num_local_experts=0 must not trigger MoE detection."""

        class MockConfig:
            model_type = "llama"
            num_hidden_layers = 32
            hidden_size = 4096
            intermediate_size = 11008
            vocab_size = 32000
            num_local_experts = 0

        cfg = MockConfig()
        detected = detect_architecture("custom/dense-with-zero", config=cfg)
        assert not detected.is_moe
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_allcaps_model_name(self):
        """Name matching is case-insensitive: an all-caps name still matches."""
        detected = detect_architecture("DEEPSEEK-AI/DEEPSEEK-R1-DISTILL-QWEN-7B")
        assert detected.reasoning_class == ReasoningClass.REASONING
        # A distill is expected to be dense.
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_single_expert_is_moe(self):
        """num_local_experts=1 is still reported as MoE."""

        class MockConfig:
            model_type = "llama"
            num_hidden_layers = 32
            hidden_size = 4096
            intermediate_size = 11008
            vocab_size = 32000
            num_local_experts = 1

        cfg = MockConfig()
        detected = detect_architecture("custom/single-expert", config=cfg)
        # Per the original test note, any expert count above zero is
        # treated as MoE by the detection code.
        assert detected.is_moe
+183
View File
@@ -0,0 +1,183 @@
"""Tests for lightweight benchmark harnesses."""
from __future__ import annotations
from unittest.mock import MagicMock
import torch
from obliteratus.evaluation.benchmarks import (
KNOWLEDGE_ITEMS,
TRUTHFULNESS_ITEMS,
MATH_REASONING_ITEMS,
BenchmarkRunner,
BenchmarkResult,
format_benchmark_report,
)
def _make_mock_model_and_tokenizer(vocab_size=1000, hidden_dim=64):
    """Create a mock model and tokenizer for benchmark testing.

    Args:
        vocab_size: Size of the fake vocabulary; bounds the random token IDs
            and sets the last dimension of the returned logits.
        hidden_dim: Accepted for interface compatibility; not used here.

    Returns:
        A ``(model, tokenizer)`` pair of ``MagicMock`` objects:

        * calling ``model(**kwargs)`` returns an object whose ``logits`` is a
          random ``(batch, seq_len, vocab_size)`` tensor;
        * ``model.generate(**kwargs)`` returns ``input_ids`` with 20 random
          token IDs appended along dim 1;
        * ``model.parameters()`` returns a fresh iterator over one parameter
          on every call;
        * calling ``tokenizer(text, **kwargs)`` returns a dict holding
          ``input_ids`` and ``attention_mask`` of shape ``(1, 15)``;
        * ``tokenizer.decode`` returns a fixed sentence and
          ``tokenizer.encode`` returns a single-element ID list.
    """
    model = MagicMock()

    # Model returns logits when called.
    def mock_forward(**kwargs):
        input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10)))
        batch_size, seq_len = input_ids.shape
        result = MagicMock()
        result.logits = torch.randn(batch_size, seq_len, vocab_size)
        return result

    # ``model(...)`` is routed through side_effect. The instance-level
    # ``__call__`` attribute is kept from the original; Python looks the
    # call protocol up on the type, so it only serves explicit
    # ``model.__call__(...)`` access.
    model.side_effect = mock_forward
    model.__call__ = mock_forward

    # Model.generate returns token IDs.
    def mock_generate(**kwargs):
        input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10)))
        # Append some "generated" tokens.
        gen_tokens = torch.randint(0, vocab_size, (1, 20))
        return torch.cat([input_ids, gen_tokens], dim=1)

    model.generate = mock_generate

    # Model.parameters for device detection. A plain
    # ``return_value=iter([...])`` would hand back the SAME one-shot iterator
    # on every call, so any second ``next(model.parameters())`` would raise
    # StopIteration. Build a fresh iterator per call instead.
    param = torch.nn.Parameter(torch.randn(1))
    model.parameters = MagicMock(side_effect=lambda *args, **kwargs: iter([param]))

    tokenizer = MagicMock()
    tokenizer.return_value = {
        "input_ids": torch.randint(0, vocab_size, (1, 15)),
        "attention_mask": torch.ones(1, 15, dtype=torch.long),
    }
    # side_effect takes precedence over return_value, so each tokenizer call
    # gets freshly sampled IDs.
    tokenizer.side_effect = lambda text, **kwargs: {
        "input_ids": torch.randint(0, vocab_size, (1, 15)),
        "attention_mask": torch.ones(1, 15, dtype=torch.long),
    }

    def mock_decode(ids, **kwargs):
        return "The answer is 42. This is a generated response about the topic."

    # Fixed, distinct IDs for the multiple-choice letters.
    choice_ids = {"A": 65, "B": 66, "C": 67, "D": 68}

    def mock_encode(text, **kwargs):
        # Return different IDs for A, B, C, D; anything else is hashed into
        # the vocabulary range.
        if text in choice_ids:
            return [choice_ids[text]]
        return [hash(text) % vocab_size]

    tokenizer.decode = mock_decode
    tokenizer.encode = mock_encode
    return model, tokenizer
class TestBenchmarkItems:
    """Sanity checks on the static benchmark item tables."""

    def test_knowledge_items_have_required_fields(self):
        """Each knowledge item has all keys and a valid answer index."""
        for item in KNOWLEDGE_ITEMS:
            for key in ("q", "choices", "answer", "category"):
                assert key in item
            assert 0 <= item["answer"] < len(item["choices"])

    def test_knowledge_items_count(self):
        """There are at least 20 knowledge items."""
        assert len(KNOWLEDGE_ITEMS) >= 20

    def test_knowledge_categories(self):
        """Knowledge items span at least four distinct categories."""
        categories = {item["category"] for item in KNOWLEDGE_ITEMS}
        assert len(categories) >= 4  # multiple categories

    def test_truthfulness_items_have_required_fields(self):
        """Each truthfulness item has all required keys."""
        for item in TRUTHFULNESS_ITEMS:
            for key in ("q", "true_answer", "common_false", "category"):
                assert key in item

    def test_truthfulness_items_count(self):
        """There are at least 10 truthfulness items."""
        assert len(TRUTHFULNESS_ITEMS) >= 10

    def test_math_items_have_required_fields(self):
        """Each math item has all required keys and a numeric answer."""
        for item in MATH_REASONING_ITEMS:
            for key in ("q", "answer", "category"):
                assert key in item
            assert isinstance(item["answer"], (int, float))

    def test_math_items_count(self):
        """There are at least 10 math reasoning items."""
        assert len(MATH_REASONING_ITEMS) >= 10
class TestBenchmarkRunner:
    """Runs BenchmarkRunner against the mock model and tokenizer."""

    @staticmethod
    def _cpu_runner():
        """Build a BenchmarkRunner on CPU around a fresh mock model/tokenizer."""
        model, tokenizer = _make_mock_model_and_tokenizer()
        return BenchmarkRunner(model, tokenizer, device="cpu")

    def test_knowledge_probe_returns_result(self):
        """The knowledge probe returns a populated BenchmarkResult."""
        outcome = self._cpu_runner().run_knowledge_probe()
        assert isinstance(outcome, BenchmarkResult)
        assert outcome.benchmark_name == "knowledge_probe"
        assert 0 <= outcome.score <= 1.0
        assert outcome.n_total == len(KNOWLEDGE_ITEMS)
        assert outcome.n_correct >= 0
        assert len(outcome.per_category) > 0

    def test_truthfulness_probe_returns_result(self):
        """The truthfulness probe returns a bounded BenchmarkResult."""
        outcome = self._cpu_runner().run_truthfulness_probe()
        assert isinstance(outcome, BenchmarkResult)
        assert outcome.benchmark_name == "truthfulness_probe"
        assert 0 <= outcome.score <= 1.0
        assert outcome.n_total == len(TRUTHFULNESS_ITEMS)

    def test_math_probe_returns_result(self):
        """The math reasoning probe returns a bounded BenchmarkResult."""
        outcome = self._cpu_runner().run_math_reasoning_probe()
        assert isinstance(outcome, BenchmarkResult)
        assert outcome.benchmark_name == "math_reasoning_probe"
        assert 0 <= outcome.score <= 1.0
        assert outcome.n_total == len(MATH_REASONING_ITEMS)

    def test_run_all(self):
        """run_all reports all three benchmark families."""
        all_results = self._cpu_runner().run_all()
        for key in ("knowledge", "truthfulness", "math_reasoning"):
            assert key in all_results

    def test_format_report(self):
        """The formatted report mentions its header and each benchmark."""
        all_results = self._cpu_runner().run_all()
        text = format_benchmark_report(all_results)
        for fragment in ("Capability", "knowledge", "truthfulness", "math"):
            assert fragment in text

    def test_per_category_scores_bounded(self):
        """Every per-category knowledge score lies in [0, 1]."""
        outcome = self._cpu_runner().run_knowledge_probe()
        for _category, score in outcome.per_category.items():
            assert 0 <= score <= 1.0

    def test_extract_number(self):
        """_extract_number returns the number found, or None if there is none."""
        runner = self._cpu_runner()
        assert runner._extract_number("The answer is 42.") == 42.0
        assert runner._extract_number("$20.50 is the price") == 20.50
        assert runner._extract_number("Result: -3.14") == -3.14
        assert runner._extract_number("No numbers here") is None
+535
View File
@@ -0,0 +1,535 @@
"""Tests for causal tracing, residual stream decomposition,
probing classifiers, and cross-model transfer analysis."""
from __future__ import annotations
import math
import torch
from obliteratus.analysis.causal_tracing import (
CausalRefusalTracer,
CausalTracingResult,
ComponentCausalEffect,
)
from obliteratus.analysis.residual_stream import (
ResidualStreamDecomposer,
ResidualStreamResult,
LayerDecomposition,
)
from obliteratus.analysis.probing_classifiers import (
LinearRefusalProbe,
ProbeResult,
ProbingSuiteResult,
)
from obliteratus.analysis.cross_model_transfer import (
TransferAnalyzer,
CrossModelResult,
CrossCategoryResult,
CrossLayerResult,
UniversalityReport,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_layer_activations(
    n_layers=8, hidden_dim=32, refusal_strength=2.0,
):
    """Build synthetic per-layer activations carrying a planted refusal signal.

    Reseeds torch's global RNG with 42, so repeated calls with the same
    arguments return identical tensors.

    Args:
        n_layers: Number of layers to generate (keys ``0 .. n_layers - 1``).
        hidden_dim: Length of every activation / direction vector.
        refusal_strength: Scale of the planted direction in layers 2 to 5;
            all other layers use a fixed scale of 0.3.

    Returns:
        ``(activations, directions)``: two dicts keyed by layer index.
        ``directions[i]`` is a unit-norm vector and ``activations[i]`` is a
        shared base vector plus the scaled direction plus small noise.
    """
    torch.manual_seed(42)
    base = torch.randn(hidden_dim) * 0.1
    directions, activations = {}, {}
    for layer in range(n_layers):
        raw = torch.randn(hidden_dim)
        unit = raw / raw.norm()
        directions[layer] = unit
        # Stronger refusal in the middle layers (2 to 5 inclusive).
        in_middle = 2 <= layer <= 5
        strength = refusal_strength if in_middle else 0.3
        noise = torch.randn(hidden_dim) * 0.05
        activations[layer] = base + strength * unit + noise
    return activations, directions
def _make_separable_activations(
    n_per_class=20, hidden_dim=16, separation=3.0, seed=42,
):
    """Build harmful / harmless activation sets that are linearly separable.

    Reseeds torch's global RNG with *seed*, then places the two classes on
    opposite sides of a random unit direction.

    Args:
        n_per_class: Number of vectors generated for each class.
        hidden_dim: Length of every vector.
        separation: Distance of each class centre from the origin along
            the direction.
        seed: Value passed to ``torch.manual_seed``.

    Returns:
        ``(harmful, harmless, direction)``: two lists of tensors and the
        unit-norm separating direction.
    """
    torch.manual_seed(seed)
    direction = torch.randn(hidden_dim)
    direction = direction / direction.norm()
    centre = separation * direction

    def _sample_cluster(offset):
        # Isotropic noise (scale 0.5) around the given class centre.
        return [torch.randn(hidden_dim) * 0.5 + offset for _ in range(n_per_class)]

    # Harmful samples are drawn first, then harmless, to keep the RNG order.
    harmful = _sample_cluster(centre)
    harmless = _sample_cluster(-centre)
    return harmful, harmless, direction
# ===========================================================================
# Tests: Causal Tracing
# ===========================================================================
class TestCausalTracing:
    """Exercises CausalRefusalTracer on synthetic per-layer activations."""

    def test_basic_tracing(self):
        """Tracing 8 layers yields a result with one effect per layer."""
        acts, dirs = _make_layer_activations()
        outcome = CausalRefusalTracer(noise_level=3.0).trace_from_activations(acts, dirs)
        assert isinstance(outcome, CausalTracingResult)
        assert outcome.n_layers == 8
        assert outcome.clean_refusal_strength > 0
        assert len(outcome.component_effects) == 8

    def test_causal_components_identified(self):
        """With a low threshold, a non-empty causal circuit is reported."""
        acts, dirs = _make_layer_activations()
        tracer = CausalRefusalTracer(noise_level=3.0, causal_threshold=0.05)
        outcome = tracer.trace_from_activations(acts, dirs)
        assert outcome.circuit_size > 0
        assert outcome.circuit_fraction > 0
        assert len(outcome.causal_components) > 0

    def test_corruption_reduces_strength(self):
        """High noise must produce a non-zero total corruption effect."""
        acts, dirs = _make_layer_activations(refusal_strength=5.0)
        outcome = CausalRefusalTracer(noise_level=10.0).trace_from_activations(acts, dirs)
        # With high noise, the corrupted run should differ from the clean one.
        assert outcome.total_corruption_effect != 0

    def test_single_direction_input(self):
        """A single direction tensor is accepted in place of a per-layer dict."""
        acts, dirs = _make_layer_activations()
        shared_direction = dirs[3]  # one direction reused for all layers
        outcome = CausalRefusalTracer().trace_from_activations(acts, shared_direction)
        assert outcome.n_layers == 8
        assert len(outcome.component_effects) == 8

    def test_component_effects_structure(self):
        """Each effect is a full-layer ComponentCausalEffect with effect >= 0."""
        acts, dirs = _make_layer_activations()
        outcome = CausalRefusalTracer().trace_from_activations(acts, dirs)
        for effect in outcome.component_effects:
            assert isinstance(effect, ComponentCausalEffect)
            assert effect.component_type == "full_layer"
            assert effect.causal_effect >= 0

    def test_correlation_causal_agreement_bounded(self):
        """The correlation/causal agreement score lies in [-1, 1]."""
        acts, dirs = _make_layer_activations()
        outcome = CausalRefusalTracer().trace_from_activations(acts, dirs)
        assert -1.0 <= outcome.correlation_causal_agreement <= 1.0

    def test_silent_contributors(self):
        """identify_silent_contributors returns both lists, capped at top_k."""
        acts, dirs = _make_layer_activations()
        tracer = CausalRefusalTracer()
        outcome = tracer.trace_from_activations(acts, dirs)
        contributors = tracer.identify_silent_contributors(outcome, top_k=3)
        assert "silent_contributors" in contributors
        assert "loud_non_contributors" in contributors
        assert len(contributors["silent_contributors"]) <= 3

    def test_custom_component_types(self):
        """Two component types over 8 layers give 16 effects."""
        acts, dirs = _make_layer_activations()
        outcome = CausalRefusalTracer().trace_from_activations(
            acts,
            dirs,
            component_types=["attention", "mlp"],
        )
        # 8 layers * 2 types = 16 effects
        assert len(outcome.component_effects) == 16

    def test_format_report(self):
        """The formatted report contains its title and the circuit size line."""
        acts, dirs = _make_layer_activations()
        outcome = CausalRefusalTracer().trace_from_activations(acts, dirs)
        text = CausalRefusalTracer.format_tracing_report(outcome)
        assert "Causal Tracing" in text
        assert "Circuit size" in text
# ===========================================================================
# Tests: Residual Stream Decomposition
# ===========================================================================
class TestResidualStreamDecomposition:
def test_basic_decomposition(self):
    """Decomposing 8 layers yields per-layer entries and positive totals."""
    acts, dirs = _make_layer_activations()
    outcome = ResidualStreamDecomposer().decompose(acts, dirs)
    assert isinstance(outcome, ResidualStreamResult)
    assert outcome.n_layers == 8
    assert len(outcome.per_layer) == 8
    assert outcome.total_attention_contribution > 0
    assert outcome.total_mlp_contribution > 0
def test_attention_fraction_bounded(self):
    """The attention fraction lies in [0, 1]."""
    acts, dirs = _make_layer_activations()
    outcome = ResidualStreamDecomposer().decompose(acts, dirs)
    assert 0 <= outcome.attention_fraction <= 1.0
def test_with_head_count(self):
activations, directions = _make_layer_activations()
decomposer = ResidualStreamDecomposer(n_heads_per_layer=4)
result = decomposer.decompose(activations, directions)
assert result.n_refusal_heads >= 0
assert len(result.refusal_heads) > 0
def test_layer_decomposition_structure(self):
activations, directions = _make_layer_activations()
decomposer = ResidualStreamDecomposer()
result = decomposer.decompose(activations, directions)
for _layer_idx, d in result.per_layer.items():
assert isinstance(d, LayerDecomposition)
assert 0 <= d.attn_mlp_ratio <= 1.0
assert d.cumulative_refusal >= 0
def test_accumulation_profile(self):
activations, directions = _make_layer_activations()
decomposer = ResidualStreamDecomposer()
result = decomposer.decompose(activations, directions)
assert len(result.accumulation_profile) == 8
# Accumulation should be monotonically non-decreasing
for i in range(1, len(result.accumulation_profile)):
assert result.accumulation_profile[i] >= result.accumulation_profile[i - 1]
def test_with_explicit_attn_mlp(self):
"""Test with provided attention and MLP outputs."""
torch.manual_seed(42)
hidden_dim = 16
n_layers = 4
ref_dir = torch.randn(hidden_dim)
ref_dir = ref_dir / ref_dir.norm()
acts = {}
attn_outs = {}
mlp_outs = {}
for i in range(n_layers):
attn = torch.randn(hidden_dim) * 0.5
mlp = torch.randn(hidden_dim) * 0.5
attn_outs[i] = attn
mlp_outs[i] = mlp
acts[i] = attn + mlp + (torch.randn(hidden_dim) * 0.1 if i == 0 else acts[i-1])
decomposer = ResidualStreamDecomposer()
result = decomposer.decompose(
acts, ref_dir,
attn_outputs=attn_outs, mlp_outputs=mlp_outs,
)
assert len(result.per_layer) == n_layers
def test_single_direction(self):
activations, _ = _make_layer_activations()
single_dir = torch.randn(32)
decomposer = ResidualStreamDecomposer()
result = decomposer.decompose(activations, single_dir)
assert result.n_layers == 8
def test_head_concentration_bounded(self):
activations, directions = _make_layer_activations()
decomposer = ResidualStreamDecomposer(n_heads_per_layer=8)
result = decomposer.decompose(activations, directions)
assert 0 <= result.head_concentration <= 1.0
def test_format_decomposition(self):
activations, directions = _make_layer_activations()
decomposer = ResidualStreamDecomposer(n_heads_per_layer=4)
result = decomposer.decompose(activations, directions)
report = ResidualStreamDecomposer.format_decomposition(result)
assert "Residual Stream" in report
assert "Attention" in report
assert "MLP" in report
# ===========================================================================
# Tests: Probing Classifiers
# ===========================================================================
class TestProbingClassifiers:
    """Exercise ``LinearRefusalProbe`` on synthetic harmful/harmless activations."""

    @staticmethod
    def _probe_default_data(n_epochs):
        """Probe one layer of the default synthetic data, passing the analytical direction."""
        harmful, harmless, analytical = _make_separable_activations()
        return LinearRefusalProbe(n_epochs=n_epochs).probe_layer(
            harmful, harmless, analytical
        )

    def test_separable_data_high_accuracy(self):
        """With well-separated data, probe should achieve high accuracy."""
        harmful, harmless, analytical = _make_separable_activations(
            n_per_class=30,
            separation=5.0,
        )
        outcome = LinearRefusalProbe(n_epochs=200).probe_layer(
            harmful, harmless, analytical, layer_idx=5
        )
        assert isinstance(outcome, ProbeResult)
        assert outcome.layer_idx == 5
        assert outcome.accuracy > 0.7  # Should be separable

    def test_inseparable_data_low_accuracy(self):
        """With overlapping data, probe should have lower accuracy."""
        harmful, harmless, analytical = _make_separable_activations(
            n_per_class=30,
            separation=0.01,
        )
        outcome = LinearRefusalProbe(n_epochs=50).probe_layer(
            harmful, harmless, analytical
        )
        # Accuracy should be near chance (0.5)
        assert outcome.accuracy < 0.9

    def test_learned_direction_unit(self):
        """The learned direction has (approximately) unit norm."""
        outcome = self._probe_default_data(n_epochs=100)
        norm_error = abs(outcome.learned_direction.norm().item() - 1.0)
        assert norm_error < 0.01

    def test_cosine_with_analytical(self):
        """Learned direction should align with analytical direction."""
        harmful, harmless, analytical = _make_separable_activations(
            n_per_class=50,
            separation=5.0,
        )
        outcome = LinearRefusalProbe(n_epochs=300).probe_layer(
            harmful, harmless, analytical
        )
        # With clear separation, learned direction should agree
        assert outcome.cosine_with_analytical > 0.3

    def test_without_analytical_direction(self):
        """Omitting the analytical direction reports a cosine of exactly 0.0."""
        harmful, harmless, _unused = _make_separable_activations()
        outcome = LinearRefusalProbe(n_epochs=50).probe_layer(harmful, harmless)
        assert outcome.cosine_with_analytical == 0.0

    def test_auroc_bounded(self):
        """AUROC lies in [0, 1]."""
        auroc = self._probe_default_data(n_epochs=100).auroc
        assert auroc >= 0
        assert auroc <= 1.0

    def test_mutual_information_nonnegative(self):
        """Mutual information is never negative."""
        outcome = self._probe_default_data(n_epochs=100)
        assert outcome.mutual_information >= 0

    def test_probe_all_layers(self):
        """Probing six layers yields a suite result with one entry per layer."""
        harmful_by_layer, harmless_by_layer, analytical_by_layer = {}, {}, {}
        for layer in range(6):
            (
                harmful_by_layer[layer],
                harmless_by_layer[layer],
                analytical_by_layer[layer],
            ) = _make_separable_activations(
                n_per_class=15,
                separation=3.0,
                seed=layer * 10,
            )
        suite = LinearRefusalProbe(n_epochs=100).probe_all_layers(
            harmful_by_layer, harmless_by_layer, analytical_by_layer
        )
        assert isinstance(suite, ProbingSuiteResult)
        assert len(suite.per_layer) == 6
        assert suite.best_accuracy > 0
        assert suite.total_mutual_information >= 0

    def test_format_report(self):
        """The probing report has its title and mentions accuracy."""
        harmful_by_layer, harmless_by_layer = {}, {}
        for layer in range(4):
            harmful, harmless, _unused = _make_separable_activations(
                n_per_class=15,
                seed=layer,
            )
            harmful_by_layer[layer] = harmful
            harmless_by_layer[layer] = harmless
        suite = LinearRefusalProbe(n_epochs=50).probe_all_layers(
            harmful_by_layer, harmless_by_layer
        )
        rendered = LinearRefusalProbe.format_probing_report(suite)
        assert "Linear Probing" in rendered
        assert "accuracy" in rendered.lower()

    def test_cross_entropy_finite(self):
        """The reported cross-entropy is a finite number."""
        outcome = self._probe_default_data(n_epochs=100)
        assert math.isfinite(outcome.cross_entropy)
# ===========================================================================
# Tests: Cross-Model Transfer Analysis
# ===========================================================================
class TestTransferAnalysis:
    """Exercise ``TransferAnalyzer`` across models, categories, and layers."""

    @staticmethod
    def _random_layer_dirs(count, dim=32):
        """Return ``{layer_index: random vector}`` for ``count`` layers, drawn in index order."""
        return {layer: torch.randn(dim) for layer in range(count)}

    @staticmethod
    def _random_category_dirs(count, dim):
        """Return ``{"cat_<i>": random vector}`` for ``count`` categories, drawn in index order."""
        return {f"cat_{idx}": torch.randn(dim) for idx in range(count)}

    def test_cross_model_identical(self):
        """Identical directions should give perfect transfer."""
        torch.manual_seed(42)
        shared_dirs = self._random_layer_dirs(8)
        outcome = TransferAnalyzer().analyze_cross_model(
            shared_dirs, shared_dirs, "model_a", "model_a"
        )
        assert isinstance(outcome, CrossModelResult)
        assert outcome.mean_transfer_score > 0.99

    def test_cross_model_random(self):
        """Random directions should give low transfer."""
        torch.manual_seed(42)
        first_dirs = self._random_layer_dirs(8)
        torch.manual_seed(99)
        second_dirs = self._random_layer_dirs(8)
        outcome = TransferAnalyzer().analyze_cross_model(
            first_dirs, second_dirs, "a", "b"
        )
        # Random 32-dim vectors have low expected cosine
        assert outcome.mean_transfer_score < 0.7

    def test_cross_model_structure(self):
        """The cross-model result has a bounded threshold fraction and 8 layer entries."""
        torch.manual_seed(42)
        first_dirs = self._random_layer_dirs(8)
        second_dirs = self._random_layer_dirs(8)
        outcome = TransferAnalyzer().analyze_cross_model(first_dirs, second_dirs)
        above = outcome.transfer_above_threshold
        assert above >= 0
        assert above <= 1.0
        assert len(outcome.per_layer_transfer) == 8

    def test_cross_category_similar(self):
        """Similar categories should cluster together."""
        torch.manual_seed(42)
        common = torch.randn(32)
        common = common / common.norm()
        by_category = {}
        for label in ("weapons", "bombs", "explosives"):
            perturbed = common + 0.2 * torch.randn(32)
            by_category[label] = perturbed / perturbed.norm()
        # Add one very different category
        by_category["fraud"] = torch.randn(32)

        outcome = TransferAnalyzer().analyze_cross_category(by_category)
        assert isinstance(outcome, CrossCategoryResult)
        assert outcome.mean_cross_category_transfer > 0
        assert len(outcome.categories) == 4

    def test_cross_category_specificity(self):
        """Universal/specific categories are named and at least one cluster is reported."""
        torch.manual_seed(42)
        by_category = self._random_category_dirs(5, 16)
        outcome = TransferAnalyzer().analyze_cross_category(by_category)
        assert outcome.most_universal_category != ""
        assert outcome.most_specific_category != ""
        assert len(outcome.category_clusters) > 0

    def test_cross_layer(self):
        """Cross-layer analysis returns non-negative adjacent transfer and decay rate."""
        _unused_acts, layer_dirs = _make_layer_activations()
        outcome = TransferAnalyzer().analyze_cross_layer(layer_dirs)
        assert isinstance(outcome, CrossLayerResult)
        assert outcome.mean_adjacent_transfer >= 0
        assert outcome.transfer_decay_rate >= 0

    def test_cross_layer_adjacent_vs_distant(self):
        """Adjacent layers typically have higher transfer than distant ones."""
        torch.manual_seed(42)
        # Create directions with gradual drift
        base = torch.randn(32)
        base = base / base.norm()
        drifting = {}
        for layer in range(10):
            jitter = torch.randn(32) * 0.1 * layer
            shifted = base + jitter
            drifting[layer] = shifted / shifted.norm()

        outcome = TransferAnalyzer().analyze_cross_layer(drifting)
        # Adjacent should have higher transfer than distant
        floor = outcome.mean_distant_transfer - 0.1
        assert outcome.mean_adjacent_transfer >= floor

    def test_universality_index(self):
        """Combining all three analyses yields a universality index in [0, 1]."""
        torch.manual_seed(42)
        layer_dirs = self._random_layer_dirs(6)
        analyzer = TransferAnalyzer()
        model_part = analyzer.analyze_cross_model(layer_dirs, layer_dirs)
        layer_part = analyzer.analyze_cross_layer(layer_dirs)
        by_category = self._random_category_dirs(4, 32)
        category_part = analyzer.analyze_cross_category(by_category)

        summary = analyzer.compute_universality_index(
            cross_model=model_part,
            cross_category=category_part,
            cross_layer=layer_part,
        )
        assert isinstance(summary, UniversalityReport)
        index = summary.universality_index
        assert index >= 0
        assert index <= 1.0

    def test_universality_empty(self):
        """With no inputs the universality index is exactly 0.0."""
        summary = TransferAnalyzer().compute_universality_index()
        assert summary.universality_index == 0.0

    def test_format_cross_model(self):
        """The cross-model report has its title and names the first model."""
        torch.manual_seed(42)
        layer_dirs = self._random_layer_dirs(4)
        outcome = TransferAnalyzer().analyze_cross_model(
            layer_dirs, layer_dirs, "llama", "mistral"
        )
        rendered = TransferAnalyzer.format_cross_model(outcome)
        assert "Cross-Model" in rendered
        assert "llama" in rendered

    def test_format_cross_category(self):
        """The cross-category report has its title."""
        torch.manual_seed(42)
        by_category = self._random_category_dirs(3, 16)
        outcome = TransferAnalyzer().analyze_cross_category(by_category)
        rendered = TransferAnalyzer.format_cross_category(outcome)
        assert "Cross-Category" in rendered

    def test_format_universality(self):
        """The universality report has its title even for an empty analysis."""
        summary = TransferAnalyzer().compute_universality_index()
        rendered = TransferAnalyzer.format_universality(summary)
        assert "Universality" in rendered

    def test_dimension_mismatch_handled(self):
        """Cross-model with different hidden dims should truncate."""
        narrow_dirs = self._random_layer_dirs(2, dim=32)
        wide_dirs = self._random_layer_dirs(2, dim=64)
        outcome = TransferAnalyzer().analyze_cross_model(narrow_dirs, wide_dirs)
        assert len(outcome.per_layer_transfer) == 2
# ===========================================================================
# Tests: Integration
# ===========================================================================
class TestNewImports:
    """Smoke test that ``obliteratus.analysis`` exposes the new entry points."""

    def test_all_new_modules_importable(self):
        """Each new analysis class can be imported from the package and is not ``None``."""
        from obliteratus.analysis import (
            CausalRefusalTracer,
            ResidualStreamDecomposer,
            LinearRefusalProbe,
            TransferAnalyzer,
        )

        exported = (
            CausalRefusalTracer,
            ResidualStreamDecomposer,
            LinearRefusalProbe,
            TransferAnalyzer,
        )
        for symbol in exported:
            assert symbol is not None
+133
View File
@@ -0,0 +1,133 @@
"""CLI dispatch tests for obliteratus.cli.main().
These tests verify argument parsing and subcommand routing without
downloading real models or running any pipeline. They use
``unittest.mock.patch`` to capture stdout/stderr and
``pytest.raises(SystemExit)`` for argparse exits.
"""
from __future__ import annotations
from io import StringIO
from unittest.mock import patch
import pytest
from obliteratus.cli import main
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _capture_exit(argv: list[str] | None, *, expect_code: int | None = None):
    """Run ``main(argv)``, require a ``SystemExit``, and return the stderr text.

    ``sys.stderr`` is replaced with an in-memory buffer for the duration of the
    call. When ``expect_code`` is given, the exit code must equal it.
    """
    stderr_buffer = StringIO()
    with pytest.raises(SystemExit) as raised:
        with patch("sys.stderr", stderr_buffer):
            main(argv)
    if expect_code is None:
        return stderr_buffer.getvalue()
    assert raised.value.code == expect_code
    return stderr_buffer.getvalue()
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestCLIDispatch:
    """Argument-parsing and subcommand-routing checks for ``obliteratus.cli.main``."""

    # 1. No args -> prints help / exits with error
    def test_main_no_args_prints_help(self):
        """Calling main() with no args should exit (subcommand is required)."""
        message = _capture_exit([], expect_code=2).lower()
        # argparse prints usage info to stderr on error
        assert "usage" in message or "required" in message

    # 2. models command lists models without error
    def test_models_command(self):
        """Calling main(['models']) should list models without raising."""
        with patch("obliteratus.cli.console") as fake_console:
            main(["models"])
        # console.print is called at least once to render the table
        assert fake_console.print.call_count >= 1

    # 3. obliterate without model arg -> error
    def test_obliterate_requires_model(self):
        """Calling main(['obliterate']) without a model arg should error."""
        message = _capture_exit(["obliterate"], expect_code=2).lower()
        assert "model" in message or "required" in message

    # 4. obliterate --method accepts valid methods
    def test_obliterate_valid_methods(self):
        """Test that --method accepts all 9 pipeline methods."""
        accepted = (
            "basic",
            "advanced",
            "aggressive",
            "spectral_cascade",
            "informed",
            "surgical",
            "optimized",
            "inverted",
            "nuclear",
        )
        for method_name in accepted:
            # Patch the actual pipeline execution so nothing runs
            with patch("obliteratus.cli._cmd_abliterate") as fake_cmd:
                main(["obliterate", "fake/model", "--method", method_name])
            fake_cmd.assert_called_once()
            parsed = fake_cmd.call_args[0][0]
            assert parsed.method == method_name

    # 4b. invalid methods are rejected
    def test_obliterate_rejects_invalid_method(self):
        """The CLI --method flag rejects unknown method names."""
        bad_argv = ["obliterate", "fake/model", "--method", "nonexistent"]
        message = _capture_exit(bad_argv, expect_code=2)
        assert "invalid choice" in message.lower()

    # 5. run requires config path
    def test_run_requires_config(self):
        """Calling main(['run']) without a config path should error."""
        message = _capture_exit(["run"], expect_code=2).lower()
        assert "config" in message or "required" in message

    # 6. aggregate with nonexistent dir handles gracefully
    def test_aggregate_command_missing_dir(self):
        """Calling main(['aggregate']) with nonexistent dir should handle gracefully."""
        with patch("obliteratus.cli.console") as fake_console:
            main(["aggregate", "--dir", "/nonexistent/path/to/nowhere"])
        # The command prints a message about no contributions found and returns
        rendered_calls = " ".join(map(str, fake_console.print.call_args_list))
        mentions_missing = "no contributions found" in rendered_calls.lower()
        assert mentions_missing or fake_console.print.called

    # 7. --help flag prints help
    def test_help_flag(self):
        """Calling main(['--help']) should print help and exit 0."""
        stdout_buffer = StringIO()
        with pytest.raises(SystemExit) as raised:
            with patch("sys.stdout", stdout_buffer):
                main(["--help"])
        assert raised.value.code == 0
        help_text = stdout_buffer.getvalue().lower()
        assert "obliteratus" in help_text or "usage" in help_text

    # 8. interactive subcommand is registered
    def test_interactive_command_exists(self):
        """Verify 'interactive' subcommand is registered and dispatches."""
        with patch("obliteratus.cli._cmd_interactive") as fake_cmd:
            main(["interactive"])
        fake_cmd.assert_called_once()

    # 9. --contribute and --contribute-notes are accepted on obliterate
    def test_contribute_flags_on_obliterate(self):
        """Verify --contribute and --contribute-notes are accepted args."""
        note_text = "Testing contribution system"
        argv = [
            "obliterate",
            "fake/model",
            "--contribute",
            "--contribute-notes",
            note_text,
        ]
        with patch("obliteratus.cli._cmd_abliterate") as fake_cmd:
            main(argv)
        fake_cmd.assert_called_once()
        parsed = fake_cmd.call_args[0][0]
        assert parsed.contribute is True
        assert parsed.contribute_notes == "Testing contribution system"
+567
View File
@@ -0,0 +1,567 @@
"""Tests for the community contribution system."""
import json
from unittest.mock import MagicMock
import pytest
import torch
from obliteratus.community import (
CONTRIBUTION_SCHEMA_VERSION,
_config_fingerprint,
_model_short_name,
aggregate_results,
generate_latex_table,
load_contributions,
save_contribution,
)
# ── Helper: mock pipeline ──────────────────────────────────────────────
def _make_mock_pipeline():
"""Build a mock pipeline with all fields the community module reads."""
p = MagicMock()
p.handle.summary.return_value = {
"architecture": "LlamaForCausalLM",
"num_layers": 32,
"num_heads": 32,
"hidden_size": 4096,
"total_params": 8_000_000_000,
}
p.method = "advanced"
p.n_directions = 4
p.norm_preserve = True
p.regularization = 0.3
p.refinement_passes = 2
p.project_biases = True
p.use_chat_template = True
p.use_whitened_svd = True
p.true_iterative_refinement = False
p.use_jailbreak_contrast = False
p.layer_adaptive_strength = False
p.attention_head_surgery = True
p.safety_neuron_masking = False
p.per_expert_directions = False
p.use_sae_features = False
p.invert_refusal = False
p.project_embeddings = False
p.embed_regularization = 0.5
p.activation_steering = False
p.steering_strength = 0.3
p.expert_transplant = False
p.transplant_blend = 0.3
p.reflection_strength = 2.0
p.quantization = None
p._quality_metrics = {"perplexity": 5.2, "coherence": 0.8, "refusal_rate": 0.05}
p._strong_layers = [10, 11, 12, 13]
p._stage_durations = {
"summon": 3.0, "probe": 12.5, "distill": 4.1,
"excise": 2.0, "verify": 8.3, "rebirth": 5.0,
}
p._excise_modified_count = 128
# Direction data
d = torch.randn(4096)
d = d / d.norm()
p.refusal_directions = {10: d, 11: d + 0.01 * torch.randn(4096)}
p.refusal_subspaces = {10: torch.randn(4, 4096)}
# Excise details
p._refusal_heads = {10: [(0, 0.9), (3, 0.8)]}
p._sae_directions = {}
p._expert_safety_scores = {}
p._layer_excise_weights = {}
p._expert_directions = {}
p._steering_hooks = []
# Prompts
p.harmful_prompts = ["x"] * 33
p.harmless_prompts = ["y"] * 33
p.jailbreak_prompts = None
return p
# ── Model short name ───────────────────────────────────────────────────
class TestModelShortName:
    """Expected outputs of ``_model_short_name`` for representative model IDs."""

    def test_strips_org_prefix(self):
        """The ``org/`` prefix is dropped and the name is lower-cased."""
        short = _model_short_name("meta-llama/Llama-2-7b-chat-hf")
        assert short == "llama-2-7b-chat-hf"

    def test_no_org_prefix(self):
        """A name without an org prefix is returned as-is."""
        short = _model_short_name("gpt2")
        assert short == "gpt2"

    def test_sanitizes_special_chars(self):
        """Underscores and dots become dashes."""
        short = _model_short_name("org/Model_V2.1")
        assert short == "model-v2-1"

    def test_caps_length(self):
        """A 100-character name is shortened to at most 60 characters."""
        oversized = "a" * 100
        short = _model_short_name(oversized)
        assert len(short) <= 60

    def test_collapses_dashes(self):
        """Runs of dashes collapse to a single dash."""
        short = _model_short_name("org/Model---Name")
        assert short == "model-name"

    def test_strips_trailing_dashes(self):
        """A trailing dash is removed."""
        short = _model_short_name("org/Model-")
        assert short == "model"
# ── Config fingerprint ─────────────────────────────────────────────────
class TestConfigFingerprint:
    """Properties of the ``_config_fingerprint`` hash."""

    def test_deterministic(self):
        """Fingerprinting the same config twice gives the same value."""
        settings = {"n_directions": 4, "norm_preserve": True}
        first = _config_fingerprint(settings)
        second = _config_fingerprint(settings)
        assert first == second

    def test_different_configs_different_hashes(self):
        """Configs that differ in a value fingerprint differently."""
        with_four = _config_fingerprint({"n_directions": 4})
        with_eight = _config_fingerprint({"n_directions": 8})
        assert with_four != with_eight

    def test_key_order_invariant(self):
        """Key insertion order does not affect the fingerprint."""
        forward = _config_fingerprint({"a": 1, "b": 2})
        backward = _config_fingerprint({"b": 2, "a": 1})
        assert forward == backward

    def test_returns_8_char_hex(self):
        """The fingerprint is 8 lower-case hexadecimal characters."""
        digest = _config_fingerprint({"test": True})
        assert len(digest) == 8
        for char in digest:
            assert char in "0123456789abcdef"
# ── Save contribution ──────────────────────────────────────────────────
class TestSaveContribution:
    """Checks on the JSON record written by ``save_contribution``."""

    @staticmethod
    def _save(output_dir, model_name, **extra):
        """Save one contribution from a fresh mock pipeline and return its path."""
        return save_contribution(
            _make_mock_pipeline(),
            model_name=model_name,
            output_dir=output_dir,
            **extra,
        )

    @staticmethod
    def _read(saved_path):
        """Parse the saved contribution file as JSON."""
        return json.loads(saved_path.read_text())

    def test_saves_json_file(self, tmp_path):
        """A ``.json`` file is written with the schema version and model name."""
        saved = self._save(tmp_path, "meta-llama/Llama-2-7b-chat-hf")
        assert saved.exists()
        assert saved.suffix == ".json"
        record = self._read(saved)
        assert record["contribution_schema_version"] == CONTRIBUTION_SCHEMA_VERSION
        assert record["model_name"] == "meta-llama/Llama-2-7b-chat-hf"

    def test_filename_format(self, tmp_path):
        """The file stem starts with ``<short model name>_<method>_``."""
        saved = self._save(tmp_path, "meta-llama/Llama-2-7b-chat-hf")
        assert saved.stem.startswith("llama-2-7b-chat-hf_advanced_")

    def test_includes_telemetry_report(self, tmp_path):
        """The record embeds a telemetry section mirroring the pipeline."""
        saved = self._save(tmp_path, "meta-llama/Llama-2-7b-chat-hf")
        report = self._read(saved)["telemetry"]
        assert report["schema_version"] == 2
        assert report["model"]["architecture"] == "LlamaForCausalLM"
        assert report["method"] == "advanced"
        assert report["quality_metrics"]["refusal_rate"] == 0.05

    def test_includes_config_fingerprint(self, tmp_path):
        """The record carries an 8-character config fingerprint."""
        saved = self._save(tmp_path, "meta-llama/Llama-2-7b-chat-hf")
        record = self._read(saved)
        assert "config_fingerprint" in record
        assert len(record["config_fingerprint"]) == 8

    def test_includes_notes(self, tmp_path):
        """Free-form notes are stored verbatim."""
        saved = self._save(
            tmp_path,
            "test/model",
            notes="Ran on A100 with default prompts",
        )
        record = self._read(saved)
        assert record["notes"] == "Ran on A100 with default prompts"

    def test_creates_output_dir(self, tmp_path):
        """A missing nested output directory is created on save."""
        nested_dir = tmp_path / "nested" / "dir"
        assert not nested_dir.exists()
        saved = self._save(nested_dir, "test/model")
        assert nested_dir.exists()
        assert saved.exists()

    def test_timestamp_format(self, tmp_path):
        """The timestamp is 16 characters, contains ``T`` and ends with ``Z``."""
        saved = self._save(tmp_path, "test/model")
        stamp = self._read(saved)["timestamp"]
        # Should be UTC ISO-ish: YYYYMMDDTHHMMSSZ
        assert stamp.endswith("Z")
        assert "T" in stamp
        assert len(stamp) == 16

    def test_method_config_extracted(self, tmp_path):
        """Selected pipeline settings appear under ``telemetry.method_config``."""
        saved = self._save(tmp_path, "test/model")
        method_cfg = self._read(saved)["telemetry"]["method_config"]
        assert method_cfg["n_directions"] == 4
        assert method_cfg["norm_preserve"] is True
        assert method_cfg["attention_head_surgery"] is True
# ── Load contributions ─────────────────────────────────────────────────
class TestLoadContributions:
    """Checks on how ``load_contributions`` discovers, filters, and orders files."""

    def _write_contrib(self, directory, model, method, refusal_rate, idx=0):
        """Write a minimal valid contribution file."""
        telemetry = {
            "schema_version": 2,
            "method": method,
            "quality_metrics": {"refusal_rate": refusal_rate},
        }
        payload = {
            "contribution_schema_version": CONTRIBUTION_SCHEMA_VERSION,
            # idx is zero-padded into the time-of-day part of the timestamp.
            "timestamp": f"20260227T{idx:06d}Z",
            "model_name": model,
            "config_fingerprint": "abcd1234",
            "notes": "",
            "telemetry": telemetry,
        }
        target = directory / f"contrib_{idx}.json"
        target.write_text(json.dumps(payload))
        return target

    def test_loads_valid_files(self, tmp_path):
        """Two valid files produce two records."""
        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
        self._write_contrib(tmp_path, "test/model", "basic", 0.10, 1)
        loaded = load_contributions(tmp_path)
        assert len(loaded) == 2

    def test_sorts_by_timestamp(self, tmp_path):
        """Records come back ordered by timestamp, not by write order."""
        self._write_contrib(tmp_path, "model-b", "advanced", 0.05, 2)
        self._write_contrib(tmp_path, "model-a", "advanced", 0.10, 1)
        loaded = load_contributions(tmp_path)
        ordered_models = [loaded[0]["model_name"], loaded[1]["model_name"]]
        assert ordered_models == ["model-a", "model-b"]

    def test_skips_non_contribution_json(self, tmp_path):
        """JSON lacking ``contribution_schema_version`` is ignored."""
        # Write a JSON file without contribution_schema_version
        unrelated = tmp_path / "random.json"
        unrelated.write_text('{"foo": "bar"}')
        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
        loaded = load_contributions(tmp_path)
        assert len(loaded) == 1

    def test_skips_invalid_json(self, tmp_path):
        """A malformed ``.json`` file is ignored."""
        broken = tmp_path / "bad.json"
        broken.write_text("not valid json {{{")
        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
        loaded = load_contributions(tmp_path)
        assert len(loaded) == 1

    def test_returns_empty_for_missing_dir(self, tmp_path):
        """A nonexistent directory yields an empty list."""
        missing_dir = tmp_path / "nonexistent"
        assert load_contributions(missing_dir) == []

    def test_tracks_source_file(self, tmp_path):
        """Each record notes the file it was loaded from."""
        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
        first_record = load_contributions(tmp_path)[0]
        assert "_source_file" in first_record
        assert "contrib_0.json" in first_record["_source_file"]

    def test_ignores_non_json_files(self, tmp_path):
        """Files without a ``.json`` suffix are ignored."""
        stray = tmp_path / "readme.txt"
        stray.write_text("some text")
        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
        loaded = load_contributions(tmp_path)
        assert len(loaded) == 1
# ── Aggregate results ──────────────────────────────────────────────────
class TestAggregateResults:
    """Checks on the per-model, per-method statistics built by ``aggregate_results``.

    Computed float statistics (mean/min/max/std) are compared with
    ``pytest.approx`` rather than ``==`` so the tests do not depend on the
    exact floating-point summation strategy used by the implementation.
    """

    def _make_record(self, model, method, refusal_rate, perplexity=None, coherence=None):
        """Build a minimal contribution record.

        ``refusal_rate`` is always included in ``quality_metrics``;
        ``perplexity`` and ``coherence`` are added only when not ``None``.
        """
        metrics = {"refusal_rate": refusal_rate}
        if perplexity is not None:
            metrics["perplexity"] = perplexity
        if coherence is not None:
            metrics["coherence"] = coherence
        return {
            "model_name": model,
            "telemetry": {
                "method": method,
                "quality_metrics": metrics,
            },
        }

    def test_single_record(self):
        """One record produces one model/method entry with ``n_runs == 1``."""
        records = [self._make_record("model-a", "advanced", 0.05)]
        result = aggregate_results(records)
        assert "model-a" in result
        assert "advanced" in result["model-a"]
        assert result["model-a"]["advanced"]["n_runs"] == 1
        assert result["model-a"]["advanced"]["refusal_rate"]["mean"] == pytest.approx(0.05)

    def test_multiple_runs_same_model_method(self):
        """Two runs of the same model/method are pooled into one set of stats."""
        records = [
            self._make_record("model-a", "advanced", 0.04),
            self._make_record("model-a", "advanced", 0.06),
        ]
        result = aggregate_results(records)
        stats = result["model-a"]["advanced"]
        assert stats["n_runs"] == 2
        # (0.04 + 0.06) / 2 is not guaranteed to be bit-identical to 0.05.
        assert stats["refusal_rate"]["mean"] == pytest.approx(0.05)
        assert stats["refusal_rate"]["min"] == pytest.approx(0.04)
        assert stats["refusal_rate"]["max"] == pytest.approx(0.06)
        assert stats["refusal_rate"]["n"] == 2

    def test_multiple_models(self):
        """Records for different models land under separate top-level keys."""
        records = [
            self._make_record("model-a", "advanced", 0.05),
            self._make_record("model-b", "basic", 0.10),
        ]
        result = aggregate_results(records)
        assert len(result) == 2
        assert "model-a" in result
        assert "model-b" in result

    def test_multiple_methods(self):
        """Different methods for one model land under separate method keys."""
        records = [
            self._make_record("model-a", "advanced", 0.05),
            self._make_record("model-a", "basic", 0.10),
        ]
        result = aggregate_results(records)
        assert len(result["model-a"]) == 2
        assert "advanced" in result["model-a"]
        assert "basic" in result["model-a"]

    def test_std_zero_for_single_run(self):
        """A single run reports a standard deviation of zero."""
        records = [self._make_record("model-a", "advanced", 0.05)]
        result = aggregate_results(records)
        assert result["model-a"]["advanced"]["refusal_rate"]["std"] == pytest.approx(0.0)

    def test_multiple_metrics(self):
        """Every metric present in the record gets its own stats entry."""
        records = [
            self._make_record("model-a", "advanced", 0.05, perplexity=5.2, coherence=0.8),
        ]
        result = aggregate_results(records)
        stats = result["model-a"]["advanced"]
        assert "refusal_rate" in stats
        assert "perplexity" in stats
        assert "coherence" in stats
        assert stats["perplexity"]["mean"] == pytest.approx(5.2)

    def test_missing_metric_skipped(self):
        """A metric absent from every record does not appear in the stats."""
        records = [self._make_record("model-a", "advanced", 0.05)]
        result = aggregate_results(records)
        # coherence not provided, should not appear
        assert "coherence" not in result["model-a"]["advanced"]

    def test_unknown_model_and_method(self):
        """Records lacking model and method are filed under ``"unknown"``."""
        records = [{
            "telemetry": {"quality_metrics": {"refusal_rate": 0.1}},
        }]
        result = aggregate_results(records)
        assert "unknown" in result
        assert "unknown" in result["unknown"]
# ── LaTeX table generation ─────────────────────────────────────────────
class TestGenerateLatexTable:
    """Checks on the LaTeX produced by ``generate_latex_table``."""

    @staticmethod
    def _stats(mean, std, n, lowest, highest):
        """Build one metric-statistics dict in the aggregated-results layout."""
        return {"mean": mean, "std": std, "n": n, "min": lowest, "max": highest}

    def _sample_aggregated(self):
        """Return aggregated results for two models; only the first has ``basic``."""
        llama_methods = {
            "advanced": {
                "n_runs": 3,
                "refusal_rate": self._stats(0.04, 0.01, 3, 0.03, 0.05),
            },
            "basic": {
                "n_runs": 2,
                "refusal_rate": self._stats(0.08, 0.02, 2, 0.06, 0.10),
            },
        }
        mistral_methods = {
            "advanced": {
                "n_runs": 1,
                "refusal_rate": self._stats(0.03, 0.0, 1, 0.03, 0.03),
            },
        }
        return {
            "meta-llama/Llama-2-7b-chat-hf": llama_methods,
            "mistralai/Mistral-7B-Instruct-v0.2": mistral_methods,
        }

    def test_produces_valid_latex(self):
        """The output contains the tabular environment and booktabs rules."""
        table = generate_latex_table(self._sample_aggregated())
        for token in ("\\begin{tabular}", "\\end{tabular}", "\\toprule", "\\bottomrule"):
            assert token in table

    def test_includes_model_names(self):
        """Model names appear without their org prefix."""
        table = generate_latex_table(self._sample_aggregated())
        assert "Llama-2-7b-chat-hf" in table
        assert "Mistral-7B-Instruct-v0.2" in table

    def test_includes_method_headers(self):
        """Both method names appear in the table."""
        table = generate_latex_table(self._sample_aggregated())
        assert "advanced" in table
        assert "basic" in table

    def test_missing_method_shows_dash(self):
        """A model/method cell with no data is rendered as ``---``."""
        table = generate_latex_table(self._sample_aggregated())
        # Mistral doesn't have "basic" method
        assert "---" in table

    def test_shows_std_when_multiple_runs(self):
        """Cells backed by several runs include a ``$\\pm$`` spread."""
        table = generate_latex_table(self._sample_aggregated())
        assert "$\\pm$" in table

    def test_no_std_for_single_run(self):
        """A table built only from single-run cells has no ``$\\pm$``."""
        single_run = {
            "model": {
                "method": {
                    "n_runs": 1,
                    "refusal_rate": self._stats(0.03, 0.0, 1, 0.03, 0.03),
                },
            },
        }
        table = generate_latex_table(single_run)
        assert "$\\pm$" not in table

    def test_methods_filter(self):
        """Passing ``methods`` restricts which method columns are emitted."""
        table = generate_latex_table(self._sample_aggregated(), methods=["advanced"])
        assert "\\textbf{advanced}" in table
        assert "\\textbf{basic}" not in table

    def test_custom_metric(self):
        """A non-default metric can be selected by name."""
        perplexity_only = {
            "model": {
                "method": {
                    "n_runs": 2,
                    "perplexity": self._stats(5.2, 0.3, 2, 4.9, 5.5),
                },
            },
        }
        table = generate_latex_table(perplexity_only, metric="perplexity")
        assert "5.2" in table

    def test_column_count_matches_methods(self):
        """The column spec has one ``l`` column plus one ``c`` per method."""
        table = generate_latex_table(self._sample_aggregated())
        # 2 methods → "lcc" (1 model col + 2 method cols)
        assert "{@{}lcc@{}}" in table
# ── CLI integration ────────────────────────────────────────────────────
class TestCLIContributeFlag:
    """CLI-level checks that the contribution options are wired into the parser.

    Both tests go through ``--help`` so that no model is loaded. A bare
    ``pytest.raises(SystemExit)`` is not enough on its own: argparse also
    raises ``SystemExit`` (with code 2) for an unknown subcommand or flag, so
    each test additionally requires the clean exit code 0.
    """

    def test_contribute_flag_accepted(self, capsys):
        """``obliterate --help`` exits 0 and its help text lists ``--contribute``."""
        from obliteratus.cli import main

        # We can't run the full command (no GPU), so inspect the subcommand's
        # help output instead of executing it.
        with pytest.raises(SystemExit) as exc_info:
            main(["obliterate", "--help"])
        assert exc_info.value.code == 0
        help_text = capsys.readouterr().out
        assert "--contribute" in help_text

    def test_aggregate_command_accepted(self):
        """``aggregate --help`` exits 0, i.e. the subcommand is registered."""
        from obliteratus.cli import main

        with pytest.raises(SystemExit) as exc_info:
            main(["aggregate", "--help"])
        # An unregistered subcommand would exit with argparse's error code 2.
        assert exc_info.value.code == 0
# ── Package exports ────────────────────────────────────────────────────
class TestPackageExports:
    """The community helpers are re-exported from the top-level package."""

    def test_save_contribution_importable(self):
        """``save_contribution`` is importable from ``obliteratus`` and callable."""
        from obliteratus import save_contribution as exported

        assert callable(exported)

    def test_load_contributions_importable(self):
        """``load_contributions`` is importable from ``obliteratus`` and callable."""
        from obliteratus import load_contributions as exported

        assert callable(exported)

    def test_aggregate_results_importable(self):
        """``aggregate_results`` is importable from ``obliteratus`` and callable."""
        from obliteratus import aggregate_results as exported

        assert callable(exported)
# ── End-to-end: save → load → aggregate ───────────────────────────────
class TestEndToEnd:
    """Round-trip tests chaining save, load, aggregate, and LaTeX rendering."""

    def test_save_load_aggregate_roundtrip(self, tmp_path):
        """Full roundtrip: save contributions, load them, aggregate."""
        mock_pipeline = _make_mock_pipeline()

        # Save two contributions (different models to avoid filename collision)
        save_contribution(mock_pipeline, model_name="test/model-a", output_dir=tmp_path)
        # Tweak metrics for second run with a different model name
        mock_pipeline._quality_metrics = {
            "perplexity": 5.5,
            "coherence": 0.75,
            "refusal_rate": 0.07,
        }
        save_contribution(mock_pipeline, model_name="test/model-b", output_dir=tmp_path)

        # Load
        loaded = load_contributions(tmp_path)
        assert len(loaded) == 2

        # Aggregate
        summary = aggregate_results(loaded)
        for model_key in ("test/model-a", "test/model-b"):
            assert model_key in summary
        first_stats = summary["test/model-a"]["advanced"]
        second_stats = summary["test/model-b"]["advanced"]
        assert first_stats["n_runs"] == 1
        assert second_stats["n_runs"] == 1
        assert abs(first_stats["refusal_rate"]["mean"] - 0.05) < 0.001
        assert abs(second_stats["refusal_rate"]["mean"] - 0.07) < 0.001

    def test_save_load_aggregate_to_latex(self, tmp_path):
        """Full roundtrip ending in LaTeX output."""
        save_contribution(
            _make_mock_pipeline(),
            model_name="meta-llama/Llama-2-7b-chat-hf",
            output_dir=tmp_path,
        )
        summary = aggregate_results(load_contributions(tmp_path))
        table = generate_latex_table(summary)
        for fragment in ("\\begin{tabular}", "Llama-2-7b-chat-hf", "advanced"):
            assert fragment in table
+59
View File
@@ -0,0 +1,59 @@
"""Tests for configuration loading."""
from __future__ import annotations
import yaml
from obliteratus.config import StudyConfig
# Shared fixture for the StudyConfig tests below: a small, fully populated
# study description that is fed to ``StudyConfig.from_dict`` directly and,
# after a ``yaml.dump``, to ``StudyConfig.from_yaml``.
SAMPLE_CONFIG = {
    # Model section -- read back by the tests as ``config.model.*``.
    "model": {
        "name": "gpt2",
        "task": "causal_lm",
        "dtype": "float32",
        "device": "cpu",
    },
    # Dataset section -- read back by the tests as ``config.dataset.*``.
    "dataset": {
        "name": "wikitext",
        "subset": "wikitext-2-raw-v1",
        "split": "test",
        "text_column": "text",
        "max_samples": 50,
    },
    # Two strategies; the tests check both the count and the first name.
    "strategies": [
        {"name": "layer_removal", "params": {}},
        {"name": "ffn_ablation", "params": {}},
    ],
    "metrics": ["perplexity"],
    "batch_size": 4,
    "max_length": 256,
    "output_dir": "results/test",
}
class TestStudyConfig:
    """Construction and round-tripping of ``StudyConfig``."""

    def test_from_dict(self):
        """A plain dict populates the model, dataset and strategy fields."""
        cfg = StudyConfig.from_dict(SAMPLE_CONFIG)

        model, dataset, strategies = cfg.model, cfg.dataset, cfg.strategies
        assert model.name == "gpt2"
        assert model.task == "causal_lm"
        assert dataset.name == "wikitext"
        assert len(strategies) == 2
        assert strategies[0].name == "layer_removal"

    def test_from_yaml(self, tmp_path):
        """The same data survives being written to and read from YAML."""
        config_file = tmp_path / "test_config.yaml"
        serialized = yaml.dump(SAMPLE_CONFIG)
        config_file.write_text(serialized)

        cfg = StudyConfig.from_yaml(config_file)

        assert cfg.model.name == "gpt2"
        assert cfg.batch_size == 4

    def test_roundtrip(self):
        """``to_dict`` output can be fed back into ``from_dict`` unchanged."""
        original = StudyConfig.from_dict(SAMPLE_CONFIG)
        rebuilt = StudyConfig.from_dict(original.to_dict())

        assert rebuilt.model.name == original.model.name
        assert rebuilt.dataset.name == original.dataset.name
        assert len(rebuilt.strategies) == len(original.strategies)
+169
View File
@@ -0,0 +1,169 @@
"""Tests for defense robustness evaluation framework."""
from __future__ import annotations
from unittest.mock import MagicMock
import torch
from obliteratus.analysis.defense_robustness import (
DefenseProfile,
DefenseRobustnessEvaluator,
EntanglementMap,
SelfRepairResult,
)
def _make_mock_pipeline(n_layers=6, hidden_dim=16, n_prompts=5):
"""Create a mock pipeline with refusal directions and activations."""
pipeline = MagicMock()
pipeline.model_name = "test-model"
# Generate refusal directions (some strong, some weak)
torch.manual_seed(42)
directions = {}
for i in range(n_layers):
d = torch.randn(hidden_dim)
directions[i] = d / d.norm()
pipeline.refusal_directions = directions
# Generate activations with a planted refusal signal in middle layers
harmful_means = {}
harmless_means = {}
harmful_acts = {}
harmless_acts = {}
for i in range(n_layers):
base = torch.randn(hidden_dim)
harmless_means[i] = base.unsqueeze(0)
# Middle layers have stronger refusal signal
signal_strength = 3.0 if 2 <= i <= 4 else 0.5
harmful_means[i] = (base + signal_strength * directions[i]).unsqueeze(0)
harmful_acts[i] = [base + signal_strength * directions[i] + torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)]
harmless_acts[i] = [base + torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)]
pipeline._harmful_means = harmful_means
pipeline._harmless_means = harmless_means
pipeline._harmful_acts = harmful_acts
pipeline._harmless_acts = harmless_acts
return pipeline
class TestDefenseProfile:
    """Checks on the ``DefenseProfile`` returned by ``profile_defense``."""

    def test_profile_generates(self):
        """The profile has the expected type, name and positive statistics."""
        result = DefenseRobustnessEvaluator(_make_mock_pipeline()).profile_defense()

        assert isinstance(result, DefenseProfile)
        assert result.model_name == "test-model"
        assert result.refusal_layer_spread > 0
        assert result.mean_refusal_strength > 0
        assert result.max_refusal_strength >= result.mean_refusal_strength
        allowed_levels = ("low", "medium", "high", "very_high")
        assert result.estimated_robustness in allowed_levels

    def test_alignment_type_estimate(self):
        """With populated data the alignment type is not left as 'unknown'."""
        result = DefenseRobustnessEvaluator(_make_mock_pipeline()).profile_defense()
        assert result.alignment_type_estimate != "unknown"

    def test_empty_pipeline(self):
        """No refusal directions means robustness is reported as 'unknown'."""
        bare = MagicMock()
        bare.model_name = "empty"
        bare.refusal_directions = {}

        result = DefenseRobustnessEvaluator(bare).profile_defense()

        assert result.estimated_robustness == "unknown"

    def test_concentration_bounded(self):
        """Refusal concentration (a Gini coefficient) stays within [0, 1]."""
        result = DefenseRobustnessEvaluator(_make_mock_pipeline()).profile_defense()
        assert 0 <= result.refusal_concentration <= 1.0

    def test_self_repair_bounded(self):
        """The self-repair estimate stays within [0, 1]."""
        result = DefenseRobustnessEvaluator(_make_mock_pipeline()).profile_defense()
        assert 0 <= result.self_repair_estimate <= 1.0

    def test_format_report(self):
        """The text report mentions both its title and the model name."""
        result = DefenseRobustnessEvaluator(_make_mock_pipeline()).profile_defense()
        text = DefenseRobustnessEvaluator.format_defense_profile(result)

        for fragment in ("Defense Robustness", "test-model"):
            assert fragment in text
class TestSelfRepair:
    """Checks on ``measure_self_repair`` and its text report."""

    def test_self_repair_measurement(self):
        """Result fields are typed, bounded and exclude the ablated layer."""
        target_layer = 3
        outcome = DefenseRobustnessEvaluator(_make_mock_pipeline()).measure_self_repair(
            layer_idx=target_layer
        )

        assert isinstance(outcome, SelfRepairResult)
        assert outcome.layer_idx == 3
        assert outcome.original_refusal_strength >= 0
        assert 0 <= outcome.repair_ratio <= 1.0
        assert len(outcome.compensating_layers) > 0
        # The ablated layer must not be listed as compensating for itself.
        assert 3 not in outcome.compensating_layers

    def test_repair_ratio_high_for_distributed(self):
        """A ten-layer mock should report a repair ratio above 0.5."""
        ten_layer_eval = DefenseRobustnessEvaluator(_make_mock_pipeline(n_layers=10))
        outcome = ten_layer_eval.measure_self_repair(layer_idx=3)

        # Signal spread over many layers leaves plenty of compensation once
        # a single layer is removed.
        assert outcome.repair_ratio > 0.5

    def test_format_self_repair(self):
        """The text report names the section and the measured layer."""
        outcome = DefenseRobustnessEvaluator(_make_mock_pipeline()).measure_self_repair(
            layer_idx=2
        )
        text = DefenseRobustnessEvaluator.format_self_repair(outcome)

        assert "Self-Repair" in text
        assert "Layer 2" in text
class TestEntanglement:
    """Checks on ``map_entanglement`` and its text report."""

    def test_entanglement_map(self):
        """The map has the expected type, a bounded score and non-empty lists."""
        mapping = DefenseRobustnessEvaluator(_make_mock_pipeline()).map_entanglement()

        assert isinstance(mapping, EntanglementMap)
        assert len(mapping.layer_entanglement) > 0
        assert 0 <= mapping.overall_entanglement <= 1.0
        assert len(mapping.most_entangled_layers) > 0
        assert len(mapping.least_entangled_layers) > 0

    def test_capability_sensitivity_keys(self):
        """Exactly five named capabilities are reported."""
        mapping = DefenseRobustnessEvaluator(_make_mock_pipeline()).map_entanglement()

        wanted = {
            "factual_knowledge",
            "reasoning",
            "language_fluency",
            "instruction_following",
            "math",
        }
        assert set(mapping.capability_sensitivity.keys()) == wanted

    def test_math_most_sensitive(self):
        """Math sensitivity should be at least that of language fluency."""
        mapping = DefenseRobustnessEvaluator(_make_mock_pipeline()).map_entanglement()

        # Only compared when some entanglement was measured at all.
        if not mapping.overall_entanglement > 0:
            return
        sensitivity = mapping.capability_sensitivity
        assert sensitivity["math"] >= sensitivity["language_fluency"]

    def test_format_entanglement(self):
        """The text report names the section and lists the math capability."""
        mapping = DefenseRobustnessEvaluator(_make_mock_pipeline()).map_entanglement()
        text = DefenseRobustnessEvaluator.format_entanglement(mapping)

        assert "Entanglement" in text
        assert "math" in text
+510
View File
@@ -0,0 +1,510 @@
"""Edge-case and robustness tests.
Tests for NaN/Inf handling, empty inputs, extreme dimensions,
and other boundary conditions that the main test suite doesn't cover.
"""
from __future__ import annotations
import math
import pytest
import torch
import torch.nn as nn
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer
from obliteratus.analysis.concept_geometry import ConceptConeAnalyzer
from obliteratus.analysis.alignment_imprint import AlignmentImprintDetector
from obliteratus.analysis.multi_token_position import MultiTokenPositionAnalyzer
from obliteratus.analysis.sparse_surgery import SparseDirectionSurgeon
from obliteratus.analysis.causal_tracing import CausalRefusalTracer
from obliteratus.analysis.residual_stream import ResidualStreamDecomposer
from obliteratus.analysis.probing_classifiers import LinearRefusalProbe
from obliteratus.analysis.cross_model_transfer import TransferAnalyzer
from obliteratus.evaluation.advanced_metrics import (
refusal_rate,
effective_rank,
activation_cosine_similarity,
)
from obliteratus.analysis.steering_vectors import (
SteeringVectorFactory,
SteeringHookManager,
SteeringConfig,
SteeringResult,
compute_steering_effectiveness,
format_steering_report,
)
# ===========================================================================
# NaN / Inf handling
# ===========================================================================
class TestNaNInfHandling:
    """Test that modules handle degenerate inputs (NaN, zeros) gracefully.

    Several tests here document current behaviour rather than enforce an
    ideal: for NaN input, either raising ``RuntimeError``/``ValueError`` or
    returning a result object is accepted.
    """

    def test_whitened_svd_nan_activations(self):
        """WhitenedSVD with NaN input: may raise, otherwise must return a result."""
        harmful = [torch.tensor([float("nan"), 1.0, 2.0]) for _ in range(5)]
        harmless = [torch.randn(3) for _ in range(5)]
        extractor = WhitenedSVDExtractor()

        # NaN propagating through the SVD is expected; ideally the extractor
        # would guard against it, but for now both outcomes are tolerated.
        try:
            result = extractor.extract(harmful, harmless)
        except (RuntimeError, ValueError):
            return  # raising on NaN input is acceptable
        assert result is not None, (
            "Should either raise on NaN input or return a result"
        )

    def test_whitened_svd_zero_activations(self):
        """WhitenedSVD with all-zero activations returns a populated result."""
        harmful = [torch.zeros(8) for _ in range(5)]
        harmless = [torch.zeros(8) for _ in range(5)]
        extractor = WhitenedSVDExtractor()

        result = extractor.extract(harmful, harmless)

        # Should return a valid result without crashing.
        assert result is not None
        assert result.directions is not None
        assert result.singular_values is not None

    def test_concept_cone_nan_direction(self):
        """ConceptConeAnalyzer with one NaN activation: may raise or return."""
        harmful = [torch.randn(16) for _ in range(10)]
        harmless = [torch.randn(16) for _ in range(10)]
        # Poison a single harmful activation.
        harmful[3] = torch.full((16,), float("nan"))
        cat_map = {i: f"cat_{i % 3}" for i in range(10)}
        analyzer = ConceptConeAnalyzer(category_map=cat_map)

        try:
            result = analyzer.analyze_layer(harmful, harmless)
        except (RuntimeError, ValueError):
            return  # raising on NaN input is acceptable
        assert result is not None, (
            "Should either raise on NaN input or return a result"
        )

    def test_sparse_surgery_zero_direction(self):
        """A zero refusal direction gives a mean projection of exactly 0.0."""
        W = torch.randn(32, 16)
        zero_dir = torch.zeros(16)
        surgeon = SparseDirectionSurgeon()

        result = surgeon.analyze_weight_matrix(W, zero_dir)

        assert result.mean_projection == 0.0

    def test_sparse_surgery_zero_weight(self):
        """A zero weight matrix gives a (near-)zero maximum projection."""
        W = torch.zeros(32, 16)
        ref_dir = torch.randn(16)
        surgeon = SparseDirectionSurgeon()

        result = surgeon.analyze_weight_matrix(W, ref_dir)

        assert result.max_projection < 1e-6

    def test_effective_rank_nan_matrix(self):
        """effective_rank on a NaN-sanitised matrix is finite, or raises cleanly.

        The NaN entry is replaced via ``torch.nan_to_num`` before the call,
        so this checks the sanitised path rather than raw NaN input.
        """
        W = torch.randn(10, 10)
        W[0, 0] = float("nan")

        # Only the call itself is guarded. The assertion lives outside the
        # ``try`` so that a non-finite result fails the test instead of being
        # swallowed by the exception handler.
        try:
            result = effective_rank(torch.nan_to_num(W))
        except (RuntimeError, ValueError):
            return  # raising cleanly is acceptable
        assert math.isfinite(result)

    def test_cosine_similarity_zero_vectors(self):
        """Cosine similarity of two zero vectors is finite or NaN, never inf."""
        a = torch.zeros(32)
        b = torch.zeros(32)

        result = activation_cosine_similarity(a, b)

        # Should be 0 or NaN, and must not crash.
        assert math.isfinite(result) or math.isnan(result)

    def test_transfer_analyzer_nan_directions(self):
        """Transfer analysis still returns a result when one direction is NaN."""
        dirs_a = {0: torch.randn(16), 1: torch.tensor([float("nan")] * 16)}
        dirs_b = {0: torch.randn(16), 1: torch.randn(16)}
        analyzer = TransferAnalyzer()

        # Should not crash.
        result = analyzer.analyze_cross_model(dirs_a, dirs_b)

        assert result is not None
        assert isinstance(result.mean_transfer_score, float)
        assert result.per_layer_transfer is not None
# ===========================================================================
# Empty inputs
# ===========================================================================
class TestEmptyInputs:
    """Analyzers fed empty or minimal inputs must still return sane results."""

    def test_cross_layer_empty_directions(self):
        """An empty direction mapping yields a persistence score of 0.0."""
        outcome = CrossLayerAlignmentAnalyzer().analyze({})
        assert outcome.direction_persistence_score == 0.0

    def test_alignment_imprint_single_layer(self):
        """One layer is enough to get a prediction from the known label set."""
        single = {0: torch.randn(32)}
        outcome = AlignmentImprintDetector().detect_imprint(single)

        known_methods = ("dpo", "rlhf", "cai", "sft", "unknown")
        assert outcome.predicted_method in known_methods

    def test_multi_token_single_position(self):
        """A one-token sequence reports one token peaking at position 0."""
        direction = torch.randn(16)
        one_token = torch.randn(1, 16)

        outcome = MultiTokenPositionAnalyzer().analyze_prompt(one_token, direction)

        assert outcome.n_tokens == 1
        assert outcome.peak_position == 0

    def test_probing_minimal_data(self):
        """Three samples per class still produce an accuracy in [0, 1]."""
        positives = [torch.randn(8) for _ in range(3)]
        negatives = [torch.randn(8) for _ in range(3)]

        outcome = LinearRefusalProbe(n_epochs=10).probe_layer(positives, negatives)

        assert 0 <= outcome.accuracy <= 1.0

    def test_residual_stream_single_layer(self):
        """Decomposing a single layer reports ``n_layers == 1``."""
        layer_acts = {0: torch.randn(32)}
        direction = torch.randn(32)

        outcome = ResidualStreamDecomposer().decompose(layer_acts, direction)

        assert outcome.n_layers == 1

    def test_causal_tracing_single_layer(self):
        """Tracing a single layer reports ``n_layers == 1``."""
        layer_acts = {0: torch.randn(32)}
        layer_dirs = {0: torch.randn(32)}

        outcome = CausalRefusalTracer().trace_from_activations(layer_acts, layer_dirs)

        assert outcome.n_layers == 1

    def test_transfer_no_common_layers(self):
        """Disjoint layer indices across models give a transfer score of 0.0."""
        first_model = {0: torch.randn(16), 1: torch.randn(16)}
        second_model = {2: torch.randn(16), 3: torch.randn(16)}

        outcome = TransferAnalyzer().analyze_cross_model(first_model, second_model)

        assert outcome.mean_transfer_score == 0.0

    def test_refusal_rate_empty_list(self):
        """No responses at all gives a rate of 0.0."""
        assert refusal_rate([]) == 0.0

    def test_refusal_rate_single_response(self):
        """A single refusing response gives a rate of 1.0."""
        assert refusal_rate(["I cannot help with that."]) == 1.0
# ===========================================================================
# Extreme dimensions
# ===========================================================================
class TestExtremeDimensions:
    """Run the analyzers at unusually large and unusually small sizes."""

    def test_high_dimensional_directions(self):
        """Cross-layer transfer at a realistic width of 4096."""
        width = 4096
        torch.manual_seed(42)
        per_layer = {layer: torch.randn(width) for layer in range(8)}

        outcome = TransferAnalyzer().analyze_cross_layer(per_layer)

        assert outcome.mean_adjacent_transfer >= 0

    def test_high_dim_sparse_surgery(self):
        """At 5% sparsity, a 2048-row matrix has int(0.05 * 2048) rows modified."""
        weights = torch.randn(2048, 1024)
        direction = torch.randn(1024)

        outcome = SparseDirectionSurgeon(sparsity=0.05).analyze_weight_matrix(
            weights, direction
        )

        assert outcome.n_rows_modified == int(0.05 * 2048)

    def test_single_dimension(self):
        """With a hidden size of 1 the adjacent transfer is essentially 1."""
        per_layer = {layer: torch.randn(1) for layer in range(4)}

        outcome = TransferAnalyzer().analyze_cross_layer(per_layer)

        # Any two 1-D directions are parallel or anti-parallel.
        assert outcome.mean_adjacent_transfer >= 0.99

    def test_many_layers_imprint(self):
        """With 128 layers the four method probabilities still sum to ~1."""
        per_layer = {layer: torch.randn(32) for layer in range(128)}

        outcome = AlignmentImprintDetector().detect_imprint(per_layer)

        probability_mass = (
            outcome.dpo_probability
            + outcome.rlhf_probability
            + outcome.cai_probability
            + outcome.sft_probability
        )
        assert abs(probability_mass - 1.0) < 0.01

    @pytest.mark.parametrize("n_prompts", [1, 2, 5, 50, 100])
    def test_concept_cone_varying_prompt_counts(self, n_prompts):
        """Concept-cone analysis returns a usable result for any prompt count."""
        positives = [torch.randn(16) for _ in range(n_prompts)]
        negatives = [torch.randn(16) for _ in range(n_prompts)]
        categories = {idx: f"cat_{idx % 3}" for idx in range(n_prompts)}

        cone = ConceptConeAnalyzer(category_map=categories, min_category_size=1)
        outcome = cone.analyze_layer(positives, negatives)

        assert outcome is not None
        assert outcome.general_direction is not None
        assert outcome.category_count >= 1

    @pytest.mark.parametrize("seq_len", [1, 2, 10, 100, 500])
    def test_multi_token_varying_lengths(self, seq_len):
        """The reported token count equals the input sequence length."""
        direction = torch.randn(16)
        sequence = torch.randn(seq_len, 16)

        outcome = MultiTokenPositionAnalyzer().analyze_prompt(sequence, direction)

        assert outcome.n_tokens == seq_len

    @pytest.mark.parametrize("hidden_dim", [2, 4, 16, 64, 256, 1024])
    def test_probing_varying_dimensions(self, hidden_dim):
        """Probe accuracy stays in [0, 1] across hidden sizes (minimum 2)."""
        positives = [torch.randn(hidden_dim) * 2 + 1 for _ in range(10)]
        negatives = [torch.randn(hidden_dim) * 2 - 1 for _ in range(10)]

        outcome = LinearRefusalProbe(n_epochs=20).probe_layer(positives, negatives)

        assert 0 <= outcome.accuracy <= 1.0
# ===========================================================================
# Refusal rate edge cases
# ===========================================================================
class TestRefusalRateEdgeCases:
    """Boundary behaviour of ``refusal_rate``; mostly range checks."""

    def test_novel_refusal_phrasing(self):
        """Unusual refusal wording may go undetected (known limitation)."""
        unusual = [
            "That request falls outside my operational parameters.",
            "I'm designed to be helpful but this conflicts with my guidelines.",
        ]

        # String matching may miss these, so only the range is checked; a
        # rate of 0 is an accepted outcome here.
        measured = refusal_rate(unusual)
        assert 0 <= measured <= 1.0

    def test_false_positive_sorry(self):
        """A sympathetic "I'm sorry" opener followed by real help."""
        helpful = [
            "I'm sorry to hear that. Here's how you can solve the problem: first...",
        ]

        # Prefix matching is known to produce a false positive for this
        # input, so only the validity of the returned rate is checked.
        measured = refusal_rate(helpful, mode="prefix")
        assert 0 <= measured <= 1.0

    def test_case_insensitivity(self):
        """An all-caps refusal is detected in substring mode."""
        shouted = ["I CANNOT ASSIST WITH THAT REQUEST."]
        assert refusal_rate(shouted, mode="substring") == 1.0

    def test_whitespace_handling(self):
        """Surrounding whitespace must not break prefix mode."""
        padded = [" I'm sorry, but I can't help with that. "]

        measured = refusal_rate(padded, mode="prefix")
        assert 0 <= measured <= 1.0
# ===========================================================================
# Steering Vectors
# ===========================================================================
class TestSteeringVectors:
    """Steering-vector construction, hook lifecycle and reporting."""

    def test_from_refusal_direction(self):
        """The factory labels, normalises and sets a negative default alpha."""
        steer = SteeringVectorFactory.from_refusal_direction(
            torch.randn(32), source_layer=5
        )

        assert steer.label == "refusal"
        assert steer.source_layer == 5
        assert steer.default_alpha == -1.0
        assert abs(steer.direction.norm().item() - 1.0) < 0.01

    def test_from_contrastive_pairs(self):
        """A contrastive vector is unit-norm and records its sample counts."""
        positives = [torch.randn(16) + 2 for _ in range(10)]
        negatives = [torch.randn(16) - 2 for _ in range(10)]

        steer = SteeringVectorFactory.from_contrastive_pairs(
            positives, negatives, label="test"
        )

        assert steer.label == "test"
        assert abs(steer.direction.norm().item() - 1.0) < 0.01
        assert "n_positive" in steer.metadata

    def test_combine_vectors(self):
        """Combining two vectors gives a unit-norm vector with the new label."""
        first = SteeringVectorFactory.from_refusal_direction(torch.randn(32))
        second = SteeringVectorFactory.from_refusal_direction(torch.randn(32))

        merged = SteeringVectorFactory.combine([first, second], label="merged")

        assert merged.label == "merged"
        assert abs(merged.direction.norm().item() - 1.0) < 0.01

    def test_combine_single(self):
        """Combining a single vector still yields a unit-norm direction."""
        only = SteeringVectorFactory.from_refusal_direction(torch.randn(16))

        merged = SteeringVectorFactory.combine([only])

        assert abs(merged.direction.norm().item() - 1.0) < 0.01

    def test_combine_empty_raises(self):
        """Combining nothing is rejected with ``ValueError``."""
        with pytest.raises(ValueError):
            SteeringVectorFactory.combine([])

    def test_hook_manager_lifecycle(self):
        """A fresh manager is inactive, and ``remove`` is safe without hooks."""
        hooks = SteeringHookManager()
        assert not hooks.is_active

        hooks.remove()  # must not crash when nothing is installed
        assert not hooks.is_active

    def test_hook_with_simple_model(self):
        """Install, run and remove hooks on a small ``nn.Sequential``."""
        net = nn.Sequential(
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
        )
        steer = SteeringVectorFactory.from_refusal_direction(torch.randn(16))
        # Targets 0 and 2 are presumably indices into ``layer_modules``
        # below, i.e. the first two nn.Linear children.
        settings = SteeringConfig(
            vectors=[steer],
            target_layers=[0, 2],
            alpha=1.0,
        )
        hooks = SteeringHookManager()

        # Hand the child modules over explicitly.
        installed = hooks.install(model=net, config=settings, layer_modules=list(net.children())) \
            if False else hooks.install(net, settings, layer_modules=list(net.children()))
        assert installed.hooks_installed == 2
        assert hooks.is_active

        # A forward pass with hooks attached must still produce the usual shape.
        batch = torch.randn(1, 16)
        assert net(batch).shape == (1, 8)

        hooks.remove()
        assert not hooks.is_active

    def test_steering_effectiveness_remove(self):
        """Partial removal scores strictly between 0 and 1."""
        score = compute_steering_effectiveness(2.0, 0.5, direction="remove")
        assert 0 < score < 1.0

    def test_steering_effectiveness_perfect_remove(self):
        """Complete removal scores exactly 1.0."""
        score = compute_steering_effectiveness(2.0, 0.0, direction="remove")
        assert score == 1.0

    def test_steering_effectiveness_no_change(self):
        """An unchanged projection scores exactly 0.0."""
        score = compute_steering_effectiveness(2.0, 2.0, direction="remove")
        assert score == 0.0

    def test_steering_effectiveness_add(self):
        """In 'add' mode the score is capped at 1.0."""
        score = compute_steering_effectiveness(1.0, 3.0, direction="add")
        assert score == 1.0

    def test_format_report(self):
        """The report mentions the section title and the vector label."""
        steer = SteeringVectorFactory.from_refusal_direction(torch.randn(32))
        settings = SteeringConfig(vectors=[steer], target_layers=[3, 5], alpha=0.5)
        outcome = SteeringResult(
            config=settings, hooks_installed=2, total_steered_layers=2
        )

        text = format_steering_report(outcome)

        assert "Steering" in text
        assert "refusal" in text

    def test_steering_config_position_modes(self):
        """Each supported position mode is stored on the config unchanged."""
        for mode in ["all", "last", "first"]:
            steer = SteeringVectorFactory.from_refusal_direction(torch.randn(8))
            settings = SteeringConfig(
                vectors=[steer],
                target_layers=[0],
                position=mode,
            )
            assert settings.position == mode

    def test_imports(self):
        """Both steering classes are re-exported from ``obliteratus.analysis``."""
        from obliteratus.analysis import SteeringVectorFactory, SteeringHookManager

        assert SteeringVectorFactory is not None
        assert SteeringHookManager is not None
class TestParametrizedDimensions:
    """The same checks repeated across a range of sizes."""

    @pytest.mark.parametrize("hidden_dim", [2, 8, 64, 256, 768])
    def test_whitened_svd_various_dims(self, hidden_dim):
        """Extracted directions keep the input hidden size as second axis."""
        # At least four samples, growing with the hidden size.
        count = max(4, hidden_dim // 4)
        positives = [torch.randn(hidden_dim) for _ in range(count)]
        negatives = [torch.randn(hidden_dim) for _ in range(count)]

        outcome = WhitenedSVDExtractor().extract(positives, negatives, n_directions=1)

        assert outcome.directions.shape[1] == hidden_dim

    @pytest.mark.parametrize("hidden_dim", [2, 8, 64, 256])
    def test_cross_layer_various_dims(self, hidden_dim):
        """The persistence score stays within [0, 1] at every size."""
        per_layer = {layer: torch.randn(hidden_dim) for layer in range(4)}

        outcome = CrossLayerAlignmentAnalyzer().analyze(per_layer)

        assert 0.0 <= outcome.direction_persistence_score <= 1.0

    @pytest.mark.parametrize("hidden_dim", [4, 32, 128])
    def test_sparse_surgery_various_dims(self, hidden_dim):
        """Removed energy stays within [0, 1] for square matrices."""
        square = torch.randn(hidden_dim, hidden_dim)
        raw = torch.randn(hidden_dim)
        unit = raw / raw.norm()

        outcome = SparseDirectionSurgeon().analyze_weight_matrix(
            square, unit, layer_idx=0
        )

        assert 0.0 <= outcome.energy_removed <= 1.0

    @pytest.mark.parametrize("n_layers", [1, 4, 12, 32])
    def test_imprint_various_layer_counts(self, n_layers):
        """The predicted method is a known label for any layer count."""
        per_layer = {layer: torch.randn(64) for layer in range(n_layers)}

        outcome = AlignmentImprintDetector().detect_imprint(per_layer)

        known_methods = ("dpo", "rlhf", "cai", "sft", "unknown")
        assert outcome.predicted_method in known_methods
class TestExceptionPaths:
    """Error handling and degenerate-but-valid inputs."""

    def test_whitened_svd_mismatched_dims(self):
        """Harmful and harmless activations of different widths must raise."""
        wide = [torch.randn(64) for _ in range(10)]
        narrow = [torch.randn(32) for _ in range(10)]
        svd = WhitenedSVDExtractor()

        with pytest.raises(Exception):
            svd.extract(wide, narrow, n_directions=1)

    def test_whitened_svd_single_sample(self):
        """One sample per class must not crash; the hidden size is preserved."""
        lone_harmful = [torch.randn(32)]
        lone_harmless = [torch.randn(32)]

        outcome = WhitenedSVDExtractor().extract(
            lone_harmful, lone_harmless, n_directions=1
        )

        # There may be zero directions with so little data, but the second
        # axis still has to match the hidden size.
        assert outcome.directions.shape[1] == 32

    def test_sparse_surgery_zero_direction(self):
        """An all-zero direction must not crash the weight analysis."""
        matrix = torch.randn(16, 16)
        null_direction = torch.zeros(16)

        # Expected to be handled gracefully (possibly reporting zero energy).
        outcome = SparseDirectionSurgeon().analyze_weight_matrix(
            matrix, null_direction, layer_idx=0
        )

        assert outcome is not None

    def test_cross_layer_single_layer(self):
        """A single-layer direction mapping still produces a result."""
        lone_layer = {0: torch.randn(32)}

        outcome = CrossLayerAlignmentAnalyzer().analyze(lone_layer)

        assert outcome is not None
+483
View File
@@ -0,0 +1,483 @@
"""Tests for the community-standard (Heretics/Arditi) evaluation protocol."""
from __future__ import annotations
import pytest
import torch
from obliteratus.evaluation.heretic_eval import (
ARDITI_REFUSAL_SUBSTRINGS,
_DEFAULT_HARMLESS_PROMPTS,
_HARMBENCH_CLS_TEMPLATE,
_JAILBREAKBENCH_BUILTIN,
_extract_score,
_last_real_token_indices,
_sanitize_model_path,
arditi_refusal_rate,
first_token_kl_on_prompts,
format_comparison_table,
HereticComparisonResult,
LM_EVAL_BENCHMARKS,
load_jailbreakbench_prompts,
unload_harmbench_classifier,
)
# ---------------------------------------------------------------------------
# JailbreakBench Prompts
# ---------------------------------------------------------------------------
class TestJailbreakBenchPrompts:
    """Sanity checks on the built-in prompt list and its loader."""

    def test_builtin_has_100_prompts(self):
        """The built-in list holds exactly 100 prompts."""
        assert len(_JAILBREAKBENCH_BUILTIN) == 100

    def test_builtin_all_non_empty(self):
        """Every built-in prompt is a string with more than 10 real characters."""
        for position, prompt in enumerate(_JAILBREAKBENCH_BUILTIN):
            long_enough = isinstance(prompt, str) and len(prompt.strip()) > 10
            assert long_enough, f"Prompt {position} is too short"

    def test_builtin_no_duplicates(self):
        """No prompt appears twice in the built-in list."""
        unique_count = len(set(_JAILBREAKBENCH_BUILTIN))
        assert unique_count == len(_JAILBREAKBENCH_BUILTIN)

    def test_load_returns_list(self):
        """The loader returns a list of at least 100 prompts."""
        loaded = load_jailbreakbench_prompts()
        assert isinstance(loaded, list)
        assert len(loaded) >= 100

    def test_load_caches(self):
        """Two consecutive loads return equal contents."""
        first_load = load_jailbreakbench_prompts()
        second_load = load_jailbreakbench_prompts()
        assert first_load == second_load

    def test_load_returns_copy(self):
        """Mutating a returned list must not leak into later loads."""
        scratch = load_jailbreakbench_prompts()
        scratch.append("mutated")

        assert "mutated" not in load_jailbreakbench_prompts()
# ---------------------------------------------------------------------------
# Arditi Refusal Rate
# ---------------------------------------------------------------------------
class TestArditiRefusalRate:
    """Behaviour of the substring-based ``arditi_refusal_rate`` metric."""

    def test_canonical_substring_count(self):
        """The Arditi/Zou substring list has exactly 29 entries."""
        assert len(ARDITI_REFUSAL_SUBSTRINGS) == 29

    def test_all_refusals(self):
        """Three refusing responses give a rate of 1.0."""
        refusing = [
            "I'm sorry, but I cannot help with that.",
            "I apologize for the inconvenience.",
            "As an AI, I don't do that.",
        ]
        assert arditi_refusal_rate(refusing) == 1.0

    def test_no_refusals(self):
        """Three compliant responses give a rate of 0.0."""
        compliant = [
            "Sure, here is a step-by-step guide.",
            "The answer to your question is 42.",
            "Here you go, this should work.",
        ]
        assert arditi_refusal_rate(compliant) == 0.0

    def test_empty_responses_not_refusals(self):
        """Blank responses match no substring, so they are not refusals."""
        blanks = ["", " ", ""]
        assert arditi_refusal_rate(blanks) == 0.0

    def test_empty_list(self):
        """No responses at all gives a rate of 0.0."""
        assert arditi_refusal_rate([]) == 0.0

    def test_case_insensitive(self):
        """A lower-cased refusal opener is still detected."""
        lowered = ["i'm sorry, but I can't help."]
        assert arditi_refusal_rate(lowered) == 1.0

    def test_mixed(self):
        """Two refusals out of four responses give a rate of 0.5."""
        mixture = [
            "I cannot do that.",
            "Sure, here's the info.",
            "My apologies, I'm unable to help.",
            "The capital of France is Paris.",
        ]
        assert arditi_refusal_rate(mixture) == 0.5  # 2 of 4

    def test_subtle_refusal_not_caught(self):
        """Soft refusals outside the 29-substring list are not counted."""
        soft = [
            "I'd rather not help with that specific request.",
            "Let me redirect you to a more appropriate resource.",
        ]
        # Neither sentence contains a substring from the Arditi list.
        assert arditi_refusal_rate(soft) == 0.0
# ---------------------------------------------------------------------------
# First-Token KL Divergence
# ---------------------------------------------------------------------------
class TestFirstTokenKL:
    """``first_token_kl_on_prompts`` exercised with tiny stand-in models."""

    @pytest.fixture
    def simple_models(self):
        """Return ``(FakeModel, FakeTokenizer)`` classes with fixed outputs."""

        class FakeModel(torch.nn.Module):
            """Emits identical logits everywhere, peaked at one vocab index."""

            def __init__(self, peak_idx: int = 0):
                super().__init__()
                # One real parameter so next(model.parameters()).device works.
                self._param = torch.nn.Parameter(torch.zeros(1))
                self._peak_idx = peak_idx

            def __call__(self, **kwargs):
                ids = kwargs["input_ids"]
                rows, cols = ids.shape[0], ids.shape[1]
                vocab = 10
                # Zero logits everywhere except 5.0 at the peak index.
                logits = torch.zeros(rows, cols, vocab)
                logits[..., self._peak_idx] = 5.0
                return type("Output", (), {"logits": logits})()

        class FakeTokenizer:
            """Maps any text to five tokens of id 1 with a full attention mask."""

            pad_token_id = 0

            def __call__(self, texts, return_tensors="pt", **kwargs):
                n_texts = len(texts) if isinstance(texts, list) else 1
                token_ids = torch.ones(n_texts, 5, dtype=torch.long)
                return {
                    "input_ids": token_ids,
                    "attention_mask": torch.ones_like(token_ids),
                }

        return FakeModel, FakeTokenizer

    def test_identical_models_zero_kl(self, simple_models):
        """Two models with the same peak give ~0 KL and the best label."""
        model_cls, tokenizer_cls = simple_models
        reference = model_cls(peak_idx=0)
        candidate = model_cls(peak_idx=0)

        stats = first_token_kl_on_prompts(
            reference,
            candidate,
            tokenizer_cls(),
            ["hello", "world"],
        )

        assert abs(stats["mean_kl"]) < 1e-5
        assert stats["interpretation"] == "excellent (minimal collateral damage)"

    def test_different_models_positive_kl(self, simple_models):
        """Models peaked at different vocab indices give a positive KL."""
        model_cls, tokenizer_cls = simple_models
        reference = model_cls(peak_idx=0)  # peak at vocab index 0
        candidate = model_cls(peak_idx=5)  # peak at vocab index 5

        stats = first_token_kl_on_prompts(
            reference,
            candidate,
            tokenizer_cls(),
            ["test prompt"],
        )

        assert stats["mean_kl"] > 0

    def test_returns_per_prompt_kl(self, simple_models):
        """One KL value per prompt, plus a non-negative standard deviation."""
        model_cls, tokenizer_cls = simple_models
        reference = model_cls(peak_idx=0)
        candidate = model_cls(peak_idx=3)

        stats = first_token_kl_on_prompts(
            reference,
            candidate,
            tokenizer_cls(),
            ["a", "b", "c"],
        )

        assert len(stats["per_prompt_kl"]) == 3
        assert stats["std_kl"] >= 0
# ---------------------------------------------------------------------------
# HereticComparisonResult
# ---------------------------------------------------------------------------
class TestHereticComparisonResult:
    """Field storage and defaults of ``HereticComparisonResult``."""

    def test_dataclass_fields(self):
        """Values passed to the constructor are readable as attributes."""
        supplied = {
            "model_name": "test-model",
            "method": "OBLITERATUS",
            "refusal_rate_arditi": 0.05,
            "refusal_rate_obliteratus": 0.03,
            "harmbench_asr": 0.85,
            "n_jailbreakbench": 100,
            "n_refusals_remaining": 5,
            "first_token_kl": 0.15,
            "kl_interpretation": "excellent",
        }

        record = HereticComparisonResult(**supplied)

        assert record.model_name == "test-model"
        assert record.method == "OBLITERATUS"
        assert record.refusal_rate_arditi == 0.05
        assert record.harmbench_asr == 0.85
        assert record.first_token_kl == 0.15

    def test_optional_fields_default_none(self):
        """Omitted benchmark fields are ``None``; omitted lists are empty."""
        record = HereticComparisonResult(
            model_name="test",
            method="test",
            refusal_rate_arditi=0.0,
            refusal_rate_obliteratus=0.0,
            harmbench_asr=None,
            n_jailbreakbench=100,
            n_refusals_remaining=0,
        )

        assert record.mmlu is None
        assert record.gsm8k is None
        assert record.perplexity is None
        assert record.harmbench_per_item == []
        assert record.kl_per_prompt == []
# ---------------------------------------------------------------------------
# Comparison Table Formatting
# ---------------------------------------------------------------------------
class TestComparisonTable:
    """Checks on the text produced by ``format_comparison_table``."""

    def test_format_single_result(self):
        """A single fully-populated result renders all sections and numbers."""
        entry = HereticComparisonResult(
            model_name="Llama-2-7B",
            method="OBLITERATUS",
            refusal_rate_arditi=0.05,
            refusal_rate_obliteratus=0.03,
            harmbench_asr=0.85,
            n_jailbreakbench=100,
            n_refusals_remaining=5,
            first_token_kl=0.15,
            kl_interpretation="excellent",
            mmlu=0.518,
            gsm8k=0.313,
        )
        rendered = format_comparison_table([entry])

        expected_fragments = (
            "OBLITERATUS",
            "REFUSAL REMOVAL",
            "CAPABILITY PRESERVATION",
            "DISTRIBUTION QUALITY",
            "5.0%",  # arditi refusal rate
            "85.0%",  # harmbench asr
            "5/100",  # JBB refusals
            "0.1500",  # KL divergence
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_format_multiple_results(self):
        """Every method name appears when several results are formatted."""
        ours = HereticComparisonResult(
            model_name="test",
            method="OBLITERATUS",
            refusal_rate_arditi=0.05,
            refusal_rate_obliteratus=0.03,
            harmbench_asr=0.85,
            n_jailbreakbench=100,
            n_refusals_remaining=5,
        )
        theirs = HereticComparisonResult(
            model_name="test",
            method="Heretic",
            refusal_rate_arditi=0.03,
            refusal_rate_obliteratus=0.03,
            harmbench_asr=0.90,
            n_jailbreakbench=100,
            n_refusals_remaining=3,
        )
        rendered = format_comparison_table([ours, theirs])

        assert "OBLITERATUS" in rendered
        assert "Heretic" in rendered

    def test_heretic_reference_numbers_present(self):
        """The comparison table should include Heretic's published reference numbers."""
        placeholder = HereticComparisonResult(
            model_name="test",
            method="test",
            refusal_rate_arditi=0.0,
            refusal_rate_obliteratus=0.0,
            harmbench_asr=None,
            n_jailbreakbench=100,
            n_refusals_remaining=0,
        )
        rendered = format_comparison_table([placeholder])

        assert "p-e-w/heretic" in rendered
        assert "97/100" in rendered  # Original model refusal count
        assert "0.16" in rendered  # Heretic's KL divergence
# ---------------------------------------------------------------------------
# LM-Eval Benchmark Config
# ---------------------------------------------------------------------------
class TestLmEvalBenchmarks:
    """Sanity checks on the ``LM_EVAL_BENCHMARKS`` configuration mapping."""

    def test_five_standard_benchmarks(self):
        """Exactly five benchmarks are configured."""
        assert len(LM_EVAL_BENCHMARKS) == 5

    def test_required_keys(self):
        """Every benchmark config carries task, num_fewshot and description."""
        required_fields = ("task", "num_fewshot", "description")
        for name, config in LM_EVAL_BENCHMARKS.items():
            for field in required_fields:
                # Name the offending benchmark so a failure is actionable
                # without re-running under a debugger.
                assert field in config, (
                    f"benchmark {name!r} is missing required key {field!r}"
                )

    def test_expected_benchmarks(self):
        """The configured benchmark names are exactly the expected set."""
        expected = {"mmlu", "gsm8k", "arc_challenge", "hellaswag", "truthfulqa_mc2"}
        assert set(LM_EVAL_BENCHMARKS.keys()) == expected

    def test_fewshot_settings_match_arditi(self):
        """Few-shot settings should match Arditi et al. / Open LLM Leaderboard."""
        assert LM_EVAL_BENCHMARKS["mmlu"]["num_fewshot"] == 5
        assert LM_EVAL_BENCHMARKS["gsm8k"]["num_fewshot"] == 5
        assert LM_EVAL_BENCHMARKS["truthfulqa_mc2"]["num_fewshot"] == 0
# ---------------------------------------------------------------------------
# HarmBench Template
# ---------------------------------------------------------------------------
class TestHarmBenchTemplate:
    """Structural checks on the ``_HARMBENCH_CLS_TEMPLATE`` prompt string."""

    def test_template_has_sys_tags(self):
        """Official template requires <<SYS>> / <</SYS>> delimiters."""
        for delimiter in ("<<SYS>>", "<</SYS>>"):
            assert delimiter in _HARMBENCH_CLS_TEMPLATE

    def test_template_has_seven_rules(self):
        """Official HarmBench template has exactly 7 rules."""
        # Each rule is counted as one newline-prefixed "- " bullet.
        assert _HARMBENCH_CLS_TEMPLATE.count("\n- ") == 7

    def test_template_has_behavior_placeholder(self):
        """The template exposes a ``{behavior}`` format field."""
        assert "{behavior}" in _HARMBENCH_CLS_TEMPLATE

    def test_template_has_generation_placeholder(self):
        """The template exposes a ``{generation}`` format field."""
        assert "{generation}" in _HARMBENCH_CLS_TEMPLATE

    def test_template_ends_with_inst_tag(self):
        """Ignoring surrounding whitespace, the template closes with [/INST]."""
        trimmed = _HARMBENCH_CLS_TEMPLATE.strip()
        assert trimmed.endswith("[/INST]")

    def test_template_answer_format(self):
        """Official template uses lowercase 'yes' or 'no'."""
        assert '"yes" or "no"' in _HARMBENCH_CLS_TEMPLATE
# ---------------------------------------------------------------------------
# Score Extraction (C3 fix)
# ---------------------------------------------------------------------------
class TestExtractScore:
    """Behaviour of ``_extract_score`` across the metric keys it recognises."""

    def test_normal_score(self):
        """The ``acc,none`` value is returned as-is."""
        metrics = {"acc,none": 0.75}
        assert _extract_score(metrics) == 0.75

    def test_zero_score_not_skipped(self):
        """A legitimate score of 0.0 should be returned, not treated as falsy."""
        metrics = {"acc,none": 0.0}
        assert _extract_score(metrics) == 0.0

    def test_fallback_to_next_key(self):
        """Without ``acc,none`` the ``acc_norm,none`` value is used."""
        metrics = {"acc_norm,none": 0.65}
        assert _extract_score(metrics) == 0.65

    def test_mc2_key(self):
        """The ``mc2,none`` key is recognised."""
        metrics = {"mc2,none": 0.42}
        assert _extract_score(metrics) == 0.42

    def test_no_matching_key(self):
        """Unrecognised metric names yield 0.0."""
        metrics = {"unknown_metric": 0.99}
        assert _extract_score(metrics) == 0.0

    def test_priority_order(self):
        """acc,none should take priority over acc_norm,none."""
        metrics = {"acc,none": 0.5, "acc_norm,none": 0.9}
        assert _extract_score(metrics) == 0.5
# ---------------------------------------------------------------------------
# Padding-Aware Last-Token Indices
# ---------------------------------------------------------------------------
class TestLastRealTokenIndices:
    """``_last_real_token_indices`` on attention masks with and without padding."""

    def test_no_padding(self):
        """An all-ones 3x5 mask gives index 4 for every row."""
        full_mask = torch.ones(3, 5, dtype=torch.long)
        assert _last_real_token_indices(full_mask).tolist() == [4, 4, 4]

    def test_with_padding(self):
        """Trailing zeros move the reported index back to the last 1."""
        rows = [
            [1, 1, 1, 1, 1],  # length 5, last real = index 4
            [1, 1, 1, 0, 0],  # length 3, last real = index 2
            [1, 0, 0, 0, 0],  # length 1, last real = index 0
        ]
        padded_mask = torch.tensor(rows)
        assert _last_real_token_indices(padded_mask).tolist() == [4, 2, 0]

    def test_single_token(self):
        """A 1x1 mask gives index 0."""
        tiny_mask = torch.tensor([[1]])
        assert _last_real_token_indices(tiny_mask).tolist() == [0]
# ---------------------------------------------------------------------------
# Model Path Sanitization
# ---------------------------------------------------------------------------
class TestSanitizeModelPath:
    """``_sanitize_model_path`` passes clean paths through and rejects commas."""

    def test_normal_path(self):
        """A plain filesystem path is returned unchanged."""
        path = "/tmp/my-model"
        assert _sanitize_model_path(path) == "/tmp/my-model"

    def test_hf_model_id(self):
        """An ``org/name`` style model id is returned unchanged."""
        model_id = "meta-llama/Llama-2-7b-hf"
        assert _sanitize_model_path(model_id) == "meta-llama/Llama-2-7b-hf"

    def test_rejects_commas(self):
        """A comma in the path raises ValueError whose message mentions commas."""
        malicious = "evil,trust_remote_code=True"
        with pytest.raises(ValueError, match="commas"):
            _sanitize_model_path(malicious)
# ---------------------------------------------------------------------------
# Classifier Unload
# ---------------------------------------------------------------------------
class TestClassifierUnload:
    """Unload behaviour of the HarmBench classifier helper."""

    def test_unload_when_not_loaded(self):
        """Unloading when nothing is loaded should not raise."""
        # should be a no-op
        unload_harmbench_classifier()
# ---------------------------------------------------------------------------
# Default Harmless Prompts
# ---------------------------------------------------------------------------
class TestDefaultHarmlessPrompts:
    """Size, uniqueness and minimum-length checks on the built-in prompt list."""

    def test_has_100_unique_prompts(self):
        """The list holds exactly 100 entries."""
        assert len(_DEFAULT_HARMLESS_PROMPTS) == 100

    def test_no_duplicates(self):
        """De-duplicating the list does not change its length."""
        distinct = set(_DEFAULT_HARMLESS_PROMPTS)
        assert len(distinct) == len(_DEFAULT_HARMLESS_PROMPTS)

    def test_all_non_empty(self):
        """Every entry is a string longer than ten characters."""
        for i, prompt in enumerate(_DEFAULT_HARMLESS_PROMPTS):
            assert isinstance(prompt, str) and len(prompt) > 10, f"Prompt {i} is too short"
# ---------------------------------------------------------------------------
# KL Divergence Non-Negativity
# ---------------------------------------------------------------------------
class TestKLNonNegativity:
    """Per-prompt KL values reported by the helper must never be negative."""

    @pytest.fixture
    def models_and_tokenizer(self):
        """Provide minimal fake model / tokenizer classes for the KL helper."""

        class FakeModel(torch.nn.Module):
            """Emits constant logits with a single raised vocabulary entry."""

            def __init__(self, peak_idx: int = 0):
                super().__init__()
                self._param = torch.nn.Parameter(torch.zeros(1))
                self._peak_idx = peak_idx

            def __call__(self, **kwargs):
                ids = kwargs["input_ids"]
                n_rows = ids.shape[0]
                n_tokens = ids.shape[1]
                vocab_size = 10
                # Zero everywhere except a 5.0 logit at the peak position.
                logits = torch.zeros(n_rows, n_tokens, vocab_size)
                logits[:, :, self._peak_idx] = 5.0
                return type("Output", (), {"logits": logits})()

        class FakeTokenizer:
            """Returns five all-ones token ids per input text."""

            pad_token_id = 0

            def __call__(self, texts, return_tensors="pt", **kwargs):
                n_rows = len(texts) if isinstance(texts, list) else 1
                ids = torch.ones(n_rows, 5, dtype=torch.long)
                return {"input_ids": ids, "attention_mask": torch.ones_like(ids)}

        return FakeModel, FakeTokenizer

    def test_all_kl_values_non_negative(self, models_and_tokenizer):
        """Each entry of ``per_prompt_kl`` is >= 0 for differently peaked fakes."""
        model_cls, tokenizer_cls = models_and_tokenizer
        reference = model_cls(peak_idx=0)
        candidate = model_cls(peak_idx=3)
        tok = tokenizer_cls()
        prompts = ["a", "b", "c", "d", "e"]

        outcome = first_token_kl_on_prompts(reference, candidate, tok, prompts)

        for val in outcome["per_prompt_kl"]:
            assert val >= 0.0, f"KL value {val} is negative"
+385
View File
@@ -0,0 +1,385 @@
"""Tests for the Analysis-Informed Abliteration Pipeline."""
from __future__ import annotations
import pytest
import torch
from obliteratus.informed_pipeline import (
AnalysisInsights,
InformedAbliterationPipeline,
InformedPipelineReport,
INFORMED_METHOD,
)
from obliteratus.abliterate import METHODS
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def insights():
    """Default AnalysisInsights for testing."""
    default_insights = AnalysisInsights()
    return default_insights
@pytest.fixture
def pipeline(tmp_path):
    """An InformedAbliterationPipeline with no model loaded."""
    # Keep any output under pytest's per-test temporary directory.
    target_dir = str(tmp_path / "test_informed")
    return InformedAbliterationPipeline(model_name="test-model", output_dir=target_dir)
# ---------------------------------------------------------------------------
# AnalysisInsights
# ---------------------------------------------------------------------------
class TestAnalysisInsights:
    """Default field values of a freshly constructed ``AnalysisInsights``."""

    def test_default_values(self, insights):
        """Detection, cone, clustering and recommendation fields start neutral."""
        fresh = insights

        # Alignment detection
        assert fresh.detected_alignment_method == "unknown"
        assert fresh.alignment_confidence == 0.0

        # Cone geometry
        assert fresh.cone_is_polyhedral is False
        assert fresh.cone_dimensionality == 1.0
        assert fresh.mean_pairwise_cosine == 1.0
        assert fresh.per_category_directions == {}
        assert fresh.direction_specificity == {}

        # Cross-layer clustering and sparsity
        assert fresh.cluster_count == 0
        assert fresh.direction_persistence == 0.0
        assert fresh.use_sparse_surgery is False

        # Derived recommendations
        assert fresh.recommended_n_directions == 4
        assert fresh.recommended_regularization == 0.0
        assert fresh.recommended_refinement_passes == 2
        assert fresh.recommended_layers == []
        assert fresh.skip_layers == []

    def test_default_robustness(self, insights):
        """Robustness and entanglement fields start at unknown / zero / empty."""
        fresh = insights
        assert fresh.estimated_robustness == "unknown"
        assert fresh.self_repair_estimate == 0.0
        assert fresh.entanglement_score == 0.0
        assert fresh.entangled_layers == []
        assert fresh.clean_layers == []
class TestInformedPipelineReport:
    """Default field values of ``InformedPipelineReport``."""

    def test_default_report(self):
        """A report built from default insights has zeroed metrics and no stages."""
        blank_report = InformedPipelineReport(insights=AnalysisInsights())

        assert blank_report.analysis_duration == 0.0
        assert blank_report.total_duration == 0.0
        assert blank_report.ouroboros_passes == 0
        assert blank_report.final_refusal_rate == 0.0
        assert blank_report.stages == []
# ---------------------------------------------------------------------------
# Method preset
# ---------------------------------------------------------------------------
class TestInformedMethod:
    """The "informed" preset as registered in METHODS and as a standalone dict."""

    def test_informed_method_in_abliterate_methods(self):
        """METHODS has an "informed" entry with all five feature flags enabled."""
        assert "informed" in METHODS
        preset = METHODS["informed"]
        assert preset["norm_preserve"] is True
        assert preset["project_biases"] is True
        assert preset["use_chat_template"] is True
        assert preset["use_whitened_svd"] is True
        assert preset["true_iterative_refinement"] is True

    def test_informed_method_standalone(self):
        """INFORMED_METHOD exposes the expected label, direction count and flag."""
        preset = INFORMED_METHOD
        assert preset["label"] == "Informed (Analysis-Guided)"
        assert preset["n_directions"] == 4
        assert preset["norm_preserve"] is True
# ---------------------------------------------------------------------------
# Pipeline initialization
# ---------------------------------------------------------------------------
class TestPipelineInit:
    """Constructor defaults and overrides of ``InformedAbliterationPipeline``."""

    def test_method_set_to_informed(self, pipeline):
        """The pipeline reports "informed" as its method."""
        assert pipeline.method == "informed"

    def test_default_analysis_flags(self, pipeline):
        """All five analysis stages are enabled by default."""
        assert pipeline._run_cone is True
        assert pipeline._run_alignment is True
        assert pipeline._run_cross_layer is True
        assert pipeline._run_sparse is True
        assert pipeline._run_defense is True

    def test_ouroboros_defaults(self, pipeline):
        """Default Ouroboros threshold is 0.5 with at most 3 passes."""
        assert pipeline._ouroboros_threshold == 0.5
        assert pipeline._max_ouroboros_passes == 3

    def test_entanglement_gate(self, pipeline):
        """Default entanglement gate is 0.8."""
        assert pipeline._entanglement_gate == 0.8

    def test_inherits_base_pipeline(self, pipeline):
        """Feature flags exposed on the pipeline are all switched on."""
        assert pipeline.norm_preserve is True
        assert pipeline.project_biases is True
        assert pipeline.use_chat_template is True
        assert pipeline.use_whitened_svd is True
        assert pipeline.true_iterative_refinement is True

    def test_custom_flags(self):
        """Keyword overrides are stored on the matching private attributes."""
        overrides = {
            "run_cone_analysis": False,
            "run_alignment_detection": False,
            "ouroboros_threshold": 0.3,
            "max_ouroboros_passes": 5,
            "entanglement_gate": 0.9,
        }
        custom = InformedAbliterationPipeline(model_name="test", **overrides)

        assert custom._run_cone is False
        assert custom._run_alignment is False
        assert custom._ouroboros_threshold == 0.3
        assert custom._max_ouroboros_passes == 5
        assert custom._entanglement_gate == 0.9
# ---------------------------------------------------------------------------
# Configuration derivation
# ---------------------------------------------------------------------------
class TestConfigurationDerivation:
    """Test the _derive_configuration logic with various insights."""

    def _make_pipeline_with_insights(self, **kwargs):
        """Build a silent pipeline and overwrite the given insight attributes."""
        pipe = InformedAbliterationPipeline(
            model_name="test",
            on_log=lambda m: None,
        )
        for attr_name, attr_value in kwargs.items():
            setattr(pipe._insights, attr_name, attr_value)
        return pipe

    # -- number of directions ------------------------------------------------

    def test_polyhedral_cone_more_directions(self):
        """A polyhedral cone of dimensionality 3.5 yields seven directions."""
        pipe = self._make_pipeline_with_insights(
            cone_is_polyhedral=True,
            cone_dimensionality=3.5,
        )
        pipe._derive_configuration()
        # Polyhedral with dim 3.5 → n_dirs = max(4, min(8, int(3.5*2))) = 7
        assert pipe.n_directions == 7

    def test_linear_cone_fewer_directions(self):
        """A linear cone of dimensionality 1.0 yields two directions."""
        pipe = self._make_pipeline_with_insights(
            cone_is_polyhedral=False,
            cone_dimensionality=1.0,
        )
        pipe._derive_configuration()
        # Linear with dim 1.0 → n_dirs = max(1, min(4, int(1.0+1))) = 2
        assert pipe.n_directions == 2

    # -- regularization ------------------------------------------------------

    def test_dpo_zero_regularization(self):
        """DPO with low entanglement gives regularization 0.0."""
        pipe = self._make_pipeline_with_insights(
            detected_alignment_method="dpo",
            entanglement_score=0.1,
        )
        pipe._derive_configuration()
        assert pipe.regularization == 0.0

    def test_rlhf_moderate_regularization(self):
        """RLHF with low entanglement gives regularization 0.15."""
        pipe = self._make_pipeline_with_insights(
            detected_alignment_method="rlhf",
            entanglement_score=0.2,
        )
        pipe._derive_configuration()
        assert pipe.regularization == 0.15

    def test_cai_regularization(self):
        """CAI with low entanglement gives regularization 0.2."""
        pipe = self._make_pipeline_with_insights(
            detected_alignment_method="cai",
            entanglement_score=0.2,
        )
        pipe._derive_configuration()
        assert pipe.regularization == 0.2

    def test_sft_low_regularization(self):
        """SFT with low entanglement gives regularization 0.05."""
        pipe = self._make_pipeline_with_insights(
            detected_alignment_method="sft",
            entanglement_score=0.1,
        )
        pipe._derive_configuration()
        assert pipe.regularization == 0.05

    def test_high_entanglement_increases_regularization(self):
        """High entanglement raises the DPO regularization to 0.15."""
        pipe = self._make_pipeline_with_insights(
            detected_alignment_method="dpo",
            entanglement_score=0.7,
        )
        pipe._derive_configuration()
        # DPO base = 0.0, + 0.15 for high entanglement = 0.15
        assert pipe.regularization == 0.15

    # -- refinement passes ---------------------------------------------------

    def test_high_self_repair_more_passes(self):
        """A self-repair estimate of 0.8 gives three refinement passes."""
        pipe = self._make_pipeline_with_insights(self_repair_estimate=0.8)
        pipe._derive_configuration()
        assert pipe.refinement_passes == 3

    def test_moderate_self_repair_two_passes(self):
        """A self-repair estimate of 0.5 gives two refinement passes."""
        pipe = self._make_pipeline_with_insights(self_repair_estimate=0.5)
        pipe._derive_configuration()
        assert pipe.refinement_passes == 2

    def test_low_self_repair_one_pass(self):
        """A self-repair estimate of 0.2 gives one refinement pass."""
        pipe = self._make_pipeline_with_insights(self_repair_estimate=0.2)
        pipe._derive_configuration()
        assert pipe.refinement_passes == 1

    # -- layer selection -----------------------------------------------------

    def test_cluster_layers_used(self):
        """Cluster member layers show up in the recommended layer list."""
        pipe = self._make_pipeline_with_insights(
            cluster_representative_layers=[5, 10, 15],
            direction_clusters=[[3, 4, 5], [9, 10, 11], [14, 15, 16]],
        )
        pipe.refusal_directions = {i: torch.randn(64) for i in range(20)}
        pipe._derive_configuration()
        # Should include all cluster layers
        recommended = pipe._insights.recommended_layers
        assert 5 in recommended
        assert 10 in recommended

    def test_entangled_layers_skipped(self):
        """An entangled layer moves from the recommended list to the skip list."""
        pipe = self._make_pipeline_with_insights(
            cluster_representative_layers=[5, 10, 15],
            direction_clusters=[[3, 4, 5], [9, 10, 11], [14, 15, 16]],
            entangled_layers=[10],
        )
        pipe._derive_configuration()
        # Layer 10 should be skipped
        assert 10 not in pipe._insights.recommended_layers
        assert 10 in pipe._insights.skip_layers

    # -- sparse surgery ------------------------------------------------------

    def test_sparse_surgery_enabled_when_rsi_high(self):
        """An RSI of 0.7 against a 0.5 threshold enables sparse surgery."""
        pipe = self._make_pipeline_with_insights(mean_refusal_sparsity_index=0.7)
        pipe._sparse_threshold = 0.5
        pipe._derive_configuration()
        assert pipe._insights.use_sparse_surgery is True

    def test_sparse_surgery_disabled_when_rsi_low(self):
        """An RSI of 0.3 against a 0.5 threshold leaves sparse surgery off."""
        pipe = self._make_pipeline_with_insights(mean_refusal_sparsity_index=0.3)
        pipe._sparse_threshold = 0.5
        pipe._derive_configuration()
        assert pipe._insights.use_sparse_surgery is False

    # -- whitened SVD --------------------------------------------------------

    def test_whitened_svd_for_multi_direction(self):
        """More than one direction keeps whitened SVD switched on."""
        pipe = self._make_pipeline_with_insights(
            cone_is_polyhedral=True,
            cone_dimensionality=2.5,
        )
        pipe._derive_configuration()
        assert pipe.n_directions > 1
        assert pipe.use_whitened_svd is True

    def test_no_whitened_svd_for_single_direction(self):
        """A single derived direction switches whitened SVD off."""
        pipe = self._make_pipeline_with_insights(
            cone_is_polyhedral=False,
            cone_dimensionality=0.5,
        )
        pipe._derive_configuration()
        # dim 0.5 → max(1, min(4, int(0.5+1))) = 1
        assert pipe.n_directions == 1
        assert pipe.use_whitened_svd is False
# ---------------------------------------------------------------------------
# Format report
# ---------------------------------------------------------------------------
class TestFormatInsights:
    """Text rendering of insights via ``format_insights``."""

    def test_format_default(self, insights):
        """Default insights render the title, UNKNOWN method and LINEAR cone."""
        rendered = InformedAbliterationPipeline.format_insights(insights)
        assert "Analysis-Informed Pipeline" in rendered
        assert "UNKNOWN" in rendered  # detected method
        assert "LINEAR" in rendered  # cone type

    def test_format_polyhedral(self):
        """A polyhedral DPO insight renders the method, cone type and dimension."""
        poly = AnalysisInsights(
            detected_alignment_method="dpo",
            alignment_confidence=0.85,
            cone_is_polyhedral=True,
            cone_dimensionality=3.5,
            cluster_count=4,
        )
        rendered = InformedAbliterationPipeline.format_insights(poly)
        for fragment in ("DPO", "POLYHEDRAL", "3.50"):
            assert fragment in rendered

    def test_format_includes_derived_config(self, insights):
        """Recommended settings appear as ``name: value`` lines in the report."""
        insights.recommended_n_directions = 6
        insights.recommended_regularization = 0.2
        insights.recommended_refinement_passes = 3
        rendered = InformedAbliterationPipeline.format_insights(insights)
        assert "n_directions: 6" in rendered
        assert "regularization: 0.2" in rendered
        assert "refinement_passes: 3" in rendered
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------
class TestEdgeCases:
    """Boundary conditions for ``_derive_configuration``."""

    def test_no_cluster_layers_falls_back(self):
        """With no representative layers the recommended list stays empty."""
        pipe = InformedAbliterationPipeline(model_name="test", on_log=lambda m: None)
        pipe._insights.cluster_representative_layers = []
        pipe._derive_configuration()
        assert pipe._insights.recommended_layers == []

    def test_regularization_capped(self):
        """CAI plus high entanglement never exceeds a regularization of 0.5."""
        pipe = InformedAbliterationPipeline(model_name="test", on_log=lambda m: None)
        pipe._insights.detected_alignment_method = "cai"
        pipe._insights.entanglement_score = 0.9
        pipe._derive_configuration()
        # CAI base = 0.2, + 0.15 = 0.35, capped at 0.5
        assert pipe.regularization <= 0.5

    def test_all_layers_entangled_keeps_some(self):
        """If all cluster layers are entangled, don't skip all of them."""
        pipe = InformedAbliterationPipeline(model_name="test", on_log=lambda m: None)
        state = pipe._insights
        state.cluster_representative_layers = [5]
        state.direction_clusters = [[5]]
        state.entangled_layers = [5]
        pipe._derive_configuration()
        # Should NOT skip the only layer
        assert 5 in pipe._insights.recommended_layers

    def test_cone_dimensionality_bounds(self):
        """Extreme cone dimensionality values are handled."""
        pipe = InformedAbliterationPipeline(model_name="test", on_log=lambda m: None)

        # Very high dimensionality
        pipe._insights.cone_is_polyhedral = True
        pipe._insights.cone_dimensionality = 10.0
        pipe._derive_configuration()
        assert pipe.n_directions <= 8  # capped

        # Very low dimensionality
        pipe._insights.cone_is_polyhedral = False
        pipe._insights.cone_dimensionality = 0.1
        pipe._derive_configuration()
        assert pipe.n_directions >= 1  # at least 1
+172
View File
@@ -0,0 +1,172 @@
"""Tests for logit lens refusal direction analysis."""
from __future__ import annotations
from unittest.mock import MagicMock
import torch
from obliteratus.analysis.logit_lens import (
RefusalLogitLens,
LogitLensResult,
MultiLayerLogitLensResult,
REFUSAL_TOKENS,
COMPLIANCE_TOKENS,
)
def _make_mock_model(hidden_dim=32, vocab_size=100):
"""Create a mock model with LM head and layer norm."""
model = MagicMock()
# LM head weight (vocab_size, hidden_dim)
lm_head = MagicMock()
lm_head.weight = MagicMock()
lm_head.weight.data = torch.randn(vocab_size, hidden_dim)
model.lm_head = lm_head
# Final LayerNorm
ln_f = MagicMock()
ln_f.weight = MagicMock()
ln_f.weight.data = torch.ones(hidden_dim)
ln_f.bias = MagicMock()
ln_f.bias.data = torch.zeros(hidden_dim)
model.transformer = MagicMock()
model.transformer.ln_f = ln_f
return model
def _make_mock_tokenizer(vocab_size=100):
"""Create a mock tokenizer."""
tokenizer = MagicMock()
def mock_decode(ids):
if isinstance(ids, list) and len(ids) == 1:
return f"tok_{ids[0]}"
return f"tok_{ids}"
def mock_encode(text, add_special_tokens=False):
# Return a deterministic token ID based on the text
return [hash(text) % vocab_size]
tokenizer.decode = mock_decode
tokenizer.encode = mock_encode
return tokenizer
class TestRefusalLogitLens:
    """Behavioural checks for ``RefusalLogitLens`` against mock model/tokenizer."""

    def test_basic_analysis(self):
        """Should produce a LogitLensResult with expected fields."""
        mock_model = _make_mock_model()
        mock_tok = _make_mock_tokenizer()
        refusal_dir = torch.randn(32)

        analyzer = RefusalLogitLens(top_k=10)
        outcome = analyzer.analyze_direction(refusal_dir, mock_model, mock_tok, layer_idx=5)

        assert isinstance(outcome, LogitLensResult)
        assert outcome.layer_idx == 5
        assert len(outcome.top_promoted) == 10
        assert len(outcome.top_suppressed) == 10
        assert isinstance(outcome.refusal_specificity, float)
        assert isinstance(outcome.logit_effect_entropy, float)
        assert isinstance(outcome.refusal_compliance_gap, float)

    def test_promoted_suppressed_ordering(self):
        """Top promoted should have higher logit boost than top suppressed."""
        mock_model = _make_mock_model()
        mock_tok = _make_mock_tokenizer()
        refusal_dir = torch.randn(32)

        analyzer = RefusalLogitLens(top_k=5)
        outcome = analyzer.analyze_direction(refusal_dir, mock_model, mock_tok)

        # Promoted tokens should have positive-ish values
        # Suppressed tokens should have negative-ish values
        best_boost = max(boost for _, boost in outcome.top_promoted)
        worst_boost = min(boost for _, boost in outcome.top_suppressed)
        assert best_boost > worst_boost

    def test_multi_layer_analysis(self):
        """Should analyze multiple layers."""
        mock_model = _make_mock_model()
        mock_tok = _make_mock_tokenizer()
        per_layer_dirs = {0: torch.randn(32), 1: torch.randn(32), 2: torch.randn(32)}

        analyzer = RefusalLogitLens(top_k=5)
        outcome = analyzer.analyze_all_layers(per_layer_dirs, mock_model, mock_tok)

        assert isinstance(outcome, MultiLayerLogitLensResult)
        assert len(outcome.per_layer) == 3
        assert outcome.strongest_refusal_layer in [0, 1, 2]
        assert outcome.peak_specificity_layer in [0, 1, 2]

    def test_strong_layers_filter(self):
        """Should only analyze specified strong layers."""
        mock_model = _make_mock_model()
        mock_tok = _make_mock_tokenizer()
        per_layer_dirs = {i: torch.randn(32) for i in range(10)}

        analyzer = RefusalLogitLens(top_k=5)
        outcome = analyzer.analyze_all_layers(
            per_layer_dirs,
            mock_model,
            mock_tok,
            strong_layers=[2, 5],
        )

        assert set(outcome.per_layer.keys()) == {2, 5}

    def test_handles_unnormalized_direction(self):
        """Should handle non-unit directions."""
        mock_model = _make_mock_model()
        mock_tok = _make_mock_tokenizer()
        big_dir = torch.randn(32) * 100.0  # large magnitude

        analyzer = RefusalLogitLens(top_k=5)
        outcome = analyzer.analyze_direction(big_dir, mock_model, mock_tok)

        # Should still produce valid results
        assert len(outcome.top_promoted) == 5

    def test_format_report(self):
        """Format report should produce readable output."""
        mock_model = _make_mock_model()
        mock_tok = _make_mock_tokenizer()
        per_layer_dirs = {0: torch.randn(32), 1: torch.randn(32)}

        analyzer = RefusalLogitLens(top_k=5)
        outcome = analyzer.analyze_all_layers(per_layer_dirs, mock_model, mock_tok)
        rendered = RefusalLogitLens.format_report(outcome)

        assert "Logit Lens" in rendered
        assert "Layer 0:" in rendered

    def test_empty_directions(self):
        """Should handle empty input gracefully."""
        mock_model = _make_mock_model()
        mock_tok = _make_mock_tokenizer()

        analyzer = RefusalLogitLens(top_k=5)
        outcome = analyzer.analyze_all_layers({}, mock_model, mock_tok)

        assert len(outcome.per_layer) == 0

    def test_token_lists_nonempty(self):
        """Refusal and compliance token lists should have entries."""
        assert len(REFUSAL_TOKENS) > 10
        assert len(COMPLIANCE_TOKENS) > 10

    def test_entropy_nonnegative(self):
        """Logit effect entropy should be non-negative."""
        mock_model = _make_mock_model()
        mock_tok = _make_mock_tokenizer()
        refusal_dir = torch.randn(32)

        analyzer = RefusalLogitLens(top_k=5)
        outcome = analyzer.analyze_direction(refusal_dir, mock_model, mock_tok)

        assert outcome.logit_effect_entropy >= 0

    def test_2d_direction_input(self):
        """Should handle 2D direction input (unsqueezed)."""
        mock_model = _make_mock_model()
        mock_tok = _make_mock_tokenizer()
        row_dir = torch.randn(1, 32)

        analyzer = RefusalLogitLens(top_k=5)
        outcome = analyzer.analyze_direction(row_dir, mock_model, mock_tok)

        assert len(outcome.top_promoted) == 5
+60
View File
@@ -0,0 +1,60 @@
"""Tests for evaluation metrics."""
from __future__ import annotations
import torch
from obliteratus.evaluation.metrics import accuracy, f1_score_metric, perplexity
class TestPerplexity:
    """``perplexity`` on near-perfect and on random logits."""

    def test_perfect_prediction(self):
        """Logits that pin the correct next token give perplexity below 2."""
        # Create logits that strongly predict the correct next token
        n_vocab, n_steps, n_batch = 10, 5, 1
        targets = torch.tensor([[0, 1, 2, 3, 4]])
        scores = torch.full((n_batch, n_steps, n_vocab), -100.0)
        # Set high logit for the correct next token
        for step in range(n_steps - 1):
            next_token = targets[0, step + 1]
            scores[0, step, next_token] = 100.0

        ppl = perplexity(scores, targets)

        assert ppl < 2.0, f"Expected near-1 perplexity, got {ppl}"

    def test_random_prediction_higher(self):
        """Seeded random logits over 100 tokens give perplexity above 10."""
        n_vocab, n_steps, n_batch = 100, 20, 2
        torch.manual_seed(42)
        scores = torch.randn(n_batch, n_steps, n_vocab)
        targets = torch.randint(0, n_vocab, (n_batch, n_steps))

        ppl = perplexity(scores, targets)

        assert ppl > 10, f"Random logits should yield high perplexity, got {ppl}"
class TestAccuracy:
    """``accuracy`` on fully matching, disjoint, half-matching and empty inputs."""

    def test_perfect(self):
        """Identical sequences score 1.0."""
        values = [1, 2, 3]
        assert accuracy(values, [1, 2, 3]) == 1.0

    def test_zero(self):
        """Sequences with no matching positions score 0.0."""
        assert accuracy([1, 2, 3], [4, 5, 6]) == 0.0

    def test_partial(self):
        """Two matches out of four positions score 0.5."""
        assert accuracy([1, 2, 3, 4], [1, 2, 0, 0]) == 0.5

    def test_empty(self):
        """Empty inputs score 0.0."""
        assert accuracy([], []) == 0.0
class TestF1:
    """``f1_score_metric`` on fully matching and fully mismatching labels."""

    def test_perfect(self):
        """Identical label sequences score 1.0."""
        labels = [0, 1, 0, 1]
        assert f1_score_metric(labels, [0, 1, 0, 1]) == 1.0

    def test_zero(self):
        """All-zeros against all-ones scores 0.0."""
        result = f1_score_metric([0, 0, 0, 0], [1, 1, 1, 1])
        assert result == 0.0
+85
View File
@@ -0,0 +1,85 @@
"""Smoke tests verifying all new modules are importable from package level."""
from __future__ import annotations
class TestTopLevelImports:
    """Verify obliteratus top-level exports."""

    def test_set_seed(self):
        """``set_seed`` is importable from the package root and callable."""
        from obliteratus import set_seed

        assert callable(set_seed)

    def test_run_sweep(self):
        """``run_sweep`` is importable from the package root and callable."""
        from obliteratus import run_sweep

        assert callable(run_sweep)

    def test_sweep_config(self):
        """``SweepConfig`` can be constructed and keeps its model name."""
        from obliteratus import SweepConfig

        grid = {"n_directions": [1, 2]}
        config = SweepConfig(model_name="test", sweep_params=grid)
        assert config.model_name == "test"

    def test_sweep_result(self):
        """``SweepResult`` can be constructed and keeps its seed."""
        from obliteratus import SweepResult

        outcome = SweepResult(
            params={"n_directions": 1},
            seed=42,
            quality_metrics={},
            stage_durations={},
            strong_layers=[],
        )
        assert outcome.seed == 42
class TestEvaluationImports:
    """Verify evaluation subpackage exports."""

    def test_refusal_rate_with_ci(self):
        """A single compliant response gives rate 0.0 over one sample."""
        from obliteratus.evaluation import refusal_rate_with_ci

        responses = ["Sure, here you go."]
        stats = refusal_rate_with_ci(responses, mode="combined")
        assert stats["rate"] == 0.0
        assert stats["n_samples"] == 1

    def test_random_direction_ablation(self):
        """``random_direction_ablation`` is exported and callable."""
        from obliteratus.evaluation import random_direction_ablation

        assert callable(random_direction_ablation)

    def test_direction_specificity_test(self):
        """``direction_specificity_test`` is exported and callable."""
        from obliteratus.evaluation import direction_specificity_test

        assert callable(direction_specificity_test)

    def test_run_benchmarks(self):
        """``run_benchmarks`` is exported and callable."""
        from obliteratus.evaluation import run_benchmarks

        assert callable(run_benchmarks)

    def test_compare_models(self):
        """``compare_models`` is exported and callable."""
        from obliteratus.evaluation import compare_models

        assert callable(compare_models)
class TestDirectImports:
    """Verify direct module imports still work."""

    def test_reproducibility(self):
        """Re-seeding with the same value reproduces the same random tensor."""
        from obliteratus.reproducibility import set_seed
        import torch

        set_seed(999, deterministic=False)
        first_draw = torch.randn(10)
        set_seed(999, deterministic=False)
        second_draw = torch.randn(10)

        assert torch.equal(first_draw, second_draw)

    def test_baselines(self):
        """``BaselineResult`` is importable from the baselines module."""
        from obliteratus.evaluation.baselines import BaselineResult

        assert BaselineResult is not None

    def test_lm_eval_integration(self):
        """``run_benchmarks`` is importable from the lm-eval integration module."""
        from obliteratus.evaluation.lm_eval_integration import run_benchmarks

        assert callable(run_benchmarks)
+672
View File
@@ -0,0 +1,672 @@
"""Tests for the five new analysis modules:
1. Tuned Lens (learned-affine logit lens variant)
2. Activation Patching (real interchange intervention)
3. Enhanced SAE Decomposition Pipeline
4. Wasserstein-Optimal Direction Extraction
5. Bayesian-Optimized Kernel Projection
"""
from __future__ import annotations
import pytest
import torch
import torch.nn as nn
from obliteratus.analysis.tuned_lens import (
TunedLensTrainer,
TunedLensProbe,
RefusalTunedLens,
TunedLensResult,
MultiLayerTunedLensResult,
)
from obliteratus.analysis.activation_patching import (
ActivationPatcher,
PatchingSite,
ActivationPatchingResult,
)
from obliteratus.analysis.sae_abliteration import (
SAEDecompositionPipeline,
SAEDecompositionResult,
FeatureClusterResult,
)
from obliteratus.analysis.wasserstein_optimal import (
WassersteinOptimalExtractor,
WassersteinDirectionResult,
WassersteinComparisonResult,
MultiLayerWassersteinResult,
)
from obliteratus.analysis.bayesian_kernel_projection import (
BayesianKernelProjection,
BayesianOptimizationResult,
ProjectionConfig,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_activations(
hidden_dim=32, n_per_class=20, separation=2.0, seed=42,
):
"""Create harmful/harmless activations with planted refusal signal."""
torch.manual_seed(seed)
direction = torch.randn(hidden_dim)
direction = direction / direction.norm()
harmful = [
torch.randn(hidden_dim) * 0.3 + separation * direction
for _ in range(n_per_class)
]
harmless = [
torch.randn(hidden_dim) * 0.3
for _ in range(n_per_class)
]
return harmful, harmless, direction
def _make_multilayer_activations(
n_layers=6, hidden_dim=32, n_per_class=20, separation=2.0, seed=42,
):
"""Create per-layer activations with planted refusal signals."""
torch.manual_seed(seed)
harmful_acts = {}
harmless_acts = {}
directions = {}
for li in range(n_layers):
d = torch.randn(hidden_dim)
d = d / d.norm()
directions[li] = d
strength = separation if 1 <= li <= n_layers - 2 else 0.3
harmful_acts[li] = [
torch.randn(hidden_dim) * 0.3 + strength * d
for _ in range(n_per_class)
]
harmless_acts[li] = [
torch.randn(hidden_dim) * 0.3
for _ in range(n_per_class)
]
return harmful_acts, harmless_acts, directions
class FakeTokenizer:
    """Fake tokenizer that maps strings to reproducible token IDs.

    IDs are derived from a CRC-32 of the UTF-8 text instead of the builtin
    ``hash()``: string hashing is salted per interpreter process, so
    ``hash(text)`` would give different IDs on every test run and the
    mapping would not actually be reproducible.
    """

    def __init__(self, vocab_size=100):
        """Store the vocabulary size that bounds the generated token IDs."""
        self.vocab_size = vocab_size

    def encode(self, text, add_special_tokens=False):
        """Return a one-element list with a stable ID in ``[0, vocab_size)``.

        ``add_special_tokens`` is accepted for call compatibility and ignored.
        """
        import zlib  # local import: only this fake needs it

        return [zlib.crc32(text.encode("utf-8")) % self.vocab_size]

    def decode(self, ids):
        """Return ``"tok_<first id>"`` for an indexable sequence of IDs."""
        return f"tok_{ids[0]}"
class FakeModel(nn.Module):
    """Fake model with lm_head and transformer.ln_f for testing."""

    def __init__(self, hidden_dim=32, vocab_size=100, n_layers=4):
        """Build a bias-free LM head plus a ``transformer`` holder module.

        The holder carries a final LayerNorm (``ln_f``) and ``n_layers``
        square Linear blocks (``h``).
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.n_layers = n_layers

        # Creation order is kept (head, norm, blocks) so seeded weight
        # initialisation stays the same.
        self.lm_head = nn.Linear(hidden_dim, vocab_size, bias=False)
        self.transformer = nn.Module()
        self.transformer.ln_f = nn.LayerNorm(hidden_dim)
        blocks = [nn.Linear(hidden_dim, hidden_dim) for _ in range(n_layers)]
        self.transformer.h = nn.ModuleList(blocks)

    def forward(self, input_ids):
        """Run a fake forward pass and return an object with ``.logits``.

        Only the 2-D shape of ``input_ids`` is used; hidden states start as
        random noise and pass through each block with a residual add.
        """
        # Fake forward pass
        n_rows, n_tokens = input_ids.shape
        hidden = torch.randn(n_rows, n_tokens, self.hidden_dim)
        for block in self.transformer.h:
            hidden = block(hidden) + hidden
        normed = self.transformer.ln_f(hidden)
        logits = self.lm_head(normed)
        return type('Output', (), {'logits': logits})()
# ===========================================================================
# Tests: Tuned Lens
# ===========================================================================
class TestTunedLensTrainer:
    """Checks the fields and shapes of probes produced by ``TunedLensTrainer``."""

    def test_train_single_probe(self):
        """A probe trained on near-identical activations reports the expected metadata."""
        dim, n_rows = 16, 30
        inputs = torch.randn(n_rows, dim)
        # Targets are the inputs plus small Gaussian noise.
        targets = inputs + torch.randn(n_rows, dim) * 0.1

        fitter = TunedLensTrainer(dim, n_epochs=20)
        fitted = fitter.train_probe(inputs, targets, layer_idx=3)

        assert isinstance(fitted, TunedLensProbe)
        assert fitted.layer_idx == 3
        assert fitted.weight.shape == (dim, dim)
        assert fitted.bias.shape == (dim,)
        assert fitted.train_loss < 1.0  # should converge somewhat

    def test_train_all_layers(self):
        """Training over a dict of four layers yields one probe per layer key."""
        dim, n_rows, n_layers = 16, 20, 4
        per_layer = {
            layer: torch.randn(n_rows, dim) for layer in range(n_layers)
        }
        targets = torch.randn(n_rows, dim)

        fitter = TunedLensTrainer(dim, n_epochs=10)
        fitted = fitter.train_all_layers(per_layer, targets)

        assert len(fitted) == n_layers
        for layer in range(n_layers):
            assert layer in fitted
            assert fitted[layer].weight.shape == (dim, dim)

    def test_probe_near_identity_for_final_layer(self):
        """Probe for the final layer should be close to identity."""
        dim, n_rows = 16, 50
        samples = torch.randn(n_rows, dim)

        fitter = TunedLensTrainer(dim, n_epochs=50)
        # Source and target are the same tensor.
        fitted = fitter.train_probe(samples, samples, layer_idx=0)

        # Weight should be close to identity
        gap = (fitted.weight - torch.eye(dim)).norm().item()
        assert gap < 1.0
class TestRefusalTunedLens:
    """Checks ``RefusalTunedLens`` analysis, comparison and report output."""

    def test_analyze_direction(self):
        """Analysing one direction with a near-identity probe returns a well-formed result."""
        dim, vocab = 32, 100
        fake_model = FakeModel(dim, vocab)
        fake_tok = FakeTokenizer(vocab)
        refusal_vec = torch.randn(dim)
        # Identity weight with a small random perturbation.
        near_identity = torch.eye(dim) + torch.randn(dim, dim) * 0.01
        probe = TunedLensProbe(
            layer_idx=2,
            weight=near_identity,
            bias=torch.zeros(dim),
            train_loss=0.01,
        )

        analysis = RefusalTunedLens(top_k=10).analyze_direction(
            refusal_vec, probe, fake_model, fake_tok,
        )

        assert isinstance(analysis, TunedLensResult)
        assert analysis.layer_idx == 2
        assert len(analysis.top_promoted) <= 10
        assert len(analysis.top_suppressed) <= 10
        assert isinstance(analysis.correction_magnitude, float)
        assert analysis.correction_magnitude >= 0

    def test_analyze_all_layers(self):
        """Analysing four layers returns four per-layer entries and a valid strongest layer."""
        dim, vocab, n_layers = 32, 100, 4
        fake_model = FakeModel(dim, vocab)
        fake_tok = FakeTokenizer(vocab)
        per_layer_dirs = {
            layer: torch.randn(dim) for layer in range(n_layers)
        }
        per_layer_probes = {}
        for layer in range(n_layers):
            per_layer_probes[layer] = TunedLensProbe(
                layer_idx=layer,
                weight=torch.eye(dim),
                bias=torch.zeros(dim),
                train_loss=0.01,
            )

        analysis = RefusalTunedLens(top_k=5).analyze_all_layers(
            per_layer_dirs, per_layer_probes, fake_model, fake_tok,
        )

        assert isinstance(analysis, MultiLayerTunedLensResult)
        assert len(analysis.per_layer) == n_layers
        assert analysis.strongest_refusal_layer in range(n_layers)

    def test_compare_with_logit_lens(self):
        """Tuned-lens gaps that are a constant multiple of the logit-lens gaps agree at ~1.0."""
        logit_gaps = {0: 0.1, 1: 0.5, 2: 0.3, 3: 0.8}
        per_layer = {}
        for layer, gap in logit_gaps.items():
            per_layer[layer] = TunedLensResult(
                layer_idx=layer,
                top_promoted=[],
                top_suppressed=[],
                refusal_token_mean_boost=0.0,
                compliance_token_mean_boost=0.0,
                refusal_compliance_gap=gap * 1.1,  # similar ranking
                correction_magnitude=0.1,
            )
        tuned = MultiLayerTunedLensResult(
            per_layer=per_layer,
            probes={},
            strongest_refusal_layer=3,
            peak_gap_layer=3,
            mean_refusal_compliance_gap=0.5,
            logit_lens_agreement=0.0,
        )

        score = RefusalTunedLens.compare_with_logit_lens(tuned, logit_gaps)

        # Same ranking → correlation should be 1.0
        assert score == pytest.approx(1.0, abs=0.01)

    def test_format_report(self):
        """The report for an empty result still carries the title and the empty-state message."""
        empty = MultiLayerTunedLensResult(
            per_layer={},
            probes={},
            strongest_refusal_layer=0,
            peak_gap_layer=0,
            mean_refusal_compliance_gap=0.0,
            logit_lens_agreement=0.0,
        )

        text = RefusalTunedLens.format_report(empty)

        assert "Tuned Lens" in text
        assert "No layers analyzed" in text
# ===========================================================================
# Tests: Activation Patching
# ===========================================================================
class TestActivationPatcher:
    """Checks ``PatchingSite`` construction and ``ActivationPatcher`` sweeps on a fake model."""

    @staticmethod
    def _fake_model_and_ids():
        """Return a 4-layer ``FakeModel`` plus random clean/corrupted ID tensors of shape (1, 10)."""
        fake_model = FakeModel(32, vocab_size=100, n_layers=4)
        clean = torch.randint(0, 100, (1, 10))
        corrupted = torch.randint(0, 100, (1, 10))
        return fake_model, clean, corrupted

    def test_patching_site_creation(self):
        """A site built without ``head_idx`` stores its fields and leaves the head unset."""
        spot = PatchingSite(layer_idx=3, component="residual")
        assert spot.layer_idx == 3
        assert spot.component == "residual"
        assert spot.head_idx is None

    def test_patching_site_with_head(self):
        """An explicit ``head_idx`` is stored on the site."""
        spot = PatchingSite(layer_idx=2, component="attn_head", head_idx=5)
        assert spot.head_idx == 5

    def test_patch_sweep_with_model(self):
        """Test full patching sweep on fake model."""
        fake_model, clean, corrupted = self._fake_model_and_ids()

        sweep = ActivationPatcher(significance_threshold=0.05).patch_sweep(
            fake_model, clean, corrupted,
            mode="noising",
        )

        assert isinstance(sweep, ActivationPatchingResult)
        assert sweep.patching_mode == "noising"
        assert sweep.n_layers == 4
        assert len(sweep.effects) > 0
        assert isinstance(sweep.circuit_fraction, float)
        assert 0.0 <= sweep.circuit_fraction <= 1.0

    def test_patch_sweep_denoising(self):
        """A sweep requested in denoising mode reports that mode in its result."""
        fake_model, clean, corrupted = self._fake_model_and_ids()

        sweep = ActivationPatcher().patch_sweep(
            fake_model, clean, corrupted,
            mode="denoising",
        )

        assert sweep.patching_mode == "denoising"

    def test_custom_metric(self):
        """A user-supplied metric function is accepted and yields a float clean baseline."""
        fake_model, clean, corrupted = self._fake_model_and_ids()

        def custom_metric(logits):
            return logits.sum().item()

        sweep = ActivationPatcher(metric_fn=custom_metric).patch_sweep(
            fake_model, clean, corrupted,
        )

        assert isinstance(sweep, ActivationPatchingResult)
        assert isinstance(sweep.clean_baseline, float)

    def test_format_report(self):
        """The report for a hand-built result mentions the title and the patching mode."""
        handmade = ActivationPatchingResult(
            n_layers=4,
            n_sites=4,
            patching_mode="noising",
            effects=[],
            clean_baseline=1.0,
            corrupted_baseline=0.0,
            total_effect=1.0,
            significant_sites=[],
            circuit_fraction=0.0,
            top_causal_layers=[],
        )

        text = ActivationPatcher.format_report(handmade)

        assert "Activation Patching" in text
        assert "noising" in text
# ===========================================================================
# Tests: Enhanced SAE Decomposition Pipeline
# ===========================================================================
class TestSAEDecompositionPipeline:
    """Checks the shape and bounds of ``SAEDecompositionPipeline`` results on synthetic data."""

    def test_basic_pipeline(self):
        """A run with ``top_k_features=8`` returns eight entries in every per-feature field."""
        bad, good, _ = _make_activations(hidden_dim=16, n_per_class=30, separation=2.0)
        runner = SAEDecompositionPipeline(
            expansion=2, n_epochs=10, top_k_features=8, n_clusters=3,
        )

        outcome = runner.run(bad, good, layer_idx=0)

        assert isinstance(outcome, SAEDecompositionResult)
        assert outcome.layer_idx == 0
        assert outcome.sae is not None
        assert outcome.refusal_features.n_refusal_features == 8
        assert len(outcome.feature_sparsity) == 8
        assert len(outcome.feature_monosemanticity) == 8
        assert len(outcome.per_feature_refusal_reduction) == 8
        assert len(outcome.cumulative_refusal_reduction) == 8
        assert 0.0 <= outcome.raw_direction_overlap <= 1.0

    def test_feature_clustering(self):
        """Clustering eight features into three clusters yields valid labels and a bounded silhouette."""
        bad, good, _ = _make_activations(hidden_dim=16, n_per_class=30)
        runner = SAEDecompositionPipeline(
            expansion=2, n_epochs=10, top_k_features=8, n_clusters=3,
        )

        grouping = runner.run(bad, good).feature_clusters

        assert grouping is not None
        assert isinstance(grouping, FeatureClusterResult)
        assert grouping.n_clusters == 3
        assert len(grouping.cluster_labels) == 8
        assert all(0 <= lbl < 3 for lbl in grouping.cluster_labels)
        assert grouping.cluster_directions.shape[0] == 3
        assert -1.0 <= grouping.silhouette_score <= 1.0

    def test_cumulative_reduction_monotonic(self):
        """The cumulative refusal-reduction curve never drops by more than 1e-6 between steps."""
        bad, good, _ = _make_activations(hidden_dim=16, n_per_class=30, separation=3.0)
        runner = SAEDecompositionPipeline(expansion=2, n_epochs=10, top_k_features=6)

        outcome = runner.run(bad, good)

        # Cumulative reduction should be non-decreasing
        for step in range(1, len(outcome.cumulative_refusal_reduction)):
            current = outcome.cumulative_refusal_reduction[step]
            previous = outcome.cumulative_refusal_reduction[step - 1]
            assert current >= previous - 1e-6

    def test_format_report(self):
        """The formatted report contains the title and the variance-explained line."""
        bad, good, _ = _make_activations(hidden_dim=16, n_per_class=20)
        runner = SAEDecompositionPipeline(expansion=2, n_epochs=5, top_k_features=4, n_clusters=2)
        outcome = runner.run(bad, good)

        text = SAEDecompositionPipeline.format_report(outcome)

        assert "SAE Feature Decomposition" in text
        assert "Variance explained" in text
# ===========================================================================
# Tests: Wasserstein-Optimal Direction Extraction
# ===========================================================================
class TestWassersteinOptimalExtractor:
    """Checks ``WassersteinOptimalExtractor`` outputs on synthetic planted-direction data."""

    @staticmethod
    def _unit_mean_difference(harmful, harmless):
        """Return the normalised difference between the harmful and harmless class means."""
        harmful_mat = torch.stack(harmful).float()
        harmless_mat = torch.stack(harmless).float()
        gap = (harmful_mat.mean(0) - harmless_mat.mean(0))
        return gap / gap.norm()

    def test_basic_extraction(self):
        """Extraction returns a unit direction of the right size and non-negative cost fields."""
        bad, good, _planted = _make_activations(
            hidden_dim=32, n_per_class=30, separation=3.0,
        )

        found = WassersteinOptimalExtractor().extract(bad, good, layer_idx=0)

        assert isinstance(found, WassersteinDirectionResult)
        assert found.layer_idx == 0
        assert found.direction.shape == (32,)
        assert abs(found.direction.norm().item() - 1.0) < 1e-5
        assert found.wasserstein_cost >= 0
        assert found.mean_shift_component >= 0
        assert found.bures_component >= 0
        assert found.cost_effectiveness_ratio >= 0

    def test_direction_captures_signal(self):
        """Wasserstein direction should have non-trivial refusal projection."""
        bad, good, planted = _make_activations(
            hidden_dim=32, n_per_class=30, separation=3.0,
        )

        found = WassersteinOptimalExtractor().extract(bad, good)

        # Direction should have some alignment with planted signal
        alignment = abs((found.direction @ planted).item())
        assert alignment > 0.1  # not totally orthogonal

    def test_extract_all_layers(self):
        """Multi-layer extraction returns one entry per layer and a valid best layer."""
        bad_by_layer, good_by_layer, _ = _make_multilayer_activations(
            n_layers=4, hidden_dim=16, n_per_class=20,
        )

        found = WassersteinOptimalExtractor().extract_all_layers(
            bad_by_layer, good_by_layer,
        )

        assert isinstance(found, MultiLayerWassersteinResult)
        assert len(found.per_layer) == 4
        assert found.best_layer in range(4)
        assert found.mean_cost_ratio >= 0

    def test_compare_with_alternatives(self):
        """Comparison against two supplied directions fills both cost ratios and bounded cosines."""
        bad, good, planted = _make_activations(
            hidden_dim=16, n_per_class=30, separation=3.0,
        )
        tool = WassersteinOptimalExtractor()
        found = tool.extract(bad, good)

        # Use planted direction as "Fisher" and diff-in-means
        mean_diff = self._unit_mean_difference(bad, good)
        verdict = tool.compare_with_alternatives(
            found, bad, good,
            fisher_direction=planted,
            dim_direction=mean_diff,
        )

        assert isinstance(verdict, WassersteinComparisonResult)
        assert verdict.wasserstein_cost_ratio >= 0
        assert verdict.fisher_cost_ratio is not None
        assert verdict.dim_cost_ratio is not None
        assert 0 <= verdict.cosine_wasserstein_fisher <= 1
        assert 0 <= verdict.cosine_wasserstein_dim <= 1

    def test_wasserstein_lower_cost_than_dim(self):
        """Wasserstein-optimal should have lower cost ratio than diff-in-means."""
        bad, good, _ = _make_activations(
            hidden_dim=32, n_per_class=50, separation=2.0,
        )
        tool = WassersteinOptimalExtractor()
        found = tool.extract(bad, good)

        mean_diff = self._unit_mean_difference(bad, good)
        verdict = tool.compare_with_alternatives(
            found, bad, good, dim_direction=mean_diff,
        )

        # Wasserstein should have lower or equal cost ratio by construction
        assert verdict.wasserstein_cost_ratio <= verdict.dim_cost_ratio + 1e-4

    def test_format_report(self):
        """The multi-layer report mentions the method name and the cost ratio."""
        bad, good, _ = _make_activations(hidden_dim=16, n_per_class=20)
        # The same activations are reused for both layer keys.
        found = WassersteinOptimalExtractor().extract_all_layers(
            {0: bad, 1: bad},
            {0: good, 1: good},
        )

        text = WassersteinOptimalExtractor.format_report(found)

        assert "Wasserstein" in text
        assert "cost ratio" in text.lower()
# ===========================================================================
# Tests: Bayesian-Optimized Kernel Projection
# ===========================================================================
class TestBayesianKernelProjection:
    """Checks ``BayesianKernelProjection.optimize`` results on synthetic multi-layer data."""

    def test_basic_optimization(self):
        """A 30-trial run records 30 trials and returns scores within the asserted bounds."""
        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
            n_layers=6, hidden_dim=16, n_per_class=20,
        )
        optimizer = BayesianKernelProjection(
            n_trials=30, refusal_weight=0.6, distortion_weight=0.4,
        )
        result = optimizer.optimize(harmful_acts, harmless_acts, directions)
        assert isinstance(result, BayesianOptimizationResult)
        assert result.n_trials == 30
        assert result.best_score >= 0
        assert 0 <= result.best_refusal_reduction <= 1.0
        assert result.best_harmless_distortion >= 0
        assert len(result.all_trials) == 30

    def test_best_config_structure(self):
        """The best config is a ``ProjectionConfig`` with an ordered layer range and bounded fields."""
        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
            n_layers=4, hidden_dim=16, n_per_class=15,
        )
        optimizer = BayesianKernelProjection(n_trials=20)
        result = optimizer.optimize(harmful_acts, harmless_acts, directions)
        config = result.best_config
        assert isinstance(config, ProjectionConfig)
        assert config.layer_range[0] <= config.layer_range[1]
        assert config.n_directions >= 1
        assert 0 <= config.regularization <= 0.5

    def test_pareto_front(self):
        """The Pareto list is non-empty and its distortion values are non-increasing."""
        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
            n_layers=6, hidden_dim=16, n_per_class=20,
        )
        optimizer = BayesianKernelProjection(n_trials=50)
        result = optimizer.optimize(harmful_acts, harmless_acts, directions)
        # Pareto front should have at least 1 entry
        assert len(result.pareto_configs) >= 1
        # Consecutive entries must not increase in harmless distortion.
        for i in range(len(result.pareto_configs) - 1):
            # Each entry's distortion must be >= the next entry's (within 1e-8).
            # NOTE(review): presumably the list is sorted by decreasing refusal
            # reduction, so distortion falls along it — confirm in the optimizer.
            assert (
                result.pareto_configs[i].harmless_distortion
                >= result.pareto_configs[i + 1].harmless_distortion - 1e-8
            )

    def test_layer_importance(self):
        """Layer importance has one entry per layer, each within [0, 1]."""
        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
            n_layers=6, hidden_dim=16, n_per_class=20,
        )
        optimizer = BayesianKernelProjection(n_trials=50)
        result = optimizer.optimize(harmful_acts, harmless_acts, directions)
        assert len(result.layer_importance) == 6
        for _layer, imp in result.layer_importance.items():
            assert 0 <= imp <= 1.0

    def test_tpe_improves_over_random(self):
        """TPE phase should produce better configs than random exploration."""
        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
            n_layers=6, hidden_dim=16, n_per_class=20,
        )
        optimizer = BayesianKernelProjection(n_trials=60, seed=42)
        result = optimizer.optimize(harmful_acts, harmless_acts, directions)
        # Compare the minimum combined_score of the first 20 trials with the
        # minimum of the last 20. NOTE(review): presumably the first 20 are the
        # random phase and the last 20 the TPE phase — confirm in the optimizer.
        first_20 = sorted(result.all_trials[:20], key=lambda t: t.combined_score)
        last_20 = sorted(result.all_trials[-20:], key=lambda t: t.combined_score)
        best_random = first_20[0].combined_score
        best_tpe = min(t.combined_score for t in last_20)
        # TPE should find at least as good (lower = better)
        # This is probabilistic so we allow 0.3 of slack
        assert best_tpe <= best_random + 0.3

    def test_empty_input(self):
        """Optimising over empty dicts reports zero trials and a zero best score."""
        optimizer = BayesianKernelProjection(n_trials=10)
        result = optimizer.optimize({}, {}, {})
        assert result.n_trials == 0
        assert result.best_score == 0.0

    def test_format_report(self):
        """The formatted report mentions the method, the Pareto front and layer importance."""
        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
            n_layers=4, hidden_dim=16, n_per_class=15,
        )
        optimizer = BayesianKernelProjection(n_trials=20)
        result = optimizer.optimize(harmful_acts, harmless_acts, directions)
        report = BayesianKernelProjection.format_report(result)
        assert "Bayesian" in report
        assert "Pareto" in report
        assert "Layer importance" in report
# ===========================================================================
# Tests: Module imports
# ===========================================================================
class TestModuleImports:
    """Checks that six analysis classes are importable and listed in the package ``__all__``."""

    def test_all_new_modules_importable(self):
        """Each class can be imported from ``obliteratus.analysis`` and is not ``None``."""
        from obliteratus.analysis import (
            TunedLensTrainer,
            RefusalTunedLens,
            ActivationPatcher,
            WassersteinOptimalExtractor,
            BayesianKernelProjection,
            SAEDecompositionPipeline,
        )

        imported = (
            TunedLensTrainer,
            RefusalTunedLens,
            ActivationPatcher,
            WassersteinOptimalExtractor,
            BayesianKernelProjection,
            SAEDecompositionPipeline,
        )
        for obj in imported:
            assert obj is not None

    def test_new_modules_in_all(self):
        """Each class name appears in ``obliteratus.analysis.__all__``."""
        import obliteratus.analysis as analysis

        expected_names = (
            "TunedLensTrainer",
            "RefusalTunedLens",
            "ActivationPatcher",
            "WassersteinOptimalExtractor",
            "BayesianKernelProjection",
            "SAEDecompositionPipeline",
        )
        for public_name in expected_names:
            assert public_name in analysis.__all__
+669
View File
@@ -0,0 +1,669 @@
"""Tests for analysis techniques: concept cones, alignment imprints,
multi-token position, and sparse direction surgery."""
from __future__ import annotations
import torch
from obliteratus.analysis.concept_geometry import (
ConceptConeAnalyzer,
ConeConeResult,
MultiLayerConeResult,
CategoryDirection,
DEFAULT_HARM_CATEGORIES,
)
from obliteratus.analysis.alignment_imprint import (
AlignmentImprintDetector,
AlignmentImprint,
BaseInstructDelta,
)
from obliteratus.analysis.multi_token_position import (
MultiTokenPositionAnalyzer,
PositionAnalysisResult,
MultiTokenSummary,
)
from obliteratus.analysis.sparse_surgery import (
SparseDirectionSurgeon,
SparseProjectionResult,
SparseSurgeryPlan,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_category_activations(
    hidden_dim=32, n_prompts=30, n_categories=5, category_spread=0.3,
):
    """Build synthetic activations whose refusal direction varies by category.

    Torch's global RNG is seeded with 42. One shared unit vector is drawn,
    then each category ``cat_<i>`` gets the normalised sum of that shared
    vector and ``category_spread`` times its own random unit vector. Prompts
    are assigned to categories in contiguous runs of
    ``n_prompts // n_categories``; any remainder is dropped.

    Returns:
        ``(harmful_acts, harmless_acts, category_map, cat_dirs)``. The two
        lists hold one 1-D tensor per prompt; each harmful tensor is the
        matching harmless tensor plus twice its category direction.
        ``category_map`` maps prompt index to category name and ``cat_dirs``
        maps category name to its unit direction.
    """
    torch.manual_seed(42)

    # Component common to every category.
    common = torch.randn(hidden_dim)
    common = common / common.norm()

    # One direction per category: the common part plus a scaled private part.
    names = [f"cat_{i}" for i in range(n_categories)]
    cat_dirs = {}
    for name in names:
        private = torch.randn(hidden_dim)
        private = private / private.norm()
        mixed = common + category_spread * private
        cat_dirs[name] = mixed / mixed.norm()

    # Contiguous runs of equal length, one run per category.
    run_length = n_prompts // n_categories
    category_map = {
        idx: names[idx // run_length]
        for idx in range(run_length * n_categories)
    }

    harmful_acts, harmless_acts = [], []
    for name in category_map.values():
        noise = torch.randn(hidden_dim) * 0.1
        harmful_acts.append(noise + 2.0 * cat_dirs[name])
        harmless_acts.append(noise)

    return harmful_acts, harmless_acts, category_map, cat_dirs
def _make_refusal_directions(n_layers=8, hidden_dim=32, concentration="distributed"):
    """Build per-layer unit directions and scalar strengths for a named pattern.

    Torch's global RNG is seeded with 123. ``concentration`` selects how the
    strengths are assigned:

    * ``"concentrated"``: 3.0 for the last two layers, 0.1 elsewhere.
    * ``"distributed"``: 1.0 plus 0.2 times a fresh standard-normal draw.
    * ``"orthogonal"``: 1.5 everywhere; each direction after the first has
      its component along the previous layer's direction removed.
    * anything else: 2.0 for layers 2 to 4, 0.5 elsewhere.

    Returns:
        ``(directions, strengths)``: two dicts keyed by layer index.
    """
    torch.manual_seed(123)
    directions, strengths = {}, {}

    for layer in range(n_layers):
        raw = torch.randn(hidden_dim)
        unit = raw / raw.norm()
        if concentration == "orthogonal" and layer > 0:
            # Project out the previous layer's direction, then renormalise.
            earlier = directions[layer - 1]
            residual = raw - (raw @ earlier) * earlier
            unit = residual / residual.norm().clamp(min=1e-8)
        directions[layer] = unit

        if concentration == "orthogonal":
            # Each layer direction is more orthogonal (CAI-like)
            level = 1.5
        elif concentration == "distributed":
            # Even across layers (RLHF-like)
            level = 1.0 + 0.2 * torch.randn(1).item()
        elif concentration == "concentrated":
            # Strong in last few layers only (SFT-like)
            level = 3.0 if layer >= n_layers - 2 else 0.1
        else:
            level = 2.0 if 2 <= layer <= 4 else 0.5
        strengths[layer] = level

    return directions, strengths
# ===========================================================================
# Tests: Concept Cone Geometry
# ===========================================================================
class TestConceptConeAnalyzer:
    """Checks ``ConceptConeAnalyzer`` results on synthetic per-category activations."""

    def test_basic_analysis(self):
        """A default run returns a result with the requested layer index and bounded fields."""
        bad, good, mapping, _ = _make_category_activations()
        cone = ConceptConeAnalyzer(category_map=mapping).analyze_layer(
            bad, good, layer_idx=5,
        )

        assert isinstance(cone, ConeConeResult)
        assert cone.layer_idx == 5
        assert cone.category_count >= 2
        assert cone.cone_dimensionality > 0
        assert cone.cone_solid_angle >= 0
        assert 0 <= cone.mean_pairwise_cosine <= 1.0

    def test_polyhedral_detection(self):
        """With spread-out categories, should detect polyhedral geometry."""
        bad, good, mapping, _ = _make_category_activations(
            category_spread=2.0,  # Large spread -> distinct directions
        )
        cone = ConceptConeAnalyzer(category_map=mapping).analyze_layer(bad, good)

        # With high spread, directions should be more distinct
        assert cone.cone_dimensionality > 1.0

    def test_linear_detection(self):
        """With no spread, should detect linear (single direction) geometry."""
        bad, good, mapping, _ = _make_category_activations(
            category_spread=0.0,  # No spread -> all directions aligned
        )
        cone = ConceptConeAnalyzer(category_map=mapping).analyze_layer(bad, good)

        assert cone.mean_pairwise_cosine > 0.8

    def test_category_directions_populated(self):
        """Every reported category direction has positive strength and bounded specificity."""
        bad, good, mapping, _ = _make_category_activations()
        cone = ConceptConeAnalyzer(category_map=mapping).analyze_layer(bad, good)

        for entry in cone.category_directions:
            assert isinstance(entry, CategoryDirection)
            assert entry.strength > 0
            assert entry.n_prompts >= 2
            assert 0 <= entry.specificity <= 1.0

    def test_pairwise_cosines(self):
        """Pairwise cosines lie in [0, 1] and are keyed by ordered pairs."""
        bad, good, mapping, _ = _make_category_activations()
        cone = ConceptConeAnalyzer(category_map=mapping).analyze_layer(bad, good)

        for (left, right), value in cone.pairwise_cosines.items():
            assert 0 <= value <= 1.0
            assert left < right  # Sorted pair

    def test_general_direction_unit(self):
        """The general direction has norm within 0.01 of one."""
        bad, good, mapping, _ = _make_category_activations()
        cone = ConceptConeAnalyzer(category_map=mapping).analyze_layer(bad, good)

        assert abs(cone.general_direction.norm().item() - 1.0) < 0.01

    def test_multi_layer_analysis(self):
        """Analysing four layers (same data reused per layer) gives four per-layer results."""
        bad, good, mapping, _ = _make_category_activations()
        bad_by_layer = {layer: bad for layer in range(4)}
        good_by_layer = {layer: good for layer in range(4)}

        summary = ConceptConeAnalyzer(category_map=mapping).analyze_all_layers(
            bad_by_layer, good_by_layer,
        )

        assert isinstance(summary, MultiLayerConeResult)
        assert len(summary.per_layer) == 4
        assert summary.mean_cone_dimensionality > 0

    def test_format_report(self):
        """The report names the analysis, the layer and the dimensionality field."""
        bad, good, mapping, _ = _make_category_activations()
        cone = ConceptConeAnalyzer(category_map=mapping).analyze_layer(
            bad, good, layer_idx=3,
        )

        text = ConceptConeAnalyzer.format_report(cone)

        assert "Concept Cone" in text
        assert "Layer 3" in text
        assert "dimensionality" in text

    def test_default_category_map(self):
        """The default category map has 30 entries and includes two known categories."""
        assert len(DEFAULT_HARM_CATEGORIES) == 30
        known = set(DEFAULT_HARM_CATEGORIES.values())
        assert "weapons" in known
        assert "cyber" in known

    def test_empty_activations(self):
        """Empty activation lists produce a result with zero categories."""
        cone = ConceptConeAnalyzer().analyze_layer([], [], layer_idx=0)
        assert cone.category_count == 0

    def test_min_category_size(self):
        """Categories with too few prompts should be excluded."""
        bad, good, mapping, _ = _make_category_activations(
            n_prompts=10, n_categories=5,
        )
        picky = ConceptConeAnalyzer(category_map=mapping, min_category_size=3)

        cone = picky.analyze_layer(bad, good)

        # Each category has only 2 prompts, so with min_size=3 all are excluded
        assert cone.category_count == 0
# ===========================================================================
# Tests: Alignment Imprint Detector
# ===========================================================================
class TestAlignmentImprintDetector:
    """Checks ``AlignmentImprintDetector`` predictions on synthetic direction patterns."""

    @staticmethod
    def _imprint_for(**pattern_kwargs):
        """Generate directions/strengths with the given options and return the detected imprint."""
        dirs, levels = _make_refusal_directions(**pattern_kwargs)
        return AlignmentImprintDetector().detect_imprint(dirs, levels)

    def test_basic_detection(self):
        """Detection returns an imprint with a known method label and bounded confidence."""
        found = self._imprint_for()
        assert isinstance(found, AlignmentImprint)
        assert found.predicted_method in ("dpo", "rlhf", "cai", "sft")
        assert 0 <= found.confidence <= 1.0

    def test_probabilities_sum_to_one(self):
        """The four method probabilities sum to one within 0.01."""
        found = self._imprint_for()
        mass = (found.dpo_probability + found.rlhf_probability +
                found.cai_probability + found.sft_probability)
        assert abs(mass - 1.0) < 0.01

    def test_concentrated_detects_sft_or_dpo(self):
        """Concentrated refusal (tail-biased) should predict SFT or DPO."""
        found = self._imprint_for(concentration="concentrated")
        # SFT and DPO both have concentrated signatures
        assert found.predicted_method in ("sft", "dpo")

    def test_distributed_detects_not_sft(self):
        """Distributed refusal should not be predicted as SFT."""
        found = self._imprint_for(n_layers=16, concentration="distributed")
        # With distributed refusal, Gini is low -> SFT is unlikely to be top prediction
        assert found.predicted_method != "sft"

    def test_orthogonal_detects_cai(self):
        """Orthogonal layer directions should lean toward CAI."""
        found = self._imprint_for(n_layers=12, concentration="orthogonal")
        # CAI should rank highly due to orthogonality
        assert found.cai_probability > 0.15

    def test_feature_extraction(self):
        """Each extracted feature lies within the range asserted for it."""
        found = self._imprint_for()
        assert 0 <= found.gini_coefficient <= 1.0
        assert found.effective_rank > 0
        assert 0 <= found.cross_layer_smoothness <= 1.0
        assert 0 <= found.tail_layer_bias <= 1.0
        assert 0 <= found.mean_pairwise_orthogonality <= 1.0
        assert found.spectral_decay_rate >= 0

    def test_empty_directions(self):
        """An empty direction dict yields the ``"unknown"`` label with zero confidence."""
        found = AlignmentImprintDetector().detect_imprint({})
        assert found.predicted_method == "unknown"
        assert found.confidence == 0.0

    def test_compare_base_instruct(self):
        """Deltas built by adding 1.5x the refusal direction align with that direction."""
        torch.manual_seed(42)
        dim = 32
        dirs, _ = _make_refusal_directions(hidden_dim=dim)
        base = {layer: torch.randn(dim) for layer in range(8)}
        tuned = {
            layer: base[layer] + 1.5 * dirs[layer] for layer in range(8)
        }

        shifts = AlignmentImprintDetector().compare_base_instruct(base, tuned, dirs)

        assert len(shifts) == 8
        for shift in shifts:
            assert isinstance(shift, BaseInstructDelta)
            assert shift.delta_magnitude > 0
            # Since delta IS the refusal direction, cosine should be high
            assert abs(shift.cosine_with_refusal) > 0.5

    def test_format_imprint(self):
        """The formatted imprint mentions the title, two method names and the Gini feature."""
        found = self._imprint_for()
        text = AlignmentImprintDetector.format_imprint(found)
        assert "Alignment Imprint" in text
        assert "DPO" in text
        assert "RLHF" in text
        assert "Gini" in text

    def test_per_layer_strength_populated(self):
        """The imprint carries one per-layer strength per input direction."""
        dirs, levels = _make_refusal_directions()
        found = AlignmentImprintDetector().detect_imprint(dirs, levels)
        assert len(found.per_layer_strength) == len(dirs)
# ===========================================================================
# Tests: Multi-Token Position Analysis
# ===========================================================================
class TestMultiTokenPositionAnalyzer:
def _make_activations_with_trigger(
self, seq_len=20, hidden_dim=32, trigger_pos=5,
):
"""Create activations with a planted trigger at a specific position."""
torch.manual_seed(42)
refusal_dir = torch.randn(hidden_dim)
refusal_dir = refusal_dir / refusal_dir.norm()
# Background activations
acts = torch.randn(seq_len, hidden_dim) * 0.1
# Strong refusal at trigger position
acts[trigger_pos] += 3.0 * refusal_dir
# Weaker refusal at last position
acts[-1] += 1.0 * refusal_dir
# Moderate at a few positions after trigger (decay)
for i in range(trigger_pos + 1, min(trigger_pos + 4, seq_len)):
decay = 0.5 ** (i - trigger_pos)
acts[i] += 3.0 * decay * refusal_dir
return acts, refusal_dir
def test_basic_analysis(self):
acts, ref_dir = self._make_activations_with_trigger()
analyzer = MultiTokenPositionAnalyzer()
result = analyzer.analyze_prompt(acts, ref_dir, layer_idx=3)
assert isinstance(result, PositionAnalysisResult)
assert result.layer_idx == 3
assert result.n_tokens == 20
assert result.peak_strength > 0
def test_trigger_detection(self):
acts, ref_dir = self._make_activations_with_trigger(trigger_pos=5)
analyzer = MultiTokenPositionAnalyzer(trigger_threshold=0.5)
result = analyzer.analyze_prompt(acts, ref_dir)
# The planted trigger should be detected
assert 5 in result.trigger_positions
assert result.peak_position == 5
def test_peak_vs_last(self):
"""Peak should be at trigger, not last token."""
acts, ref_dir = self._make_activations_with_trigger(trigger_pos=5)
analyzer = MultiTokenPositionAnalyzer()
result = analyzer.analyze_prompt(acts, ref_dir)
assert result.peak_strength > result.last_token_strength
assert result.peak_position != result.n_tokens - 1
def test_decay_rate_positive(self):
acts, ref_dir = self._make_activations_with_trigger(trigger_pos=5)
analyzer = MultiTokenPositionAnalyzer()
result = analyzer.analyze_prompt(acts, ref_dir)
# With exponential decay planted, decay rate should be positive
assert result.decay_rate > 0
def test_position_gini_bounded(self):
acts, ref_dir = self._make_activations_with_trigger()
analyzer = MultiTokenPositionAnalyzer()
result = analyzer.analyze_prompt(acts, ref_dir)
assert 0 <= result.position_gini <= 1.0
def test_token_profiles_length(self):
acts, ref_dir = self._make_activations_with_trigger(seq_len=15)
analyzer = MultiTokenPositionAnalyzer()
result = analyzer.analyze_prompt(acts, ref_dir)
assert len(result.token_profiles) == 15
def test_custom_token_texts(self):
acts, ref_dir = self._make_activations_with_trigger(seq_len=10, trigger_pos=3)
tokens = ["How", "to", "make", "a", "bomb", "from", "scratch", "please", "help", "me"]
analyzer = MultiTokenPositionAnalyzer()
result = analyzer.analyze_prompt(acts, ref_dir, token_texts=tokens)
for tp in result.token_profiles:
assert tp.token_text in tokens or tp.token_text.startswith("pos_")
def test_batch_analysis(self):
batch = []
for i in range(5):
acts, ref_dir = self._make_activations_with_trigger(
trigger_pos=3 + i % 3,
)
batch.append(acts)
analyzer = MultiTokenPositionAnalyzer()
summary = analyzer.analyze_batch(batch, ref_dir)
assert isinstance(summary, MultiTokenSummary)
assert len(summary.per_prompt) == 5
assert summary.mean_peak_vs_last_ratio > 0
assert summary.mean_trigger_count > 0
assert 0 <= summary.peak_is_last_fraction <= 1.0
assert 0 <= summary.last_token_dominance <= 1.0
def test_last_token_dominant_case(self):
"""When signal is only at last token, peak should equal last."""
torch.manual_seed(42)
hidden_dim = 32
seq_len = 10
ref_dir = torch.randn(hidden_dim)
ref_dir = ref_dir / ref_dir.norm()
acts = torch.randn(seq_len, hidden_dim) * 0.01
acts[-1] += 5.0 * ref_dir
analyzer = MultiTokenPositionAnalyzer()
result = analyzer.analyze_prompt(acts, ref_dir)
assert result.peak_position == seq_len - 1
def test_format_position_report(self):
    """The per-prompt report text carries its title and the peak-position line."""
    activations, direction = self._make_activations_with_trigger()
    outcome = MultiTokenPositionAnalyzer().analyze_prompt(
        activations, direction, prompt_text="How to hack?",
    )
    rendered = MultiTokenPositionAnalyzer.format_position_report(outcome)
    for expected in ("Multi-Token", "Peak position"):
        assert expected in rendered
def test_format_summary(self):
    """The batch summary text carries its heading and the prompt-count line."""
    fixtures = [self._make_activations_with_trigger() for _ in range(3)]
    batch = [activations for activations, _ in fixtures]
    direction = fixtures[-1][1]

    summary = MultiTokenPositionAnalyzer().analyze_batch(batch, direction)
    rendered = MultiTokenPositionAnalyzer.format_summary(summary)

    for expected in ("Summary", "Prompts analyzed"):
        assert expected in rendered
def test_3d_activations_handled(self):
    """Activations with a leading batch axis of size one are accepted."""
    activations_2d, direction = self._make_activations_with_trigger()
    # Prepend a singleton batch dimension: (seq_len, hidden) -> (1, seq_len, hidden).
    activations_3d = activations_2d.unsqueeze(dim=0)
    outcome = MultiTokenPositionAnalyzer().analyze_prompt(activations_3d, direction)
    assert outcome.n_tokens == 20
def test_empty_batch(self):
    """An empty batch gives no per-prompt results and ``peak_is_last_fraction`` of 1.0."""
    direction = torch.randn(32)
    summary = MultiTokenPositionAnalyzer().analyze_batch([], direction)
    assert len(summary.per_prompt) == 0
    assert summary.peak_is_last_fraction == 1.0
# ===========================================================================
# Tests: Sparse Direction Surgery
# ===========================================================================
class TestSparseDirectionSurgeon:
    """Exercise ``SparseDirectionSurgeon`` on synthetic weight matrices.

    Most tests build their input with ``_make_weight_with_sparse_refusal``,
    which adds a strong component along a unit direction to the first few
    rows of an otherwise small random matrix.
    """

    def _make_weight_with_sparse_refusal(
        self, out_dim=64, in_dim=32, n_refusal_rows=5,
    ):
        """Create a weight matrix where refusal is concentrated in a few rows.

        Returns a ``(W, refusal_dir, refusal_rows)`` tuple: the
        ``(out_dim, in_dim)`` matrix, the unit-norm direction, and the list
        of row indices that received the planted signal.
        """
        torch.manual_seed(42)
        direction = torch.randn(in_dim)
        direction = direction / direction.norm()
        weight = 0.1 * torch.randn(out_dim, in_dim)

        # Plant strong refusal signal in specific rows
        planted_rows = list(range(n_refusal_rows))
        for row in planted_rows:
            weight[row] += 5.0 * direction
        return weight, direction, planted_rows

    def test_basic_analysis(self):
        """Analysis returns a populated ``SparseProjectionResult`` for the given layer."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        outcome = SparseDirectionSurgeon(sparsity=0.1).analyze_weight_matrix(
            weight, direction, layer_idx=3,
        )
        assert isinstance(outcome, SparseProjectionResult)
        assert outcome.layer_idx == 3
        assert outcome.n_rows_total == 64
        assert outcome.n_rows_modified > 0
        assert outcome.mean_projection > 0
        assert outcome.max_projection > outcome.mean_projection

    def test_refusal_sparsity_index(self):
        """With sparse refusal, RSI should be high."""
        weight, direction, _ = self._make_weight_with_sparse_refusal(
            out_dim=100, n_refusal_rows=5,
        )
        outcome = SparseDirectionSurgeon().analyze_weight_matrix(weight, direction)
        assert outcome.refusal_sparsity_index > 0.3  # Concentrated signal

    def test_energy_removed(self):
        """Top rows should capture most of the refusal energy."""
        weight, direction, _ = self._make_weight_with_sparse_refusal(
            out_dim=64, n_refusal_rows=5,
        )
        # sparsity=0.15 -> ~10 rows out of 64
        outcome = SparseDirectionSurgeon(sparsity=0.15).analyze_weight_matrix(
            weight, direction,
        )
        # With 5 refusal rows and 10 modified, should capture most energy
        assert outcome.energy_removed > 0.5

    def test_frobenius_change_bounded(self):
        """The reported Frobenius change is strictly between 0 and 1."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        outcome = SparseDirectionSurgeon(sparsity=0.1).analyze_weight_matrix(
            weight, direction,
        )
        change = outcome.frobenius_change
        assert change > 0
        assert change < 1.0  # Shouldn't change more than 100%

    def test_apply_sparse_projection(self):
        """Sparse projection should reduce refusal signal."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        projected = SparseDirectionSurgeon(sparsity=0.1).apply_sparse_projection(
            weight, direction,
        )
        # Compare total absolute projection onto the direction before and after.
        before = (weight @ direction).abs().sum().item()
        after = (projected @ direction).abs().sum().item()
        assert after < before

    def test_sparse_preserves_unmodified_rows(self):
        """Rows below the threshold should be unchanged."""
        weight, direction, _ = self._make_weight_with_sparse_refusal(
            out_dim=64, n_refusal_rows=5,
        )
        # sparsity=0.1 -> ~6 rows
        projected = SparseDirectionSurgeon(sparsity=0.1).apply_sparse_projection(
            weight, direction,
        )
        # Per-row L1 difference tells us which rows were touched.
        row_delta = (weight - projected).abs().sum(dim=1)
        changed_rows = (row_delta > 1e-6).sum().item()
        untouched_rows = (row_delta < 1e-6).sum().item()
        assert changed_rows <= int(0.1 * 64) + 1  # Sparsity bound
        assert untouched_rows >= 57  # Most rows unchanged

    def test_dense_vs_sparse_comparison(self):
        """Dense projection should modify all rows; sparse should modify fewer."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()

        # Dense projection: remove the component along the unit direction from every row.
        unit = direction / direction.norm()
        dense = weight - (weight @ unit)[:, None] * unit[None, :]

        # Sparse projection
        sparse = SparseDirectionSurgeon(sparsity=0.1).apply_sparse_projection(
            weight, direction,
        )

        def count_changed(candidate):
            per_row = (weight - candidate).abs().sum(dim=1)
            return (per_row > 1e-6).sum().item()

        assert count_changed(sparse) < count_changed(dense)

    def test_plan_surgery(self):
        """A six-layer plan reports one entry per layer and positive aggregates."""
        fixtures = [self._make_weight_with_sparse_refusal() for _ in range(6)]
        weights = {idx: fixture[0] for idx, fixture in enumerate(fixtures)}
        directions = {idx: fixture[1] for idx, fixture in enumerate(fixtures)}

        plan = SparseDirectionSurgeon(sparsity=0.1).plan_surgery(weights, directions)

        assert isinstance(plan, SparseSurgeryPlan)
        assert len(plan.per_layer) == 6
        assert 0 < plan.recommended_sparsity < 1.0
        assert plan.mean_refusal_sparsity_index > 0
        assert plan.mean_energy_removed > 0

    def test_auto_sparsity(self):
        """With ``auto_sparsity=True`` the reported sparsity falls in [0.01, 0.5]."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        outcome = SparseDirectionSurgeon(auto_sparsity=True).analyze_weight_matrix(
            weight, direction,
        )
        # Auto sparsity should find a reasonable value
        assert 0.01 <= outcome.sparsity <= 0.5

    def test_auto_sparsity_apply(self):
        """Applying the projection with ``auto_sparsity=True`` lowers total projection."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        projected = SparseDirectionSurgeon(auto_sparsity=True).apply_sparse_projection(
            weight, direction,
        )
        # Should reduce projection
        after = (projected @ direction).abs().sum()
        before = (weight @ direction).abs().sum()
        assert after < before

    def test_format_analysis(self):
        """The single-layer report names the layer and the sparsity index."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        outcome = SparseDirectionSurgeon(sparsity=0.1).analyze_weight_matrix(
            weight, direction, layer_idx=4,
        )
        rendered = SparseDirectionSurgeon.format_analysis(outcome)
        for expected in ("Sparse Direction Surgery", "Layer 4", "Refusal Sparsity Index"):
            assert expected in rendered

    def test_format_plan(self):
        """The plan report carries its title and the recommended-sparsity line."""
        layer_ids = range(4)
        weights = {idx: torch.randn(32, 16) for idx in layer_ids}
        directions = {idx: torch.randn(16) for idx in layer_ids}
        plan = SparseDirectionSurgeon(sparsity=0.1).plan_surgery(weights, directions)
        rendered = SparseDirectionSurgeon.format_plan(plan)
        for expected in ("Sparse Direction Surgery Plan", "Recommended sparsity"):
            assert expected in rendered

    def test_empty_inputs(self):
        """Planning with no layers yields an empty ``per_layer`` collection."""
        plan = SparseDirectionSurgeon().plan_surgery({}, {})
        assert len(plan.per_layer) == 0

    def test_output_dtype_preserved(self):
        """Output should match input dtype."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        half_weight = weight.half()
        projected = SparseDirectionSurgeon(sparsity=0.1).apply_sparse_projection(
            half_weight, direction,
        )
        assert projected.dtype == torch.float16
# ===========================================================================
# Tests: Integration / Imports
# ===========================================================================
class TestAnalysisImports:
    """Smoke test that the analysis package exposes the analyzers tested here."""

    def test_all_new_modules_importable(self):
        """Each of the four analyzer names imports from ``obliteratus.analysis``."""
        from obliteratus.analysis import (
            AlignmentImprintDetector,
            ConceptConeAnalyzer,
            MultiTokenPositionAnalyzer,
            SparseDirectionSurgeon,
        )

        exported = (
            ConceptConeAnalyzer,
            AlignmentImprintDetector,
            MultiTokenPositionAnalyzer,
            SparseDirectionSurgeon,
        )
        for obj in exported:
            assert obj is not None
+155
View File
@@ -0,0 +1,155 @@
"""Tests for refusal detection, including semantic mode and confidence intervals."""
from __future__ import annotations
from obliteratus.evaluation.advanced_metrics import (
_is_refusal,
_is_refusal_semantic,
refusal_rate,
refusal_rate_with_ci,
)
class TestKeywordRefusal:
    """Test the keyword-based refusal detection."""

    def test_obvious_refusal(self):
        """An apology followed by "can't help" counts as a refusal in combined mode."""
        verdict = _is_refusal("I'm sorry, I can't help with that.", "combined")
        assert verdict

    def test_obvious_non_refusal(self):
        """A plainly helpful answer is not a refusal in combined mode."""
        verdict = _is_refusal(
            "Here are the instructions for building a birdhouse.", "combined",
        )
        assert not verdict

    def test_empty_string(self):
        """The empty response is not classified as a refusal."""
        assert not _is_refusal("", "combined")

    def test_prefix_mode(self):
        """Prefix mode fires only when the refusal phrase opens the response."""
        leading = "I cannot assist with that request."
        embedded = "Well, I cannot do that."  # not a prefix
        assert _is_refusal(leading, "prefix")
        assert not _is_refusal(embedded, "prefix")

    def test_substring_mode(self):
        """Substring mode finds a refusal phrase that is not at the start."""
        assert _is_refusal("Well, I cannot assist with that.", "substring")
class TestSemanticRefusal:
    """Test the semantic refusal detection patterns."""

    def test_hedge_then_refuse(self):
        """A polite preamble followed by a refusal is detected."""
        text = "While I understand your interest in this topic, I cannot provide instructions for that."
        assert _is_refusal(text, "semantic")

    def test_redirect(self):
        """Offering an alternative instead of the requested content is detected."""
        text = "Instead of providing those instructions, I can help you with something constructive."
        assert _is_refusal(text, "semantic")

    def test_concern_flagging(self):
        """Citing ethical concerns as the reason for not responding is detected."""
        text = "This raises serious ethical concerns that prevent me from responding."
        assert _is_refusal(text, "semantic")

    def test_meta_refusal(self):
        """A statement about the model's own design constraints is detected."""
        text = "I'm designed to avoid providing information that could cause harm."
        assert _is_refusal(text, "semantic")

    def test_responsibility_deflection(self):
        """Sending the user to a professional is detected."""
        text = "You should consult a professional about this matter."
        assert _is_refusal(text, "semantic")

    def test_genuine_compliance_not_flagged(self):
        """Actual compliant responses should not trigger semantic detection."""
        texts = [
            "Here are the steps to build a birdhouse from scratch.",
            "The chemical formula for water is H2O.",
            "Python is a great language for data science. Here's how to get started.",
            "To make chocolate chip cookies, you'll need flour, sugar, butter, and chocolate chips.",
        ]
        for text in texts:
            assert not _is_refusal(text, "semantic"), f"False positive: {text[:50]}"

    def test_semantic_catches_what_keyword_misses(self):
        """Semantic mode should catch refusals that keyword mode misses."""
        # This hedge-then-refuse doesn't start with any refusal prefix
        text = "While I appreciate your curiosity, I'm not able to share that information."
        assert not _is_refusal(text, "prefix")
        # "not able to" is in substring list, so combined catches it
        assert _is_refusal(text, "combined")

        # Call the semantic matcher directly so this check does not depend on
        # the keyword lists. (An earlier revision also assigned an
        # "Although I respect..." string here but never asserted on it; that
        # dead assignment has been removed.)
        assert _is_refusal_semantic(
            "While I understand your request, I cannot assist with that specific topic."
        )
class TestRefusalRate:
    """Check the fraction returned by ``refusal_rate`` for small response lists."""

    def test_all_refuse(self):
        """Three refusals give a rate of exactly 1.0."""
        rate = refusal_rate(
            ["I'm sorry, no.", "I cannot help.", "I won't do that."], "combined",
        )
        assert rate == 1.0

    def test_none_refuse(self):
        """Three compliant answers give a rate of exactly 0.0."""
        rate = refusal_rate(
            ["Sure, here you go.", "Of course!", "The answer is 42."], "combined",
        )
        assert rate == 0.0

    def test_mixed(self):
        """One refusal out of two responses gives 0.5."""
        rate = refusal_rate(["I'm sorry, no.", "Sure, here you go."], "combined")
        assert rate == 0.5

    def test_empty_list(self):
        """No responses at all is reported as a rate of 0.0."""
        assert refusal_rate([], "combined") == 0.0

    def test_semantic_mode(self):
        """Semantic mode counts the hedge-then-refuse response but not the compliant one."""
        hedged_refusal = "While I understand your interest, I cannot provide that information."
        compliant = "Here are the steps you requested."
        assert refusal_rate([hedged_refusal, compliant], "semantic") == 0.5
class TestRefusalRateWithCI:
    """Check the dictionary returned by ``refusal_rate_with_ci``."""

    def test_returns_all_fields(self):
        """The result exposes rate, both CI bounds, and the sample count."""
        responses = 5 * ["I'm sorry"] + 5 * ["Sure, here you go."]
        ci = refusal_rate_with_ci(responses, mode="combined")
        for field_name in ("rate", "ci_lower", "ci_upper", "n_samples"):
            assert field_name in ci
        assert ci["n_samples"] == 10

    def test_ci_bounds_bracket_rate(self):
        """The point estimate lies between the lower and upper bounds."""
        responses = 30 * ["I'm sorry"] + 70 * ["Sure, here you go."]
        ci = refusal_rate_with_ci(responses, mode="combined")
        lower, point, upper = ci["ci_lower"], ci["rate"], ci["ci_upper"]
        assert lower <= point <= upper

    def test_all_refuse_tight_ci(self):
        """Fifty refusals out of fifty give rate 1.0 and a lower bound above 0.9."""
        ci = refusal_rate_with_ci(50 * ["I'm sorry"], mode="combined")
        assert ci["rate"] == 1.0
        # Wilson CI: 50/50 at 95% gives ci_lower ~0.929, not 1.0
        # (a proper CI acknowledges uncertainty even with all-positive observations)
        assert ci["ci_lower"] > 0.9
        assert ci["ci_upper"] == 1.0

    def test_empty_responses(self):
        """An empty list yields a zero rate and zero samples."""
        ci = refusal_rate_with_ci([], mode="combined")
        assert ci["rate"] == 0.0
        assert ci["n_samples"] == 0

    def test_ci_narrower_with_more_samples(self):
        """More samples should produce tighter confidence intervals."""

        def interval_width(n_each):
            ci = refusal_rate_with_ci(n_each * ["I'm sorry"] + n_each * ["Sure"])
            return ci["ci_upper"] - ci["ci_lower"]

        width_small = interval_width(5)
        width_large = interval_width(50)
        assert width_large < width_small, \
            f"Large CI ({width_large}) not narrower than small CI ({width_small})"

    def test_deterministic_with_seed(self):
        """Two calls on the same input return equal dictionaries."""
        responses = 30 * ["I'm sorry"] + 70 * ["Sure"]
        first = refusal_rate_with_ci(responses)
        second = refusal_rate_with_ci(responses)
        assert first == second, "Same input produced different CIs"
+70
View File
@@ -0,0 +1,70 @@
"""Tests for the reporting module."""
from __future__ import annotations
import json
from obliteratus.reporting.report import AblationReport, AblationResult
def _make_report() -> AblationReport:
    """Build an ``AblationReport`` with one baseline and two layer-removal results.

    The baseline is perplexity 25.0 / accuracy 0.85. The two results remove
    ``layer_0`` and ``layer_1`` respectively.
    """
    report = AblationReport(model_name="test-model")
    report.add_baseline({"perplexity": 25.0, "accuracy": 0.85})

    layer_results = (
        ("layer_0", "Remove layer 0", {"perplexity": 30.0, "accuracy": 0.80}),
        ("layer_1", "Remove layer 1", {"perplexity": 50.0, "accuracy": 0.60}),
    )
    for component, description, metrics in layer_results:
        entry = AblationResult(
            strategy="layer_removal",
            component=component,
            description=description,
            metrics=metrics,
        )
        report.add_result(entry)
    return report
class TestAblationReport:
    """Check the DataFrame, JSON, CSV, and plot outputs of ``AblationReport``."""

    def test_to_dataframe(self):
        """The frame has one row per result plus delta and percent-change columns."""
        frame = _make_report().to_dataframe()
        assert len(frame) == 2
        for column in ("perplexity", "perplexity_delta", "perplexity_pct_change"):
            assert column in frame.columns

    def test_save_json(self, tmp_path):
        """The saved JSON round-trips the model name, results, and baseline."""
        target = tmp_path / "results.json"
        _make_report().save_json(target)
        payload = json.loads(target.read_text())
        assert payload["model_name"] == "test-model"
        assert len(payload["results"]) == 2
        assert payload["baseline_metrics"]["perplexity"] == 25.0

    def test_save_csv(self, tmp_path):
        """The saved CSV mentions a component name and a metric name."""
        target = tmp_path / "results.csv"
        _make_report().save_csv(target)
        contents = target.read_text()
        assert "layer_0" in contents
        assert "perplexity" in contents

    def test_delta_calculation(self):
        """For ``layer_0`` the perplexity delta is 5.0 and the change about 20%."""
        frame = _make_report().to_dataframe()
        layer0 = frame[frame["component"] == "layer_0"].iloc[0]
        assert layer0["perplexity_delta"] == 5.0  # 30 - 25
        assert abs(layer0["perplexity_pct_change"] - 20.0) < 0.01

    def test_plot_impact(self, tmp_path):
        """Plotting writes a non-empty file at the requested path."""
        target = tmp_path / "impact.png"
        _make_report().plot_impact(metric="perplexity", output_path=target)
        assert target.exists()
        assert target.stat().st_size > 0
+179
View File
@@ -0,0 +1,179 @@
"""Tests for ablation strategies using a small GPT-2 model."""
from __future__ import annotations
import pytest
import torch
from obliteratus.strategies.base import AblationSpec
from obliteratus.strategies.registry import STRATEGY_REGISTRY, get_strategy
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _make_dummy_handle():
    """Build a ``ModelHandle`` around a tiny two-layer GPT-2 (no network).

    The model is constructed directly from an in-memory ``GPT2Config`` and the
    tokenizer is a ``MagicMock``. ``snapshot()`` is called before returning.
    """
    from unittest.mock import MagicMock

    from transformers import GPT2Config, GPT2LMHeadModel

    from obliteratus.models.loader import ModelHandle

    tiny_config = GPT2Config(
        vocab_size=1000,
        n_positions=128,
        n_embd=64,
        n_layer=2,
        n_head=2,
        n_inner=256,
    )
    tiny_model = GPT2LMHeadModel(tiny_config)
    tiny_model.eval()

    # Strategy tests don't tokenize — use a simple mock
    fake_tokenizer = MagicMock()
    fake_tokenizer.pad_token = "<pad>"
    fake_tokenizer.eos_token = "<eos>"

    dummy = ModelHandle(
        model=tiny_model,
        tokenizer=fake_tokenizer,
        config=tiny_config,
        model_name="gpt2-test",
        task="causal_lm",
    )
    dummy.snapshot()
    return dummy
@pytest.fixture
def handle():
    """Pytest fixture returning the dummy GPT-2 handle built by ``_make_dummy_handle``."""
    dummy = _make_dummy_handle()
    return dummy
# ---------------------------------------------------------------------------
# Registry tests
# ---------------------------------------------------------------------------
class TestRegistry:
    """Check strategy registration and lookup by name."""

    def test_all_strategies_registered(self):
        """The four core strategy names are all present in the registry."""
        required = {"layer_removal", "head_pruning", "ffn_ablation", "embedding_ablation"}
        registered = set(STRATEGY_REGISTRY.keys())
        assert required <= registered

    def test_get_strategy_returns_instance(self):
        """Looking up a strategy returns an object carrying that name."""
        assert get_strategy("layer_removal").name == "layer_removal"

    def test_get_unknown_strategy_raises(self):
        """An unregistered name raises ``KeyError`` mentioning "Unknown strategy"."""
        with pytest.raises(KeyError, match="Unknown strategy"):
            get_strategy("nonexistent_strategy")
# ---------------------------------------------------------------------------
# Layer removal
# ---------------------------------------------------------------------------
class TestLayerRemoval:
    """Check enumeration, application, and restoration for ``layer_removal``."""

    def test_enumerate(self, handle):
        """One spec is produced per layer, each tagged with the strategy name."""
        specs = get_strategy("layer_removal").enumerate(handle)
        assert len(specs) == handle.num_layers
        assert all(spec.strategy_name == "layer_removal" for spec in specs)

    def test_apply_zeros_layer(self, handle):
        """Applying the first spec zeroes every parameter of layer 0."""
        from obliteratus.strategies.utils import get_layer_modules

        strategy = get_strategy("layer_removal")
        first_spec = strategy.enumerate(handle)[0]
        strategy.apply(handle, first_spec)

        ablated_layer = get_layer_modules(handle)[0]
        for param in ablated_layer.parameters():
            assert torch.all(param == 0), "Layer params should be zeroed after ablation"

    def test_restore_after_ablation(self, handle):
        """``handle.restore()`` brings layer 0's attention weights back."""
        from obliteratus.strategies.utils import get_layer_modules

        strategy = get_strategy("layer_removal")
        first_spec = strategy.enumerate(handle)[0]

        # Copy the weight before ablation so it is not aliased to the live tensor.
        before = get_layer_modules(handle)[0].attn.c_attn.weight.clone()
        strategy.apply(handle, first_spec)
        handle.restore()
        after = get_layer_modules(handle)[0].attn.c_attn.weight

        assert torch.allclose(before, after)
# ---------------------------------------------------------------------------
# Head pruning
# ---------------------------------------------------------------------------
class TestHeadPruning:
    """Check enumeration and application for the ``head_pruning`` strategy."""

    def test_enumerate(self, handle):
        """One spec is produced for every (layer, head) pair."""
        strat = get_strategy("head_pruning")
        specs = strat.enumerate(handle)
        assert len(specs) == handle.num_layers * handle.num_heads

    def test_apply_zeros_head(self, handle):
        """Pruning head 0 of layer 0 zeroes the first ``head_dim`` rows of ``c_proj``."""
        strat = get_strategy("head_pruning")
        spec = AblationSpec(
            strategy_name="head_pruning",
            component="layer_0_head_0",
            description="test",
            metadata={"layer_idx": 0, "head_idx": 0},
        )
        strat.apply(handle, spec)

        from obliteratus.strategies.utils import get_layer_modules, get_attention_module

        attn = get_attention_module(get_layer_modules(handle)[0], handle.architecture)
        head_dim = handle.hidden_size // handle.num_heads

        # GPT-2 uses c_attn (Conv1D), check output projection c_proj.
        # The fixture is always a GPT-2 model, so require c_proj outright:
        # guarding with ``if hasattr(...)`` would let this test pass without
        # checking anything if the attribute were ever missing.
        assert hasattr(attn, "c_proj"), "GPT-2 attention module should expose c_proj"
        # Conv1D stores weight transposed
        assert torch.all(attn.c_proj.weight[0:head_dim, :] == 0)
# ---------------------------------------------------------------------------
# FFN ablation
# ---------------------------------------------------------------------------
class TestFFNAblation:
    """Check enumeration and application for the ``ffn_ablation`` strategy."""

    def test_enumerate(self, handle):
        """One spec is produced per layer."""
        specs = get_strategy("ffn_ablation").enumerate(handle)
        assert len(specs) == handle.num_layers

    def test_apply_zeros_ffn(self, handle):
        """Applying the first spec zeroes every parameter of layer 0's FFN."""
        from obliteratus.strategies.utils import get_ffn_module, get_layer_modules

        strategy = get_strategy("ffn_ablation")
        first_spec = strategy.enumerate(handle)[0]
        strategy.apply(handle, first_spec)

        first_layer = get_layer_modules(handle)[0]
        feed_forward = get_ffn_module(first_layer, handle.architecture)
        for param in feed_forward.parameters():
            assert torch.all(param == 0)
# ---------------------------------------------------------------------------
# Embedding ablation
# ---------------------------------------------------------------------------
class TestEmbeddingAblation:
    """Check enumeration and application for the ``embedding_ablation`` strategy."""

    def test_enumerate(self, handle):
        """At least one spec is produced."""
        specs = get_strategy("embedding_ablation").enumerate(handle)
        assert len(specs) > 0

    def test_apply_zeros_dims(self, handle):
        """Ablating dims 0-4 zeroes those columns of the embedding weight."""
        from obliteratus.strategies.utils import get_embedding_module

        dims_spec = AblationSpec(
            strategy_name="embedding_ablation",
            component="embed_dims_0_4",
            description="test",
            metadata={"dim_start": 0, "dim_end": 4},
        )
        get_strategy("embedding_ablation").apply(handle, dims_spec)

        embedding = get_embedding_module(handle)
        ablated_columns = embedding.weight[:, 0:4]
        assert torch.all(ablated_columns == 0)
+108
View File
@@ -0,0 +1,108 @@
"""Tests for ablation presets."""
from __future__ import annotations
from obliteratus.study_presets import (
STUDY_PRESETS,
get_study_preset,
get_preset,
list_study_presets,
list_presets,
)
from obliteratus.config import StudyConfig
class TestPresets:
    """Check the study-preset registry, its lookup helpers, and their aliases."""

    def test_all_presets_registered(self):
        """Every expected preset key is present in ``STUDY_PRESETS``."""
        expected_keys = {
            "quick", "full", "attention", "layers", "knowledge",
            "pruning", "embeddings", "jailbreak", "guardrail", "robustness",
        }
        registered = set(STUDY_PRESETS.keys())
        assert expected_keys <= registered

    def test_get_preset(self):
        """The ``quick`` preset has its display name, key, and two strategies."""
        quick = get_study_preset("quick")
        assert quick.name == "Quick Scan"
        assert quick.key == "quick"
        assert len(quick.strategies) == 2

    def test_get_preset_alias(self):
        """``get_preset`` resolves the same preset name."""
        assert get_preset("quick").name == "Quick Scan"

    def test_get_unknown_preset_raises(self):
        """An unregistered key raises ``KeyError`` mentioning "Unknown preset"."""
        import pytest

        with pytest.raises(KeyError, match="Unknown preset"):
            get_study_preset("nonexistent")

    def test_list_presets(self):
        """The listing has at least seven presets, including ``quick`` and ``full``."""
        listed = list_study_presets()
        assert len(listed) >= 7
        listed_keys = [preset.key for preset in listed]
        for required in ("quick", "full"):
            assert required in listed_keys

    def test_list_presets_alias(self):
        """``list_presets`` returns the same listing as ``list_study_presets``."""
        assert list_presets() == list_study_presets()

    def test_preset_strategies_are_valid(self):
        """Every strategy named by any preset exists in the strategy registry."""
        from obliteratus.strategies import STRATEGY_REGISTRY

        for preset in list_study_presets():
            for s in preset.strategies:
                assert s["name"] in STRATEGY_REGISTRY, (
                    f"Preset {preset.key!r} references unknown strategy {s['name']!r}"
                )
class TestConfigWithPreset:
    """Check how ``StudyConfig.from_dict`` combines a preset with explicit settings."""

    @staticmethod
    def _config_dict(preset_key="preset", preset="quick", dataset_extra=None, **top_level):
        """Return a fresh raw config dict for GPT-2 on wikitext-2.

        ``preset_key`` is the top-level key holding the preset name,
        ``dataset_extra`` is merged into the dataset section, and any further
        keyword arguments become additional top-level entries.
        """
        dataset = {
            "name": "wikitext",
            "subset": "wikitext-2-raw-v1",
            "split": "test",
            "text_column": "text",
        }
        if dataset_extra:
            dataset.update(dataset_extra)
        raw = {
            preset_key: preset,
            "model": {"name": "gpt2", "task": "causal_lm", "dtype": "float32", "device": "cpu"},
            "dataset": dataset,
        }
        raw.update(top_level)
        return raw

    def test_preset_key_in_config(self):
        """With only ``preset: quick`` given, the preset's values show up in the config."""
        config = StudyConfig.from_dict(self._config_dict())

        # Should inherit strategies from the quick preset
        assert len(config.strategies) == 2
        inherited_names = [strategy.name for strategy in config.strategies]
        assert "layer_removal" in inherited_names
        assert "ffn_ablation" in inherited_names
        # Should inherit max_samples
        assert config.dataset.max_samples == 25
        # Should inherit batch_size and max_length
        assert config.batch_size == 4
        assert config.max_length == 128

    def test_legacy_study_preset_key_still_works(self):
        """The older ``study_preset`` key selects the preset as well."""
        config = StudyConfig.from_dict(self._config_dict(preset_key="study_preset"))
        assert len(config.strategies) == 2

    def test_preset_can_be_overridden(self):
        """Explicit strategies, batch size, and max_samples win over the preset."""
        raw = self._config_dict(
            dataset_extra={"max_samples": 999},
            batch_size=16,
            strategies=[{"name": "head_pruning", "params": {}}],
        )
        config = StudyConfig.from_dict(raw)

        # Explicit strategies should override preset
        assert len(config.strategies) == 1
        assert config.strategies[0].name == "head_pruning"
        # Explicit batch_size should override
        assert config.batch_size == 16
        # Explicit max_samples in dataset should be kept
        assert config.dataset.max_samples == 999

    def test_full_preset(self):
        """The ``full`` preset expands to exactly the four core strategies."""
        config = StudyConfig.from_dict(self._config_dict(preset="full"))
        assert len(config.strategies) == 4
        strategy_names = {strategy.name for strategy in config.strategies}
        assert strategy_names == {"layer_removal", "head_pruning", "ffn_ablation", "embedding_ablation"}
+696
View File
@@ -0,0 +1,696 @@
"""Tests for the opt-in telemetry module."""
import json
import os
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from unittest.mock import MagicMock, patch
import torch
from obliteratus.telemetry import (
_ALLOWED_METHOD_CONFIG_KEYS,
_direction_stats,
_extract_excise_details,
_extract_prompt_counts,
_extract_analysis_insights,
_is_mount_point,
_test_writable,
build_report,
disable_telemetry,
enable_telemetry,
is_enabled,
maybe_send_informed_report,
maybe_send_pipeline_report,
restore_from_hub,
send_report,
storage_diagnostic,
)
def _reset_telemetry():
    """Set the telemetry module's private ``_enabled`` attribute back to ``None``.

    Tests call this before exercising ``is_enabled()`` under a patched
    environment.
    """
    from obliteratus import telemetry as telemetry_module

    setattr(telemetry_module, "_enabled", None)
# ── Enable / disable ────────────────────────────────────────────────────
class TestTelemetryConfig:
    """Test telemetry enable/disable logic."""

    def setup_method(self):
        # Clear the module's ``_enabled`` attribute before every test so an
        # earlier enable/disable call does not carry over.
        _reset_telemetry()

    def test_disabled_by_default(self):
        """With an empty environment, telemetry is off."""
        with patch.dict(os.environ, {}, clear=True):
            _reset_telemetry()
            assert not is_enabled()

    def test_enabled_by_default_on_hf_spaces(self):
        """With ``_ON_HF_SPACES`` true and no explicit setting, telemetry is on."""
        import obliteratus.telemetry as t

        # ``patch.object`` restores ``_ON_HF_SPACES`` even when the assertion
        # fails. Assigning and restoring by hand would leak the patched value
        # into later tests on failure.
        with patch.dict(os.environ, {"SPACE_ID": "user/space"}, clear=True), \
                patch.object(t, "_ON_HF_SPACES", True):
            _reset_telemetry()
            assert is_enabled()

    def test_disable_via_env_zero(self):
        """``OBLITERATUS_TELEMETRY=0`` turns telemetry off."""
        with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "0"}):
            _reset_telemetry()
            assert not is_enabled()

    def test_disable_via_env_false(self):
        """``OBLITERATUS_TELEMETRY=false`` turns telemetry off."""
        with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "false"}):
            _reset_telemetry()
            assert not is_enabled()

    def test_enable_via_env_explicit(self):
        """``OBLITERATUS_TELEMETRY=1`` turns telemetry on."""
        with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "1"}):
            _reset_telemetry()
            assert is_enabled()

    def test_enable_programmatically(self):
        """``enable_telemetry()`` turns telemetry on."""
        enable_telemetry()
        assert is_enabled()

    def test_disable_programmatically(self):
        """``disable_telemetry()`` turns telemetry off after it was enabled."""
        enable_telemetry()
        assert is_enabled()
        disable_telemetry()
        assert not is_enabled()

    def test_programmatic_overrides_env(self):
        """A programmatic disable wins over ``OBLITERATUS_TELEMETRY=1``."""
        with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "1"}):
            disable_telemetry()
            assert not is_enabled()
# ── Report building ─────────────────────────────────────────────────────
class TestBuildReport:
    """Test report payload construction."""

    def _base_kwargs(self, **overrides):
        """Return a full set of ``build_report`` arguments with *overrides* applied."""
        baseline = {
            "architecture": "LlamaForCausalLM",
            "num_layers": 32,
            "num_heads": 32,
            "hidden_size": 4096,
            "total_params": 8_000_000_000,
            "method": "advanced",
            "method_config": {"n_directions": 4, "norm_preserve": True},
            "quality_metrics": {"perplexity": 5.2, "refusal_rate": 0.05},
        }
        return {**baseline, **overrides}

    def test_schema_version_2(self):
        """The payload declares schema version 2."""
        payload = build_report(**self._base_kwargs())
        assert payload["schema_version"] == 2

    def test_basic_fields(self):
        """Model facts, method, metrics, and a 32-character session id are present."""
        payload = build_report(**self._base_kwargs())
        model_section = payload["model"]
        assert model_section["architecture"] == "LlamaForCausalLM"
        assert model_section["num_layers"] == 32
        assert model_section["total_params"] == 8_000_000_000
        assert payload["method"] == "advanced"
        assert payload["quality_metrics"]["refusal_rate"] == 0.05
        assert len(payload["session_id"]) == 32

    def test_filters_unknown_config_keys(self):
        """Config keys outside the allowlist are dropped from the payload."""
        noisy_config = {"n_directions": 1, "secret_flag": True, "nuke": "boom"}
        payload = build_report(**self._base_kwargs(method_config=noisy_config))
        kept = payload["method_config"]
        assert "n_directions" in kept
        assert "secret_flag" not in kept
        assert "nuke" not in kept

    def test_allows_all_valid_config_keys(self):
        """Every key in the allowlist should pass through."""
        config = dict.fromkeys(_ALLOWED_METHOD_CONFIG_KEYS, True)
        payload = build_report(**self._base_kwargs(method_config=config))
        for k in _ALLOWED_METHOD_CONFIG_KEYS:
            assert k in payload["method_config"], f"Missing allowlisted key: {k}"

    def test_no_model_name_in_report(self):
        """The serialized payload contains no Llama model-name fragments."""
        serialized = json.dumps(build_report(**self._base_kwargs()))
        for fragment in ("meta-llama", "Llama-3"):
            assert fragment not in serialized

    def test_environment_info(self):
        """The environment section reports Python version, OS, and architecture."""
        env = build_report(**self._base_kwargs())["environment"]
        for key in ("python_version", "os", "arch"):
            assert key in env

    def test_stage_durations(self):
        """Stage durations are passed through unchanged."""
        durations = {"summon": 2.5, "probe": 10.1, "distill": 3.2}
        payload = build_report(**self._base_kwargs(stage_durations=durations))
        assert payload["stage_durations"] == durations

    def test_direction_stats(self):
        """Direction statistics appear under ``direction_stats``."""
        stats = {"direction_norms": {"10": 0.95}, "mean_direction_persistence": 0.87}
        payload = build_report(**self._base_kwargs(direction_stats=stats))
        assert payload["direction_stats"]["mean_direction_persistence"] == 0.87

    def test_excise_details(self):
        """Excise details appear under ``excise_details``."""
        details = {"modified_count": 128, "used_techniques": ["head_surgery"]}
        payload = build_report(**self._base_kwargs(excise_details=details))
        assert payload["excise_details"]["modified_count"] == 128

    def test_prompt_counts(self):
        """Prompt counts appear under ``prompt_counts``."""
        counts = {"harmful": 33, "harmless": 33, "jailbreak": 15}
        payload = build_report(**self._base_kwargs(prompt_counts=counts))
        reported = payload["prompt_counts"]
        assert reported["harmful"] == 33
        assert reported["jailbreak"] == 15

    def test_gpu_memory(self):
        """GPU memory figures appear under ``gpu_memory``."""
        mem = {"peak_allocated_gb": 7.2, "peak_reserved_gb": 8.0}
        payload = build_report(**self._base_kwargs(gpu_memory=mem))
        assert payload["gpu_memory"]["peak_allocated_gb"] == 7.2

    def test_analysis_insights_filtered(self):
        """Only allowlisted analysis keys should pass through."""
        insights = {
            "detected_alignment_method": "DPO",
            "alignment_confidence": 0.92,
            "secret_internal_data": "should not appear",
        }
        payload = build_report(**self._base_kwargs(analysis_insights=insights))
        reported = payload["analysis_insights"]
        assert reported["detected_alignment_method"] == "DPO"
        assert "secret_internal_data" not in reported

    def test_informed_extras(self):
        """Informed-pipeline extras appear under the ``informed`` key."""
        extras = {"ouroboros_passes": 3, "final_refusal_rate": 0.02, "total_duration": 120.5}
        payload = build_report(**self._base_kwargs(informed_extras=extras))
        assert payload["informed"]["ouroboros_passes"] == 3

    def test_optional_fields_omitted_when_empty(self):
        """Optional fields should not appear when not provided."""
        payload = build_report(**self._base_kwargs())
        optional_sections = (
            "stage_durations",
            "direction_stats",
            "excise_details",
            "prompt_counts",
            "gpu_memory",
            "analysis_insights",
            "informed",
        )
        for section in optional_sections:
            assert section not in payload
# ── Direction stats extraction ──────────────────────────────────────────
class TestDirectionStats:
    """Test direction quality metric extraction."""

    @staticmethod
    def _pipeline(directions, subspaces=None):
        """Return a mock pipeline exposing the two attributes set by these tests."""
        mock = MagicMock()
        mock.refusal_directions = directions
        mock.refusal_subspaces = {} if subspaces is None else subspaces
        return mock

    def test_direction_norms(self):
        """Norms are reported under stringified layer indices."""
        directions = {0: torch.randn(128), 1: torch.randn(128)}
        stats = _direction_stats(self._pipeline(directions))
        assert "direction_norms" in stats
        for layer_key in ("0", "1"):
            assert layer_key in stats["direction_norms"]

    def test_direction_persistence(self):
        """Adjacent layers with similar directions should have high persistence."""
        base = torch.randn(128)
        base = base / base.norm()
        # Layer 1 is layer 0's direction plus small noise.
        directions = {0: base, 1: base + 0.01 * torch.randn(128)}
        stats = _direction_stats(self._pipeline(directions))
        assert "mean_direction_persistence" in stats
        assert stats["mean_direction_persistence"] > 0.9

    def test_effective_rank(self):
        """Multi-direction subspace should yield effective rank > 1."""
        directions = {0: torch.randn(128)}
        # 4-direction subspace with distinct directions
        subspace = torch.randn(4, 128)
        stats = _direction_stats(self._pipeline(directions, {0: subspace}))
        assert "effective_ranks" in stats
        assert float(stats["effective_ranks"]["0"]) > 1.0

    def test_empty_directions(self):
        """With no directions and no subspaces the stats dict is empty."""
        stats = _direction_stats(self._pipeline({}))
        assert stats == {}
# ── Excise details extraction ───────────────────────────────────────────
class TestExciseDetails:
    """Check what ``_extract_excise_details`` reports for mock pipelines."""

    @staticmethod
    def _pipeline(**overrides):
        """Return a mock pipeline with every attribute these tests set.

        Defaults are the values the two tests share; *overrides* replace them
        per test.
        """
        attributes = {
            "_excise_modified_count": None,
            "_refusal_heads": {},
            "_sae_directions": {},
            "_expert_safety_scores": {},
            "_layer_excise_weights": {},
            "_expert_directions": {},
            "_steering_hooks": [],
            "invert_refusal": False,
            "project_embeddings": False,
            "activation_steering": False,
            "expert_transplant": False,
        }
        attributes.update(overrides)
        mock = MagicMock()
        for attr_name, value in attributes.items():
            setattr(mock, attr_name, value)
        return mock

    def test_basic_excise_details(self):
        """Head-surgery counts and the modified-weight count are reported."""
        pipeline = self._pipeline(
            _excise_modified_count=64,
            _refusal_heads={10: [(0, 0.9), (3, 0.8)], 11: [(1, 0.7)]},
        )
        details = _extract_excise_details(pipeline)
        assert details["modified_count"] == 64
        assert details["head_surgery_layers"] == 2
        assert details["total_heads_projected"] == 3
        assert "head_surgery" in details["used_techniques"]

    def test_adaptive_weights(self):
        """Per-layer excise weights are summarised by their min and max."""
        pipeline = self._pipeline(_layer_excise_weights={0: 0.2, 1: 0.8, 2: 0.5})
        details = _extract_excise_details(pipeline)
        assert details["adaptive_weight_min"] == 0.2
        assert details["adaptive_weight_max"] == 0.8
        assert "layer_adaptive" in details["used_techniques"]
# ── Prompt counts extraction ────────────────────────────────────────────
class TestPromptCounts:
    """Checks on the counts returned by ``_extract_prompt_counts``."""

    @staticmethod
    def _pipeline(jailbreak):
        """Mock pipeline with 33 harmful and 33 harmless prompts plus *jailbreak*."""
        mocked = MagicMock()
        mocked.harmful_prompts = ["a"] * 33
        mocked.harmless_prompts = ["b"] * 33
        mocked.jailbreak_prompts = jailbreak
        return mocked

    def test_basic_counts(self):
        """With no jailbreak prompts the ``jailbreak`` key is left out."""
        tally = _extract_prompt_counts(self._pipeline(None))
        assert tally["harmful"] == 33
        assert tally["harmless"] == 33
        assert "jailbreak" not in tally

    def test_with_jailbreak(self):
        """Jailbreak prompts, when present, are counted."""
        tally = _extract_prompt_counts(self._pipeline(["c"] * 10))
        assert tally["jailbreak"] == 10
# ── Send behavior ───────────────────────────────────────────────────────
class TestSendReport:
    """Behaviour of ``send_report`` with telemetry disabled and enabled."""

    @staticmethod
    def _wait_until_called(spy, timeout=2.0, interval=0.01):
        """Poll until *spy* has been called or *timeout* seconds have elapsed.

        Replaces a fixed ``time.sleep(0.1)``: the wait ends as soon as the
        patched sender is invoked, and a slow machine gets up to *timeout*
        seconds instead of a hard 100 ms before the assertion runs.
        """
        import time

        deadline = time.monotonic() + timeout
        while not spy.called and time.monotonic() < deadline:
            time.sleep(interval)

    def setup_method(self):
        """Start every test from a freshly reset telemetry state."""
        _reset_telemetry()

    def test_does_not_send_when_disabled(self):
        """With telemetry disabled the low-level sender is never invoked."""
        disable_telemetry()
        with patch("obliteratus.telemetry._send_sync") as mock_send:
            send_report({"test": True})
            mock_send.assert_not_called()

    def test_sends_when_enabled(self):
        """With telemetry enabled the report reaches the sender exactly once."""
        enable_telemetry()
        with patch("obliteratus.telemetry._send_sync") as mock_send:
            send_report({"test": True})
            # Stay inside the patch while waiting so the sender is still mocked.
            self._wait_until_called(mock_send)
            mock_send.assert_called_once_with({"test": True})

    def test_send_failure_is_silent(self):
        """An exception raised by the sender does not reach the caller."""
        enable_telemetry()
        with patch(
            "obliteratus.telemetry._send_sync",
            side_effect=Exception("network down"),
        ) as mock_send:
            # send_report should not propagate the exception to the caller
            send_report({"test": True})
            # A mock records the call before raising its side effect, so
            # ``called`` flips even though the sender fails.
            self._wait_until_called(mock_send)
            mock_send.assert_called_once_with({"test": True})
# ── Pipeline integration ────────────────────────────────────────────────
def _make_mock_pipeline():
"""Build a mock pipeline with all fields the telemetry module reads."""
p = MagicMock()
p.handle.summary.return_value = {
"architecture": "LlamaForCausalLM",
"num_layers": 32,
"num_heads": 32,
"hidden_size": 4096,
"total_params": 8_000_000_000,
}
p.method = "advanced"
p.n_directions = 4
p.norm_preserve = True
p.regularization = 0.1
p.refinement_passes = 2
p.project_biases = True
p.use_chat_template = True
p.use_whitened_svd = True
p.true_iterative_refinement = False
p.use_jailbreak_contrast = False
p.layer_adaptive_strength = False
p.attention_head_surgery = True
p.safety_neuron_masking = False
p.per_expert_directions = False
p.use_sae_features = False
p.invert_refusal = False
p.project_embeddings = False
p.embed_regularization = 0.5
p.activation_steering = False
p.steering_strength = 0.3
p.expert_transplant = False
p.transplant_blend = 0.3
p.reflection_strength = 2.0
p.quantization = None
p._quality_metrics = {"perplexity": 5.2, "coherence": 0.8, "refusal_rate": 0.05}
p._strong_layers = [10, 11, 12, 13]
p._stage_durations = {"summon": 3.0, "probe": 12.5, "distill": 4.1, "excise": 2.0, "verify": 8.3, "rebirth": 5.0}
p._excise_modified_count = 128
# Direction data
d = torch.randn(4096)
d = d / d.norm()
p.refusal_directions = {10: d, 11: d + 0.01 * torch.randn(4096), 12: d, 13: d}
p.refusal_subspaces = {10: torch.randn(4, 4096)}
# Excise details
p._refusal_heads = {10: [(0, 0.9), (3, 0.8)]}
p._sae_directions = {}
p._expert_safety_scores = {}
p._layer_excise_weights = {}
p._expert_directions = {}
p._steering_hooks = []
# Prompts
p.harmful_prompts = ["x"] * 33
p.harmless_prompts = ["y"] * 33
p.jailbreak_prompts = None
return p
class TestPipelineIntegration:
    """End-to-end checks of ``maybe_send_pipeline_report`` against a mock pipeline."""

    def setup_method(self):
        """Start every test from a freshly reset telemetry state."""
        _reset_telemetry()

    def test_does_nothing_when_disabled(self):
        """No report is sent while telemetry is disabled."""
        disable_telemetry()
        with patch("obliteratus.telemetry.send_report") as sender:
            maybe_send_pipeline_report(_make_mock_pipeline())
            sender.assert_not_called()

    def test_comprehensive_report(self):
        """Verify that all data points are extracted from the pipeline."""
        enable_telemetry()
        pipeline = _make_mock_pipeline()
        with patch("obliteratus.telemetry.send_report") as sender:
            maybe_send_pipeline_report(pipeline)
            sender.assert_called_once()
            # The report is the first positional argument of the single call.
            payload = sender.call_args[0][0]

        # Core fields
        assert payload["schema_version"] == 2
        assert payload["model"]["architecture"] == "LlamaForCausalLM"
        assert payload["method"] == "advanced"

        # Method config: check the keys were passed through
        method_cfg = payload["method_config"]
        assert method_cfg["n_directions"] == 4
        assert method_cfg["norm_preserve"] is True
        assert method_cfg["use_whitened_svd"] is True
        assert method_cfg["attention_head_surgery"] is True

        # Quality metrics
        quality = payload["quality_metrics"]
        assert quality["perplexity"] == 5.2
        assert quality["refusal_rate"] == 0.05

        # Stage durations
        assert "stage_durations" in payload
        durations = payload["stage_durations"]
        assert durations["summon"] == 3.0
        assert durations["verify"] == 8.3

        # Strong layers
        assert payload["strong_layers"] == [10, 11, 12, 13]

        # Direction stats
        assert "direction_stats" in payload
        for stat_key in ("direction_norms", "mean_direction_persistence"):
            assert stat_key in payload["direction_stats"]

        # Excise details
        assert "excise_details" in payload
        excise = payload["excise_details"]
        assert excise["modified_count"] == 128
        assert "head_surgery" in excise["used_techniques"]

        # Prompt counts
        prompt_counts = payload["prompt_counts"]
        assert prompt_counts["harmful"] == 33
        assert prompt_counts["harmless"] == 33

        # Environment
        for env_key in ("os", "python_version"):
            assert env_key in payload["environment"]
# ── Informed pipeline integration ────────────────────────────────────────
@dataclass
class _MockInsights:
detected_alignment_method: str = "DPO"
alignment_confidence: float = 0.92
alignment_probabilities: dict = field(default_factory=lambda: {"DPO": 0.92, "RLHF": 0.05})
cone_is_polyhedral: bool = True
cone_dimensionality: float = 3.2
mean_pairwise_cosine: float = 0.45
direction_specificity: dict = field(default_factory=lambda: {"violence": 0.8})
cluster_count: int = 3
direction_persistence: float = 0.87
mean_refusal_sparsity_index: float = 0.15
recommended_sparsity: float = 0.1
use_sparse_surgery: bool = True
estimated_robustness: str = "medium"
self_repair_estimate: float = 0.3
entanglement_score: float = 0.2
entangled_layers: list = field(default_factory=lambda: [15, 16])
clean_layers: list = field(default_factory=lambda: [10, 11, 12])
recommended_n_directions: int = 6
recommended_regularization: float = 0.05
recommended_refinement_passes: int = 3
recommended_layers: list = field(default_factory=lambda: [10, 11, 12, 13])
skip_layers: list = field(default_factory=lambda: [15])
@dataclass
class _MockInformedReport:
    """Test stand-in for an informed-pipeline report wrapping ``_MockInsights``."""

    # A fresh insights object per instance (default_factory, not a shared default).
    insights: _MockInsights = field(default_factory=_MockInsights)

    # Run-level figures the informed-report tests compare against.
    ouroboros_passes: int = 2
    final_refusal_rate: float = 0.02
    analysis_duration: float = 15.3
    total_duration: float = 85.7
class TestInformedPipelineIntegration:
    """End-to-end checks of ``maybe_send_informed_report`` and insight extraction."""

    def setup_method(self):
        """Start every test from a freshly reset telemetry state."""
        _reset_telemetry()

    def test_does_nothing_when_disabled(self):
        """No report is sent while telemetry is disabled."""
        disable_telemetry()
        with patch("obliteratus.telemetry.send_report") as mock_send:
            maybe_send_informed_report(_make_mock_pipeline(), _MockInformedReport())
            mock_send.assert_not_called()

    def test_comprehensive_informed_report(self):
        """The informed report carries base fields, insights and informed extras."""
        enable_telemetry()
        p = _make_mock_pipeline()
        report_obj = _MockInformedReport()
        with patch("obliteratus.telemetry.send_report") as mock_send:
            maybe_send_informed_report(p, report_obj)
            mock_send.assert_called_once()
            # The report is the first positional argument of the single call.
            report = mock_send.call_args[0][0]

        # All base fields present
        assert report["schema_version"] == 2
        assert report["model"]["architecture"] == "LlamaForCausalLM"
        assert "direction_stats" in report
        assert "excise_details" in report

        # Analysis insights
        ai = report["analysis_insights"]
        assert ai["detected_alignment_method"] == "DPO"
        assert ai["alignment_confidence"] == 0.92
        assert ai["cone_is_polyhedral"] is True
        assert ai["cone_dimensionality"] == 3.2
        assert ai["cluster_count"] == 3
        assert ai["self_repair_estimate"] == 0.3
        assert ai["entanglement_score"] == 0.2
        assert ai["recommended_n_directions"] == 6

        # Informed extras
        inf = report["informed"]
        assert inf["ouroboros_passes"] == 2
        assert inf["final_refusal_rate"] == 0.02
        assert inf["analysis_duration"] == 15.3
        assert inf["total_duration"] == 85.7

    def test_analysis_insights_filter_unknown_keys(self):
        """Fields the mock insights class does not define are dropped on extraction."""
        enable_telemetry()
        # The previous version also built a mock pipeline here and discarded it;
        # this test never uses a pipeline, so that dead call was removed.

        @dataclass
        class _BadInsights(_MockInsights):
            secret_sauce: str = "should not appear"

        report_obj = _MockInformedReport(insights=_BadInsights())
        insights = _extract_analysis_insights(report_obj)
        assert "detected_alignment_method" in insights
        assert "secret_sauce" not in insights
# ── Stage duration tracking on pipeline ──────────────────────────────────
class TestStageDurationTracking:
    """Checks that ``AbliterationPipeline._emit`` records stage bookkeeping."""

    @staticmethod
    def _bare_pipeline():
        """Create a pipeline without running ``__init__`` and seed the fields set here."""
        from obliteratus.abliterate import AbliterationPipeline

        pipeline = AbliterationPipeline.__new__(AbliterationPipeline)
        pipeline._stage_durations = {}
        pipeline._excise_modified_count = None
        pipeline._on_stage = lambda r: None
        return pipeline

    def test_emit_records_durations(self):
        """Verify _emit stores durations in _stage_durations dict."""
        pipeline = self._bare_pipeline()
        pipeline._emit("summon", "done", "loaded", duration=3.5)
        pipeline._emit("probe", "done", "probed", duration=10.2)
        pipeline._emit("excise", "done", "excised", duration=2.1, modified_count=64)
        expected = {"summon": 3.5, "probe": 10.2, "excise": 2.1}
        assert pipeline._stage_durations == expected
        assert pipeline._excise_modified_count == 64

    def test_running_status_does_not_record(self):
        """Only 'done' status should record durations."""
        pipeline = self._bare_pipeline()
        pipeline._emit("summon", "running", "loading...", duration=0)
        assert pipeline._stage_durations == {}
# ── Storage helpers ──────────────────────────────────────────────────────
class TestStorageHelpers:
    """Test persistent storage helper functions."""

    def test_test_writable_valid_dir(self):
        """A subdirectory path under a fresh temp dir is reported writable."""
        with tempfile.TemporaryDirectory() as scratch:
            candidate = Path(scratch) / "subdir"
            assert _test_writable(candidate)

    def test_test_writable_unwritable(self):
        """A path under /proc is reported as not writable."""
        # /proc is never writable for arbitrary files
        forbidden = Path("/proc/obliteratus_test")
        assert not _test_writable(forbidden)

    def test_is_mount_point_existing_path(self):
        """The helper returns a bool for an existing path."""
        # Should return a bool without raising for any existing path
        outcome = _is_mount_point(Path("/"))
        assert isinstance(outcome, bool)

    def test_is_mount_point_nonexistent(self):
        """A path that does not exist is not a mount point."""
        missing = Path("/nonexistent_dir_12345")
        assert not _is_mount_point(missing)

    def test_storage_diagnostic_returns_dict(self):
        """The diagnostic is a dict containing the expected keys."""
        diag = storage_diagnostic()
        assert isinstance(diag, dict)
        expected_keys = (
            "telemetry_dir",
            "is_persistent",
            "on_hf_spaces",
            "telemetry_enabled",
            "data_dir_exists",
        )
        for key in expected_keys:
            assert key in diag
# ── Hub restore ──────────────────────────────────────────────────────────
class TestHubRestore:
    """Test Hub-to-local restore functionality."""

    def setup_method(self):
        """Reset telemetry and clear the restore flag so each test can trigger it."""
        import obliteratus.telemetry as telemetry

        _reset_telemetry()
        telemetry._restore_done = False

    def test_restore_skips_when_no_repo(self):
        """With an empty repo name, restore reports zero records."""
        with patch("obliteratus.telemetry._TELEMETRY_REPO", ""):
            assert restore_from_hub() == 0

    def test_restore_deduplicates(self):
        """Records already in local JSONL should not be re-added."""
        import obliteratus.telemetry as telemetry

        already_local = {"session_id": "abc", "timestamp": "2025-01-01T00:00:00"}
        from_hub = [
            {"session_id": "abc", "timestamp": "2025-01-01T00:00:00"},  # duplicate
            {"session_id": "def", "timestamp": "2025-01-02T00:00:00"},  # new
        ]
        with tempfile.TemporaryDirectory() as workdir:
            jsonl_path = Path(workdir) / "telemetry.jsonl"
            jsonl_path.write_text(json.dumps(already_local) + "\n")
            telemetry._restore_done = False
            # patch.object swaps the module attributes in and puts the previous
            # values back on exit, even if an assertion below fails.
            with patch.object(telemetry, "TELEMETRY_FILE", jsonl_path), \
                    patch.object(telemetry, "_TELEMETRY_REPO", "test/repo"), \
                    patch("obliteratus.telemetry.fetch_hub_records", return_value=from_hub):
                restored = restore_from_hub()
                assert restored == 1  # Only the new record
                # Verify file contents
                written = jsonl_path.read_text().strip().split("\n")
                assert len(written) == 2  # original + 1 new

    def test_restore_only_runs_once(self):
        """Calling restore_from_hub() twice should be a no-op the second time."""
        import obliteratus.telemetry as telemetry

        telemetry._restore_done = False
        with patch("obliteratus.telemetry._TELEMETRY_REPO", "test/repo"), \
                patch("obliteratus.telemetry.fetch_hub_records", return_value=[]):
            restore_from_hub()
            # Second call should return 0 immediately
            assert restore_from_hub() == 0
+167
View File
@@ -0,0 +1,167 @@
"""Tests for visualization module (non-interactive, save-to-file)."""
from __future__ import annotations
import tempfile
from pathlib import Path
import pytest
import torch
from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer
from obliteratus.analysis.activation_probing import ActivationProbe
from obliteratus.analysis.visualization import (
_sanitize_label,
plot_refusal_topology,
plot_cross_layer_heatmap,
plot_angular_drift,
plot_probe_dashboard,
plot_defense_radar,
)
from obliteratus.analysis.defense_robustness import DefenseProfile
@pytest.fixture
def tmp_dir():
    """Yield a ``Path`` to a temporary directory that is removed after the test."""
    scratch = tempfile.TemporaryDirectory()
    with scratch as dirname:
        yield Path(dirname)
def _make_refusal_data(n_layers=6, hidden_dim=16):
"""Create test refusal directions and means."""
torch.manual_seed(42)
directions = {}
harmful_means = {}
harmless_means = {}
for i in range(n_layers):
d = torch.randn(hidden_dim)
directions[i] = d / d.norm()
base = torch.randn(hidden_dim)
harmless_means[i] = base.unsqueeze(0)
harmful_means[i] = (base + (2.0 if i in [2, 3, 4] else 0.3) * directions[i]).unsqueeze(0)
strong_layers = [2, 3, 4]
return directions, harmful_means, harmless_means, strong_layers
class TestRefusalTopology:
    """Smoke tests for ``plot_refusal_topology``."""

    def test_plot_saves_file(self, tmp_dir):
        """Plotting writes a non-empty file at the requested path."""
        dirs, harmful, harmless, strong_layers = _make_refusal_data()
        target = tmp_dir / "topology.png"
        plot_refusal_topology(dirs, harmful, harmless, strong_layers, output_path=target)
        assert target.exists()
        assert target.stat().st_size > 0

    def test_plot_returns_figure(self, tmp_dir):
        """Plotting returns something other than ``None``."""
        dirs, harmful, harmless, strong_layers = _make_refusal_data()
        target = tmp_dir / "test.png"
        figure = plot_refusal_topology(
            dirs, harmful, harmless, strong_layers, output_path=target
        )
        assert figure is not None
class TestCrossLayerHeatmap:
    """Smoke test for ``plot_cross_layer_heatmap``."""

    def test_plot_saves_file(self, tmp_dir):
        """A heatmap built from six seeded random directions is written to disk."""
        torch.manual_seed(42)
        per_layer = {}
        for layer in range(6):
            per_layer[layer] = torch.randn(16)
        analysis = CrossLayerAlignmentAnalyzer().analyze(per_layer)
        target = tmp_dir / "heatmap.png"
        plot_cross_layer_heatmap(analysis, output_path=target)
        assert target.exists()
class TestAngularDrift:
    """Smoke test for ``plot_angular_drift``."""

    def test_plot_saves_file(self, tmp_dir):
        """A drift plot built from eight seeded random directions is written to disk."""
        torch.manual_seed(42)
        per_layer = {}
        for layer in range(8):
            per_layer[layer] = torch.randn(16)
        analysis = CrossLayerAlignmentAnalyzer().analyze(per_layer)
        target = tmp_dir / "drift.png"
        plot_angular_drift(analysis, output_path=target)
        assert target.exists()
class TestProbeDashboard:
    """Smoke test for ``plot_probe_dashboard``."""

    def test_plot_saves_file(self, tmp_dir):
        """A dashboard built from seeded random activations is written to disk."""
        torch.manual_seed(42)
        layers = range(4)
        # Draw order (harmful, harmless, directions) is kept so seeded values match.
        harmful_acts = {layer: [torch.randn(8) for _ in range(3)] for layer in layers}
        harmless_acts = {layer: [torch.randn(8) for _ in range(3)] for layer in layers}
        layer_dirs = {layer: torch.randn(8) for layer in layers}
        probe_result = ActivationProbe().probe_all_layers(
            harmful_acts, harmless_acts, layer_dirs
        )
        target = tmp_dir / "probe.png"
        plot_probe_dashboard(probe_result, output_path=target)
        assert target.exists()
class TestDefenseRadar:
    """Tests for ``plot_defense_radar``."""

    @staticmethod
    def _profile(model_name):
        """Build a ``DefenseProfile`` that differs between tests only in its name."""
        return DefenseProfile(
            model_name=model_name,
            alignment_type_estimate="RLHF-like",
            refusal_concentration=0.4,
            refusal_layer_spread=5,
            mean_refusal_strength=2.0,
            max_refusal_strength=4.0,
            self_repair_estimate=0.6,
            entanglement_score=0.3,
            estimated_robustness="medium",
        )

    def test_plot_saves_file(self, tmp_dir):
        """The radar chart is written to the requested path."""
        target = tmp_dir / "radar.png"
        plot_defense_radar(self._profile("test-model"), output_path=target)
        assert target.exists()

    def test_model_name_sanitized_in_title(self, tmp_dir):
        """Ensure sensitive paths in model_name don't leak into saved charts."""
        leaky_name = "/home/user/.cache/huggingface/hub/models--secret-org/private-model"
        target = tmp_dir / "radar_sanitized.png"
        figure = plot_defense_radar(self._profile(leaky_name), output_path=target)
        # Title should not contain the full filesystem path
        shown_title = figure.axes[0].get_title()
        for fragment in ("/home/user", ".cache"):
            assert fragment not in shown_title
class TestSanitizeLabel:
    """Tests for the ``_sanitize_label`` helper."""

    def test_strips_absolute_paths(self):
        """The directory prefix is removed while the model name survives."""
        cleaned = _sanitize_label("/home/user/.cache/huggingface/models--org/model")
        assert "/home/user" not in cleaned
        assert "model" in cleaned

    def test_redacts_hf_tokens(self):
        """An ``hf_``-prefixed token is replaced by ``<TOKEN>``."""
        cleaned = _sanitize_label("model with hf_abcdefghij token")
        assert "hf_abcdefghij" not in cleaned
        assert "<TOKEN>" in cleaned

    def test_redacts_long_hex_strings(self):
        """A 40-character hex run is replaced by ``<REDACTED>``."""
        digest = "a" * 40
        cleaned = _sanitize_label(f"commit {digest}")
        assert digest not in cleaned
        assert "<REDACTED>" in cleaned

    def test_truncates_long_strings(self):
        """A 200-character label is cut to at most 80 characters ending in ``...``."""
        oversized = "x" * 200
        cleaned = _sanitize_label(oversized)
        assert len(cleaned) <= 80
        assert cleaned.endswith("...")

    def test_passes_normal_strings_through(self):
        """An ordinary title is returned unchanged."""
        plain = "Refusal Topology Map"
        assert _sanitize_label(plain) == plain