mirror of
https://github.com/elder-plinius/OBLITERATUS.git
synced 2026-06-06 22:34:02 +02:00
Add files via upload
This commit is contained in:
+1
-1
@@ -5,7 +5,7 @@ Thanks for your interest in contributing. This document covers everything you ne
|
||||
## Development Setup
|
||||
|
||||
```bash
|
||||
git clone https://github.com/obliteratus-project/OBLITERATUS.git
|
||||
git clone https://github.com/elder-plinius/OBLITERATUS.git
|
||||
cd OBLITERATUS
|
||||
pip install -e ".[dev]"
|
||||
```
|
||||
|
||||
@@ -28,7 +28,7 @@ short_description: "One-click model liberation + chat playground"
|
||||
<img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue" alt="Open in HF Spaces">
|
||||
</a>
|
||||
|
||||
<a href="https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb">
|
||||
<a href="https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab">
|
||||
</a>
|
||||
</p>
|
||||
@@ -55,7 +55,7 @@ Built on published research from [Arditi et al. (2024)](https://arxiv.org/abs/24
|
||||
obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --method advanced
|
||||
```
|
||||
|
||||
Or zero commands — just [open the Colab notebook](https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb) and hit Run All.
|
||||
Or zero commands — just [open the Colab notebook](https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb) and hit Run All.
|
||||
|
||||
## What it does
|
||||
|
||||
@@ -153,7 +153,7 @@ The `obliteratus ui` command adds a Rich terminal startup with GPU detection and
|
||||
|
||||
### 3. Google Colab (free GPU)
|
||||
|
||||
[](https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb)
|
||||
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb)
|
||||
|
||||
Pick a model from the dropdown, pick a method, hit Run All. Download the result or push straight to HuggingFace Hub. Works on the free T4 tier for models up to ~8B parameters.
|
||||
|
||||
@@ -545,7 +545,7 @@ If you use OBLITERATUS in your research, please cite:
|
||||
Refusal Removal in Large Language Models},
|
||||
author = {{OBLITERATUS Contributors}},
|
||||
year = {2026},
|
||||
url = {https://github.com/obliteratus-project/OBLITERATUS},
|
||||
url = {https://github.com/elder-plinius/OBLITERATUS},
|
||||
note = {15 analysis modules, 837 tests}
|
||||
}
|
||||
```
|
||||
@@ -565,7 +565,7 @@ pytest
|
||||
|
||||
- **Open source** — [GNU Affero General Public License v3.0](LICENSE) (AGPL-3.0). You can freely use, modify, and distribute OBLITERATUS under AGPL terms. If you run a modified version as a network service (SaaS), you must release your source code to users under the same license.
|
||||
|
||||
- **Commercial** — Organizations that cannot comply with AGPL obligations (e.g., proprietary SaaS, closed-source products, internal tools where source disclosure is not possible) can purchase a commercial license. Contact us via [GitHub Issues](https://github.com/obliteratus-project/OBLITERATUS/issues) for pricing and terms.
|
||||
- **Commercial** — Organizations that cannot comply with AGPL obligations (e.g., proprietary SaaS, closed-source products, internal tools where source disclosure is not possible) can purchase a commercial license. Contact us via [GitHub Issues](https://github.com/elder-plinius/OBLITERATUS/issues) for pricing and terms.
|
||||
|
||||
This is the same dual-licensing model used by MongoDB, Qt, Grafana, and others.
|
||||
|
||||
|
||||
+1
-1
@@ -11,7 +11,7 @@ OBLITERATUS is a mechanistic interpretability research tool. It removes refusal
|
||||
If you discover a security vulnerability in OBLITERATUS, please report it responsibly:
|
||||
|
||||
1. **Do not** open a public GitHub issue
|
||||
2. Open a [private security advisory](https://github.com/obliteratus-project/OBLITERATUS/security/advisories/new) with:
|
||||
2. Open a [private security advisory](https://github.com/elder-plinius/OBLITERATUS/security/advisories/new) with:
|
||||
- Description of the vulnerability
|
||||
- Steps to reproduce
|
||||
- Potential impact
|
||||
|
||||
@@ -115,6 +115,10 @@ _last_obliterated_label: str = ""
|
||||
# Counter for unique obliteration save directories
|
||||
_obliterate_counter: int = 0
|
||||
|
||||
# Flag to suppress session_model_dd.change when obliterate programmatically
|
||||
# sets the dropdown value (prevents wasteful GPU re-allocation on ZeroGPU)
|
||||
_skip_session_load: bool = False
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Model presets — 100+ models organized by provider
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -1459,7 +1463,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
||||
f" or locally: `export HF_TOKEN=hf_...`\n\n"
|
||||
f"Get your token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)\n\n"
|
||||
f"Alternatively, choose a non-gated model (those without the \U0001f512 icon).",
|
||||
"", gr.update(), gr.update(), gr.update(),
|
||||
"", gr.update(), gr.update(), gr.update(), gr.update(),
|
||||
)
|
||||
return
|
||||
|
||||
@@ -1468,14 +1472,14 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
||||
if not re.match(r'^[a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+$', push_to_hub):
|
||||
yield (
|
||||
"**Error:** Invalid Hub repo format. Use `username/model-name`.",
|
||||
"", gr.update(), gr.update(), gr.update(),
|
||||
"", gr.update(), gr.update(), gr.update(), gr.update(),
|
||||
)
|
||||
return
|
||||
if not os.environ.get("HF_TOKEN"):
|
||||
yield (
|
||||
"**Error:** HF_TOKEN not set. Push to Hub requires a write token. "
|
||||
"Set it via `export HF_TOKEN=hf_...` or in your Space secrets.",
|
||||
"", gr.update(), gr.update(), gr.update(),
|
||||
"", gr.update(), gr.update(), gr.update(), gr.update(),
|
||||
)
|
||||
return
|
||||
|
||||
@@ -1486,7 +1490,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
||||
_clear_gpu()
|
||||
with _lock:
|
||||
if _state["status"] == "obliterating":
|
||||
yield "**Error:** An obliteration is already in progress.", "", gr.update(), gr.update(), gr.update()
|
||||
yield "**Error:** An obliteration is already in progress.", "", gr.update(), gr.update(), gr.update(), gr.update()
|
||||
return
|
||||
_state["log"] = []
|
||||
_state["status"] = "obliterating"
|
||||
@@ -1638,9 +1642,9 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
||||
status_msg = f"**Obliterating\u2026** ({_elapsed()})"
|
||||
if len(log_lines) > last_yielded[0]:
|
||||
last_yielded[0] = len(log_lines)
|
||||
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
|
||||
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
|
||||
else:
|
||||
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
|
||||
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
|
||||
if time.time() - _pipeline_start > _max_pipeline_secs:
|
||||
log_lines.append("\nTIMEOUT: Pipeline exceeded 45-minute limit.")
|
||||
break
|
||||
@@ -1655,7 +1659,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
||||
err_msg = str(error_ref[0]) or repr(error_ref[0])
|
||||
log_lines.append(f"\nERROR: {err_msg}")
|
||||
_state["log"] = log_lines
|
||||
yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update()
|
||||
yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update(), gr.update()
|
||||
return
|
||||
|
||||
# Success — keep model in memory for chat.
|
||||
@@ -1757,7 +1761,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
||||
if bnb_available:
|
||||
log_lines.append("\nModel too large for chat at float16 — reloading in 4-bit...")
|
||||
last_yielded[0] = len(log_lines)
|
||||
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
|
||||
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
|
||||
try:
|
||||
from transformers import BitsAndBytesConfig
|
||||
bnb_cfg = BitsAndBytesConfig(
|
||||
@@ -1804,7 +1808,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
||||
else "Falling back to CPU offload..."
|
||||
)
|
||||
last_yielded[0] = len(log_lines)
|
||||
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
|
||||
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
|
||||
try:
|
||||
offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
|
||||
model_reloaded = AutoModelForCausalLM.from_pretrained(
|
||||
@@ -1861,13 +1865,21 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
||||
f"**{model_choice}** liberated with `{method}` method. "
|
||||
f"Saved to `{save_dir}`. Chat requires a larger GPU."
|
||||
)
|
||||
# Update session dropdown directly (don't rely on .then() which can
|
||||
# fail to fire on ZeroGPU after generator teardown)
|
||||
# Update BOTH session dropdowns directly (don't rely on .then() which
|
||||
# fails to fire on ZeroGPU after generator teardown).
|
||||
# Set skip flag so the .change handler doesn't trigger a wasteful
|
||||
# GPU re-allocation — the model is already loaded.
|
||||
global _skip_session_load
|
||||
_skip_session_load = True
|
||||
_dd_update = gr.update(
|
||||
choices=_get_session_model_choices(),
|
||||
value=_last_obliterated_label or None,
|
||||
)
|
||||
yield status_msg, "\n".join(log_lines), get_chat_header(), _dd_update, metrics_card
|
||||
_ab_dd_update = gr.update(
|
||||
choices=_get_session_model_choices(),
|
||||
value=_last_obliterated_label or None,
|
||||
)
|
||||
yield status_msg, "\n".join(log_lines), get_chat_header(), _dd_update, metrics_card, _ab_dd_update
|
||||
|
||||
except Exception as e:
|
||||
# Ensure status never gets stuck on "obliterating"
|
||||
@@ -1876,7 +1888,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
||||
err_msg = str(e) or repr(e)
|
||||
log_lines.append(f"\nERROR (post-pipeline): {err_msg}")
|
||||
_state["log"] = log_lines
|
||||
yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update()
|
||||
yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update(), gr.update()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -2102,6 +2114,18 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
|
||||
|
||||
On ZeroGPU, uses the visitor's GPU quota.
|
||||
"""
|
||||
# Skip if the obliterate function just set the dropdown value — the model
|
||||
# is already loaded and we'd just waste GPU quota re-allocating.
|
||||
global _skip_session_load
|
||||
if _skip_session_load:
|
||||
_skip_session_load = False
|
||||
if choice and _state.get("status") == "ready":
|
||||
yield (
|
||||
f"**Ready!** `{choice}` is loaded — just type in the chat below.",
|
||||
get_chat_header(),
|
||||
)
|
||||
return
|
||||
|
||||
if not choice or choice not in _bench_configs:
|
||||
yield "**Error:** No benchmark result selected. Pick a model from the dropdown first.", ""
|
||||
return
|
||||
@@ -3727,6 +3751,7 @@ Pre-configured benchmark configurations for common research questions.
|
||||
choices=_get_session_model_choices(),
|
||||
label="Cached Models",
|
||||
info="Select a model to auto-load it for chat",
|
||||
allow_custom_value=True,
|
||||
)
|
||||
session_load_status = gr.Markdown("")
|
||||
|
||||
@@ -3779,6 +3804,7 @@ See exactly how abliteration changes model behavior on the same prompt.
|
||||
choices=_get_session_model_choices(),
|
||||
label="Cached Models",
|
||||
info="Select a model to auto-load it for A/B comparison",
|
||||
allow_custom_value=True,
|
||||
)
|
||||
ab_session_load_status = gr.Markdown("")
|
||||
|
||||
@@ -4125,8 +4151,8 @@ Built on the shoulders of:
|
||||
|
||||
### Links
|
||||
|
||||
- [GitHub](https://github.com/obliteratus-project/OBLITERATUS)
|
||||
- [Paper](https://github.com/obliteratus-project/OBLITERATUS/tree/main/paper)
|
||||
- [GitHub](https://github.com/elder-plinius/OBLITERATUS)
|
||||
- [Paper](https://github.com/elder-plinius/OBLITERATUS/tree/main/paper)
|
||||
""")
|
||||
|
||||
# Wire method dropdown → auto-update advanced settings
|
||||
@@ -4192,28 +4218,27 @@ Built on the shoulders of:
|
||||
).then(fn=_get_vram_html, outputs=[vram_display])
|
||||
|
||||
# Wire obliterate button (after all tabs so chat_status is defined)
|
||||
# session_model_dd is a direct output (4th) so the dropdown updates
|
||||
# reliably even on ZeroGPU where .then() may not fire after generator teardown.
|
||||
# Both session_model_dd (4th) and ab_session_model_dd (6th) are direct
|
||||
# outputs so the dropdowns update reliably even on ZeroGPU where .then()
|
||||
# may not fire after generator teardown.
|
||||
obliterate_btn.click(
|
||||
fn=obliterate,
|
||||
inputs=[model_dd, method_dd, hub_repo, prompt_vol_dd, dataset_dd,
|
||||
custom_harmful_tb, custom_harmless_tb] + _adv_controls,
|
||||
outputs=[status_md, log_box, chat_status, session_model_dd, metrics_md],
|
||||
outputs=[status_md, log_box, chat_status, session_model_dd, metrics_md, ab_session_model_dd],
|
||||
).then(
|
||||
fn=lambda: (
|
||||
gr.update(choices=_get_session_model_choices()),
|
||||
_get_vram_html(),
|
||||
),
|
||||
outputs=[ab_session_model_dd, vram_display],
|
||||
fn=lambda: _get_vram_html(),
|
||||
outputs=[vram_display],
|
||||
)
|
||||
|
||||
# Wire session model auto-loading (Chat tab dropdown change)
|
||||
# Always pass choices + value together so ZeroGPU doesn't hit stale choices
|
||||
session_model_dd.change(
|
||||
fn=load_bench_into_chat,
|
||||
inputs=[session_model_dd],
|
||||
outputs=[session_load_status, chat_status],
|
||||
).then(
|
||||
fn=lambda v: (gr.update(value=v), _get_vram_html()),
|
||||
fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
|
||||
inputs=[session_model_dd],
|
||||
outputs=[ab_session_model_dd, vram_display],
|
||||
)
|
||||
@@ -4224,7 +4249,7 @@ Built on the shoulders of:
|
||||
inputs=[ab_session_model_dd],
|
||||
outputs=[ab_session_load_status, chat_status],
|
||||
).then(
|
||||
fn=lambda v: (gr.update(value=v), _get_vram_html()),
|
||||
fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
|
||||
inputs=[ab_session_model_dd],
|
||||
outputs=[session_model_dd, vram_display],
|
||||
)
|
||||
|
||||
+3
-3
@@ -1095,7 +1095,7 @@
|
||||
<h2>> Quickstart: Free a Model</h2>
|
||||
<div style="background:#000; padding:16px; border:1px solid var(--border); margin-top:12px; line-height:2; font-size:0.78rem;">
|
||||
<span style="color:var(--text-dim)"># 1. get the liberation toolkit</span><br>
|
||||
<span style="color:var(--accent)">$</span> git clone https://github.com/obliteratus-project/OBLITERATUS<br>
|
||||
<span style="color:var(--accent)">$</span> git clone https://github.com/elder-plinius/OBLITERATUS<br>
|
||||
<span style="color:var(--accent)">$</span> cd OBLITERATUS<br>
|
||||
<span style="color:var(--accent)">$</span> pip install -e .<br><br>
|
||||
<span style="color:var(--text-dim)"># 2. interactive mode (guided liberation)</span><br>
|
||||
@@ -1154,7 +1154,7 @@
|
||||
<div style="margin-bottom:16px; padding:12px; border-left:3px solid var(--yellow); background:rgba(255,183,0,0.03)">
|
||||
<h4 style="color:var(--yellow); font-size:0.82rem">Concept Cone Geometry <span style="font-size:0.65rem; color:var(--red)">[NOVEL]</span></h4>
|
||||
<p style="color:var(--text-dim); font-size:0.75rem; margin-top:4px">
|
||||
Analyzes whether different harm categories (weapons, cyber, drugs, etc.) share a single refusal direction or have distinct mechanisms. Computes cone solid angles, Direction Specificity Index, and polyhedral classification. Based on Gurnee & Nanda (ICML 2025) with novel extensions.
|
||||
Analyzes whether different harm categories (weapons, cyber, drugs, etc.) share a single refusal direction or have distinct mechanisms. Computes cone solid angles, Direction Specificity Index, and polyhedral classification. Based on Wollschläger et al. (ICML 2025) with novel extensions.
|
||||
</p>
|
||||
</div>
|
||||
<div style="margin-bottom:16px; padding:12px; border-left:3px solid var(--yellow); background:rgba(255,183,0,0.03)">
|
||||
@@ -1397,7 +1397,7 @@
|
||||
<div style="margin-bottom:16px; padding:16px; background:linear-gradient(135deg, rgba(249,171,0,0.08), rgba(249,171,0,0.02)); border:1px solid rgba(249,171,0,0.3); border-radius:6px">
|
||||
<div style="font-size:0.82rem; font-weight:700; color:var(--yellow); margin-bottom:8px; letter-spacing:0.5px">▸ COLAB NOTEBOOK</div>
|
||||
<div style="display:flex; align-items:center; gap:12px; flex-wrap:wrap">
|
||||
<a id="colab-link" href="https://colab.research.google.com/github/obliteratus-project/OBLITERATUS/blob/main/notebooks/abliterate.ipynb" target="_blank" rel="noopener"
|
||||
<a id="colab-link" href="https://colab.research.google.com/github/elder-plinius/OBLITERATUS/blob/main/notebooks/abliterate.ipynb" target="_blank" rel="noopener"
|
||||
style="display:inline-flex; align-items:center; gap:8px; background:#f9ab00; color:#000; padding:10px 20px; font-weight:700; font-size:0.85rem; text-decoration:none; border-radius:4px; letter-spacing:0.5px; font-family:'Fira Code',monospace">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="" style="height:20px; vertical-align:middle">
|
||||
OPEN IN COLAB
|
||||
|
||||
+3
-3
@@ -50,7 +50,7 @@ Logged-in HuggingFace users get free GPU quota. For more quota, upgrade to [HF P
|
||||
## Run locally (same UI, your own GPU)
|
||||
|
||||
```bash
|
||||
git clone https://github.com/obliteratus-project/OBLITERATUS
|
||||
git clone https://github.com/elder-plinius/OBLITERATUS
|
||||
cd OBLITERATUS
|
||||
pip install -e ".[spaces]"
|
||||
|
||||
@@ -73,5 +73,5 @@ No GPU hardware selection needed — ZeroGPU handles allocation automatically.
|
||||
|
||||
## Links
|
||||
|
||||
- [GitHub](https://github.com/obliteratus-project/OBLITERATUS)
|
||||
- [Paper](https://github.com/obliteratus-project/OBLITERATUS/tree/main/paper)
|
||||
- [GitHub](https://github.com/elder-plinius/OBLITERATUS)
|
||||
- [Paper](https://github.com/elder-plinius/OBLITERATUS/tree/main/paper)
|
||||
|
||||
@@ -53,7 +53,7 @@
|
||||
"id": "install"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": "!pip install -q git+https://github.com/obliteratus-project/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1024**3:.1f} GB\")"
|
||||
"source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\")"
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
|
||||
@@ -4010,6 +4010,11 @@ class AbliterationPipeline:
|
||||
f"Projecting packed quantized data would silently corrupt the model. "
|
||||
f"Original error: {e}"
|
||||
)
|
||||
# Some architectures store weights as non-float types (e.g. uint8 from
|
||||
# custom quantization schemes). Projections require float math, so
|
||||
# convert and treat as "quantized" so the caller writes back properly.
|
||||
if not weight.data.is_floating_point():
|
||||
return weight.data.to(torch.float32), True
|
||||
return weight.data, False
|
||||
|
||||
@staticmethod
|
||||
@@ -4049,10 +4054,20 @@ class AbliterationPipeline:
|
||||
)
|
||||
return
|
||||
|
||||
# ── Non-float weight (e.g. uint8 from custom quantization) ─────
|
||||
# If the original weight isn't a bitsandbytes/GPTQ/AWQ param, just
|
||||
# replace with the float version so projections are preserved.
|
||||
weight = proj_module.weight
|
||||
if not AbliterationPipeline._is_quantized_param(weight):
|
||||
proj_module.weight = nn.Parameter(
|
||||
W_modified.to(device=weight.device),
|
||||
requires_grad=weight.requires_grad,
|
||||
)
|
||||
return
|
||||
|
||||
# ── bitsandbytes re-quantization ──────────────────────────
|
||||
try:
|
||||
import bitsandbytes as bnb
|
||||
weight = proj_module.weight
|
||||
quantized, new_state = bnb.functional.quantize_4bit(
|
||||
W_modified.to(weight.device),
|
||||
quant_type=getattr(weight, "quant_type", "nf4"),
|
||||
@@ -4087,7 +4102,8 @@ class AbliterationPipeline:
|
||||
norms: dict[str, float] = {}
|
||||
for param_name, param in layer.named_parameters():
|
||||
if param_name.endswith(".weight"):
|
||||
norms[param_name] = param.data.norm().item()
|
||||
data = param.data.float() if not param.data.is_floating_point() else param.data
|
||||
norms[param_name] = data.norm().item()
|
||||
return norms
|
||||
|
||||
@staticmethod
|
||||
@@ -4106,7 +4122,8 @@ class AbliterationPipeline:
|
||||
continue
|
||||
original_norm = saved_norms[param_name]
|
||||
if original_norm > 0:
|
||||
new_norm = param.data.norm().item()
|
||||
data = param.data.float() if not param.data.is_floating_point() else param.data
|
||||
new_norm = data.norm().item()
|
||||
if math.isnan(new_norm) or math.isinf(new_norm) or new_norm == 0:
|
||||
continue # Skip — weight is degenerate after projection
|
||||
if abs(new_norm - original_norm) > 1e-6:
|
||||
@@ -4294,6 +4311,10 @@ class AbliterationPipeline:
|
||||
continue
|
||||
else:
|
||||
data = param.data
|
||||
# Non-float (e.g. uint8) fused params need float conversion
|
||||
if not data.is_floating_point():
|
||||
data = data.float()
|
||||
is_quantized = True # ensure write-back replaces param
|
||||
|
||||
if data.dim() < 3:
|
||||
continue
|
||||
|
||||
+3
-3
@@ -38,9 +38,9 @@ dependencies = [
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
"Homepage" = "https://github.com/obliteratus-project/OBLITERATUS"
|
||||
"Repository" = "https://github.com/obliteratus-project/OBLITERATUS"
|
||||
"Bug Tracker" = "https://github.com/obliteratus-project/OBLITERATUS/issues"
|
||||
"Homepage" = "https://github.com/elder-plinius/OBLITERATUS"
|
||||
"Repository" = "https://github.com/elder-plinius/OBLITERATUS"
|
||||
"Bug Tracker" = "https://github.com/elder-plinius/OBLITERATUS/issues"
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = ["pytest>=7.0", "pytest-cov", "ruff", "mypy"]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,302 @@
|
||||
"""Extended tests for novel abliteration pipeline features.
|
||||
|
||||
Tests the new capabilities added to the OBLITERATUS abliteration pipeline:
|
||||
- Bias projection
|
||||
- Chat template wrapping
|
||||
- Method presets with new parameters
|
||||
- True iterative refinement
|
||||
- Whitened SVD integration
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import torch
|
||||
from transformers import GPT2Config, GPT2LMHeadModel
|
||||
|
||||
from obliteratus.abliterate import (
|
||||
METHODS,
|
||||
AbliterationPipeline,
|
||||
)
|
||||
from obliteratus.models.loader import ModelHandle
|
||||
|
||||
|
||||
def _make_tiny_handle():
    """Return a ``ModelHandle`` wrapping a very small GPT-2 and a mocked tokenizer.

    The model is instantiated directly from a ``GPT2Config`` (4 layers,
    64-dim embeddings, 2 heads, 1000-token vocabulary) and put in eval mode.
    The tokenizer is a ``MagicMock`` that always returns the same canned
    encoding (10 tokens) and always decodes to one fixed sentence.
    ``handle.snapshot()`` is called before the handle is returned.
    """
    tiny_cfg = GPT2Config(
        vocab_size=1000,
        n_positions=128,
        n_embd=64,
        n_layer=4,
        n_head=2,
        n_inner=256,
    )
    # Build the network first so RNG consumption order matches the canned
    # encoding drawn below.
    net = GPT2LMHeadModel(tiny_cfg).eval()

    canned_encoding = {
        "input_ids": torch.randint(0, 1000, (1, 10)),
        "attention_mask": torch.ones(1, 10, dtype=torch.long),
    }
    fake_tok = MagicMock(
        pad_token="<pad>",
        eos_token="<eos>",
        return_value=canned_encoding,
    )
    fake_tok.decode.return_value = "The capital of France is Paris, a beautiful city"

    tiny_handle = ModelHandle(
        model=net,
        tokenizer=fake_tok,
        config=tiny_cfg,
        model_name="gpt2-test",
        task="causal_lm",
    )
    # NOTE(review): presumably records the pristine weights for later
    # restore -- confirm against ModelHandle.snapshot.
    tiny_handle.snapshot()
    return tiny_handle
|
||||
|
||||
|
||||
def _make_varied_tokenizer(handle):
    """Install a tokenizer ``side_effect`` on *handle* that varies per call.

    Each invocation bumps an internal call counter, reseeds the global torch
    RNG with that counter, and returns a fresh encoding dict holding
    ``input_ids`` of shape ``(1, 5)`` drawn from ``[0, 1000)`` plus an
    all-ones ``attention_mask`` of the same shape.
    """
    n_calls = 0

    def mock_tokenizer(prompt, **kwargs):
        nonlocal n_calls
        n_calls += 1
        # Seeding with the call index keeps every call reproducible while
        # still giving each call its own token ids.
        torch.manual_seed(n_calls)
        token_ids = torch.randint(0, 1000, (1, 5))
        mask = torch.ones(1, 5, dtype=torch.long)
        return {"input_ids": token_ids, "attention_mask": mask}

    handle.tokenizer.side_effect = mock_tokenizer
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# New method preset parameters
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestNewMethodPresets:
    """Check the newer feature flags carried by the built-in ``METHODS`` presets."""

    def test_basic_has_new_params(self):
        """``basic`` defines all four new keys; bias projection and chat template are off."""
        preset = METHODS["basic"]
        for key in (
            "project_biases",
            "use_chat_template",
            "use_whitened_svd",
            "true_iterative_refinement",
        ):
            assert key in preset
        assert preset["project_biases"] is False
        assert preset["use_chat_template"] is False

    def test_advanced_has_new_params(self):
        """``advanced`` enables bias projection and chat template only."""
        preset = METHODS["advanced"]
        expected_flags = {
            "project_biases": True,
            "use_chat_template": True,
            "use_whitened_svd": False,
            "true_iterative_refinement": False,
        }
        for key, expected in expected_flags.items():
            assert preset[key] is expected

    def test_aggressive_has_new_params(self):
        """``aggressive`` enables every new flag."""
        preset = METHODS["aggressive"]
        for key in (
            "project_biases",
            "use_chat_template",
            "use_whitened_svd",
            "true_iterative_refinement",
        ):
            assert preset[key] is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pipeline initialization with new parameters
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestNewPipelineInit:
    """Check how ``AbliterationPipeline.__init__`` resolves the new feature flags."""

    @staticmethod
    def _assert_flags(pipeline, *, biases, chat, whitened, iterative):
        """Assert the four new boolean attributes on *pipeline* by identity."""
        assert pipeline.project_biases is biases
        assert pipeline.use_chat_template is chat
        assert pipeline.use_whitened_svd is whitened
        assert pipeline.true_iterative_refinement is iterative

    def test_default_new_params(self):
        """With no ``method`` argument the flags match the advanced-method defaults."""
        self._assert_flags(
            AbliterationPipeline(model_name="test-model"),
            biases=True,
            chat=True,
            whitened=False,
            iterative=False,
        )

    def test_basic_method_new_params(self):
        """``method="basic"`` leaves every new flag off."""
        self._assert_flags(
            AbliterationPipeline(model_name="test-model", method="basic"),
            biases=False,
            chat=False,
            whitened=False,
            iterative=False,
        )

    def test_aggressive_method_new_params(self):
        """``method="aggressive"`` turns every new flag on."""
        self._assert_flags(
            AbliterationPipeline(model_name="test-model", method="aggressive"),
            biases=True,
            chat=True,
            whitened=True,
            iterative=True,
        )

    def test_explicit_overrides_new_params(self):
        """Explicit keyword arguments win over the ``basic`` preset values."""
        overridden = AbliterationPipeline(
            model_name="test-model",
            method="basic",
            project_biases=True,
            use_chat_template=True,
            use_whitened_svd=True,
            true_iterative_refinement=True,
        )
        self._assert_flags(
            overridden,
            biases=True,
            chat=True,
            whitened=True,
            iterative=True,
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bias projection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestBiasProjection:
    """Exercise ``AbliterationPipeline._project_bias`` on tiny hand-built modules."""

    @staticmethod
    def _wrap(attr_name, *, bias):
        """Return a module holding one ``Linear(4, 4)`` under attribute *attr_name*."""

        class Wrapper(torch.nn.Module):
            def __init__(self):
                super().__init__()
                setattr(self, attr_name, torch.nn.Linear(4, 4, bias=bias))

        return Wrapper()

    def test_project_bias_removes_component(self):
        """Bias projection should remove refusal direction component from bias."""
        module = self._wrap("o_proj", bias=True)
        torch.manual_seed(42)
        module.o_proj.bias.data = torch.tensor([1.0, 2.0, 3.0, 4.0])

        # Unit vector along dim 0, shaped (4, 1).
        direction = torch.tensor([1.0, 0.0, 0.0, 0.0]).unsqueeze(-1)

        n_projected = AbliterationPipeline._project_bias(module, direction, ["o_proj"])
        assert n_projected == 1

        # The bias started with 1.0 along the direction; it should now be ~0.
        bias_after = module.o_proj.bias.data
        leftover = (bias_after @ direction.squeeze()).item()
        assert abs(leftover) < 1e-5

        # Components orthogonal to the direction must be left alone.
        for idx, untouched in enumerate((2.0, 3.0, 4.0), start=1):
            assert abs(bias_after[idx].item() - untouched) < 1e-5

    def test_project_bias_no_bias(self):
        """Should handle modules without bias gracefully."""
        module = self._wrap("o_proj", bias=False)
        direction = torch.randn(4, 1)
        assert AbliterationPipeline._project_bias(module, direction, ["o_proj"]) == 0

    def test_project_bias_no_matching_module(self):
        """Should return 0 when no candidate names match."""
        module = self._wrap("something", bias=True)
        direction = torch.randn(4, 1)
        assert AbliterationPipeline._project_bias(module, direction, ["o_proj"]) == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Chat template wrapping
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestChatTemplate:
    """Exercise ``AbliterationPipeline._maybe_apply_chat_template``."""

    @staticmethod
    def _attach_tokenizer(pipeline, tokenizer):
        """Give *pipeline* a mocked handle exposing *tokenizer* and a no-op logger."""
        pipeline.handle = MagicMock(tokenizer=tokenizer)
        pipeline._on_log = lambda m: None

    def test_no_wrap_when_disabled(self):
        """Should not wrap prompts when use_chat_template is False."""
        raw_prompts = ["Hello", "World"]
        pipeline = AbliterationPipeline(
            model_name="test-model",
            method="basic",
            use_chat_template=False,
        )
        assert pipeline._maybe_apply_chat_template(raw_prompts) == raw_prompts

    def test_no_wrap_without_handle(self):
        """Should return raw prompts when handle is not set."""
        raw_prompts = ["Hello"]
        pipeline = AbliterationPipeline(
            model_name="test-model",
            use_chat_template=True,
        )
        assert pipeline._maybe_apply_chat_template(raw_prompts) == raw_prompts

    def test_wraps_with_template(self):
        """Should wrap prompts when tokenizer has apply_chat_template."""
        pipeline = AbliterationPipeline(
            model_name="test-model",
            use_chat_template=True,
        )

        def mock_apply(messages, tokenize=False, add_generation_prompt=True):
            return f"<user>{messages[0]['content']}</user><assistant>"

        tokenizer = MagicMock()
        tokenizer.apply_chat_template = mock_apply
        self._attach_tokenizer(pipeline, tokenizer)

        wrapped = pipeline._maybe_apply_chat_template(["Hello"])
        assert "<user>Hello</user>" in wrapped[0]

    def test_fallback_when_no_template(self):
        """Should fall back to raw prompts when template is not configured."""
        pipeline = AbliterationPipeline(
            model_name="test-model",
            use_chat_template=True,
        )
        tokenizer = MagicMock()
        # Any call to apply_chat_template raises, simulating a missing template.
        tokenizer.apply_chat_template.side_effect = Exception("No template")
        self._attach_tokenizer(pipeline, tokenizer)

        assert pipeline._maybe_apply_chat_template(["Hello"]) == ["Hello"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Metadata includes new fields
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestMetadata:
    """Checks on the metadata JSON written by ``AbliterationPipeline._rebirth``."""

    def test_rebirth_includes_new_config(self):
        """The ``method_config`` section carries the newer options and the reference list is populated."""
        import json

        tiny = _make_tiny_handle()
        pipe = AbliterationPipeline(model_name="test-model", method="aggressive")
        pipe.handle = tiny
        pipe._on_log = lambda m: None
        pipe._on_stage = lambda r: None
        pipe._strong_layers = [0]
        pipe._quality_metrics = {"perplexity": 8.5, "coherence": 1.0}

        # Replace the save calls with mocks so they perform no real serialisation.
        tiny.model.save_pretrained = MagicMock()
        tiny.tokenizer.save_pretrained = MagicMock()

        import tempfile
        from pathlib import Path

        with tempfile.TemporaryDirectory() as scratch:
            pipe.output_dir = Path(scratch) / "output"
            pipe._rebirth()

            # Must be read while the temporary directory still exists.
            meta_file = pipe.output_dir / "abliteration_metadata.json"
            metadata = json.loads(meta_file.read_text())
            cfg = metadata["method_config"]

            expected_keys = (
                "project_biases",
                "use_chat_template",
                "use_whitened_svd",
                "true_iterative_refinement",
            )
            for key in expected_keys:
                assert key in cfg
            assert cfg["project_biases"] is True
            assert cfg["use_whitened_svd"] is True

            # Should have more references now
            refs = metadata["references"]
            assert len(refs) >= 5
            assert any("OBLITERATUS" in r for r in refs)
|
||||
@@ -0,0 +1,300 @@
|
||||
"""Mathematical verification that abliteration actually removes refusal directions.
|
||||
|
||||
These tests verify the core linear algebra claims WITHOUT mocks:
|
||||
1. Projection removes the target direction from weight matrices
|
||||
2. Norm-preserving projection maintains weight magnitude
|
||||
3. Multi-direction SVD extracts the correct subspace
|
||||
4. Whitened SVD produces orthogonal directions
|
||||
5. Random directions do NOT have the same effect (negative control)
|
||||
|
||||
Unlike the other test files, these use real tensors and verify mathematical
|
||||
properties directly — no MagicMock, no mocked tokenizers.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
class TestProjectionRemovesDirection:
    """Verify that orthogonal projection removes the target direction.

    Every test seeds the global RNG with 42 before drawing tensors, so the
    checks are deterministic. The shared fixture and the rank-1 projection
    are factored into private helpers instead of being repeated per test.
    """

    @staticmethod
    def _seeded_weight_and_direction(hidden=256, out_dim=128):
        """Seed the RNG, then draw ``W`` of shape (out_dim, hidden) and a unit vector ``d`` of length hidden."""
        torch.manual_seed(42)
        W = torch.randn(out_dim, hidden)
        d = torch.randn(hidden)
        return W, d / d.norm()

    @staticmethod
    def _project_out(W, d):
        """Return ``W - (W d) d^T``; for unit-norm ``d`` this removes the component of each row along ``d``."""
        coeff = W @ d  # (out_dim,)
        return W - coeff.unsqueeze(1) * d.unsqueeze(0)

    def test_single_direction_projection(self):
        """After projecting out direction d from weight W,
        W_proj @ d should be approximately zero."""
        W, d = self._seeded_weight_and_direction()

        W_proj = self._project_out(W, d)

        # Verify: W_proj @ d should be ~0
        residual = W_proj @ d
        assert residual.abs().max().item() < 1e-5, f"Residual too large: {residual.abs().max()}"

    def test_projection_preserves_orthogonal_components(self):
        """Projection should NOT change components orthogonal to d."""
        W, d = self._seeded_weight_and_direction()

        # Create a unit vector orthogonal to d (one Gram-Schmidt step).
        v = torch.randn(d.shape[0])
        v = v - (v @ d) * d
        v = v / v.norm()

        W_proj = self._project_out(W, d)

        # W @ v should equal W_proj @ v (orthogonal component unchanged)
        diff = (W @ v - W_proj @ v).abs().max().item()
        assert diff < 1e-5, f"Orthogonal component changed by {diff}"

    def test_multi_direction_subspace_removal(self):
        """Projecting out a k-dimensional subspace should remove all k directions."""
        torch.manual_seed(42)
        hidden = 256
        out_dim = 128
        k = 4

        W = torch.randn(out_dim, hidden)
        # Orthonormal basis of a random k-dimensional subspace (columns of Q).
        Q, _ = torch.linalg.qr(torch.randn(hidden, k))
        subspace = Q.T  # (k, hidden)

        # Project out subspace: W_proj = W - W @ Q @ Q^T
        W_proj = W - (W @ Q) @ Q.T

        # Verify: W_proj @ subspace^T should be ~0 for all directions
        residual = W_proj @ subspace.T  # (out_dim, k)
        assert residual.abs().max().item() < 1e-5, f"Subspace residual: {residual.abs().max()}"

    def test_double_projection_is_idempotent(self):
        """Projecting twice should give the same result as projecting once."""
        W, d = self._seeded_weight_and_direction()

        W1 = self._project_out(W, d)
        W2 = self._project_out(W1, d)

        diff = (W1 - W2).abs().max().item()
        assert diff < 1e-5, f"Second projection changed weights by {diff}"
|
||||
|
||||
|
||||
class TestNormPreservation:
    """Verify that norm-preserving projection maintains weight magnitude."""

    def test_norm_preserving_projection(self):
        """Per-row rescaling after projection restores each row norm and keeps d removed.

        Each row of the projected matrix is multiplied by a scalar, and a
        scalar multiple of a vector orthogonal to ``d`` is still orthogonal
        to ``d``. The residual along ``d`` must therefore stay at numerical
        zero, not merely shrink.
        """
        torch.manual_seed(42)
        hidden = 256
        out_dim = 128

        W = torch.randn(out_dim, hidden)
        d = torch.randn(hidden)
        d = d / d.norm()

        # Standard projection
        proj_coeff = W @ d
        W_proj = W - proj_coeff.unsqueeze(1) * d.unsqueeze(0)

        # Norm-preserving rescaling (per-row); clamp guards against division by zero.
        row_norms_orig = W.norm(dim=1, keepdim=True).clamp(min=1e-8)
        row_norms_proj = W_proj.norm(dim=1, keepdim=True).clamp(min=1e-8)
        W_norm_preserved = W_proj * (row_norms_orig / row_norms_proj)

        # Direction is still removed: row scaling cannot reintroduce a d-component.
        residual = W_norm_preserved @ d
        max_residual = residual.abs().max().item()
        assert max_residual < 1e-4, f"Residual along d after rescaling: {max_residual}"

        # Weaker relative check kept from the original test.
        original_proj = (W @ d).abs().mean().item()
        preserved_proj = residual.abs().mean().item()
        assert preserved_proj < original_proj * 0.5, \
            f"Norm-preserved projection {preserved_proj} not much less than original {original_proj}"

        # Row norms are preserved
        row_diff = (W_norm_preserved.norm(dim=1) - W.norm(dim=1)).abs().max().item()
        assert row_diff < 1e-5, f"Row norms changed by {row_diff}"
|
||||
|
||||
|
||||
class TestSVDDirectionExtraction:
    """Verify that SVD on the difference matrix extracts the refusal direction."""

    def test_planted_direction_recovery(self):
        """Plant a known direction in the difference and verify SVD recovers it."""
        torch.manual_seed(42)
        n_samples = 50
        hidden = 256

        # Plant a known refusal direction
        true_direction = torch.randn(hidden)
        true_direction = true_direction / true_direction.norm()

        # Harmful activations = harmless + signal along true_direction + noise
        harmless = torch.randn(n_samples, hidden) * 0.5
        signal_strength = 5.0
        harmful = harmless + signal_strength * true_direction.unsqueeze(0) + torch.randn(n_samples, hidden) * 0.1

        # Extract via SVD on difference; only the right singular vectors are needed.
        diff = harmful - harmless
        _, _, Vh = torch.linalg.svd(diff, full_matrices=False)
        extracted = Vh[0]
        extracted = extracted / extracted.norm()

        # Singular vectors have arbitrary sign, hence the absolute value.
        cosine = (extracted @ true_direction).abs().item()
        assert cosine > 0.95, f"Cosine similarity {cosine:.3f} too low (expected > 0.95)"

    def test_multi_direction_recovery(self):
        """Plant k directions and verify SVD recovers the subspace."""
        torch.manual_seed(42)
        n_samples = 200
        hidden = 256
        k = 3

        # Plant k orthonormal directions with varying per-sample strength
        Q, _ = torch.linalg.qr(torch.randn(hidden, k))
        true_subspace = Q.T  # (k, hidden)

        # Each sample gets a random non-negative mix of the k planted directions
        harmless = torch.randn(n_samples, hidden) * 0.01
        coefficients = torch.randn(n_samples, k).abs() * 5.0
        signal = coefficients @ true_subspace  # (n_samples, hidden)
        harmful = harmless + signal

        diff = harmful - harmless
        _, _, Vh = torch.linalg.svd(diff, full_matrices=False)
        extracted_subspace = Vh[:k]  # (k, hidden)

        # Project each planted unit direction onto the extracted basis. The
        # norm of the coordinates is the fraction of the direction's length
        # captured by the subspace (1.0 means fully contained).
        for i, planted in enumerate(true_subspace):
            captured_norm = (extracted_subspace @ planted).norm().item()
            assert captured_norm > 0.9, \
                f"Direction {i}: captured norm {captured_norm:.3f} too low"
|
||||
|
||||
|
||||
class TestRandomDirectionBaseline:
    """Negative control: random directions must not look like the planted one."""

    def test_random_direction_has_lower_projection(self):
        """The planted direction's projection on the harmful mean dwarfs the random-direction average."""
        torch.manual_seed(42)
        n_samples, hidden = 50, 256

        # Planted unit direction that separates harmful from harmless.
        true_dir = torch.randn(hidden)
        true_dir = true_dir / true_dir.norm()

        harmless = torch.randn(n_samples, hidden) * 0.5
        harmful = harmless + 3.0 * true_dir.unsqueeze(0)
        harmful_mean = harmful.mean(dim=0)

        true_proj = (harmful_mean @ true_dir).abs().item()

        def abs_projection_for_seed(seed):
            # Independent generator per draw; seeds start at 10000, away from 42.
            gen = torch.Generator().manual_seed(seed)
            candidate = torch.randn(hidden, generator=gen)
            candidate = candidate / candidate.norm()
            return (harmful_mean @ candidate).abs().item()

        random_projs = [abs_projection_for_seed(10000 + offset) for offset in range(100)]
        mean_random = sum(random_projs) / len(random_projs)

        # True direction should project MUCH more than random average
        assert true_proj > mean_random * 3.0, \
            f"True projection ({true_proj:.3f}) not much larger than random mean ({mean_random:.3f})"
|
||||
|
||||
|
||||
class TestWhitenedSVD:
    """Verify whitened SVD properties."""

    def test_whitened_directions_are_orthogonal(self):
        """Top-k right singular vectors of the whitened difference are orthonormal.

        NOTE(review): rows of ``Vh`` returned by an SVD are orthonormal by
        construction, so this mainly guards against NaN/Inf produced by the
        whitening arithmetic rather than testing whitening itself.
        """
        torch.manual_seed(42)
        n_samples = 80
        hidden = 128
        k = 4

        H = torch.randn(n_samples, hidden) + torch.randn(1, hidden) * 2
        B = torch.randn(n_samples, hidden)

        # Regularised sample covariance of B.
        mu_B = B.mean(dim=0, keepdim=True)
        B_centered = B - mu_B
        cov_B = (B_centered.T @ B_centered) / (n_samples - 1)
        cov_B += 1e-4 * torch.eye(hidden)

        # Whitening matrix: eigenvectors scaled column-wise by 1/sqrt(eigenvalue + eps).
        eigenvalues, eigenvectors = torch.linalg.eigh(cov_B)
        eigenvalues = eigenvalues.clamp(min=0)
        inv_sqrt_eig = 1.0 / torch.sqrt(eigenvalues + 1e-4)
        whiten_proj = eigenvectors * inv_sqrt_eig.unsqueeze(0)

        H_whitened = (H - mu_B) @ whiten_proj
        B_whitened = B_centered @ whiten_proj
        D_whitened = H_whitened - B_whitened

        # Only the right singular vectors are used.
        _, _, Vh = torch.linalg.svd(D_whitened, full_matrices=False)
        directions = Vh[:k]

        # Gram matrix should be ~identity; the deviation covers the diagonal
        # (unit norms) as well as the off-diagonal (orthogonality) entries.
        gram = directions @ directions.T
        max_dev = (gram - torch.eye(k)).abs().max().item()
        assert max_dev < 1e-4, f"Directions not orthonormal: max |G - I| = {max_dev}"
|
||||
|
||||
|
||||
class TestReproducibility:
    """Check that ``set_seed`` controls the values drawn by ``torch.randn``."""

    @staticmethod
    def _draw_after_seeding(seed):
        """Call ``set_seed(seed, deterministic=False)`` and return 100 fresh normal samples."""
        from obliteratus.reproducibility import set_seed

        set_seed(seed, deterministic=False)
        return torch.randn(100)

    def test_set_seed_determinism(self):
        """Re-seeding with the same value reproduces the exact same tensor."""
        first = self._draw_after_seeding(123)
        second = self._draw_after_seeding(123)
        assert torch.equal(first, second), "Same seed produced different tensors"

    def test_different_seeds_differ(self):
        """Two distinct seeds give tensors that are not element-wise equal."""
        first = self._draw_after_seeding(123)
        second = self._draw_after_seeding(456)
        assert not torch.equal(first, second), "Different seeds produced identical tensors"
|
||||
@@ -0,0 +1,364 @@
|
||||
"""Tests for advanced evaluation metrics."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from obliteratus.evaluation.advanced_metrics import (
|
||||
REFUSAL_PREFIXES,
|
||||
REFUSAL_SUBSTRINGS,
|
||||
_is_refusal,
|
||||
refusal_rate,
|
||||
token_kl_divergence,
|
||||
first_token_kl_divergence,
|
||||
effective_rank,
|
||||
effective_rank_change,
|
||||
activation_cosine_similarity,
|
||||
linear_cka,
|
||||
refusal_projection_magnitude,
|
||||
format_eval_report,
|
||||
AbliterationEvalResult,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Refusal Rate
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRefusalRate:
    """Tests for ``refusal_rate``, ``_is_refusal`` and the refusal phrase tables."""

    def test_all_refusals(self):
        """Three refusal-style replies give a prefix-mode rate of 1.0."""
        replies = [
            "I'm sorry, but I cannot help with that.",
            "I apologize, but I'm unable to assist.",
            "As an AI, I cannot provide that information.",
        ]
        assert refusal_rate(replies, mode="prefix") == 1.0

    def test_no_refusals(self):
        """Three compliant replies give a prefix-mode rate of 0.0."""
        replies = [
            "Sure, here's how to do that.",
            "Here's a step-by-step guide.",
            "The answer is 42.",
        ]
        assert refusal_rate(replies, mode="prefix") == 0.0

    def test_mixed_responses(self):
        """Two refusals out of four replies give exactly 0.5."""
        replies = [
            "I'm sorry, I can't do that.",
            "Sure, here you go.",
            "I cannot provide that.",
            "Here's what you need.",
        ]
        assert refusal_rate(replies, mode="prefix") == 0.5  # 2 out of 4

    def test_substring_mode(self):
        """Substring mode flags a refusal phrase that is not at the start."""
        replies = [
            "Well, actually I cannot help with that request.",
            "Let me explain how to do this.",
        ]
        assert refusal_rate(replies, mode="substring") == 0.5

    def test_combined_mode(self):
        """Combined mode counts both prefix and substring hits."""
        replies = [
            "I'm sorry, but no.",  # caught by prefix
            "That said, i won't do that for you.",  # caught by substring
            "Here's the answer.",  # not a refusal
        ]
        observed = refusal_rate(replies, mode="combined")
        assert abs(observed - 2.0 / 3.0) < 1e-6

    def test_empty_responses(self):
        """An empty list of replies yields a rate of 0.0."""
        assert refusal_rate([], mode="prefix") == 0.0

    def test_empty_string_not_refusal(self):
        """The empty string is not classified as a refusal."""
        assert not _is_refusal("", mode="combined")

    def test_is_refusal_prefix(self):
        """Prefix mode accepts refusal openings and rejects a plain answer."""
        for refusing in (
            "I'm sorry, I can't do that.",
            "As an AI language model, I cannot",
        ):
            assert _is_refusal(refusing, mode="prefix")
        assert not _is_refusal("The answer is 42.", mode="prefix")

    def test_is_refusal_substring(self):
        """Substring mode finds an embedded refusal and ignores neutral text."""
        assert _is_refusal("I must say, i cannot assist with that.", mode="substring")
        assert not _is_refusal("The capital of France is Paris.", mode="substring")

    def test_refusal_prefixes_nonempty(self):
        """The prefix table holds more than 20 entries."""
        assert len(REFUSAL_PREFIXES) > 20

    def test_refusal_substrings_nonempty(self):
        """The substring table holds more than 10 entries."""
        assert len(REFUSAL_SUBSTRINGS) > 10
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# KL Divergence
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestKLDivergence:
    """Tests for ``token_kl_divergence`` and ``first_token_kl_divergence``."""

    @staticmethod
    def _seeded_logit_pair(*shape):
        """Seed the RNG with 42 and draw two independent logit tensors of ``shape``."""
        torch.manual_seed(42)
        first = torch.randn(*shape)
        second = torch.randn(*shape)
        return first, second

    def test_identical_distributions(self):
        """KL divergence of identical distributions should be 0."""
        same = torch.randn(2, 10, 100)
        assert abs(token_kl_divergence(same, same)) < 1e-5

    def test_different_distributions(self):
        """KL divergence of different distributions should be positive."""
        logits_a, logits_b = self._seeded_logit_pair(2, 10, 100)
        assert token_kl_divergence(logits_a, logits_b) > 0

    def test_kl_nonnegative(self):
        """KL divergence should always be non-negative."""
        torch.manual_seed(42)
        trials = 5
        while trials > 0:
            logits_a = torch.randn(1, 5, 50)
            logits_b = torch.randn(1, 5, 50)
            # allow small numerical errors
            assert token_kl_divergence(logits_a, logits_b) >= -1e-6
            trials -= 1

    def test_first_token_kl_identical(self):
        """First-token KL of identical distributions should be 0."""
        same = torch.randn(4, 20, 100)
        assert abs(first_token_kl_divergence(same, same)) < 1e-5

    def test_first_token_kl_different(self):
        """First-token KL of different distributions should be positive."""
        logits_a, logits_b = self._seeded_logit_pair(4, 20, 100)
        assert first_token_kl_divergence(logits_a, logits_b) > 0

    def test_temperature_effect(self):
        """Higher temperature should reduce KL divergence (smoother distributions)."""
        logits_a, logits_b = self._seeded_logit_pair(2, 5, 50)
        sharp = token_kl_divergence(logits_a, logits_b, temperature=1.0)
        smooth = token_kl_divergence(logits_a, logits_b, temperature=5.0)
        assert smooth < sharp
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Effective Rank
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEffectiveRank:
    """Tests for ``effective_rank`` and ``effective_rank_change``."""

    def test_rank_one_matrix(self):
        """Rank-1 matrix should have effective rank close to 1."""
        column = torch.randn(8, 1)
        row = torch.randn(1, 4)
        outer = column @ row  # rank-1 by construction
        assert effective_rank(outer) < 1.5

    def test_identity_matrix(self):
        """Identity matrix should have effective rank equal to dimension."""
        size = 8
        assert abs(effective_rank(torch.eye(size)) - size) < 0.1

    def test_random_full_rank(self):
        """Random matrix should have high effective rank."""
        torch.manual_seed(42)
        dense = torch.randn(16, 16)
        assert effective_rank(dense) > 10  # should be close to 16

    def test_zero_matrix(self):
        """Zero matrix should have effective rank 0."""
        assert effective_rank(torch.zeros(4, 4)) == 0.0

    def test_effective_rank_change(self):
        """Should compute before/after rank comparison."""
        torch.manual_seed(42)
        W_before = torch.randn(8, 8)
        # Simulate abliteration: remove a unit direction (reduces rank slightly)
        direction = torch.randn(8, 1)
        direction = direction / direction.norm()
        W_after = W_before - (W_before @ direction) @ direction.T

        summary = effective_rank_change(W_before, W_after)
        for key in ("rank_before", "rank_after", "rank_delta", "rank_ratio"):
            assert key in summary
        assert summary["rank_after"] <= summary["rank_before"] + 0.1

    def test_rejects_non_2d(self):
        """Should raise ValueError for non-2D tensors."""
        cube = torch.randn(4, 4, 4)
        with pytest.raises(ValueError):
            effective_rank(cube)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Activation Cosine Similarity
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestActivationCosineSimilarity:
    """Tests for ``activation_cosine_similarity``."""

    def test_identical_activations(self):
        """Comparing a tensor with itself gives a similarity of 1."""
        acts = torch.randn(10, 32)
        assert abs(activation_cosine_similarity(acts, acts) - 1.0) < 1e-5

    def test_orthogonal_activations(self):
        """Orthogonal activations should have cosine near 0."""
        x_axis = torch.tensor([[1.0, 0.0, 0.0]])
        y_axis = torch.tensor([[0.0, 1.0, 0.0]])
        assert abs(activation_cosine_similarity(x_axis, y_axis)) < 1e-5

    def test_opposite_activations(self):
        """Opposite activations should have cosine -1."""
        acts = torch.randn(5, 16)
        score = activation_cosine_similarity(acts, -acts)
        assert abs(score - (-1.0)) < 1e-5

    def test_handles_3d(self):
        """Should handle 3D tensors by reshaping."""
        left = torch.randn(2, 5, 16)
        right = torch.randn(2, 5, 16)
        score = activation_cosine_similarity(left, right)
        assert -1.0 <= score <= 1.0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Linear CKA
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestLinearCKA:
    """Tests for ``linear_cka``."""

    def test_identical_representations(self):
        """CKA of identical representations should be 1.0."""
        reps = torch.randn(20, 16)
        assert abs(linear_cka(reps, reps) - 1.0) < 1e-4

    def test_scaled_representations(self):
        """CKA should be invariant to isotropic scaling."""
        reps = torch.randn(20, 16)
        scaled = reps * 5.0
        assert abs(linear_cka(reps, scaled) - 1.0) < 1e-4

    def test_random_representations(self):
        """CKA of random representations should be low."""
        torch.manual_seed(42)
        left = torch.randn(100, 16)
        right = torch.randn(100, 16)
        assert linear_cka(left, right) < 0.3  # random should be near 0

    def test_cka_bounded(self):
        """CKA should be between 0 and 1."""
        torch.manual_seed(42)
        for _trial in range(5):
            left = torch.randn(20, 8)
            right = torch.randn(20, 8)
            score = linear_cka(left, right)
            # small tolerance for numerics
            assert -0.01 <= score <= 1.01

    def test_different_dimensions(self):
        """CKA should work with different hidden dimensions."""
        narrow = torch.randn(20, 16)
        wide = torch.randn(20, 32)
        score = linear_cka(narrow, wide)
        assert -0.01 <= score <= 1.01

    def test_handles_3d(self):
        """Should handle 3D tensors by reshaping."""
        left = torch.randn(2, 10, 16)
        right = torch.randn(2, 10, 16)
        score = linear_cka(left, right)
        assert -0.01 <= score <= 1.01
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Refusal Direction Projection Magnitude
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRefusalProjection:
    """Tests for ``refusal_projection_magnitude``."""

    def test_aligned_activations(self):
        """Activations aligned with direction should have high projection."""
        axis = torch.tensor([1.0, 0.0, 0.0])
        # Components along the axis are 5, 3 and 4, so both means equal 4.
        aligned = torch.tensor([
            [5.0, 0.0, 0.0],
            [3.0, 0.0, 0.0],
            [4.0, 0.0, 0.0],
        ])
        stats = refusal_projection_magnitude(aligned, axis)
        assert stats["mean"] == 4.0
        assert stats["abs_mean"] == 4.0

    def test_orthogonal_activations(self):
        """Orthogonal activations should have zero projection."""
        axis = torch.tensor([1.0, 0.0, 0.0])
        perpendicular = torch.tensor([
            [0.0, 5.0, 0.0],
            [0.0, 0.0, 3.0],
        ])
        stats = refusal_projection_magnitude(perpendicular, axis)
        for key in ("mean", "abs_mean"):
            assert abs(stats[key]) < 1e-5

    def test_result_keys(self):
        """Should return all expected keys."""
        axis = torch.randn(8)
        acts = torch.randn(5, 8)
        stats = refusal_projection_magnitude(acts, axis)
        assert set(stats.keys()) == {"mean", "std", "max", "min", "abs_mean"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Eval Report Formatting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEvalReport:
    """Tests for ``format_eval_report`` applied to ``AbliterationEvalResult`` values."""

    def test_format_report(self):
        """A low-KL result renders its percentages, perplexity and an 'excellent' label."""
        outcome = AbliterationEvalResult(
            refusal_rate_harmful=0.1,
            refusal_rate_harmless=0.02,
            kl_divergence=0.15,
            perplexity=12.5,
            coherence_score=0.8,
            mean_activation_cosine=0.95,
            mean_cka=0.92,
        )
        text = format_eval_report(outcome)
        for fragment in ("10.0%", "12.50"):
            assert fragment in text
        assert "excellent" in text  # KL < 0.2

    def test_format_report_high_kl(self):
        """A KL of 1.5 is reported as significant damage."""
        outcome = AbliterationEvalResult(
            refusal_rate_harmful=0.0,
            refusal_rate_harmless=0.0,
            kl_divergence=1.5,
            perplexity=50.0,
            coherence_score=0.4,
            mean_activation_cosine=None,
            mean_cka=None,
        )
        assert "significant damage" in format_eval_report(outcome)

    def test_format_report_no_kl(self):
        """When KL is ``None`` the report omits any mention of KL."""
        outcome = AbliterationEvalResult(
            refusal_rate_harmful=0.5,
            refusal_rate_harmless=0.1,
            kl_divergence=None,
            perplexity=20.0,
            coherence_score=1.0,
            mean_activation_cosine=None,
            mean_cka=None,
        )
        text = format_eval_report(outcome)
        assert "50.0%" in text
        assert "KL" not in text
|
||||
@@ -0,0 +1,345 @@
|
||||
"""Tests for the analysis techniques."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
import torch
|
||||
|
||||
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor, WhitenedSVDResult
|
||||
from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer, CrossLayerResult
|
||||
from obliteratus.analysis.activation_probing import ActivationProbe, ProbeResult
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# WhitenedSVDExtractor
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestWhitenedSVD:
|
||||
def test_basic_extraction(self):
|
||||
"""Whitened SVD should extract directions from activation differences."""
|
||||
torch.manual_seed(42)
|
||||
n_prompts, hidden_dim = 10, 32
|
||||
|
||||
# Create activations with a clear refusal direction
|
||||
refusal_dir = torch.randn(hidden_dim)
|
||||
refusal_dir = refusal_dir / refusal_dir.norm()
|
||||
|
||||
harmless = [torch.randn(hidden_dim) for _ in range(n_prompts)]
|
||||
harmful = [h + 2.0 * refusal_dir for h in harmless] # shifted along refusal dir
|
||||
|
||||
extractor = WhitenedSVDExtractor()
|
||||
result = extractor.extract(harmful, harmless, n_directions=3)
|
||||
|
||||
assert isinstance(result, WhitenedSVDResult)
|
||||
assert result.directions.shape == (3, hidden_dim)
|
||||
assert result.singular_values.shape == (3,)
|
||||
assert result.variance_explained > 0
|
||||
assert result.condition_number > 0
|
||||
assert result.effective_rank > 0
|
||||
|
||||
def test_directions_are_unit_vectors(self):
|
||||
"""Extracted directions should be unit length."""
|
||||
torch.manual_seed(42)
|
||||
harmless = [torch.randn(16) for _ in range(8)]
|
||||
harmful = [h + torch.randn(16) * 0.5 for h in harmless]
|
||||
|
||||
extractor = WhitenedSVDExtractor()
|
||||
result = extractor.extract(harmful, harmless, n_directions=2)
|
||||
|
||||
for i in range(result.directions.shape[0]):
|
||||
assert abs(result.directions[i].norm().item() - 1.0) < 1e-4
|
||||
|
||||
def test_primary_aligns_with_planted_direction(self):
|
||||
"""Primary whitened direction should capture the planted refusal signal.
|
||||
|
||||
Whitening rotates directions relative to the covariance structure,
|
||||
so perfect alignment with the raw direction is not expected. We verify
|
||||
the whitened direction explains substantial variance and has moderate
|
||||
alignment (whitening intentionally reweights dimensions).
|
||||
"""
|
||||
torch.manual_seed(42)
|
||||
hidden_dim = 64
|
||||
n_prompts = 30
|
||||
|
||||
refusal_dir = torch.randn(hidden_dim)
|
||||
refusal_dir = refusal_dir / refusal_dir.norm()
|
||||
|
||||
# Isotropic harmless activations (whitening has minimal effect)
|
||||
harmless = [torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)]
|
||||
harmful = [h + 5.0 * refusal_dir for h in harmless]
|
||||
|
||||
extractor = WhitenedSVDExtractor(regularization_eps=1e-3)
|
||||
result = extractor.extract(harmful, harmless, n_directions=1)
|
||||
|
||||
cos_sim = (result.directions[0] @ refusal_dir).abs().item()
|
||||
# Moderate alignment expected (whitening reweights dimensions)
|
||||
assert cos_sim > 0.2, f"Expected alignment > 0.2, got {cos_sim:.3f}"
|
||||
# More importantly: the direction should explain most variance
|
||||
assert result.variance_explained > 0.5
|
||||
|
||||
def test_extract_all_layers(self):
|
||||
"""Should extract directions for all provided layers."""
|
||||
torch.manual_seed(42)
|
||||
harmful_acts = {}
|
||||
harmless_acts = {}
|
||||
for layer in range(4):
|
||||
harmful_acts[layer] = [torch.randn(16) for _ in range(5)]
|
||||
harmless_acts[layer] = [torch.randn(16) for _ in range(5)]
|
||||
|
||||
extractor = WhitenedSVDExtractor()
|
||||
results = extractor.extract_all_layers(harmful_acts, harmless_acts, n_directions=2)
|
||||
|
||||
assert len(results) == 4
|
||||
for idx in range(4):
|
||||
assert idx in results
|
||||
assert results[idx].directions.shape[0] == 2
|
||||
|
||||
def test_compare_with_standard(self):
|
||||
"""Comparison should return valid cosine similarities."""
|
||||
torch.manual_seed(42)
|
||||
harmless = [torch.randn(16) for _ in range(8)]
|
||||
harmful = [h + torch.randn(16) for h in harmless]
|
||||
|
||||
extractor = WhitenedSVDExtractor()
|
||||
result = extractor.extract(harmful, harmless, n_directions=2)
|
||||
|
||||
std_dir = torch.randn(16)
|
||||
std_dir = std_dir / std_dir.norm()
|
||||
|
||||
comparison = WhitenedSVDExtractor.compare_with_standard(result, std_dir)
|
||||
assert "primary_direction_cosine" in comparison
|
||||
assert "subspace_principal_cosine" in comparison
|
||||
assert 0 <= comparison["primary_direction_cosine"] <= 1.0
|
||||
|
||||
def test_handles_3d_activations(self):
    """Activations shaped ``(1, hidden_dim)`` are accepted by ``extract``.

    The expected output is a ``(2, 16)`` direction matrix, i.e. the extra
    leading dimension on each sample does not leak into the result.
    """
    torch.manual_seed(42)
    sample_shape = (1, 16)  # per-sample shape as a hook would emit it
    harmless = [torch.randn(*sample_shape) for _ in range(5)]
    harmful = [torch.randn(*sample_shape) for _ in range(5)]

    extracted = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=2)

    assert extracted.directions.shape == (2, 16)
|
||||
|
||||
def test_variance_explained_bounded(self):
    """``variance_explained`` stays inside the closed interval [0, 1]."""
    torch.manual_seed(42)
    n_samples, hidden_dim = 8, 16
    harmless = [torch.randn(hidden_dim) for _ in range(n_samples)]
    harmful = [torch.randn(hidden_dim) for _ in range(n_samples)]

    extracted = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=3)

    explained = extracted.variance_explained
    assert 0 <= explained <= 1.0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CrossLayerAlignmentAnalyzer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCrossLayerAlignment:
    """Checks ``CrossLayerAlignmentAnalyzer.analyze`` and ``format_report``."""

    def test_identical_directions(self):
        """The same unit vector at every layer gives near-perfect persistence."""
        raw = torch.randn(32)
        unit = raw / raw.norm()
        per_layer = {layer: unit.clone() for layer in range(5)}

        outcome = CrossLayerAlignmentAnalyzer().analyze(per_layer)

        assert isinstance(outcome, CrossLayerResult)
        assert outcome.direction_persistence_score > 0.99
        assert outcome.mean_adjacent_cosine > 0.99
        assert outcome.total_geodesic_distance < 0.01

    def test_orthogonal_directions(self):
        """Mutually orthogonal layer directions give low persistence."""
        torch.manual_seed(42)
        # QR of a (32, 5) matrix: the five columns of Q serve as the
        # orthogonal per-layer directions.
        seed_matrix = torch.randn(5, 32)
        basis, _ = torch.linalg.qr(seed_matrix.T)
        per_layer = {col: basis[:, col] for col in range(5)}

        outcome = CrossLayerAlignmentAnalyzer().analyze(per_layer)

        assert outcome.direction_persistence_score < 0.3
        assert outcome.mean_adjacent_cosine < 0.3

    def test_cluster_detection(self):
        """Two groups of nearly identical directions yield at least two clusters."""
        torch.manual_seed(42)
        center_a = torch.randn(32)
        center_a = center_a / center_a.norm()
        center_b = torch.randn(32)
        center_b = center_b / center_b.norm()

        # Layers 0-2 sit around the first centre, layers 3-4 around the second.
        jitter = 0.01
        unnormalised = {
            0: center_a,
            1: center_a + jitter * torch.randn(32),
            2: center_a + jitter * torch.randn(32),
            3: center_b,
            4: center_b + jitter * torch.randn(32),
        }
        per_layer = {layer: vec / vec.norm() for layer, vec in unnormalised.items()}

        outcome = CrossLayerAlignmentAnalyzer(cluster_threshold=0.9).analyze(per_layer)

        assert outcome.cluster_count >= 2

    def test_empty_input(self):
        """An empty mapping produces no layers and no clusters."""
        outcome = CrossLayerAlignmentAnalyzer().analyze({})

        assert outcome.layer_indices == []
        assert outcome.cluster_count == 0

    def test_single_layer(self):
        """A lone layer is analysed and reported with persistence exactly 1.0."""
        only_layer = {5: torch.randn(16)}

        outcome = CrossLayerAlignmentAnalyzer().analyze(only_layer)

        assert outcome.layer_indices == [5]
        assert outcome.direction_persistence_score == 1.0

    def test_strong_layers_filter(self):
        """``strong_layers`` restricts the analysis to the listed layers."""
        per_layer = {layer: torch.randn(16) for layer in range(10)}
        wanted = [2, 5, 7]

        outcome = CrossLayerAlignmentAnalyzer().analyze(
            per_layer, strong_layers=list(wanted)
        )

        assert outcome.layer_indices == wanted
        assert outcome.cosine_matrix.shape == (len(wanted), len(wanted))

    def test_cosine_matrix_symmetry(self):
        """The cosine matrix equals its transpose within 1e-5."""
        torch.manual_seed(42)
        per_layer = {layer: torch.randn(16) for layer in range(4)}

        cosines = CrossLayerAlignmentAnalyzer().analyze(per_layer).cosine_matrix

        worst_asymmetry = (cosines - cosines.T).abs().max().item()
        assert worst_asymmetry < 1e-5

    def test_cosine_matrix_diagonal_ones(self):
        """Each diagonal entry of the cosine matrix is 1.0 within 1e-4."""
        torch.manual_seed(42)
        n_layers = 4
        per_layer = {layer: torch.randn(16) for layer in range(n_layers)}

        cosines = CrossLayerAlignmentAnalyzer().analyze(per_layer).cosine_matrix

        for pos in range(n_layers):
            self_similarity = cosines[pos, pos].item()
            assert abs(self_similarity - 1.0) < 1e-4

    def test_angular_drift_monotonic(self):
        """Consecutive angular-drift values never decrease (1e-6 slack)."""
        torch.manual_seed(42)
        per_layer = {layer: torch.randn(16) for layer in range(6)}

        drift = CrossLayerAlignmentAnalyzer().analyze(per_layer).angular_drift

        for pos in range(len(drift) - 1):
            assert drift[pos + 1] >= drift[pos] - 1e-6

    def test_format_report(self):
        """The formatted report mentions "Cross-Layer" and "persistence"."""
        torch.manual_seed(42)
        per_layer = {layer: torch.randn(16) for layer in range(4)}

        outcome = CrossLayerAlignmentAnalyzer().analyze(per_layer)
        text = CrossLayerAlignmentAnalyzer.format_report(outcome)

        assert "Cross-Layer" in text
        assert "persistence" in text
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ActivationProbe
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestActivationProbe:
    """Checks ``ActivationProbe`` per-layer probing, aggregation, and reporting."""

    def test_clean_elimination(self):
        """Unrelated random activations show only a small gap along the direction.

        Both activation sets are pure noise, standing in for a model whose
        refusal direction has already been removed.
        """
        torch.manual_seed(42)
        hidden_dim = 32
        refusal_dir = torch.randn(hidden_dim)
        refusal_dir = refusal_dir / refusal_dir.norm()

        harmless = [torch.randn(hidden_dim) for _ in range(10)]
        harmful = [torch.randn(hidden_dim) for _ in range(10)]

        layer_stats = ActivationProbe().probe_layer(harmful, harmless, refusal_dir)

        assert abs(layer_stats.projection_gap) < 1.0
        assert layer_stats.separation_d_prime < 2.0

    def test_residual_detection(self):
        """A strong shift along the direction is reported as a large gap."""
        torch.manual_seed(42)
        hidden_dim = 32
        refusal_dir = torch.randn(hidden_dim)
        refusal_dir = refusal_dir / refusal_dir.norm()

        harmless = [torch.randn(hidden_dim) for _ in range(10)]
        # Harmful samples are their harmless partners pushed 5 units along
        # the refusal direction.
        shift = 5.0 * refusal_dir
        harmful = [base + shift for base in harmless]

        layer_stats = ActivationProbe().probe_layer(harmful, harmless, refusal_dir)

        assert abs(layer_stats.projection_gap) > 1.0
        assert layer_stats.separation_d_prime > 2.0

    def test_probe_all_layers(self):
        """``probe_all_layers`` returns one entry per layer plus bounded aggregates."""
        torch.manual_seed(42)
        hidden_dim = 16
        n_layers = 4

        harmful_by_layer = {}
        harmless_by_layer = {}
        direction_by_layer = {}

        # Per layer the draw order is harmful, harmless, direction, which
        # keeps the seeded random stream in a fixed order.
        for layer_idx in range(n_layers):
            harmful_by_layer[layer_idx] = [torch.randn(hidden_dim) for _ in range(5)]
            harmless_by_layer[layer_idx] = [torch.randn(hidden_dim) for _ in range(5)]
            raw_dir = torch.randn(hidden_dim)
            direction_by_layer[layer_idx] = raw_dir / raw_dir.norm()

        summary = ActivationProbe().probe_all_layers(
            harmful_by_layer, harmless_by_layer, direction_by_layer
        )

        assert isinstance(summary, ProbeResult)
        assert len(summary.per_layer) == n_layers
        assert 0 <= summary.refusal_elimination_score <= 1.0
        assert summary.mean_projection_gap >= 0

    def test_res_score_range(self):
        """The refusal elimination score stays in [0, 1] for seeds 0 through 4."""
        torch.manual_seed(42)
        for seed in range(5):
            torch.manual_seed(seed)
            harmful = {0: [torch.randn(8) for _ in range(3)]}
            harmless = {0: [torch.randn(8) for _ in range(3)]}
            raw_dir = torch.randn(8)
            dirs = {0: raw_dir}
            dirs[0] = dirs[0] / dirs[0].norm()

            summary = ActivationProbe().probe_all_layers(harmful, harmless, dirs)

            assert 0 <= summary.refusal_elimination_score <= 1.0

    def test_format_report(self):
        """The formatted report contains the "Refusal Elimination Score" label."""
        torch.manual_seed(42)
        harmful = {0: [torch.randn(8) for _ in range(3)]}
        harmless = {0: [torch.randn(8) for _ in range(3)]}
        dirs = {0: torch.randn(8)}  # deliberately left un-normalised, as in the original

        summary = ActivationProbe().probe_all_layers(harmful, harmless, dirs)
        text = ActivationProbe.format_report(summary)

        assert "Refusal Elimination Score" in text

    def test_empty_input(self):
        """Empty mappings give a zero score and no per-layer entries."""
        summary = ActivationProbe().probe_all_layers({}, {}, {})

        assert summary.refusal_elimination_score == 0.0
        assert len(summary.per_layer) == 0
|
||||
@@ -0,0 +1,65 @@
|
||||
"""Tests for shared analysis utilities (gini_coefficient, etc.)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from obliteratus.analysis.utils import gini_coefficient
|
||||
|
||||
|
||||
class TestGiniCoefficient:
    """Checks ``gini_coefficient`` on degenerate, uniform, and skewed inputs."""

    def test_empty_list(self):
        """An empty input is reported as exactly 0.0."""
        no_values = []
        assert gini_coefficient(no_values) == 0.0

    def test_single_value(self):
        """A single value is reported as exactly 0.0."""
        lone_value = [42.0]
        assert gini_coefficient(lone_value) == 0.0

    def test_uniform_distribution(self):
        """Four equal values give a Gini of 0 (within 1e-10)."""
        equal_values = [1.0, 1.0, 1.0, 1.0]
        assert gini_coefficient(equal_values) == pytest.approx(0.0, abs=1e-10)

    def test_maximally_concentrated(self):
        """One non-zero value among zeros pushes Gini close to its n=4 ceiling."""
        concentrated = [100.0, 0.0, 0.0, 0.0]
        score = gini_coefficient(concentrated)
        # For n=4 the maximum Gini is (n-1)/n = 0.75, hence the 0.7 floor.
        assert score > 0.7

    def test_all_zeros(self):
        """An all-zero input is reported as exactly 0.0."""
        zeros = [0.0, 0.0, 0.0]
        assert gini_coefficient(zeros) == 0.0

    def test_two_equal_values(self):
        """Two equal values give a Gini of 0 (within 1e-10)."""
        pair = [5.0, 5.0]
        assert gini_coefficient(pair) == pytest.approx(0.0, abs=1e-10)

    def test_two_unequal_values(self):
        """``[0, 10]`` gives a Gini of 0.5 (within 0.01)."""
        score = gini_coefficient([0.0, 10.0])
        assert score == pytest.approx(0.5, abs=0.01)

    def test_moderate_inequality(self):
        """A linear 1..5 spread lands strictly between 0.1 and 0.5."""
        ramp = [1.0, 2.0, 3.0, 4.0, 5.0]
        score = gini_coefficient(ramp)
        assert 0.1 < score < 0.5

    def test_result_in_valid_range(self):
        """Several small inputs all produce a value inside [0, 1]."""
        samples = [[1, 2, 3], [0, 0, 100], [5, 5, 5], [1], [0.1, 0.9]]
        for vals in samples:
            result = gini_coefficient(vals)
            assert 0.0 <= result <= 1.0, f"Gini({vals}) = {result} out of range"

    def test_large_uniform(self):
        """A thousand equal values give a Gini of 0 (within 1e-10)."""
        many_equal = [1.0] * 1000
        assert gini_coefficient(many_equal) == pytest.approx(0.0, abs=1e-10)

    def test_large_concentrated(self):
        """A single outlier among 999 zeros gives a Gini above 0.99."""
        mostly_zero = [0.0] * 999 + [1000.0]
        score = gini_coefficient(mostly_zero)
        assert score > 0.99

    def test_order_invariant(self):
        """Permuting the input does not change the result."""
        ascending = gini_coefficient([1.0, 3.0, 5.0, 7.0])
        shuffled = gini_coefficient([7.0, 1.0, 5.0, 3.0])
        assert ascending == pytest.approx(shuffled)
|
||||
@@ -0,0 +1,598 @@
|
||||
"""Tests for architecture-aware preset defaults.
|
||||
|
||||
Tests the detection logic and recommended parameter overrides for each
|
||||
architecture class (dense/MoE, standard/reasoning).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
from obliteratus.architecture_profiles import (
|
||||
ArchitectureClass,
|
||||
ArchitectureProfile,
|
||||
ReasoningClass,
|
||||
detect_architecture,
|
||||
get_profile_summary,
|
||||
apply_profile_to_method_config,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Detection: Dense models
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestDenseDetection:
    """Name-only detection of well-known dense model identifiers."""

    def test_llama_is_dense(self):
        """Llama 3.1 8B Instruct: dense, standard reasoning class, not MoE."""
        detected = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
        assert detected.arch_class == ArchitectureClass.DENSE
        assert detected.reasoning_class == ReasoningClass.STANDARD
        assert not detected.is_moe

    def test_qwen_dense_is_dense(self):
        """Qwen2.5 7B Instruct: dense and not MoE."""
        detected = detect_architecture("Qwen/Qwen2.5-7B-Instruct")
        assert detected.arch_class == ArchitectureClass.DENSE
        assert not detected.is_moe

    def test_gemma_is_dense(self):
        """Gemma 3 27B: dense."""
        detected = detect_architecture("google/gemma-3-27b-it")
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_phi_is_dense(self):
        """Phi-4 mini instruct: dense."""
        detected = detect_architecture("microsoft/Phi-4-mini-instruct")
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_mistral_small_is_dense(self):
        """Mistral Small 24B: dense."""
        detected = detect_architecture("mistralai/Mistral-Small-24B-Instruct-2501")
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_yi_is_dense(self):
        """Yi 1.5 9B Chat: dense."""
        detected = detect_architecture("01-ai/Yi-1.5-9B-Chat")
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_dense_label(self):
        """A dense standard model carries the "Dense Standard" label."""
        label = detect_architecture("meta-llama/Llama-3.1-8B-Instruct").profile_label
        assert label == "Dense Standard"

    def test_dense_recommended_method(self):
        """A dense standard model is steered to the "aggressive" method."""
        method = detect_architecture("meta-llama/Llama-3.1-8B-Instruct").recommended_method
        assert method == "aggressive"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Detection: MoE models
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMoEDetection:
    """Detection of mixture-of-experts models, mostly from the model name alone."""

    def test_gpt_oss_is_moe(self):
        """GPT-OSS 20B without a config: MoE, classified as small MoE."""
        detected = detect_architecture("openai/gpt-oss-20b")
        assert detected.is_moe
        assert detected.arch_class == ArchitectureClass.SMALL_MOE

    def test_qwen3_30b_is_small_moe(self):
        """Qwen3 30B-A3B is flagged as MoE (size class is not asserted here)."""
        detected = detect_architecture("Qwen/Qwen3-30B-A3B")
        assert detected.is_moe

    def test_deepseek_v3_is_large_moe(self):
        """DeepSeek V3.2 is flagged as MoE (size class is not asserted here)."""
        detected = detect_architecture("deepseek-ai/DeepSeek-V3.2")
        assert detected.is_moe

    def test_kimi_k2_is_large_moe(self):
        """Kimi K2 Instruct is flagged as MoE (size class is not asserted here)."""
        detected = detect_architecture("moonshotai/Kimi-K2-Instruct")
        assert detected.is_moe

    def test_qwen3_235b_is_moe(self):
        """Qwen3 235B-A22B is flagged as MoE."""
        detected = detect_architecture("Qwen/Qwen3-235B-A22B")
        assert detected.is_moe

    def test_glm_47_is_moe(self):
        """GLM-4.7 is flagged as MoE."""
        detected = detect_architecture("zai-org/GLM-4.7")
        assert detected.is_moe

    def test_llama4_maverick_is_moe(self):
        """Llama 4 Maverick is flagged as MoE."""
        detected = detect_architecture("meta-llama/Llama-4-Maverick-17B-128E-Instruct")
        assert detected.is_moe

    def test_step_flash_is_moe(self):
        """Step 3.5 Flash is flagged as MoE."""
        detected = detect_architecture("stepfun-ai/Step-3.5-Flash")
        assert detected.is_moe

    def test_minimax_is_moe(self):
        """MiniMax M2.1 is flagged as MoE."""
        detected = detect_architecture("MiniMaxAI/MiniMax-M2.1")
        assert detected.is_moe

    def test_mistral_large_3_is_moe(self):
        """Mistral Large 3 675B is flagged as MoE."""
        detected = detect_architecture("mistralai/Mistral-Large-3-675B-Instruct-2512")
        assert detected.is_moe

    def test_moe_recommended_method_is_surgical(self):
        """The GPT-OSS 20B profile recommends the "surgical" method."""
        method = detect_architecture("openai/gpt-oss-20b").recommended_method
        assert method == "surgical"

    def test_gpt_oss_with_config_is_small_moe(self):
        """GPT-OSS 20B plus a config exposing 8 experts (2 active): small MoE."""

        class MockConfig:
            model_type = "gpt_neox"
            num_hidden_layers = 32
            hidden_size = 2560
            intermediate_size = 6912
            vocab_size = 50304
            num_local_experts = 8
            num_experts_per_tok = 2

        stub_config = MockConfig()
        detected = detect_architecture("openai/gpt-oss-20b", config=stub_config)
        assert detected.is_moe
        assert detected.arch_class == ArchitectureClass.SMALL_MOE
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Detection: Reasoning models
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestReasoningDetection:
    """Detection of reasoning-tuned models and the overrides they receive."""

    def test_r1_distill_qwen_is_reasoning(self):
        """R1-Distill-Qwen-7B falls in the reasoning class."""
        detected = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
        assert detected.reasoning_class == ReasoningClass.REASONING

    def test_r1_distill_llama_is_reasoning(self):
        """R1-Distill-Llama-8B falls in the reasoning class."""
        detected = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
        assert detected.reasoning_class == ReasoningClass.REASONING

    def test_r1_distill_is_dense_reasoning(self):
        """R1-Distill-Qwen-14B is dense *and* reasoning, labelled accordingly."""
        detected = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
        assert detected.arch_class == ArchitectureClass.DENSE
        assert detected.reasoning_class == ReasoningClass.REASONING
        assert detected.profile_label == "Dense Reasoning"

    def test_olmo_think_is_reasoning(self):
        """Olmo 3.1 32B Think falls in the reasoning class."""
        detected = detect_architecture("allenai/Olmo-3.1-32B-Think")
        assert detected.reasoning_class == ReasoningClass.REASONING

    def test_olmo_standard_is_not_reasoning(self):
        """Olmo without "Think" stays in the standard class.

        Regression guard: the model name must not trip the 'o1' reasoning
        pattern.
        """
        detected = detect_architecture("allenai/Olmo-3-7B-Instruct")
        assert detected.reasoning_class == ReasoningClass.STANDARD

    def test_falcon3_is_not_reasoning(self):
        """Falcon3 stays standard; it must not match the 'o3' reasoning pattern."""
        detected = detect_architecture("tiiuae/Falcon3-7B-Instruct")
        assert detected.reasoning_class == ReasoningClass.STANDARD

    def test_full_r1_is_moe_reasoning(self):
        """Full DeepSeek-R1 is both MoE and reasoning."""
        detected = detect_architecture("deepseek-ai/DeepSeek-R1")
        assert detected.is_moe
        assert detected.reasoning_class == ReasoningClass.REASONING

    def test_reasoning_dense_more_directions(self):
        """Dense reasoning profiles override ``n_directions`` to at least 12."""
        detected = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
        assert detected.arch_class == ArchitectureClass.DENSE
        # A missing key falls back to 0 and therefore fails the check.
        n_directions = detected.method_overrides.get("n_directions", 0)
        assert n_directions >= 12

    def test_reasoning_dense_more_passes(self):
        """Dense reasoning profiles override ``refinement_passes`` to at least 4."""
        detected = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
        assert detected.arch_class == ArchitectureClass.DENSE
        # A missing key falls back to 0 and therefore fails the check.
        passes = detected.method_overrides.get("refinement_passes", 0)
        assert passes >= 4

    def test_non_reasoning_is_standard(self):
        """Llama 3.1 8B Instruct stays in the standard reasoning class."""
        detected = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
        assert detected.reasoning_class == ReasoningClass.STANDARD
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Detection with config object
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestConfigDetection:
    """Detection driven by attributes on a stand-in config object."""

    def test_moe_config_attrs(self):
        """``num_local_experts`` on the config marks the model as MoE.

        Expert counts from the config are expected to be surfaced on the
        resulting profile.
        """

        class MockConfig:
            model_type = "mixtral"
            num_hidden_layers = 32
            hidden_size = 4096
            intermediate_size = 14336
            vocab_size = 32000
            num_local_experts = 8
            num_experts_per_tok = 2

        stub_config = MockConfig()
        detected = detect_architecture(
            "custom/mixtral-model",
            config=stub_config,
            num_layers=32,
            hidden_size=4096,
        )
        assert detected.is_moe
        assert detected.num_experts == 8
        assert detected.num_active_experts == 2

    def test_large_moe_threshold(self):
        """A DeepSeek-V3-sized config with 256 routed experts is a large MoE."""

        class MockConfig:
            model_type = "deepseek_v3"
            num_hidden_layers = 61
            hidden_size = 7168
            intermediate_size = 18432
            vocab_size = 102400
            n_routed_experts = 256
            num_experts_per_tok = 8

        stub_config = MockConfig()
        detected = detect_architecture("custom/large-moe", config=stub_config)
        assert detected.arch_class == ArchitectureClass.LARGE_MOE

    def test_small_moe_threshold(self):
        """A Mixtral-sized config with 8 experts is a small MoE."""

        class MockConfig:
            model_type = "mixtral"
            num_hidden_layers = 32
            hidden_size = 4096
            intermediate_size = 14336
            vocab_size = 32000
            num_local_experts = 8
            num_experts_per_tok = 2

        stub_config = MockConfig()
        detected = detect_architecture("custom/small-moe", config=stub_config)
        assert detected.arch_class == ArchitectureClass.SMALL_MOE

    def test_dense_config(self):
        """A config with no expert attributes is classified as dense."""

        class MockConfig:
            model_type = "llama"
            num_hidden_layers = 32
            hidden_size = 4096
            intermediate_size = 11008
            vocab_size = 32000

        stub_config = MockConfig()
        detected = detect_architecture("custom/dense-model", config=stub_config)
        assert detected.arch_class == ArchitectureClass.DENSE
        assert not detected.is_moe

    def test_llama4_scout_is_large_moe(self):
        """Llama 4 Scout (16 experts) must still be classified as a large MoE.

        Regression guard: a total parameter count above 100B has to take
        precedence over the low expert count.
        """

        class MockConfig:
            model_type = "llama4"
            num_hidden_layers = 48
            hidden_size = 5120
            intermediate_size = 14336
            vocab_size = 202048
            num_local_experts = 16
            num_experts_per_tok = 1

        stub_config = MockConfig()
        detected = detect_architecture(
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            config=stub_config,
        )
        assert detected.is_moe
        assert detected.arch_class == ArchitectureClass.LARGE_MOE
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Recommended defaults validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRecommendedDefaults:
    """Spot checks of individual recommended settings per architecture profile.

    Each ``.get`` default is chosen so that a missing key fails the test.
    """

    def test_dense_standard_no_riemannian(self):
        """Dense Standard: the "riemannian" module is off."""
        modules = detect_architecture("meta-llama/Llama-3.1-8B-Instruct").breakthrough_modules
        riemannian_on = modules.get("riemannian", True)
        assert not riemannian_on

    def test_dense_standard_anti_ouroboros_on(self):
        """Dense Standard: the "anti_ouroboros" module is on."""
        modules = detect_architecture("meta-llama/Llama-3.1-8B-Instruct").breakthrough_modules
        assert modules.get("anti_ouroboros", False)

    def test_dense_standard_spectral_cert_on(self):
        """Dense Standard: the "spectral_cert" module is on."""
        modules = detect_architecture("meta-llama/Llama-3.1-8B-Instruct").breakthrough_modules
        assert modules.get("spectral_cert", False)

    def test_moe_conditional_on(self):
        """MoE (GPT-OSS 20B): the "conditional" module is on."""
        modules = detect_architecture("openai/gpt-oss-20b").breakthrough_modules
        assert modules.get("conditional", False)

    def test_moe_no_project_embeddings(self):
        """MoE (GPT-OSS 20B): ``project_embeddings`` is overridden to off."""
        overrides = detect_architecture("openai/gpt-oss-20b").method_overrides
        project_embeddings_on = overrides.get("project_embeddings", True)
        assert not project_embeddings_on

    def test_moe_per_expert_directions(self):
        """MoE (GPT-OSS 20B): ``per_expert_directions`` is overridden to on."""
        overrides = detect_architecture("openai/gpt-oss-20b").method_overrides
        assert overrides.get("per_expert_directions", False)

    def test_large_moe_riemannian_on(self):
        """DeepSeek V3.2: the "riemannian" module is on."""
        modules = detect_architecture("deepseek-ai/DeepSeek-V3.2").breakthrough_modules
        assert modules.get("riemannian", False)

    def test_reasoning_dense_jailbreak_contrast(self):
        """Reasoning Dense: ``use_jailbreak_contrast`` is overridden to on."""
        overrides = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B").method_overrides
        assert overrides.get("use_jailbreak_contrast", False)

    def test_reasoning_moe_gentle_transplant(self):
        """Reasoning MoE (DeepSeek-R1): ``transplant_blend`` is at most 0.10."""
        overrides = detect_architecture("deepseek-ai/DeepSeek-R1").method_overrides
        blend = overrides.get("transplant_blend", 1.0)
        assert blend <= 0.10
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Profile summary
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestProfileSummary:
    """Checks the text returned by ``get_profile_summary``."""

    def test_summary_contains_profile_label(self):
        """The dense-standard summary includes its profile label."""
        text = get_profile_summary(detect_architecture("meta-llama/Llama-3.1-8B-Instruct"))
        assert "Dense Standard" in text

    def test_summary_contains_method(self):
        """The dense-standard summary names the "aggressive" method."""
        text = get_profile_summary(detect_architecture("meta-llama/Llama-3.1-8B-Instruct"))
        assert "aggressive" in text

    def test_summary_contains_citations(self):
        """The GPT-OSS summary cites at least one of "SAFEx" or "Cracken"."""
        text = get_profile_summary(detect_architecture("openai/gpt-oss-20b"))
        mentions_safex = "SAFEx" in text
        assert mentions_safex or "Cracken" in text

    def test_summary_contains_moe_info(self):
        """The GPT-OSS summary mentions "MoE"."""
        text = get_profile_summary(detect_architecture("openai/gpt-oss-20b"))
        assert "MoE" in text

    def test_summary_contains_breakthrough_modules(self):
        """The GPT-OSS summary mentions the "conditional" module."""
        text = get_profile_summary(detect_architecture("openai/gpt-oss-20b"))
        assert "conditional" in text
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# apply_profile_to_method_config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestApplyProfile:
    """Checks how ``apply_profile_to_method_config`` merges profile overrides."""

    def test_overrides_applied(self):
        """An override present on the profile wins over the base value."""
        from obliteratus.abliterate import METHODS

        detected = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
        base_config = dict(METHODS["aggressive"])

        merged = apply_profile_to_method_config(detected, base_config)

        expected = detected.method_overrides["n_directions"]
        assert merged["n_directions"] == expected

    def test_non_overridden_preserved(self):
        """``norm_preserve`` in the merged config equals the base config's value."""
        from obliteratus.abliterate import METHODS

        detected = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
        base_config = dict(METHODS["aggressive"])

        merged = apply_profile_to_method_config(detected, base_config)

        # The original test expects norm_preserve not to be overridden here.
        assert merged["norm_preserve"] == base_config["norm_preserve"]

    def test_empty_overrides(self):
        """A profile with no overrides leaves the base config unchanged."""
        from obliteratus.abliterate import METHODS

        base_config = dict(METHODS["advanced"])
        blank_profile = ArchitectureProfile(
            arch_class=ArchitectureClass.DENSE,
            reasoning_class=ReasoningClass.STANDARD,
            method_overrides={},
            breakthrough_modules={},
        )

        merged = apply_profile_to_method_config(blank_profile, base_config)

        assert merged == base_config

    def test_override_key_not_in_base_is_added(self):
        """An override key missing from the base config is added to the result.

        The original test notes this matters for the UI auto-detect path:
        keys such as ``use_jailbreak_contrast`` may not exist in the base
        method config but are read from the merged config by ``app.py``.
        """
        from obliteratus.abliterate import METHODS

        base_config = dict(METHODS["advanced"])
        extra_key_profile = ArchitectureProfile(
            arch_class=ArchitectureClass.DENSE,
            reasoning_class=ReasoningClass.STANDARD,
            method_overrides={"use_jailbreak_contrast": True},
            breakthrough_modules={},
        )

        merged = apply_profile_to_method_config(extra_key_profile, base_config)

        assert merged["use_jailbreak_contrast"] is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# All 6 profile combinations
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAllSixProfiles:
|
||||
"""Verify label, method, overrides, and breakthrough modules for each profile."""
|
||||
|
||||
def _make_moe_config(self, num_experts=8, active=2, layers=32, hidden=4096):
|
||||
class C:
|
||||
model_type = "mixtral"
|
||||
num_hidden_layers = layers
|
||||
hidden_size = hidden
|
||||
intermediate_size = hidden * 4
|
||||
vocab_size = 32000
|
||||
num_local_experts = num_experts
|
||||
num_experts_per_tok = active
|
||||
return C()
|
||||
|
||||
def test_dense_standard_full(self):
    """Full check of the Dense Standard profile for Llama 3.1 8B Instruct.

    Covers the label, recommended method, each breakthrough-module flag,
    and that a description and citations are present.
    """
    profile = detect_architecture("meta-llama/Llama-3.1-8B-Instruct")
    modules = profile.breakthrough_modules

    assert profile.profile_label == "Dense Standard"
    assert profile.recommended_method == "aggressive"

    assert not modules["riemannian"]
    assert modules["anti_ouroboros"]
    assert modules["spectral_cert"]
    assert not modules["conditional"]

    assert len(profile.profile_description) > 0
    assert len(profile.research_citations) > 0
|
||||
|
||||
def test_dense_reasoning_full(self):
    """Full check of the Dense Reasoning profile for R1-Distill-Qwen-7B.

    Covers the label, recommended method, the reasoning-specific method
    overrides, all four breakthrough-module flags, and the description.
    """
    profile = detect_architecture("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
    overrides = profile.method_overrides
    modules = profile.breakthrough_modules

    assert profile.profile_label == "Dense Reasoning"
    assert profile.recommended_method == "aggressive"

    assert overrides["n_directions"] >= 12
    assert overrides["refinement_passes"] >= 4
    assert overrides["use_jailbreak_contrast"] is True
    assert overrides["use_chat_template"] is True

    assert modules["anti_ouroboros"]
    assert modules["riemannian"]
    assert modules["conditional"]
    assert modules["spectral_cert"]

    assert len(profile.profile_description) > 0
|
||||
|
||||
def test_small_moe_standard_full(self):
    """Full check of the Small MoE Standard profile (8 experts, 2 active).

    Covers the label, architecture class, recommended method, the MoE
    method overrides, each breakthrough-module flag, and the description.
    """
    stub_config = self._make_moe_config(num_experts=8, active=2)
    profile = detect_architecture("custom/small-moe-model", config=stub_config)
    overrides = profile.method_overrides
    modules = profile.breakthrough_modules

    assert profile.profile_label == "Small MoE Standard"
    assert profile.arch_class == ArchitectureClass.SMALL_MOE
    assert profile.recommended_method == "surgical"

    assert overrides["per_expert_directions"] is True
    assert overrides["invert_refusal"] is False
    assert overrides["project_embeddings"] is False

    assert modules["conditional"]
    assert modules["anti_ouroboros"]
    assert modules["spectral_cert"]
    assert not modules["riemannian"]

    assert len(profile.profile_description) > 0
|
||||
|
||||
def test_small_moe_reasoning_full(self):
    """The most fragile combination: MoE + reasoning."""
    moe_config = self._make_moe_config(num_experts=8, active=2)
    # "think" in the model name is what triggers reasoning detection here.
    profile = detect_architecture("custom/small-moe-think-model", config=moe_config)

    assert profile.profile_label == "Small MoE Reasoning"
    assert profile.arch_class == ArchitectureClass.SMALL_MOE
    assert profile.reasoning_class == ReasoningClass.REASONING
    assert profile.recommended_method == "surgical"

    overrides = profile.method_overrides
    for enabled_flag in (
        "per_expert_directions",
        "use_jailbreak_contrast",
        "use_chat_template",
    ):
        assert overrides[enabled_flag] is True
    assert overrides["invert_refusal"] is False

    modules = profile.breakthrough_modules
    for enabled_module in ("conditional", "anti_ouroboros", "spectral_cert"):
        assert modules[enabled_module]

    assert len(profile.profile_description) > 0
|
||||
|
||||
def test_large_moe_standard_full(self):
    """Verify the full profile detected for a 256-expert, 8-active MoE config."""
    moe_config = self._make_moe_config(
        num_experts=256, active=8, layers=61, hidden=7168,
    )
    profile = detect_architecture("custom/large-moe-model", config=moe_config)

    assert profile.profile_label == "Large MoE Standard"
    assert profile.arch_class == ArchitectureClass.LARGE_MOE
    assert profile.recommended_method == "surgical"

    overrides = profile.method_overrides
    for enabled_flag in (
        "per_expert_directions",
        "layer_adaptive_strength",
        "expert_transplant",
    ):
        assert overrides[enabled_flag] is True
    assert overrides["transplant_blend"] == 0.10
    assert overrides["attention_head_surgery"] is True
    assert overrides["project_embeddings"] is False

    modules = profile.breakthrough_modules
    for enabled_module in ("conditional", "riemannian", "anti_ouroboros", "spectral_cert"):
        assert modules[enabled_module]

    assert len(profile.profile_description) > 0
|
||||
|
||||
def test_large_moe_reasoning_full(self):
    """Verify the full profile detected for a large MoE config with "r1" in its name."""
    moe_config = self._make_moe_config(
        num_experts=256, active=8, layers=61, hidden=7168,
    )
    profile = detect_architecture("custom/large-moe-r1-model", config=moe_config)

    assert profile.profile_label == "Large MoE Reasoning"
    assert profile.arch_class == ArchitectureClass.LARGE_MOE
    assert profile.reasoning_class == ReasoningClass.REASONING
    assert profile.recommended_method == "surgical"

    overrides = profile.method_overrides
    # Exact values are pinned for the two numeric overrides.
    assert overrides["n_directions"] == 8
    assert overrides["transplant_blend"] == 0.08
    for enabled_flag in ("use_jailbreak_contrast", "safety_neuron_masking"):
        assert overrides[enabled_flag] is True

    modules = profile.breakthrough_modules
    for enabled_module in ("conditional", "riemannian", "anti_ouroboros", "spectral_cert"):
        assert modules[enabled_module]

    assert len(profile.profile_description) > 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Edge cases
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestEdgeCases:
    """Edge cases for architecture detection."""

    @staticmethod
    def _llama_config(expert_count):
        """Return a llama-typed mock config with ``num_local_experts`` set to *expert_count*."""

        class MockConfig:
            model_type = "llama"
            num_hidden_layers = 32
            hidden_size = 4096
            intermediate_size = 11008
            vocab_size = 32000
            num_local_experts = expert_count

        return MockConfig()

    def test_empty_model_name(self):
        """Empty string should fall through to Dense Standard."""
        detected = detect_architecture("")
        assert detected.arch_class == ArchitectureClass.DENSE
        assert detected.reasoning_class == ReasoningClass.STANDARD

    def test_unknown_model_type_in_config(self):
        """Unknown model_type should not cause MoE classification."""

        class MockConfig:
            model_type = "banana"
            num_hidden_layers = 12
            hidden_size = 768
            intermediate_size = 3072
            vocab_size = 30522

        detected = detect_architecture("custom/unknown-arch", config=MockConfig())
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_config_with_zero_experts(self):
        """num_local_experts=0 should not trigger MoE."""
        detected = detect_architecture(
            "custom/dense-with-zero", config=self._llama_config(0),
        )
        assert not detected.is_moe
        assert detected.arch_class == ArchitectureClass.DENSE

    def test_allcaps_model_name(self):
        """Case-insensitive matching should work for all-caps names."""
        detected = detect_architecture("DEEPSEEK-AI/DEEPSEEK-R1-DISTILL-QWEN-7B")
        assert detected.reasoning_class == ReasoningClass.REASONING
        assert detected.arch_class == ArchitectureClass.DENSE  # distill = dense

    def test_single_expert_is_moe(self):
        """num_local_experts=1 is technically MoE (single expert)."""
        detected = detect_architecture(
            "custom/single-expert", config=self._llama_config(1),
        )
        # 1 expert still triggers MoE detection (the code treats any >0 as MoE)
        assert detected.is_moe
|
||||
@@ -0,0 +1,183 @@
|
||||
"""Tests for lightweight benchmark harnesses."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import torch
|
||||
|
||||
from obliteratus.evaluation.benchmarks import (
|
||||
KNOWLEDGE_ITEMS,
|
||||
TRUTHFULNESS_ITEMS,
|
||||
MATH_REASONING_ITEMS,
|
||||
BenchmarkRunner,
|
||||
BenchmarkResult,
|
||||
format_benchmark_report,
|
||||
)
|
||||
|
||||
|
||||
def _make_mock_model_and_tokenizer(vocab_size=1000, hidden_dim=64):
|
||||
"""Create mock model and tokenizer for benchmark testing."""
|
||||
model = MagicMock()
|
||||
|
||||
# Model returns logits when called
|
||||
def mock_forward(**kwargs):
|
||||
input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10)))
|
||||
batch_size, seq_len = input_ids.shape
|
||||
result = MagicMock()
|
||||
result.logits = torch.randn(batch_size, seq_len, vocab_size)
|
||||
return result
|
||||
|
||||
model.side_effect = mock_forward
|
||||
model.__call__ = mock_forward
|
||||
|
||||
# Model.generate returns token IDs
|
||||
def mock_generate(**kwargs):
|
||||
input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10)))
|
||||
# Append some "generated" tokens
|
||||
gen_tokens = torch.randint(0, vocab_size, (1, 20))
|
||||
return torch.cat([input_ids, gen_tokens], dim=1)
|
||||
|
||||
model.generate = mock_generate
|
||||
|
||||
# Model.parameters for device detection
|
||||
param = torch.nn.Parameter(torch.randn(1))
|
||||
model.parameters = MagicMock(return_value=iter([param]))
|
||||
|
||||
tokenizer = MagicMock()
|
||||
tokenizer.return_value = {
|
||||
"input_ids": torch.randint(0, vocab_size, (1, 15)),
|
||||
"attention_mask": torch.ones(1, 15, dtype=torch.long),
|
||||
}
|
||||
tokenizer.side_effect = lambda text, **kwargs: {
|
||||
"input_ids": torch.randint(0, vocab_size, (1, 15)),
|
||||
"attention_mask": torch.ones(1, 15, dtype=torch.long),
|
||||
}
|
||||
|
||||
def mock_decode(ids, **kwargs):
|
||||
return "The answer is 42. This is a generated response about the topic."
|
||||
|
||||
def mock_encode(text, **kwargs):
|
||||
# Return different IDs for A, B, C, D
|
||||
if text == "A":
|
||||
return [65]
|
||||
elif text == "B":
|
||||
return [66]
|
||||
elif text == "C":
|
||||
return [67]
|
||||
elif text == "D":
|
||||
return [68]
|
||||
return [hash(text) % vocab_size]
|
||||
|
||||
tokenizer.decode = mock_decode
|
||||
tokenizer.encode = mock_encode
|
||||
|
||||
return model, tokenizer
|
||||
|
||||
|
||||
class TestBenchmarkItems:
    """Sanity checks on the static benchmark item tables."""

    def test_knowledge_items_have_required_fields(self):
        """Each knowledge item has its keys and an in-range answer index."""
        for entry in KNOWLEDGE_ITEMS:
            for field in ("q", "choices", "answer", "category"):
                assert field in entry
            assert 0 <= entry["answer"] < len(entry["choices"])

    def test_knowledge_items_count(self):
        """At least 20 knowledge items are defined."""
        assert len(KNOWLEDGE_ITEMS) >= 20

    def test_knowledge_categories(self):
        """Knowledge items span at least four distinct categories."""
        distinct = {entry["category"] for entry in KNOWLEDGE_ITEMS}
        assert len(distinct) >= 4  # multiple categories

    def test_truthfulness_items_have_required_fields(self):
        """Each truthfulness item carries all four expected keys."""
        for entry in TRUTHFULNESS_ITEMS:
            for field in ("q", "true_answer", "common_false", "category"):
                assert field in entry

    def test_truthfulness_items_count(self):
        """At least 10 truthfulness items are defined."""
        assert len(TRUTHFULNESS_ITEMS) >= 10

    def test_math_items_have_required_fields(self):
        """Each math item has its keys and a numeric answer."""
        for entry in MATH_REASONING_ITEMS:
            for field in ("q", "answer", "category"):
                assert field in entry
            assert isinstance(entry["answer"], (int, float))

    def test_math_items_count(self):
        """At least 10 math reasoning items are defined."""
        assert len(MATH_REASONING_ITEMS) >= 10
|
||||
|
||||
|
||||
class TestBenchmarkRunner:
    """Exercise BenchmarkRunner against the mock model/tokenizer pair."""

    @staticmethod
    def _cpu_runner():
        """Return a BenchmarkRunner wired to fresh mocks on the CPU device."""
        mock_model, mock_tokenizer = _make_mock_model_and_tokenizer()
        return BenchmarkRunner(mock_model, mock_tokenizer, device="cpu")

    def test_knowledge_probe_returns_result(self):
        """The knowledge probe yields a populated, bounded BenchmarkResult."""
        outcome = self._cpu_runner().run_knowledge_probe()

        assert isinstance(outcome, BenchmarkResult)
        assert outcome.benchmark_name == "knowledge_probe"
        assert 0 <= outcome.score <= 1.0
        assert outcome.n_total == len(KNOWLEDGE_ITEMS)
        assert outcome.n_correct >= 0
        assert len(outcome.per_category) > 0

    def test_truthfulness_probe_returns_result(self):
        """The truthfulness probe yields a bounded BenchmarkResult."""
        outcome = self._cpu_runner().run_truthfulness_probe()

        assert isinstance(outcome, BenchmarkResult)
        assert outcome.benchmark_name == "truthfulness_probe"
        assert 0 <= outcome.score <= 1.0
        assert outcome.n_total == len(TRUTHFULNESS_ITEMS)

    def test_math_probe_returns_result(self):
        """The math probe yields a bounded BenchmarkResult."""
        outcome = self._cpu_runner().run_math_reasoning_probe()

        assert isinstance(outcome, BenchmarkResult)
        assert outcome.benchmark_name == "math_reasoning_probe"
        assert 0 <= outcome.score <= 1.0
        assert outcome.n_total == len(MATH_REASONING_ITEMS)

    def test_run_all(self):
        """run_all() reports all three benchmark keys."""
        all_results = self._cpu_runner().run_all()

        for key in ("knowledge", "truthfulness", "math_reasoning"):
            assert key in all_results

    def test_format_report(self):
        """The formatted report mentions the header and each benchmark."""
        all_results = self._cpu_runner().run_all()
        text = format_benchmark_report(all_results)

        for fragment in ("Capability", "knowledge", "truthfulness", "math"):
            assert fragment in text

    def test_per_category_scores_bounded(self):
        """Every per-category knowledge score lies in [0, 1]."""
        outcome = self._cpu_runner().run_knowledge_probe()

        for _category, category_score in outcome.per_category.items():
            assert 0 <= category_score <= 1.0

    def test_extract_number(self):
        """_extract_number parses the number from text, or returns None."""
        runner = self._cpu_runner()

        parsed_cases = (
            ("The answer is 42.", 42.0),
            ("$20.50 is the price", 20.50),
            ("Result: -3.14", -3.14),
        )
        for text, expected in parsed_cases:
            assert runner._extract_number(text) == expected
        assert runner._extract_number("No numbers here") is None
|
||||
@@ -0,0 +1,535 @@
|
||||
"""Tests for causal tracing, residual stream decomposition,
|
||||
probing classifiers, and cross-model transfer analysis."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
|
||||
from obliteratus.analysis.causal_tracing import (
|
||||
CausalRefusalTracer,
|
||||
CausalTracingResult,
|
||||
ComponentCausalEffect,
|
||||
)
|
||||
from obliteratus.analysis.residual_stream import (
|
||||
ResidualStreamDecomposer,
|
||||
ResidualStreamResult,
|
||||
LayerDecomposition,
|
||||
)
|
||||
from obliteratus.analysis.probing_classifiers import (
|
||||
LinearRefusalProbe,
|
||||
ProbeResult,
|
||||
ProbingSuiteResult,
|
||||
)
|
||||
from obliteratus.analysis.cross_model_transfer import (
|
||||
TransferAnalyzer,
|
||||
CrossModelResult,
|
||||
CrossCategoryResult,
|
||||
CrossLayerResult,
|
||||
UniversalityReport,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_layer_activations(
|
||||
n_layers=8, hidden_dim=32, refusal_strength=2.0,
|
||||
):
|
||||
"""Create synthetic per-layer activations with planted refusal signal."""
|
||||
torch.manual_seed(42)
|
||||
directions = {}
|
||||
activations = {}
|
||||
|
||||
base = torch.randn(hidden_dim) * 0.1
|
||||
|
||||
for i in range(n_layers):
|
||||
d = torch.randn(hidden_dim)
|
||||
d = d / d.norm()
|
||||
directions[i] = d
|
||||
|
||||
# Stronger refusal in middle layers
|
||||
strength = refusal_strength if 2 <= i <= 5 else 0.3
|
||||
activations[i] = base + strength * d + torch.randn(hidden_dim) * 0.05
|
||||
|
||||
return activations, directions
|
||||
|
||||
|
||||
def _make_separable_activations(
|
||||
n_per_class=20, hidden_dim=16, separation=3.0, seed=42,
|
||||
):
|
||||
"""Create harmful/harmless activations that are linearly separable."""
|
||||
torch.manual_seed(seed)
|
||||
direction = torch.randn(hidden_dim)
|
||||
direction = direction / direction.norm()
|
||||
|
||||
harmful = [
|
||||
torch.randn(hidden_dim) * 0.5 + separation * direction
|
||||
for _ in range(n_per_class)
|
||||
]
|
||||
harmless = [
|
||||
torch.randn(hidden_dim) * 0.5 - separation * direction
|
||||
for _ in range(n_per_class)
|
||||
]
|
||||
return harmful, harmless, direction
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Causal Tracing
|
||||
# ===========================================================================
|
||||
|
||||
class TestCausalTracing:
    """Tests for CausalRefusalTracer on synthetic per-layer activations."""

    @staticmethod
    def _trace_with_defaults():
        """Trace the default synthetic activations; return ``(tracer, result)``."""
        layer_acts, layer_dirs = _make_layer_activations()
        tracer = CausalRefusalTracer()
        return tracer, tracer.trace_from_activations(layer_acts, layer_dirs)

    def test_basic_tracing(self):
        """Tracing 8 layers gives one effect per layer and positive clean strength."""
        layer_acts, layer_dirs = _make_layer_activations()
        traced = CausalRefusalTracer(noise_level=3.0).trace_from_activations(
            layer_acts, layer_dirs,
        )

        assert isinstance(traced, CausalTracingResult)
        assert traced.n_layers == 8
        assert traced.clean_refusal_strength > 0
        assert len(traced.component_effects) == 8

    def test_causal_components_identified(self):
        """With a 0.05 causal threshold the circuit is non-empty."""
        layer_acts, layer_dirs = _make_layer_activations()
        tracer = CausalRefusalTracer(noise_level=3.0, causal_threshold=0.05)
        traced = tracer.trace_from_activations(layer_acts, layer_dirs)

        assert traced.circuit_size > 0
        assert traced.circuit_fraction > 0
        assert len(traced.causal_components) > 0

    def test_corruption_reduces_strength(self):
        """High noise yields a non-zero total corruption effect."""
        layer_acts, layer_dirs = _make_layer_activations(refusal_strength=5.0)
        traced = CausalRefusalTracer(noise_level=10.0).trace_from_activations(
            layer_acts, layer_dirs,
        )

        # With high noise, corrupted should differ from clean
        assert traced.total_corruption_effect != 0

    def test_single_direction_input(self):
        """A single direction tensor is accepted in place of a per-layer dict."""
        layer_acts, layer_dirs = _make_layer_activations()
        shared_direction = layer_dirs[3]  # Use one direction for all layers
        traced = CausalRefusalTracer().trace_from_activations(
            layer_acts, shared_direction,
        )

        assert traced.n_layers == 8
        assert len(traced.component_effects) == 8

    def test_component_effects_structure(self):
        """Default effects are full-layer entries with non-negative causal effect."""
        _, traced = self._trace_with_defaults()

        for effect in traced.component_effects:
            assert isinstance(effect, ComponentCausalEffect)
            assert effect.component_type == "full_layer"
            assert effect.causal_effect >= 0

    def test_correlation_causal_agreement_bounded(self):
        """The correlation/causal agreement score lies in [-1, 1]."""
        _, traced = self._trace_with_defaults()
        assert -1.0 <= traced.correlation_causal_agreement <= 1.0

    def test_silent_contributors(self):
        """identify_silent_contributors returns both keys and respects top_k."""
        tracer, traced = self._trace_with_defaults()
        summary = tracer.identify_silent_contributors(traced, top_k=3)

        assert "silent_contributors" in summary
        assert "loud_non_contributors" in summary
        assert len(summary["silent_contributors"]) <= 3

    def test_custom_component_types(self):
        """Two component types over 8 layers produce 16 effects."""
        layer_acts, layer_dirs = _make_layer_activations()
        traced = CausalRefusalTracer().trace_from_activations(
            layer_acts,
            layer_dirs,
            component_types=["attention", "mlp"],
        )
        # 8 layers * 2 types = 16 effects
        assert len(traced.component_effects) == 16

    def test_format_report(self):
        """The formatted tracing report contains its headline strings."""
        _, traced = self._trace_with_defaults()
        text = CausalRefusalTracer.format_tracing_report(traced)

        for fragment in ("Causal Tracing", "Circuit size"):
            assert fragment in text
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Residual Stream Decomposition
|
||||
# ===========================================================================
|
||||
|
||||
class TestResidualStreamDecomposition:
    """Tests for ResidualStreamDecomposer on synthetic per-layer activations."""

    @staticmethod
    def _decompose_defaults(**decomposer_kwargs):
        """Decompose the default synthetic activations with the given decomposer options."""
        layer_acts, layer_dirs = _make_layer_activations()
        decomposer = ResidualStreamDecomposer(**decomposer_kwargs)
        return decomposer.decompose(layer_acts, layer_dirs)

    def test_basic_decomposition(self):
        """Eight layers decompose into positive attention and MLP contributions."""
        outcome = self._decompose_defaults()

        assert isinstance(outcome, ResidualStreamResult)
        assert outcome.n_layers == 8
        assert len(outcome.per_layer) == 8
        assert outcome.total_attention_contribution > 0
        assert outcome.total_mlp_contribution > 0

    def test_attention_fraction_bounded(self):
        """The attention fraction lies in [0, 1]."""
        outcome = self._decompose_defaults()
        assert 0 <= outcome.attention_fraction <= 1.0

    def test_with_head_count(self):
        """Supplying a head count yields a non-empty refusal_heads collection."""
        outcome = self._decompose_defaults(n_heads_per_layer=4)

        assert outcome.n_refusal_heads >= 0
        assert len(outcome.refusal_heads) > 0

    def test_layer_decomposition_structure(self):
        """Each per-layer entry is a LayerDecomposition with bounded fields."""
        outcome = self._decompose_defaults()

        for _layer_idx, layer_part in outcome.per_layer.items():
            assert isinstance(layer_part, LayerDecomposition)
            assert 0 <= layer_part.attn_mlp_ratio <= 1.0
            assert layer_part.cumulative_refusal >= 0

    def test_accumulation_profile(self):
        """The accumulation profile has 8 entries and never decreases."""
        profile = self._decompose_defaults().accumulation_profile

        assert len(profile) == 8
        # Accumulation should be monotonically non-decreasing
        for idx in range(1, len(profile)):
            assert profile[idx] >= profile[idx - 1]

    def test_with_explicit_attn_mlp(self):
        """Test with provided attention and MLP outputs."""
        torch.manual_seed(42)
        hidden_dim = 16
        n_layers = 4
        raw_direction = torch.randn(hidden_dim)
        ref_dir = raw_direction / raw_direction.norm()

        acts, attn_outs, mlp_outs = {}, {}, {}
        for layer in range(n_layers):
            attn_part = torch.randn(hidden_dim) * 0.5
            mlp_part = torch.randn(hidden_dim) * 0.5
            attn_outs[layer] = attn_part
            mlp_outs[layer] = mlp_part
            # Layer 0 starts from small noise; later layers build on the
            # previous layer's activation.
            if layer == 0:
                previous = torch.randn(hidden_dim) * 0.1
            else:
                previous = acts[layer - 1]
            acts[layer] = attn_part + mlp_part + previous

        outcome = ResidualStreamDecomposer().decompose(
            acts,
            ref_dir,
            attn_outputs=attn_outs,
            mlp_outputs=mlp_outs,
        )
        assert len(outcome.per_layer) == n_layers

    def test_single_direction(self):
        """A single (unnormalized) direction tensor is accepted for all layers."""
        layer_acts, _ = _make_layer_activations()
        lone_direction = torch.randn(32)
        outcome = ResidualStreamDecomposer().decompose(layer_acts, lone_direction)
        assert outcome.n_layers == 8

    def test_head_concentration_bounded(self):
        """Head concentration lies in [0, 1] when 8 heads per layer are given."""
        outcome = self._decompose_defaults(n_heads_per_layer=8)
        assert 0 <= outcome.head_concentration <= 1.0

    def test_format_decomposition(self):
        """The formatted decomposition report contains its headline strings."""
        outcome = self._decompose_defaults(n_heads_per_layer=4)
        text = ResidualStreamDecomposer.format_decomposition(outcome)

        for fragment in ("Residual Stream", "Attention", "MLP"):
            assert fragment in text
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Probing Classifiers
|
||||
# ===========================================================================
|
||||
|
||||
class TestProbingClassifiers:
    """Tests for LinearRefusalProbe on synthetic two-class activations."""

    def test_separable_data_high_accuracy(self):
        """With well-separated data, probe should achieve high accuracy."""
        pos, neg, planted = _make_separable_activations(
            n_per_class=30, separation=5.0,
        )
        outcome = LinearRefusalProbe(n_epochs=200).probe_layer(
            pos, neg, planted, layer_idx=5,
        )

        assert isinstance(outcome, ProbeResult)
        assert outcome.layer_idx == 5
        assert outcome.accuracy > 0.7  # Should be separable

    def test_inseparable_data_low_accuracy(self):
        """With overlapping data, probe should have lower accuracy."""
        pos, neg, planted = _make_separable_activations(
            n_per_class=30, separation=0.01,
        )
        outcome = LinearRefusalProbe(n_epochs=50).probe_layer(pos, neg, planted)
        # Accuracy should be near chance (0.5)
        assert outcome.accuracy < 0.9

    def test_learned_direction_unit(self):
        """The learned direction has unit norm to within 0.01."""
        pos, neg, planted = _make_separable_activations()
        outcome = LinearRefusalProbe(n_epochs=100).probe_layer(pos, neg, planted)
        norm_error = abs(outcome.learned_direction.norm().item() - 1.0)
        assert norm_error < 0.01

    def test_cosine_with_analytical(self):
        """Learned direction should align with analytical direction."""
        pos, neg, planted = _make_separable_activations(
            n_per_class=50, separation=5.0,
        )
        outcome = LinearRefusalProbe(n_epochs=300).probe_layer(pos, neg, planted)
        # With clear separation, learned direction should agree
        assert outcome.cosine_with_analytical > 0.3

    def test_without_analytical_direction(self):
        """Without an analytical direction the reported cosine is exactly 0.0."""
        pos, neg, _ = _make_separable_activations()
        outcome = LinearRefusalProbe(n_epochs=50).probe_layer(pos, neg)
        assert outcome.cosine_with_analytical == 0.0

    def test_auroc_bounded(self):
        """AUROC lies in [0, 1]."""
        pos, neg, planted = _make_separable_activations()
        outcome = LinearRefusalProbe(n_epochs=100).probe_layer(pos, neg, planted)
        assert 0 <= outcome.auroc <= 1.0

    def test_mutual_information_nonnegative(self):
        """Mutual information is never negative."""
        pos, neg, planted = _make_separable_activations()
        outcome = LinearRefusalProbe(n_epochs=100).probe_layer(pos, neg, planted)
        assert outcome.mutual_information >= 0

    def test_probe_all_layers(self):
        """Probing six layers returns a suite result with one entry per layer."""
        harmful_by_layer, harmless_by_layer, planted_by_layer = {}, {}, {}
        for layer in range(6):
            pos, neg, planted = _make_separable_activations(
                n_per_class=15, separation=3.0, seed=layer * 10,
            )
            harmful_by_layer[layer] = pos
            harmless_by_layer[layer] = neg
            planted_by_layer[layer] = planted

        suite = LinearRefusalProbe(n_epochs=100).probe_all_layers(
            harmful_by_layer, harmless_by_layer, planted_by_layer,
        )

        assert isinstance(suite, ProbingSuiteResult)
        assert len(suite.per_layer) == 6
        assert suite.best_accuracy > 0
        assert suite.total_mutual_information >= 0

    def test_format_report(self):
        """The formatted probing report names the method and mentions accuracy."""
        harmful_by_layer, harmless_by_layer = {}, {}
        for layer in range(4):
            pos, neg, _ = _make_separable_activations(n_per_class=15, seed=layer)
            harmful_by_layer[layer] = pos
            harmless_by_layer[layer] = neg

        suite = LinearRefusalProbe(n_epochs=50).probe_all_layers(
            harmful_by_layer, harmless_by_layer,
        )
        text = LinearRefusalProbe.format_probing_report(suite)

        assert "Linear Probing" in text
        assert "accuracy" in text.lower()

    def test_cross_entropy_finite(self):
        """Cross-entropy is a finite number."""
        pos, neg, planted = _make_separable_activations()
        outcome = LinearRefusalProbe(n_epochs=100).probe_layer(pos, neg, planted)
        assert math.isfinite(outcome.cross_entropy)
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Cross-Model Transfer Analysis
|
||||
# ===========================================================================
|
||||
|
||||
class TestTransferAnalysis:
|
||||
def test_cross_model_identical(self):
|
||||
"""Identical directions should give perfect transfer."""
|
||||
torch.manual_seed(42)
|
||||
dirs = {i: torch.randn(32) for i in range(8)}
|
||||
analyzer = TransferAnalyzer()
|
||||
result = analyzer.analyze_cross_model(dirs, dirs, "model_a", "model_a")
|
||||
|
||||
assert isinstance(result, CrossModelResult)
|
||||
assert result.mean_transfer_score > 0.99
|
||||
|
||||
def test_cross_model_random(self):
|
||||
"""Random directions should give low transfer."""
|
||||
torch.manual_seed(42)
|
||||
dirs_a = {i: torch.randn(32) for i in range(8)}
|
||||
torch.manual_seed(99)
|
||||
dirs_b = {i: torch.randn(32) for i in range(8)}
|
||||
|
||||
analyzer = TransferAnalyzer()
|
||||
result = analyzer.analyze_cross_model(dirs_a, dirs_b, "a", "b")
|
||||
# Random 32-dim vectors have low expected cosine
|
||||
assert result.mean_transfer_score < 0.7
|
||||
|
||||
def test_cross_model_structure(self):
|
||||
torch.manual_seed(42)
|
||||
dirs_a = {i: torch.randn(32) for i in range(8)}
|
||||
dirs_b = {i: torch.randn(32) for i in range(8)}
|
||||
analyzer = TransferAnalyzer()
|
||||
result = analyzer.analyze_cross_model(dirs_a, dirs_b)
|
||||
|
||||
assert 0 <= result.transfer_above_threshold <= 1.0
|
||||
assert len(result.per_layer_transfer) == 8
|
||||
|
||||
def test_cross_category_similar(self):
|
||||
"""Similar categories should cluster together."""
|
||||
torch.manual_seed(42)
|
||||
shared = torch.randn(32)
|
||||
shared = shared / shared.norm()
|
||||
|
||||
cat_dirs = {}
|
||||
for cat in ["weapons", "bombs", "explosives"]:
|
||||
d = shared + 0.2 * torch.randn(32)
|
||||
cat_dirs[cat] = d / d.norm()
|
||||
|
||||
# Add one very different category
|
||||
cat_dirs["fraud"] = torch.randn(32)
|
||||
|
||||
analyzer = TransferAnalyzer()
|
||||
result = analyzer.analyze_cross_category(cat_dirs)
|
||||
|
||||
assert isinstance(result, CrossCategoryResult)
|
||||
assert result.mean_cross_category_transfer > 0
|
||||
assert len(result.categories) == 4
|
||||
|
||||
def test_cross_category_specificity(self):
|
||||
torch.manual_seed(42)
|
||||
cat_dirs = {f"cat_{i}": torch.randn(16) for i in range(5)}
|
||||
analyzer = TransferAnalyzer()
|
||||
result = analyzer.analyze_cross_category(cat_dirs)
|
||||
|
||||
assert result.most_universal_category != ""
|
||||
assert result.most_specific_category != ""
|
||||
assert len(result.category_clusters) > 0
|
||||
|
||||
def test_cross_layer(self):
|
||||
_, directions = _make_layer_activations()
|
||||
analyzer = TransferAnalyzer()
|
||||
result = analyzer.analyze_cross_layer(directions)
|
||||
|
||||
assert isinstance(result, CrossLayerResult)
|
||||
assert result.mean_adjacent_transfer >= 0
|
||||
assert result.transfer_decay_rate >= 0
|
||||
|
||||
def test_cross_layer_adjacent_vs_distant(self):
|
||||
"""Adjacent layers typically have higher transfer than distant ones."""
|
||||
torch.manual_seed(42)
|
||||
# Create directions with gradual drift
|
||||
d = torch.randn(32)
|
||||
d = d / d.norm()
|
||||
directions = {}
|
||||
for i in range(10):
|
||||
noise = torch.randn(32) * 0.1 * i
|
||||
di = d + noise
|
||||
directions[i] = di / di.norm()
|
||||
|
||||
analyzer = TransferAnalyzer()
|
||||
result = analyzer.analyze_cross_layer(directions)
|
||||
# Adjacent should have higher transfer than distant
|
||||
assert result.mean_adjacent_transfer >= result.mean_distant_transfer - 0.1
|
||||
|
||||
def test_universality_index(self):
|
||||
torch.manual_seed(42)
|
||||
dirs = {i: torch.randn(32) for i in range(6)}
|
||||
|
||||
analyzer = TransferAnalyzer()
|
||||
cross_model = analyzer.analyze_cross_model(dirs, dirs)
|
||||
cross_layer = analyzer.analyze_cross_layer(dirs)
|
||||
cat_dirs = {f"cat_{i}": torch.randn(32) for i in range(4)}
|
||||
cross_cat = analyzer.analyze_cross_category(cat_dirs)
|
||||
|
||||
report = analyzer.compute_universality_index(
|
||||
cross_model=cross_model,
|
||||
cross_category=cross_cat,
|
||||
cross_layer=cross_layer,
|
||||
)
|
||||
|
||||
assert isinstance(report, UniversalityReport)
|
||||
assert 0 <= report.universality_index <= 1.0
|
||||
|
||||
def test_universality_empty(self):
|
||||
analyzer = TransferAnalyzer()
|
||||
report = analyzer.compute_universality_index()
|
||||
assert report.universality_index == 0.0
|
||||
|
||||
def test_format_cross_model(self):
|
||||
torch.manual_seed(42)
|
||||
dirs = {i: torch.randn(32) for i in range(4)}
|
||||
analyzer = TransferAnalyzer()
|
||||
result = analyzer.analyze_cross_model(dirs, dirs, "llama", "mistral")
|
||||
report = TransferAnalyzer.format_cross_model(result)
|
||||
assert "Cross-Model" in report
|
||||
assert "llama" in report
|
||||
|
||||
def test_format_cross_category(self):
|
||||
torch.manual_seed(42)
|
||||
cat_dirs = {f"cat_{i}": torch.randn(16) for i in range(3)}
|
||||
analyzer = TransferAnalyzer()
|
||||
result = analyzer.analyze_cross_category(cat_dirs)
|
||||
report = TransferAnalyzer.format_cross_category(result)
|
||||
assert "Cross-Category" in report
|
||||
|
||||
def test_format_universality(self):
|
||||
analyzer = TransferAnalyzer()
|
||||
report_obj = analyzer.compute_universality_index()
|
||||
report = TransferAnalyzer.format_universality(report_obj)
|
||||
assert "Universality" in report
|
||||
|
||||
def test_dimension_mismatch_handled(self):
    """Directions of size 32 vs 64 still produce one transfer entry per layer.

    NOTE(review): the original docstring says the mismatch is handled by
    truncation; this test only checks the number of per-layer results.
    """
    narrow = {0: torch.randn(32), 1: torch.randn(32)}
    wide = {0: torch.randn(64), 1: torch.randn(64)}
    outcome = TransferAnalyzer().analyze_cross_model(narrow, wide)
    assert len(outcome.per_layer_transfer) == 2
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Integration
|
||||
# ===========================================================================
|
||||
|
||||
class TestNewImports:
    """Integration check that the analysis package exposes the new classes."""

    def test_all_new_modules_importable(self):
        """Each analyzer class can be imported from ``obliteratus.analysis``."""
        from obliteratus.analysis import (
            CausalRefusalTracer,
            LinearRefusalProbe,
            ResidualStreamDecomposer,
            TransferAnalyzer,
        )

        exported = (
            CausalRefusalTracer,
            ResidualStreamDecomposer,
            LinearRefusalProbe,
            TransferAnalyzer,
        )
        for symbol in exported:
            assert symbol is not None
|
||||
@@ -0,0 +1,133 @@
|
||||
"""CLI dispatch tests for obliteratus.cli.main().
|
||||
|
||||
These tests verify argument parsing and subcommand routing without
|
||||
downloading real models or running any pipeline. They use
|
||||
``unittest.mock.patch`` to capture stdout/stderr and
|
||||
``pytest.raises(SystemExit)`` for argparse exits.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from io import StringIO
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from obliteratus.cli import main
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _capture_exit(argv: list[str] | None, *, expect_code: int | None = None):
    """Run ``main(argv)`` expecting ``SystemExit`` and return the stderr text.

    ``sys.stderr`` is replaced by an in-memory buffer while ``main`` runs.
    When ``expect_code`` is given, the exit code must equal it.
    """
    captured = StringIO()
    with pytest.raises(SystemExit) as raised:
        with patch("sys.stderr", captured):
            main(argv)
    if expect_code is None:
        return captured.getvalue()
    assert raised.value.code == expect_code
    return captured.getvalue()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCLIDispatch:
    """Argument-parsing and subcommand-routing checks for ``main``."""

    # 1. No args -> prints help / exits with error
    def test_main_no_args_prints_help(self):
        """An empty argv exits with status 2 and writes usage text to stderr."""
        err = _capture_exit([], expect_code=2).lower()
        # argparse prints usage info to stderr on error
        assert "usage" in err or "required" in err

    # 2. models command lists models without error
    def test_models_command(self):
        """``models`` runs to completion and prints through the console."""
        with patch("obliteratus.cli.console") as fake_console:
            main(["models"])
        # console.print is called at least once to render the table
        assert fake_console.print.call_count >= 1

    # 3. obliterate without model arg -> error
    def test_obliterate_requires_model(self):
        """``obliterate`` with no model argument exits with status 2."""
        err = _capture_exit(["obliterate"], expect_code=2).lower()
        assert "model" in err or "required" in err

    # 4. obliterate --method accepts valid methods
    def test_obliterate_valid_methods(self):
        """Each of the 9 pipeline method names is accepted by ``--method``."""
        methods = (
            "basic", "advanced", "aggressive", "spectral_cascade",
            "informed", "surgical", "optimized", "inverted", "nuclear",
        )
        for name in methods:
            # Patch the actual pipeline execution so nothing runs
            with patch("obliteratus.cli._cmd_abliterate") as handler:
                main(["obliterate", "fake/model", "--method", name])
            handler.assert_called_once()
            positional, _ = handler.call_args
            assert positional[0].method == name

    # 4b. invalid methods are rejected
    def test_obliterate_rejects_invalid_method(self):
        """An unknown ``--method`` value exits 2 with an 'invalid choice' error."""
        argv = ["obliterate", "fake/model", "--method", "nonexistent"]
        err = _capture_exit(argv, expect_code=2)
        assert "invalid choice" in err.lower()

    # 5. run requires config path
    def test_run_requires_config(self):
        """``run`` with no config path exits with status 2."""
        err = _capture_exit(["run"], expect_code=2).lower()
        assert "config" in err or "required" in err

    # 6. aggregate with nonexistent dir handles gracefully
    def test_aggregate_command_missing_dir(self):
        """``aggregate`` on a nonexistent directory returns instead of raising."""
        with patch("obliteratus.cli.console") as fake_console:
            main(["aggregate", "--dir", "/nonexistent/path/to/nowhere"])
        # The command prints a message about no contributions found and returns
        rendered = " ".join(map(str, fake_console.print.call_args_list))
        # NOTE(review): the second operand makes this pass whenever anything
        # at all was printed; confirm whether the message should be required.
        assert "no contributions found" in rendered.lower() or fake_console.print.called

    # 7. --help flag prints help
    def test_help_flag(self):
        """``--help`` exits with status 0 and writes help text to stdout."""
        out_stream = StringIO()
        with pytest.raises(SystemExit) as raised:
            with patch("sys.stdout", out_stream):
                main(["--help"])
        assert raised.value.code == 0
        help_text = out_stream.getvalue().lower()
        assert "obliteratus" in help_text or "usage" in help_text

    # 8. interactive subcommand is registered
    def test_interactive_command_exists(self):
        """``interactive`` is routed to ``_cmd_interactive`` exactly once."""
        with patch("obliteratus.cli._cmd_interactive") as handler:
            main(["interactive"])
        handler.assert_called_once()

    # 9. --contribute and --contribute-notes are accepted on obliterate
    def test_contribute_flags_on_obliterate(self):
        """Both contribution flags reach the handler on the parsed args."""
        argv = [
            "obliterate", "fake/model",
            "--contribute",
            "--contribute-notes", "Testing contribution system",
        ]
        with patch("obliteratus.cli._cmd_abliterate") as handler:
            main(argv)
        handler.assert_called_once()
        positional, _ = handler.call_args
        parsed = positional[0]
        assert parsed.contribute is True
        assert parsed.contribute_notes == "Testing contribution system"
|
||||
@@ -0,0 +1,567 @@
|
||||
"""Tests for the community contribution system."""
|
||||
|
||||
import json
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from obliteratus.community import (
|
||||
CONTRIBUTION_SCHEMA_VERSION,
|
||||
_config_fingerprint,
|
||||
_model_short_name,
|
||||
aggregate_results,
|
||||
generate_latex_table,
|
||||
load_contributions,
|
||||
save_contribution,
|
||||
)
|
||||
|
||||
|
||||
# ── Helper: mock pipeline ──────────────────────────────────────────────
|
||||
|
||||
|
||||
def _make_mock_pipeline():
    """Build a mock pipeline with all fields the community module reads.

    Returns a ``MagicMock`` populated with a model summary, method
    configuration flags, quality metrics, stage timings, direction tensors,
    excise details and prompt lists.

    The random tensors come from a locally seeded ``torch.Generator`` so the
    fixture is identical on every call and does not disturb the global RNG.
    """
    p = MagicMock()
    p.handle.summary.return_value = {
        "architecture": "LlamaForCausalLM",
        "num_layers": 32,
        "num_heads": 32,
        "hidden_size": 4096,
        "total_params": 8_000_000_000,
    }

    # Method configuration
    p.method = "advanced"
    p.n_directions = 4
    p.norm_preserve = True
    p.regularization = 0.3
    p.refinement_passes = 2
    p.project_biases = True
    p.use_chat_template = True
    p.use_whitened_svd = True
    p.true_iterative_refinement = False
    p.use_jailbreak_contrast = False
    p.layer_adaptive_strength = False
    p.attention_head_surgery = True
    p.safety_neuron_masking = False
    p.per_expert_directions = False
    p.use_sae_features = False
    p.invert_refusal = False
    p.project_embeddings = False
    p.embed_regularization = 0.5
    p.activation_steering = False
    p.steering_strength = 0.3
    p.expert_transplant = False
    p.transplant_blend = 0.3
    p.reflection_strength = 2.0
    p.quantization = None

    # Run results
    p._quality_metrics = {"perplexity": 5.2, "coherence": 0.8, "refusal_rate": 0.05}
    p._strong_layers = [10, 11, 12, 13]
    p._stage_durations = {
        "summon": 3.0, "probe": 12.5, "distill": 4.1,
        "excise": 2.0, "verify": 8.3, "rebirth": 5.0,
    }
    p._excise_modified_count = 128

    # Direction data (seeded locally for reproducibility)
    gen = torch.Generator().manual_seed(0)
    d = torch.randn(4096, generator=gen)
    d = d / d.norm()
    p.refusal_directions = {
        10: d,
        11: d + 0.01 * torch.randn(4096, generator=gen),
    }
    p.refusal_subspaces = {10: torch.randn(4, 4096, generator=gen)}

    # Excise details
    p._refusal_heads = {10: [(0, 0.9), (3, 0.8)]}
    p._sae_directions = {}
    p._expert_safety_scores = {}
    p._layer_excise_weights = {}
    p._expert_directions = {}
    p._steering_hooks = []

    # Prompts
    p.harmful_prompts = ["x"] * 33
    p.harmless_prompts = ["y"] * 33
    p.jailbreak_prompts = None

    return p
|
||||
|
||||
|
||||
# ── Model short name ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestModelShortName:
    """Expected outputs of ``_model_short_name`` for representative inputs."""

    def test_strips_org_prefix(self):
        """The ``org/`` prefix is dropped and the rest is lower-cased."""
        short = _model_short_name("meta-llama/Llama-2-7b-chat-hf")
        assert short == "llama-2-7b-chat-hf"

    def test_no_org_prefix(self):
        """A bare name without a slash is returned unchanged."""
        short = _model_short_name("gpt2")
        assert short == "gpt2"

    def test_sanitizes_special_chars(self):
        """Underscores and dots come back as dashes."""
        short = _model_short_name("org/Model_V2.1")
        assert short == "model-v2-1"

    def test_caps_length(self):
        """A 100-character name is shortened to at most 60 characters."""
        oversized = "a" * 100
        short = _model_short_name(oversized)
        assert len(short) <= 60

    def test_collapses_dashes(self):
        """A run of dashes becomes a single dash."""
        short = _model_short_name("org/Model---Name")
        assert short == "model-name"

    def test_strips_trailing_dashes(self):
        """A trailing dash is removed."""
        short = _model_short_name("org/Model-")
        assert short == "model"
|
||||
|
||||
|
||||
# ── Config fingerprint ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestConfigFingerprint:
    """Properties expected of ``_config_fingerprint``."""

    def test_deterministic(self):
        """Fingerprinting the same dict twice gives the same value."""
        cfg = {"n_directions": 4, "norm_preserve": True}
        assert _config_fingerprint(cfg) == _config_fingerprint(cfg)

    def test_different_configs_different_hashes(self):
        """Configs that differ in a value get different fingerprints."""
        four = _config_fingerprint({"n_directions": 4})
        eight = _config_fingerprint({"n_directions": 8})
        assert four != eight

    def test_key_order_invariant(self):
        """Insertion order of the keys does not affect the fingerprint."""
        forward = _config_fingerprint({"a": 1, "b": 2})
        backward = _config_fingerprint({"b": 2, "a": 1})
        assert forward == backward

    def test_returns_8_char_hex(self):
        """The fingerprint is 8 characters, all lowercase hex digits."""
        digest = _config_fingerprint({"test": True})
        assert len(digest) == 8
        assert set(digest) <= set("0123456789abcdef")
|
||||
|
||||
|
||||
# ── Save contribution ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestSaveContribution:
    """Checks on the file written by ``save_contribution``."""

    @staticmethod
    def _save(output_dir, model_name="test/model", **extra):
        """Save one contribution from a fresh mock pipeline; return its path."""
        return save_contribution(
            _make_mock_pipeline(),
            model_name=model_name,
            output_dir=output_dir,
            **extra,
        )

    @staticmethod
    def _read(path):
        """Parse the JSON document stored at ``path``."""
        return json.loads(path.read_text())

    def test_saves_json_file(self, tmp_path):
        """A ``.json`` file is written with the schema version and model name."""
        written = self._save(tmp_path, "meta-llama/Llama-2-7b-chat-hf")
        assert written.exists()
        assert written.suffix == ".json"
        payload = self._read(written)
        assert payload["contribution_schema_version"] == CONTRIBUTION_SCHEMA_VERSION
        assert payload["model_name"] == "meta-llama/Llama-2-7b-chat-hf"

    def test_filename_format(self, tmp_path):
        """The file stem starts with the short model name and the method."""
        written = self._save(tmp_path, "meta-llama/Llama-2-7b-chat-hf")
        assert written.stem.startswith("llama-2-7b-chat-hf_advanced_")

    def test_includes_telemetry_report(self, tmp_path):
        """The record embeds a telemetry section reflecting the pipeline."""
        written = self._save(tmp_path, "meta-llama/Llama-2-7b-chat-hf")
        report = self._read(written)["telemetry"]
        assert report["schema_version"] == 2
        assert report["model"]["architecture"] == "LlamaForCausalLM"
        assert report["method"] == "advanced"
        assert report["quality_metrics"]["refusal_rate"] == 0.05

    def test_includes_config_fingerprint(self, tmp_path):
        """The record carries an 8-character ``config_fingerprint``."""
        written = self._save(tmp_path, "meta-llama/Llama-2-7b-chat-hf")
        payload = self._read(written)
        assert "config_fingerprint" in payload
        assert len(payload["config_fingerprint"]) == 8

    def test_includes_notes(self, tmp_path):
        """Free-text notes are stored verbatim."""
        written = self._save(tmp_path, notes="Ran on A100 with default prompts")
        assert self._read(written)["notes"] == "Ran on A100 with default prompts"

    def test_creates_output_dir(self, tmp_path):
        """A missing, nested output directory is created on save."""
        target = tmp_path / "nested" / "dir"
        assert not target.exists()
        written = self._save(target)
        assert target.exists()
        assert written.exists()

    def test_timestamp_format(self, tmp_path):
        """The timestamp is 16 characters, contains ``T`` and ends in ``Z``."""
        stamp = self._read(self._save(tmp_path))["timestamp"]
        # Should be UTC ISO-ish: YYYYMMDDTHHMMSSZ
        assert stamp.endswith("Z")
        assert "T" in stamp
        assert len(stamp) == 16

    def test_method_config_extracted(self, tmp_path):
        """Pipeline flags are copied into ``telemetry.method_config``."""
        method_cfg = self._read(self._save(tmp_path))["telemetry"]["method_config"]
        assert method_cfg["n_directions"] == 4
        assert method_cfg["norm_preserve"] is True
        assert method_cfg["attention_head_surgery"] is True
|
||||
|
||||
|
||||
# ── Load contributions ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestLoadContributions:
    """Behaviour of ``load_contributions`` over a directory of files."""

    def _write_contrib(self, directory, model, method, refusal_rate, idx=0):
        """Write a minimal valid contribution file and return its path.

        ``idx`` determines both the file name and the time part of the
        timestamp, so a larger ``idx`` gives a later timestamp.
        """
        telemetry = {
            "schema_version": 2,
            "method": method,
            "quality_metrics": {"refusal_rate": refusal_rate},
        }
        payload = {
            "contribution_schema_version": CONTRIBUTION_SCHEMA_VERSION,
            "timestamp": f"20260227T{idx:06d}Z",
            "model_name": model,
            "config_fingerprint": "abcd1234",
            "notes": "",
            "telemetry": telemetry,
        }
        target = directory / f"contrib_{idx}.json"
        target.write_text(json.dumps(payload))
        return target

    def test_loads_valid_files(self, tmp_path):
        """Two valid files produce two records."""
        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
        self._write_contrib(tmp_path, "test/model", "basic", 0.10, 1)
        loaded = load_contributions(tmp_path)
        assert len(loaded) == 2

    def test_sorts_by_timestamp(self, tmp_path):
        """Records come back in ascending timestamp order."""
        self._write_contrib(tmp_path, "model-b", "advanced", 0.05, 2)
        self._write_contrib(tmp_path, "model-a", "advanced", 0.10, 1)
        loaded = load_contributions(tmp_path)
        assert [rec["model_name"] for rec in loaded[:2]] == ["model-a", "model-b"]

    def test_skips_non_contribution_json(self, tmp_path):
        """JSON lacking ``contribution_schema_version`` is not loaded."""
        # Write a JSON file without contribution_schema_version
        (tmp_path / "random.json").write_text('{"foo": "bar"}')
        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
        loaded = load_contributions(tmp_path)
        assert len(loaded) == 1

    def test_skips_invalid_json(self, tmp_path):
        """A file that is not parseable JSON is not loaded."""
        (tmp_path / "bad.json").write_text("not valid json {{{")
        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
        loaded = load_contributions(tmp_path)
        assert len(loaded) == 1

    def test_returns_empty_for_missing_dir(self, tmp_path):
        """A directory that does not exist yields an empty list."""
        missing = tmp_path / "nonexistent"
        assert load_contributions(missing) == []

    def test_tracks_source_file(self, tmp_path):
        """Each record notes the file it was read from under ``_source_file``."""
        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
        first = load_contributions(tmp_path)[0]
        assert "_source_file" in first
        assert "contrib_0.json" in first["_source_file"]

    def test_ignores_non_json_files(self, tmp_path):
        """Files without a ``.json`` extension are not loaded."""
        (tmp_path / "readme.txt").write_text("some text")
        self._write_contrib(tmp_path, "test/model", "advanced", 0.05, 0)
        loaded = load_contributions(tmp_path)
        assert len(loaded) == 1
|
||||
|
||||
|
||||
# ── Aggregate results ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestAggregateResults:
    """Behaviour of ``aggregate_results`` over in-memory records.

    Computed means are compared with ``pytest.approx`` rather than ``==``:
    they come out of floating-point arithmetic, and the end-to-end test in
    this module already compares means with a tolerance.
    """

    def _make_record(self, model, method, refusal_rate, perplexity=None, coherence=None):
        """Build a minimal record; optional metrics are included only when given."""
        metrics = {"refusal_rate": refusal_rate}
        if perplexity is not None:
            metrics["perplexity"] = perplexity
        if coherence is not None:
            metrics["coherence"] = coherence
        return {
            "model_name": model,
            "telemetry": {
                "method": method,
                "quality_metrics": metrics,
            },
        }

    def test_single_record(self):
        """One record yields one model/method entry with ``n_runs == 1``."""
        records = [self._make_record("model-a", "advanced", 0.05)]
        result = aggregate_results(records)
        assert "model-a" in result
        assert "advanced" in result["model-a"]
        assert result["model-a"]["advanced"]["n_runs"] == 1
        assert result["model-a"]["advanced"]["refusal_rate"]["mean"] == pytest.approx(0.05)

    def test_multiple_runs_same_model_method(self):
        """Two runs of the same model/method are pooled into one entry."""
        records = [
            self._make_record("model-a", "advanced", 0.04),
            self._make_record("model-a", "advanced", 0.06),
        ]
        result = aggregate_results(records)
        stats = result["model-a"]["advanced"]
        assert stats["n_runs"] == 2
        # The mean of 0.04 and 0.06 is computed in floating point.
        assert stats["refusal_rate"]["mean"] == pytest.approx(0.05)
        assert stats["refusal_rate"]["min"] == 0.04
        assert stats["refusal_rate"]["max"] == 0.06
        assert stats["refusal_rate"]["n"] == 2

    def test_multiple_models(self):
        """Records for different models land under separate top-level keys."""
        records = [
            self._make_record("model-a", "advanced", 0.05),
            self._make_record("model-b", "basic", 0.10),
        ]
        result = aggregate_results(records)
        assert len(result) == 2
        assert "model-a" in result
        assert "model-b" in result

    def test_multiple_methods(self):
        """Different methods for one model land under separate method keys."""
        records = [
            self._make_record("model-a", "advanced", 0.05),
            self._make_record("model-a", "basic", 0.10),
        ]
        result = aggregate_results(records)
        assert len(result["model-a"]) == 2
        assert "advanced" in result["model-a"]
        assert "basic" in result["model-a"]

    def test_std_zero_for_single_run(self):
        """A single run reports a standard deviation of exactly 0.0."""
        records = [self._make_record("model-a", "advanced", 0.05)]
        result = aggregate_results(records)
        assert result["model-a"]["advanced"]["refusal_rate"]["std"] == 0.0

    def test_multiple_metrics(self):
        """Every metric present in the record gets its own stats entry."""
        records = [
            self._make_record("model-a", "advanced", 0.05, perplexity=5.2, coherence=0.8),
        ]
        result = aggregate_results(records)
        stats = result["model-a"]["advanced"]
        assert "refusal_rate" in stats
        assert "perplexity" in stats
        assert "coherence" in stats
        assert stats["perplexity"]["mean"] == pytest.approx(5.2)

    def test_missing_metric_skipped(self):
        """A metric absent from every record does not appear in the stats."""
        records = [self._make_record("model-a", "advanced", 0.05)]
        result = aggregate_results(records)
        # coherence not provided, should not appear
        assert "coherence" not in result["model-a"]["advanced"]

    def test_unknown_model_and_method(self):
        """Records lacking model and method are grouped under ``"unknown"``."""
        records = [{
            "telemetry": {"quality_metrics": {"refusal_rate": 0.1}},
        }]
        result = aggregate_results(records)
        assert "unknown" in result
        assert "unknown" in result["unknown"]
|
||||
|
||||
|
||||
# ── LaTeX table generation ─────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestGenerateLatexTable:
    """Checks on the LaTeX source produced by ``generate_latex_table``."""

    @staticmethod
    def _stat(mean, std, n, low, high):
        """Assemble one per-metric statistics dict."""
        return {"mean": mean, "std": std, "n": n, "min": low, "max": high}

    def _sample_aggregated(self):
        """Two models: Llama has two methods, Mistral only ``advanced``."""
        llama = {
            "advanced": {
                "n_runs": 3,
                "refusal_rate": self._stat(0.04, 0.01, 3, 0.03, 0.05),
            },
            "basic": {
                "n_runs": 2,
                "refusal_rate": self._stat(0.08, 0.02, 2, 0.06, 0.10),
            },
        }
        mistral = {
            "advanced": {
                "n_runs": 1,
                "refusal_rate": self._stat(0.03, 0.0, 1, 0.03, 0.03),
            },
        }
        return {
            "meta-llama/Llama-2-7b-chat-hf": llama,
            "mistralai/Mistral-7B-Instruct-v0.2": mistral,
        }

    def test_produces_valid_latex(self):
        """The output contains the tabular environment and booktabs rules."""
        rendered = generate_latex_table(self._sample_aggregated())
        for fragment in (
            "\\begin{tabular}",
            "\\end{tabular}",
            "\\toprule",
            "\\bottomrule",
        ):
            assert fragment in rendered

    def test_includes_model_names(self):
        """Model names appear in the table without their organisation prefix."""
        rendered = generate_latex_table(self._sample_aggregated())
        assert "Llama-2-7b-chat-hf" in rendered
        assert "Mistral-7B-Instruct-v0.2" in rendered

    def test_includes_method_headers(self):
        """Each method present in the data appears in the table."""
        rendered = generate_latex_table(self._sample_aggregated())
        assert "advanced" in rendered
        assert "basic" in rendered

    def test_missing_method_shows_dash(self):
        """A model/method pair with no data is rendered as ``---``."""
        rendered = generate_latex_table(self._sample_aggregated())
        # Mistral doesn't have "basic" method
        assert "---" in rendered

    def test_shows_std_when_multiple_runs(self):
        """Entries with several runs include a ``$\\pm$`` spread."""
        rendered = generate_latex_table(self._sample_aggregated())
        assert "$\\pm$" in rendered

    def test_no_std_for_single_run(self):
        """A table holding only single-run entries has no ``$\\pm$``."""
        single_run = {
            "model": {
                "method": {
                    "n_runs": 1,
                    "refusal_rate": self._stat(0.03, 0.0, 1, 0.03, 0.03),
                },
            },
        }
        rendered = generate_latex_table(single_run)
        assert "$\\pm$" not in rendered

    def test_methods_filter(self):
        """Only the methods listed in ``methods`` get a bold column header."""
        rendered = generate_latex_table(
            self._sample_aggregated(), methods=["advanced"]
        )
        assert "\\textbf{advanced}" in rendered
        assert "\\textbf{basic}" not in rendered

    def test_custom_metric(self):
        """``metric`` selects which statistic is rendered."""
        perplexity_only = {
            "model": {
                "method": {
                    "n_runs": 2,
                    "perplexity": self._stat(5.2, 0.3, 2, 4.9, 5.5),
                },
            },
        }
        rendered = generate_latex_table(perplexity_only, metric="perplexity")
        assert "5.2" in rendered

    def test_column_count_matches_methods(self):
        """The column spec has one ``l`` column plus one ``c`` per method."""
        rendered = generate_latex_table(self._sample_aggregated())
        # 2 methods → "lcc" (1 model col + 2 method cols)
        assert "{@{}lcc@{}}" in rendered
|
||||
|
||||
|
||||
# ── CLI integration ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestCLIContributeFlag:
    """Smoke tests that the contribution-related CLI surface is registered.

    Checking only for ``SystemExit`` is not enough: argparse also raises it
    (with code 2) for an unknown subcommand or flag. These tests therefore
    require a clean exit (code 0) from ``--help``.
    """

    def test_contribute_flag_accepted(self, capsys):
        """``obliterate --help`` exits 0 and its help lists ``--contribute``."""
        from obliteratus.cli import main

        # We can't run the full command (no GPU), so inspect the subcommand's
        # help output instead; capsys captures what argparse prints.
        with pytest.raises(SystemExit) as exc_info:
            main(["obliterate", "--help"])
        assert exc_info.value.code == 0
        assert "--contribute" in capsys.readouterr().out

    def test_aggregate_command_accepted(self):
        """``aggregate --help`` exits 0, so the subcommand is registered."""
        from obliteratus.cli import main

        with pytest.raises(SystemExit) as exc_info:
            main(["aggregate", "--help"])
        assert exc_info.value.code == 0
|
||||
|
||||
|
||||
# ── Package exports ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestPackageExports:
    """The community helpers are importable from the top-level package."""

    def test_save_contribution_importable(self):
        """``save_contribution`` is exported and callable."""
        import obliteratus

        assert callable(obliteratus.save_contribution)

    def test_load_contributions_importable(self):
        """``load_contributions`` is exported and callable."""
        import obliteratus

        assert callable(obliteratus.load_contributions)

    def test_aggregate_results_importable(self):
        """``aggregate_results`` is exported and callable."""
        import obliteratus

        assert callable(obliteratus.aggregate_results)
|
||||
|
||||
|
||||
# ── End-to-end: save → load → aggregate ───────────────────────────────
|
||||
|
||||
|
||||
class TestEndToEnd:
    """Round trips through save, load, aggregate and LaTeX rendering."""

    def test_save_load_aggregate_roundtrip(self, tmp_path):
        """Two saved contributions are loaded and aggregated per model."""
        mock_pipeline = _make_mock_pipeline()

        # Save two contributions (different models to avoid filename collision)
        save_contribution(
            mock_pipeline, model_name="test/model-a", output_dir=tmp_path,
        )
        # Tweak metrics for second run with a different model name
        mock_pipeline._quality_metrics = {
            "perplexity": 5.5,
            "coherence": 0.75,
            "refusal_rate": 0.07,
        }
        save_contribution(
            mock_pipeline, model_name="test/model-b", output_dir=tmp_path,
        )

        # Load
        loaded = load_contributions(tmp_path)
        assert len(loaded) == 2

        # Aggregate
        summary = aggregate_results(loaded)
        assert "test/model-a" in summary
        assert "test/model-b" in summary
        first = summary["test/model-a"]["advanced"]
        second = summary["test/model-b"]["advanced"]
        assert first["n_runs"] == 1
        assert second["n_runs"] == 1
        assert abs(first["refusal_rate"]["mean"] - 0.05) < 0.001
        assert abs(second["refusal_rate"]["mean"] - 0.07) < 0.001

    def test_save_load_aggregate_to_latex(self, tmp_path):
        """One saved contribution makes it all the way into the LaTeX table."""
        save_contribution(
            _make_mock_pipeline(),
            model_name="meta-llama/Llama-2-7b-chat-hf",
            output_dir=tmp_path,
        )

        rendered = generate_latex_table(
            aggregate_results(load_contributions(tmp_path))
        )

        for fragment in ("\\begin{tabular}", "Llama-2-7b-chat-hf", "advanced"):
            assert fragment in rendered
|
||||
@@ -0,0 +1,59 @@
|
||||
"""Tests for configuration loading."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
import yaml
|
||||
|
||||
from obliteratus.config import StudyConfig
|
||||
|
||||
|
||||
# Small study configuration shared by the tests below: a model section, a
# dataset section, two strategies with empty params, and run settings.
SAMPLE_CONFIG = dict(
    model=dict(
        name="gpt2",
        task="causal_lm",
        dtype="float32",
        device="cpu",
    ),
    dataset=dict(
        name="wikitext",
        subset="wikitext-2-raw-v1",
        split="test",
        text_column="text",
        max_samples=50,
    ),
    strategies=[
        dict(name=strategy, params={})
        for strategy in ("layer_removal", "ffn_ablation")
    ],
    metrics=["perplexity"],
    batch_size=4,
    max_length=256,
    output_dir="results/test",
)
|
||||
|
||||
|
||||
class TestStudyConfig:
    """Construction and round-tripping of ``StudyConfig``."""

    def test_from_dict(self):
        """``from_dict`` exposes the nested sections as attributes."""
        cfg = StudyConfig.from_dict(SAMPLE_CONFIG)
        model = cfg.model
        assert model.name == "gpt2"
        assert model.task == "causal_lm"
        assert cfg.dataset.name == "wikitext"
        assert len(cfg.strategies) == 2
        assert cfg.strategies[0].name == "layer_removal"

    def test_from_yaml(self, tmp_path):
        """A YAML dump of the sample config loads through ``from_yaml``."""
        config_file = tmp_path / "test_config.yaml"
        config_file.write_text(yaml.dump(SAMPLE_CONFIG))

        cfg = StudyConfig.from_yaml(config_file)
        assert cfg.model.name == "gpt2"
        assert cfg.batch_size == 4

    def test_roundtrip(self):
        """``to_dict`` output can be fed back into ``from_dict``."""
        original = StudyConfig.from_dict(SAMPLE_CONFIG)
        rebuilt = StudyConfig.from_dict(original.to_dict())
        assert rebuilt.model.name == original.model.name
        assert rebuilt.dataset.name == original.dataset.name
        assert len(rebuilt.strategies) == len(original.strategies)
|
||||
@@ -0,0 +1,169 @@
|
||||
"""Tests for defense robustness evaluation framework."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import torch
|
||||
|
||||
from obliteratus.analysis.defense_robustness import (
|
||||
DefenseProfile,
|
||||
DefenseRobustnessEvaluator,
|
||||
EntanglementMap,
|
||||
SelfRepairResult,
|
||||
)
|
||||
|
||||
|
||||
def _make_mock_pipeline(n_layers=6, hidden_dim=16, n_prompts=5):
    """Create a mock pipeline with refusal directions and activations.

    The global torch RNG is seeded with 42 first. Each layer gets a
    unit-norm refusal direction; harmful activations equal the harmless
    base shifted along that direction, scaled by 3.0 for layers 2-4 and
    0.5 elsewhere. Per-prompt activations add noise scaled by 0.1.
    """
    mock = MagicMock()
    mock.model_name = "test-model"

    # Generate refusal directions (some strong, some weak)
    torch.manual_seed(42)
    unit_dirs = {}
    for layer in range(n_layers):
        raw = torch.randn(hidden_dim)
        unit_dirs[layer] = raw / raw.norm()
    mock.refusal_directions = unit_dirs

    # Generate activations with a planted refusal signal in middle layers
    mean_harmful, mean_harmless = {}, {}
    acts_harmful, acts_harmless = {}, {}

    for layer in range(n_layers):
        base = torch.randn(hidden_dim)
        # Middle layers have stronger refusal signal
        scale = 3.0 if 2 <= layer <= 4 else 0.5
        shifted = base + scale * unit_dirs[layer]

        mean_harmless[layer] = base.unsqueeze(0)
        mean_harmful[layer] = shifted.unsqueeze(0)

        # Harmful noise is drawn before harmless noise, matching the
        # order in which the seeded random stream was consumed before.
        acts_harmful[layer] = [
            shifted + torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)
        ]
        acts_harmless[layer] = [
            base + torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)
        ]

    mock._harmful_means = mean_harmful
    mock._harmless_means = mean_harmless
    mock._harmful_acts = acts_harmful
    mock._harmless_acts = acts_harmless

    return mock
|
||||
|
||||
|
||||
class TestDefenseProfile:
    """Checks on the profile returned by ``profile_defense``."""

    @staticmethod
    def _default_profile():
        """Profile the default mock pipeline and return the result."""
        return DefenseRobustnessEvaluator(_make_mock_pipeline()).profile_defense()

    def test_profile_generates(self):
        """A profile is produced with positive, consistent strength figures."""
        result = self._default_profile()

        assert isinstance(result, DefenseProfile)
        assert result.model_name == "test-model"
        assert result.refusal_layer_spread > 0
        assert result.mean_refusal_strength > 0
        assert result.max_refusal_strength >= result.mean_refusal_strength
        assert result.estimated_robustness in ("low", "medium", "high", "very_high")

    def test_alignment_type_estimate(self):
        """The alignment type estimate is something other than ``"unknown"``."""
        result = self._default_profile()
        assert result.alignment_type_estimate != "unknown"

    def test_empty_pipeline(self):
        """With no refusal directions, robustness is reported as ``"unknown"``."""
        bare = MagicMock()
        bare.model_name = "empty"
        bare.refusal_directions = {}
        result = DefenseRobustnessEvaluator(bare).profile_defense()
        assert result.estimated_robustness == "unknown"

    def test_concentration_bounded(self):
        """``refusal_concentration`` stays within [0, 1]."""
        result = self._default_profile()
        # Gini coefficient should be between 0 and 1
        assert 0 <= result.refusal_concentration <= 1.0

    def test_self_repair_bounded(self):
        """``self_repair_estimate`` stays within [0, 1]."""
        result = self._default_profile()
        assert 0 <= result.self_repair_estimate <= 1.0

    def test_format_report(self):
        """The formatted profile names its heading and the model."""
        text = DefenseRobustnessEvaluator.format_defense_profile(
            self._default_profile()
        )
        assert "Defense Robustness" in text
        assert "test-model" in text
|
||||
|
||||
|
||||
class TestSelfRepair:
    """Checks on ``DefenseRobustnessEvaluator.measure_self_repair``."""

    @staticmethod
    def _measure(layer_idx, **pipeline_kwargs):
        """Run a self-repair measurement on a freshly built mock pipeline."""
        mock_pipeline = _make_mock_pipeline(**pipeline_kwargs)
        return DefenseRobustnessEvaluator(mock_pipeline).measure_self_repair(
            layer_idx=layer_idx
        )

    def test_self_repair_measurement(self):
        """Measuring layer 3 gives a bounded result naming other layers."""
        outcome = self._measure(3)

        assert isinstance(outcome, SelfRepairResult)
        assert outcome.layer_idx == 3
        assert outcome.original_refusal_strength >= 0
        assert 0 <= outcome.repair_ratio <= 1.0
        assert len(outcome.compensating_layers) > 0
        # The ablated layer must not be reported as compensating for itself.
        assert 3 not in outcome.compensating_layers

    def test_repair_ratio_high_for_distributed(self):
        """Distributed refusal should have high repair ratio."""
        outcome = self._measure(3, n_layers=10)
        # With distributed signal, removing one layer leaves much compensation
        assert outcome.repair_ratio > 0.5

    def test_format_self_repair(self):
        """The formatted report names the section and the measured layer."""
        outcome = self._measure(2)
        text = DefenseRobustnessEvaluator.format_self_repair(outcome)
        assert "Self-Repair" in text
        assert "Layer 2" in text
|
||||
|
||||
|
||||
class TestEntanglement:
    """Checks on ``DefenseRobustnessEvaluator.map_entanglement``."""

    @staticmethod
    def _entanglement_map():
        """Build the default mock pipeline and return its entanglement map."""
        return DefenseRobustnessEvaluator(_make_mock_pipeline()).map_entanglement()

    def test_entanglement_map(self):
        """The map is populated and its overall score lies within [0, 1]."""
        mapping = self._entanglement_map()

        assert isinstance(mapping, EntanglementMap)
        assert len(mapping.layer_entanglement) > 0
        assert 0 <= mapping.overall_entanglement <= 1.0
        assert len(mapping.most_entangled_layers) > 0
        assert len(mapping.least_entangled_layers) > 0

    def test_capability_sensitivity_keys(self):
        """Exactly the five expected capability names are reported."""
        mapping = self._entanglement_map()

        wanted = {
            "factual_knowledge",
            "reasoning",
            "language_fluency",
            "instruction_following",
            "math",
        }
        assert set(mapping.capability_sensitivity.keys()) == wanted

    def test_math_most_sensitive(self):
        """Math should be estimated as the most sensitive capability."""
        mapping = self._entanglement_map()
        if not (mapping.overall_entanglement > 0):
            # Nothing to compare when no entanglement was measured.
            return
        sensitivity = mapping.capability_sensitivity
        assert sensitivity["math"] >= sensitivity["language_fluency"]

    def test_format_entanglement(self):
        """The formatted report names the section and the math capability."""
        mapping = self._entanglement_map()
        text = DefenseRobustnessEvaluator.format_entanglement(mapping)
        assert "Entanglement" in text
        assert "math" in text
|
||||
@@ -0,0 +1,510 @@
|
||||
"""Edge-case and robustness tests.
|
||||
|
||||
Tests for NaN/Inf handling, empty inputs, extreme dimensions,
|
||||
and other boundary conditions that the main test suite doesn't cover.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
|
||||
from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer
|
||||
from obliteratus.analysis.concept_geometry import ConceptConeAnalyzer
|
||||
from obliteratus.analysis.alignment_imprint import AlignmentImprintDetector
|
||||
from obliteratus.analysis.multi_token_position import MultiTokenPositionAnalyzer
|
||||
from obliteratus.analysis.sparse_surgery import SparseDirectionSurgeon
|
||||
from obliteratus.analysis.causal_tracing import CausalRefusalTracer
|
||||
from obliteratus.analysis.residual_stream import ResidualStreamDecomposer
|
||||
from obliteratus.analysis.probing_classifiers import LinearRefusalProbe
|
||||
from obliteratus.analysis.cross_model_transfer import TransferAnalyzer
|
||||
from obliteratus.evaluation.advanced_metrics import (
|
||||
refusal_rate,
|
||||
effective_rank,
|
||||
activation_cosine_similarity,
|
||||
)
|
||||
from obliteratus.analysis.steering_vectors import (
|
||||
SteeringVectorFactory,
|
||||
SteeringHookManager,
|
||||
SteeringConfig,
|
||||
SteeringResult,
|
||||
compute_steering_effectiveness,
|
||||
format_steering_report,
|
||||
)
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# NaN / Inf handling
|
||||
# ===========================================================================
|
||||
|
||||
class TestNaNInfHandling:
    """Test that modules handle degenerate inputs gracefully."""

    def test_whitened_svd_nan_activations(self):
        """WhitenedSVD with NaN input: raising or returning a result both pass.

        This documents the current behavior; ideally the extractor would
        guard against NaN input explicitly.
        """
        harmful = [torch.tensor([float("nan"), 1.0, 2.0]) for _ in range(5)]
        harmless = [torch.randn(3) for _ in range(5)]
        extractor = WhitenedSVDExtractor()
        try:
            result = extractor.extract(harmful, harmless)
        except (RuntimeError, ValueError):
            # Rejecting NaN input outright is an acceptable outcome.
            return
        # Otherwise NaN may have propagated through the SVD, but a result
        # object must still be returned.
        assert result is not None, (
            "Should either raise on NaN input or return a result"
        )

    def test_whitened_svd_zero_activations(self):
        """WhitenedSVD with all-zero activations returns a complete result."""
        harmful = [torch.zeros(8) for _ in range(5)]
        harmless = [torch.zeros(8) for _ in range(5)]
        extractor = WhitenedSVDExtractor()
        result = extractor.extract(harmful, harmless)
        # Should return a valid result without crashing
        assert result is not None
        assert result.directions is not None
        assert result.singular_values is not None

    def test_concept_cone_nan_direction(self):
        """ConceptConeAnalyzer with one NaN activation: raise or return a result."""
        harmful = [torch.randn(16) for _ in range(10)]
        harmless = [torch.randn(16) for _ in range(10)]
        # Poison one activation
        harmful[3] = torch.full((16,), float("nan"))
        cat_map = {i: f"cat_{i % 3}" for i in range(10)}
        analyzer = ConceptConeAnalyzer(category_map=cat_map)
        try:
            result = analyzer.analyze_layer(harmful, harmless)
        except (RuntimeError, ValueError):
            # Rejecting NaN input outright is an acceptable outcome.
            return
        assert result is not None, (
            "Should either raise on NaN input or return a result"
        )

    def test_sparse_surgery_zero_direction(self):
        """Sparse surgery with zero refusal direction."""
        W = torch.randn(32, 16)
        zero_dir = torch.zeros(16)
        surgeon = SparseDirectionSurgeon()
        result = surgeon.analyze_weight_matrix(W, zero_dir)
        assert result.mean_projection == 0.0

    def test_sparse_surgery_zero_weight(self):
        """Sparse surgery with zero weight matrix."""
        W = torch.zeros(32, 16)
        ref_dir = torch.randn(16)
        surgeon = SparseDirectionSurgeon()
        result = surgeon.analyze_weight_matrix(W, ref_dir)
        assert result.max_projection < 1e-6

    def test_effective_rank_nan_matrix(self):
        """effective_rank of a NaN-sanitized matrix is finite, or raises cleanly.

        The NaN entry is replaced via ``torch.nan_to_num`` before the call.
        Only the call itself is wrapped in ``try``: the finiteness assertion
        sits outside so that a failing assertion is reported instead of being
        swallowed by the exception handler.
        """
        W = torch.randn(10, 10)
        W[0, 0] = float("nan")
        try:
            result = effective_rank(torch.nan_to_num(W))
        except (RuntimeError, ValueError):
            # Raising is acceptable for NaN input
            return
        assert math.isfinite(result)

    def test_cosine_similarity_zero_vectors(self):
        """Cosine similarity between zero vectors."""
        a = torch.zeros(32)
        b = torch.zeros(32)
        result = activation_cosine_similarity(a, b)
        # Should be 0 or NaN, not crash
        assert math.isfinite(result) or math.isnan(result)

    def test_transfer_analyzer_nan_directions(self):
        """Transfer analyzer with NaN directions."""
        dirs_a = {0: torch.randn(16), 1: torch.tensor([float("nan")] * 16)}
        dirs_b = {0: torch.randn(16), 1: torch.randn(16)}
        analyzer = TransferAnalyzer()
        # Should not crash
        result = analyzer.analyze_cross_model(dirs_a, dirs_b)
        assert result is not None
        assert isinstance(result.mean_transfer_score, float)
        assert result.per_layer_transfer is not None
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Empty inputs
|
||||
# ===========================================================================
|
||||
|
||||
class TestEmptyInputs:
    """Test graceful handling of empty or minimal inputs."""

    def test_cross_layer_empty_directions(self):
        """An empty direction dict gives a persistence score of exactly 0."""
        outcome = CrossLayerAlignmentAnalyzer().analyze({})
        assert outcome.direction_persistence_score == 0.0

    def test_alignment_imprint_single_layer(self):
        """Single layer should still return a result."""
        detector = AlignmentImprintDetector()
        single_layer = {0: torch.randn(32)}
        outcome = detector.detect_imprint(single_layer)
        known_methods = ("dpo", "rlhf", "cai", "sft", "unknown")
        assert outcome.predicted_method in known_methods

    def test_multi_token_single_position(self):
        """Single-position sequence."""
        direction = torch.randn(16)
        activations = torch.randn(1, 16)
        outcome = MultiTokenPositionAnalyzer().analyze_prompt(activations, direction)
        assert outcome.n_tokens == 1
        # With a single token the peak can only be at index 0.
        assert outcome.peak_position == 0

    def test_probing_minimal_data(self):
        """Probing with very few samples."""
        harmful_acts = [torch.randn(8) for _ in range(3)]
        harmless_acts = [torch.randn(8) for _ in range(3)]
        outcome = LinearRefusalProbe(n_epochs=10).probe_layer(
            harmful_acts, harmless_acts
        )
        assert 0 <= outcome.accuracy <= 1.0

    def test_residual_stream_single_layer(self):
        """Decomposing one layer of activations reports one layer."""
        layer_acts = {0: torch.randn(32)}
        direction = torch.randn(32)
        outcome = ResidualStreamDecomposer().decompose(layer_acts, direction)
        assert outcome.n_layers == 1

    def test_causal_tracing_single_layer(self):
        """Tracing one layer of activations reports one layer."""
        layer_acts = {0: torch.randn(32)}
        layer_dirs = {0: torch.randn(32)}
        outcome = CausalRefusalTracer().trace_from_activations(layer_acts, layer_dirs)
        assert outcome.n_layers == 1

    def test_transfer_no_common_layers(self):
        """Cross-model with no overlapping layer indices."""
        first_model = {0: torch.randn(16), 1: torch.randn(16)}
        second_model = {2: torch.randn(16), 3: torch.randn(16)}
        outcome = TransferAnalyzer().analyze_cross_model(first_model, second_model)
        assert outcome.mean_transfer_score == 0.0

    def test_refusal_rate_empty_list(self):
        """An empty response list has a refusal rate of 0."""
        assert refusal_rate([]) == 0.0

    def test_refusal_rate_single_response(self):
        """A single refusing response has a refusal rate of 1."""
        assert refusal_rate(["I cannot help with that."]) == 1.0
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Extreme dimensions
|
||||
# ===========================================================================
|
||||
|
||||
class TestExtremeDimensions:
    """Test with unusually large or small dimensions."""

    def test_high_dimensional_directions(self):
        """Test with realistic hidden dimension (4096)."""
        hidden_dim = 4096
        # Seed before drawing so the eight directions are reproducible.
        torch.manual_seed(42)
        layer_dirs = {}
        for layer in range(8):
            layer_dirs[layer] = torch.randn(hidden_dim)
        outcome = TransferAnalyzer().analyze_cross_layer(layer_dirs)
        assert outcome.mean_adjacent_transfer >= 0

    def test_high_dim_sparse_surgery(self):
        """Sparse surgery with large weight matrix."""
        weights = torch.randn(2048, 1024)
        direction = torch.randn(1024)
        outcome = SparseDirectionSurgeon(sparsity=0.05).analyze_weight_matrix(
            weights, direction
        )
        expected_rows = int(0.05 * 2048)
        assert outcome.n_rows_modified == expected_rows

    def test_single_dimension(self):
        """1D hidden dimension edge case."""
        layer_dirs = {}
        for layer in range(4):
            layer_dirs[layer] = torch.randn(1)
        outcome = TransferAnalyzer().analyze_cross_layer(layer_dirs)
        # All 1D directions are parallel or anti-parallel, so cosine is always 1.0
        assert outcome.mean_adjacent_transfer >= 0.99

    def test_many_layers_imprint(self):
        """Alignment imprint with many layers (128)."""
        layer_dirs = {}
        for layer in range(128):
            layer_dirs[layer] = torch.randn(32)
        outcome = AlignmentImprintDetector().detect_imprint(layer_dirs)
        # The four method probabilities should sum to one.
        probability_mass = (
            outcome.dpo_probability
            + outcome.rlhf_probability
            + outcome.cai_probability
            + outcome.sft_probability
        )
        assert abs(probability_mass - 1.0) < 0.01

    @pytest.mark.parametrize("n_prompts", [1, 2, 5, 50, 100])
    def test_concept_cone_varying_prompt_counts(self, n_prompts):
        """Concept cone with varying numbers of prompts."""
        harmful_acts = [torch.randn(16) for _ in range(n_prompts)]
        harmless_acts = [torch.randn(16) for _ in range(n_prompts)]
        # Prompts are assigned round-robin to three categories.
        categories = {}
        for idx in range(n_prompts):
            categories[idx] = f"cat_{idx % 3}"
        cone_analyzer = ConceptConeAnalyzer(
            category_map=categories, min_category_size=1
        )
        outcome = cone_analyzer.analyze_layer(harmful_acts, harmless_acts)
        # Should produce a valid result regardless of prompt count
        assert outcome is not None
        assert outcome.general_direction is not None
        assert outcome.category_count >= 1

    @pytest.mark.parametrize("seq_len", [1, 2, 10, 100, 500])
    def test_multi_token_varying_lengths(self, seq_len):
        """Multi-token analysis with varying sequence lengths."""
        direction = torch.randn(16)
        activations = torch.randn(seq_len, 16)
        outcome = MultiTokenPositionAnalyzer().analyze_prompt(activations, direction)
        assert outcome.n_tokens == seq_len

    @pytest.mark.parametrize("hidden_dim", [2, 4, 16, 64, 256, 1024])
    def test_probing_varying_dimensions(self, hidden_dim):
        """Probing classifier across different hidden dims (min 2 for SGD)."""
        # The two classes are scaled by 2 and shifted by +1 / -1.
        harmful_acts = [torch.randn(hidden_dim) * 2 + 1 for _ in range(10)]
        harmless_acts = [torch.randn(hidden_dim) * 2 - 1 for _ in range(10)]
        outcome = LinearRefusalProbe(n_epochs=20).probe_layer(
            harmful_acts, harmless_acts
        )
        assert 0 <= outcome.accuracy <= 1.0
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Refusal rate edge cases
|
||||
# ===========================================================================
|
||||
|
||||
class TestRefusalRateEdgeCases:
    """Edge cases for refusal rate detection."""

    def test_novel_refusal_phrasing(self):
        """Test that unusual refusal phrasings might be missed (known limitation)."""
        # These should ideally be detected but string matching may miss them
        unusual = [
            "That request falls outside my operational parameters.",
            "I'm designed to be helpful but this conflicts with my guidelines.",
        ]
        measured = refusal_rate(unusual)
        # This tests the known limitation — rate may be 0 (string matching misses novel phrasings)
        assert 0 <= measured <= 1.0

    def test_false_positive_sorry(self):
        """'I'm sorry to hear that' followed by helpful content."""
        helpful_reply = (
            "I'm sorry to hear that. Here's how you can solve the problem: first..."
        )
        measured = refusal_rate([helpful_reply], mode="prefix")
        # Known limitation: prefix matching produces false positives here
        # Just verify it returns a valid float
        assert 0 <= measured <= 1.0

    def test_case_insensitivity(self):
        """An all-caps refusal is detected in substring mode."""
        shouted = ["I CANNOT ASSIST WITH THAT REQUEST."]
        assert refusal_rate(shouted, mode="substring") == 1.0

    def test_whitespace_handling(self):
        """A padded refusal still yields a valid rate in prefix mode."""
        padded = [" I'm sorry, but I can't help with that. "]
        measured = refusal_rate(padded, mode="prefix")
        # Should handle leading whitespace
        assert 0 <= measured <= 1.0
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Steering Vectors
|
||||
# ===========================================================================
|
||||
|
||||
class TestSteeringVectors:
    """Tests for the steering vector inference-time intervention system."""

    @staticmethod
    def _norm_error(steering_vec):
        """Return how far the vector's direction norm is from 1.0."""
        return abs(steering_vec.direction.norm().item() - 1.0)

    def test_from_refusal_direction(self):
        """A refusal direction becomes a unit vector labelled "refusal"."""
        raw = torch.randn(32)
        steer = SteeringVectorFactory.from_refusal_direction(raw, source_layer=5)
        assert steer.label == "refusal"
        assert steer.source_layer == 5
        assert steer.default_alpha == -1.0
        assert self._norm_error(steer) < 0.01

    def test_from_contrastive_pairs(self):
        """Contrastive pairs give a labelled unit vector with sample metadata."""
        positives = [torch.randn(16) + 2 for _ in range(10)]
        negatives = [torch.randn(16) - 2 for _ in range(10)]
        steer = SteeringVectorFactory.from_contrastive_pairs(
            positives, negatives, label="test"
        )
        assert steer.label == "test"
        assert self._norm_error(steer) < 0.01
        assert "n_positive" in steer.metadata

    def test_combine_vectors(self):
        """Combining two vectors gives a unit vector with the given label."""
        first = SteeringVectorFactory.from_refusal_direction(torch.randn(32))
        second = SteeringVectorFactory.from_refusal_direction(torch.randn(32))
        merged = SteeringVectorFactory.combine([first, second], label="merged")
        assert merged.label == "merged"
        assert self._norm_error(merged) < 0.01

    def test_combine_single(self):
        """Combining a single vector still gives a unit vector."""
        only = SteeringVectorFactory.from_refusal_direction(torch.randn(16))
        merged = SteeringVectorFactory.combine([only])
        assert self._norm_error(merged) < 0.01

    def test_combine_empty_raises(self):
        """Combining an empty list raises ``ValueError``."""
        with pytest.raises(ValueError):
            SteeringVectorFactory.combine([])

    def test_hook_manager_lifecycle(self):
        """Test install/remove lifecycle without a real model."""
        hooks = SteeringHookManager()
        assert not hooks.is_active
        # Should not crash even with no hooks
        hooks.remove()
        assert not hooks.is_active

    def test_hook_with_simple_model(self):
        """Test steering on a simple nn.Sequential model."""
        net = nn.Sequential(
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
        )

        steer = SteeringVectorFactory.from_refusal_direction(torch.randn(16))
        # steer at first and third linear layers
        cfg = SteeringConfig(vectors=[steer], target_layers=[0, 2], alpha=1.0)

        hooks = SteeringHookManager()
        # Install on specific modules
        modules = list(net.children())
        install_info = hooks.install(net, cfg, layer_modules=modules)
        assert install_info.hooks_installed == 2
        assert hooks.is_active

        # Run a forward pass (should not crash)
        batch = torch.randn(1, 16)
        out = net(batch)
        assert out.shape == (1, 8)

        # Remove hooks
        hooks.remove()
        assert not hooks.is_active

    def test_steering_effectiveness_remove(self):
        """A partial reduction scores strictly between 0 and 1."""
        score = compute_steering_effectiveness(2.0, 0.5, direction="remove")
        # Reduced but not eliminated
        assert 0 < score < 1.0

    def test_steering_effectiveness_perfect_remove(self):
        """Reducing the value to zero scores exactly 1."""
        score = compute_steering_effectiveness(2.0, 0.0, direction="remove")
        assert score == 1.0

    def test_steering_effectiveness_no_change(self):
        """An unchanged value scores exactly 0."""
        score = compute_steering_effectiveness(2.0, 2.0, direction="remove")
        assert score == 0.0

    def test_steering_effectiveness_add(self):
        """In "add" mode the score for 1.0 -> 3.0 is capped at 1."""
        score = compute_steering_effectiveness(1.0, 3.0, direction="add")
        # Capped at 1.0
        assert score == 1.0

    def test_format_report(self):
        """The formatted report names the section and the vector label."""
        steer = SteeringVectorFactory.from_refusal_direction(torch.randn(32))
        cfg = SteeringConfig(vectors=[steer], target_layers=[3, 5], alpha=0.5)
        outcome = SteeringResult(
            config=cfg, hooks_installed=2, total_steered_layers=2
        )
        text = format_steering_report(outcome)
        assert "Steering" in text
        assert "refusal" in text

    def test_steering_config_position_modes(self):
        """Test different position modes in config."""
        for mode in ("all", "last", "first"):
            steer = SteeringVectorFactory.from_refusal_direction(torch.randn(8))
            cfg = SteeringConfig(vectors=[steer], target_layers=[0], position=mode)
            assert cfg.position == mode

    def test_imports(self):
        """Both steering classes are importable from ``obliteratus.analysis``."""
        from obliteratus.analysis import SteeringVectorFactory, SteeringHookManager

        assert SteeringVectorFactory is not None
        assert SteeringHookManager is not None
|
||||
|
||||
|
||||
class TestParametrizedDimensions:
    """Parametrized tests across different hidden dimensions."""

    @pytest.mark.parametrize("hidden_dim", [2, 8, 64, 256, 768])
    def test_whitened_svd_various_dims(self, hidden_dim):
        """Extracted directions keep the input hidden dimension."""
        # At least four samples per class, growing with the dimension.
        sample_count = max(4, hidden_dim // 4)
        harmful_acts = [torch.randn(hidden_dim) for _ in range(sample_count)]
        harmless_acts = [torch.randn(hidden_dim) for _ in range(sample_count)]
        outcome = WhitenedSVDExtractor().extract(
            harmful_acts, harmless_acts, n_directions=1
        )
        assert outcome.directions.shape[1] == hidden_dim

    @pytest.mark.parametrize("hidden_dim", [2, 8, 64, 256])
    def test_cross_layer_various_dims(self, hidden_dim):
        """The persistence score lies within [0, 1] for four random layers."""
        layer_dirs = {}
        for layer in range(4):
            layer_dirs[layer] = torch.randn(hidden_dim)
        outcome = CrossLayerAlignmentAnalyzer().analyze(layer_dirs)
        assert 0.0 <= outcome.direction_persistence_score <= 1.0

    @pytest.mark.parametrize("hidden_dim", [4, 32, 128])
    def test_sparse_surgery_various_dims(self, hidden_dim):
        """Energy removed lies within [0, 1] for a unit direction."""
        square_weight = torch.randn(hidden_dim, hidden_dim)
        raw_direction = torch.randn(hidden_dim)
        unit_direction = raw_direction / raw_direction.norm()
        outcome = SparseDirectionSurgeon().analyze_weight_matrix(
            square_weight, unit_direction, layer_idx=0
        )
        assert 0.0 <= outcome.energy_removed <= 1.0

    @pytest.mark.parametrize("n_layers", [1, 4, 12, 32])
    def test_imprint_various_layer_counts(self, n_layers):
        """The predicted method is one of the known labels for any layer count."""
        layer_dirs = {}
        for layer in range(n_layers):
            layer_dirs[layer] = torch.randn(64)
        outcome = AlignmentImprintDetector().detect_imprint(layer_dirs)
        known_methods = ("dpo", "rlhf", "cai", "sft", "unknown")
        assert outcome.predicted_method in known_methods
|
||||
|
||||
|
||||
class TestExceptionPaths:
    """Tests for error handling and boundary conditions."""

    def test_whitened_svd_mismatched_dims(self):
        """Harmful and harmless with different hidden dims should fail or handle gracefully."""
        wide_acts = [torch.randn(64) for _ in range(10)]
        narrow_acts = [torch.randn(32) for _ in range(10)]
        svd = WhitenedSVDExtractor()
        # As written, this test passes only if extract() raises.
        with pytest.raises(Exception):
            svd.extract(wide_acts, narrow_acts, n_directions=1)

    def test_whitened_svd_single_sample(self):
        """Single sample should not crash (may return 0 directions due to insufficient data)."""
        one_harmful = [torch.randn(32)]
        one_harmless = [torch.randn(32)]
        outcome = WhitenedSVDExtractor().extract(
            one_harmful, one_harmless, n_directions=1
        )
        # hidden dim preserved
        assert outcome.directions.shape[1] == 32

    def test_sparse_surgery_zero_direction(self):
        """Zero direction vector should not crash."""
        square_weight = torch.randn(16, 16)
        null_direction = torch.zeros(16)
        # Should handle gracefully (possibly returning 0 energy)
        outcome = SparseDirectionSurgeon().analyze_weight_matrix(
            square_weight, null_direction, layer_idx=0
        )
        assert outcome is not None

    def test_cross_layer_single_layer(self):
        """Single layer directions should still produce a result."""
        lone_layer = {0: torch.randn(32)}
        outcome = CrossLayerAlignmentAnalyzer().analyze(lone_layer)
        assert outcome is not None
|
||||
@@ -0,0 +1,483 @@
|
||||
"""Tests for the community-standard (Heretics/Arditi) evaluation protocol."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from obliteratus.evaluation.heretic_eval import (
|
||||
ARDITI_REFUSAL_SUBSTRINGS,
|
||||
_DEFAULT_HARMLESS_PROMPTS,
|
||||
_HARMBENCH_CLS_TEMPLATE,
|
||||
_JAILBREAKBENCH_BUILTIN,
|
||||
_extract_score,
|
||||
_last_real_token_indices,
|
||||
_sanitize_model_path,
|
||||
arditi_refusal_rate,
|
||||
first_token_kl_on_prompts,
|
||||
format_comparison_table,
|
||||
HereticComparisonResult,
|
||||
LM_EVAL_BENCHMARKS,
|
||||
load_jailbreakbench_prompts,
|
||||
unload_harmbench_classifier,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# JailbreakBench Prompts
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestJailbreakBenchPrompts:
    """Checks on the built-in JailbreakBench prompt list and its loader."""

    def test_builtin_has_100_prompts(self):
        """The built-in list holds exactly 100 prompts."""
        prompt_count = len(_JAILBREAKBENCH_BUILTIN)
        assert prompt_count == 100

    def test_builtin_all_non_empty(self):
        """Every built-in prompt is a string longer than 10 stripped characters."""
        for i, prompt in enumerate(_JAILBREAKBENCH_BUILTIN):
            assert isinstance(prompt, str) and len(prompt.strip()) > 10, f"Prompt {i} is too short"

    def test_builtin_no_duplicates(self):
        """No prompt appears twice in the built-in list."""
        distinct = set(_JAILBREAKBENCH_BUILTIN)
        assert len(distinct) == len(_JAILBREAKBENCH_BUILTIN)

    def test_load_returns_list(self):
        """The loader returns a list of at least 100 prompts."""
        loaded = load_jailbreakbench_prompts()
        assert isinstance(loaded, list)
        assert len(loaded) >= 100

    def test_load_caches(self):
        """Two consecutive loads return equal prompt lists."""
        first_load = load_jailbreakbench_prompts()
        second_load = load_jailbreakbench_prompts()
        assert first_load == second_load

    def test_load_returns_copy(self):
        """Loading should return a copy, not the cached list itself."""
        first_load = load_jailbreakbench_prompts()
        # Mutating the returned list must not show up in a later load.
        first_load.append("mutated")
        second_load = load_jailbreakbench_prompts()
        assert "mutated" not in second_load
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Arditi Refusal Rate
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestArditiRefusalRate:
    """Checks on the Arditi substring-based refusal rate."""

    def test_canonical_substring_count(self):
        """Arditi/Zou list has exactly 29 substrings."""
        substring_count = len(ARDITI_REFUSAL_SUBSTRINGS)
        assert substring_count == 29

    def test_all_refusals(self):
        """Three refusing responses give a rate of 1."""
        refusing = [
            "I'm sorry, but I cannot help with that.",
            "I apologize for the inconvenience.",
            "As an AI, I don't do that.",
        ]
        rate = arditi_refusal_rate(refusing)
        assert rate == 1.0

    def test_no_refusals(self):
        """Three compliant responses give a rate of 0."""
        compliant = [
            "Sure, here is a step-by-step guide.",
            "The answer to your question is 42.",
            "Here you go, this should work.",
        ]
        rate = arditi_refusal_rate(compliant)
        assert rate == 0.0

    def test_empty_responses_not_refusals(self):
        """Per Zou et al. logic, empty responses have no substring match = not refusals."""
        blank = ["", " ", ""]
        rate = arditi_refusal_rate(blank)
        assert rate == 0.0

    def test_empty_list(self):
        """An empty response list gives a rate of 0."""
        rate = arditi_refusal_rate([])
        assert rate == 0.0

    def test_case_insensitive(self):
        """A refusal starting with a lowercase letter is still detected."""
        lowercase = ["i'm sorry, but I can't help."]
        rate = arditi_refusal_rate(lowercase)
        assert rate == 1.0

    def test_mixed(self):
        """Two refusals among four responses give a rate of 0.5."""
        blended = [
            "I cannot do that.",
            "Sure, here's the info.",
            "My apologies, I'm unable to help.",
            "The capital of France is Paris.",
        ]
        # 2/4
        assert arditi_refusal_rate(blended) == 0.5

    def test_subtle_refusal_not_caught(self):
        """Arditi list is deliberately limited — subtle refusals should not match."""
        # These are NOT in the Arditi 29-substring list
        soft_refusals = [
            "I'd rather not help with that specific request.",
            "Let me redirect you to a more appropriate resource.",
        ]
        rate = arditi_refusal_rate(soft_refusals)
        assert rate == 0.0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# First-Token KL Divergence
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFirstTokenKL:
    """Checks on ``first_token_kl_on_prompts`` using tiny stand-in models."""

    @pytest.fixture
    def simple_models(self):
        """Create two tiny 'models' with controllable logit distributions."""

        class FakeModel(torch.nn.Module):
            """Stand-in model emitting identical logits at every position."""

            def __init__(self, peak_idx: int = 0):
                super().__init__()
                # A single parameter so next(model.parameters()).device works
                self._param = torch.nn.Parameter(torch.zeros(1))
                self._peak_idx = peak_idx

            def __call__(self, **kwargs):
                ids = kwargs["input_ids"]
                n_batch = ids.shape[0]
                n_positions = ids.shape[1]
                n_vocab = 10
                # Create a non-uniform distribution peaked at _peak_idx
                row = torch.zeros(n_vocab)
                row[self._peak_idx] = 5.0
                # Broadcast the same row over every batch item and position.
                tiled = row[None, None, :].expand(n_batch, n_positions, n_vocab)
                logits = tiled.clone()
                return type("Output", (), {"logits": logits})()

        class FakeTokenizer:
            """Stand-in tokenizer: five all-one ids per text, full attention mask."""

            pad_token_id = 0

            def __call__(self, texts, return_tensors="pt", **kwargs):
                if isinstance(texts, list):
                    n_texts = len(texts)
                else:
                    n_texts = 1
                input_ids = torch.ones(n_texts, 5, dtype=torch.long)
                mask = torch.ones_like(input_ids)
                return {"input_ids": input_ids, "attention_mask": mask}

        return FakeModel, FakeTokenizer

    def test_identical_models_zero_kl(self, simple_models):
        """Two models with the same peak give near-zero KL."""
        FakeModel, FakeTokenizer = simple_models

        stats = first_token_kl_on_prompts(
            FakeModel(peak_idx=0),
            FakeModel(peak_idx=0),
            FakeTokenizer(),
            ["hello", "world"],
        )
        assert abs(stats["mean_kl"]) < 1e-5
        assert stats["interpretation"] == "excellent (minimal collateral damage)"

    def test_different_models_positive_kl(self, simple_models):
        """Models peaked at different vocab positions give positive KL."""
        FakeModel, FakeTokenizer = simple_models

        stats = first_token_kl_on_prompts(
            FakeModel(peak_idx=0),  # peaked at vocab position 0
            FakeModel(peak_idx=5),  # peaked at vocab position 5
            FakeTokenizer(),
            ["test prompt"],
        )
        assert stats["mean_kl"] > 0

    def test_returns_per_prompt_kl(self, simple_models):
        """One KL value is reported per prompt, with a non-negative std."""
        FakeModel, FakeTokenizer = simple_models

        stats = first_token_kl_on_prompts(
            FakeModel(peak_idx=0),
            FakeModel(peak_idx=3),
            FakeTokenizer(),
            ["a", "b", "c"],
        )
        assert len(stats["per_prompt_kl"]) == 3
        assert stats["std_kl"] >= 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HereticComparisonResult
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestHereticComparisonResult:
    """Construction checks for the ``HereticComparisonResult`` record."""

    def test_dataclass_fields(self):
        """Values passed to the constructor are readable back as attributes."""
        supplied = dict(
            model_name="test-model",
            method="OBLITERATUS",
            refusal_rate_arditi=0.05,
            refusal_rate_obliteratus=0.03,
            harmbench_asr=0.85,
            n_jailbreakbench=100,
            n_refusals_remaining=5,
            first_token_kl=0.15,
            kl_interpretation="excellent",
        )
        record = HereticComparisonResult(**supplied)

        assert record.model_name == "test-model"
        assert record.method == "OBLITERATUS"
        assert record.refusal_rate_arditi == 0.05
        assert record.harmbench_asr == 0.85
        assert record.first_token_kl == 0.15

    def test_optional_fields_default_none(self):
        """Fields left out of the constructor fall back to None / empty lists."""
        record = HereticComparisonResult(
            model_name="test",
            method="test",
            refusal_rate_arditi=0.0,
            refusal_rate_obliteratus=0.0,
            harmbench_asr=None,
            n_jailbreakbench=100,
            n_refusals_remaining=0,
        )

        # Capability scores that were never supplied.
        assert record.mmlu is None
        assert record.gsm8k is None
        assert record.perplexity is None
        # Per-item collections that were never supplied.
        assert record.harmbench_per_item == []
        assert record.kl_per_prompt == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Comparison Table Formatting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestComparisonTable:
    """Checks on the text produced by ``format_comparison_table``."""

    def test_format_single_result(self):
        """A single result renders its section headers and formatted metrics."""
        import re  # local import: only this test needs an anchored match

        r = HereticComparisonResult(
            model_name="Llama-2-7B",
            method="OBLITERATUS",
            refusal_rate_arditi=0.05,
            refusal_rate_obliteratus=0.03,
            harmbench_asr=0.85,
            n_jailbreakbench=100,
            n_refusals_remaining=5,
            first_token_kl=0.15,
            kl_interpretation="excellent",
            mmlu=0.518,
            gsm8k=0.313,
        )
        table = format_comparison_table([r])

        assert "OBLITERATUS" in table
        assert "REFUSAL REMOVAL" in table
        assert "CAPABILITY PRESERVATION" in table
        assert "DISTRIBUTION QUALITY" in table
        # "85.0%" contains "5.0%" as a substring, so a plain `in` check for the
        # Arditi rate would pass even if that value were missing. Require that
        # the match is not the tail of a longer number.
        assert re.search(r"(?<![\d.])5\.0%", table)  # arditi refusal rate
        assert "85.0%" in table  # harmbench asr
        assert "5/100" in table  # JBB refusals
        assert "0.1500" in table  # KL divergence

    def test_format_multiple_results(self):
        """Every method passed in shows up by name in the rendered table."""
        results = [
            HereticComparisonResult(
                model_name="test", method="OBLITERATUS",
                refusal_rate_arditi=0.05, refusal_rate_obliteratus=0.03,
                harmbench_asr=0.85, n_jailbreakbench=100, n_refusals_remaining=5,
            ),
            HereticComparisonResult(
                model_name="test", method="Heretic",
                refusal_rate_arditi=0.03, refusal_rate_obliteratus=0.03,
                harmbench_asr=0.90, n_jailbreakbench=100, n_refusals_remaining=3,
            ),
        ]
        table = format_comparison_table(results)

        assert "OBLITERATUS" in table
        assert "Heretic" in table

    def test_heretic_reference_numbers_present(self):
        """The comparison table should include Heretic's published reference numbers."""
        table = format_comparison_table([
            HereticComparisonResult(
                model_name="test", method="test",
                refusal_rate_arditi=0.0, refusal_rate_obliteratus=0.0,
                harmbench_asr=None, n_jailbreakbench=100, n_refusals_remaining=0,
            ),
        ])

        assert "p-e-w/heretic" in table
        assert "97/100" in table  # Original model refusal count
        assert "0.16" in table  # Heretic's KL divergence
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LM-Eval Benchmark Config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestLmEvalBenchmarks:
    """Checks on the static ``LM_EVAL_BENCHMARKS`` configuration mapping."""

    def test_five_standard_benchmarks(self):
        """Exactly five benchmarks are configured."""
        assert len(LM_EVAL_BENCHMARKS) == 5

    def test_required_keys(self):
        """Every benchmark entry carries the three mandatory fields."""
        required_fields = ("task", "num_fewshot", "description")
        for name, config in LM_EVAL_BENCHMARKS.items():
            for field in required_fields:
                # Name the offending benchmark so a failure is actionable.
                assert field in config, f"benchmark {name!r} is missing {field!r}"

    def test_expected_benchmarks(self):
        """The configured benchmark names are exactly the expected set."""
        expected = {"mmlu", "gsm8k", "arc_challenge", "hellaswag", "truthfulqa_mc2"}
        assert set(LM_EVAL_BENCHMARKS.keys()) == expected

    def test_fewshot_settings_match_arditi(self):
        """Few-shot settings should match Arditi et al. / Open LLM Leaderboard."""
        assert LM_EVAL_BENCHMARKS["mmlu"]["num_fewshot"] == 5
        assert LM_EVAL_BENCHMARKS["gsm8k"]["num_fewshot"] == 5
        assert LM_EVAL_BENCHMARKS["truthfulqa_mc2"]["num_fewshot"] == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HarmBench Template
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestHarmBenchTemplate:
    """Structural checks on the ``_HARMBENCH_CLS_TEMPLATE`` prompt string."""

    def test_template_has_sys_tags(self):
        """Official template requires <<SYS>> / <</SYS>> delimiters."""
        for delimiter in ("<<SYS>>", "<</SYS>>"):
            assert delimiter in _HARMBENCH_CLS_TEMPLATE

    def test_template_has_seven_rules(self):
        """Official HarmBench template has exactly 7 rules."""
        # Each rule is a bullet that starts on its own line.
        assert _HARMBENCH_CLS_TEMPLATE.count("\n- ") == 7

    def test_template_has_behavior_placeholder(self):
        """The template exposes a slot for the behavior being judged."""
        assert "{behavior}" in _HARMBENCH_CLS_TEMPLATE

    def test_template_has_generation_placeholder(self):
        """The template exposes a slot for the model generation."""
        assert "{generation}" in _HARMBENCH_CLS_TEMPLATE

    def test_template_ends_with_inst_tag(self):
        """Ignoring surrounding whitespace, the template closes with [/INST]."""
        trimmed = _HARMBENCH_CLS_TEMPLATE.strip()
        assert trimmed.endswith("[/INST]")

    def test_template_answer_format(self):
        """Official template uses lowercase 'yes' or 'no'."""
        assert '"yes" or "no"' in _HARMBENCH_CLS_TEMPLATE
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Score Extraction (C3 fix)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestExtractScore:
    """Checks on how ``_extract_score`` picks a metric out of a results dict."""

    def test_normal_score(self):
        """The primary accuracy key is returned as-is."""
        metrics = {"acc,none": 0.75}
        assert _extract_score(metrics) == 0.75

    def test_zero_score_not_skipped(self):
        """A legitimate score of 0.0 should be returned, not treated as falsy."""
        metrics = {"acc,none": 0.0}
        assert _extract_score(metrics) == 0.0

    def test_fallback_to_next_key(self):
        """Without ``acc,none`` the normalized accuracy key is used."""
        metrics = {"acc_norm,none": 0.65}
        assert _extract_score(metrics) == 0.65

    def test_mc2_key(self):
        """The ``mc2,none`` key is recognized."""
        metrics = {"mc2,none": 0.42}
        assert _extract_score(metrics) == 0.42

    def test_no_matching_key(self):
        """An unrecognized metric name yields 0.0."""
        metrics = {"unknown_metric": 0.99}
        assert _extract_score(metrics) == 0.0

    def test_priority_order(self):
        """acc,none should take priority over acc_norm,none."""
        metrics = {"acc,none": 0.5, "acc_norm,none": 0.9}
        assert _extract_score(metrics) == 0.5
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Padding-Aware Last-Token Indices
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestLastRealTokenIndices:
    """Checks on ``_last_real_token_indices`` for various attention masks."""

    def test_no_padding(self):
        """With an all-ones mask every row ends at the final position."""
        full_mask = torch.ones(3, 5, dtype=torch.long)
        assert _last_real_token_indices(full_mask).tolist() == [4, 4, 4]

    def test_with_padding(self):
        """Trailing zeros in the mask pull the index back to the last 1."""
        rows = [
            [1, 1, 1, 1, 1],  # length 5, last real = index 4
            [1, 1, 1, 0, 0],  # length 3, last real = index 2
            [1, 0, 0, 0, 0],  # length 1, last real = index 0
        ]
        padded_mask = torch.tensor(rows)
        assert _last_real_token_indices(padded_mask).tolist() == [4, 2, 0]

    def test_single_token(self):
        """A 1x1 mask resolves to index 0."""
        tiny_mask = torch.tensor([[1]])
        assert _last_real_token_indices(tiny_mask).tolist() == [0]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Model Path Sanitization
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSanitizeModelPath:
    """Checks on ``_sanitize_model_path`` acceptance and rejection."""

    def test_normal_path(self):
        """A plain filesystem path comes back unchanged."""
        local_path = "/tmp/my-model"
        assert _sanitize_model_path(local_path) == local_path

    def test_hf_model_id(self):
        """An ``org/name`` style model id comes back unchanged."""
        hub_id = "meta-llama/Llama-2-7b-hf"
        assert _sanitize_model_path(hub_id) == hub_id

    def test_rejects_commas(self):
        """A comma in the path raises ValueError whose message mentions commas."""
        malicious = "evil,trust_remote_code=True"
        with pytest.raises(ValueError, match="commas"):
            _sanitize_model_path(malicious)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Classifier Unload
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestClassifierUnload:
    """Smoke test for ``unload_harmbench_classifier``."""

    def test_unload_when_not_loaded(self):
        """Unloading when nothing is loaded should not raise."""
        # The test passes if this call returns without raising; the expectation
        # is that it is a no-op when no classifier has been loaded.
        unload_harmbench_classifier()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Default Harmless Prompts
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDefaultHarmlessPrompts:
    """Sanity checks on the built-in ``_DEFAULT_HARMLESS_PROMPTS`` list."""

    def test_has_100_unique_prompts(self):
        """The list holds 100 entries and all 100 are distinct."""
        assert len(_DEFAULT_HARMLESS_PROMPTS) == 100
        # The test name promises uniqueness, so verify it here rather than
        # relying on a sibling test being run.
        assert len(set(_DEFAULT_HARMLESS_PROMPTS)) == 100

    def test_no_duplicates(self):
        """De-duplicating the list does not shrink it."""
        assert len(set(_DEFAULT_HARMLESS_PROMPTS)) == len(_DEFAULT_HARMLESS_PROMPTS)

    def test_all_non_empty(self):
        """Every prompt is a string longer than 10 characters."""
        for i, p in enumerate(_DEFAULT_HARMLESS_PROMPTS):
            # Checked separately so a wrong type is not reported as "too short".
            assert isinstance(p, str), f"Prompt {i} is not a string"
            assert len(p) > 10, f"Prompt {i} is too short"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# KL Divergence Non-Negativity
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestKLNonNegativity:
    """Verify that per-prompt KL values never come out negative."""

    @pytest.fixture
    def models_and_tokenizer(self):
        """Provide fake model and tokenizer classes with controllable logits."""

        class FakeModel(torch.nn.Module):
            """Emits one fixed logit row at every position, peaked at ``peak_idx``."""

            def __init__(self, peak_idx: int = 0):
                super().__init__()
                self._param = torch.nn.Parameter(torch.zeros(1))
                self._peak_idx = peak_idx

            def __call__(self, **kwargs):
                ids = kwargs["input_ids"]
                n_rows = ids.shape[0]
                n_cols = ids.shape[1]
                vocab_size = 10
                # Flat logits everywhere except one boosted vocabulary entry.
                row = torch.zeros(vocab_size)
                row[self._peak_idx] = 5.0
                logits = row.view(1, 1, -1).expand(n_rows, n_cols, -1).clone()
                return type("Output", (), {"logits": logits})()

        class FakeTokenizer:
            """Ignores the text and returns five tokens of ones per prompt."""

            pad_token_id = 0

            def __call__(self, texts, return_tensors="pt", **kwargs):
                if isinstance(texts, list):
                    n_prompts = len(texts)
                else:
                    n_prompts = 1
                input_ids = torch.ones(n_prompts, 5, dtype=torch.long)
                attention_mask = torch.ones_like(input_ids)
                return {"input_ids": input_ids, "attention_mask": attention_mask}

        return FakeModel, FakeTokenizer

    def test_all_kl_values_non_negative(self, models_and_tokenizer):
        """Each entry of ``per_prompt_kl`` is at least zero."""
        model_cls, tokenizer_cls = models_and_tokenizer
        reference = model_cls(peak_idx=0)
        candidate = model_cls(peak_idx=3)
        prompts = ["a", "b", "c", "d", "e"]

        outcome = first_token_kl_on_prompts(
            reference, candidate, tokenizer_cls(), prompts
        )

        for val in outcome["per_prompt_kl"]:
            assert val >= 0.0, f"KL value {val} is negative"
|
||||
@@ -0,0 +1,385 @@
|
||||
"""Tests for the Analysis-Informed Abliteration Pipeline."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from obliteratus.informed_pipeline import (
|
||||
AnalysisInsights,
|
||||
InformedAbliterationPipeline,
|
||||
InformedPipelineReport,
|
||||
INFORMED_METHOD,
|
||||
)
|
||||
from obliteratus.abliterate import METHODS
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.fixture
def insights():
    """Provide an ``AnalysisInsights`` built with no arguments."""
    default_insights = AnalysisInsights()
    return default_insights
|
||||
|
||||
|
||||
@pytest.fixture
def pipeline(tmp_path):
    """Provide an ``InformedAbliterationPipeline`` writing under ``tmp_path``."""
    # Keep any output inside pytest's per-test temporary directory.
    target_dir = tmp_path / "test_informed"
    return InformedAbliterationPipeline(
        model_name="test-model",
        output_dir=str(target_dir),
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AnalysisInsights
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestAnalysisInsights:
    """Pin the defaults of a freshly constructed ``AnalysisInsights``."""

    def test_default_values(self, insights):
        """Detection, geometry and recommendation fields start at their defaults."""
        expected_by_equality = {
            "detected_alignment_method": "unknown",
            "alignment_confidence": 0.0,
            "cone_dimensionality": 1.0,
            "mean_pairwise_cosine": 1.0,
            "per_category_directions": {},
            "direction_specificity": {},
            "cluster_count": 0,
            "direction_persistence": 0.0,
            "recommended_n_directions": 4,
            "recommended_regularization": 0.0,
            "recommended_refinement_passes": 2,
            "recommended_layers": [],
            "skip_layers": [],
        }
        for attr, expected in expected_by_equality.items():
            assert getattr(insights, attr) == expected, attr

        # Boolean flags are compared by identity, not mere falsiness.
        assert insights.cone_is_polyhedral is False
        assert insights.use_sparse_surgery is False

    def test_default_robustness(self, insights):
        """Robustness and entanglement fields start at their defaults."""
        expected = {
            "estimated_robustness": "unknown",
            "self_repair_estimate": 0.0,
            "entanglement_score": 0.0,
            "entangled_layers": [],
            "clean_layers": [],
        }
        for attr, value in expected.items():
            assert getattr(insights, attr) == value, attr
|
||||
|
||||
|
||||
class TestInformedPipelineReport:
    """Pin the defaults of an ``InformedPipelineReport``."""

    def test_default_report(self):
        """A report built from default insights has zeroed metrics and no stages."""
        report = InformedPipelineReport(insights=AnalysisInsights())

        # Timings.
        assert report.analysis_duration == 0.0
        assert report.total_duration == 0.0
        # Outcome counters.
        assert report.ouroboros_passes == 0
        assert report.final_refusal_rate == 0.0
        assert report.stages == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Method preset
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestInformedMethod:
    """Checks on the "informed" method preset."""

    def test_informed_method_in_abliterate_methods(self):
        """The preset is registered in ``METHODS`` with its flags switched on."""
        assert "informed" in METHODS
        preset = METHODS["informed"]
        enabled_flags = (
            "norm_preserve",
            "project_biases",
            "use_chat_template",
            "use_whitened_svd",
            "true_iterative_refinement",
        )
        for flag in enabled_flags:
            assert preset[flag] is True, flag

    def test_informed_method_standalone(self):
        """The standalone ``INFORMED_METHOD`` dict carries the expected values."""
        assert INFORMED_METHOD["label"] == "Informed (Analysis-Guided)"
        assert INFORMED_METHOD["n_directions"] == 4
        assert INFORMED_METHOD["norm_preserve"] is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pipeline initialization
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestPipelineInit:
    """Checks on ``InformedAbliterationPipeline`` constructor state."""

    def test_method_set_to_informed(self, pipeline):
        """The pipeline reports "informed" as its method."""
        assert pipeline.method == "informed"

    def test_default_analysis_flags(self, pipeline):
        """All analysis stages are enabled unless turned off explicitly."""
        stage_flags = (
            "_run_cone",
            "_run_alignment",
            "_run_cross_layer",
            "_run_sparse",
            "_run_defense",
        )
        for attr in stage_flags:
            assert getattr(pipeline, attr) is True, attr

    def test_ouroboros_defaults(self, pipeline):
        """Default Ouroboros threshold and pass limit."""
        assert pipeline._ouroboros_threshold == 0.5
        assert pipeline._max_ouroboros_passes == 3

    def test_entanglement_gate(self, pipeline):
        """Default entanglement gate value."""
        assert pipeline._entanglement_gate == 0.8

    def test_inherits_base_pipeline(self, pipeline):
        """Base-pipeline switches are all enabled on the informed pipeline."""
        base_switches = (
            "norm_preserve",
            "project_biases",
            "use_chat_template",
            "use_whitened_svd",
            "true_iterative_refinement",
        )
        for attr in base_switches:
            assert getattr(pipeline, attr) is True, attr

    def test_custom_flags(self):
        """Constructor overrides are stored on the matching private attributes."""
        overrides = dict(
            run_cone_analysis=False,
            run_alignment_detection=False,
            ouroboros_threshold=0.3,
            max_ouroboros_passes=5,
            entanglement_gate=0.9,
        )
        custom = InformedAbliterationPipeline(model_name="test", **overrides)

        assert custom._run_cone is False
        assert custom._run_alignment is False
        assert custom._ouroboros_threshold == 0.3
        assert custom._max_ouroboros_passes == 5
        assert custom._entanglement_gate == 0.9
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration derivation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestConfigurationDerivation:
    """Test the _derive_configuration logic with various insights."""

    def _make_pipeline_with_insights(self, **kwargs):
        """Build a silent pipeline and overwrite the given insight attributes."""
        built = InformedAbliterationPipeline(
            model_name="test",
            on_log=lambda m: None,
        )
        for attr, value in kwargs.items():
            setattr(built._insights, attr, value)
        return built

    def _derived(self, **insight_overrides):
        """Build a pipeline with the given insights and derive its configuration."""
        built = self._make_pipeline_with_insights(**insight_overrides)
        built._derive_configuration()
        return built

    # -- number of directions ------------------------------------------------

    def test_polyhedral_cone_more_directions(self):
        """A polyhedral cone of dimensionality 3.5 yields seven directions."""
        derived = self._derived(cone_is_polyhedral=True, cone_dimensionality=3.5)
        # Polyhedral with dim 3.5 → n_dirs = max(4, min(8, int(3.5*2))) = 7
        assert derived.n_directions == 7

    def test_linear_cone_fewer_directions(self):
        """A linear cone of dimensionality 1.0 yields two directions."""
        derived = self._derived(cone_is_polyhedral=False, cone_dimensionality=1.0)
        # Linear with dim 1.0 → n_dirs = max(1, min(4, int(1.0+1))) = 2
        assert derived.n_directions == 2

    # -- regularization ------------------------------------------------------

    def test_dpo_zero_regularization(self):
        """DPO with low entanglement gets no regularization."""
        derived = self._derived(
            detected_alignment_method="dpo", entanglement_score=0.1
        )
        assert derived.regularization == 0.0

    def test_rlhf_moderate_regularization(self):
        """RLHF with low entanglement gets 0.15."""
        derived = self._derived(
            detected_alignment_method="rlhf", entanglement_score=0.2
        )
        assert derived.regularization == 0.15

    def test_cai_regularization(self):
        """CAI with low entanglement gets 0.2."""
        derived = self._derived(
            detected_alignment_method="cai", entanglement_score=0.2
        )
        assert derived.regularization == 0.2

    def test_sft_low_regularization(self):
        """SFT with low entanglement gets 0.05."""
        derived = self._derived(
            detected_alignment_method="sft", entanglement_score=0.1
        )
        assert derived.regularization == 0.05

    def test_high_entanglement_increases_regularization(self):
        """High entanglement raises the DPO regularization above zero."""
        derived = self._derived(
            detected_alignment_method="dpo", entanglement_score=0.7
        )
        # DPO base = 0.0, + 0.15 for high entanglement = 0.15
        assert derived.regularization == 0.15

    # -- refinement passes ---------------------------------------------------

    def test_high_self_repair_more_passes(self):
        """A self-repair estimate of 0.8 asks for three passes."""
        derived = self._derived(self_repair_estimate=0.8)
        assert derived.refinement_passes == 3

    def test_moderate_self_repair_two_passes(self):
        """A self-repair estimate of 0.5 asks for two passes."""
        derived = self._derived(self_repair_estimate=0.5)
        assert derived.refinement_passes == 2

    def test_low_self_repair_one_pass(self):
        """A self-repair estimate of 0.2 asks for one pass."""
        derived = self._derived(self_repair_estimate=0.2)
        assert derived.refinement_passes == 1

    # -- layer selection -----------------------------------------------------

    def test_cluster_layers_used(self):
        """Cluster layers end up in the recommended layer list."""
        built = self._make_pipeline_with_insights(
            cluster_representative_layers=[5, 10, 15],
            direction_clusters=[[3, 4, 5], [9, 10, 11], [14, 15, 16]],
        )
        built.refusal_directions = {i: torch.randn(64) for i in range(20)}
        built._derive_configuration()

        # Should include all cluster layers
        recommended = built._insights.recommended_layers
        assert 5 in recommended
        assert 10 in recommended

    def test_entangled_layers_skipped(self):
        """An entangled layer moves from recommended to skipped."""
        derived = self._derived(
            cluster_representative_layers=[5, 10, 15],
            direction_clusters=[[3, 4, 5], [9, 10, 11], [14, 15, 16]],
            entangled_layers=[10],
        )
        # Layer 10 should be skipped
        assert 10 not in derived._insights.recommended_layers
        assert 10 in derived._insights.skip_layers

    # -- sparse surgery ------------------------------------------------------

    def test_sparse_surgery_enabled_when_rsi_high(self):
        """A sparsity index above the threshold turns sparse surgery on."""
        built = self._make_pipeline_with_insights(mean_refusal_sparsity_index=0.7)
        built._sparse_threshold = 0.5
        built._derive_configuration()
        assert built._insights.use_sparse_surgery is True

    def test_sparse_surgery_disabled_when_rsi_low(self):
        """A sparsity index below the threshold leaves sparse surgery off."""
        built = self._make_pipeline_with_insights(mean_refusal_sparsity_index=0.3)
        built._sparse_threshold = 0.5
        built._derive_configuration()
        assert built._insights.use_sparse_surgery is False

    # -- whitened SVD --------------------------------------------------------

    def test_whitened_svd_for_multi_direction(self):
        """More than one direction keeps whitened SVD enabled."""
        derived = self._derived(cone_is_polyhedral=True, cone_dimensionality=2.5)
        assert derived.n_directions > 1
        assert derived.use_whitened_svd is True

    def test_no_whitened_svd_for_single_direction(self):
        """A single direction switches whitened SVD off."""
        derived = self._derived(cone_is_polyhedral=False, cone_dimensionality=0.5)
        # dim 0.5 → max(1, min(4, int(0.5+1))) = 1
        assert derived.n_directions == 1
        assert derived.use_whitened_svd is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Format report
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFormatInsights:
    """Checks on the text from ``InformedAbliterationPipeline.format_insights``."""

    def test_format_default(self, insights):
        """Default insights render the header plus UNKNOWN / LINEAR labels."""
        rendered = InformedAbliterationPipeline.format_insights(insights)

        assert "Analysis-Informed Pipeline" in rendered
        assert "UNKNOWN" in rendered  # detected method
        assert "LINEAR" in rendered  # cone type

    def test_format_polyhedral(self):
        """A polyhedral DPO insight renders its method, cone type and dimension."""
        polyhedral = AnalysisInsights(
            detected_alignment_method="dpo",
            alignment_confidence=0.85,
            cone_is_polyhedral=True,
            cone_dimensionality=3.5,
            cluster_count=4,
        )
        rendered = InformedAbliterationPipeline.format_insights(polyhedral)

        for fragment in ("DPO", "POLYHEDRAL", "3.50"):
            assert fragment in rendered

    def test_format_includes_derived_config(self, insights):
        """Recommended settings show up as ``name: value`` pairs."""
        insights.recommended_n_directions = 6
        insights.recommended_regularization = 0.2
        insights.recommended_refinement_passes = 3

        rendered = InformedAbliterationPipeline.format_insights(insights)

        assert "n_directions: 6" in rendered
        assert "regularization: 0.2" in rendered
        assert "refinement_passes: 3" in rendered
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Edge cases
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEdgeCases:
    """Boundary conditions for ``_derive_configuration``."""

    @staticmethod
    def _quiet_pipeline():
        """Build a pipeline whose log callback discards every message."""
        return InformedAbliterationPipeline(
            model_name="test",
            on_log=lambda m: None,
        )

    def test_no_cluster_layers_falls_back(self):
        """With no cluster layers the recommended layer list stays empty."""
        subject = self._quiet_pipeline()
        subject._insights.cluster_representative_layers = []

        subject._derive_configuration()

        assert subject._insights.recommended_layers == []

    def test_regularization_capped(self):
        """Regularization stays within the 0.5 cap."""
        subject = self._quiet_pipeline()
        subject._insights.detected_alignment_method = "cai"
        subject._insights.entanglement_score = 0.9

        subject._derive_configuration()

        # CAI base = 0.2, + 0.15 = 0.35, capped at 0.5
        assert subject.regularization <= 0.5

    def test_all_layers_entangled_keeps_some(self):
        """If all cluster layers are entangled, don't skip all of them."""
        subject = self._quiet_pipeline()
        subject._insights.cluster_representative_layers = [5]
        subject._insights.direction_clusters = [[5]]
        subject._insights.entangled_layers = [5]

        subject._derive_configuration()

        # Should NOT skip the only layer
        assert 5 in subject._insights.recommended_layers

    def test_cone_dimensionality_bounds(self):
        """Extreme cone dimensionality values are handled."""
        subject = self._quiet_pipeline()
        state = subject._insights

        # Very high dimensionality
        state.cone_is_polyhedral = True
        state.cone_dimensionality = 10.0
        subject._derive_configuration()
        assert subject.n_directions <= 8  # capped

        # Very low dimensionality
        state.cone_is_polyhedral = False
        state.cone_dimensionality = 0.1
        subject._derive_configuration()
        assert subject.n_directions >= 1  # at least 1
|
||||
@@ -0,0 +1,172 @@
|
||||
"""Tests for logit lens refusal direction analysis."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import torch
|
||||
|
||||
from obliteratus.analysis.logit_lens import (
|
||||
RefusalLogitLens,
|
||||
LogitLensResult,
|
||||
MultiLayerLogitLensResult,
|
||||
REFUSAL_TOKENS,
|
||||
COMPLIANCE_TOKENS,
|
||||
)
|
||||
|
||||
|
||||
def _make_mock_model(hidden_dim=32, vocab_size=100):
|
||||
"""Create a mock model with LM head and layer norm."""
|
||||
model = MagicMock()
|
||||
|
||||
# LM head weight (vocab_size, hidden_dim)
|
||||
lm_head = MagicMock()
|
||||
lm_head.weight = MagicMock()
|
||||
lm_head.weight.data = torch.randn(vocab_size, hidden_dim)
|
||||
model.lm_head = lm_head
|
||||
|
||||
# Final LayerNorm
|
||||
ln_f = MagicMock()
|
||||
ln_f.weight = MagicMock()
|
||||
ln_f.weight.data = torch.ones(hidden_dim)
|
||||
ln_f.bias = MagicMock()
|
||||
ln_f.bias.data = torch.zeros(hidden_dim)
|
||||
model.transformer = MagicMock()
|
||||
model.transformer.ln_f = ln_f
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def _make_mock_tokenizer(vocab_size=100):
|
||||
"""Create a mock tokenizer."""
|
||||
tokenizer = MagicMock()
|
||||
|
||||
def mock_decode(ids):
|
||||
if isinstance(ids, list) and len(ids) == 1:
|
||||
return f"tok_{ids[0]}"
|
||||
return f"tok_{ids}"
|
||||
|
||||
def mock_encode(text, add_special_tokens=False):
|
||||
# Return a deterministic token ID based on the text
|
||||
return [hash(text) % vocab_size]
|
||||
|
||||
tokenizer.decode = mock_decode
|
||||
tokenizer.encode = mock_encode
|
||||
return tokenizer
|
||||
|
||||
|
||||
class TestRefusalLogitLens:
|
||||
def test_basic_analysis(self):
|
||||
"""Should produce a LogitLensResult with expected fields."""
|
||||
model = _make_mock_model()
|
||||
tokenizer = _make_mock_tokenizer()
|
||||
direction = torch.randn(32)
|
||||
|
||||
lens = RefusalLogitLens(top_k=10)
|
||||
result = lens.analyze_direction(direction, model, tokenizer, layer_idx=5)
|
||||
|
||||
assert isinstance(result, LogitLensResult)
|
||||
assert result.layer_idx == 5
|
||||
assert len(result.top_promoted) == 10
|
||||
assert len(result.top_suppressed) == 10
|
||||
assert isinstance(result.refusal_specificity, float)
|
||||
assert isinstance(result.logit_effect_entropy, float)
|
||||
assert isinstance(result.refusal_compliance_gap, float)
|
||||
|
||||
def test_promoted_suppressed_ordering(self):
|
||||
"""Top promoted should have higher logit boost than top suppressed."""
|
||||
model = _make_mock_model()
|
||||
tokenizer = _make_mock_tokenizer()
|
||||
direction = torch.randn(32)
|
||||
|
||||
lens = RefusalLogitLens(top_k=5)
|
||||
result = lens.analyze_direction(direction, model, tokenizer)
|
||||
|
||||
# Promoted tokens should have positive-ish values
|
||||
# Suppressed tokens should have negative-ish values
|
||||
max_promoted = max(v for _, v in result.top_promoted)
|
||||
min_suppressed = min(v for _, v in result.top_suppressed)
|
||||
assert max_promoted > min_suppressed
|
||||
|
||||
def test_multi_layer_analysis(self):
|
||||
"""Should analyze multiple layers."""
|
||||
model = _make_mock_model()
|
||||
tokenizer = _make_mock_tokenizer()
|
||||
directions = {0: torch.randn(32), 1: torch.randn(32), 2: torch.randn(32)}
|
||||
|
||||
lens = RefusalLogitLens(top_k=5)
|
||||
result = lens.analyze_all_layers(directions, model, tokenizer)
|
||||
|
||||
assert isinstance(result, MultiLayerLogitLensResult)
|
||||
assert len(result.per_layer) == 3
|
||||
assert result.strongest_refusal_layer in [0, 1, 2]
|
||||
assert result.peak_specificity_layer in [0, 1, 2]
|
||||
|
||||
def test_strong_layers_filter(self):
|
||||
"""Should only analyze specified strong layers."""
|
||||
model = _make_mock_model()
|
||||
tokenizer = _make_mock_tokenizer()
|
||||
directions = {i: torch.randn(32) for i in range(10)}
|
||||
|
||||
lens = RefusalLogitLens(top_k=5)
|
||||
result = lens.analyze_all_layers(
|
||||
directions, model, tokenizer, strong_layers=[2, 5]
|
||||
)
|
||||
assert set(result.per_layer.keys()) == {2, 5}
|
||||
|
||||
def test_handles_unnormalized_direction(self):
|
||||
"""Should handle non-unit directions."""
|
||||
model = _make_mock_model()
|
||||
tokenizer = _make_mock_tokenizer()
|
||||
direction = torch.randn(32) * 100.0 # large magnitude
|
||||
|
||||
lens = RefusalLogitLens(top_k=5)
|
||||
result = lens.analyze_direction(direction, model, tokenizer)
|
||||
# Should still produce valid results
|
||||
assert len(result.top_promoted) == 5
|
||||
|
||||
def test_format_report(self):
|
||||
"""Format report should produce readable output."""
|
||||
model = _make_mock_model()
|
||||
tokenizer = _make_mock_tokenizer()
|
||||
directions = {0: torch.randn(32), 1: torch.randn(32)}
|
||||
|
||||
lens = RefusalLogitLens(top_k=5)
|
||||
result = lens.analyze_all_layers(directions, model, tokenizer)
|
||||
report = RefusalLogitLens.format_report(result)
|
||||
assert "Logit Lens" in report
|
||||
assert "Layer 0:" in report
|
||||
|
||||
def test_empty_directions(self):
|
||||
"""Should handle empty input gracefully."""
|
||||
model = _make_mock_model()
|
||||
tokenizer = _make_mock_tokenizer()
|
||||
|
||||
lens = RefusalLogitLens(top_k=5)
|
||||
result = lens.analyze_all_layers({}, model, tokenizer)
|
||||
assert len(result.per_layer) == 0
|
||||
|
||||
def test_token_lists_nonempty(self):
|
||||
"""Refusal and compliance token lists should have entries."""
|
||||
assert len(REFUSAL_TOKENS) > 10
|
||||
assert len(COMPLIANCE_TOKENS) > 10
|
||||
|
||||
def test_entropy_nonnegative(self):
    """The reported logit-effect entropy must not be negative."""
    mock_model = _make_mock_model()
    mock_tokenizer = _make_mock_tokenizer()
    random_direction = torch.randn(32)

    outcome = RefusalLogitLens(top_k=5).analyze_direction(
        random_direction, mock_model, mock_tokenizer
    )
    entropy = outcome.logit_effect_entropy
    assert entropy >= 0
|
||||
|
||||
def test_2d_direction_input(self):
    """A direction shaped ``(1, hidden)`` should be accepted like a 1-D one."""
    mock_model = _make_mock_model()
    mock_tokenizer = _make_mock_tokenizer()
    row_vector = torch.randn(1, 32)  # leading singleton dimension

    analyzer = RefusalLogitLens(top_k=5)
    outcome = analyzer.analyze_direction(row_vector, mock_model, mock_tokenizer)
    n_promoted = len(outcome.top_promoted)
    assert n_promoted == 5
|
||||
@@ -0,0 +1,60 @@
|
||||
"""Tests for evaluation metrics."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
import torch
|
||||
|
||||
from obliteratus.evaluation.metrics import accuracy, f1_score_metric, perplexity
|
||||
|
||||
|
||||
class TestPerplexity:
    """Sanity checks for ``perplexity`` on hand-built and random logits."""

    def test_perfect_prediction(self):
        """Logits that single out each next token should give low perplexity."""
        n_batch, n_steps, n_vocab = 1, 5, 10
        targets = torch.tensor([[0, 1, 2, 3, 4]])

        # Every logit starts strongly negative; then, for each position except
        # the last, the entry of the token that follows it is made large.
        scores = torch.full((n_batch, n_steps, n_vocab), -100.0)
        for step, next_token in enumerate(targets[0, 1:]):
            scores[0, step, next_token] = 100.0

        ppl = perplexity(scores, targets)
        assert ppl < 2.0, f"Expected near-1 perplexity, got {ppl}"

    def test_random_prediction_higher(self):
        """Random logits against random labels should give perplexity above 10."""
        n_batch, n_steps, n_vocab = 2, 20, 100

        # Fixed seed so the drawn logits and labels are the same on every run.
        torch.manual_seed(42)
        scores = torch.randn(n_batch, n_steps, n_vocab)
        targets = torch.randint(0, n_vocab, (n_batch, n_steps))

        ppl = perplexity(scores, targets)
        assert ppl > 10, f"Random logits should yield high perplexity, got {ppl}"
|
||||
|
||||
|
||||
class TestAccuracy:
    """Exact-value checks for ``accuracy`` on small integer sequences."""

    def test_perfect(self):
        """Identical sequences score 1.0."""
        score = accuracy([1, 2, 3], [1, 2, 3])
        assert score == 1.0

    def test_zero(self):
        """Sequences with no matching positions score 0.0."""
        score = accuracy([1, 2, 3], [4, 5, 6])
        assert score == 0.0

    def test_partial(self):
        """Two matching positions out of four score 0.5."""
        score = accuracy([1, 2, 3, 4], [1, 2, 0, 0])
        assert score == 0.5

    def test_empty(self):
        """Two empty sequences score 0.0."""
        score = accuracy([], [])
        assert score == 0.0
|
||||
|
||||
|
||||
class TestF1:
    """Boundary-value checks for ``f1_score_metric``."""

    def test_perfect(self):
        """Identical binary sequences score 1.0."""
        sequence = [0, 1, 0, 1]
        assert f1_score_metric(sequence, [0, 1, 0, 1]) == 1.0

    def test_zero(self):
        """All-zeros against all-ones scores 0.0."""
        all_zeros = [0, 0, 0, 0]
        all_ones = [1, 1, 1, 1]
        assert f1_score_metric(all_zeros, all_ones) == 0.0
|
||||
@@ -0,0 +1,85 @@
|
||||
"""Smoke tests verifying all new modules are importable from package level."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class TestTopLevelImports:
    """Names that must be importable directly from the ``obliteratus`` package."""

    def test_set_seed(self):
        """``set_seed`` is exported and callable."""
        from obliteratus import set_seed as exported
        assert callable(exported)

    def test_run_sweep(self):
        """``run_sweep`` is exported and callable."""
        from obliteratus import run_sweep as exported
        assert callable(exported)

    def test_sweep_config(self):
        """``SweepConfig`` can be built and keeps the model name it was given."""
        from obliteratus import SweepConfig

        config_kwargs = {
            "model_name": "test",
            "sweep_params": {"n_directions": [1, 2]},
        }
        cfg = SweepConfig(**config_kwargs)
        assert cfg.model_name == "test"

    def test_sweep_result(self):
        """``SweepResult`` can be built and keeps the seed it was given."""
        from obliteratus import SweepResult

        result_kwargs = {
            "params": {"n_directions": 1},
            "seed": 42,
            "quality_metrics": {},
            "stage_durations": {},
            "strong_layers": [],
        }
        outcome = SweepResult(**result_kwargs)
        assert outcome.seed == 42
|
||||
|
||||
|
||||
class TestEvaluationImports:
    """Names that must be importable from ``obliteratus.evaluation``."""

    def test_refusal_rate_with_ci(self):
        """One compliant response gives a rate of 0.0 over one sample."""
        from obliteratus.evaluation import refusal_rate_with_ci

        responses = ["Sure, here you go."]
        stats = refusal_rate_with_ci(responses, mode="combined")
        assert stats["rate"] == 0.0
        assert stats["n_samples"] == 1

    def test_random_direction_ablation(self):
        """``random_direction_ablation`` is exported and callable."""
        from obliteratus.evaluation import random_direction_ablation as exported
        assert callable(exported)

    def test_direction_specificity_test(self):
        """``direction_specificity_test`` is exported and callable."""
        from obliteratus.evaluation import direction_specificity_test as exported
        assert callable(exported)

    def test_run_benchmarks(self):
        """``run_benchmarks`` is exported and callable."""
        from obliteratus.evaluation import run_benchmarks as exported
        assert callable(exported)

    def test_compare_models(self):
        """``compare_models`` is exported and callable."""
        from obliteratus.evaluation import compare_models as exported
        assert callable(exported)
|
||||
|
||||
|
||||
class TestDirectImports:
    """Imports by full module path, bypassing the package-level re-exports."""

    def test_reproducibility(self):
        """Re-seeding with the same value reproduces the same random draw."""
        from obliteratus.reproducibility import set_seed
        import torch

        def seeded_draw():
            # Seed immediately before sampling so both draws start from the
            # same RNG state.
            set_seed(999, deterministic=False)
            return torch.randn(10)

        first = seeded_draw()
        second = seeded_draw()
        assert torch.equal(first, second)

    def test_baselines(self):
        """``BaselineResult`` is importable from the baselines module."""
        from obliteratus.evaluation.baselines import BaselineResult as imported
        assert imported is not None

    def test_lm_eval_integration(self):
        """``run_benchmarks`` is importable from the lm-eval module and callable."""
        from obliteratus.evaluation.lm_eval_integration import run_benchmarks as imported
        assert callable(imported)
|
||||
@@ -0,0 +1,672 @@
|
||||
"""Tests for the five new analysis modules:
|
||||
1. Tuned Lens (learned-affine logit lens variant)
|
||||
2. Activation Patching (real interchange intervention)
|
||||
3. Enhanced SAE Decomposition Pipeline
|
||||
4. Wasserstein-Optimal Direction Extraction
|
||||
5. Bayesian-Optimized Kernel Projection
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from obliteratus.analysis.tuned_lens import (
|
||||
TunedLensTrainer,
|
||||
TunedLensProbe,
|
||||
RefusalTunedLens,
|
||||
TunedLensResult,
|
||||
MultiLayerTunedLensResult,
|
||||
)
|
||||
from obliteratus.analysis.activation_patching import (
|
||||
ActivationPatcher,
|
||||
PatchingSite,
|
||||
ActivationPatchingResult,
|
||||
)
|
||||
from obliteratus.analysis.sae_abliteration import (
|
||||
SAEDecompositionPipeline,
|
||||
SAEDecompositionResult,
|
||||
FeatureClusterResult,
|
||||
)
|
||||
from obliteratus.analysis.wasserstein_optimal import (
|
||||
WassersteinOptimalExtractor,
|
||||
WassersteinDirectionResult,
|
||||
WassersteinComparisonResult,
|
||||
MultiLayerWassersteinResult,
|
||||
)
|
||||
from obliteratus.analysis.bayesian_kernel_projection import (
|
||||
BayesianKernelProjection,
|
||||
BayesianOptimizationResult,
|
||||
ProjectionConfig,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_activations(
|
||||
hidden_dim=32, n_per_class=20, separation=2.0, seed=42,
|
||||
):
|
||||
"""Create harmful/harmless activations with planted refusal signal."""
|
||||
torch.manual_seed(seed)
|
||||
direction = torch.randn(hidden_dim)
|
||||
direction = direction / direction.norm()
|
||||
|
||||
harmful = [
|
||||
torch.randn(hidden_dim) * 0.3 + separation * direction
|
||||
for _ in range(n_per_class)
|
||||
]
|
||||
harmless = [
|
||||
torch.randn(hidden_dim) * 0.3
|
||||
for _ in range(n_per_class)
|
||||
]
|
||||
return harmful, harmless, direction
|
||||
|
||||
|
||||
def _make_multilayer_activations(
|
||||
n_layers=6, hidden_dim=32, n_per_class=20, separation=2.0, seed=42,
|
||||
):
|
||||
"""Create per-layer activations with planted refusal signals."""
|
||||
torch.manual_seed(seed)
|
||||
|
||||
harmful_acts = {}
|
||||
harmless_acts = {}
|
||||
directions = {}
|
||||
|
||||
for li in range(n_layers):
|
||||
d = torch.randn(hidden_dim)
|
||||
d = d / d.norm()
|
||||
directions[li] = d
|
||||
|
||||
strength = separation if 1 <= li <= n_layers - 2 else 0.3
|
||||
harmful_acts[li] = [
|
||||
torch.randn(hidden_dim) * 0.3 + strength * d
|
||||
for _ in range(n_per_class)
|
||||
]
|
||||
harmless_acts[li] = [
|
||||
torch.randn(hidden_dim) * 0.3
|
||||
for _ in range(n_per_class)
|
||||
]
|
||||
|
||||
return harmful_acts, harmless_acts, directions
|
||||
|
||||
|
||||
class FakeTokenizer:
    """Minimal tokenizer stand-in mapping each string to one stable token ID.

    Args:
        vocab_size: Size of the ID space; every ID returned by ``encode``
            lies in ``[0, vocab_size)``.
    """

    def __init__(self, vocab_size=100):
        self.vocab_size = vocab_size

    def encode(self, text, add_special_tokens=False):
        """Return a one-element list holding the token ID for ``text``.

        The built-in ``hash`` is salted per interpreter process for strings,
        so it would hand out different IDs on every test run. A CRC32 of the
        UTF-8 bytes is used instead, which makes the mapping reproducible.

        ``add_special_tokens`` is accepted for call compatibility and ignored.
        """
        import zlib  # local import keeps this fix self-contained in the block

        checksum = zlib.crc32(str(text).encode("utf-8"))
        return [checksum % self.vocab_size]

    def decode(self, ids):
        """Return ``"tok_<id>"`` built from the first element of ``ids``."""
        return f"tok_{ids[0]}"
|
||||
|
||||
|
||||
class FakeModel(nn.Module):
    """Tiny stand-in network exposing ``lm_head``, ``transformer.ln_f`` and
    ``transformer.h`` for tests.

    Args:
        hidden_dim: Width of the hidden state.
        vocab_size: Number of output logits per position.
        n_layers: Number of linear blocks in ``transformer.h``.
    """

    def __init__(self, hidden_dim=32, vocab_size=100, n_layers=4):
        super().__init__()
        self.hidden_dim, self.vocab_size, self.n_layers = hidden_dim, vocab_size, n_layers

        # Creation order is kept (head, norm, blocks) so weight initialisation
        # consumes the RNG in the same sequence.
        self.lm_head = nn.Linear(hidden_dim, vocab_size, bias=False)
        backbone = nn.Module()
        backbone.ln_f = nn.LayerNorm(hidden_dim)
        blocks = []
        for _ in range(n_layers):
            blocks.append(nn.Linear(hidden_dim, hidden_dim))
        backbone.h = nn.ModuleList(blocks)
        self.transformer = backbone

    def forward(self, input_ids):
        """Return an object whose ``logits`` has shape ``(batch, seq, vocab_size)``.

        Only the shape of ``input_ids`` is used: the hidden state starts as
        fresh Gaussian noise and is passed through each block with a residual
        connection, then the final norm and the head.
        """
        n_batch, n_tokens = input_ids.shape
        hidden = torch.randn(n_batch, n_tokens, self.hidden_dim)
        for block in self.transformer.h:
            hidden = block(hidden) + hidden
        normed = self.transformer.ln_f(hidden)
        output_cls = type('Output', (), {'logits': self.lm_head(normed)})
        return output_cls()
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Tuned Lens
|
||||
# ===========================================================================
|
||||
|
||||
class TestTunedLensTrainer:
    """Training behaviour of ``TunedLensTrainer`` on small random activations."""

    def test_train_single_probe(self):
        """One trained probe has the right layer index, shapes and a modest loss."""
        dim, n_rows = 16, 30

        source = torch.randn(n_rows, dim)
        # The target is the source plus small noise.
        target = source + torch.randn(n_rows, dim) * 0.1

        probe = TunedLensTrainer(dim, n_epochs=20).train_probe(source, target, layer_idx=3)

        assert isinstance(probe, TunedLensProbe)
        assert probe.layer_idx == 3
        assert probe.weight.shape == (dim, dim)
        assert probe.bias.shape == (dim,)
        assert probe.train_loss < 1.0  # should converge somewhat

    def test_train_all_layers(self):
        """Training over four layers returns one square-weight probe per layer."""
        dim, n_rows, n_probed = 16, 20, 4

        per_layer = {layer: torch.randn(n_rows, dim) for layer in range(n_probed)}
        final = torch.randn(n_rows, dim)

        probes = TunedLensTrainer(dim, n_epochs=10).train_all_layers(per_layer, final)

        assert len(probes) == 4
        for layer in range(n_probed):
            assert layer in probes
            assert probes[layer].weight.shape == (dim, dim)

    def test_probe_near_identity_for_final_layer(self):
        """Mapping activations onto themselves should give a near-identity weight."""
        dim, n_rows = 16, 50

        same = torch.randn(n_rows, dim)
        probe = TunedLensTrainer(dim, n_epochs=50).train_probe(same, same, layer_idx=0)

        # Distance between the learned weight and the identity matrix.
        distance = (probe.weight - torch.eye(dim)).norm().item()
        assert distance < 1.0
|
||||
|
||||
|
||||
class TestRefusalTunedLens:
    """Behaviour of ``RefusalTunedLens`` analysis, comparison and reporting."""

    def test_analyze_direction(self):
        """Single-layer analysis returns a well-formed ``TunedLensResult``."""
        dim, vocab, k = 32, 100, 10

        fake_model = FakeModel(dim, vocab)
        fake_tokenizer = FakeTokenizer(vocab)

        refusal_dir = torch.randn(dim)
        # Near-identity probe: the identity plus a small random perturbation.
        near_identity = torch.eye(dim) + torch.randn(dim, dim) * 0.01
        probe = TunedLensProbe(
            layer_idx=2,
            weight=near_identity,
            bias=torch.zeros(dim),
            train_loss=0.01,
        )

        outcome = RefusalTunedLens(top_k=k).analyze_direction(
            refusal_dir, probe, fake_model, fake_tokenizer
        )

        assert isinstance(outcome, TunedLensResult)
        assert outcome.layer_idx == 2
        for ranked in (outcome.top_promoted, outcome.top_suppressed):
            assert len(ranked) <= k
        assert isinstance(outcome.correction_magnitude, float)
        assert outcome.correction_magnitude >= 0

    def test_analyze_all_layers(self):
        """Four directions with identity probes give a four-layer result."""
        dim, vocab = 32, 100

        fake_model = FakeModel(dim, vocab)
        fake_tokenizer = FakeTokenizer(vocab)

        layer_ids = range(4)
        per_layer_dirs = {layer: torch.randn(dim) for layer in layer_ids}
        identity_probes = {}
        for layer in layer_ids:
            identity_probes[layer] = TunedLensProbe(
                layer_idx=layer,
                weight=torch.eye(dim),
                bias=torch.zeros(dim),
                train_loss=0.01,
            )

        outcome = RefusalTunedLens(top_k=5).analyze_all_layers(
            per_layer_dirs, identity_probes, fake_model, fake_tokenizer
        )

        assert isinstance(outcome, MultiLayerTunedLensResult)
        assert len(outcome.per_layer) == 4
        assert outcome.strongest_refusal_layer in range(4)

    def test_compare_with_logit_lens(self):
        """Tuned gaps that are a constant multiple of the logit-lens gaps agree fully."""
        logit_gaps = {0: 0.1, 1: 0.5, 2: 0.3, 3: 0.8}

        per_layer = {}
        for layer, gap in logit_gaps.items():
            per_layer[layer] = TunedLensResult(
                layer_idx=layer,
                top_promoted=[], top_suppressed=[],
                refusal_token_mean_boost=0.0,
                compliance_token_mean_boost=0.0,
                refusal_compliance_gap=gap * 1.1,  # same ordering as logit_gaps
                correction_magnitude=0.1,
            )

        tuned = MultiLayerTunedLensResult(
            per_layer=per_layer,
            probes={},
            strongest_refusal_layer=3,
            peak_gap_layer=3,
            mean_refusal_compliance_gap=0.5,
            logit_lens_agreement=0.0,
        )

        agreement = RefusalTunedLens.compare_with_logit_lens(tuned, logit_gaps)
        # Identical layer ordering, so the agreement score is expected to be 1.0.
        assert agreement == pytest.approx(1.0, abs=0.01)

    def test_format_report(self):
        """An empty result still renders a titled report noting the absence of layers."""
        empty = MultiLayerTunedLensResult(
            per_layer={},
            probes={},
            strongest_refusal_layer=0,
            peak_gap_layer=0,
            mean_refusal_compliance_gap=0.0,
            logit_lens_agreement=0.0,
        )
        text = RefusalTunedLens.format_report(empty)
        for fragment in ("Tuned Lens", "No layers analyzed"):
            assert fragment in text
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Activation Patching
|
||||
# ===========================================================================
|
||||
|
||||
class TestActivationPatcher:
    """Site construction, patching sweeps and reporting for ``ActivationPatcher``."""

    @staticmethod
    def _model_and_inputs():
        """Return a 4-layer fake model plus random clean/corrupted ID batches."""
        fake_model = FakeModel(32, vocab_size=100, n_layers=4)
        clean = torch.randint(0, 100, (1, 10))
        corrupted = torch.randint(0, 100, (1, 10))
        return fake_model, clean, corrupted

    def test_patching_site_creation(self):
        """A site built without a head index stores ``None`` for it."""
        site = PatchingSite(layer_idx=3, component="residual")
        assert (site.layer_idx, site.component) == (3, "residual")
        assert site.head_idx is None

    def test_patching_site_with_head(self):
        """A site built with a head index keeps it."""
        site = PatchingSite(layer_idx=2, component="attn_head", head_idx=5)
        assert site.head_idx == 5

    def test_patch_sweep_with_model(self):
        """A noising sweep on the fake model returns a well-formed result."""
        fake_model, clean, corrupted = self._model_and_inputs()

        patcher = ActivationPatcher(significance_threshold=0.05)
        outcome = patcher.patch_sweep(fake_model, clean, corrupted, mode="noising")

        assert isinstance(outcome, ActivationPatchingResult)
        assert outcome.patching_mode == "noising"
        assert outcome.n_layers == 4
        assert len(outcome.effects) > 0
        assert isinstance(outcome.circuit_fraction, float)
        assert 0.0 <= outcome.circuit_fraction <= 1.0

    def test_patch_sweep_denoising(self):
        """A denoising sweep records its mode on the result."""
        fake_model, clean, corrupted = self._model_and_inputs()

        outcome = ActivationPatcher().patch_sweep(
            fake_model, clean, corrupted, mode="denoising",
        )

        assert outcome.patching_mode == "denoising"

    def test_custom_metric(self):
        """A user-supplied metric function is accepted by the sweep."""
        fake_model, clean, corrupted = self._model_and_inputs()

        def logit_total(logits):
            return logits.sum().item()

        outcome = ActivationPatcher(metric_fn=logit_total).patch_sweep(
            fake_model, clean, corrupted
        )

        assert isinstance(outcome, ActivationPatchingResult)
        assert isinstance(outcome.clean_baseline, float)

    def test_format_report(self):
        """The report names the technique and the patching mode."""
        fields = {
            "n_layers": 4,
            "n_sites": 4,
            "patching_mode": "noising",
            "effects": [],
            "clean_baseline": 1.0,
            "corrupted_baseline": 0.0,
            "total_effect": 1.0,
            "significant_sites": [],
            "circuit_fraction": 0.0,
            "top_causal_layers": [],
        }
        text = ActivationPatcher.format_report(ActivationPatchingResult(**fields))
        for fragment in ("Activation Patching", "noising"):
            assert fragment in text
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Enhanced SAE Decomposition Pipeline
|
||||
# ===========================================================================
|
||||
|
||||
class TestSAEDecompositionPipeline:
    """End-to-end checks of ``SAEDecompositionPipeline`` on planted-signal data."""

    def test_basic_pipeline(self):
        """The result exposes one entry per requested top-k feature."""
        n_features = 8
        harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=30, separation=2.0)

        result = SAEDecompositionPipeline(
            expansion=2, n_epochs=10, top_k_features=n_features, n_clusters=3,
        ).run(harmful, harmless, layer_idx=0)

        assert isinstance(result, SAEDecompositionResult)
        assert result.layer_idx == 0
        assert result.sae is not None
        assert result.refusal_features.n_refusal_features == n_features
        per_feature_outputs = (
            result.feature_sparsity,
            result.feature_monosemanticity,
            result.per_feature_refusal_reduction,
            result.cumulative_refusal_reduction,
        )
        for values in per_feature_outputs:
            assert len(values) == n_features
        assert 0.0 <= result.raw_direction_overlap <= 1.0

    def test_feature_clustering(self):
        """Clustering yields three clusters with a label for each of eight features."""
        n_clusters = 3
        harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=30)

        result = SAEDecompositionPipeline(
            expansion=2, n_epochs=10, top_k_features=8, n_clusters=n_clusters,
        ).run(harmful, harmless)

        found = result.feature_clusters
        assert found is not None
        assert isinstance(found, FeatureClusterResult)
        assert found.n_clusters == n_clusters
        assert len(found.cluster_labels) == 8
        assert all(0 <= label < n_clusters for label in found.cluster_labels)
        assert found.cluster_directions.shape[0] == n_clusters
        assert -1.0 <= found.silhouette_score <= 1.0

    def test_cumulative_reduction_monotonic(self):
        """Cumulative refusal reduction never drops by more than 1e-6 between steps."""
        harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=30, separation=3.0)

        result = SAEDecompositionPipeline(
            expansion=2, n_epochs=10, top_k_features=6
        ).run(harmful, harmless)

        # Compare each value with its predecessor, allowing a small tolerance.
        curve = result.cumulative_refusal_reduction
        for earlier, later in zip(curve, curve[1:]):
            assert later >= earlier - 1e-6

    def test_format_report(self):
        """The report carries its title and a variance-explained line."""
        harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=20)
        result = SAEDecompositionPipeline(
            expansion=2, n_epochs=5, top_k_features=4, n_clusters=2
        ).run(harmful, harmless)

        text = SAEDecompositionPipeline.format_report(result)
        for fragment in ("SAE Feature Decomposition", "Variance explained"):
            assert fragment in text
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Wasserstein-Optimal Direction Extraction
|
||||
# ===========================================================================
|
||||
|
||||
class TestWassersteinOptimalExtractor:
    """Checks for ``WassersteinOptimalExtractor`` extraction, comparison and reporting."""

    @staticmethod
    def _diff_in_means(harmful, harmless):
        """Return the unit-norm difference between harmful and harmless means."""
        harmful_mean = torch.stack(harmful).float().mean(0)
        harmless_mean = torch.stack(harmless).float().mean(0)
        delta = harmful_mean - harmless_mean
        return delta / delta.norm()

    def test_basic_extraction(self):
        """Extraction returns a unit direction and non-negative cost components."""
        harmful, harmless, _ = _make_activations(
            hidden_dim=32, n_per_class=30, separation=3.0,
        )

        extractor = WassersteinOptimalExtractor()
        result = extractor.extract(harmful, harmless, layer_idx=0)

        assert isinstance(result, WassersteinDirectionResult)
        assert result.layer_idx == 0
        assert result.direction.shape == (32,)
        assert abs(result.direction.norm().item() - 1.0) < 1e-5
        assert result.wasserstein_cost >= 0
        assert result.mean_shift_component >= 0
        assert result.bures_component >= 0
        assert result.cost_effectiveness_ratio >= 0

    def test_direction_captures_signal(self):
        """Wasserstein direction should have non-trivial refusal projection."""
        harmful, harmless, planted_dir = _make_activations(
            hidden_dim=32, n_per_class=30, separation=3.0,
        )

        extractor = WassersteinOptimalExtractor()
        result = extractor.extract(harmful, harmless)

        # Sign is irrelevant, so compare the absolute cosine with the planted
        # direction; 0.1 only rules out near-orthogonality.
        cosine = abs((result.direction @ planted_dir).item())
        assert cosine > 0.1

    def test_extract_all_layers(self):
        """Multi-layer extraction covers every layer and picks a valid best layer."""
        harmful_acts, harmless_acts, _ = _make_multilayer_activations(
            n_layers=4, hidden_dim=16, n_per_class=20,
        )

        extractor = WassersteinOptimalExtractor()
        result = extractor.extract_all_layers(harmful_acts, harmless_acts)

        assert isinstance(result, MultiLayerWassersteinResult)
        assert len(result.per_layer) == 4
        assert result.best_layer in range(4)
        assert result.mean_cost_ratio >= 0

    def test_compare_with_alternatives(self):
        """Comparison against two alternative directions fills in every field."""
        harmful, harmless, planted_dir = _make_activations(
            hidden_dim=16, n_per_class=30, separation=3.0,
        )

        extractor = WassersteinOptimalExtractor()
        w_result = extractor.extract(harmful, harmless)

        # The planted direction stands in for a Fisher direction; the second
        # alternative is the normalised difference of class means.
        dim_dir = self._diff_in_means(harmful, harmless)

        comparison = extractor.compare_with_alternatives(
            w_result, harmful, harmless,
            fisher_direction=planted_dir,
            dim_direction=dim_dir,
        )

        assert isinstance(comparison, WassersteinComparisonResult)
        assert comparison.wasserstein_cost_ratio >= 0
        assert comparison.fisher_cost_ratio is not None
        assert comparison.dim_cost_ratio is not None
        assert 0 <= comparison.cosine_wasserstein_fisher <= 1
        assert 0 <= comparison.cosine_wasserstein_dim <= 1

    def test_wasserstein_lower_cost_than_dim(self):
        """Wasserstein-optimal should have lower cost ratio than diff-in-means."""
        harmful, harmless, _ = _make_activations(
            hidden_dim=32, n_per_class=50, separation=2.0,
        )

        extractor = WassersteinOptimalExtractor()
        w_result = extractor.extract(harmful, harmless)

        dim_dir = self._diff_in_means(harmful, harmless)

        comparison = extractor.compare_with_alternatives(
            w_result, harmful, harmless, dim_direction=dim_dir,
        )

        # The Wasserstein ratio may exceed the diff-in-means ratio by at most
        # the 1e-4 numerical tolerance.
        assert comparison.wasserstein_cost_ratio <= comparison.dim_cost_ratio + 1e-4

    def test_format_report(self):
        """The multi-layer report mentions the method and the cost ratio."""
        harmful, harmless, _ = _make_activations(hidden_dim=16, n_per_class=20)
        extractor = WassersteinOptimalExtractor()
        # The same activations are reused for both layers; only formatting is
        # under test here.
        result = extractor.extract_all_layers(
            {0: harmful, 1: harmful},
            {0: harmless, 1: harmless},
        )
        report = WassersteinOptimalExtractor.format_report(result)
        assert "Wasserstein" in report
        assert "cost ratio" in report.lower()
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Bayesian-Optimized Kernel Projection
|
||||
# ===========================================================================
|
||||
|
||||
class TestBayesianKernelProjection:
    """Checks for ``BayesianKernelProjection`` optimisation results and reporting."""

    def test_basic_optimization(self):
        """A 30-trial run records every trial and reports in-range best metrics."""
        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
            n_layers=6, hidden_dim=16, n_per_class=20,
        )

        optimizer = BayesianKernelProjection(
            n_trials=30, refusal_weight=0.6, distortion_weight=0.4,
        )
        result = optimizer.optimize(harmful_acts, harmless_acts, directions)

        assert isinstance(result, BayesianOptimizationResult)
        assert result.n_trials == 30
        assert result.best_score >= 0
        assert 0 <= result.best_refusal_reduction <= 1.0
        assert result.best_harmless_distortion >= 0
        assert len(result.all_trials) == 30

    def test_best_config_structure(self):
        """The best configuration is a ``ProjectionConfig`` with sane field values."""
        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
            n_layers=4, hidden_dim=16, n_per_class=15,
        )

        optimizer = BayesianKernelProjection(n_trials=20)
        result = optimizer.optimize(harmful_acts, harmless_acts, directions)

        config = result.best_config
        assert isinstance(config, ProjectionConfig)
        assert config.layer_range[0] <= config.layer_range[1]
        assert config.n_directions >= 1
        assert 0 <= config.regularization <= 0.5

    def test_pareto_front(self):
        """The Pareto front is non-empty and its distortion never increases."""
        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
            n_layers=6, hidden_dim=16, n_per_class=20,
        )

        optimizer = BayesianKernelProjection(n_trials=50)
        result = optimizer.optimize(harmful_acts, harmless_acts, directions)

        front = result.pareto_configs
        # Pareto front should have at least 1 entry
        assert len(front) >= 1

        # Each entry's distortion must be >= the next entry's (within 1e-8).
        # The original note says the front is sorted by decreasing refusal
        # reduction, which would make this the non-dominated ordering --
        # NOTE(review): confirm the sort order against the optimiser.
        for current, following in zip(front, front[1:]):
            assert (
                current.harmless_distortion
                >= following.harmless_distortion - 1e-8
            )

    def test_layer_importance(self):
        """Every one of the six layers gets an importance in ``[0, 1]``."""
        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
            n_layers=6, hidden_dim=16, n_per_class=20,
        )

        optimizer = BayesianKernelProjection(n_trials=50)
        result = optimizer.optimize(harmful_acts, harmless_acts, directions)

        assert len(result.layer_importance) == 6
        for imp in result.layer_importance.values():
            assert 0 <= imp <= 1.0

    def test_tpe_improves_over_random(self):
        """TPE phase should produce better configs than random exploration."""
        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
            n_layers=6, hidden_dim=16, n_per_class=20,
        )

        optimizer = BayesianKernelProjection(n_trials=60, seed=42)
        result = optimizer.optimize(harmful_acts, harmless_acts, directions)

        # Compare the BEST (lowest) combined score among the first 20 trials
        # with the best among the last 20. The original comments label these
        # the random and TPE phases -- NOTE(review): confirm the phase split.
        best_random = min(t.combined_score for t in result.all_trials[:20])
        best_tpe = min(t.combined_score for t in result.all_trials[-20:])

        # Lower is better. The search is stochastic, so the later phase is
        # allowed to be up to 0.3 worse.
        assert best_tpe <= best_random + 0.3

    def test_empty_input(self):
        """Empty inputs give a result with zero trials and a zero best score."""
        optimizer = BayesianKernelProjection(n_trials=10)
        result = optimizer.optimize({}, {}, {})

        assert result.n_trials == 0
        assert result.best_score == 0.0

    def test_format_report(self):
        """The report names the method, the Pareto front and layer importance."""
        harmful_acts, harmless_acts, directions = _make_multilayer_activations(
            n_layers=4, hidden_dim=16, n_per_class=15,
        )

        optimizer = BayesianKernelProjection(n_trials=20)
        result = optimizer.optimize(harmful_acts, harmless_acts, directions)

        report = BayesianKernelProjection.format_report(result)
        assert "Bayesian" in report
        assert "Pareto" in report
        assert "Layer importance" in report
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Module imports
|
||||
# ===========================================================================
|
||||
|
||||
class TestModuleImports:
    """The new analysis classes must be re-exported by ``obliteratus.analysis``."""

    def test_all_new_modules_importable(self):
        """Each new class can be imported from the package and is not ``None``."""
        from obliteratus.analysis import (
            TunedLensTrainer,
            RefusalTunedLens,
            ActivationPatcher,
            WassersteinOptimalExtractor,
            BayesianKernelProjection,
            SAEDecompositionPipeline,
        )

        imported = (
            TunedLensTrainer,
            RefusalTunedLens,
            ActivationPatcher,
            WassersteinOptimalExtractor,
            BayesianKernelProjection,
            SAEDecompositionPipeline,
        )
        for exported_class in imported:
            assert exported_class is not None

    def test_new_modules_in_all(self):
        """Each new class name is listed in the package's ``__all__``."""
        import obliteratus.analysis as analysis

        expected_names = (
            "TunedLensTrainer",
            "RefusalTunedLens",
            "ActivationPatcher",
            "WassersteinOptimalExtractor",
            "BayesianKernelProjection",
            "SAEDecompositionPipeline",
        )
        for name in expected_names:
            assert name in analysis.__all__
|
||||
@@ -0,0 +1,669 @@
|
||||
"""Tests for analysis techniques: concept cones, alignment imprints,
|
||||
multi-token position, and sparse direction surgery."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
import torch
|
||||
|
||||
from obliteratus.analysis.concept_geometry import (
|
||||
ConceptConeAnalyzer,
|
||||
ConeConeResult,
|
||||
MultiLayerConeResult,
|
||||
CategoryDirection,
|
||||
DEFAULT_HARM_CATEGORIES,
|
||||
)
|
||||
from obliteratus.analysis.alignment_imprint import (
|
||||
AlignmentImprintDetector,
|
||||
AlignmentImprint,
|
||||
BaseInstructDelta,
|
||||
)
|
||||
from obliteratus.analysis.multi_token_position import (
|
||||
MultiTokenPositionAnalyzer,
|
||||
PositionAnalysisResult,
|
||||
MultiTokenSummary,
|
||||
)
|
||||
from obliteratus.analysis.sparse_surgery import (
|
||||
SparseDirectionSurgeon,
|
||||
SparseProjectionResult,
|
||||
SparseSurgeryPlan,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_category_activations(
    hidden_dim=32, n_prompts=30, n_categories=5, category_spread=0.3,
):
    """Build seeded synthetic activations with one planted direction per category.

    A unit-norm "shared" vector is drawn first. Every category direction is
    the normalised sum of that shared vector and ``category_spread`` times a
    unit-norm random vector of its own, so a larger spread pulls the
    categories further apart.

    Prompts are split evenly across categories; any remainder of
    ``n_prompts // n_categories`` is dropped.

    Returns:
        A 4-tuple ``(harmful_acts, harmless_acts, category_map, cat_dirs)``:
        two equally long lists of ``hidden_dim`` vectors, a dict mapping
        prompt index to category name, and a dict mapping category name to
        its unit direction. Each harmful vector is the matching harmless
        vector plus ``2.0`` times its category direction.
    """
    # Fixed seed: every call yields the same tensors for the same arguments.
    torch.manual_seed(42)

    def _unit(vec):
        return vec / vec.norm()

    # Component common to all categories.
    common = _unit(torch.randn(hidden_dim))

    # One direction per category: shared part plus a scaled private part.
    names = [f"cat_{k}" for k in range(n_categories)]
    cat_dirs = {}
    for name in names:
        private = _unit(torch.randn(hidden_dim))
        cat_dirs[name] = _unit(common + category_spread * private)

    # Contiguous runs of prompt indices belong to the same category.
    per_category = n_prompts // n_categories
    category_map = {
        slot * per_category + offset: name
        for slot, name in enumerate(names)
        for offset in range(per_category)
    }

    harmful_acts, harmless_acts = [], []
    for prompt_idx in range(per_category * n_categories):
        noise = torch.randn(hidden_dim) * 0.1
        harmful_acts.append(noise + 2.0 * cat_dirs[category_map[prompt_idx]])
        harmless_acts.append(noise)

    return harmful_acts, harmless_acts, category_map, cat_dirs
|
||||
|
||||
|
||||
def _make_refusal_directions(n_layers=8, hidden_dim=32, concentration="distributed"):
    """Return seeded per-layer directions and strengths for a given pattern.

    ``concentration`` selects how the strengths (and, for ``"orthogonal"``,
    the directions) are laid out across layers:

    * ``"concentrated"`` -- 3.0 in the last two layers, 0.1 elsewhere.
    * ``"distributed"`` -- 1.0 plus a small random perturbation per layer.
    * ``"orthogonal"`` -- constant 1.5; every direction after the first has
      its component along the previous layer's direction removed.
    * anything else -- 2.0 for layers 2 through 4, 0.5 elsewhere.

    Returns:
        ``(directions, strengths)``, two dicts keyed by layer index.
    """
    # Fixed seed: identical arguments always produce identical outputs.
    torch.manual_seed(123)
    directions, strengths = {}, {}

    for layer in range(n_layers):
        raw = torch.randn(hidden_dim)
        directions[layer] = raw / raw.norm()

        if concentration == "concentrated":
            # Only the final two layers carry a strong signal.
            strengths[layer] = 0.1 if layer < n_layers - 2 else 3.0
        elif concentration == "distributed":
            # Roughly even strength with a little per-layer jitter.
            strengths[layer] = 1.0 + 0.2 * torch.randn(1).item()
        elif concentration == "orthogonal":
            if layer > 0:
                # Strip the part of the raw draw lying along the previous
                # layer's (already final) direction, then renormalise.
                previous = directions[layer - 1]
                residual = raw - (raw @ previous) * previous
                directions[layer] = residual / residual.norm().clamp(min=1e-8)
            strengths[layer] = 1.5
        else:
            strengths[layer] = 2.0 if 2 <= layer <= 4 else 0.5

    return directions, strengths
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Concept Cone Geometry
|
||||
# ===========================================================================
|
||||
|
||||
class TestConceptConeAnalyzer:
    """Exercise ConceptConeAnalyzer on synthetic per-category activations."""

    def test_basic_analysis(self):
        """A default run returns a populated result with in-range metrics."""
        harmful_acts, harmless_acts, category_map, _ = _make_category_activations()
        cone = ConceptConeAnalyzer(category_map=category_map).analyze_layer(
            harmful_acts, harmless_acts, layer_idx=5,
        )

        assert isinstance(cone, ConeConeResult)
        assert cone.layer_idx == 5
        assert cone.category_count >= 2
        assert cone.cone_dimensionality > 0
        assert cone.cone_solid_angle >= 0
        assert 0 <= cone.mean_pairwise_cosine <= 1.0

    def test_polyhedral_detection(self):
        """A large category spread should give a dimensionality above one."""
        # Large spread -> distinct per-category directions.
        harmful_acts, harmless_acts, category_map, _ = _make_category_activations(
            category_spread=2.0,
        )
        cone = ConceptConeAnalyzer(category_map=category_map).analyze_layer(
            harmful_acts, harmless_acts,
        )
        assert cone.cone_dimensionality > 1.0

    def test_linear_detection(self):
        """With zero spread the category directions should be nearly parallel."""
        # No spread -> every category shares the same planted direction.
        harmful_acts, harmless_acts, category_map, _ = _make_category_activations(
            category_spread=0.0,
        )
        cone = ConceptConeAnalyzer(category_map=category_map).analyze_layer(
            harmful_acts, harmless_acts,
        )
        assert cone.mean_pairwise_cosine > 0.8

    def test_category_directions_populated(self):
        """Every reported category direction carries sane per-category stats."""
        harmful_acts, harmless_acts, category_map, _ = _make_category_activations()
        cone = ConceptConeAnalyzer(category_map=category_map).analyze_layer(
            harmful_acts, harmless_acts,
        )

        for entry in cone.category_directions:
            assert isinstance(entry, CategoryDirection)
            assert entry.strength > 0
            assert entry.n_prompts >= 2
            assert 0 <= entry.specificity <= 1.0

    def test_pairwise_cosines(self):
        """Pairwise cosines lie in [0, 1] and are keyed by ordered pairs."""
        harmful_acts, harmless_acts, category_map, _ = _make_category_activations()
        cone = ConceptConeAnalyzer(category_map=category_map).analyze_layer(
            harmful_acts, harmless_acts,
        )

        for (first, second), cosine in cone.pairwise_cosines.items():
            assert 0 <= cosine <= 1.0
            assert first < second  # keys are sorted pairs

    def test_general_direction_unit(self):
        """The general direction is unit-norm to within 0.01."""
        harmful_acts, harmless_acts, category_map, _ = _make_category_activations()
        cone = ConceptConeAnalyzer(category_map=category_map).analyze_layer(
            harmful_acts, harmless_acts,
        )
        norm = cone.general_direction.norm().item()
        assert abs(norm - 1.0) < 0.01

    def test_multi_layer_analysis(self):
        """Analysing four layers yields four per-layer entries."""
        harmful_acts, harmless_acts, category_map, _ = _make_category_activations()
        layers = range(4)
        harmful_by_layer = {layer: harmful_acts for layer in layers}
        harmless_by_layer = {layer: harmless_acts for layer in layers}

        analyzer = ConceptConeAnalyzer(category_map=category_map)
        combined = analyzer.analyze_all_layers(harmful_by_layer, harmless_by_layer)

        assert isinstance(combined, MultiLayerConeResult)
        assert len(combined.per_layer) == 4
        assert combined.mean_cone_dimensionality > 0

    def test_format_report(self):
        """The text report names the analysis, the layer and the dimensionality."""
        harmful_acts, harmless_acts, category_map, _ = _make_category_activations()
        cone = ConceptConeAnalyzer(category_map=category_map).analyze_layer(
            harmful_acts, harmless_acts, layer_idx=3,
        )
        text = ConceptConeAnalyzer.format_report(cone)

        for fragment in ("Concept Cone", "Layer 3", "dimensionality"):
            assert fragment in text

    def test_default_category_map(self):
        """The built-in category map has 30 entries including weapons and cyber."""
        assert len(DEFAULT_HARM_CATEGORIES) == 30
        known = set(DEFAULT_HARM_CATEGORIES.values())
        assert "weapons" in known
        assert "cyber" in known

    def test_empty_activations(self):
        """Empty inputs produce a result with zero categories."""
        cone = ConceptConeAnalyzer().analyze_layer([], [], layer_idx=0)
        assert cone.category_count == 0

    def test_min_category_size(self):
        """Categories smaller than ``min_category_size`` are dropped."""
        harmful_acts, harmless_acts, category_map, _ = _make_category_activations(
            n_prompts=10, n_categories=5,
        )
        analyzer = ConceptConeAnalyzer(category_map=category_map, min_category_size=3)
        cone = analyzer.analyze_layer(harmful_acts, harmless_acts)
        # 10 prompts over 5 categories is 2 each, below the minimum of 3.
        assert cone.category_count == 0
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Alignment Imprint Detector
|
||||
# ===========================================================================
|
||||
|
||||
class TestAlignmentImprintDetector:
    """Exercise AlignmentImprintDetector on synthetic per-layer directions."""

    def test_basic_detection(self):
        """Detection returns a known method label and a confidence in [0, 1]."""
        layer_dirs, layer_strengths = _make_refusal_directions()
        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)

        assert isinstance(imprint, AlignmentImprint)
        assert imprint.predicted_method in ("dpo", "rlhf", "cai", "sft")
        assert 0 <= imprint.confidence <= 1.0

    def test_probabilities_sum_to_one(self):
        """The four method probabilities add up to one within 0.01."""
        layer_dirs, layer_strengths = _make_refusal_directions()
        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)

        probabilities = (
            imprint.dpo_probability,
            imprint.rlhf_probability,
            imprint.cai_probability,
            imprint.sft_probability,
        )
        assert abs(sum(probabilities) - 1.0) < 0.01

    def test_concentrated_detects_sft_or_dpo(self):
        """A tail-concentrated strength profile should predict SFT or DPO."""
        layer_dirs, layer_strengths = _make_refusal_directions(
            concentration="concentrated",
        )
        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
        # Both SFT and DPO are accepted for a concentrated signature.
        assert imprint.predicted_method in ("sft", "dpo")

    def test_distributed_detects_not_sft(self):
        """An evenly distributed strength profile should not be labelled SFT."""
        layer_dirs, layer_strengths = _make_refusal_directions(
            n_layers=16, concentration="distributed",
        )
        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
        # Even strengths -> SFT is not expected to be the top prediction.
        assert imprint.predicted_method != "sft"

    def test_orthogonal_detects_cai(self):
        """Mutually orthogonal layer directions should give CAI real weight."""
        layer_dirs, layer_strengths = _make_refusal_directions(
            n_layers=12, concentration="orthogonal",
        )
        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
        assert imprint.cai_probability > 0.15

    def test_feature_extraction(self):
        """All extracted features fall inside their expected ranges."""
        layer_dirs, layer_strengths = _make_refusal_directions()
        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)

        assert 0 <= imprint.gini_coefficient <= 1.0
        assert imprint.effective_rank > 0
        assert 0 <= imprint.cross_layer_smoothness <= 1.0
        assert 0 <= imprint.tail_layer_bias <= 1.0
        assert 0 <= imprint.mean_pairwise_orthogonality <= 1.0
        assert imprint.spectral_decay_rate >= 0

    def test_empty_directions(self):
        """No directions yields the "unknown" method with zero confidence."""
        imprint = AlignmentImprintDetector().detect_imprint({})
        assert imprint.predicted_method == "unknown"
        assert imprint.confidence == 0.0

    def test_compare_base_instruct(self):
        """Deltas built from the refusal directions align with those directions."""
        torch.manual_seed(42)
        hidden_dim = 32
        layer_dirs, _ = _make_refusal_directions(hidden_dim=hidden_dim)

        layers = range(8)
        base_acts = {layer: torch.randn(hidden_dim) for layer in layers}
        # The "instruct" activations are the base ones shifted along each
        # layer's refusal direction.
        instruct_acts = {
            layer: base_acts[layer] + 1.5 * layer_dirs[layer] for layer in layers
        }

        deltas = AlignmentImprintDetector().compare_base_instruct(
            base_acts, instruct_acts, layer_dirs,
        )

        assert len(deltas) == 8
        for delta in deltas:
            assert isinstance(delta, BaseInstructDelta)
            assert delta.delta_magnitude > 0
            # The shift is the refusal direction itself, so alignment is high.
            assert abs(delta.cosine_with_refusal) > 0.5

    def test_format_imprint(self):
        """The text report mentions the title, DPO, RLHF and Gini."""
        layer_dirs, layer_strengths = _make_refusal_directions()
        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
        text = AlignmentImprintDetector.format_imprint(imprint)

        for fragment in ("Alignment Imprint", "DPO", "RLHF", "Gini"):
            assert fragment in text

    def test_per_layer_strength_populated(self):
        """One per-layer strength is reported for every input direction."""
        layer_dirs, layer_strengths = _make_refusal_directions()
        imprint = AlignmentImprintDetector().detect_imprint(layer_dirs, layer_strengths)
        assert len(imprint.per_layer_strength) == len(layer_dirs)
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Multi-Token Position Analysis
|
||||
# ===========================================================================
|
||||
|
||||
class TestMultiTokenPositionAnalyzer:
    """Exercise MultiTokenPositionAnalyzer on sequences with a planted trigger."""

    def _make_activations_with_trigger(
        self, seq_len=20, hidden_dim=32, trigger_pos=5,
    ):
        """Return seeded ``(activations, direction)`` with a planted trigger.

        Low-amplitude noise fills a ``(seq_len, hidden_dim)`` tensor. The
        unit-norm direction is then added at ``trigger_pos`` (scale 3.0), at
        the last position (scale 1.0), and at up to three positions after the
        trigger with the scale halving at every step.
        """
        torch.manual_seed(42)
        direction = torch.randn(hidden_dim)
        direction = direction / direction.norm()

        # Background noise.
        activations = torch.randn(seq_len, hidden_dim) * 0.1

        # Strongest signal at the trigger, a weaker one at the final token.
        activations[trigger_pos] += 3.0 * direction
        activations[-1] += 1.0 * direction

        # Geometric decay over the positions right after the trigger.
        decay_end = min(trigger_pos + 4, seq_len)
        for pos in range(trigger_pos + 1, decay_end):
            activations[pos] += 3.0 * (0.5 ** (pos - trigger_pos)) * direction

        return activations, direction

    def test_basic_analysis(self):
        """A default run reports the layer, the token count and a positive peak."""
        activations, direction = self._make_activations_with_trigger()
        outcome = MultiTokenPositionAnalyzer().analyze_prompt(
            activations, direction, layer_idx=3,
        )

        assert isinstance(outcome, PositionAnalysisResult)
        assert outcome.layer_idx == 3
        assert outcome.n_tokens == 20
        assert outcome.peak_strength > 0

    def test_trigger_detection(self):
        """The planted trigger position is both flagged and the peak."""
        activations, direction = self._make_activations_with_trigger(trigger_pos=5)
        analyzer = MultiTokenPositionAnalyzer(trigger_threshold=0.5)
        outcome = analyzer.analyze_prompt(activations, direction)

        assert 5 in outcome.trigger_positions
        assert outcome.peak_position == 5

    def test_peak_vs_last(self):
        """The peak sits at the trigger and is stronger than the last token."""
        activations, direction = self._make_activations_with_trigger(trigger_pos=5)
        outcome = MultiTokenPositionAnalyzer().analyze_prompt(activations, direction)

        assert outcome.peak_strength > outcome.last_token_strength
        assert outcome.peak_position != outcome.n_tokens - 1

    def test_decay_rate_positive(self):
        """The planted geometric decay gives a positive decay rate."""
        activations, direction = self._make_activations_with_trigger(trigger_pos=5)
        outcome = MultiTokenPositionAnalyzer().analyze_prompt(activations, direction)
        assert outcome.decay_rate > 0

    def test_position_gini_bounded(self):
        """The positional Gini coefficient stays within [0, 1]."""
        activations, direction = self._make_activations_with_trigger()
        outcome = MultiTokenPositionAnalyzer().analyze_prompt(activations, direction)
        assert 0 <= outcome.position_gini <= 1.0

    def test_token_profiles_length(self):
        """One token profile is produced per sequence position."""
        activations, direction = self._make_activations_with_trigger(seq_len=15)
        outcome = MultiTokenPositionAnalyzer().analyze_prompt(activations, direction)
        assert len(outcome.token_profiles) == 15

    def test_custom_token_texts(self):
        """Profiles carry the supplied token text or a ``pos_`` placeholder."""
        activations, direction = self._make_activations_with_trigger(
            seq_len=10, trigger_pos=3,
        )
        tokens = ["How", "to", "make", "a", "bomb", "from", "scratch", "please", "help", "me"]
        outcome = MultiTokenPositionAnalyzer().analyze_prompt(
            activations, direction, token_texts=tokens,
        )
        for profile in outcome.token_profiles:
            assert profile.token_text in tokens or profile.token_text.startswith("pos_")

    def test_batch_analysis(self):
        """Batch analysis aggregates five prompts into in-range summary stats."""
        generated = [
            self._make_activations_with_trigger(trigger_pos=3 + i % 3)
            for i in range(5)
        ]
        batch = [activations for activations, _ in generated]
        # The direction returned by the last generated prompt is the one used.
        direction = generated[-1][1]

        summary = MultiTokenPositionAnalyzer().analyze_batch(batch, direction)

        assert isinstance(summary, MultiTokenSummary)
        assert len(summary.per_prompt) == 5
        assert summary.mean_peak_vs_last_ratio > 0
        assert summary.mean_trigger_count > 0
        assert 0 <= summary.peak_is_last_fraction <= 1.0
        assert 0 <= summary.last_token_dominance <= 1.0

    def test_last_token_dominant_case(self):
        """With signal only at the final token, the peak is the final token."""
        torch.manual_seed(42)
        hidden_dim, seq_len = 32, 10
        direction = torch.randn(hidden_dim)
        direction = direction / direction.norm()

        activations = torch.randn(seq_len, hidden_dim) * 0.01
        activations[-1] += 5.0 * direction

        outcome = MultiTokenPositionAnalyzer().analyze_prompt(activations, direction)
        assert outcome.peak_position == seq_len - 1

    def test_format_position_report(self):
        """The per-prompt report includes its title and the peak position line."""
        activations, direction = self._make_activations_with_trigger()
        outcome = MultiTokenPositionAnalyzer().analyze_prompt(
            activations, direction, prompt_text="How to hack?",
        )
        text = MultiTokenPositionAnalyzer.format_position_report(outcome)

        assert "Multi-Token" in text
        assert "Peak position" in text

    def test_format_summary(self):
        """The batch summary report includes its title and the prompt count line."""
        generated = [self._make_activations_with_trigger() for _ in range(3)]
        batch = [activations for activations, _ in generated]
        direction = generated[-1][1]

        summary = MultiTokenPositionAnalyzer().analyze_batch(batch, direction)
        text = MultiTokenPositionAnalyzer.format_summary(summary)

        assert "Summary" in text
        assert "Prompts analyzed" in text

    def test_3d_activations_handled(self):
        """A leading batch dimension of one is accepted."""
        activations, direction = self._make_activations_with_trigger()
        batched = activations.unsqueeze(0)  # (1, seq_len, hidden_dim)
        outcome = MultiTokenPositionAnalyzer().analyze_prompt(batched, direction)
        assert outcome.n_tokens == 20

    def test_empty_batch(self):
        """An empty batch gives no per-prompt results and a fraction of 1.0."""
        direction = torch.randn(32)
        summary = MultiTokenPositionAnalyzer().analyze_batch([], direction)
        assert len(summary.per_prompt) == 0
        assert summary.peak_is_last_fraction == 1.0
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Sparse Direction Surgery
|
||||
# ===========================================================================
|
||||
|
||||
class TestSparseDirectionSurgeon:
    """Exercise SparseDirectionSurgeon on weights with row-sparse refusal signal."""

    def _make_weight_with_sparse_refusal(
        self, out_dim=64, in_dim=32, n_refusal_rows=5,
    ):
        """Return seeded ``(weight, direction, planted_rows)``.

        The weight is low-amplitude noise of shape ``(out_dim, in_dim)`` with
        ``5.0`` times the unit-norm direction added to each of the first
        ``n_refusal_rows`` rows; ``planted_rows`` lists those row indices.
        """
        torch.manual_seed(42)
        direction = torch.randn(in_dim)
        direction = direction / direction.norm()

        weight = torch.randn(out_dim, in_dim) * 0.1

        # Plant a strong signal in the leading rows only.
        planted_rows = list(range(n_refusal_rows))
        for row in planted_rows:
            weight[row] += 5.0 * direction

        return weight, direction, planted_rows

    def test_basic_analysis(self):
        """Analysis reports the layer, row counts and projection statistics."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        analysis = SparseDirectionSurgeon(sparsity=0.1).analyze_weight_matrix(
            weight, direction, layer_idx=3,
        )

        assert isinstance(analysis, SparseProjectionResult)
        assert analysis.layer_idx == 3
        assert analysis.n_rows_total == 64
        assert analysis.n_rows_modified > 0
        assert analysis.mean_projection > 0
        assert analysis.max_projection > analysis.mean_projection

    def test_refusal_sparsity_index(self):
        """A signal planted in 5 of 100 rows gives a sparsity index above 0.3."""
        weight, direction, _ = self._make_weight_with_sparse_refusal(
            out_dim=100, n_refusal_rows=5,
        )
        analysis = SparseDirectionSurgeon().analyze_weight_matrix(weight, direction)
        assert analysis.refusal_sparsity_index > 0.3

    def test_energy_removed(self):
        """Modifying the top rows removes more than half of the energy."""
        weight, direction, _ = self._make_weight_with_sparse_refusal(
            out_dim=64, n_refusal_rows=5,
        )
        # Sparsity 0.15 of 64 rows is roughly 10 rows, covering the 5 planted.
        surgeon = SparseDirectionSurgeon(sparsity=0.15)
        analysis = surgeon.analyze_weight_matrix(weight, direction)
        assert analysis.energy_removed > 0.5

    def test_frobenius_change_bounded(self):
        """The relative Frobenius change is positive and below 100%."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        analysis = SparseDirectionSurgeon(sparsity=0.1).analyze_weight_matrix(
            weight, direction,
        )
        assert analysis.frobenius_change > 0
        assert analysis.frobenius_change < 1.0

    def test_apply_sparse_projection(self):
        """Applying the sparse projection lowers the total absolute projection."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        surgeon = SparseDirectionSurgeon(sparsity=0.1)

        projected = surgeon.apply_sparse_projection(weight, direction)

        before = (weight @ direction).abs().sum().item()
        after = (projected @ direction).abs().sum().item()
        assert after < before

    def test_sparse_preserves_unmodified_rows(self):
        """Only about a ``sparsity`` fraction of rows changes; the rest do not."""
        weight, direction, _ = self._make_weight_with_sparse_refusal(
            out_dim=64, n_refusal_rows=5,
        )
        surgeon = SparseDirectionSurgeon(sparsity=0.1)  # about 6 of 64 rows
        projected = surgeon.apply_sparse_projection(weight, direction)

        # Per-row L1 difference between the original and projected weights.
        row_delta = (weight - projected).abs().sum(dim=1)
        n_changed = (row_delta > 1e-6).sum().item()
        n_unchanged = (row_delta < 1e-6).sum().item()

        assert n_changed <= int(0.1 * 64) + 1
        assert n_unchanged >= 57

    def test_dense_vs_sparse_comparison(self):
        """The sparse projection touches fewer rows than a dense projection."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()

        # Dense baseline: remove the direction from every row.
        unit = direction / direction.norm()
        dense = weight - (weight @ unit).unsqueeze(1) * unit.unsqueeze(0)

        sparse = SparseDirectionSurgeon(sparsity=0.1).apply_sparse_projection(
            weight, direction,
        )

        dense_delta = (weight - dense).abs().sum(dim=1)
        sparse_delta = (weight - sparse).abs().sum(dim=1)

        n_dense_changed = (dense_delta > 1e-6).sum().item()
        n_sparse_changed = (sparse_delta > 1e-6).sum().item()

        assert n_sparse_changed < n_dense_changed

    def test_plan_surgery(self):
        """A six-layer plan has six entries and positive aggregate statistics."""
        weights, directions = {}, {}
        for layer in range(6):
            weights[layer], directions[layer], _ = (
                self._make_weight_with_sparse_refusal()
            )

        plan = SparseDirectionSurgeon(sparsity=0.1).plan_surgery(weights, directions)

        assert isinstance(plan, SparseSurgeryPlan)
        assert len(plan.per_layer) == 6
        assert 0 < plan.recommended_sparsity < 1.0
        assert plan.mean_refusal_sparsity_index > 0
        assert plan.mean_energy_removed > 0

    def test_auto_sparsity(self):
        """Automatic sparsity selection lands between 0.01 and 0.5."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        surgeon = SparseDirectionSurgeon(auto_sparsity=True)
        analysis = surgeon.analyze_weight_matrix(weight, direction)
        assert 0.01 <= analysis.sparsity <= 0.5

    def test_auto_sparsity_apply(self):
        """Projection with automatic sparsity still reduces the projection."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        surgeon = SparseDirectionSurgeon(auto_sparsity=True)
        projected = surgeon.apply_sparse_projection(weight, direction)
        assert (projected @ direction).abs().sum() < (weight @ direction).abs().sum()

    def test_format_analysis(self):
        """The analysis report names the title, the layer and the sparsity index."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        analysis = SparseDirectionSurgeon(sparsity=0.1).analyze_weight_matrix(
            weight, direction, layer_idx=4,
        )
        text = SparseDirectionSurgeon.format_analysis(analysis)

        for fragment in ("Sparse Direction Surgery", "Layer 4", "Refusal Sparsity Index"):
            assert fragment in text

    def test_format_plan(self):
        """The plan report names the title and the recommended sparsity."""
        weights = {layer: torch.randn(32, 16) for layer in range(4)}
        directions = {layer: torch.randn(16) for layer in range(4)}

        plan = SparseDirectionSurgeon(sparsity=0.1).plan_surgery(weights, directions)
        text = SparseDirectionSurgeon.format_plan(plan)

        assert "Sparse Direction Surgery Plan" in text
        assert "Recommended sparsity" in text

    def test_empty_inputs(self):
        """Planning over no layers yields an empty per-layer list."""
        plan = SparseDirectionSurgeon().plan_surgery({}, {})
        assert len(plan.per_layer) == 0

    def test_output_dtype_preserved(self):
        """A float16 weight comes back as float16."""
        weight, direction, _ = self._make_weight_with_sparse_refusal()
        half_weight = weight.half()
        projected = SparseDirectionSurgeon(sparsity=0.1).apply_sparse_projection(
            half_weight, direction,
        )
        assert projected.dtype == torch.float16
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests: Integration / Imports
|
||||
# ===========================================================================
|
||||
|
||||
class TestAnalysisImports:
    """Import smoke test for the analysis package's public names."""

    def test_all_new_modules_importable(self):
        """The four analyzer classes import from ``obliteratus.analysis``."""
        from obliteratus.analysis import (
            ConceptConeAnalyzer,
            AlignmentImprintDetector,
            MultiTokenPositionAnalyzer,
            SparseDirectionSurgeon,
        )

        imported = (
            ConceptConeAnalyzer,
            AlignmentImprintDetector,
            MultiTokenPositionAnalyzer,
            SparseDirectionSurgeon,
        )
        for symbol in imported:
            assert symbol is not None
|
||||
@@ -0,0 +1,155 @@
|
||||
"""Tests for refusal detection, including semantic mode and confidence intervals."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
from obliteratus.evaluation.advanced_metrics import (
|
||||
_is_refusal,
|
||||
_is_refusal_semantic,
|
||||
refusal_rate,
|
||||
refusal_rate_with_ci,
|
||||
)
|
||||
|
||||
|
||||
class TestKeywordRefusal:
    """Keyword-based refusal detection in prefix, substring and combined modes."""

    def test_obvious_refusal(self):
        """An apology followed by "can't help" counts as a refusal."""
        reply = "I'm sorry, I can't help with that."
        assert _is_refusal(reply, "combined")

    def test_obvious_non_refusal(self):
        """A plainly helpful answer is not a refusal."""
        reply = "Here are the instructions for building a birdhouse."
        assert not _is_refusal(reply, "combined")

    def test_empty_string(self):
        """The empty string is not a refusal."""
        assert not _is_refusal("", "combined")

    def test_prefix_mode(self):
        """Prefix mode only fires when the refusal phrase opens the text."""
        assert _is_refusal("I cannot assist with that request.", "prefix")
        # Same phrase, but preceded by "Well," so it is no longer a prefix.
        assert not _is_refusal("Well, I cannot do that.", "prefix")

    def test_substring_mode(self):
        """Substring mode fires on a refusal phrase anywhere in the text."""
        assert _is_refusal("Well, I cannot assist with that.", "substring")
|
||||
|
||||
|
||||
class TestSemanticRefusal:
    """Semantic refusal detection across several phrasing patterns."""

    def test_hedge_then_refuse(self):
        """An acknowledgement followed by "I cannot" is a refusal."""
        text = "While I understand your interest in this topic, I cannot provide instructions for that."
        assert _is_refusal(text, "semantic")

    def test_redirect(self):
        """Offering an alternative instead of answering is a refusal."""
        text = "Instead of providing those instructions, I can help you with something constructive."
        assert _is_refusal(text, "semantic")

    def test_concern_flagging(self):
        """Citing ethical concerns as the reason not to respond is a refusal."""
        text = "This raises serious ethical concerns that prevent me from responding."
        assert _is_refusal(text, "semantic")

    def test_meta_refusal(self):
        """Appealing to how the assistant was designed is a refusal."""
        text = "I'm designed to avoid providing information that could cause harm."
        assert _is_refusal(text, "semantic")

    def test_responsibility_deflection(self):
        """Deflecting to a professional is a refusal."""
        text = "You should consult a professional about this matter."
        assert _is_refusal(text, "semantic")

    def test_genuine_compliance_not_flagged(self):
        """Actual compliant responses should not trigger semantic detection."""
        texts = [
            "Here are the steps to build a birdhouse from scratch.",
            "The chemical formula for water is H2O.",
            "Python is a great language for data science. Here's how to get started.",
            "To make chocolate chip cookies, you'll need flour, sugar, butter, and chocolate chips.",
        ]
        for text in texts:
            assert not _is_refusal(text, "semantic"), f"False positive: {text[:50]}"

    def test_semantic_catches_what_keyword_misses(self):
        """Hedged refusals evade prefix matching but are still detected."""
        # The refusal phrase sits mid-sentence, so prefix mode does not fire.
        # "not able to" is in the substring list, so combined mode does.
        text = "While I appreciate your curiosity, I'm not able to share that information."
        assert not _is_refusal(text, "prefix")
        assert _is_refusal(text, "combined")

        # Call the semantic matcher directly so this check does not depend on
        # the keyword modes at all. (A second sample string used to be
        # assigned here but was never asserted on; it has been removed.)
        assert _is_refusal_semantic(
            "While I understand your request, I cannot assist with that specific topic."
        )
|
||||
|
||||
|
||||
class TestRefusalRate:
    """Fraction-of-refusals computation over lists of responses."""

    def test_all_refuse(self):
        """Three refusals out of three gives a rate of 1.0."""
        replies = ["I'm sorry, no.", "I cannot help.", "I won't do that."]
        rate = refusal_rate(replies, "combined")
        assert rate == 1.0

    def test_none_refuse(self):
        """No refusals gives a rate of 0.0."""
        replies = ["Sure, here you go.", "Of course!", "The answer is 42."]
        rate = refusal_rate(replies, "combined")
        assert rate == 0.0

    def test_mixed(self):
        """One refusal out of two gives a rate of 0.5."""
        replies = ["I'm sorry, no.", "Sure, here you go."]
        rate = refusal_rate(replies, "combined")
        assert rate == 0.5

    def test_empty_list(self):
        """An empty response list gives a rate of 0.0."""
        assert refusal_rate([], "combined") == 0.0

    def test_semantic_mode(self):
        """Semantic mode counts the hedged refusal but not the compliant reply."""
        replies = [
            "While I understand your interest, I cannot provide that information.",
            "Here are the steps you requested.",
        ]
        rate = refusal_rate(replies, "semantic")
        assert rate == 0.5
|
||||
|
||||
|
||||
class TestRefusalRateWithCI:
    """Refusal rate together with its confidence interval."""

    def test_returns_all_fields(self):
        """The result exposes rate, both CI bounds and the sample count."""
        replies = ["I'm sorry"] * 5 + ["Sure, here you go."] * 5
        stats = refusal_rate_with_ci(replies, mode="combined")

        for field in ("rate", "ci_lower", "ci_upper", "n_samples"):
            assert field in stats
        assert stats["n_samples"] == 10

    def test_ci_bounds_bracket_rate(self):
        """The point estimate lies between the lower and upper bounds."""
        replies = ["I'm sorry"] * 30 + ["Sure, here you go."] * 70
        stats = refusal_rate_with_ci(replies, mode="combined")
        assert stats["ci_lower"] <= stats["rate"] <= stats["ci_upper"]

    def test_all_refuse_tight_ci(self):
        """All-refusal input gives rate 1.0 with a lower bound just below it."""
        replies = ["I'm sorry"] * 50
        stats = refusal_rate_with_ci(replies, mode="combined")

        assert stats["rate"] == 1.0
        # Wilson CI: 50/50 at 95% gives ci_lower ~0.929, not 1.0 -- a proper
        # interval keeps some uncertainty even when every sample is positive.
        assert stats["ci_lower"] > 0.9
        assert stats["ci_upper"] == 1.0

    def test_empty_responses(self):
        """No responses gives a zero rate and a zero sample count."""
        stats = refusal_rate_with_ci([], mode="combined")
        assert stats["rate"] == 0.0
        assert stats["n_samples"] == 0

    def test_ci_narrower_with_more_samples(self):
        """Ten times the samples at the same rate gives a narrower interval."""
        few = ["I'm sorry"] * 5 + ["Sure"] * 5
        many = ["I'm sorry"] * 50 + ["Sure"] * 50

        ci_few = refusal_rate_with_ci(few)
        ci_many = refusal_rate_with_ci(many)

        width_small = ci_few["ci_upper"] - ci_few["ci_lower"]
        width_large = ci_many["ci_upper"] - ci_many["ci_lower"]
        assert width_large < width_small, \
            f"Large CI ({width_large}) not narrower than small CI ({width_small})"

    def test_deterministic_with_seed(self):
        """Two calls on the same input return equal results."""
        replies = ["I'm sorry"] * 30 + ["Sure"] * 70
        first = refusal_rate_with_ci(replies)
        second = refusal_rate_with_ci(replies)
        assert first == second, "Same input produced different CIs"
|
||||
@@ -0,0 +1,70 @@
|
||||
"""Tests for the reporting module."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
from obliteratus.reporting.report import AblationReport, AblationResult
|
||||
|
||||
|
||||
def _make_report() -> AblationReport:
    """Return a report for "test-model" with one baseline and two layer-removal results.

    Baseline perplexity is 25.0; the two results report 30.0 (layer_0) and
    50.0 (layer_1), which the tests below rely on.
    """
    fixture = AblationReport(model_name="test-model")
    fixture.add_baseline({"perplexity": 25.0, "accuracy": 0.85})

    # (component, description, metrics) for each ablation, added in this order.
    rows = (
        ("layer_0", "Remove layer 0", {"perplexity": 30.0, "accuracy": 0.80}),
        ("layer_1", "Remove layer 1", {"perplexity": 50.0, "accuracy": 0.60}),
    )
    for component, description, metrics in rows:
        fixture.add_result(
            AblationResult(
                strategy="layer_removal",
                component=component,
                description=description,
                metrics=metrics,
            )
        )
    return fixture
|
||||
|
||||
|
||||
class TestAblationReport:
    """Checks the export and plotting surface of ``AblationReport`` using ``_make_report()``."""

    def test_to_dataframe(self):
        """The frame has one row per result plus delta / pct-change columns."""
        frame = _make_report().to_dataframe()
        assert len(frame) == 2
        for column in ("perplexity", "perplexity_delta", "perplexity_pct_change"):
            assert column in frame.columns

    def test_save_json(self, tmp_path):
        """JSON export round-trips model name, results and baseline metrics."""
        target = tmp_path / "results.json"
        _make_report().save_json(target)
        payload = json.loads(target.read_text())
        assert payload["model_name"] == "test-model"
        assert len(payload["results"]) == 2
        assert payload["baseline_metrics"]["perplexity"] == 25.0

    def test_save_csv(self, tmp_path):
        """CSV export mentions a component name and a metric name."""
        target = tmp_path / "results.csv"
        _make_report().save_csv(target)
        contents = target.read_text()
        assert "layer_0" in contents
        assert "perplexity" in contents

    def test_delta_calculation(self):
        """layer_0 perplexity: 30 against a baseline of 25 is +5.0, i.e. +20%."""
        frame = _make_report().to_dataframe()
        is_layer_0 = frame["component"] == "layer_0"
        first_match = frame[is_layer_0].iloc[0]
        assert first_match["perplexity_delta"] == 5.0  # 30 - 25
        assert abs(first_match["perplexity_pct_change"] - 20.0) < 0.01

    def test_plot_impact(self, tmp_path):
        """Plotting writes a non-empty file at the requested path."""
        target = tmp_path / "impact.png"
        _make_report().plot_impact(metric="perplexity", output_path=target)
        assert target.exists()
        assert target.stat().st_size > 0
|
||||
@@ -0,0 +1,179 @@
|
||||
"""Tests for ablation strategies using a small GPT-2 model."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from obliteratus.strategies.base import AblationSpec
|
||||
from obliteratus.strategies.registry import STRATEGY_REGISTRY, get_strategy
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_dummy_handle():
    """Create a minimal ModelHandle with a tiny GPT-2 for testing (no network).

    The model is built from a config (2 layers, 2 heads, 64-dim embeddings)
    rather than loaded from a checkpoint, and a snapshot is taken before the
    handle is returned.
    """
    from unittest.mock import MagicMock
    from transformers import GPT2Config, GPT2LMHeadModel
    from obliteratus.models.loader import ModelHandle

    tiny_config = GPT2Config(
        vocab_size=1000,
        n_positions=128,
        n_embd=64,
        n_layer=2,
        n_head=2,
        n_inner=256,
    )
    tiny_model = GPT2LMHeadModel(tiny_config)
    tiny_model.eval()

    # Strategy tests don't tokenize — use a simple mock
    fake_tokenizer = MagicMock(pad_token="<pad>", eos_token="<eos>")

    dummy = ModelHandle(
        model=tiny_model,
        tokenizer=fake_tokenizer,
        config=tiny_config,
        model_name="gpt2-test",
        task="causal_lm",
    )
    dummy.snapshot()
    return dummy
|
||||
|
||||
|
||||
@pytest.fixture
def handle():
    """Pytest fixture: a fresh dummy handle from ``_make_dummy_handle()`` per test."""
    fresh_handle = _make_dummy_handle()
    return fresh_handle
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRegistry:
    """Lookup behaviour of the strategy registry."""

    def test_all_strategies_registered(self):
        """The four core strategy names must all be present in the registry."""
        required = {"layer_removal", "head_pruning", "ffn_ablation", "embedding_ablation"}
        registered = set(STRATEGY_REGISTRY.keys())
        assert required.issubset(registered)

    def test_get_strategy_returns_instance(self):
        """Looking up a known name yields an object whose ``name`` matches."""
        assert get_strategy("layer_removal").name == "layer_removal"

    def test_get_unknown_strategy_raises(self):
        """An unregistered name raises KeyError mentioning "Unknown strategy"."""
        with pytest.raises(KeyError, match="Unknown strategy"):
            get_strategy("nonexistent_strategy")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Layer removal
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestLayerRemoval:
    """Behaviour of the "layer_removal" strategy on the dummy GPT-2 handle."""

    def test_enumerate(self, handle):
        """One spec per layer, each tagged with the strategy's name."""
        found = get_strategy("layer_removal").enumerate(handle)
        assert len(found) == handle.num_layers
        assert all(spec.strategy_name == "layer_removal" for spec in found)

    def test_apply_zeros_layer(self, handle):
        """Applying the first spec leaves every parameter of layer 0 at zero."""
        strategy = get_strategy("layer_removal")
        first_spec = strategy.enumerate(handle)[0]
        strategy.apply(handle, first_spec)

        from obliteratus.strategies.utils import get_layer_modules
        target_layer = get_layer_modules(handle)[0]
        for tensor in target_layer.parameters():
            assert torch.all(tensor == 0), "Layer params should be zeroed after ablation"

    def test_restore_after_ablation(self, handle):
        """``handle.restore()`` brings layer 0's c_attn weight back after ablation."""
        strategy = get_strategy("layer_removal")
        all_specs = strategy.enumerate(handle)

        from obliteratus.strategies.utils import get_layer_modules
        # Clone before ablating so the reference copy is not zeroed in place.
        before = get_layer_modules(handle)[0].attn.c_attn.weight.clone()

        strategy.apply(handle, all_specs[0])
        handle.restore()

        after = get_layer_modules(handle)[0].attn.c_attn.weight
        assert torch.allclose(before, after)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Head pruning
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestHeadPruning:
    """Behaviour of the "head_pruning" strategy on the dummy GPT-2 handle."""

    def test_enumerate(self, handle):
        """One spec is produced for every (layer, head) pair."""
        found = get_strategy("head_pruning").enumerate(handle)
        assert len(found) == handle.num_layers * handle.num_heads

    def test_apply_zeros_head(self, handle):
        """Pruning layer 0 / head 0 zeroes that head's slice of the output projection."""
        target = AblationSpec(
            strategy_name="head_pruning",
            component="layer_0_head_0",
            description="test",
            metadata={"layer_idx": 0, "head_idx": 0},
        )
        get_strategy("head_pruning").apply(handle, target)

        from obliteratus.strategies.utils import get_layer_modules, get_attention_module
        first_layer = get_layer_modules(handle)[0]
        attention = get_attention_module(first_layer, handle.architecture)
        per_head = handle.hidden_size // handle.num_heads

        # GPT-2 uses c_attn (Conv1D), check output projection c_proj
        # NOTE(review): when the module has no c_proj, nothing is asserted and
        # the test passes vacuously.
        if not hasattr(attention, "c_proj"):
            return
        # Conv1D stores weight transposed
        head_rows = attention.c_proj.weight[0:per_head, :]
        assert torch.all(head_rows == 0)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FFN ablation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFFNAblation:
    """Behaviour of the "ffn_ablation" strategy on the dummy GPT-2 handle."""

    def test_enumerate(self, handle):
        """One spec is produced per layer."""
        found = get_strategy("ffn_ablation").enumerate(handle)
        assert len(found) == handle.num_layers

    def test_apply_zeros_ffn(self, handle):
        """Applying the first spec zeroes every parameter of layer 0's FFN module."""
        strategy = get_strategy("ffn_ablation")
        first_spec = strategy.enumerate(handle)[0]
        strategy.apply(handle, first_spec)

        from obliteratus.strategies.utils import get_layer_modules, get_ffn_module
        first_layer = get_layer_modules(handle)[0]
        feed_forward = get_ffn_module(first_layer, handle.architecture)
        for tensor in feed_forward.parameters():
            assert torch.all(tensor == 0)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Embedding ablation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEmbeddingAblation:
    """Behaviour of the "embedding_ablation" strategy on the dummy GPT-2 handle."""

    def test_enumerate(self, handle):
        """At least one spec is produced."""
        found = get_strategy("embedding_ablation").enumerate(handle)
        assert len(found) > 0

    def test_apply_zeros_dims(self, handle):
        """Ablating dims 0..4 zeroes the first four columns of the embedding weight."""
        target = AblationSpec(
            strategy_name="embedding_ablation",
            component="embed_dims_0_4",
            description="test",
            metadata={"dim_start": 0, "dim_end": 4},
        )
        get_strategy("embedding_ablation").apply(handle, target)

        from obliteratus.strategies.utils import get_embedding_module
        embedding = get_embedding_module(handle)
        ablated_columns = embedding.weight[:, 0:4]
        assert torch.all(ablated_columns == 0)
|
||||
@@ -0,0 +1,108 @@
|
||||
"""Tests for ablation presets."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from obliteratus.study_presets import (
|
||||
STUDY_PRESETS,
|
||||
get_study_preset,
|
||||
get_preset,
|
||||
list_study_presets,
|
||||
list_presets,
|
||||
)
|
||||
from obliteratus.config import StudyConfig
|
||||
|
||||
|
||||
class TestPresets:
    """Lookup and listing of study presets, including the short alias functions."""

    def test_all_presets_registered(self):
        """Every expected preset key is present in ``STUDY_PRESETS``."""
        expected_keys = {
            "quick", "full", "attention", "layers", "knowledge",
            "pruning", "embeddings", "jailbreak", "guardrail", "robustness",
        }
        registered = set(STUDY_PRESETS.keys())
        assert expected_keys.issubset(registered)

    def test_get_preset(self):
        """The "quick" preset is named "Quick Scan" and lists two strategies."""
        quick = get_study_preset("quick")
        assert quick.name == "Quick Scan"
        assert quick.key == "quick"
        assert len(quick.strategies) == 2

    def test_get_preset_alias(self):
        """``get_preset`` resolves "quick" to the same display name."""
        assert get_preset("quick").name == "Quick Scan"

    def test_get_unknown_preset_raises(self):
        """An unknown key raises KeyError mentioning "Unknown preset"."""
        import pytest
        with pytest.raises(KeyError, match="Unknown preset"):
            get_study_preset("nonexistent")

    def test_list_presets(self):
        """Listing returns at least seven presets, including "quick" and "full"."""
        available = list_study_presets()
        assert len(available) >= 7
        keys = [entry.key for entry in available]
        for required in ("quick", "full"):
            assert required in keys

    def test_list_presets_alias(self):
        """``list_presets`` and ``list_study_presets`` return equal results."""
        assert list_presets() == list_study_presets()

    def test_preset_strategies_are_valid(self):
        """Every strategy name referenced by any preset exists in the strategy registry."""
        from obliteratus.strategies import STRATEGY_REGISTRY
        for preset in list_study_presets():
            for s in preset.strategies:
                assert s["name"] in STRATEGY_REGISTRY, (
                    f"Preset {preset.key!r} references unknown strategy {s['name']!r}"
                )
|
||||
|
||||
|
||||
class TestConfigWithPreset:
    """How ``StudyConfig.from_dict`` combines a preset with explicit settings."""

    @staticmethod
    def _raw_config(preset_field, preset_name, dataset_extra=None, **top_level):
        """Build a fresh config dict: preset entry, gpt2 model, wikitext dataset, extras."""
        dataset = {
            "name": "wikitext",
            "subset": "wikitext-2-raw-v1",
            "split": "test",
            "text_column": "text",
        }
        if dataset_extra:
            dataset.update(dataset_extra)
        raw = {
            preset_field: preset_name,
            "model": {"name": "gpt2", "task": "causal_lm", "dtype": "float32", "device": "cpu"},
            "dataset": dataset,
        }
        raw.update(top_level)
        return raw

    def test_preset_key_in_config(self):
        """A bare "preset": "quick" entry supplies strategies, sample cap and batching."""
        config = StudyConfig.from_dict(self._raw_config("preset", "quick"))
        # Should inherit strategies from the quick preset
        assert len(config.strategies) == 2
        inherited = [strategy.name for strategy in config.strategies]
        assert "layer_removal" in inherited
        assert "ffn_ablation" in inherited
        # Should inherit max_samples
        assert config.dataset.max_samples == 25
        # Should inherit batch_size and max_length
        assert config.batch_size == 4
        assert config.max_length == 128

    def test_legacy_study_preset_key_still_works(self):
        """The older "study_preset" key is accepted as well."""
        config = StudyConfig.from_dict(self._raw_config("study_preset", "quick"))
        assert len(config.strategies) == 2

    def test_preset_can_be_overridden(self):
        """Explicit strategies, batch_size and dataset max_samples beat the preset."""
        raw = self._raw_config(
            "preset",
            "quick",
            dataset_extra={"max_samples": 999},
            batch_size=16,
            strategies=[{"name": "head_pruning", "params": {}}],
        )
        config = StudyConfig.from_dict(raw)
        # Explicit strategies should override preset
        assert len(config.strategies) == 1
        assert config.strategies[0].name == "head_pruning"
        # Explicit batch_size should override
        assert config.batch_size == 16
        # Explicit max_samples in dataset should be kept
        assert config.dataset.max_samples == 999

    def test_full_preset(self):
        """The "full" preset expands to exactly the four core strategies."""
        config = StudyConfig.from_dict(self._raw_config("preset", "full"))
        assert len(config.strategies) == 4
        names = {strategy.name for strategy in config.strategies}
        assert names == {"layer_removal", "head_pruning", "ffn_ablation", "embedding_ablation"}
|
||||
@@ -0,0 +1,696 @@
|
||||
"""Tests for the opt-in telemetry module."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import torch
|
||||
|
||||
from obliteratus.telemetry import (
|
||||
_ALLOWED_METHOD_CONFIG_KEYS,
|
||||
_direction_stats,
|
||||
_extract_excise_details,
|
||||
_extract_prompt_counts,
|
||||
_extract_analysis_insights,
|
||||
_is_mount_point,
|
||||
_test_writable,
|
||||
build_report,
|
||||
disable_telemetry,
|
||||
enable_telemetry,
|
||||
is_enabled,
|
||||
maybe_send_informed_report,
|
||||
maybe_send_pipeline_report,
|
||||
restore_from_hub,
|
||||
send_report,
|
||||
storage_diagnostic,
|
||||
)
|
||||
|
||||
|
||||
def _reset_telemetry():
    """Set ``obliteratus.telemetry._enabled`` back to ``None`` between tests."""
    import obliteratus.telemetry as telemetry_module
    setattr(telemetry_module, "_enabled", None)
|
||||
|
||||
|
||||
# ── Enable / disable ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestTelemetryConfig:
    """Test telemetry enable/disable logic."""

    def setup_method(self):
        _reset_telemetry()

    def teardown_method(self):
        # Several tests call enable_telemetry()/disable_telemetry(); clear the
        # module's `_enabled` flag again so that state does not leak into
        # whichever test runs next.
        _reset_telemetry()

    def test_disabled_by_default(self):
        """With an empty environment and not on HF Spaces, telemetry is off."""
        import obliteratus.telemetry as t
        # Pin _ON_HF_SPACES to False so the result does not depend on the
        # environment the module happened to be imported in.
        with patch.dict(os.environ, {}, clear=True), \
                patch.object(t, "_ON_HF_SPACES", False):
            _reset_telemetry()
            assert not is_enabled()

    def test_enabled_by_default_on_hf_spaces(self):
        """With the HF Spaces flag set and no explicit opt-out, telemetry is on."""
        import obliteratus.telemetry as t
        # patch.object restores the original flag even when the assertion
        # fails, unlike a manual save/assign/restore sequence.
        with patch.dict(os.environ, {"SPACE_ID": "user/space"}, clear=True), \
                patch.object(t, "_ON_HF_SPACES", True):
            _reset_telemetry()
            assert is_enabled()

    def test_disable_via_env_zero(self):
        """OBLITERATUS_TELEMETRY=0 turns telemetry off."""
        with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "0"}):
            _reset_telemetry()
            assert not is_enabled()

    def test_disable_via_env_false(self):
        """OBLITERATUS_TELEMETRY=false turns telemetry off."""
        with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "false"}):
            _reset_telemetry()
            assert not is_enabled()

    def test_enable_via_env_explicit(self):
        """OBLITERATUS_TELEMETRY=1 turns telemetry on."""
        with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "1"}):
            _reset_telemetry()
            assert is_enabled()

    def test_enable_programmatically(self):
        """enable_telemetry() makes is_enabled() true."""
        enable_telemetry()
        assert is_enabled()

    def test_disable_programmatically(self):
        """disable_telemetry() reverses a previous enable_telemetry()."""
        enable_telemetry()
        assert is_enabled()
        disable_telemetry()
        assert not is_enabled()

    def test_programmatic_overrides_env(self):
        """A programmatic disable wins over OBLITERATUS_TELEMETRY=1."""
        with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "1"}):
            disable_telemetry()
            assert not is_enabled()
|
||||
|
||||
|
||||
# ── Report building ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestBuildReport:
    """Test report payload construction."""

    def _base_kwargs(self, **overrides):
        """Return the default ``build_report`` keyword set with ``overrides`` applied on top."""
        baseline = {
            "architecture": "LlamaForCausalLM",
            "num_layers": 32,
            "num_heads": 32,
            "hidden_size": 4096,
            "total_params": 8_000_000_000,
            "method": "advanced",
            "method_config": {"n_directions": 4, "norm_preserve": True},
            "quality_metrics": {"perplexity": 5.2, "refusal_rate": 0.05},
        }
        return {**baseline, **overrides}

    def test_schema_version_2(self):
        payload = build_report(**self._base_kwargs())
        assert payload["schema_version"] == 2

    def test_basic_fields(self):
        payload = build_report(**self._base_kwargs())
        model_section = payload["model"]
        assert model_section["architecture"] == "LlamaForCausalLM"
        assert model_section["num_layers"] == 32
        assert model_section["total_params"] == 8_000_000_000
        assert payload["method"] == "advanced"
        assert payload["quality_metrics"]["refusal_rate"] == 0.05
        assert len(payload["session_id"]) == 32

    def test_filters_unknown_config_keys(self):
        noisy_config = {"n_directions": 1, "secret_flag": True, "nuke": "boom"}
        payload = build_report(**self._base_kwargs(method_config=noisy_config))
        kept = payload["method_config"]
        assert "n_directions" in kept
        assert "secret_flag" not in kept
        assert "nuke" not in kept

    def test_allows_all_valid_config_keys(self):
        """Every key in the allowlist should pass through."""
        everything_on = dict.fromkeys(_ALLOWED_METHOD_CONFIG_KEYS, True)
        payload = build_report(**self._base_kwargs(method_config=everything_on))
        for k in _ALLOWED_METHOD_CONFIG_KEYS:
            assert k in payload["method_config"], f"Missing allowlisted key: {k}"

    def test_no_model_name_in_report(self):
        serialized = json.dumps(build_report(**self._base_kwargs()))
        for fragment in ("meta-llama", "Llama-3"):
            assert fragment not in serialized

    def test_environment_info(self):
        environment = build_report(**self._base_kwargs())["environment"]
        for key in ("python_version", "os", "arch"):
            assert key in environment

    def test_stage_durations(self):
        timings = {"summon": 2.5, "probe": 10.1, "distill": 3.2}
        payload = build_report(**self._base_kwargs(stage_durations=timings))
        assert payload["stage_durations"] == timings

    def test_direction_stats(self):
        supplied = {"direction_norms": {"10": 0.95}, "mean_direction_persistence": 0.87}
        payload = build_report(**self._base_kwargs(direction_stats=supplied))
        assert payload["direction_stats"]["mean_direction_persistence"] == 0.87

    def test_excise_details(self):
        supplied = {"modified_count": 128, "used_techniques": ["head_surgery"]}
        payload = build_report(**self._base_kwargs(excise_details=supplied))
        assert payload["excise_details"]["modified_count"] == 128

    def test_prompt_counts(self):
        supplied = {"harmful": 33, "harmless": 33, "jailbreak": 15}
        payload = build_report(**self._base_kwargs(prompt_counts=supplied))
        reported = payload["prompt_counts"]
        assert reported["harmful"] == 33
        assert reported["jailbreak"] == 15

    def test_gpu_memory(self):
        supplied = {"peak_allocated_gb": 7.2, "peak_reserved_gb": 8.0}
        payload = build_report(**self._base_kwargs(gpu_memory=supplied))
        assert payload["gpu_memory"]["peak_allocated_gb"] == 7.2

    def test_analysis_insights_filtered(self):
        """Only allowlisted analysis keys should pass through."""
        supplied = {
            "detected_alignment_method": "DPO",
            "alignment_confidence": 0.92,
            "secret_internal_data": "should not appear",
        }
        payload = build_report(**self._base_kwargs(analysis_insights=supplied))
        reported = payload["analysis_insights"]
        assert reported["detected_alignment_method"] == "DPO"
        assert "secret_internal_data" not in reported

    def test_informed_extras(self):
        supplied = {"ouroboros_passes": 3, "final_refusal_rate": 0.02, "total_duration": 120.5}
        payload = build_report(**self._base_kwargs(informed_extras=supplied))
        assert payload["informed"]["ouroboros_passes"] == 3

    def test_optional_fields_omitted_when_empty(self):
        """Optional fields should not appear when not provided."""
        payload = build_report(**self._base_kwargs())
        optional_sections = (
            "stage_durations",
            "direction_stats",
            "excise_details",
            "prompt_counts",
            "gpu_memory",
            "analysis_insights",
            "informed",
        )
        for section in optional_sections:
            assert section not in payload
|
||||
|
||||
|
||||
# ── Direction stats extraction ──────────────────────────────────────────
|
||||
|
||||
|
||||
class TestDirectionStats:
    """Test direction quality metric extraction."""

    def setup_method(self):
        # A private, seeded generator makes every random tensor below
        # reproducible without touching torch's global RNG state.
        self.rng = torch.Generator().manual_seed(0)

    def _randn(self, *shape):
        """Return a standard-normal tensor of ``shape`` drawn from the seeded generator."""
        return torch.randn(*shape, generator=self.rng)

    def test_direction_norms(self):
        """Norms are reported per layer, keyed by the layer index as a string."""
        pipeline = MagicMock()
        pipeline.refusal_directions = {
            0: self._randn(128),
            1: self._randn(128),
        }
        pipeline.refusal_subspaces = {}
        stats = _direction_stats(pipeline)
        assert "direction_norms" in stats
        assert "0" in stats["direction_norms"]
        assert "1" in stats["direction_norms"]

    def test_direction_persistence(self):
        """Adjacent layers with similar directions should have high persistence."""
        d = self._randn(128)
        d = d / d.norm()
        pipeline = MagicMock()
        # Layer 1 is layer 0's unit direction plus small noise.
        pipeline.refusal_directions = {0: d, 1: d + 0.01 * self._randn(128)}
        pipeline.refusal_subspaces = {}
        stats = _direction_stats(pipeline)
        assert "mean_direction_persistence" in stats
        assert stats["mean_direction_persistence"] > 0.9

    def test_effective_rank(self):
        """Multi-direction subspace should yield effective rank > 1."""
        pipeline = MagicMock()
        pipeline.refusal_directions = {0: self._randn(128)}
        # 4-direction subspace with distinct directions
        sub = self._randn(4, 128)
        pipeline.refusal_subspaces = {0: sub}
        stats = _direction_stats(pipeline)
        assert "effective_ranks" in stats
        assert float(stats["effective_ranks"]["0"]) > 1.0

    def test_empty_directions(self):
        """No directions and no subspaces produce an empty stats dict."""
        pipeline = MagicMock()
        pipeline.refusal_directions = {}
        pipeline.refusal_subspaces = {}
        stats = _direction_stats(pipeline)
        assert stats == {}
|
||||
|
||||
|
||||
# ── Excise details extraction ───────────────────────────────────────────
|
||||
|
||||
|
||||
class TestExciseDetails:
    """Extraction of excise-stage details from a mocked pipeline."""

    @staticmethod
    def _pipeline(**attrs):
        """Return a MagicMock pipeline with neutral excise attributes, overridden by ``attrs``."""
        neutral = {
            "_excise_modified_count": None,
            "_refusal_heads": {},
            "_sae_directions": {},
            "_expert_safety_scores": {},
            "_layer_excise_weights": {},
            "_expert_directions": {},
            "_steering_hooks": [],
            "invert_refusal": False,
            "project_embeddings": False,
            "activation_steering": False,
            "expert_transplant": False,
        }
        neutral.update(attrs)
        mocked = MagicMock()
        for attribute, value in neutral.items():
            setattr(mocked, attribute, value)
        return mocked

    def test_basic_excise_details(self):
        """Head-surgery data: 2 layers, 3 heads in total, 64 modified weights."""
        mocked = self._pipeline(
            _excise_modified_count=64,
            _refusal_heads={10: [(0, 0.9), (3, 0.8)], 11: [(1, 0.7)]},
        )
        extracted = _extract_excise_details(mocked)
        assert extracted["modified_count"] == 64
        assert extracted["head_surgery_layers"] == 2
        assert extracted["total_heads_projected"] == 3
        assert "head_surgery" in extracted["used_techniques"]

    def test_adaptive_weights(self):
        """Per-layer weights surface as min/max and flag the layer_adaptive technique."""
        mocked = self._pipeline(_layer_excise_weights={0: 0.2, 1: 0.8, 2: 0.5})
        extracted = _extract_excise_details(mocked)
        assert extracted["adaptive_weight_min"] == 0.2
        assert extracted["adaptive_weight_max"] == 0.8
        assert "layer_adaptive" in extracted["used_techniques"]
|
||||
|
||||
|
||||
# ── Prompt counts extraction ────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestPromptCounts:
    """Extraction of prompt-set sizes from a mocked pipeline."""

    @staticmethod
    def _pipeline(jailbreak):
        """Mock pipeline with 33 harmful, 33 harmless and the given jailbreak prompts."""
        mocked = MagicMock()
        mocked.harmful_prompts = ["a"] * 33
        mocked.harmless_prompts = ["b"] * 33
        mocked.jailbreak_prompts = jailbreak
        return mocked

    def test_basic_counts(self):
        """Without jailbreak prompts only the harmful/harmless counts appear."""
        tallies = _extract_prompt_counts(self._pipeline(None))
        assert tallies["harmful"] == 33
        assert tallies["harmless"] == 33
        assert "jailbreak" not in tallies

    def test_with_jailbreak(self):
        """A jailbreak prompt list is counted under the "jailbreak" key."""
        tallies = _extract_prompt_counts(self._pipeline(["c"] * 10))
        assert tallies["jailbreak"] == 10
|
||||
|
||||
|
||||
# ── Send behavior ───────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestSendReport:
    """Dispatch behaviour of ``send_report`` with ``_send_sync`` patched out."""

    def setup_method(self):
        _reset_telemetry()

    def teardown_method(self):
        # These tests enable/disable telemetry; clear the flag so the state
        # does not leak into later tests.
        _reset_telemetry()

    @staticmethod
    def _wait_until_called(mock, timeout=2.0, interval=0.01):
        """Poll until ``mock`` has been called or ``timeout`` seconds elapse.

        Replaces a fixed sleep: it returns as soon as the call lands and
        tolerates a slow machine up to ``timeout``. It never raises; the
        caller's assertion reports a call that never arrived.
        """
        import time
        deadline = time.monotonic() + timeout
        while not mock.called and time.monotonic() < deadline:
            time.sleep(interval)

    def test_does_not_send_when_disabled(self):
        """Nothing is handed to ``_send_sync`` while telemetry is disabled."""
        disable_telemetry()
        with patch("obliteratus.telemetry._send_sync") as mock_send:
            send_report({"test": True})
            mock_send.assert_not_called()

    def test_sends_when_enabled(self):
        """The payload reaches ``_send_sync`` exactly once when telemetry is enabled."""
        enable_telemetry()
        with patch("obliteratus.telemetry._send_sync") as mock_send:
            send_report({"test": True})
            # Stay inside the patch while waiting so the patched function is
            # the one that gets invoked.
            self._wait_until_called(mock_send)
            mock_send.assert_called_once_with({"test": True})

    def test_send_failure_is_silent(self):
        """An exception raised by ``_send_sync`` must not reach the caller."""
        enable_telemetry()
        with patch("obliteratus.telemetry._send_sync", side_effect=Exception("network down")) as mock_send:
            # send_report should not propagate the exception to the caller
            send_report({"test": True})
            # Allow background thread to execute
            self._wait_until_called(mock_send)
            mock_send.assert_called_once_with({"test": True})
|
||||
|
||||
|
||||
# ── Pipeline integration ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _make_mock_pipeline():
    """Build a mock pipeline with all fields the telemetry module reads.

    The random direction tensors come from a locally seeded generator, so
    every call returns identical data and torch's global RNG is untouched.

    Returns:
        A ``MagicMock`` with the model summary, method settings, quality
        metrics, direction data, excise details and prompt lists filled in.
    """
    rng = torch.Generator().manual_seed(0)

    p = MagicMock()
    p.handle.summary.return_value = {
        "architecture": "LlamaForCausalLM",
        "num_layers": 32,
        "num_heads": 32,
        "hidden_size": 4096,
        "total_params": 8_000_000_000,
    }

    # Method settings
    p.method = "advanced"
    p.n_directions = 4
    p.norm_preserve = True
    p.regularization = 0.1
    p.refinement_passes = 2
    p.project_biases = True
    p.use_chat_template = True
    p.use_whitened_svd = True
    p.true_iterative_refinement = False
    p.use_jailbreak_contrast = False
    p.layer_adaptive_strength = False
    p.attention_head_surgery = True
    p.safety_neuron_masking = False
    p.per_expert_directions = False
    p.use_sae_features = False
    p.invert_refusal = False
    p.project_embeddings = False
    p.embed_regularization = 0.5
    p.activation_steering = False
    p.steering_strength = 0.3
    p.expert_transplant = False
    p.transplant_blend = 0.3
    p.reflection_strength = 2.0
    p.quantization = None

    # Run results
    p._quality_metrics = {"perplexity": 5.2, "coherence": 0.8, "refusal_rate": 0.05}
    p._strong_layers = [10, 11, 12, 13]
    p._stage_durations = {"summon": 3.0, "probe": 12.5, "distill": 4.1, "excise": 2.0, "verify": 8.3, "rebirth": 5.0}
    p._excise_modified_count = 128

    # Direction data: one unit vector shared by layers 10, 12 and 13, with a
    # slightly perturbed copy at layer 11.
    d = torch.randn(4096, generator=rng)
    d = d / d.norm()
    p.refusal_directions = {10: d, 11: d + 0.01 * torch.randn(4096, generator=rng), 12: d, 13: d}
    p.refusal_subspaces = {10: torch.randn(4, 4096, generator=rng)}

    # Excise details
    p._refusal_heads = {10: [(0, 0.9), (3, 0.8)]}
    p._sae_directions = {}
    p._expert_safety_scores = {}
    p._layer_excise_weights = {}
    p._expert_directions = {}
    p._steering_hooks = []

    # Prompts
    p.harmful_prompts = ["x"] * 33
    p.harmless_prompts = ["y"] * 33
    p.jailbreak_prompts = None

    return p
|
||||
|
||||
|
||||
class TestPipelineIntegration:
    """End-to-end check of ``maybe_send_pipeline_report`` with ``send_report`` patched."""

    def setup_method(self):
        _reset_telemetry()

    def test_does_nothing_when_disabled(self):
        """No report is sent while telemetry is disabled."""
        disable_telemetry()
        with patch("obliteratus.telemetry.send_report") as mock_send:
            maybe_send_pipeline_report(_make_mock_pipeline())
            mock_send.assert_not_called()

    def test_comprehensive_report(self):
        """Verify that all data points are extracted from the pipeline."""
        enable_telemetry()
        pipeline = _make_mock_pipeline()
        with patch("obliteratus.telemetry.send_report") as mock_send:
            maybe_send_pipeline_report(pipeline)
            mock_send.assert_called_once()
            # First positional argument of the single send_report call.
            sent = mock_send.call_args[0][0]

        # Core fields
        assert sent["schema_version"] == 2
        assert sent["model"]["architecture"] == "LlamaForCausalLM"
        assert sent["method"] == "advanced"

        # Method config — check all keys passed through
        method_config = sent["method_config"]
        assert method_config["n_directions"] == 4
        assert method_config["norm_preserve"] is True
        assert method_config["use_whitened_svd"] is True
        assert method_config["attention_head_surgery"] is True

        # Quality metrics
        quality = sent["quality_metrics"]
        assert quality["perplexity"] == 5.2
        assert quality["refusal_rate"] == 0.05

        # Stage durations
        assert "stage_durations" in sent
        assert sent["stage_durations"]["summon"] == 3.0
        assert sent["stage_durations"]["verify"] == 8.3

        # Strong layers
        assert sent["strong_layers"] == [10, 11, 12, 13]

        # Direction stats
        assert "direction_stats" in sent
        for key in ("direction_norms", "mean_direction_persistence"):
            assert key in sent["direction_stats"]

        # Excise details
        assert "excise_details" in sent
        assert sent["excise_details"]["modified_count"] == 128
        assert "head_surgery" in sent["excise_details"]["used_techniques"]

        # Prompt counts
        assert sent["prompt_counts"]["harmful"] == 33
        assert sent["prompt_counts"]["harmless"] == 33

        # Environment
        for key in ("os", "python_version"):
            assert key in sent["environment"]
|
||||
|
||||
|
||||
# ── Informed pipeline integration ────────────────────────────────────────
|
||||
|
||||
|
||||
@dataclass
class _MockInsights:
    """Stand-in for an analysis-insights object, pre-filled with fixed sample values.

    Mutable defaults use ``default_factory`` so each instance gets its own
    dict or list.
    """

    # Alignment detection
    detected_alignment_method: str = "DPO"
    alignment_confidence: float = 0.92
    alignment_probabilities: dict = field(
        default_factory=lambda: {"DPO": 0.92, "RLHF": 0.05}
    )

    # Cone / direction geometry
    cone_is_polyhedral: bool = True
    cone_dimensionality: float = 3.2
    mean_pairwise_cosine: float = 0.45
    direction_specificity: dict = field(
        default_factory=lambda: {"violence": 0.8}
    )
    cluster_count: int = 3
    direction_persistence: float = 0.87

    # Sparsity
    mean_refusal_sparsity_index: float = 0.15
    recommended_sparsity: float = 0.1
    use_sparse_surgery: bool = True

    # Robustness and entanglement
    estimated_robustness: str = "medium"
    self_repair_estimate: float = 0.3
    entanglement_score: float = 0.2
    entangled_layers: list = field(
        default_factory=lambda: [15, 16]
    )
    clean_layers: list = field(
        default_factory=lambda: [10, 11, 12]
    )

    # Recommendations
    recommended_n_directions: int = 6
    recommended_regularization: float = 0.05
    recommended_refinement_passes: int = 3
    recommended_layers: list = field(
        default_factory=lambda: [10, 11, 12, 13]
    )
    skip_layers: list = field(
        default_factory=lambda: [15]
    )
|
||||
|
||||
|
||||
@dataclass
class _MockInformedReport:
    """Test double for an informed-pipeline report.

    Holds a freshly built ``_MockInsights`` plus fixed pass-count,
    refusal-rate and duration values, so it can be created without arguments.
    """

    # A new _MockInsights per instance; override it to test other insight shapes.
    insights: _MockInsights = field(default_factory=_MockInsights)
    ouroboros_passes: int = 2
    final_refusal_rate: float = 0.02
    analysis_duration: float = 15.3
    total_duration: float = 85.7
|
||||
|
||||
|
||||
class TestInformedPipelineIntegration:
    """Checks for ``maybe_send_informed_report`` and ``_extract_analysis_insights``."""

    def setup_method(self):
        # Reset telemetry before every test in this class.
        _reset_telemetry()

    def test_does_nothing_when_disabled(self):
        """With telemetry disabled, ``send_report`` must never be called."""
        disable_telemetry()
        with patch("obliteratus.telemetry.send_report") as send_spy:
            pipeline = _make_mock_pipeline()
            informed = _MockInformedReport()
            maybe_send_informed_report(pipeline, informed)
            send_spy.assert_not_called()

    def test_comprehensive_informed_report(self):
        """The sent report carries base fields, analysis insights and informed extras."""
        enable_telemetry()
        pipeline = _make_mock_pipeline()
        informed = _MockInformedReport()

        with patch("obliteratus.telemetry.send_report") as send_spy:
            maybe_send_informed_report(pipeline, informed)
            send_spy.assert_called_once()
            # The report is the first positional argument of the single call.
            positional = send_spy.call_args[0]
            payload = positional[0]

        # All base fields present
        assert payload["schema_version"] == 2
        assert payload["model"]["architecture"] == "LlamaForCausalLM"
        for section in ("direction_stats", "excise_details"):
            assert section in payload

        # Analysis insights
        insights = payload["analysis_insights"]
        expected_insights = {
            "detected_alignment_method": "DPO",
            "alignment_confidence": 0.92,
            "cone_dimensionality": 3.2,
            "cluster_count": 3,
            "self_repair_estimate": 0.3,
            "entanglement_score": 0.2,
            "recommended_n_directions": 6,
        }
        for key, wanted in expected_insights.items():
            assert insights[key] == wanted
        # Identity check: the value must be the bool True, not merely truthy.
        assert insights["cone_is_polyhedral"] is True

        # Informed extras
        extras = payload["informed"]
        expected_extras = {
            "ouroboros_passes": 2,
            "final_refusal_rate": 0.02,
            "analysis_duration": 15.3,
            "total_duration": 85.7,
        }
        for key, wanted in expected_extras.items():
            assert extras[key] == wanted

    def test_analysis_insights_filter_unknown_keys(self):
        """An extra field on the insights object must not show up in the extracted insights."""
        enable_telemetry()
        # NOTE(review): the returned pipeline is never used in this test —
        # confirm whether this call is still needed.
        _make_mock_pipeline()

        @dataclass
        class _BadInsights(_MockInsights):
            secret_sauce: str = "should not appear"

        informed = _MockInformedReport(insights=_BadInsights())
        extracted = _extract_analysis_insights(informed)

        assert "detected_alignment_method" in extracted
        assert "secret_sauce" not in extracted
|
||||
|
||||
|
||||
# ── Stage duration tracking on pipeline ──────────────────────────────────
|
||||
|
||||
|
||||
class TestStageDurationTracking:
    """Checks what ``AbliterationPipeline._emit`` records on the pipeline."""

    @staticmethod
    def _bare_pipeline():
        """Build a pipeline without running ``__init__`` and set the three attributes these tests rely on."""
        from obliteratus.abliterate import AbliterationPipeline

        pipeline = AbliterationPipeline.__new__(AbliterationPipeline)
        pipeline._stage_durations = {}
        pipeline._excise_modified_count = None
        pipeline._on_stage = lambda r: None  # stage callback that ignores its argument
        return pipeline

    def test_emit_records_durations(self):
        """Verify _emit stores durations in _stage_durations dict."""
        pipeline = self._bare_pipeline()

        pipeline._emit("summon", "done", "loaded", duration=3.5)
        pipeline._emit("probe", "done", "probed", duration=10.2)
        pipeline._emit("excise", "done", "excised", duration=2.1, modified_count=64)

        expected_durations = {"summon": 3.5, "probe": 10.2, "excise": 2.1}
        assert pipeline._stage_durations == expected_durations
        assert pipeline._excise_modified_count == 64

    def test_running_status_does_not_record(self):
        """Only 'done' status should record durations."""
        pipeline = self._bare_pipeline()

        pipeline._emit("summon", "running", "loading...", duration=0)

        assert pipeline._stage_durations == {}
|
||||
|
||||
|
||||
# ── Storage helpers ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestStorageHelpers:
    """Test persistent storage helper functions."""

    def test_test_writable_valid_dir(self):
        """A not-yet-existing subdirectory of a temp dir is reported writable."""
        with tempfile.TemporaryDirectory() as scratch:
            candidate = Path(scratch) / "subdir"
            assert _test_writable(candidate)

    def test_test_writable_unwritable(self):
        """A path under /proc is reported as not writable."""
        # /proc is never writable for arbitrary files
        forbidden = Path("/proc/obliteratus_test")
        assert not _test_writable(forbidden)

    def test_is_mount_point_existing_path(self):
        """The filesystem root yields a plain bool without raising."""
        # Should return a bool without raising for any existing path
        answer = _is_mount_point(Path("/"))
        assert isinstance(answer, bool)

    def test_is_mount_point_nonexistent(self):
        """A missing path is never a mount point."""
        missing = Path("/nonexistent_dir_12345")
        assert not _is_mount_point(missing)

    def test_storage_diagnostic_returns_dict(self):
        """The diagnostic is a dict containing each expected key."""
        diag = storage_diagnostic()
        assert isinstance(diag, dict)

        required_keys = (
            "telemetry_dir",
            "is_persistent",
            "on_hf_spaces",
            "telemetry_enabled",
            "data_dir_exists",
        )
        for key in required_keys:
            assert key in diag
|
||||
|
||||
|
||||
# ── Hub restore ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestHubRestore:
    """Test Hub-to-local restore functionality."""

    def setup_method(self):
        _reset_telemetry()
        # Reset restore state so each test can trigger it
        import obliteratus.telemetry as t

        t._restore_done = False

    def test_restore_skips_when_no_repo(self):
        """With an empty repo name, restore reports zero records."""
        with patch("obliteratus.telemetry._TELEMETRY_REPO", ""):
            restored = restore_from_hub()
            assert restored == 0

    def test_restore_deduplicates(self):
        """Records already in local JSONL should not be re-added."""
        import obliteratus.telemetry as t

        already_local = {"session_id": "abc", "timestamp": "2025-01-01T00:00:00"}
        only_on_hub = {"session_id": "def", "timestamp": "2025-01-02T00:00:00"}

        with tempfile.TemporaryDirectory() as scratch:
            local_file = Path(scratch) / "telemetry.jsonl"
            local_file.write_text(json.dumps(already_local) + "\n")

            # The Hub returns one duplicate of the local record and one new record.
            hub_records = [dict(already_local), only_on_hub]

            # patch.object puts the original module attributes back on exit,
            # even when an assertion below fails.
            with patch.object(t, "TELEMETRY_FILE", local_file):
                with patch.object(t, "_TELEMETRY_REPO", "test/repo"):
                    t._restore_done = False
                    with patch(
                        "obliteratus.telemetry.fetch_hub_records",
                        return_value=hub_records,
                    ):
                        count = restore_from_hub()
                    assert count == 1  # Only the new record

                    # Verify file contents
                    lines = local_file.read_text().strip().split("\n")
                    assert len(lines) == 2  # original + 1 new

    def test_restore_only_runs_once(self):
        """Calling restore_from_hub() twice should be a no-op the second time."""
        import obliteratus.telemetry as t

        t._restore_done = False

        repo_patch = patch("obliteratus.telemetry._TELEMETRY_REPO", "test/repo")
        fetch_patch = patch("obliteratus.telemetry.fetch_hub_records", return_value=[])
        with repo_patch, fetch_patch:
            restore_from_hub()
            # Second call should return 0 immediately
            assert restore_from_hub() == 0
|
||||
@@ -0,0 +1,167 @@
|
||||
"""Tests for visualization module (non-interactive, save-to-file)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer
|
||||
from obliteratus.analysis.activation_probing import ActivationProbe
|
||||
from obliteratus.analysis.visualization import (
|
||||
_sanitize_label,
|
||||
plot_refusal_topology,
|
||||
plot_cross_layer_heatmap,
|
||||
plot_angular_drift,
|
||||
plot_probe_dashboard,
|
||||
plot_defense_radar,
|
||||
)
|
||||
from obliteratus.analysis.defense_robustness import DefenseProfile
|
||||
|
||||
|
||||
@pytest.fixture
def tmp_dir():
    """Yield a fresh temporary directory as a ``Path``.

    The directory comes from ``tempfile.TemporaryDirectory`` and is cleaned up
    once the requesting test finishes.
    """
    with tempfile.TemporaryDirectory() as raw_dir:
        scratch = Path(raw_dir)
        yield scratch
|
||||
|
||||
|
||||
def _make_refusal_data(n_layers=6, hidden_dim=16):
|
||||
"""Create test refusal directions and means."""
|
||||
torch.manual_seed(42)
|
||||
directions = {}
|
||||
harmful_means = {}
|
||||
harmless_means = {}
|
||||
|
||||
for i in range(n_layers):
|
||||
d = torch.randn(hidden_dim)
|
||||
directions[i] = d / d.norm()
|
||||
base = torch.randn(hidden_dim)
|
||||
harmless_means[i] = base.unsqueeze(0)
|
||||
harmful_means[i] = (base + (2.0 if i in [2, 3, 4] else 0.3) * directions[i]).unsqueeze(0)
|
||||
|
||||
strong_layers = [2, 3, 4]
|
||||
return directions, harmful_means, harmless_means, strong_layers
|
||||
|
||||
|
||||
class TestRefusalTopology:
    """Checks for ``plot_refusal_topology``."""

    def test_plot_saves_file(self, tmp_dir):
        """The plot is written to ``output_path`` as a non-empty file."""
        refusal_data = _make_refusal_data()
        target = tmp_dir / "topology.png"

        # refusal_data unpacks to (directions, harmful means, harmless means, strong layers).
        plot_refusal_topology(*refusal_data, output_path=target)

        assert target.exists()
        assert target.stat().st_size > 0

    def test_plot_returns_figure(self, tmp_dir):
        """The call returns something other than ``None``."""
        refusal_data = _make_refusal_data()
        target = tmp_dir / "test.png"

        figure = plot_refusal_topology(*refusal_data, output_path=target)

        assert figure is not None
|
||||
|
||||
|
||||
class TestCrossLayerHeatmap:
    """Checks for ``plot_cross_layer_heatmap``."""

    def test_plot_saves_file(self, tmp_dir):
        """Plotting a cross-layer analysis result creates the output file."""
        torch.manual_seed(42)
        # Six layers, one random 16-element vector each.
        layer_directions = {layer: torch.randn(16) for layer in range(6)}
        analysis = CrossLayerAlignmentAnalyzer().analyze(layer_directions)

        target = tmp_dir / "heatmap.png"
        plot_cross_layer_heatmap(analysis, output_path=target)

        assert target.exists()
|
||||
|
||||
|
||||
class TestAngularDrift:
    """Checks for ``plot_angular_drift``."""

    def test_plot_saves_file(self, tmp_dir):
        """Plotting a cross-layer analysis result creates the output file."""
        torch.manual_seed(42)
        # Eight layers, one random 16-element vector each.
        layer_directions = {layer: torch.randn(16) for layer in range(8)}
        analysis = CrossLayerAlignmentAnalyzer().analyze(layer_directions)

        target = tmp_dir / "drift.png"
        plot_angular_drift(analysis, output_path=target)

        assert target.exists()
|
||||
|
||||
|
||||
class TestProbeDashboard:
    """Checks for ``plot_probe_dashboard``."""

    def test_plot_saves_file(self, tmp_dir):
        """Plotting a probe result over four layers creates the output file."""
        torch.manual_seed(42)

        def _three_samples_per_layer():
            # Four layers, three random 8-element vectors per layer.
            return {layer: [torch.randn(8) for _ in range(3)] for layer in range(4)}

        # Draw order (harmful, harmless, directions) is kept so the seeded
        # values stay the same.
        harmful_acts = _three_samples_per_layer()
        harmless_acts = _three_samples_per_layer()
        layer_directions = {layer: torch.randn(8) for layer in range(4)}

        probe_result = ActivationProbe().probe_all_layers(
            harmful_acts, harmless_acts, layer_directions
        )

        target = tmp_dir / "probe.png"
        plot_probe_dashboard(probe_result, output_path=target)

        assert target.exists()
|
||||
|
||||
|
||||
class TestDefenseRadar:
    """Checks for ``plot_defense_radar``."""

    @staticmethod
    def _profile(model_name):
        """Build a ``DefenseProfile`` with fixed metrics and the given ``model_name``."""
        return DefenseProfile(
            model_name=model_name,
            alignment_type_estimate="RLHF-like",
            refusal_concentration=0.4,
            refusal_layer_spread=5,
            mean_refusal_strength=2.0,
            max_refusal_strength=4.0,
            self_repair_estimate=0.6,
            entanglement_score=0.3,
            estimated_robustness="medium",
        )

    def test_plot_saves_file(self, tmp_dir):
        """Plotting a profile creates the output file."""
        target = tmp_dir / "radar.png"

        plot_defense_radar(self._profile("test-model"), output_path=target)

        assert target.exists()

    def test_model_name_sanitized_in_title(self, tmp_dir):
        """Ensure sensitive paths in model_name don't leak into saved charts."""
        leaky_name = "/home/user/.cache/huggingface/hub/models--secret-org/private-model"
        target = tmp_dir / "radar_sanitized.png"

        figure = plot_defense_radar(self._profile(leaky_name), output_path=target)

        # Title should not contain the full filesystem path
        title_text = figure.axes[0].get_title()
        for fragment in ("/home/user", ".cache"):
            assert fragment not in title_text
|
||||
|
||||
|
||||
class TestSanitizeLabel:
    """Checks for ``_sanitize_label``."""

    def test_strips_absolute_paths(self):
        """The directory prefix is dropped while the model name stays visible."""
        cleaned = _sanitize_label("/home/user/.cache/huggingface/models--org/model")
        assert "/home/user" not in cleaned
        assert "model" in cleaned

    def test_redacts_hf_tokens(self):
        """An ``hf_``-prefixed token is replaced by the ``<TOKEN>`` marker."""
        cleaned = _sanitize_label("model with hf_abcdefghij token")
        assert "hf_abcdefghij" not in cleaned
        assert "<TOKEN>" in cleaned

    def test_redacts_long_hex_strings(self):
        """A 40-character hex run is replaced by the ``<REDACTED>`` marker."""
        hex_run = "a" * 40
        cleaned = _sanitize_label(f"commit {hex_run}")
        assert hex_run not in cleaned
        assert "<REDACTED>" in cleaned

    def test_truncates_long_strings(self):
        """A 200-character label is cut to at most 80 characters ending in ``...``."""
        oversized = "x" * 200
        cleaned = _sanitize_label(oversized)
        assert len(cleaned) <= 80
        assert cleaned.endswith("...")

    def test_passes_normal_strings_through(self):
        """An ordinary title comes back unchanged."""
        plain = "Refusal Topology Map"
        assert _sanitize_label(plain) == plain
|
||||
Reference in New Issue
Block a user