Add files via upload

This commit is contained in:
pliny
2026-03-05 10:03:46 -08:00
committed by GitHub
parent 6120061553
commit 984ce14059
11 changed files with 247 additions and 60 deletions
+1 -1
View File
@@ -526,7 +526,7 @@ Works with any HuggingFace transformer, including: GPT-2, LLaMA, Mistral, Falcon
## References
- Arditi et al. (2024). *Refusal in Language Models Is Mediated by a Single Direction.* [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
- Gulmez, G. (2025). *Gabliteration: SVD-Based Multi-Direction Refusal Removal.* [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
- Gülmez, G. (2026). *Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models.* [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
- grimjim (2025). *Norm-Preserving Biprojected Abliteration.* [HuggingFace](https://huggingface.co/grimjim)
- Turner et al. (2023). *Activation Addition: Steering Language Models Without Optimization.* [arXiv:2308.10248](https://arxiv.org/abs/2308.10248)
- Rimsky et al. (2024). *Steering Llama 2 via Contrastive Activation Addition.* [arXiv:2312.06681](https://arxiv.org/abs/2312.06681)
+221 -44
View File
@@ -18,6 +18,7 @@ ZeroGPU Support:
from __future__ import annotations
import gc
import json as _json
import os
import re
import time
@@ -117,7 +118,84 @@ _obliterate_counter: int = 0
# Counter used to suppress the session_model_dd / ab_session_model_dd .change
# handlers when obliterate programmatically sets the dropdown value
# (prevents wasteful GPU re-allocation on ZeroGPU)
_skip_session_load: bool = False
_skip_session_load: int = 0 # counter (not bool) — obliterate sets to 2 for both dropdowns
# ---------------------------------------------------------------------------
# ZeroGPU session persistence — survive process restarts
# ---------------------------------------------------------------------------
# On ZeroGPU Spaces, the container may restart between requests (idle timeout,
# scaling, etc.). The browser retains the old dropdown values but the Python
# process loses all in-memory state (_state, _session_models). To recover,
# we persist a small JSON sidecar next to each checkpoint.
_SESSION_META_FILE = "obliteratus_session.json"
def _persist_session_meta(output_dir: str, label: str, meta: dict) -> None:
    """Best-effort dump of session metadata as a JSON sidecar in *output_dir*.

    The payload is ``meta`` plus a ``"label"`` entry (a ``"label"`` key inside
    ``meta`` wins) and is written to ``_SESSION_META_FILE`` within
    ``output_dir``. Any failure is swallowed; nothing is returned.
    """
    try:
        payload = _json.dumps({"label": label, **meta}, indent=2)
        (Path(output_dir) / _SESSION_META_FILE).write_text(payload)
    except Exception:
        # Deliberately silent: persistence is optional and must never break
        # the caller.
        return
def _recover_sessions_from_disk() -> None:
    """Scan /tmp for obliterated checkpoints and repopulate _session_models.

    Called on startup and when a stale dropdown value is detected. Only
    directories matching ``obliterated_*``, ``obliterated`` or ``bench_*``
    that contain a readable ``_SESSION_META_FILE`` holding a JSON object are
    considered. Labels already present in ``_session_models`` are skipped.

    Candidates are processed oldest-first (by sidecar modification time) so
    that ``_last_obliterated_label`` ends up naming the most recently written
    checkpoint among those newly registered, instead of whichever entry the
    filesystem happened to list last.

    Side effects:
        * adds entries to ``_session_models``;
        * may update ``_last_obliterated_label`` and ``_obliterate_counter``;
        * when something was recovered and ``_state`` has no ``output_dir``,
          fills ``output_dir``, ``model_name`` and ``method`` in ``_state``
          from the latest label (under ``_lock``).
    """
    global _last_obliterated_label, _obliterate_counter

    # NOTE(review): /tmp can be writable by other local processes. The
    # recovered "model_choice" ends up in _state["model_name"], which other
    # code in this file uses to decide trust_remote_code — confirm that this
    # is acceptable for the deployment environment.
    candidates = []
    for pattern in ("obliterated_*", "obliterated", "bench_*"):
        for p in Path("/tmp").glob(pattern):
            meta_file = p / _SESSION_META_FILE
            try:
                if not p.is_dir() or not meta_file.exists():
                    continue
                mtime = meta_file.stat().st_mtime
                data = _json.loads(meta_file.read_text())
            except Exception:
                # Best-effort scan: unreadable or malformed sidecars are
                # ignored rather than aborting recovery (this runs at import).
                continue
            if not isinstance(data, dict):
                # Valid JSON that is not an object (list, string, number...)
                # cannot describe a session; skipping avoids AttributeError.
                continue
            candidates.append((mtime, p.name, p, data))

    # glob() yields entries in arbitrary order; sort so the newest sidecar is
    # handled last and therefore wins the "latest" tracking below.
    candidates.sort(key=lambda item: (item[0], item[1]))

    found_any = False
    for _mtime, _name, p, data in candidates:
        label = data.get("label", p.name)
        if not isinstance(label, str) or not label:
            # Labels are used as dict keys / dropdown values; fall back to
            # the directory name when the stored one is unusable.
            label = p.name
        if label in _session_models:
            continue  # already registered

        model_id = data.get("model_id", "")
        _session_models[label] = {
            "model_id": model_id,
            "model_choice": data.get("model_choice", model_id),
            "method": data.get("method", "unknown"),
            "dataset_key": data.get("dataset_key", ""),
            "prompt_volume": data.get("prompt_volume", 0),
            "output_dir": str(p),
            "source": data.get("source", "recovered"),
        }
        found_any = True
        # Track the latest for auto-select
        _last_obliterated_label = label

        # Keep counter above any existing numbered dirs
        if p.name.startswith("obliterated_"):
            try:
                idx = int(p.name.split("_", 1)[1])
            except ValueError:
                idx = None  # suffix is not a number (e.g. "obliterated_tmp")
            if idx is not None and idx >= _obliterate_counter:
                _obliterate_counter = idx + 1

    # If we recovered sessions but _state has no output_dir, set it to the
    # most recent checkpoint so chat_respond can reload from disk.
    if found_any and not _state.get("output_dir"):
        with _lock:
            latest = _last_obliterated_label
            if latest and latest in _session_models:
                entry = _session_models[latest]
                _state["output_dir"] = entry["output_dir"]
                _state["model_name"] = entry.get("model_choice")
                _state["method"] = entry.get("method")
# Run recovery on import (app startup). This is a module-level side effect:
# importing this module scans /tmp for checkpoint metadata sidecars.
_recover_sessions_from_disk()
# ---------------------------------------------------------------------------
# Model presets — 100+ models organized by provider
@@ -961,6 +1039,14 @@ def benchmark(
"prompt_volume": prompt_volume,
"output_dir": bench_save_path,
}
_persist_session_meta(bench_save_path, label, {
"model_id": model_id,
"model_choice": model_choice,
"method": method_key,
"dataset_key": dataset_key,
"prompt_volume": prompt_volume,
"source": "benchmark",
})
# Explicitly free the pipeline and its model to reclaim GPU memory
# before the next benchmark iteration. _clear_gpu() only clears
@@ -1306,6 +1392,14 @@ def benchmark_multi_model(
"prompt_volume": prompt_volume,
"output_dir": mm_save_path,
}
_persist_session_meta(mm_save_path, label, {
"model_id": model_id,
"model_choice": model_display,
"method": method_key,
"dataset_key": dataset_key,
"prompt_volume": prompt_volume,
"source": "benchmark_mm",
})
# Explicitly free pipeline and model before next iteration
if pipeline_ref[0] is not None:
@@ -1734,6 +1828,16 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
_state["steering"] = steering_meta
_state["output_dir"] = save_dir # for ZeroGPU checkpoint reload
# Persist session metadata to disk so we survive ZeroGPU process restarts
_persist_session_meta(save_dir, _cache_label, {
"model_id": model_id,
"model_choice": model_choice,
"method": method,
"dataset_key": dataset_key if not use_custom else "custom",
"prompt_volume": prompt_volume,
"source": "obliterate",
})
if can_generate:
# Model fits — use it directly (steering hooks already installed)
with _lock:
@@ -1870,7 +1974,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
# Set skip flag so the .change handler doesn't trigger a wasteful
# GPU re-allocation — the model is already loaded.
global _skip_session_load
_skip_session_load = True
_skip_session_load = 2 # both session_model_dd and ab_session_model_dd fire .change
_dd_update = gr.update(
choices=_get_session_model_choices(),
value=_last_obliterated_label or None,
@@ -1947,25 +2051,30 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
model = _state["model"]
tokenizer = _state["tokenizer"]
if model is None or tokenizer is None:
yield "No model loaded yet. Go to the **Obliterate** tab first and liberate a model."
return
# ZeroGPU safety: detect whether we need to reload from checkpoint.
# Between GPU allocations, ZeroGPU may deallocate GPU memory, leaving
# model as None (garbage-collected) or with stale/meta tensors.
# Meta tensors raise NotImplementedError on .to(), not RuntimeError,
# so we catch Exception broadly here.
_needs_reload = model is None or tokenizer is None
if not _needs_reload:
try:
dev = next(model.parameters()).device
if dev.type == "meta":
_needs_reload = True
elif torch.cuda.is_available() and dev.type != "cuda":
model.to("cuda")
except Exception:
_needs_reload = True
# ZeroGPU safety: ensure model is on GPU if available.
# Between GPU allocations, ZeroGPU may have moved the model to CPU/meta,
# or tensors may be stale from a previous GPU context.
# The @spaces.GPU decorator guarantees a GPU is available here.
_needs_reload = False
try:
dev = next(model.parameters()).device
if torch.cuda.is_available() and dev.type != "cuda":
model.to("cuda")
except (StopIteration, RuntimeError):
_needs_reload = True
# If model tensors are stale/meta, reload from the saved checkpoint
if _needs_reload and _ZEROGPU_AVAILABLE:
# Reload from saved checkpoint if model is missing or stale
if _needs_reload:
checkpoint = _state.get("output_dir")
# ZeroGPU recovery: if output_dir is lost (process restart), try to
# recover session data from checkpoint metadata files on disk.
if not checkpoint or not Path(checkpoint).exists():
_recover_sessions_from_disk()
checkpoint = _state.get("output_dir")
if checkpoint and Path(checkpoint).exists():
try:
is_preset = (_state.get("model_name") or "") in MODELS
@@ -1985,11 +2094,12 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
with _lock:
_state["model"] = model
_state["tokenizer"] = tokenizer
_state["status"] = "ready"
except Exception:
yield "Model failed to reload from checkpoint. Try re-obliterating."
return
else:
yield "Model tensors are stale (ZeroGPU). Re-obliterate to create a fresh checkpoint."
yield "No model loaded yet. Go to the **Obliterate** tab first and liberate a model."
return
# Sanitize inputs to prevent resource exhaustion
@@ -2117,8 +2227,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
# Skip if the obliterate function just set the dropdown value — the model
# is already loaded and we'd just waste GPU quota re-allocating.
global _skip_session_load
if _skip_session_load:
_skip_session_load = False
if _skip_session_load > 0:
_skip_session_load -= 1
if choice and _state.get("status") == "ready":
yield (
f"**Ready!** `{choice}` is loaded — just type in the chat below.",
@@ -2127,8 +2237,65 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
return
if not choice or choice not in _bench_configs:
yield "**Error:** No benchmark result selected. Pick a model from the dropdown first.", ""
return
# On ZeroGPU, global state may be lost between process restarts.
# Try to recover session data from checkpoint metadata files on disk.
if choice and choice not in _bench_configs:
_recover_sessions_from_disk()
# After recovery, the choice might now be in _bench_configs
if choice in _bench_configs:
pass # fall through to the normal loading path below
else:
# choice still not found — but we may have recovered output_dir
pass
# If recovery didn't find the exact choice, check if model is loaded
if choice not in _bench_configs:
with _lock:
if _state["status"] == "ready" and _state["model"] is not None:
yield (
f"**Ready!** Model already loaded — just type in the chat below.",
get_chat_header(),
)
return
# Check if we can reload from a checkpoint on disk
checkpoint = _state.get("output_dir")
if checkpoint and Path(checkpoint).exists():
yield (
f"**Loading model** from saved checkpoint...",
"",
)
# If we have a checkpoint, attempt reload outside the lock
checkpoint = _state.get("output_dir")
if checkpoint and Path(checkpoint).exists():
is_preset = (_state.get("model_name") or "") in MODELS
try:
model_loaded = AutoModelForCausalLM.from_pretrained(
checkpoint, device_map="auto", torch_dtype=torch.float16,
trust_remote_code=is_preset,
)
tokenizer_loaded = AutoTokenizer.from_pretrained(
checkpoint, trust_remote_code=is_preset,
)
if tokenizer_loaded.pad_token is None:
tokenizer_loaded.pad_token = tokenizer_loaded.eos_token
with _lock:
_state["model"] = model_loaded
_state["tokenizer"] = tokenizer_loaded
_state["status"] = "ready"
yield (
f"**Loaded!** Model reloaded from checkpoint — ready to chat.",
get_chat_header(),
)
return
except Exception as e:
yield f"**Error:** Could not reload model: {e}", get_chat_header()
return
yield (
"**Error:** Model checkpoint not found. The Space may have restarted — "
"please re-obliterate the model on the **Obliterate** tab.",
"",
)
return
cfg = _bench_configs[choice]
model_id = cfg["model_id"]
@@ -2320,28 +2487,27 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
tokenizer = _state["tokenizer"]
model_name = _state["model_name"]
if abliterated_model is None or tokenizer is None:
yield (history_left + [{"role": "user", "content": message},
{"role": "assistant", "content": "No abliterated model loaded. Obliterate a model first."}],
history_right + [{"role": "user", "content": message},
{"role": "assistant", "content": "No abliterated model loaded. Obliterate a model first."}],
"Load a model first.",
"#### Original (Pre-Abliteration)",
"#### Abliterated")
return
# ZeroGPU safety: detect whether we need to reload from checkpoint.
# Model may be None (garbage-collected after GPU deallocation) or stale.
# Meta tensors raise NotImplementedError on .to(), so catch broadly.
_needs_reload = abliterated_model is None or tokenizer is None
if not _needs_reload:
try:
dev = next(abliterated_model.parameters()).device
if dev.type == "meta":
_needs_reload = True
elif torch.cuda.is_available() and dev.type != "cuda":
abliterated_model.to("cuda")
except Exception:
_needs_reload = True
# ZeroGPU safety: ensure model is on GPU if available.
# If tensors are stale from a prior GPU context, reload from checkpoint.
_needs_reload = False
try:
dev = next(abliterated_model.parameters()).device
if torch.cuda.is_available() and dev.type != "cuda":
abliterated_model.to("cuda")
except (StopIteration, RuntimeError):
_needs_reload = True
if _needs_reload and _ZEROGPU_AVAILABLE:
if _needs_reload:
checkpoint = _state.get("output_dir")
# ZeroGPU recovery: try disk scan if output_dir is lost
if not checkpoint or not Path(checkpoint).exists():
_recover_sessions_from_disk()
checkpoint = _state.get("output_dir")
model_name = _state.get("model_name") or model_name
if checkpoint and Path(checkpoint).exists():
try:
is_preset = (model_name or "") in MODELS
@@ -2361,8 +2527,19 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
with _lock:
_state["model"] = abliterated_model
_state["tokenizer"] = tokenizer
_state["status"] = "ready"
except Exception:
pass # Fall through — will fail at generation with a clear error
else:
_no_model_msg = "No abliterated model loaded. Obliterate a model first."
yield (history_left + [{"role": "user", "content": message},
{"role": "assistant", "content": _no_model_msg}],
history_right + [{"role": "user", "content": message},
{"role": "assistant", "content": _no_model_msg}],
"Load a model first.",
"#### Original (Pre-Abliteration)",
"#### Abliterated")
return
# Build header strings showing model name on each side
header_left = f"#### Original (Pre-Abliteration)\n`{model_name}`"
+2 -2
View File
@@ -116,7 +116,7 @@ The paper also analyzes how adversarial suffixes (e.g., GCG-generated) suppress
## 2. Gabliteration (arXiv:2512.18901) — Multi-Direction Subspace Approach {#2-gabliteration}
**Author:** Gokdeniz Gulmez (independent research)
**Author:** Gökdeniz Gülmez (independent research)
**arXiv:** [2512.18901](https://arxiv.org/abs/2512.18901)
**Version:** v3, revised January 28, 2026
**Models:** [Hugging Face collection](https://huggingface.co/collections/Goekdeniz-Guelmez/gabliteration)
@@ -733,7 +733,7 @@ MI research helps make AI safe but could be used adversarially. The same techniq
1. Arditi, A., Obeso, O., Syed, A., Paleka, D., Panickssery, N., Gurnee, W., & Nanda, N. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024. [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
2. Gulmez, G. (2025). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
2. Gülmez, G. (2026). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
3. grimjim. (2025). Norm-Preserving Biprojected Abliteration / MPOA. [HuggingFace Blog](https://huggingface.co/blog/grimjim/norm-preserving-biprojected-abliteration) | [Projected Abliteration](https://huggingface.co/blog/grimjim/projected-abliteration) | [Code](https://github.com/jim-plus/llm-abliteration)
+1 -1
View File
@@ -1802,7 +1802,7 @@ implementations diverge from the closed-form GRRO solution.
## References
1. Arditi, A. et al. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024.
2. Gulmez, G. (2025). Gabliteration. arXiv:2512.18901.
2. Gülmez, G. (2026). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. arXiv:2512.18901.
3. grimjim (2025). Norm-Preserving Biprojected Abliteration (MPOA). HuggingFace.
4. Wollschlager, T. et al. (2025). The Geometry of Refusal. ICML 2025.
5. Joad et al. (2026). There Is More to Refusal than a Single Direction. arXiv:2602.02132.
+1 -1
View File
@@ -53,7 +53,7 @@
"id": "install"
},
"outputs": [],
"source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1024**3:.1f} GB\")"
"source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\")"
},
{
"cell_type": "markdown",
+2 -2
View File
@@ -334,7 +334,7 @@ METHODS = {
"layer_selection": "middle60",
},
"gabliteration": {
"label": "Gabliteration (Gulmez 2025 Baseline)",
"label": "Gabliteration (Gülmez 2026 Baseline)",
"description": (
"Faithful reproduction of Gabliteration (arXiv:2512.18901). "
"SVD-based multi-direction extraction (top-4), ridge-regularized "
@@ -2494,7 +2494,7 @@ class AbliterationPipeline:
References:
- SteerMoE (Fayyaz et al., 2025): expert activation frequency analysis
- Gabliteration (Gulmez, 2025): multi-direction SVD abliteration
- Gabliteration (Gülmez, 2026): multi-direction SVD abliteration
- SAFEx (Lai et al., NeurIPS 2025): safety expert identification
"""
if not self._routing_harmful or not self._routing_harmless:
+1 -1
View File
@@ -919,7 +919,7 @@ class InformedAbliterationPipeline(AbliterationPipeline):
"Arditi et al., Refusal in Language Models Is Mediated by a Single Direction (2024)",
"Gabliteration: SVD-based multi-direction extraction (arXiv:2512.18901)",
"grimjim, Norm-Preserving Biprojected Abliteration (2025)",
"Gurnee & Nanda, The Geometry of Refusal in LLMs — concept cones (ICML 2025)",
"Wollschlager et al., The Geometry of Refusal in LLMs — concept cones (ICML 2025, arXiv:2502.17420)",
"Joad et al., The Ouroboros Effect: Self-Repair in Abliterated LLMs (2026)",
"OBLITERATUS: Analysis-informed abliteration pipeline (novel)",
],
+1 -1
View File
@@ -24,7 +24,7 @@ def _detect_compute_tier() -> str:
import torch
if torch.cuda.is_available():
vram_gb = torch.cuda.get_device_properties(0).total_mem / (1024**3)
vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
if vram_gb >= 20:
return "large"
elif vram_gb >= 8:
+8 -1
View File
@@ -10,6 +10,7 @@ Usage:
from __future__ import annotations
import os
import pathlib
import platform
import shutil
import sys
@@ -48,7 +49,7 @@ def _detect_gpu() -> list[dict]:
{
"index": i,
"name": props.name,
"vram_gb": round(props.total_mem / 1024**3, 1),
"vram_gb": round(props.total_memory / 1024**3, 1),
"compute": f"{props.major}.{props.minor}",
}
)
@@ -292,6 +293,12 @@ def launch_local_ui(
console.print("[dim]Loading OBLITERATUS UI (this may take a moment on first run)...[/dim]")
start = time.time()
# app.py lives at the project root, one level above this package.
# When installed via pip the root isn't on sys.path, so add it.
_project_root = str(pathlib.Path(__file__).resolve().parent.parent)
if _project_root not in sys.path:
sys.path.insert(0, _project_root)
from app import launch as app_launch
elapsed = time.time() - start
+8 -5
View File
@@ -7,11 +7,14 @@
year={2024}
}
@article{gabliteration2024,
title={{Gabliteration}: {SVD}-Based Multi-Direction Refusal Removal},
author={Gabriel, Saul and {contributors}},
journal={arXiv preprint arXiv:2512.18901},
year={2024}
@misc{gabliteration2024,
title={{Gabliteration}: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models},
author={G\"{o}kdeniz G\"{u}lmez},
year={2026},
eprint={2512.18901},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2512.18901}
}
@misc{grimjim2025,
+1 -1
View File
@@ -7,7 +7,7 @@ comparison tables with standardized community metrics.
Baselines included:
1. FailSpy/abliterator (2024) — Community workhorse baseline
2. Gabliteration (Gulmez 2025) — SVD multi-direction + ridge regularization
2. Gabliteration (Gülmez 2026) — SVD multi-direction + ridge regularization
3. Heretic / p-e-w (2025) — Bayesian TPE auto-tuning (current SOTA for quality)
4. Wollschlager RDO (ICML 2025) — Gradient-based direction optimization