From 984ce140592a9385347934f9ca647413ba9fac76 Mon Sep 17 00:00:00 2001 From: pliny <133052465+elder-plinius@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:03:46 -0800 Subject: [PATCH] Add files via upload --- README.md | 2 +- app.py | 265 ++++++++++++++++++++++----- docs/RESEARCH_SURVEY.md | 4 +- docs/theory_journal.md | 2 +- notebooks/abliterate.ipynb | 2 +- obliteratus/abliterate.py | 4 +- obliteratus/informed_pipeline.py | 2 +- obliteratus/interactive.py | 2 +- obliteratus/local_ui.py | 9 +- paper/references.bib | 13 +- scripts/benchmark_sota_comparison.py | 2 +- 11 files changed, 247 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index c593e8d..045ae5f 100644 --- a/README.md +++ b/README.md @@ -526,7 +526,7 @@ Works with any HuggingFace transformer, including: GPT-2, LLaMA, Mistral, Falcon ## References - Arditi et al. (2024). *Refusal in Language Models Is Mediated by a Single Direction.* [arXiv:2406.11717](https://arxiv.org/abs/2406.11717) -- Gulmez, G. (2025). *Gabliteration: SVD-Based Multi-Direction Refusal Removal.* [arXiv:2512.18901](https://arxiv.org/abs/2512.18901) +- Gülmez, G. (2026). *Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models.* [arXiv:2512.18901](https://arxiv.org/abs/2512.18901) - grimjim (2025). *Norm-Preserving Biprojected Abliteration.* [HuggingFace](https://huggingface.co/grimjim) - Turner et al. (2023). *Activation Addition: Steering Language Models Without Optimization.* [arXiv:2308.10248](https://arxiv.org/abs/2308.10248) - Rimsky et al. (2024). *Steering Llama 2 via Contrastive Activation Addition.* [arXiv:2312.06681](https://arxiv.org/abs/2312.06681) diff --git a/app.py b/app.py index ba32a9b..8616c33 100644 --- a/app.py +++ b/app.py @@ -18,6 +18,7 @@ ZeroGPU Support: from __future__ import annotations import gc +import json as _json import os import re import time @@ -117,7 +118,84 @@ _obliterate_counter: int = 0 # Flag to suppress session_model_dd.change when obliterate programmatically # sets the dropdown value (prevents wasteful GPU re-allocation on ZeroGPU) -_skip_session_load: bool = False +_skip_session_load: int = 0 # counter (not bool) — obliterate sets to 2 for both dropdowns + +# --------------------------------------------------------------------------- +# ZeroGPU session persistence — survive process restarts +# --------------------------------------------------------------------------- +# On ZeroGPU Spaces, the container may restart between requests (idle timeout, +# scaling, etc.). The browser retains the old dropdown values but the Python +# process loses all in-memory state (_state, _session_models). To recover, +# we persist a small JSON sidecar next to each checkpoint. + +_SESSION_META_FILE = "obliteratus_session.json" + + +def _persist_session_meta(output_dir: str, label: str, meta: dict) -> None: + """Write session metadata next to a checkpoint so we can recover later.""" + try: + p = Path(output_dir) / _SESSION_META_FILE + data = {"label": label, **meta} + p.write_text(_json.dumps(data, indent=2)) + except Exception: + pass # best-effort + + +def _recover_sessions_from_disk() -> None: + """Scan /tmp for obliterated checkpoints and repopulate _session_models. + + Called on startup and when a stale dropdown value is detected. Skips + directories that are already registered. + """ + global _last_obliterated_label, _obliterate_counter + found_any = False + for pattern in ("obliterated_*", "obliterated", "bench_*"): + for p in Path("/tmp").glob(pattern): + if not p.is_dir(): + continue + meta_file = p / _SESSION_META_FILE + if not meta_file.exists(): + continue + try: + data = _json.loads(meta_file.read_text()) + except Exception: + continue + label = data.get("label", p.name) + if label in _session_models: + continue # already registered + _session_models[label] = { + "model_id": data.get("model_id", ""), + "model_choice": data.get("model_choice", data.get("model_id", "")), + "method": data.get("method", "unknown"), + "dataset_key": data.get("dataset_key", ""), + "prompt_volume": data.get("prompt_volume", 0), + "output_dir": str(p), + "source": data.get("source", "recovered"), + } + found_any = True + # Track the latest for auto-select + _last_obliterated_label = label + # Keep counter above any existing numbered dirs + if p.name.startswith("obliterated_"): + try: + idx = int(p.name.split("_", 1)[1]) + if idx >= _obliterate_counter: + _obliterate_counter = idx + 1 + except (ValueError, IndexError): + pass + # If we recovered sessions but _state has no output_dir, set it to the + # most recent checkpoint so chat_respond can reload from disk. + if found_any and not _state.get("output_dir"): + with _lock: + latest = _last_obliterated_label + if latest and latest in _session_models: + _state["output_dir"] = _session_models[latest]["output_dir"] + _state["model_name"] = _session_models[latest].get("model_choice") + _state["method"] = _session_models[latest].get("method") + + +# Run recovery on import (app startup) +_recover_sessions_from_disk() # --------------------------------------------------------------------------- # Model presets — 100+ models organized by provider @@ -961,6 +1039,14 @@ def benchmark( "prompt_volume": prompt_volume, "output_dir": bench_save_path, } + _persist_session_meta(bench_save_path, label, { + "model_id": model_id, + "model_choice": model_choice, + "method": method_key, + "dataset_key": dataset_key, + "prompt_volume": prompt_volume, + "source": "benchmark", + }) # Explicitly free the pipeline and its model to reclaim GPU memory # before the next benchmark iteration. _clear_gpu() only clears @@ -1306,6 +1392,14 @@ def benchmark_multi_model( "prompt_volume": prompt_volume, "output_dir": mm_save_path, } + _persist_session_meta(mm_save_path, label, { + "model_id": model_id, + "model_choice": model_display, + "method": method_key, + "dataset_key": dataset_key, + "prompt_volume": prompt_volume, + "source": "benchmark_mm", + }) # Explicitly free pipeline and model before next iteration if pipeline_ref[0] is not None: @@ -1734,6 +1828,16 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str, _state["steering"] = steering_meta _state["output_dir"] = save_dir # for ZeroGPU checkpoint reload + # Persist session metadata to disk so we survive ZeroGPU process restarts + _persist_session_meta(save_dir, _cache_label, { + "model_id": model_id, + "model_choice": model_choice, + "method": method, + "dataset_key": dataset_key if not use_custom else "custom", + "prompt_volume": prompt_volume, + "source": "obliterate", + }) + if can_generate: # Model fits — use it directly (steering hooks already installed) with _lock: @@ -1870,7 +1974,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str, # Set skip flag so the .change handler doesn't trigger a wasteful # GPU re-allocation — the model is already loaded. global _skip_session_load - _skip_session_load = True + _skip_session_load = 2 # both session_model_dd and ab_session_model_dd fire .change _dd_update = gr.update( choices=_get_session_model_choices(), value=_last_obliterated_label or None, @@ -1947,25 +2051,30 @@ def chat_respond(message: str, history: list[dict], system_prompt: str, model = _state["model"] tokenizer = _state["tokenizer"] - if model is None or tokenizer is None: - yield "No model loaded yet. Go to the **Obliterate** tab first and liberate a model." - return + # ZeroGPU safety: detect whether we need to reload from checkpoint. + # Between GPU allocations, ZeroGPU may deallocate GPU memory, leaving + # model as None (garbage-collected) or with stale/meta tensors. + # Meta tensors raise NotImplementedError on .to(), not RuntimeError, + # so we catch Exception broadly here. + _needs_reload = model is None or tokenizer is None + if not _needs_reload: + try: + dev = next(model.parameters()).device + if dev.type == "meta": + _needs_reload = True + elif torch.cuda.is_available() and dev.type != "cuda": + model.to("cuda") + except Exception: + _needs_reload = True - # ZeroGPU safety: ensure model is on GPU if available. - # Between GPU allocations, ZeroGPU may have moved the model to CPU/meta, - # or tensors may be stale from a previous GPU context. - # The @spaces.GPU decorator guarantees a GPU is available here. - _needs_reload = False - try: - dev = next(model.parameters()).device - if torch.cuda.is_available() and dev.type != "cuda": - model.to("cuda") - except (StopIteration, RuntimeError): - _needs_reload = True - - # If model tensors are stale/meta, reload from the saved checkpoint - if _needs_reload and _ZEROGPU_AVAILABLE: + # Reload from saved checkpoint if model is missing or stale + if _needs_reload: checkpoint = _state.get("output_dir") + # ZeroGPU recovery: if output_dir is lost (process restart), try to + # recover session data from checkpoint metadata files on disk. + if not checkpoint or not Path(checkpoint).exists(): + _recover_sessions_from_disk() + checkpoint = _state.get("output_dir") if checkpoint and Path(checkpoint).exists(): try: is_preset = (_state.get("model_name") or "") in MODELS @@ -1985,11 +2094,12 @@ def chat_respond(message: str, history: list[dict], system_prompt: str, with _lock: _state["model"] = model _state["tokenizer"] = tokenizer + _state["status"] = "ready" except Exception: yield "Model failed to reload from checkpoint. Try re-obliterating." return else: - yield "Model tensors are stale (ZeroGPU). Re-obliterate to create a fresh checkpoint." + yield "No model loaded yet. Go to the **Obliterate** tab first and liberate a model." return # Sanitize inputs to prevent resource exhaustion @@ -2117,8 +2227,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()): # Skip if the obliterate function just set the dropdown value — the model # is already loaded and we'd just waste GPU quota re-allocating. global _skip_session_load - if _skip_session_load: - _skip_session_load = False + if _skip_session_load > 0: + _skip_session_load -= 1 if choice and _state.get("status") == "ready": yield ( f"**Ready!** `{choice}` is loaded — just type in the chat below.", @@ -2127,8 +2237,65 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()): return if not choice or choice not in _bench_configs: - yield "**Error:** No benchmark result selected. Pick a model from the dropdown first.", "" - return + # On ZeroGPU, global state may be lost between process restarts. + # Try to recover session data from checkpoint metadata files on disk. + if choice and choice not in _bench_configs: + _recover_sessions_from_disk() + # After recovery, the choice might now be in _bench_configs + if choice in _bench_configs: + pass # fall through to the normal loading path below + else: + # choice still not found — but we may have recovered output_dir + pass + + # If recovery didn't find the exact choice, check if model is loaded + if choice not in _bench_configs: + with _lock: + if _state["status"] == "ready" and _state["model"] is not None: + yield ( + f"**Ready!** Model already loaded — just type in the chat below.", + get_chat_header(), + ) + return + # Check if we can reload from a checkpoint on disk + checkpoint = _state.get("output_dir") + if checkpoint and Path(checkpoint).exists(): + yield ( + f"**Loading model** from saved checkpoint...", + "", + ) + # If we have a checkpoint, attempt reload outside the lock + checkpoint = _state.get("output_dir") + if checkpoint and Path(checkpoint).exists(): + is_preset = (_state.get("model_name") or "") in MODELS + try: + model_loaded = AutoModelForCausalLM.from_pretrained( + checkpoint, device_map="auto", torch_dtype=torch.float16, + trust_remote_code=is_preset, + ) + tokenizer_loaded = AutoTokenizer.from_pretrained( + checkpoint, trust_remote_code=is_preset, + ) + if tokenizer_loaded.pad_token is None: + tokenizer_loaded.pad_token = tokenizer_loaded.eos_token + with _lock: + _state["model"] = model_loaded + _state["tokenizer"] = tokenizer_loaded + _state["status"] = "ready" + yield ( + f"**Loaded!** Model reloaded from checkpoint — ready to chat.", + get_chat_header(), + ) + return + except Exception as e: + yield f"**Error:** Could not reload model: {e}", get_chat_header() + return + yield ( + "**Error:** Model checkpoint not found. The Space may have restarted — " + "please re-obliterate the model on the **Obliterate** tab.", + "", + ) + return cfg = _bench_configs[choice] model_id = cfg["model_id"] @@ -2320,28 +2487,27 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[ tokenizer = _state["tokenizer"] model_name = _state["model_name"] - if abliterated_model is None or tokenizer is None: - yield (history_left + [{"role": "user", "content": message}, - {"role": "assistant", "content": "No abliterated model loaded. Obliterate a model first."}], - history_right + [{"role": "user", "content": message}, - {"role": "assistant", "content": "No abliterated model loaded. Obliterate a model first."}], - "Load a model first.", - "#### Original (Pre-Abliteration)", - "#### Abliterated") - return + # ZeroGPU safety: detect whether we need to reload from checkpoint. + # Model may be None (garbage-collected after GPU deallocation) or stale. + # Meta tensors raise NotImplementedError on .to(), so catch broadly. + _needs_reload = abliterated_model is None or tokenizer is None + if not _needs_reload: + try: + dev = next(abliterated_model.parameters()).device + if dev.type == "meta": + _needs_reload = True + elif torch.cuda.is_available() and dev.type != "cuda": + abliterated_model.to("cuda") + except Exception: + _needs_reload = True - # ZeroGPU safety: ensure model is on GPU if available. - # If tensors are stale from a prior GPU context, reload from checkpoint. - _needs_reload = False - try: - dev = next(abliterated_model.parameters()).device - if torch.cuda.is_available() and dev.type != "cuda": - abliterated_model.to("cuda") - except (StopIteration, RuntimeError): - _needs_reload = True - - if _needs_reload and _ZEROGPU_AVAILABLE: + if _needs_reload: checkpoint = _state.get("output_dir") + # ZeroGPU recovery: try disk scan if output_dir is lost + if not checkpoint or not Path(checkpoint).exists(): + _recover_sessions_from_disk() + checkpoint = _state.get("output_dir") + model_name = _state.get("model_name") or model_name if checkpoint and Path(checkpoint).exists(): try: is_preset = (model_name or "") in MODELS @@ -2361,8 +2527,19 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[ with _lock: _state["model"] = abliterated_model _state["tokenizer"] = tokenizer + _state["status"] = "ready" except Exception: pass # Fall through — will fail at generation with a clear error + else: + _no_model_msg = "No abliterated model loaded. Obliterate a model first." + yield (history_left + [{"role": "user", "content": message}, + {"role": "assistant", "content": _no_model_msg}], + history_right + [{"role": "user", "content": message}, + {"role": "assistant", "content": _no_model_msg}], + "Load a model first.", + "#### Original (Pre-Abliteration)", + "#### Abliterated") + return # Build header strings showing model name on each side header_left = f"#### Original (Pre-Abliteration)\n`{model_name}`" diff --git a/docs/RESEARCH_SURVEY.md b/docs/RESEARCH_SURVEY.md index 9c632dc..1a511dc 100644 --- a/docs/RESEARCH_SURVEY.md +++ b/docs/RESEARCH_SURVEY.md @@ -116,7 +116,7 @@ The paper also analyzes how adversarial suffixes (e.g., GCG-generated) suppress ## 2. Gabliteration (arXiv:2512.18901) — Multi-Direction Subspace Approach {#2-gabliteration} -**Author:** Gokdeniz Gulmez (independent research) +**Author:** Gökdeniz Gülmez (independent research) **arXiv:** [2512.18901](https://arxiv.org/abs/2512.18901) **Version:** v3, revised January 28, 2026 **Models:** [Hugging Face collection](https://huggingface.co/collections/Goekdeniz-Guelmez/gabliteration) @@ -733,7 +733,7 @@ MI research helps make AI safe but could be used adversarially. The same techniq 1. Arditi, A., Obeso, O., Syed, A., Paleka, D., Panickssery, N., Gurnee, W., & Nanda, N. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024. [arXiv:2406.11717](https://arxiv.org/abs/2406.11717) -2. Gulmez, G. (2025). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. [arXiv:2512.18901](https://arxiv.org/abs/2512.18901) +2. Gülmez, G. (2026). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. [arXiv:2512.18901](https://arxiv.org/abs/2512.18901) 3. grimjim. (2025). Norm-Preserving Biprojected Abliteration / MPOA. [HuggingFace Blog](https://huggingface.co/blog/grimjim/norm-preserving-biprojected-abliteration) | [Projected Abliteration](https://huggingface.co/blog/grimjim/projected-abliteration) | [Code](https://github.com/jim-plus/llm-abliteration) diff --git a/docs/theory_journal.md b/docs/theory_journal.md index f9b10b6..ce7b192 100644 --- a/docs/theory_journal.md +++ b/docs/theory_journal.md @@ -1802,7 +1802,7 @@ implementations diverge from the closed-form GRRO solution. ## References 1. Arditi, A. et al. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024. -2. Gulmez, G. (2025). Gabliteration. arXiv:2512.18901. +2. Gülmez, G. (2026). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. arXiv:2512.18901. 3. grimjim (2025). Norm-Preserving Biprojected Abliteration (MPOA). HuggingFace. 4. Wollschlager, T. et al. (2025). The Geometry of Refusal. ICML 2025. 5. Joad et al. (2026). There Is More to Refusal than a Single Direction. arXiv:2602.02132. diff --git a/notebooks/abliterate.ipynb b/notebooks/abliterate.ipynb index 52ff463..e5085a5 100644 --- a/notebooks/abliterate.ipynb +++ b/notebooks/abliterate.ipynb @@ -53,7 +53,7 @@ "id": "install" }, "outputs": [], - "source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1024**3:.1f} GB\")" + "source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\")" }, { "cell_type": "markdown", diff --git a/obliteratus/abliterate.py b/obliteratus/abliterate.py index df010d5..a90ff3c 100644 --- a/obliteratus/abliterate.py +++ b/obliteratus/abliterate.py @@ -334,7 +334,7 @@ METHODS = { "layer_selection": "middle60", }, "gabliteration": { - "label": "Gabliteration (Gulmez 2025 Baseline)", + "label": "Gabliteration (Gülmez 2026 Baseline)", "description": ( "Faithful reproduction of Gabliteration (arXiv:2512.18901). " "SVD-based multi-direction extraction (top-4), ridge-regularized " @@ -2494,7 +2494,7 @@ class AbliterationPipeline: References: - SteerMoE (Fayyaz et al., 2025): expert activation frequency analysis - - Gabliteration (Gulmez, 2025): multi-direction SVD abliteration + - Gabliteration (Gülmez, 2026): multi-direction SVD abliteration - SAFEx (Lai et al., NeurIPS 2025): safety expert identification """ if not self._routing_harmful or not self._routing_harmless: diff --git a/obliteratus/informed_pipeline.py b/obliteratus/informed_pipeline.py index 3758acd..4ee95ac 100644 --- a/obliteratus/informed_pipeline.py +++ b/obliteratus/informed_pipeline.py @@ -919,7 +919,7 @@ class InformedAbliterationPipeline(AbliterationPipeline): "Arditi et al., Refusal in Language Models Is Mediated by a Single Direction (2024)", "Gabliteration: SVD-based multi-direction extraction (arXiv:2512.18901)", "grimjim, Norm-Preserving Biprojected Abliteration (2025)", - "Gurnee & Nanda, The Geometry of Refusal in LLMs — concept cones (ICML 2025)", + "Wollschlager et al., The Geometry of Refusal in LLMs — concept cones (ICML 2025, arXiv:2502.17420)", "Joad et al., The Ouroboros Effect: Self-Repair in Abliterated LLMs (2026)", "OBLITERATUS: Analysis-informed abliteration pipeline (novel)", ], diff --git a/obliteratus/interactive.py b/obliteratus/interactive.py index c5d31dd..2a8cabf 100644 --- a/obliteratus/interactive.py +++ b/obliteratus/interactive.py @@ -24,7 +24,7 @@ def _detect_compute_tier() -> str: import torch if torch.cuda.is_available(): - vram_gb = torch.cuda.get_device_properties(0).total_mem / (1024**3) + vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3) if vram_gb >= 20: return "large" elif vram_gb >= 8: diff --git a/obliteratus/local_ui.py b/obliteratus/local_ui.py index e9f9c6a..6aad8bc 100644 --- a/obliteratus/local_ui.py +++ b/obliteratus/local_ui.py @@ -10,6 +10,7 @@ Usage: from __future__ import annotations import os +import pathlib import platform import shutil import sys @@ -48,7 +49,7 @@ def _detect_gpu() -> list[dict]: { "index": i, "name": props.name, - "vram_gb": round(props.total_mem / 1024**3, 1), + "vram_gb": round(props.total_memory / 1024**3, 1), "compute": f"{props.major}.{props.minor}", } ) @@ -292,6 +293,12 @@ def launch_local_ui( console.print("[dim]Loading OBLITERATUS UI (this may take a moment on first run)...[/dim]") start = time.time() + # app.py lives at the project root, one level above this package. + # When installed via pip the root isn't on sys.path, so add it. + _project_root = str(pathlib.Path(__file__).resolve().parent.parent) + if _project_root not in sys.path: + sys.path.insert(0, _project_root) + from app import launch as app_launch elapsed = time.time() - start diff --git a/paper/references.bib b/paper/references.bib index 6124fea..7db917b 100644 --- a/paper/references.bib +++ b/paper/references.bib @@ -7,11 +7,14 @@ year={2024} } -@article{gabliteration2024, - title={{Gabliteration}: {SVD}-Based Multi-Direction Refusal Removal}, - author={Gabriel, Saul and {contributors}}, - journal={arXiv preprint arXiv:2512.18901}, - year={2024} +@misc{gabliteration2024, + title={{Gabliteration}: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models}, + author={G\"{o}kdeniz G\"{u}lmez}, + year={2026}, + eprint={2512.18901}, + archivePrefix={arXiv}, + primaryClass={cs.AI}, + url={https://arxiv.org/abs/2512.18901} } @misc{grimjim2025, diff --git a/scripts/benchmark_sota_comparison.py b/scripts/benchmark_sota_comparison.py index 7df6094..1c92b5f 100644 --- a/scripts/benchmark_sota_comparison.py +++ b/scripts/benchmark_sota_comparison.py @@ -7,7 +7,7 @@ comparison tables with standardized community metrics. Baselines included: 1. FailSpy/abliterator (2024) — Community workhorse baseline - 2. Gabliteration (Gulmez 2025) — SVD multi-direction + ridge regularization + 2. Gabliteration (Gülmez 2026) — SVD multi-direction + ridge regularization 3. Heretic / p-e-w (2025) — Bayesian TPE auto-tuning (current SOTA for quality) 4. Wollschlager RDO (ICML 2025) — Gradient-based direction optimization