Add files via upload

2026-07-24 12:50:54 +02:00 · 2026-03-05 10:03:46 -08:00
parent 6120061553
commit 984ce14059
11 changed files with 247 additions and 60 deletions
@@ -526,7 +526,7 @@ Works with any HuggingFace transformer, including: GPT-2, LLaMA, Mistral, Falcon
 ## References

 - Arditi et al. (2024). *Refusal in Language Models Is Mediated by a Single Direction.* [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
- Gulmez, G. (2025). *Gabliteration: SVD-Based Multi-Direction Refusal Removal.* [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
+- Gülmez, G. (2026). *Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models.* [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
 - grimjim (2025). *Norm-Preserving Biprojected Abliteration.* [HuggingFace](https://huggingface.co/grimjim)
 - Turner et al. (2023). *Activation Addition: Steering Language Models Without Optimization.* [arXiv:2308.10248](https://arxiv.org/abs/2308.10248)
 - Rimsky et al. (2024). *Steering Llama 2 via Contrastive Activation Addition.* [arXiv:2312.06681](https://arxiv.org/abs/2312.06681)
@@ -18,6 +18,7 @@ ZeroGPU Support:
 from __future__ import annotations

 import gc
+import json as _json
 import os
 import re
 import time
@@ -117,7 +118,84 @@ _obliterate_counter: int = 0

 # Flag to suppress session_model_dd.change when obliterate programmatically
 # sets the dropdown value (prevents wasteful GPU re-allocation on ZeroGPU)
-_skip_session_load: bool = False
+_skip_session_load: int = 0  # counter (not bool) — obliterate sets to 2 for both dropdowns
+
+# ---------------------------------------------------------------------------
+# ZeroGPU session persistence — survive process restarts
+# ---------------------------------------------------------------------------
+# On ZeroGPU Spaces, the container may restart between requests (idle timeout,
+# scaling, etc.).  The browser retains the old dropdown values but the Python
+# process loses all in-memory state (_state, _session_models).  To recover,
+# we persist a small JSON sidecar next to each checkpoint.
+
+# File name of the JSON sidecar stored inside each checkpoint directory.
+_SESSION_META_FILE = "obliteratus_session.json"
+
+
+def _persist_session_meta(output_dir: str, label: str, meta: dict) -> None:
+    """Write session metadata next to a checkpoint so we can recover later."""
+    try:
+        p = Path(output_dir) / _SESSION_META_FILE
+        data = {"label": label, **meta}
+        p.write_text(_json.dumps(data, indent=2))
+    except Exception:
+        pass  # best-effort
+
+
+def _recover_sessions_from_disk() -> None:
+    """Scan /tmp for obliterated checkpoints and repopulate _session_models.
+
+    Called on startup and when a stale dropdown value is detected.  Skips
+    directories that are already registered.
+    """
+    global _last_obliterated_label, _obliterate_counter
+    found_any = False
+    for pattern in ("obliterated_*", "obliterated", "bench_*"):
+        for p in Path("/tmp").glob(pattern):
+            if not p.is_dir():
+                continue
+            meta_file = p / _SESSION_META_FILE
+            if not meta_file.exists():
+                continue
+            try:
+                data = _json.loads(meta_file.read_text())
+            except Exception:
+                continue
+            label = data.get("label", p.name)
+            if label in _session_models:
+                continue  # already registered
+            _session_models[label] = {
+                "model_id": data.get("model_id", ""),
+                "model_choice": data.get("model_choice", data.get("model_id", "")),
+                "method": data.get("method", "unknown"),
+                "dataset_key": data.get("dataset_key", ""),
+                "prompt_volume": data.get("prompt_volume", 0),
+                "output_dir": str(p),
+                "source": data.get("source", "recovered"),
+            }
+            found_any = True
+            # Track the latest for auto-select
+            _last_obliterated_label = label
+            # Keep counter above any existing numbered dirs
+            if p.name.startswith("obliterated_"):
+                try:
+                    idx = int(p.name.split("_", 1)[1])
+                    if idx >= _obliterate_counter:
+                        _obliterate_counter = idx + 1
+                except (ValueError, IndexError):
+                    pass
+    # If we recovered sessions but _state has no output_dir, set it to the
+    # most recent checkpoint so chat_respond can reload from disk.
+    if found_any and not _state.get("output_dir"):
+        with _lock:
+            latest = _last_obliterated_label
+            if latest and latest in _session_models:
+                _state["output_dir"] = _session_models[latest]["output_dir"]
+                _state["model_name"] = _session_models[latest].get("model_choice")
+                _state["method"] = _session_models[latest].get("method")
+
+
+# Run recovery on import (app startup).  The call writes to _session_models
+# and may take _lock to fill _state, so it must stay below the definitions
+# of those module globals.
+_recover_sessions_from_disk()

 # ---------------------------------------------------------------------------
 # Model presets — 100+ models organized by provider
@@ -961,6 +1039,14 @@ def benchmark(
                "prompt_volume": prompt_volume,
                "output_dir": bench_save_path,
            }
+            _persist_session_meta(bench_save_path, label, {
+                "model_id": model_id,
+                "model_choice": model_choice,
+                "method": method_key,
+                "dataset_key": dataset_key,
+                "prompt_volume": prompt_volume,
+                "source": "benchmark",
+            })

        # Explicitly free the pipeline and its model to reclaim GPU memory
        # before the next benchmark iteration. _clear_gpu() only clears
@@ -1306,6 +1392,14 @@ def benchmark_multi_model(
                "prompt_volume": prompt_volume,
                "output_dir": mm_save_path,
            }
+            _persist_session_meta(mm_save_path, label, {
+                "model_id": model_id,
+                "model_choice": model_display,
+                "method": method_key,
+                "dataset_key": dataset_key,
+                "prompt_volume": prompt_volume,
+                "source": "benchmark_mm",
+            })

        # Explicitly free pipeline and model before next iteration
        if pipeline_ref[0] is not None:
@@ -1734,6 +1828,16 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
            _state["steering"] = steering_meta
            _state["output_dir"] = save_dir  # for ZeroGPU checkpoint reload

+        # Persist session metadata to disk so we survive ZeroGPU process restarts
+        _persist_session_meta(save_dir, _cache_label, {
+            "model_id": model_id,
+            "model_choice": model_choice,
+            "method": method,
+            "dataset_key": dataset_key if not use_custom else "custom",
+            "prompt_volume": prompt_volume,
+            "source": "obliterate",
+        })
+
        if can_generate:
            # Model fits — use it directly (steering hooks already installed)
            with _lock:
@@ -1870,7 +1974,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
        # Set skip flag so the .change handler doesn't trigger a wasteful
        # GPU re-allocation — the model is already loaded.
        global _skip_session_load
-        _skip_session_load = True
+        _skip_session_load = 2  # both session_model_dd and ab_session_model_dd fire .change
        _dd_update = gr.update(
            choices=_get_session_model_choices(),
            value=_last_obliterated_label or None,
@@ -1947,25 +2051,30 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
        model = _state["model"]
        tokenizer = _state["tokenizer"]

-    if model is None or tokenizer is None:
-        yield "No model loaded yet. Go to the **Obliterate** tab first and liberate a model."
-        return
+    # ZeroGPU safety: detect whether we need to reload from checkpoint.
+    # Between GPU allocations, ZeroGPU may deallocate GPU memory, leaving
+    # model as None (garbage-collected) or with stale/meta tensors.
+    # Meta tensors raise NotImplementedError on .to(), not RuntimeError,
+    # so we catch Exception broadly here.
+    _needs_reload = model is None or tokenizer is None
+    if not _needs_reload:
+        try:
+            dev = next(model.parameters()).device
+            if dev.type == "meta":
+                _needs_reload = True
+            elif torch.cuda.is_available() and dev.type != "cuda":
+                model.to("cuda")
+        except Exception:
+            _needs_reload = True

-    # ZeroGPU safety: ensure model is on GPU if available.
-    # Between GPU allocations, ZeroGPU may have moved the model to CPU/meta,
-    # or tensors may be stale from a previous GPU context.
-    # The @spaces.GPU decorator guarantees a GPU is available here.
-    _needs_reload = False
-    try:
-        dev = next(model.parameters()).device
-        if torch.cuda.is_available() and dev.type != "cuda":
-            model.to("cuda")
-    except (StopIteration, RuntimeError):
-        _needs_reload = True
-
-    # If model tensors are stale/meta, reload from the saved checkpoint
-    if _needs_reload and _ZEROGPU_AVAILABLE:
+    # Reload from saved checkpoint if model is missing or stale
+    if _needs_reload:
        checkpoint = _state.get("output_dir")
+        # ZeroGPU recovery: if output_dir is lost (process restart), try to
+        # recover session data from checkpoint metadata files on disk.
+        if not checkpoint or not Path(checkpoint).exists():
+            _recover_sessions_from_disk()
+            checkpoint = _state.get("output_dir")
        if checkpoint and Path(checkpoint).exists():
            try:
                is_preset = (_state.get("model_name") or "") in MODELS
@@ -1985,11 +2094,12 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
                with _lock:
                    _state["model"] = model
                    _state["tokenizer"] = tokenizer
+                    _state["status"] = "ready"
            except Exception:
                yield "Model failed to reload from checkpoint. Try re-obliterating."
                return
        else:
-            yield "Model tensors are stale (ZeroGPU). Re-obliterate to create a fresh checkpoint."
+            yield "No model loaded yet. Go to the **Obliterate** tab first and liberate a model."
            return

    # Sanitize inputs to prevent resource exhaustion
@@ -2117,8 +2227,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
    # Skip if the obliterate function just set the dropdown value — the model
    # is already loaded and we'd just waste GPU quota re-allocating.
    global _skip_session_load
-    if _skip_session_load:
-        _skip_session_load = False
+    if _skip_session_load > 0:
+        _skip_session_load -= 1
        if choice and _state.get("status") == "ready":
            yield (
                f"**Ready!** `{choice}` is loaded — just type in the chat below.",
@@ -2127,8 +2237,65 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
            return

    if not choice or choice not in _bench_configs:
-        yield "**Error:** No benchmark result selected. Pick a model from the dropdown first.", ""
-        return
+        # On ZeroGPU, global state may be lost between process restarts.
+        # Try to recover session data from checkpoint metadata files on disk.
+        if choice and choice not in _bench_configs:
+            _recover_sessions_from_disk()
+            # After recovery, the choice might now be in _bench_configs
+            if choice in _bench_configs:
+                pass  # fall through to the normal loading path below
+            else:
+                # choice still not found — but we may have recovered output_dir
+                pass
+
+        # If recovery didn't find the exact choice, check if model is loaded
+        if choice not in _bench_configs:
+            with _lock:
+                if _state["status"] == "ready" and _state["model"] is not None:
+                    yield (
+                        f"**Ready!** Model already loaded — just type in the chat below.",
+                        get_chat_header(),
+                    )
+                    return
+                # Check if we can reload from a checkpoint on disk
+                checkpoint = _state.get("output_dir")
+                if checkpoint and Path(checkpoint).exists():
+                    yield (
+                        f"**Loading model** from saved checkpoint...",
+                        "",
+                    )
+            # If we have a checkpoint, attempt reload outside the lock
+            checkpoint = _state.get("output_dir")
+            if checkpoint and Path(checkpoint).exists():
+                is_preset = (_state.get("model_name") or "") in MODELS
+                try:
+                    model_loaded = AutoModelForCausalLM.from_pretrained(
+                        checkpoint, device_map="auto", torch_dtype=torch.float16,
+                        trust_remote_code=is_preset,
+                    )
+                    tokenizer_loaded = AutoTokenizer.from_pretrained(
+                        checkpoint, trust_remote_code=is_preset,
+                    )
+                    if tokenizer_loaded.pad_token is None:
+                        tokenizer_loaded.pad_token = tokenizer_loaded.eos_token
+                    with _lock:
+                        _state["model"] = model_loaded
+                        _state["tokenizer"] = tokenizer_loaded
+                        _state["status"] = "ready"
+                    yield (
+                        f"**Loaded!** Model reloaded from checkpoint — ready to chat.",
+                        get_chat_header(),
+                    )
+                    return
+                except Exception as e:
+                    yield f"**Error:** Could not reload model: {e}", get_chat_header()
+                    return
+            yield (
+                "**Error:** Model checkpoint not found. The Space may have restarted — "
+                "please re-obliterate the model on the **Obliterate** tab.",
+                "",
+            )
+            return

    cfg = _bench_configs[choice]
    model_id = cfg["model_id"]
@@ -2320,28 +2487,27 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
        tokenizer = _state["tokenizer"]
        model_name = _state["model_name"]

-    if abliterated_model is None or tokenizer is None:
-        yield (history_left + [{"role": "user", "content": message},
-                                {"role": "assistant", "content": "No abliterated model loaded. Obliterate a model first."}],
-               history_right + [{"role": "user", "content": message},
-                                 {"role": "assistant", "content": "No abliterated model loaded. Obliterate a model first."}],
-               "Load a model first.",
-               "#### Original (Pre-Abliteration)",
-               "#### Abliterated")
-        return
+    # ZeroGPU safety: detect whether we need to reload from checkpoint.
+    # Model may be None (garbage-collected after GPU deallocation) or stale.
+    # Meta tensors raise NotImplementedError on .to(), so catch broadly.
+    _needs_reload = abliterated_model is None or tokenizer is None
+    if not _needs_reload:
+        try:
+            dev = next(abliterated_model.parameters()).device
+            if dev.type == "meta":
+                _needs_reload = True
+            elif torch.cuda.is_available() and dev.type != "cuda":
+                abliterated_model.to("cuda")
+        except Exception:
+            _needs_reload = True

-    # ZeroGPU safety: ensure model is on GPU if available.
-    # If tensors are stale from a prior GPU context, reload from checkpoint.
-    _needs_reload = False
-    try:
-        dev = next(abliterated_model.parameters()).device
-        if torch.cuda.is_available() and dev.type != "cuda":
-            abliterated_model.to("cuda")
-    except (StopIteration, RuntimeError):
-        _needs_reload = True
-
-    if _needs_reload and _ZEROGPU_AVAILABLE:
+    if _needs_reload:
        checkpoint = _state.get("output_dir")
+        # ZeroGPU recovery: try disk scan if output_dir is lost
+        if not checkpoint or not Path(checkpoint).exists():
+            _recover_sessions_from_disk()
+            checkpoint = _state.get("output_dir")
+            model_name = _state.get("model_name") or model_name
        if checkpoint and Path(checkpoint).exists():
            try:
                is_preset = (model_name or "") in MODELS
@@ -2361,8 +2527,19 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
                with _lock:
                    _state["model"] = abliterated_model
                    _state["tokenizer"] = tokenizer
+                    _state["status"] = "ready"
            except Exception:
                pass  # Fall through — will fail at generation with a clear error
+        else:
+            _no_model_msg = "No abliterated model loaded. Obliterate a model first."
+            yield (history_left + [{"role": "user", "content": message},
+                                    {"role": "assistant", "content": _no_model_msg}],
+                   history_right + [{"role": "user", "content": message},
+                                     {"role": "assistant", "content": _no_model_msg}],
+                   "Load a model first.",
+                   "#### Original (Pre-Abliteration)",
+                   "#### Abliterated")
+            return

    # Build header strings showing model name on each side
    header_left = f"#### Original (Pre-Abliteration)\n`{model_name}`"
@@ -116,7 +116,7 @@ The paper also analyzes how adversarial suffixes (e.g., GCG-generated) suppress

 ## 2. Gabliteration (arXiv:2512.18901) — Multi-Direction Subspace Approach {#2-gabliteration}

-**Author:** Gokdeniz Gulmez (independent research)
+**Author:** Gökdeniz Gülmez (independent research)
 **arXiv:** [2512.18901](https://arxiv.org/abs/2512.18901)
 **Version:** v3, revised January 28, 2026
 **Models:** [Hugging Face collection](https://huggingface.co/collections/Goekdeniz-Guelmez/gabliteration)
@@ -733,7 +733,7 @@ MI research helps make AI safe but could be used adversarially. The same techniq

 1. Arditi, A., Obeso, O., Syed, A., Paleka, D., Panickssery, N., Gurnee, W., & Nanda, N. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024. [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)

-2. Gulmez, G. (2025). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
+2. Gülmez, G. (2026). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)

 3. grimjim. (2025). Norm-Preserving Biprojected Abliteration / MPOA. [HuggingFace Blog](https://huggingface.co/blog/grimjim/norm-preserving-biprojected-abliteration) | [Projected Abliteration](https://huggingface.co/blog/grimjim/projected-abliteration) | [Code](https://github.com/jim-plus/llm-abliteration)

@@ -1802,7 +1802,7 @@ implementations diverge from the closed-form GRRO solution.
 ## References

 1. Arditi, A. et al. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024.
-2. Gulmez, G. (2025). Gabliteration. arXiv:2512.18901.
+2. Gülmez, G. (2026). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. arXiv:2512.18901.
 3. grimjim (2025). Norm-Preserving Biprojected Abliteration (MPOA). HuggingFace.
 4. Wollschlager, T. et al. (2025). The Geometry of Refusal. ICML 2025.
 5. Joad et al. (2026). There Is More to Refusal than a Single Direction. arXiv:2602.02132.
@@ -53,7 +53,7 @@
    "id": "install"
   },
   "outputs": [],
-   "source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n    print(f\"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1024**3:.1f} GB\")"
+   "source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n    print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\")"
  },
  {
   "cell_type": "markdown",
@@ -334,7 +334,7 @@ METHODS = {
        "layer_selection": "middle60",
    },
    "gabliteration": {
-        "label": "Gabliteration (Gulmez 2025 Baseline)",
+        "label": "Gabliteration (Gülmez 2026 Baseline)",
        "description": (
            "Faithful reproduction of Gabliteration (arXiv:2512.18901). "
            "SVD-based multi-direction extraction (top-4), ridge-regularized "
@@ -2494,7 +2494,7 @@ class AbliterationPipeline:

        References:
        - SteerMoE (Fayyaz et al., 2025): expert activation frequency analysis
-        - Gabliteration (Gulmez, 2025): multi-direction SVD abliteration
+        - Gabliteration (Gülmez, 2026): multi-direction SVD abliteration
        - SAFEx (Lai et al., NeurIPS 2025): safety expert identification
        """
        if not self._routing_harmful or not self._routing_harmless:
@@ -919,7 +919,7 @@ class InformedAbliterationPipeline(AbliterationPipeline):
                "Arditi et al., Refusal in Language Models Is Mediated by a Single Direction (2024)",
                "Gabliteration: SVD-based multi-direction extraction (arXiv:2512.18901)",
                "grimjim, Norm-Preserving Biprojected Abliteration (2025)",
-                "Gurnee & Nanda, The Geometry of Refusal in LLMs — concept cones (ICML 2025)",
+                "Wollschlager et al., The Geometry of Refusal in LLMs — concept cones (ICML 2025, arXiv:2502.17420)",
                "Joad et al., The Ouroboros Effect: Self-Repair in Abliterated LLMs (2026)",
                "OBLITERATUS: Analysis-informed abliteration pipeline (novel)",
            ],
@@ -24,7 +24,7 @@ def _detect_compute_tier() -> str:
        import torch

        if torch.cuda.is_available():
-            vram_gb = torch.cuda.get_device_properties(0).total_mem / (1024**3)
+            vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            if vram_gb >= 20:
                return "large"
            elif vram_gb >= 8:
@@ -10,6 +10,7 @@ Usage:
 from __future__ import annotations

 import os
+import pathlib
 import platform
 import shutil
 import sys
@@ -48,7 +49,7 @@ def _detect_gpu() -> list[dict]:
                    {
                        "index": i,
                        "name": props.name,
-                        "vram_gb": round(props.total_mem / 1024**3, 1),
+                        "vram_gb": round(props.total_memory / 1024**3, 1),
                        "compute": f"{props.major}.{props.minor}",
                    }
                )
@@ -292,6 +293,12 @@ def launch_local_ui(
    console.print("[dim]Loading OBLITERATUS UI (this may take a moment on first run)...[/dim]")
    start = time.time()

+    # app.py lives at the project root, one level above this package.
+    # When installed via pip the root isn't on sys.path, so add it.
+    _project_root = str(pathlib.Path(__file__).resolve().parent.parent)
+    if _project_root not in sys.path:
+        sys.path.insert(0, _project_root)
+
    from app import launch as app_launch

    elapsed = time.time() - start
@@ -7,11 +7,14 @@
  year={2024}
 }

-@article{gabliteration2024,
-  title={{Gabliteration}: {SVD}-Based Multi-Direction Refusal Removal},
-  author={Gabriel, Saul and {contributors}},
-  journal={arXiv preprint arXiv:2512.18901},
-  year={2024}
+@misc{gabliteration2024,
+  title={{Gabliteration}: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models},
+  author={G\"{o}kdeniz G\"{u}lmez},
+  year={2026},
+  eprint={2512.18901},
+  archivePrefix={arXiv},
+  primaryClass={cs.AI},
+  url={https://arxiv.org/abs/2512.18901}
 }

@misc{grimjim2025,
@@ -7,7 +7,7 @@ comparison tables with standardized community metrics.

 Baselines included:
  1. FailSpy/abliterator (2024) — Community workhorse baseline
-  2. Gabliteration (Gulmez 2025) — SVD multi-direction + ridge regularization
+  2. Gabliteration (Gülmez 2026) — SVD multi-direction + ridge regularization
  3. Heretic / p-e-w (2025) — Bayesian TPE auto-tuning (current SOTA for quality)
  4. Wollschlager RDO (ICML 2025) — Gradient-based direction optimization