From 34032b78214929f0ecd2ea5ae34914cdeb061e0b Mon Sep 17 00:00:00 2001
From: Stella Biderman <stellabiderman@gmail.com>
Date: Wed, 11 Mar 2026 15:51:16 -0400
Subject: [PATCH 01/10] Add remote SSH execution support for GPU nodes

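Adds a --remote [user@]host flag to the obliterate (and its abliterate
alias), run, and tourney commands, enabling execution on remote GPU nodes
via SSH. Companion flags --ssh-key, --ssh-port, --remote-dir,
--remote-python, and --no-sync control the connection, the remote working
directory, the remote interpreter, and whether results are copied back.
The run command also honors a remote: section in its YAML config. The
remote runner handles the SSH connectivity check, GPU detection via
nvidia-smi (a warning only; the run proceeds if no GPU is reported),
installation of obliteratus from git when it is not importable on the
remote, log streaming, and syncing results back to the local machine via
scp.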
---
 examples/remote_gpu_node.yaml |  41 ++++
 obliteratus/__init__.py       |   8 +
 obliteratus/cli.py            | 181 ++++++++++++++-
 obliteratus/config.py         |  19 ++
 obliteratus/remote.py         | 417 ++++++++++++++++++++++++++++++++++
 5 files changed, 663 insertions(+), 3 deletions(-)
 create mode 100644 examples/remote_gpu_node.yaml
 create mode 100644 obliteratus/remote.py

diff --git a/examples/remote_gpu_node.yaml b/examples/remote_gpu_node.yaml
new file mode 100644
index 0000000..b6b533c
--- /dev/null
+++ b/examples/remote_gpu_node.yaml
@@ -0,0 +1,41 @@
+# Example: Run an ablation study on a remote GPU node via SSH.
+#
+# Usage:
+#   obliteratus run examples/remote_gpu_node.yaml
+#
+# The 'remote' section tells Obliteratus to SSH into the specified host,
+# install obliteratus if needed, run the pipeline there, and copy results
+# back to the local machine.
+#
+# You can also use --remote on any command instead of a YAML section:
+#   obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --remote root@gpu-node --ssh-key ~/.ssh/id_rsa
+
+model:
+  name: meta-llama/Llama-3.1-8B-Instruct
+  task: causal_lm
+  dtype: float16
+  device: auto
+
+dataset:
+  name: wikitext
+  split: test
+  max_samples: 500
+
+strategies:
+  - name: layer_removal
+    params:
+      layer_indices: [10, 11, 12]
+
+metrics: [perplexity]
+batch_size: 8
+max_length: 512
+output_dir: results/remote_example
+
+remote:
+  host: gpu-node.example.com
+  user: root
+  port: 22
+  ssh_key: ~/.ssh/id_rsa
+  remote_dir: /tmp/obliteratus_run
+  python: python3
+  sync_results: true
diff --git a/obliteratus/__init__.py b/obliteratus/__init__.py
index 7f70058..2039325 100644
--- a/obliteratus/__init__.py
+++ b/obliteratus/__init__.py
@@ -17,6 +17,8 @@ __all__ = [
     "TourneyResult",
     "get_adaptive_recommendation",
     "AdaptiveRecommendation",
+    "RemoteRunner",
+    "RemoteConfig",
 ]
 
 
@@ -60,4 +62,10 @@ def __getattr__(name):
     if name == "AdaptiveRecommendation":
         from obliteratus.adaptive_defaults import AdaptiveRecommendation
         return AdaptiveRecommendation
+    if name == "RemoteRunner":
+        from obliteratus.remote import RemoteRunner
+        return RemoteRunner
+    if name == "RemoteConfig":
+        from obliteratus.remote import RemoteConfig
+        return RemoteConfig
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/obliteratus/cli.py b/obliteratus/cli.py
index e32fb25..b9595ed 100644
--- a/obliteratus/cli.py
+++ b/obliteratus/cli.py
@@ -22,6 +22,35 @@ _BANNER = r"""
 """
 
 
+def _add_remote_args(parser):
+    """Add --remote execution flags to a subcommand parser."""
+    remote_group = parser.add_argument_group("remote execution")
+    remote_group.add_argument(
+        "--remote", type=str, default=None, metavar="[USER@]HOST",
+        help="Run on a remote GPU node via SSH (e.g. root@gpu-node or just gpu-node)",
+    )
+    remote_group.add_argument(
+        "--ssh-key", type=str, default=None,
+        help="Path to SSH private key (default: use SSH agent or ~/.ssh/id_rsa)",
+    )
+    remote_group.add_argument(
+        "--ssh-port", type=int, default=22,
+        help="SSH port on remote host (default: 22)",
+    )
+    remote_group.add_argument(
+        "--remote-dir", type=str, default="/tmp/obliteratus_run",
+        help="Working directory on the remote machine (default: /tmp/obliteratus_run)",
+    )
+    remote_group.add_argument(
+        "--remote-python", type=str, default="python3",
+        help="Python binary on the remote machine (default: python3)",
+    )
+    remote_group.add_argument(
+        "--no-sync", action="store_true", default=False,
+        help="Don't copy results back to local machine after remote run",
+    )
+
+
 def main(argv: list[str] | None = None):
     console.print(_BANNER)
     parser = argparse.ArgumentParser(
@@ -40,6 +69,7 @@ def main(argv: list[str] | None = None):
         default=None,
         help="Apply a preset (e.g. quick, full, attention, jailbreak, guardrail)",
     )
+    _add_remote_args(run_parser)
 
     # --- info ---
     info_parser = subparsers.add_parser("info", help="Print model architecture info")
@@ -144,9 +174,11 @@ def main(argv: list[str] | None = None):
         help="One-click: remove refusal directions from a model (SOTA multi-technique)",
     )
     _add_obliterate_args(abl_parser)
+    _add_remote_args(abl_parser)
     # Backward-compat alias (hidden from help)
     abl_alias = subparsers.add_parser("abliterate", help=argparse.SUPPRESS)
     _add_obliterate_args(abl_alias)
+    _add_remote_args(abl_alias)
 
     # --- report ---
     report_parser = subparsers.add_parser("report", help="Regenerate report from saved results")
@@ -180,6 +212,7 @@ def main(argv: list[str] | None = None):
         "--methods", type=str, nargs="+", default=None,
         help="Override: only run these methods (space-separated)",
     )
+    _add_remote_args(tourney_parser)
 
     # --- recommend ---
     recommend_parser = subparsers.add_parser(
@@ -197,7 +230,10 @@ def main(argv: list[str] | None = None):
     args = parser.parse_args(argv)
 
     if args.command == "run":
-        _cmd_run(args)
+        if getattr(args, "remote", None):
+            _cmd_remote_run(args)
+        else:
+            _cmd_run(args)
     elif args.command == "interactive":
         _cmd_interactive()
     elif args.command == "models":
@@ -217,9 +253,15 @@ def main(argv: list[str] | None = None):
     elif args.command == "recommend":
         _cmd_recommend(args)
     elif args.command == "tourney":
-        _cmd_tourney(args)
+        if getattr(args, "remote", None):
+            _cmd_remote_tourney(args)
+        else:
+            _cmd_tourney(args)
     elif args.command in ("obliterate", "abliterate"):
-        _cmd_abliterate(args)
+        if getattr(args, "remote", None):
+            _cmd_remote_abliterate(args)
+        else:
+            _cmd_abliterate(args)
 
 
 def _cmd_ui(args):
@@ -314,6 +356,33 @@ def _cmd_run(args):
         config = StudyConfig.from_dict(raw)
     if args.output_dir:
         config.output_dir = args.output_dir
+
+    # If YAML has a remote: section, dispatch to remote runner
+    if config.remote is not None:
+        from obliteratus.remote import RemoteConfig as _RC, RemoteRunner
+
+        rc = _RC(
+            host=config.remote.host,
+            user=config.remote.user,
+            port=config.remote.port,
+            ssh_key=config.remote.ssh_key,
+            remote_dir=config.remote.remote_dir,
+            python=config.remote.python,
+            sync_results=config.remote.sync_results,
+        )
+        runner = RemoteRunner(rc)
+        result_path = runner.run_config(
+            local_config_path=args.config,
+            local_output_dir=config.output_dir,
+            preset=args.preset,
+        )
+        if result_path:
+            console.print(f"\n[bold green]Remote run complete.[/] Results at: [cyan]{result_path}[/]")
+        else:
+            console.print("[red]Remote run failed. Check logs above.[/]")
+            raise SystemExit(1)
+        return
+
     run_study(config)
 
 
@@ -653,5 +722,111 @@ def _cmd_abliterate(args):
     )
 
 
+def _make_remote_runner(args):
+    """Create a RemoteRunner from CLI --remote flags."""
+    from obliteratus.remote import RemoteConfig, RemoteRunner
+
+    rc = RemoteConfig.from_cli_args(
+        args.remote,
+        port=args.ssh_port,
+        ssh_key=args.ssh_key,
+        remote_dir=args.remote_dir,
+        python=args.remote_python,
+        sync_results=not args.no_sync,
+    )
+    return RemoteRunner(rc)
+
+
+def _cmd_remote_abliterate(args):
+    from rich.panel import Panel
+
+    runner = _make_remote_runner(args)
+
+    kwargs = {}
+    if args.method:
+        kwargs["method"] = args.method
+    if args.device:
+        kwargs["device"] = args.device
+    if args.dtype:
+        kwargs["dtype"] = args.dtype
+    if args.quantization:
+        kwargs["quantization"] = args.quantization
+    if args.n_directions is not None:
+        kwargs["n_directions"] = args.n_directions
+    if getattr(args, "direction_method", None):
+        kwargs["direction_method"] = args.direction_method
+    if args.regularization is not None:
+        kwargs["regularization"] = args.regularization
+    if args.refinement_passes is not None:
+        kwargs["refinement_passes"] = args.refinement_passes
+    if getattr(args, "large_model", False):
+        kwargs["large_model"] = True
+    if getattr(args, "verify_sample_size", None) is not None:
+        kwargs["verify_sample_size"] = args.verify_sample_size
+
+    result_path = runner.run_obliterate(
+        model=args.model,
+        local_output_dir=args.output_dir,
+        **kwargs,
+    )
+
+    if result_path:
+        console.print(
+            Panel(
+                f"[bold green]Remote abliteration complete![/]\n\n"
+                f"  Results at: [cyan]{result_path}[/]\n\n"
+                f"  [dim]Load with:[/] AutoModelForCausalLM.from_pretrained('{result_path}')",
+                border_style="green",
+                title="[bold green]REBIRTH COMPLETE (remote)[/]",
+            )
+        )
+    else:
+        console.print("[red]Remote abliteration failed. Check logs above.[/]")
+        raise SystemExit(1)
+
+
+def _cmd_remote_run(args):
+    runner = _make_remote_runner(args)
+    result_path = runner.run_config(
+        local_config_path=args.config,
+        local_output_dir=args.output_dir,
+        preset=args.preset,
+    )
+    if result_path:
+        console.print(f"\n[bold green]Remote run complete.[/] Results at: [cyan]{result_path}[/]")
+    else:
+        console.print("[red]Remote run failed. Check logs above.[/]")
+        raise SystemExit(1)
+
+
+def _cmd_remote_tourney(args):
+    from rich.panel import Panel
+
+    runner = _make_remote_runner(args)
+    result_path = runner.run_tourney(
+        model=args.model,
+        local_output_dir=args.output_dir,
+        device=args.device,
+        dtype=args.dtype,
+        quantization=args.quantization,
+        methods=args.methods,
+        hub_org=args.hub_org,
+        hub_repo=args.hub_repo,
+        dataset=args.dataset,
+    )
+    if result_path:
+        console.print(
+            Panel(
+                f"[bold green]Remote tournament complete![/]\n\n"
+                f"  Results at: [cyan]{result_path}[/]",
+                border_style="green",
+                title="[bold green]TOURNAMENT COMPLETE (remote)[/]",
+            )
+        )
+    else:
+        console.print("[red]Remote tournament failed. Check logs above.[/]")
+        raise SystemExit(1)
+
+
 if __name__ == "__main__":
     main()
diff --git a/obliteratus/config.py b/obliteratus/config.py
index 9947803..1a31e31 100644
--- a/obliteratus/config.py
+++ b/obliteratus/config.py
@@ -35,6 +35,19 @@ class StrategyConfig:
     params: dict[str, Any] = field(default_factory=dict)
 
 
+@dataclass
+class RemoteConfig:
+    """Optional remote execution settings for running on a GPU node via SSH."""
+
+    host: str
+    user: str = "root"
+    port: int = 22
+    ssh_key: str | None = None
+    remote_dir: str = "/tmp/obliteratus_run"
+    python: str = "python3"
+    sync_results: bool = True
+
+
 @dataclass
 class StudyConfig:
     """Top-level configuration for an ablation run."""
@@ -46,6 +59,7 @@ class StudyConfig:
     batch_size: int = 8
     max_length: int = 512
     output_dir: str = "results"
+    remote: RemoteConfig | None = None
 
     @classmethod
     def from_yaml(cls, path: str | Path) -> StudyConfig:
@@ -82,6 +96,10 @@ class StudyConfig:
         model = ModelConfig(**d["model"])
         dataset = DatasetConfig(**d["dataset"])
         strategies = [StrategyConfig(**s) for s in d["strategies"]]
+        remote = None
+        if "remote" in d and d["remote"]:
+            remote = RemoteConfig(**d["remote"])
+
         return cls(
             model=model,
             dataset=dataset,
@@ -90,6 +108,7 @@ class StudyConfig:
             batch_size=d.get("batch_size", 8),
             max_length=d.get("max_length", 512),
             output_dir=d.get("output_dir", "results"),
+            remote=remote,
         )
 
     def to_dict(self) -> dict:
diff --git a/obliteratus/remote.py b/obliteratus/remote.py
new file mode 100644
index 0000000..9659e04
--- /dev/null
+++ b/obliteratus/remote.py
@@ -0,0 +1,417 @@
+"""Remote execution support for Obliteratus.
+
+Run abliteration pipelines on remote GPU nodes via SSH. The remote machine
+must have CUDA-capable GPUs and a Python environment. Obliteratus will be
+auto-installed if not present.
+
+Usage (CLI):
+    obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct \
+        --remote user@gpu-node \
+        --ssh-key ~/.ssh/id_rsa
+
+Usage (YAML config):
+    remote:
+      host: gpu-node
+      user: root
+      ssh_key: ~/.ssh/id_rsa
+      remote_dir: /tmp/obliteratus_run
+"""
+
+from __future__ import annotations
+
+import os
+import shlex
+import subprocess
+import sys
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Callable
+
+from rich.console import Console
+
+console = Console()
+
+
+@dataclass
+class RemoteConfig:
+    """SSH connection and remote execution settings."""
+
+    host: str
+    user: str = "root"
+    port: int = 22
+    ssh_key: str | None = None
+    remote_dir: str = "/tmp/obliteratus_run"
+    install_timeout: int = 600  # seconds
+    python: str = "python3"  # remote python binary
+    sync_results: bool = True
+
+    @property
+    def ssh_target(self) -> str:
+        return f"{self.user}@{self.host}"
+
+    @classmethod
+    def from_cli_args(cls, remote_str: str, **kwargs) -> RemoteConfig:
+        """Parse 'user@host' or just 'host' from CLI --remote flag."""
+        if "@" in remote_str:
+            user, host = remote_str.rsplit("@", 1)
+        else:
+            user = "root"
+            host = remote_str
+        return cls(host=host, user=user, **kwargs)
+
+    @classmethod
+    def from_dict(cls, d: dict) -> RemoteConfig:
+        return cls(**{k: v for k, v in d.items() if k in cls.__dataclass_fields__})
+
+
+class RemoteRunner:
+    """Execute Obliteratus commands on a remote machine via SSH."""
+
+    def __init__(
+        self,
+        config: RemoteConfig,
+        on_log: Callable[[str], None] | None = None,
+    ):
+        self.config = config
+        self.on_log = on_log or (lambda msg: console.print(f"[dim][remote][/] {msg}"))
+
+    def _ssh_base_cmd(self) -> list[str]:
+        """Build base SSH command with common options."""
+        cmd = [
+            "ssh",
+            "-o", "StrictHostKeyChecking=no",
+            "-o", "BatchMode=yes",
+            "-o", "ConnectTimeout=30",
+            "-p", str(self.config.port),
+        ]
+        if self.config.ssh_key:
+            key_path = os.path.expanduser(self.config.ssh_key)
+            cmd.extend(["-i", key_path])
+        cmd.append(self.config.ssh_target)
+        return cmd
+
+    def _scp_base_cmd(self) -> list[str]:
+        """Build base SCP command."""
+        cmd = [
+            "scp",
+            "-o", "StrictHostKeyChecking=no",
+            "-o", "BatchMode=yes",
+            "-P", str(self.config.port),
+            "-r",
+        ]
+        if self.config.ssh_key:
+            key_path = os.path.expanduser(self.config.ssh_key)
+            cmd.extend(["-i", key_path])
+        return cmd
+
+    def run_ssh(self, remote_cmd: str, stream: bool = False, timeout: int | None = None) -> subprocess.CompletedProcess | int:
+        """Run a command on the remote host.
+
+        If stream=True, streams stdout/stderr in real-time and returns the
+        exit code. Otherwise returns CompletedProcess.
+        """
+        cmd = self._ssh_base_cmd() + [remote_cmd]
+
+        if stream:
+            proc = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                text=True,
+                bufsize=1,
+            )
+            try:
+                for line in proc.stdout:
+                    line = line.rstrip("\n")
+                    self.on_log(line)
+                proc.wait(timeout=timeout)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                self.on_log("[red]Remote command timed out[/]")
+                return 1
+            return proc.returncode
+        else:
+            return subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=timeout,
+            )
+
+    def check_connection(self) -> bool:
+        """Verify SSH connectivity."""
+        self.on_log(f"Testing SSH connection to {self.config.ssh_target}...")
+        result = self.run_ssh("echo ok", timeout=30)
+        if isinstance(result, subprocess.CompletedProcess) and result.returncode == 0:
+            self.on_log("SSH connection OK")
+            return True
+        self.on_log("[red]SSH connection failed[/]")
+        return False
+
+    def check_gpu(self) -> str | None:
+        """Check for CUDA GPUs on remote. Returns nvidia-smi output or None."""
+        result = self.run_ssh("nvidia-smi --query-gpu=name,memory.total --format=csv,noheader", timeout=30)
+        if isinstance(result, subprocess.CompletedProcess) and result.returncode == 0:
+            gpu_info = result.stdout.strip()
+            self.on_log(f"Remote GPUs: {gpu_info}")
+            return gpu_info
+        self.on_log("[yellow]No GPUs detected on remote (nvidia-smi failed)[/]")
+        return None
+
+    def ensure_obliteratus(self) -> bool:
+        """Install or update obliteratus on the remote if needed."""
+        # Check if already installed
+        check = self.run_ssh(
+            f"{self.config.python} -c \"import obliteratus; print(obliteratus.__version__)\"",
+            timeout=30,
+        )
+        if isinstance(check, subprocess.CompletedProcess) and check.returncode == 0:
+            version = check.stdout.strip()
+            self.on_log(f"Obliteratus {version} already installed on remote")
+            return True
+
+        # Install from PyPI or git
+        self.on_log("Installing obliteratus on remote...")
+        install_cmd = (
+            f"{self.config.python} -m pip install --quiet "
+            f"'obliteratus @ git+https://github.com/EleutherAI/OBLITERATUS.git'"
+        )
+        rc = self.run_ssh(install_cmd, stream=True, timeout=self.config.install_timeout)
+        if rc != 0:
+            self.on_log("[red]Failed to install obliteratus on remote[/]")
+            return False
+
+        self.on_log("Obliteratus installed successfully")
+        return True
+
+    def sync_results_back(self, remote_output_dir: str, local_output_dir: str) -> bool:
+        """Copy results from remote back to local machine via scp."""
+        local_path = Path(local_output_dir)
+        local_path.mkdir(parents=True, exist_ok=True)
+
+        self.on_log(f"Syncing results: {self.config.ssh_target}:{remote_output_dir} -> {local_output_dir}")
+
+        cmd = self._scp_base_cmd() + [
+            f"{self.config.ssh_target}:{remote_output_dir}/",
+            str(local_path),
+        ]
+
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
+        if result.returncode == 0:
+            self.on_log(f"Results synced to {local_output_dir}")
+            return True
+        else:
+            self.on_log(f"[red]SCP failed: {result.stderr}[/]")
+            return False
+
+    def build_obliterate_command(
+        self,
+        model: str,
+        output_dir: str | None = None,
+        method: str = "advanced",
+        device: str = "auto",
+        dtype: str = "float16",
+        quantization: str | None = None,
+        n_directions: int | None = None,
+        direction_method: str | None = None,
+        regularization: float | None = None,
+        refinement_passes: int | None = None,
+        large_model: bool = False,
+        verify_sample_size: int | None = None,
+    ) -> str:
+        """Build the remote obliteratus CLI command."""
+        remote_output = output_dir or f"{self.config.remote_dir}/output/{model.replace('/', '_')}"
+
+        parts = [
+            self.config.python, "-m", "obliteratus",
+            "obliterate", shlex.quote(model),
+            "--output-dir", shlex.quote(remote_output),
+            "--method", method,
+            "--device", device,
+            "--dtype", dtype,
+        ]
+        if quantization:
+            parts.extend(["--quantization", quantization])
+        if n_directions is not None:
+            parts.extend(["--n-directions", str(n_directions)])
+        if direction_method:
+            parts.extend(["--direction-method", direction_method])
+        if regularization is not None:
+            parts.extend(["--regularization", str(regularization)])
+        if refinement_passes is not None:
+            parts.extend(["--refinement-passes", str(refinement_passes)])
+        if large_model:
+            parts.append("--large-model")
+        if verify_sample_size is not None:
+            parts.extend(["--verify-sample-size", str(verify_sample_size)])
+
+        return " ".join(parts)
+
+    def build_run_command(self, remote_config_path: str, output_dir: str | None = None, preset: str | None = None) -> str:
+        """Build remote 'obliteratus run' command."""
+        parts = [
+            self.config.python, "-m", "obliteratus",
+            "run", shlex.quote(remote_config_path),
+        ]
+        if output_dir:
+            parts.extend(["--output-dir", shlex.quote(output_dir)])
+        if preset:
+            parts.extend(["--preset", preset])
+        return " ".join(parts)
+
+    def build_tourney_command(
+        self,
+        model: str,
+        output_dir: str | None = None,
+        device: str = "auto",
+        dtype: str = "float16",
+        quantization: str | None = None,
+        methods: list[str] | None = None,
+        hub_org: str | None = None,
+        hub_repo: str | None = None,
+        dataset: str = "builtin",
+    ) -> str:
+        """Build remote 'obliteratus tourney' command."""
+        remote_output = output_dir or f"{self.config.remote_dir}/tourney/{model.replace('/', '_')}"
+
+        parts = [
+            self.config.python, "-m", "obliteratus",
+            "tourney", shlex.quote(model),
+            "--output-dir", shlex.quote(remote_output),
+            "--device", device,
+            "--dtype", dtype,
+            "--dataset", dataset,
+        ]
+        if quantization:
+            parts.extend(["--quantization", quantization])
+        if hub_org:
+            parts.extend(["--hub-org", hub_org])
+        if hub_repo:
+            parts.extend(["--hub-repo", hub_repo])
+        if methods:
+            parts.extend(["--methods"] + methods)
+        return " ".join(parts)
+
+    def upload_config(self, local_config_path: str) -> str:
+        """Upload a YAML config file to the remote."""
+        remote_path = f"{self.config.remote_dir}/config.yaml"
+        self.run_ssh(f"mkdir -p {shlex.quote(self.config.remote_dir)}")
+
+        cmd = self._scp_base_cmd()
+        # scp uses -P not -p, already handled in _scp_base_cmd
+        cmd += [local_config_path, f"{self.config.ssh_target}:{remote_path}"]
+
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
+        if result.returncode != 0:
+            raise RuntimeError(f"Failed to upload config: {result.stderr}")
+        self.on_log(f"Config uploaded to {remote_path}")
+        return remote_path
+
+    def run_obliterate(
+        self,
+        model: str,
+        local_output_dir: str | None = None,
+        **kwargs,
+    ) -> str | None:
+        """Full remote obliteration: setup, run, sync results.
+
+        Returns local path to results, or None on failure.
+        """
+        # 1. Verify connection
+        if not self.check_connection():
+            return None
+
+        # 2. Check GPUs
+        self.check_gpu()
+
+        # 3. Ensure obliteratus is installed
+        if not self.ensure_obliteratus():
+            return None
+
+        # 4. Create remote working directory
+        self.run_ssh(f"mkdir -p {shlex.quote(self.config.remote_dir)}")
+
+        # 5. Build and run the command
+        remote_output = f"{self.config.remote_dir}/output/{model.replace('/', '_')}"
+        cmd = self.build_obliterate_command(model, output_dir=remote_output, **kwargs)
+        self.on_log(f"Running: {cmd}")
+
+        rc = self.run_ssh(cmd, stream=True)
+        if rc != 0:
+            self.on_log(f"[red]Remote obliteration failed (exit code {rc})[/]")
+            return None
+
+        # 6. Sync results back
+        if self.config.sync_results:
+            local_output = local_output_dir or f"abliterated/{model.replace('/', '_')}"
+            if self.sync_results_back(remote_output, local_output):
+                return local_output
+            return None
+
+        self.on_log(f"Results on remote: {remote_output}")
+        return remote_output
+
+    def run_config(
+        self,
+        local_config_path: str,
+        local_output_dir: str | None = None,
+        preset: str | None = None,
+    ) -> str | None:
+        """Upload config, run study remotely, sync results."""
+        if not self.check_connection():
+            return None
+        self.check_gpu()
+        if not self.ensure_obliteratus():
+            return None
+
+        # Upload config
+        remote_config = self.upload_config(local_config_path)
+
+        # Determine remote output dir
+        remote_output = f"{self.config.remote_dir}/results"
+        cmd = self.build_run_command(remote_config, output_dir=remote_output, preset=preset)
+        self.on_log(f"Running: {cmd}")
+
+        rc = self.run_ssh(cmd, stream=True)
+        if rc != 0:
+            self.on_log(f"[red]Remote run failed (exit code {rc})[/]")
+            return None
+
+        if self.config.sync_results:
+            local_output = local_output_dir or "results"
+            if self.sync_results_back(remote_output, local_output):
+                return local_output
+            return None
+
+        return remote_output
+
+    def run_tourney(
+        self,
+        model: str,
+        local_output_dir: str | None = None,
+        **kwargs,
+    ) -> str | None:
+        """Run tournament remotely, sync results."""
+        if not self.check_connection():
+            return None
+        self.check_gpu()
+        if not self.ensure_obliteratus():
+            return None
+
+        remote_output = f"{self.config.remote_dir}/tourney/{model.replace('/', '_')}"
+        cmd = self.build_tourney_command(model, output_dir=remote_output, **kwargs)
+        self.on_log(f"Running: {cmd}")
+
+        rc = self.run_ssh(cmd, stream=True)
+        if rc != 0:
+            self.on_log(f"[red]Remote tourney failed (exit code {rc})[/]")
+            return None
+
+        if self.config.sync_results:
+            local_output = local_output_dir or f"/tmp/obliteratus_tourney/{model.replace('/', '_')}"
+            if self.sync_results_back(remote_output, local_output):
+                return local_output
+            return None
+
+        return remote_output

From cbdb772eb9c0270d434967ff69f3e9f43c97e906 Mon Sep 17 00:00:00 2001
From: Stella Biderman <stellabiderman@gmail.com>
Date: Thu, 12 Mar 2026 12:35:29 -0400
Subject: [PATCH 02/10] Add multi-GPU support with --gpus flag

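Adds a --gpus flag to the obliterate (and its abliterate alias), run, and
tourney commands for selecting which GPUs to use. The value is either a
comma-separated list of GPU IDs (e.g. 0,1,2,3) or "all". For local runs
the list is validated and exported as CUDA_VISIBLE_DEVICES before any
command is dispatched; "all", or omitting the flag, leaves the environment
untouched. With --remote the value is not applied locally and is passed
to the remote runner's configuration instead. Models are automatically
split across the selected GPUs via accelerate's device_map="auto". Also
adds a gpus field to the remote: section of the YAML config.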
---
 examples/remote_gpu_node.yaml |  6 +++++
 obliteratus/cli.py            | 44 +++++++++++++++++++++++++++++++++++
 obliteratus/config.py         |  1 +
 obliteratus/remote.py         | 28 ++++++++++++++++++----
 4 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/examples/remote_gpu_node.yaml b/examples/remote_gpu_node.yaml
index b6b533c..6d6951f 100644
--- a/examples/remote_gpu_node.yaml
+++ b/examples/remote_gpu_node.yaml
@@ -9,6 +9,11 @@
 #
 # You can also use --remote on any command instead of a YAML section:
 #   obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --remote root@gpu-node --ssh-key ~/.ssh/id_rsa
+#
+# Multi-GPU: Models are automatically split across all available GPUs via
+# accelerate's device_map="auto". Use --gpus or the gpus: field to select
+# specific GPUs:
+#   obliteratus obliterate model --remote root@gpu-node --gpus 0,1,2,3
 
 model:
   name: meta-llama/Llama-3.1-8B-Instruct
@@ -39,3 +44,4 @@ remote:
   remote_dir: /tmp/obliteratus_run
   python: python3
   sync_results: true
+  # gpus: "0,1,2,3"  # uncomment to select specific GPUs (default: all)
diff --git a/obliteratus/cli.py b/obliteratus/cli.py
index b9595ed..aa37c94 100644
--- a/obliteratus/cli.py
+++ b/obliteratus/cli.py
@@ -22,6 +22,19 @@ _BANNER = r"""
 """
 
 
+def _add_gpu_args(parser):
+    """Add --gpus flag for multi-GPU control."""
+    gpu_group = parser.add_argument_group("GPU selection")
+    gpu_group.add_argument(
+        "--gpus", type=str, default=None, metavar="IDS",
+        help=(
+            "Comma-separated GPU IDs to use (e.g. '0,1,2,3' or 'all'). "
+            "Sets CUDA_VISIBLE_DEVICES. By default uses all available GPUs. "
+            "Models are automatically split across selected GPUs via accelerate."
+        ),
+    )
+
+
 def _add_remote_args(parser):
     """Add --remote execution flags to a subcommand parser."""
     remote_group = parser.add_argument_group("remote execution")
@@ -51,6 +64,28 @@ def _add_remote_args(parser):
     )
 
 
+def _apply_gpu_selection(args):
+    """Set CUDA_VISIBLE_DEVICES based on --gpus flag (for local runs only)."""
+    import os
+
+    gpus = getattr(args, "gpus", None)
+    if gpus is None or getattr(args, "remote", None):
+        return  # skip for remote runs (handled by remote runner)
+
+    if gpus.lower() == "all":
+        return  # use all GPUs (default behavior)
+
+    # Validate: should be comma-separated integers
+    try:
+        gpu_ids = [int(g.strip()) for g in gpus.split(",")]
+    except ValueError:
+        console.print(f"[red]Invalid --gpus value: {gpus!r}. Expected comma-separated integers or 'all'.[/]")
+        raise SystemExit(1)
+
+    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in gpu_ids)
+    console.print(f"[dim]Using GPUs: {gpu_ids} (CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']})[/dim]")
+
+
 def main(argv: list[str] | None = None):
     console.print(_BANNER)
     parser = argparse.ArgumentParser(
@@ -69,6 +104,7 @@ def main(argv: list[str] | None = None):
         default=None,
         help="Apply a preset (e.g. quick, full, attention, jailbreak, guardrail)",
     )
+    _add_gpu_args(run_parser)
     _add_remote_args(run_parser)
 
     # --- info ---
@@ -174,10 +210,12 @@ def main(argv: list[str] | None = None):
         help="One-click: remove refusal directions from a model (SOTA multi-technique)",
     )
     _add_obliterate_args(abl_parser)
+    _add_gpu_args(abl_parser)
     _add_remote_args(abl_parser)
     # Backward-compat alias (hidden from help)
     abl_alias = subparsers.add_parser("abliterate", help=argparse.SUPPRESS)
     _add_obliterate_args(abl_alias)
+    _add_gpu_args(abl_alias)
     _add_remote_args(abl_alias)
 
     # --- report ---
@@ -212,6 +250,7 @@ def main(argv: list[str] | None = None):
         "--methods", type=str, nargs="+", default=None,
         help="Override: only run these methods (space-separated)",
     )
+    _add_gpu_args(tourney_parser)
     _add_remote_args(tourney_parser)
 
     # --- recommend ---
@@ -229,6 +268,9 @@ def main(argv: list[str] | None = None):
 
     args = parser.parse_args(argv)
 
+    # Apply GPU selection early (before any CUDA init)
+    _apply_gpu_selection(args)
+
     if args.command == "run":
         if getattr(args, "remote", None):
             _cmd_remote_run(args)
@@ -369,6 +411,7 @@ def _cmd_run(args):
             remote_dir=config.remote.remote_dir,
             python=config.remote.python,
             sync_results=config.remote.sync_results,
+            gpus=config.remote.gpus,
         )
         runner = RemoteRunner(rc)
         result_path = runner.run_config(
@@ -733,6 +776,7 @@ def _make_remote_runner(args):
         remote_dir=args.remote_dir,
         python=args.remote_python,
         sync_results=not args.no_sync,
+        gpus=getattr(args, "gpus", None),
     )
     return RemoteRunner(rc)
 
diff --git a/obliteratus/config.py b/obliteratus/config.py
index 1a31e31..e12d837 100644
--- a/obliteratus/config.py
+++ b/obliteratus/config.py
@@ -46,6 +46,7 @@ class RemoteConfig:
     remote_dir: str = "/tmp/obliteratus_run"
     python: str = "python3"
     sync_results: bool = True
+    gpus: str | None = None  # comma-separated GPU IDs or "all"
 
 
 @dataclass
diff --git a/obliteratus/remote.py b/obliteratus/remote.py
index 9659e04..57c7be7 100644
--- a/obliteratus/remote.py
+++ b/obliteratus/remote.py
@@ -45,6 +45,7 @@ class RemoteConfig:
     install_timeout: int = 600  # seconds
     python: str = "python3"  # remote python binary
     sync_results: bool = True
+    gpus: str | None = None  # comma-separated GPU IDs or "all"
 
     @property
     def ssh_target(self) -> str:
@@ -151,14 +152,31 @@ class RemoteRunner:
 
     def check_gpu(self) -> str | None:
         """Check for CUDA GPUs on remote. Returns nvidia-smi output or None."""
-        result = self.run_ssh("nvidia-smi --query-gpu=name,memory.total --format=csv,noheader", timeout=30)
+        result = self.run_ssh(
+            "nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv,noheader",
+            timeout=30,
+        )
         if isinstance(result, subprocess.CompletedProcess) and result.returncode == 0:
             gpu_info = result.stdout.strip()
-            self.on_log(f"Remote GPUs: {gpu_info}")
+            lines = gpu_info.split("\n")
+            self.on_log(f"Remote GPUs ({len(lines)} detected):")
+            for line in lines:
+                self.on_log(f"  {line.strip()}")
+            if self.config.gpus and self.config.gpus.lower() != "all":
+                self.on_log(f"  Selected GPUs: {self.config.gpus}")
+            else:
+                self.on_log(f"  Using: all {len(lines)} GPUs")
             return gpu_info
         self.on_log("[yellow]No GPUs detected on remote (nvidia-smi failed)[/]")
         return None
 
+    def _env_prefix(self) -> str:
+        """Return a ``VAR=value `` shell prefix for remote commands ("" when no specific GPUs are selected)."""
+        gpus = self.config.gpus
+        if not gpus or gpus.lower() == "all":
+            return ""
+        return f"CUDA_VISIBLE_DEVICES={shlex.quote(gpus)} "  # quoted: gpus is unvalidated user input headed for a remote shell
+
     def ensure_obliteratus(self) -> bool:
         """Install or update obliteratus on the remote if needed."""
         # Check if already installed
@@ -224,7 +242,7 @@ class RemoteRunner:
         remote_output = output_dir or f"{self.config.remote_dir}/output/{model.replace('/', '_')}"
 
         parts = [
-            self.config.python, "-m", "obliteratus",
+            self._env_prefix() + self.config.python, "-m", "obliteratus",
             "obliterate", shlex.quote(model),
             "--output-dir", shlex.quote(remote_output),
             "--method", method,
@@ -251,7 +269,7 @@ class RemoteRunner:
     def build_run_command(self, remote_config_path: str, output_dir: str | None = None, preset: str | None = None) -> str:
         """Build remote 'obliteratus run' command."""
         parts = [
-            self.config.python, "-m", "obliteratus",
+            self._env_prefix() + self.config.python, "-m", "obliteratus",
             "run", shlex.quote(remote_config_path),
         ]
         if output_dir:
@@ -276,7 +294,7 @@ class RemoteRunner:
         remote_output = output_dir or f"{self.config.remote_dir}/tourney/{model.replace('/', '_')}"
 
         parts = [
-            self.config.python, "-m", "obliteratus",
+            self._env_prefix() + self.config.python, "-m", "obliteratus",
             "tourney", shlex.quote(model),
             "--output-dir", shlex.quote(remote_output),
             "--device", device,

From b23d98982440a3d75c3eef48cb8dc2a1a16e185a Mon Sep 17 00:00:00 2001
From: Stella Biderman <stellabiderman@gmail.com>
Date: Thu, 12 Mar 2026 12:56:03 -0400
Subject: [PATCH 03/10] Add __main__.py and fix remote pip install command

---
 obliteratus/__main__.py | 5 +++++
 obliteratus/remote.py   | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)
 create mode 100644 obliteratus/__main__.py

diff --git a/obliteratus/__main__.py b/obliteratus/__main__.py
new file mode 100644
index 0000000..995f990
--- /dev/null
+++ b/obliteratus/__main__.py
@@ -0,0 +1,5 @@
+"""Allow running obliteratus as ``python -m obliteratus``."""
+
+from obliteratus.cli import main
+
+main()
diff --git a/obliteratus/remote.py b/obliteratus/remote.py
index 57c7be7..960c7ad 100644
--- a/obliteratus/remote.py
+++ b/obliteratus/remote.py
@@ -193,7 +193,7 @@ class RemoteRunner:
         self.on_log("Installing obliteratus on remote...")
         install_cmd = (
             f"{self.config.python} -m pip install --quiet "
-            f"'obliteratus @ git+https://github.com/EleutherAI/OBLITERATUS.git'"
+            f"git+https://github.com/EleutherAI/OBLITERATUS.git"
         )
         rc = self.run_ssh(install_cmd, stream=True, timeout=self.config.install_timeout)
         if rc != 0:

From a634950abd9b3ec67387d875843220a6f21111e0 Mon Sep 17 00:00:00 2001
From: Stella Biderman <stellabiderman@gmail.com>
Date: Fri, 13 Mar 2026 01:02:18 -0400
Subject: [PATCH 04/10] Update remote install URL to StellaAthena fork

---
 obliteratus/remote.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/obliteratus/remote.py b/obliteratus/remote.py
index 960c7ad..4254ad3 100644
--- a/obliteratus/remote.py
+++ b/obliteratus/remote.py
@@ -193,7 +193,7 @@ class RemoteRunner:
         self.on_log("Installing obliteratus on remote...")
         install_cmd = (
             f"{self.config.python} -m pip install --quiet "
-            f"git+https://github.com/EleutherAI/OBLITERATUS.git"
+            f"git+https://github.com/StellaAthena/OBLITERATUS.git"
         )
         rc = self.run_ssh(install_cmd, stream=True, timeout=self.config.install_timeout)
         if rc != 0:

From 1a6e2577bb4d22f139730b6aa5d425292add1a1c Mon Sep 17 00:00:00 2001
From: Stella Biderman <stellabiderman@gmail.com>
Date: Fri, 13 Mar 2026 01:24:31 -0400
Subject: [PATCH 05/10] Add data parallel support for PROBE stage

When --data-parallel is passed and the model fits on a single GPU,
wraps it with nn.DataParallel to split prompt batches across all
available GPUs during activation collection. Batch size scales by
GPU count. Hooks already move activations to CPU so they work
correctly across replicas.
---
 README.md                 | 71 +++++++++++++++++++++++++++++++++++++++
 obliteratus/abliterate.py | 35 +++++++++++++++++++
 obliteratus/cli.py        | 14 +++++++-
 obliteratus/remote.py     |  3 ++
 4 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 045ae5f..0ae7bd8 100644
--- a/README.md
+++ b/README.md
@@ -197,6 +197,77 @@ obliteratus aggregate --format summary
 obliteratus aggregate --format latex --metric refusal_rate --min-runs 3
 ```
 
+#### Multi-GPU support
+
+OBLITERATUS supports two multi-GPU modes:
+
+**Model parallelism** (automatic): Models that exceed single-GPU VRAM are automatically sharded across all available GPUs using accelerate's `device_map="auto"`, which places different layers on different devices. Only one GPU computes at a time during the forward pass — the benefit is fitting larger models (e.g., a 117B model across 8x A100-80GB), not faster processing.
+
+**Data parallelism** (`--data-parallel`): When the model fits on a single GPU, `--data-parallel` replicates it across all available GPUs and splits the prompt batches across them during the PROBE stage (activation collection). This provides a near-linear speedup for the most time-consuming pipeline stage. If the model is already sharded via `device_map="auto"`, data parallelism is not available (the model can't be replicated).
+
+```bash
+# Data parallel: replicate model across GPUs, split batches (faster PROBE stage)
+obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --data-parallel
+
+# Combine with GPU selection
+obliteratus obliterate mistralai/Mistral-7B-Instruct-v0.3 --data-parallel --gpus 0,1,2,3
+```
+
+To select specific GPUs instead of using all available:
+
+```bash
+# Use only GPUs 0, 1, 2, 3
+obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct --gpus 0,1,2,3
+
+# Works with any command
+obliteratus run my_study.yaml --gpus 0,1
+obliteratus tourney some-model --gpus 2,3
+```
+
+The `--gpus` flag sets `CUDA_VISIBLE_DEVICES` before CUDA initialization. When omitted, all GPUs are used.
+
+#### Remote execution via SSH
+
+Run any obliteratus command on a remote GPU machine from your laptop. Obliteratus handles SSH connectivity, auto-installation on the remote, GPU detection, live log streaming, and copying results back via SCP.
+
+```bash
+# Abliterate a model on a remote GPU node
+obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct \
+    --remote user@gpu-node \
+    --ssh-key ~/.ssh/id_rsa
+
+# All remote flags
+obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct \
+    --remote root@gpu-node \
+    --ssh-key ~/.ssh/id_rsa \
+    --ssh-port 22 \
+    --remote-dir /tmp/obliteratus_run \
+    --remote-python /opt/conda/bin/python3 \
+    --gpus 0,1,2,3 \
+    --no-sync  # keep results on remote, don't copy back
+
+# Run a YAML study remotely
+obliteratus run my_study.yaml --remote root@gpu-node --ssh-key ~/.ssh/id_rsa
+
+# Tournaments work too
+obliteratus tourney meta-llama/Llama-3.1-8B-Instruct --remote root@gpu-node
+```
+
+Remote execution can also be configured via the YAML `remote:` section (see `examples/remote_gpu_node.yaml`):
+
+```yaml
+remote:
+  host: gpu-node.example.com
+  user: root
+  ssh_key: ~/.ssh/id_rsa
+  remote_dir: /tmp/obliteratus_run
+  python: python3
+  sync_results: true
+  gpus: "0,1,2,3"
+```
+
+The remote machine needs CUDA-capable GPUs and a Python environment. Obliteratus will be auto-installed from GitHub if not already present.
+
 ### 5. Python API (full programmatic control)
 
 For researchers who want to integrate OBLITERATUS into their own pipelines:
diff --git a/obliteratus/abliterate.py b/obliteratus/abliterate.py
index 9f56395..ef5efcf 100644
--- a/obliteratus/abliterate.py
+++ b/obliteratus/abliterate.py
@@ -647,6 +647,8 @@ class AbliterationPipeline:
         max_seq_length: int | None = None,
         # Verify stage sample size
         verify_sample_size: int | None = None,
+        # Data parallelism
+        data_parallel: bool = False,
         on_stage: Callable[[StageResult], None] | None = None,
         on_log: Callable[[str], None] | None = None,
     ):
@@ -750,6 +752,7 @@ class AbliterationPipeline:
         # refusal rate measurement.  Default 30 gives ~3.3% resolution;
         # increase for tighter confidence intervals (reviewer feedback).
         self.verify_sample_size = verify_sample_size if verify_sample_size is not None else 30
+        self.data_parallel = data_parallel
 
         # Large model mode: conservative defaults for 120B+ models.
         # Reduces memory footprint by limiting SAE features, directions,
@@ -828,6 +831,22 @@ class AbliterationPipeline:
         """Release unused GPU/accelerator memory between pipeline stages."""
         dev.free_gpu_memory()
 
+    def _can_data_parallel(self) -> bool:
+        """Check if data parallelism is feasible.
+
+        Returns True when:
+        - data_parallel was requested
+        - CUDA is available with >1 GPUs
+        - The model is NOT already sharded across devices (device_map="auto")
+        """
+        if not self.data_parallel:
+            return False
+        if not torch.cuda.is_available() or torch.cuda.device_count() < 2:
+            return False
+        if self.handle and hasattr(self.handle.model, "hf_device_map"):
+            return False  # already sharded, can't replicate
+        return True
+
     @staticmethod
     def _get_model_device(model: nn.Module) -> torch.device:
         """Return the correct input device for a model.
@@ -1452,8 +1471,24 @@ class AbliterationPipeline:
 
         device = self._get_model_device(model)
 
+        # ── Data parallelism: wrap model to split batches across GPUs ──
+        # DataParallel replicates the model on each GPU and scatters the
+        # input batch.  Hooks fire on each replica (shared via shallow copy
+        # of _forward_hooks), and since they .detach().cpu().float() the
+        # activations, all results land in the same `activations` dict on
+        # CPU.  list.append is GIL-protected so thread-safe.  Order within
+        # a batch is nondeterministic across replicas, but that's fine —
+        # we only compute means and SVD over the collected activations.
+        use_dp = self._can_data_parallel()
+        n_gpus = torch.cuda.device_count() if use_dp else 1
+        if use_dp:
+            model = nn.DataParallel(model)
+            self.log(f"  Data parallel: splitting batches across {n_gpus} GPUs")
+
         # Batch prompts for throughput — hooks unbatch per-prompt activations
         batch_size = 16 if free_gb > _tight_gb else 8 if free_gb > _low_gb else 1
+        if use_dp:
+            batch_size *= n_gpus
         # Left-pad so position -1 is always the last real token in every batch element
         orig_padding_side = getattr(tokenizer, "padding_side", "right")
         if batch_size > 1:
diff --git a/obliteratus/cli.py b/obliteratus/cli.py
index aa37c94..d8d2189 100644
--- a/obliteratus/cli.py
+++ b/obliteratus/cli.py
@@ -23,7 +23,7 @@ _BANNER = r"""
 
 
 def _add_gpu_args(parser):
-    """Add --gpus flag for multi-GPU control."""
+    """Add --gpus and --data-parallel flags for multi-GPU control."""
     gpu_group = parser.add_argument_group("GPU selection")
     gpu_group.add_argument(
         "--gpus", type=str, default=None, metavar="IDS",
@@ -33,6 +33,15 @@ def _add_gpu_args(parser):
             "Models are automatically split across selected GPUs via accelerate."
         ),
     )
+    gpu_group.add_argument(
+        "--data-parallel", action="store_true", default=False,
+        help=(
+            "Use data parallelism to split prompt batches across GPUs during "
+            "activation collection (PROBE stage). Only effective when the model "
+            "fits on a single GPU and multiple GPUs are available. For models "
+            "already sharded across GPUs (device_map='auto'), this has no effect."
+        ),
+    )
 
 
 def _add_remote_args(parser):
@@ -714,6 +723,7 @@ def _cmd_abliterate(args):
         quantization=args.quantization,
         large_model_mode=getattr(args, "large_model", False),
         verify_sample_size=getattr(args, "verify_sample_size", None),
+        data_parallel=getattr(args, "data_parallel", False),
         on_stage=on_stage,
         on_log=on_log,
     )
@@ -807,6 +817,8 @@ def _cmd_remote_abliterate(args):
         kwargs["large_model"] = True
     if getattr(args, "verify_sample_size", None) is not None:
         kwargs["verify_sample_size"] = args.verify_sample_size
+    if getattr(args, "data_parallel", False):
+        kwargs["data_parallel"] = True
 
     result_path = runner.run_obliterate(
         model=args.model,
diff --git a/obliteratus/remote.py b/obliteratus/remote.py
index 4254ad3..c2702ed 100644
--- a/obliteratus/remote.py
+++ b/obliteratus/remote.py
@@ -237,6 +237,7 @@ class RemoteRunner:
         refinement_passes: int | None = None,
         large_model: bool = False,
         verify_sample_size: int | None = None,
+        data_parallel: bool = False,
     ) -> str:
         """Build the remote obliteratus CLI command."""
         remote_output = output_dir or f"{self.config.remote_dir}/output/{model.replace('/', '_')}"
@@ -263,6 +264,8 @@ class RemoteRunner:
             parts.append("--large-model")
         if verify_sample_size is not None:
             parts.extend(["--verify-sample-size", str(verify_sample_size)])
+        if data_parallel:
+            parts.append("--data-parallel")
 
         return " ".join(parts)
 

From a2bb748f1be44f73ddd15247a2c58f9bf4820a3b Mon Sep 17 00:00:00 2001
From: Stella Biderman <stellabiderman@gmail.com>
Date: Fri, 13 Mar 2026 16:54:31 -0400
Subject: [PATCH 06/10] Revert "Add data parallel support for PROBE stage"

This reverts commit 1a6e2577bb4d22f139730b6aa5d425292add1a1c.
---
 README.md                 | 71 ---------------------------------------
 obliteratus/abliterate.py | 35 -------------------
 obliteratus/cli.py        | 14 +-------
 obliteratus/remote.py     |  3 --
 4 files changed, 1 insertion(+), 122 deletions(-)

diff --git a/README.md b/README.md
index 0ae7bd8..045ae5f 100644
--- a/README.md
+++ b/README.md
@@ -197,77 +197,6 @@ obliteratus aggregate --format summary
 obliteratus aggregate --format latex --metric refusal_rate --min-runs 3
 ```
 
-#### Multi-GPU support
-
-OBLITERATUS supports two multi-GPU modes:
-
-**Model parallelism** (automatic): Models that exceed single-GPU VRAM are automatically sharded across all available GPUs using accelerate's `device_map="auto"`, which places different layers on different devices. Only one GPU computes at a time during the forward pass — the benefit is fitting larger models (e.g., a 117B model across 8x A100-80GB), not faster processing.
-
-**Data parallelism** (`--data-parallel`): When the model fits on a single GPU, `--data-parallel` replicates it across all available GPUs and splits the prompt batches across them during the PROBE stage (activation collection). This provides a near-linear speedup for the most time-consuming pipeline stage. If the model is already sharded via `device_map="auto"`, data parallelism is not available (the model can't be replicated).
-
-```bash
-# Data parallel: replicate model across GPUs, split batches (faster PROBE stage)
-obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --data-parallel
-
-# Combine with GPU selection
-obliteratus obliterate mistralai/Mistral-7B-Instruct-v0.3 --data-parallel --gpus 0,1,2,3
-```
-
-To select specific GPUs instead of using all available:
-
-```bash
-# Use only GPUs 0, 1, 2, 3
-obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct --gpus 0,1,2,3
-
-# Works with any command
-obliteratus run my_study.yaml --gpus 0,1
-obliteratus tourney some-model --gpus 2,3
-```
-
-The `--gpus` flag sets `CUDA_VISIBLE_DEVICES` before CUDA initialization. When omitted, all GPUs are used.
-
-#### Remote execution via SSH
-
-Run any obliteratus command on a remote GPU machine from your laptop. Obliteratus handles SSH connectivity, auto-installation on the remote, GPU detection, live log streaming, and copying results back via SCP.
-
-```bash
-# Abliterate a model on a remote GPU node
-obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct \
-    --remote user@gpu-node \
-    --ssh-key ~/.ssh/id_rsa
-
-# All remote flags
-obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct \
-    --remote root@gpu-node \
-    --ssh-key ~/.ssh/id_rsa \
-    --ssh-port 22 \
-    --remote-dir /tmp/obliteratus_run \
-    --remote-python /opt/conda/bin/python3 \
-    --gpus 0,1,2,3 \
-    --no-sync  # keep results on remote, don't copy back
-
-# Run a YAML study remotely
-obliteratus run my_study.yaml --remote root@gpu-node --ssh-key ~/.ssh/id_rsa
-
-# Tournaments work too
-obliteratus tourney meta-llama/Llama-3.1-8B-Instruct --remote root@gpu-node
-```
-
-Remote execution can also be configured via the YAML `remote:` section (see `examples/remote_gpu_node.yaml`):
-
-```yaml
-remote:
-  host: gpu-node.example.com
-  user: root
-  ssh_key: ~/.ssh/id_rsa
-  remote_dir: /tmp/obliteratus_run
-  python: python3
-  sync_results: true
-  gpus: "0,1,2,3"
-```
-
-The remote machine needs CUDA-capable GPUs and a Python environment. Obliteratus will be auto-installed from GitHub if not already present.
-
 ### 5. Python API (full programmatic control)
 
 For researchers who want to integrate OBLITERATUS into their own pipelines:
diff --git a/obliteratus/abliterate.py b/obliteratus/abliterate.py
index ef5efcf..9f56395 100644
--- a/obliteratus/abliterate.py
+++ b/obliteratus/abliterate.py
@@ -647,8 +647,6 @@ class AbliterationPipeline:
         max_seq_length: int | None = None,
         # Verify stage sample size
         verify_sample_size: int | None = None,
-        # Data parallelism
-        data_parallel: bool = False,
         on_stage: Callable[[StageResult], None] | None = None,
         on_log: Callable[[str], None] | None = None,
     ):
@@ -752,7 +750,6 @@ class AbliterationPipeline:
         # refusal rate measurement.  Default 30 gives ~3.3% resolution;
         # increase for tighter confidence intervals (reviewer feedback).
         self.verify_sample_size = verify_sample_size if verify_sample_size is not None else 30
-        self.data_parallel = data_parallel
 
         # Large model mode: conservative defaults for 120B+ models.
         # Reduces memory footprint by limiting SAE features, directions,
@@ -831,22 +828,6 @@ class AbliterationPipeline:
         """Release unused GPU/accelerator memory between pipeline stages."""
         dev.free_gpu_memory()
 
-    def _can_data_parallel(self) -> bool:
-        """Check if data parallelism is feasible.
-
-        Returns True when:
-        - data_parallel was requested
-        - CUDA is available with >1 GPUs
-        - The model is NOT already sharded across devices (device_map="auto")
-        """
-        if not self.data_parallel:
-            return False
-        if not torch.cuda.is_available() or torch.cuda.device_count() < 2:
-            return False
-        if self.handle and hasattr(self.handle.model, "hf_device_map"):
-            return False  # already sharded, can't replicate
-        return True
-
     @staticmethod
     def _get_model_device(model: nn.Module) -> torch.device:
         """Return the correct input device for a model.
@@ -1471,24 +1452,8 @@ class AbliterationPipeline:
 
         device = self._get_model_device(model)
 
-        # ── Data parallelism: wrap model to split batches across GPUs ──
-        # DataParallel replicates the model on each GPU and scatters the
-        # input batch.  Hooks fire on each replica (shared via shallow copy
-        # of _forward_hooks), and since they .detach().cpu().float() the
-        # activations, all results land in the same `activations` dict on
-        # CPU.  list.append is GIL-protected so thread-safe.  Order within
-        # a batch is nondeterministic across replicas, but that's fine —
-        # we only compute means and SVD over the collected activations.
-        use_dp = self._can_data_parallel()
-        n_gpus = torch.cuda.device_count() if use_dp else 1
-        if use_dp:
-            model = nn.DataParallel(model)
-            self.log(f"  Data parallel: splitting batches across {n_gpus} GPUs")
-
         # Batch prompts for throughput — hooks unbatch per-prompt activations
         batch_size = 16 if free_gb > _tight_gb else 8 if free_gb > _low_gb else 1
-        if use_dp:
-            batch_size *= n_gpus
         # Left-pad so position -1 is always the last real token in every batch element
         orig_padding_side = getattr(tokenizer, "padding_side", "right")
         if batch_size > 1:
diff --git a/obliteratus/cli.py b/obliteratus/cli.py
index d8d2189..aa37c94 100644
--- a/obliteratus/cli.py
+++ b/obliteratus/cli.py
@@ -23,7 +23,7 @@ _BANNER = r"""
 
 
 def _add_gpu_args(parser):
-    """Add --gpus and --data-parallel flags for multi-GPU control."""
+    """Add --gpus flag for multi-GPU control."""
     gpu_group = parser.add_argument_group("GPU selection")
     gpu_group.add_argument(
         "--gpus", type=str, default=None, metavar="IDS",
@@ -33,15 +33,6 @@ def _add_gpu_args(parser):
             "Models are automatically split across selected GPUs via accelerate."
         ),
     )
-    gpu_group.add_argument(
-        "--data-parallel", action="store_true", default=False,
-        help=(
-            "Use data parallelism to split prompt batches across GPUs during "
-            "activation collection (PROBE stage). Only effective when the model "
-            "fits on a single GPU and multiple GPUs are available. For models "
-            "already sharded across GPUs (device_map='auto'), this has no effect."
-        ),
-    )
 
 
 def _add_remote_args(parser):
@@ -723,7 +714,6 @@ def _cmd_abliterate(args):
         quantization=args.quantization,
         large_model_mode=getattr(args, "large_model", False),
         verify_sample_size=getattr(args, "verify_sample_size", None),
-        data_parallel=getattr(args, "data_parallel", False),
         on_stage=on_stage,
         on_log=on_log,
     )
@@ -817,8 +807,6 @@ def _cmd_remote_abliterate(args):
         kwargs["large_model"] = True
     if getattr(args, "verify_sample_size", None) is not None:
         kwargs["verify_sample_size"] = args.verify_sample_size
-    if getattr(args, "data_parallel", False):
-        kwargs["data_parallel"] = True
 
     result_path = runner.run_obliterate(
         model=args.model,
diff --git a/obliteratus/remote.py b/obliteratus/remote.py
index c2702ed..4254ad3 100644
--- a/obliteratus/remote.py
+++ b/obliteratus/remote.py
@@ -237,7 +237,6 @@ class RemoteRunner:
         refinement_passes: int | None = None,
         large_model: bool = False,
         verify_sample_size: int | None = None,
-        data_parallel: bool = False,
     ) -> str:
         """Build the remote obliteratus CLI command."""
         remote_output = output_dir or f"{self.config.remote_dir}/output/{model.replace('/', '_')}"
@@ -264,8 +263,6 @@ class RemoteRunner:
             parts.append("--large-model")
         if verify_sample_size is not None:
             parts.extend(["--verify-sample-size", str(verify_sample_size)])
-        if data_parallel:
-            parts.append("--data-parallel")
 
         return " ".join(parts)
 

From 51f621d0a2b8719a9c6ff569ff0e5433dee28d56 Mon Sep 17 00:00:00 2001
From: Stella Biderman <stellabiderman@gmail.com>
Date: Fri, 13 Mar 2026 17:13:50 -0400
Subject: [PATCH 07/10] Save model snapshot to CPU to avoid OOM on multi-GPU
 setups

The snapshot() deepcopy was cloning tensors on their original GPU
devices, doubling VRAM usage. For a 234GB model sharded across 6
A100-80GB GPUs (~39GB each), this left no room for the copy.

Now snapshot stores tensors on CPU and restore() moves them back
to each parameter's current device.
---
 obliteratus/models/loader.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/obliteratus/models/loader.py b/obliteratus/models/loader.py
index 5e98115..ed7af53 100644
--- a/obliteratus/models/loader.py
+++ b/obliteratus/models/loader.py
@@ -312,14 +312,27 @@ class ModelHandle:
         )
 
     def snapshot(self):
-        """Save a deep copy of the model state dict so we can restore after ablation."""
-        self._original_state = copy.deepcopy(self.model.state_dict())
+        """Save a copy of the model state dict so we can restore after ablation.
+
+        Tensors are moved to CPU to avoid doubling GPU memory usage on
+        multi-GPU (device_map) setups.
+        """
+        self._original_state = {k: v.cpu().clone() for k, v in self.model.state_dict().items()}
 
     def restore(self):
-        """Restore the model to the snapshot state."""
+        """Restore the model to the snapshot state.
+
+        Copies the CPU-saved tensors into the model's existing parameters.
+        """
         if self._original_state is None:
             raise RuntimeError("No snapshot to restore — call .snapshot() first.")
-        self.model.load_state_dict(self._original_state)
+        # Hand the CPU snapshot to load_state_dict directly: it copies each
+        # tensor into the existing parameter in place, so the host-to-device
+        # transfer happens one tensor at a time.  Pre-building a dict of
+        # device-side copies would hold a second full model in VRAM and
+        # reintroduce the OOM that snapshotting to CPU is meant to avoid.
+        # NOTE(review): confirm with accelerate-offloaded (meta) parameters.
+        self.model.load_state_dict(self._original_state)
 
     def cleanup(self):
         """Remove temporary offload directory if one was auto-created."""

From c723da02c84194f14078675d2b777996076ff2df Mon Sep 17 00:00:00 2001
From: Stella Biderman <stellabiderman@gmail.com>
Date: Mon, 16 Mar 2026 14:39:22 -0400
Subject: [PATCH 08/10] Document multi-GPU parallelism, benchmarks, and remote
 SSH execution

Add a comprehensive section covering:
- How model sharding (pipeline parallelism) works and its limitations
- GPU selection via --gpus flag
- Pipeline parallel benchmarks on GPT-OSS-120B across 3-8 A100-80GB GPUs
- Stage-by-stage timing breakdown
- When data parallelism helps (and when it doesn't)
- Remote SSH execution with CLI and YAML examples
- Decision table for choosing the right setup
---
 README.md | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)

diff --git a/README.md b/README.md
index 045ae5f..008cde1 100644
--- a/README.md
+++ b/README.md
@@ -415,6 +415,138 @@ Includes pre-liberated variants (Dolphin, Hermes, WhiteRabbitNeo) for A/B compar
 obliteratus models
 ```
 
+## Multi-GPU and remote execution
+
+OBLITERATUS automatically shards models across multiple GPUs when they don't fit on a single card. It also supports remote execution over SSH, so you can run the pipeline on a GPU server from your laptop.
+
+### How model sharding works
+
+When you have multiple GPUs, OBLITERATUS uses accelerate's `device_map="auto"` to split the model's layers across all available GPUs. This is **naive pipeline parallelism** — layers are distributed evenly, but only one GPU computes at a time as activations flow sequentially through the layer stack. The other GPUs hold their assigned layers in memory but are idle until their turn.
+
+This means multi-GPU sharding is a **memory solution, not a speed solution**. It lets you run models that don't fit on one GPU, but it won't make small models run faster. In fact, more GPUs can be *slower* due to inter-GPU data transfer overhead at layer boundaries.
+
+### Selecting GPUs
+
+Use `--gpus` to control which GPUs are used:
+
+```bash
+# Use all available GPUs (default)
+obliteratus obliterate bigmodel/200B --gpus all
+
+# Use only GPUs 0-3
+obliteratus obliterate bigmodel/200B --gpus 0,1,2,3
+
+# Use a specific pair
+obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct --gpus 2,5
+```
+
+This sets `CUDA_VISIBLE_DEVICES` before CUDA initializes. The model is then sharded across the selected GPUs.
+
+### Pipeline parallel benchmarks
+
+We benchmarked the full abliteration pipeline on `openai/gpt-oss-120b` (117B MoE, ~234 GB in bf16) across varying numbers of A100-80GB GPUs:
+
+| GPUs | Total time | VRAM/GPU | Notes |
+|------|-----------|----------|-------|
+| 3 | **FAILED** | ~78 GB | Not enough headroom for activations; some layers offloaded to CPU as meta tensors, crashes during EXCISE |
+| 4 | **615s** (10m15s) | ~58 GB | Fastest. Fewest inter-GPU transfers. Snapshot auto-skipped (insufficient free VRAM) |
+| 5 | 763s (12m43s) | ~47 GB | +24% slower than 4 GPUs |
+| 6 | 766s (12m46s) | ~39 GB | +25% slower than 4 GPUs |
+| 8 | 633s (10m33s) | ~29 GB | +3% slower than 4 GPUs. Ran CPU-side state dict snapshot (adds ~20s) |
+
+Stage breakdown (approximately constant across GPU counts):
+
+| Stage | Time | Bottleneck |
+|-------|------|-----------|
+| SUMMON (load) | ~11s | Disk I/O (model cached locally) |
+| PROBE (activations) | ~20s | Forward passes through sharded model |
+| DISTILL + EXCISE | ~30s | SVD + weight projection (CPU-bound) |
+| VERIFY | ~210s | Forward passes on validation prompts |
+| REBIRTH (save) | ~350s | Writing 234 GB to disk |
+
+Key findings:
+
+- **Use the minimum number of GPUs that fits your model.** Extra GPUs only add cross-device transfer overhead. 4 GPUs was faster than 8 for GPT-OSS-120B.
+- **The pipeline is I/O-dominated for large models.** VERIFY and REBIRTH together account for ~90% of wall time. The actual compute (PROBE, DISTILL, EXCISE) is fast regardless of GPU count.
+- **Leave headroom.** The model needs VRAM beyond just its parameter storage — activation tensors, KV cache, and intermediate computations during PROBE and VERIFY all consume memory. 3x A100-80GB (240 GB) was not enough for a 234 GB model.
+- **Pipeline parallelism doesn't help compute-bound stages.** Since only one GPU computes at a time, doubling GPUs doesn't halve PROBE or VERIFY time. It only enables fitting larger models.
+
+### When you actually need data parallelism
+
+For models that fit on a single GPU with room to spare, the PROBE stage (which runs 1024 forward passes to collect activations) is the main computational bottleneck. Pipeline parallelism doesn't help here — it still processes one prompt at a time through the full layer stack.
+
+True data parallelism (replicating the model and splitting prompts across GPUs) can speed up PROBE, but it requires enough VRAM to hold a full copy of the model on each GPU. An experimental pre-replicated data parallel implementation is available on the `data-parallel-prereplication` branch:
+
+```bash
+git checkout data-parallel-prereplication
+obliteratus obliterate EleutherAI/pythia-12b --data-parallel
+```
+
+This deep-copies the model to each GPU once, then distributes prompt batches across replicas using a thread pool. Benchmarks on Pythia 12B (24 GB model, 8x A100-80GB):
+
+| Mode | PROBE time | Notes |
+|------|-----------|-------|
+| Single GPU | 7.1s | Baseline |
+| Pre-replicated DP (8 GPUs) | 7.7s | Near parity — PROBE is too fast at this scale for parallelism to help |
+
+Data parallelism becomes more valuable as total PROBE time (prompt count × per-forward-pass cost) grows relative to the fixed cost of replicating the model. For most models, the replication overhead exceeds the time saved.
+
+### Remote execution over SSH
+
+Run the full pipeline on a remote GPU node from your local machine. OBLITERATUS handles the SSH connection, installs itself on the remote host if needed, streams logs in real time, and copies results back when done.
+
+```bash
+# Basic remote run
+obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct \
+    --remote user@gpu-node
+
+# With SSH key and custom options
+obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct \
+    --remote root@10.0.0.5 \
+    --ssh-key ~/.ssh/id_rsa \
+    --ssh-port 2222 \
+    --remote-dir /data/obliteratus \
+    --remote-python python3.11
+
+# Don't copy results back (keep on remote only)
+obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct \
+    --remote user@gpu-node --no-sync
+```
+
+Remote execution also works with `obliteratus run` (YAML configs) and `obliteratus tourney` (method comparison). You can specify remote settings in YAML:
+
+```yaml
+model:
+  name: meta-llama/Llama-3.1-70B-Instruct
+  dtype: float16
+
+remote:
+  host: gpu-node
+  user: root
+  ssh_key: ~/.ssh/id_rsa
+  remote_dir: /tmp/obliteratus_run
+  gpus: "0,1,2,3"     # select GPUs on the remote
+  sync_results: true   # copy results back when done
+```
+
+The remote runner:
+1. Tests SSH connectivity
+2. Detects GPUs on the remote (`nvidia-smi`)
+3. Installs obliteratus if not already present
+4. Uploads config files if using `obliteratus run`
+5. Runs the pipeline with real-time log streaming
+6. Copies results back via SCP
+
+### Choosing the right setup
+
+| Scenario | Recommendation |
+|----------|---------------|
+| Model fits on 1 GPU | Use 1 GPU. Adding more won't help and may slow things down. |
+| Model fits on 1 GPU, PROBE is slow (many prompts) | Try `data-parallel-prereplication` branch. Only helps if model fits on each GPU with room for activations. |
+| Model doesn't fit on 1 GPU | Use `--gpus` with the **minimum** number of GPUs that fits. E.g., a 70B model in fp16 (~140 GB) needs 2x A100-80GB — don't use 4. |
+| Model needs 4+ GPUs | Pipeline parallel via `device_map="auto"` is the only option. Expect I/O-dominated runtimes for very large models. |
+| No local GPUs | Use `--remote user@gpu-node` to run on a remote machine, or use HuggingFace Spaces / Colab. |
+
 ## 10 study presets
 
 Pre-configured ablation studies you can run out of the box:

From 79b469d3dcab86e44cb2775c6b4ebe3197d7e746 Mon Sep 17 00:00:00 2001
From: Stella Biderman <stellabiderman@gmail.com>
Date: Mon, 16 Mar 2026 15:29:57 -0400
Subject: [PATCH 09/10] Add DeepSeek-R1-Distill-Llama-70B pipeline parallel
 benchmarks

Benchmarked 70B dense model (149 GB bf16) on 2/3/4/8 A100-80GB GPUs.
3 GPUs was fastest (536s), confirming minimum-viable-GPU-count guidance.
Combined stage breakdown table for both models.
---
 README.md | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 008cde1..275be60 100644
--- a/README.md
+++ b/README.md
@@ -444,7 +444,9 @@ This sets `CUDA_VISIBLE_DEVICES` before CUDA initializes. The model is then shar
 
 ### Pipeline parallel benchmarks
 
-We benchmarked the full abliteration pipeline on `openai/gpt-oss-120b` (117B MoE, ~234 GB in bf16) across varying numbers of A100-80GB GPUs:
+We benchmarked the full abliteration pipeline across varying numbers of A100-80GB GPUs on two large models.
+
+**GPT-OSS-120B** (117B MoE, ~234 GB in bf16):
 
 | GPUs | Total time | VRAM/GPU | Notes |
 |------|-----------|----------|-------|
@@ -454,21 +456,30 @@ We benchmarked the full abliteration pipeline on `openai/gpt-oss-120b` (117B MoE
 | 6 | 766s (12m46s) | ~39 GB | +25% slower than 4 GPUs |
 | 8 | 633s (10m33s) | ~29 GB | +3% slower than 4 GPUs. Ran CPU-side state dict snapshot (adds ~20s) |
 
+**DeepSeek-R1-Distill-Llama-70B** (70B dense, ~149 GB in bf16, 80 layers):
+
+| GPUs | Total time | VRAM/GPU | Notes |
+|------|-----------|----------|-------|
+| 2 | **FAILED** | ~75 GB | Meta tensor crash — 149 GB model on 160 GB total VRAM leaves no activation headroom |
+| 3 | **536s** (8m56s) | ~50 GB | Fastest. Minimum viable GPU count for this model |
+| 4 | 626s (10m26s) | ~37 GB | +17% slower than 3 GPUs |
+| 8 | 627s (10m27s) | ~19 GB | +17% slower than 3 GPUs. No benefit over 4 |
+
 Stage breakdown (approximately constant across GPU counts):
 
-| Stage | Time | Bottleneck |
-|-------|------|-----------|
-| SUMMON (load) | ~11s | Disk I/O (model cached locally) |
-| PROBE (activations) | ~20s | Forward passes through sharded model |
-| DISTILL + EXCISE | ~30s | SVD + weight projection (CPU-bound) |
-| VERIFY | ~210s | Forward passes on validation prompts |
-| REBIRTH (save) | ~350s | Writing 234 GB to disk |
+| Stage | GPT-OSS-120B | DeepSeek-70B | Bottleneck |
+|-------|-------------|-------------|-----------|
+| SUMMON (load) | ~11s | ~24s | Disk I/O (model cached locally) |
+| PROBE (activations) | ~20s | ~20s | Forward passes through sharded model |
+| DISTILL + EXCISE | ~30s | ~30s | SVD + weight projection (CPU-bound) |
+| VERIFY | ~210s | ~270s | Forward passes on validation prompts |
+| REBIRTH (save) | ~350s | ~194s | Writing model to disk (234 GB vs 141 GB) |
 
 Key findings:
 
-- **Use the minimum number of GPUs that fits your model.** Extra GPUs only add cross-device transfer overhead. 4 GPUs was faster than 8 for GPT-OSS-120B.
+- **Use the minimum number of GPUs that fits your model.** Extra GPUs only add cross-device transfer overhead. 4 GPUs was faster than 8 for GPT-OSS-120B; 3 GPUs was fastest for DeepSeek-70B.
 - **The pipeline is I/O-dominated for large models.** VERIFY and REBIRTH together account for ~90% of wall time. The actual compute (PROBE, DISTILL, EXCISE) is fast regardless of GPU count.
-- **Leave headroom.** The model needs VRAM beyond just its parameter storage — activation tensors, KV cache, and intermediate computations during PROBE and VERIFY all consume memory. 3x A100-80GB (240 GB) was not enough for a 234 GB model.
+- **Leave headroom.** The model needs VRAM beyond just its parameter storage — activation tensors, KV cache, and intermediate computations during PROBE and VERIFY all consume memory. 3x A100-80GB (240 GB) was not enough for a 234 GB model; 2x A100-80GB (160 GB) was not enough for a 149 GB model.
 - **Pipeline parallelism doesn't help compute-bound stages.** Since only one GPU computes at a time, doubling GPUs doesn't halve PROBE or VERIFY time. It only enables fitting larger models.
 
 ### When you actually need data parallelism

From 501ff0c9633f20f0e73854a01a826541b7edbd90 Mon Sep 17 00:00:00 2001
From: Stella Biderman <stellabiderman@gmail.com>
Date: Tue, 17 Mar 2026 14:01:18 -0400
Subject: [PATCH 10/10] Add gpu-calc command and document
 precision/quantization options

New `obliteratus gpu-calc` subcommand estimates minimum GPU count from
model params, dtype, and GPU VRAM. Auto-detects param counts from HF
configs including MoE expert structure.

README now covers --dtype, --quantization flags, the gpu-calc command,
and references both in the "Choosing the right setup" table.
---
 README.md          |  49 ++++++++-
 obliteratus/cli.py | 241 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 287 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 275be60..fa6654e 100644
--- a/README.md
+++ b/README.md
@@ -442,6 +442,49 @@ obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct --gpus 2,5
 
 This sets `CUDA_VISIBLE_DEVICES` before CUDA initializes. The model is then sharded across the selected GPUs.
 
+### Precision and quantization
+
+The `--dtype` flag controls the precision of model weights, which directly determines how much VRAM you need. Lower precision means a smaller memory footprint at the cost of some numerical fidelity:
+
+| Dtype | Bytes/param | 7B model | 70B model | 405B model |
+|-------|-----------|---------|----------|-----------|
+| `float32` | 4 | 28 GB | 280 GB | 1620 GB |
+| `float16` / `bfloat16` | 2 | 14 GB | 140 GB | 810 GB |
+| `int8` (via `--quantization bitsandbytes-8bit`) | 1 | 7 GB | 70 GB | 405 GB |
+| `int4` (via `--quantization bitsandbytes-4bit`) | 0.5 | 3.5 GB | 35 GB | 203 GB |
+
+```bash
+# Default: bfloat16
+obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct
+
+# 8-bit quantization — fits on fewer GPUs
+obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct \
+    --quantization bitsandbytes-8bit
+
+# 4-bit quantization — Llama-405B on 4x A100-80GB
+obliteratus obliterate meta-llama/Llama-3.1-405B-Instruct \
+    --quantization bitsandbytes-4bit --dtype float16
+```
+
+Quantization roughly halves the GPU count at each step down. A 70B model that needs 3x A100-80GB in bf16 fits on 2 in int8 or 1 in int4.
+
+### GPU calculator
+
+Not sure how many GPUs you need? The `gpu-calc` command estimates the minimum GPU count for any model, accounting for weight memory, activation overhead, and CUDA context:
+
+```bash
+# Auto-detect from HuggingFace model name
+obliteratus gpu-calc meta-llama/Llama-3.1-70B-Instruct --gpu-mem 24
+
+# Manual: specify params and precision
+obliteratus gpu-calc --params 70 --dtype bfloat16 --gpu-mem 80
+
+# MoE models: specify active params separately
+obliteratus gpu-calc --params 117 --active-params 13 --dtype bfloat16 --gpu-mem 80
+```
+
+The calculator fetches the model config from HuggingFace to estimate parameter counts (including MoE expert structure), then shows a table of GPU configurations with headroom estimates. For MoE models, activation overhead is computed from the active parameter count rather than total parameters.
+
 ### Pipeline parallel benchmarks
 
 We benchmarked the full abliteration pipeline across varying numbers of A100-80GB GPUs on two large models.
@@ -553,9 +596,11 @@ The remote runner:
 | Scenario | Recommendation |
 |----------|---------------|
 | Model fits on 1 GPU | Use 1 GPU. Adding more won't help and may slow things down. |
+| Model almost fits on 1 GPU | Try `--quantization bitsandbytes-8bit` or `bitsandbytes-4bit` to reduce memory. Halving precision roughly halves VRAM. |
 | Model fits on 1 GPU, PROBE is slow (many prompts) | Try `data-parallel-prereplication` branch. Only helps if model fits on each GPU with room for activations. |
-| Model doesn't fit on 1 GPU | Use `--gpus` with the **minimum** number of GPUs that fits. E.g., a 70B model in fp16 (~140 GB) needs 2x A100-80GB — don't use 4. |
-| Model needs 4+ GPUs | Pipeline parallel via `device_map="auto"` is the only option. Expect I/O-dominated runtimes for very large models. |
+| Model doesn't fit on 1 GPU | Use `--gpus` with the **minimum** number of GPUs that fits. Run `obliteratus gpu-calc` to find that number. |
+| Model needs 4+ GPUs | Pipeline parallel via `device_map="auto"` is the only option. Expect I/O-dominated runtimes for very large models. Consider quantization first — int4 can cut the GPU count by 4x. |
+| Not sure how many GPUs you need | Run `obliteratus gpu-calc <model> --gpu-mem <your_vram>` for an estimate. |
 | No local GPUs | Use `--remote user@gpu-node` to run on a remote machine, or use HuggingFace Spaces / Colab. |
 
 ## 10 study presets
diff --git a/obliteratus/cli.py b/obliteratus/cli.py
index aa37c94..b0767ab 100644
--- a/obliteratus/cli.py
+++ b/obliteratus/cli.py
@@ -266,12 +266,42 @@ def main(argv: list[str] | None = None):
         help="Also show global cross-architecture insights",
     )
 
+    # --- gpu-calc ---
+    calc_parser = subparsers.add_parser(
+        "gpu-calc",
+        help="Estimate minimum GPUs needed for a model",
+    )
+    calc_parser.add_argument(
+        "model", type=str, nargs="?", default=None,
+        help="HuggingFace model name/path (auto-fetches param counts)",
+    )
+    calc_parser.add_argument(
+        "--params", type=float, default=None, metavar="B",
+        help="Total parameters in billions (overrides auto-detection)",
+    )
+    calc_parser.add_argument(
+        "--active-params", type=float, default=None, metavar="B",
+        help="Active parameters in billions (for MoE models; defaults to --params)",
+    )
+    calc_parser.add_argument(
+        "--dtype", type=str, default="bfloat16",
+        choices=["float32", "float16", "bfloat16", "int8", "int4"],
+        help="Data type for model weights (default: bfloat16)",
+    )
+    calc_parser.add_argument(
+        "--gpu-mem", type=float, default=80.0, metavar="GB",
+        help="VRAM per GPU in GB (default: 80 for A100-80GB)",
+    )
+
     args = parser.parse_args(argv)
 
     # Apply GPU selection early (before any CUDA init)
     _apply_gpu_selection(args)
 
-    if args.command == "run":
+    if args.command == "gpu-calc":
+        _cmd_gpu_calc(args)
+        return
+    elif args.command == "run":
         if getattr(args, "remote", None):
             _cmd_remote_run(args)
         else:
@@ -765,6 +795,236 @@ def _cmd_abliterate(args):
     )
 
 
+def _cmd_gpu_calc(args):
+    """Estimate and print the minimum GPU count needed to hold a model.
+
+    Reads ``params``, ``active_params``, ``model``, ``dtype`` and ``gpu_mem``
+    from ``args``. When ``params`` is not given, the parameter counts are
+    estimated from the HuggingFace config of ``args.model``. Prints a summary
+    panel, a table of candidate GPU counts, and the minimum count.
+
+    Raises:
+        SystemExit: if neither a model nor ``--params`` is given, the config
+            cannot be loaded, a parameter count is out of range, or the
+            per-GPU memory does not exceed the fixed CUDA overhead.
+    """
+    import math
+
+    from rich.panel import Panel
+    from rich.table import Table
+
+    BYTES_PER_PARAM = {
+        "float32": 4,
+        "float16": 2,
+        "bfloat16": 2,
+        "int8": 1,
+        "int4": 0.5,
+    }
+
+    # Resolve param counts
+    total_params_b = args.params
+    active_params_b = args.active_params
+
+    if total_params_b is None:
+        if args.model is None:
+            console.print("[red]Provide either a model name or --params.[/]")
+            raise SystemExit(1)
+        console.print(f"Fetching config for [cyan]{args.model}[/]...")
+        try:
+            from transformers import AutoConfig
+            # NOTE(review): trust_remote_code=True lets the model repo execute
+            # its own code just to read a config — confirm this is intended.
+            config = AutoConfig.from_pretrained(args.model, trust_remote_code=True)
+        except Exception as e:
+            console.print(f"[red]Could not load config: {e}[/]")
+            raise SystemExit(1)
+
+        # Total params: prefer explicit num_parameters, else estimate from config
+        total_params_b = _estimate_total_params_b(config)
+
+        # Active params for MoE
+        if active_params_b is None:
+            active_params_b = _estimate_active_params_b(config, total_params_b)
+
+    if active_params_b is None:
+        active_params_b = total_params_b
+
+    # Reject out-of-range counts up front: a zero total would otherwise hit a
+    # division by zero when the headroom percentage is computed below.
+    if total_params_b <= 0 or active_params_b < 0:
+        console.print("[red]--params must be positive and --active-params must not be negative.[/]")
+        raise SystemExit(1)
+
+    bpp = BYTES_PER_PARAM[args.dtype]
+    gpu_mem_gb = args.gpu_mem
+
+    # Model weight memory (use base-10 GB to match HF/nvidia conventions)
+    weight_gb = total_params_b * bpp
+
+    # Activation overhead during forward passes (PROBE/VERIFY).
+    # Scales with active params, not total. Empirical from benchmarks:
+    # - DeepSeek-70B (149GB): failed at 160GB (2 GPUs), OK at 240GB (3 GPUs)
+    # - GPT-OSS-120B (234GB): failed at 240GB (3 GPUs), OK at 320GB (4 GPUs)
+    # This implies ~15-35% overhead. We use 20% as a reasonable middle ground.
+    active_weight_gb = active_params_b * bpp
+    activation_overhead_gb = active_weight_gb * 0.20
+
+    # CUDA context + fragmentation overhead: ~1.5 GB per GPU (fixed cost)
+    cuda_overhead_per_gpu = 1.5
+
+    # Total memory needed (before splitting across GPUs)
+    total_needed_gb = weight_gb + activation_overhead_gb
+
+    # Find minimum GPUs: we need total_needed / (gpu_mem - cuda_overhead) GPUs
+    usable_per_gpu = gpu_mem_gb - cuda_overhead_per_gpu
+    if usable_per_gpu <= 0:
+        console.print("[red]GPU memory too small after CUDA overhead.[/]")
+        raise SystemExit(1)
+
+    min_gpus = math.ceil(total_needed_gb / usable_per_gpu)
+    min_gpus = max(min_gpus, 1)
+
+    # Treat the model as MoE when active params are below 99% of the total
+    is_moe = active_params_b < total_params_b * 0.99
+
+    # Show results for a range of GPU counts
+    table = Table(title="GPU Configurations", show_edge=True)
+    table.add_column("GPUs", justify="right", style="cyan")
+    table.add_column("VRAM/GPU", justify="right")
+    table.add_column("Total VRAM", justify="right")
+    table.add_column("Headroom", justify="right")
+    table.add_column("Verdict", min_width=20)
+
+    # Show from min_gpus-1 (to show why it fails) up to max(min_gpus + 3, 8)
+    low = max(1, min_gpus - 1)
+    high = max(min_gpus + 3, 8)
+    for n in range(low, high + 1):
+        total_vram = n * gpu_mem_gb
+        usable_vram = n * usable_per_gpu
+        headroom = usable_vram - total_needed_gb
+        headroom_pct = headroom / total_needed_gb * 100
+        vram_per = total_needed_gb / n
+
+        if headroom < 0:
+            verdict = "[red]INSUFFICIENT[/]"
+        elif headroom_pct < 15:
+            verdict = "[yellow]TIGHT — may fail[/]"
+        elif n == min_gpus:
+            verdict = "[bold green]MINIMUM (recommended)[/]"
+        else:
+            verdict = "[green]OK[/] [dim](more GPUs = slower)[/]"
+
+        table.add_row(
+            str(n),
+            f"{vram_per:.1f} GB",
+            f"{total_vram:.0f} GB",
+            f"{headroom:+.1f} GB ({headroom_pct:+.0f}%)",
+            verdict,
+        )
+
+    model_label = args.model or f"{total_params_b:.1f}B params"
+    moe_line = ""
+    if is_moe:
+        moe_line = f"\n  Active params:  [cyan]{active_params_b:.1f}B[/] ({active_params_b/total_params_b*100:.0f}% of total — MoE)"
+
+    console.print(Panel(
+        f"  Model:          [cyan]{model_label}[/]\n"
+        f"  Total params:   [cyan]{total_params_b:.1f}B[/]"
+        f"{moe_line}\n"
+        f"  Dtype:          [cyan]{args.dtype}[/] ({bpp} bytes/param)\n"
+        f"  Weight memory:  [cyan]{weight_gb:.1f} GB[/]\n"
+        f"  Activation est: [cyan]{activation_overhead_gb:.1f} GB[/]\n"
+        f"  Total needed:   [bold]{total_needed_gb:.1f} GB[/]\n"
+        f"  GPU VRAM:       [cyan]{gpu_mem_gb:.0f} GB[/] per device",
+        title="[bold]GPU Calculator[/]",
+        border_style="cyan",
+    ))
+    console.print(table)
+    console.print(
+        f"\n  [bold green]Minimum GPUs: {min_gpus}[/]"
+        f"  ({min_gpus} x {gpu_mem_gb:.0f} GB = {min_gpus * gpu_mem_gb:.0f} GB)\n"
+    )
+    console.print(
+        "[dim]Note: fewer GPUs = faster (pipeline parallel has cross-device overhead).\n"
+        "Estimates are conservative. Actual memory may vary with sequence length\n"
+        "and model architecture. See 'obliteratus obliterate --help' for runtime options.[/]\n"
+    )
+
+
+def _estimate_total_params_b(config) -> float:
+    """Estimate total parameter count in billions from a HuggingFace config (exits if undeterminable)."""
+    # Prefer an explicit count when the config carries one (values <= 1000 are ignored)
+    for attr in ("num_parameters", "n_params"):
+        val = getattr(config, attr, None)
+        if val and val > 1000:
+            return val / 1e9
+
+    # Otherwise estimate from architecture dimensions; None-valued fields count as missing
+    h = getattr(config, "hidden_size", 0) or 0
+    L = getattr(config, "num_hidden_layers", 0) or 0
+    V = getattr(config, "vocab_size", 0) or 0
+    i = getattr(config, "intermediate_size", None) or h * 4
+
+    if h == 0 or L == 0:
+        console.print("[red]Cannot determine model size from config. Use --params.[/]")
+        raise SystemExit(1)
+
+    n_heads = getattr(config, "num_attention_heads", None) or (h // 128)
+    head_dim = getattr(config, "head_dim", None) or (h // n_heads if n_heads else 128)
+    kv_heads = getattr(config, "num_key_value_heads", None) or n_heads
+
+    # Attention: Q + K + V projections + output projection
+    attn_params = h * (n_heads * head_dim) + h * (kv_heads * head_dim) * 2 + (n_heads * head_dim) * h
+
+    # FFN (MoE or dense)
+    n_experts = getattr(config, "num_local_experts", getattr(config, "num_experts", 1)) or 1
+    # MoE models often have a separate intermediate size for expert FFNs
+    moe_i = getattr(config, "moe_intermediate_size", None) or i
+    # gate + up + down projections per expert
+    ffn_per_expert = h * moe_i * 3
+    ffn_params = ffn_per_expert * n_experts
+    # Some architectures (Qwen, DeepSeek) also have a shared/dense FFN per layer
+    if n_experts > 1 and hasattr(config, "moe_intermediate_size"):
+        # The dense FFN uses the main intermediate_size
+        ffn_params += h * i * 3
+    # Router
+    if n_experts > 1:
+        ffn_params += h * n_experts
+
+    # Per-layer: attention + FFN + layernorms
+    layer_params = attn_params + ffn_params + h * 4  # 2 layernorms, 2 params each
+
+    # Embedding + LM head
+    embed_params = V * h * 2  # input + output embeddings (may be tied but counts for memory)
+
+    total = L * layer_params + embed_params
+    return total / 1e9
+
+
+def _estimate_active_params_b(config, total_params_b: float) -> float:
+    """Estimate active parameters (billions) per forward pass; equals the total for dense models."""
+    n_experts = getattr(config, "num_local_experts", getattr(config, "num_experts", 1)) or 1
+    if n_experts <= 1:
+        return total_params_b
+
+    top_k = min(getattr(config, "num_experts_per_tok", getattr(config, "top_k", 2)) or 2, n_experts)
+
+    h = getattr(config, "hidden_size", 0) or 0
+    i = getattr(config, "intermediate_size", None) or h * 4
+    moe_i = getattr(config, "moe_intermediate_size", None) or i
+    L = getattr(config, "num_hidden_layers", 0) or 0
+
+    # FFN per expert (uses moe_intermediate_size if available; None counts as missing)
+    ffn_per_expert = h * moe_i * 3
+    # Active FFN = top_k experts instead of all n_experts (top_k is capped at n_experts)
+    ffn_all = ffn_per_expert * n_experts * L
+    ffn_active = ffn_per_expert * top_k * L
+    # Non-FFN params (includes any shared/dense FFN)
+    non_ffn = total_params_b * 1e9 - ffn_all
+    active = non_ffn + ffn_active
+    return max(active / 1e9, 0.1)  # floor at 0.1B when the estimate falls below it
+
+
 def _make_remote_runner(args):
     """Create a RemoteRunner from CLI --remote flags."""
     from obliteratus.remote import RemoteConfig, RemoteRunner