From a2bb748f1be44f73ddd15247a2c58f9bf4820a3b Mon Sep 17 00:00:00 2001
From: Stella Biderman <stellabiderman@gmail.com>
Date: Fri, 13 Mar 2026 16:54:31 -0400
Subject: [PATCH] Revert "Add data parallel support for PROBE stage"

This reverts commit 1a6e2577bb4d22f139730b6aa5d425292add1a1c.
---
 README.md                 | 71 ---------------------------------------
 obliteratus/abliterate.py | 35 -------------------
 obliteratus/cli.py        | 14 +-------
 obliteratus/remote.py     |  3 --
 4 files changed, 1 insertion(+), 122 deletions(-)

diff --git a/README.md b/README.md
index 0ae7bd8..045ae5f 100644
--- a/README.md
+++ b/README.md
@@ -197,77 +197,6 @@ obliteratus aggregate --format summary
 obliteratus aggregate --format latex --metric refusal_rate --min-runs 3
 ```
 
-#### Multi-GPU support
-
-OBLITERATUS supports two multi-GPU modes:
-
-**Model parallelism** (automatic): Models that exceed single-GPU VRAM are automatically sharded across all available GPUs using accelerate's `device_map="auto"`, which places different layers on different devices. Only one GPU computes at a time during the forward pass — the benefit is fitting larger models (e.g., a 117B model across 8x A100-80GB), not faster processing.
-
-**Data parallelism** (`--data-parallel`): When the model fits on a single GPU, `--data-parallel` replicates it across all available GPUs and splits the prompt batches across them during the PROBE stage (activation collection). This provides a near-linear speedup for the most time-consuming pipeline stage. If the model is already sharded via `device_map="auto"`, data parallelism is not available (the model can't be replicated).
-
-```bash
-# Data parallel: replicate model across GPUs, split batches (faster PROBE stage)
-obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --data-parallel
-
-# Combine with GPU selection
-obliteratus obliterate mistralai/Mistral-7B-Instruct-v0.3 --data-parallel --gpus 0,1,2,3
-```
-
-To select specific GPUs instead of using all available:
-
-```bash
-# Use only GPUs 0, 1, 2, 3
-obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct --gpus 0,1,2,3
-
-# Works with any command
-obliteratus run my_study.yaml --gpus 0,1
-obliteratus tourney some-model --gpus 2,3
-```
-
-The `--gpus` flag sets `CUDA_VISIBLE_DEVICES` before CUDA initialization. When omitted, all GPUs are used.
-
-#### Remote execution via SSH
-
-Run any obliteratus command on a remote GPU machine from your laptop. Obliteratus handles SSH connectivity, auto-installation on the remote, GPU detection, live log streaming, and copying results back via SCP.
-
-```bash
-# Abliterate a model on a remote GPU node
-obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct \
-    --remote user@gpu-node \
-    --ssh-key ~/.ssh/id_rsa
-
-# All remote flags
-obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct \
-    --remote root@gpu-node \
-    --ssh-key ~/.ssh/id_rsa \
-    --ssh-port 22 \
-    --remote-dir /tmp/obliteratus_run \
-    --remote-python /opt/conda/bin/python3 \
-    --gpus 0,1,2,3 \
-    --no-sync  # keep results on remote, don't copy back
-
-# Run a YAML study remotely
-obliteratus run my_study.yaml --remote root@gpu-node --ssh-key ~/.ssh/id_rsa
-
-# Tournaments work too
-obliteratus tourney meta-llama/Llama-3.1-8B-Instruct --remote root@gpu-node
-```
-
-Remote execution can also be configured via the YAML `remote:` section (see `examples/remote_gpu_node.yaml`):
-
-```yaml
-remote:
-  host: gpu-node.example.com
-  user: root
-  ssh_key: ~/.ssh/id_rsa
-  remote_dir: /tmp/obliteratus_run
-  python: python3
-  sync_results: true
-  gpus: "0,1,2,3"
-```
-
-The remote machine needs CUDA-capable GPUs and a Python environment. Obliteratus will be auto-installed from GitHub if not already present.
-
 ### 5. Python API (full programmatic control)
 
 For researchers who want to integrate OBLITERATUS into their own pipelines:
diff --git a/obliteratus/abliterate.py b/obliteratus/abliterate.py
index ef5efcf..9f56395 100644
--- a/obliteratus/abliterate.py
+++ b/obliteratus/abliterate.py
@@ -647,8 +647,6 @@ class AbliterationPipeline:
         max_seq_length: int | None = None,
         # Verify stage sample size
         verify_sample_size: int | None = None,
-        # Data parallelism
-        data_parallel: bool = False,
         on_stage: Callable[[StageResult], None] | None = None,
         on_log: Callable[[str], None] | None = None,
     ):
@@ -752,7 +750,6 @@ class AbliterationPipeline:
         # refusal rate measurement.  Default 30 gives ~3.3% resolution;
         # increase for tighter confidence intervals (reviewer feedback).
         self.verify_sample_size = verify_sample_size if verify_sample_size is not None else 30
-        self.data_parallel = data_parallel
 
         # Large model mode: conservative defaults for 120B+ models.
         # Reduces memory footprint by limiting SAE features, directions,
@@ -831,22 +828,6 @@ class AbliterationPipeline:
         """Release unused GPU/accelerator memory between pipeline stages."""
         dev.free_gpu_memory()
 
-    def _can_data_parallel(self) -> bool:
-        """Check if data parallelism is feasible.
-
-        Returns True when:
-        - data_parallel was requested
-        - CUDA is available with >1 GPUs
-        - The model is NOT already sharded across devices (device_map="auto")
-        """
-        if not self.data_parallel:
-            return False
-        if not torch.cuda.is_available() or torch.cuda.device_count() < 2:
-            return False
-        if self.handle and hasattr(self.handle.model, "hf_device_map"):
-            return False  # already sharded, can't replicate
-        return True
-
     @staticmethod
     def _get_model_device(model: nn.Module) -> torch.device:
         """Return the correct input device for a model.
@@ -1471,24 +1452,8 @@ class AbliterationPipeline:
 
         device = self._get_model_device(model)
 
-        # ── Data parallelism: wrap model to split batches across GPUs ──
-        # DataParallel replicates the model on each GPU and scatters the
-        # input batch.  Hooks fire on each replica (shared via shallow copy
-        # of _forward_hooks), and since they .detach().cpu().float() the
-        # activations, all results land in the same `activations` dict on
-        # CPU.  list.append is GIL-protected so thread-safe.  Order within
-        # a batch is nondeterministic across replicas, but that's fine —
-        # we only compute means and SVD over the collected activations.
-        use_dp = self._can_data_parallel()
-        n_gpus = torch.cuda.device_count() if use_dp else 1
-        if use_dp:
-            model = nn.DataParallel(model)
-            self.log(f"  Data parallel: splitting batches across {n_gpus} GPUs")
-
         # Batch prompts for throughput — hooks unbatch per-prompt activations
         batch_size = 16 if free_gb > _tight_gb else 8 if free_gb > _low_gb else 1
-        if use_dp:
-            batch_size *= n_gpus
         # Left-pad so position -1 is always the last real token in every batch element
         orig_padding_side = getattr(tokenizer, "padding_side", "right")
         if batch_size > 1:
diff --git a/obliteratus/cli.py b/obliteratus/cli.py
index d8d2189..aa37c94 100644
--- a/obliteratus/cli.py
+++ b/obliteratus/cli.py
@@ -23,7 +23,7 @@ _BANNER = r"""
 
 
 def _add_gpu_args(parser):
-    """Add --gpus and --data-parallel flags for multi-GPU control."""
+    """Add --gpus flag for multi-GPU control."""
     gpu_group = parser.add_argument_group("GPU selection")
     gpu_group.add_argument(
         "--gpus", type=str, default=None, metavar="IDS",
@@ -33,15 +33,6 @@ def _add_gpu_args(parser):
             "Models are automatically split across selected GPUs via accelerate."
         ),
     )
-    gpu_group.add_argument(
-        "--data-parallel", action="store_true", default=False,
-        help=(
-            "Use data parallelism to split prompt batches across GPUs during "
-            "activation collection (PROBE stage). Only effective when the model "
-            "fits on a single GPU and multiple GPUs are available. For models "
-            "already sharded across GPUs (device_map='auto'), this has no effect."
-        ),
-    )
 
 
 def _add_remote_args(parser):
@@ -723,7 +714,6 @@ def _cmd_abliterate(args):
         quantization=args.quantization,
         large_model_mode=getattr(args, "large_model", False),
         verify_sample_size=getattr(args, "verify_sample_size", None),
-        data_parallel=getattr(args, "data_parallel", False),
         on_stage=on_stage,
         on_log=on_log,
     )
@@ -817,8 +807,6 @@ def _cmd_remote_abliterate(args):
         kwargs["large_model"] = True
     if getattr(args, "verify_sample_size", None) is not None:
         kwargs["verify_sample_size"] = args.verify_sample_size
-    if getattr(args, "data_parallel", False):
-        kwargs["data_parallel"] = True
 
     result_path = runner.run_obliterate(
         model=args.model,
diff --git a/obliteratus/remote.py b/obliteratus/remote.py
index c2702ed..4254ad3 100644
--- a/obliteratus/remote.py
+++ b/obliteratus/remote.py
@@ -237,7 +237,6 @@ class RemoteRunner:
         refinement_passes: int | None = None,
         large_model: bool = False,
         verify_sample_size: int | None = None,
-        data_parallel: bool = False,
     ) -> str:
         """Build the remote obliteratus CLI command."""
         remote_output = output_dir or f"{self.config.remote_dir}/output/{model.replace('/', '_')}"
@@ -264,8 +263,6 @@ class RemoteRunner:
             parts.append("--large-model")
         if verify_sample_size is not None:
             parts.extend(["--verify-sample-size", str(verify_sample_size)])
-        if data_parallel:
-            parts.append("--data-parallel")
 
         return " ".join(parts)