Revert "Add data parallel support for PROBE stage"

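This reverts commit 1a6e2577bb. It removes the `--data-parallel` CLI flag, the `data_parallel` parameter on the pipeline and remote runner, and the DataParallel wrapping in the PROBE stage. It also removes the README sections "Multi-GPU support" and "Remote execution via SSH", although the `--gpus` flag and the remote-execution code themselves are kept.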
2026-07-24 04:40:53 +02:00 · 2026-03-13 16:54:31 -04:00
parent 1a6e2577bb
commit a2bb748f1b
4 changed files with 1 additions and 122 deletions
@@ -197,77 +197,6 @@ obliteratus aggregate --format summary
 obliteratus aggregate --format latex --metric refusal_rate --min-runs 3
 ```

-#### Multi-GPU support
-
-OBLITERATUS supports two multi-GPU modes:
-
-**Model parallelism** (automatic): Models that exceed single-GPU VRAM are automatically sharded across all available GPUs using accelerate's `device_map="auto"`, which places different layers on different devices. Only one GPU computes at a time during the forward pass — the benefit is fitting larger models (e.g., a 117B model across 8x A100-80GB), not faster processing.
-
-**Data parallelism** (`--data-parallel`): When the model fits on a single GPU, `--data-parallel` replicates it across all available GPUs and splits the prompt batches across them during the PROBE stage (activation collection). This provides a near-linear speedup for the most time-consuming pipeline stage. If the model is already sharded via `device_map="auto"`, data parallelism is not available (the model can't be replicated).
-
-```bash
-# Data parallel: replicate model across GPUs, split batches (faster PROBE stage)
-obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --data-parallel
-
-# Combine with GPU selection
-obliteratus obliterate mistralai/Mistral-7B-Instruct-v0.3 --data-parallel --gpus 0,1,2,3
-```
-
-To select specific GPUs instead of using all available:
-
-```bash
-# Use only GPUs 0, 1, 2, 3
-obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct --gpus 0,1,2,3
-
-# Works with any command
-obliteratus run my_study.yaml --gpus 0,1
-obliteratus tourney some-model --gpus 2,3
-```
-
-The `--gpus` flag sets `CUDA_VISIBLE_DEVICES` before CUDA initialization. When omitted, all GPUs are used.
-
-#### Remote execution via SSH
-
-Run any obliteratus command on a remote GPU machine from your laptop. Obliteratus handles SSH connectivity, auto-installation on the remote, GPU detection, live log streaming, and copying results back via SCP.
-
-```bash
-# Abliterate a model on a remote GPU node
-obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct \
-    --remote user@gpu-node \
-    --ssh-key ~/.ssh/id_rsa
-
-# All remote flags
-obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct \
-    --remote root@gpu-node \
-    --ssh-key ~/.ssh/id_rsa \
-    --ssh-port 22 \
-    --remote-dir /tmp/obliteratus_run \
-    --remote-python /opt/conda/bin/python3 \
-    --gpus 0,1,2,3 \
-    --no-sync  # keep results on remote, don't copy back
-
-# Run a YAML study remotely
-obliteratus run my_study.yaml --remote root@gpu-node --ssh-key ~/.ssh/id_rsa
-
-# Tournaments work too
-obliteratus tourney meta-llama/Llama-3.1-8B-Instruct --remote root@gpu-node
-```
-
-Remote execution can also be configured via the YAML `remote:` section (see `examples/remote_gpu_node.yaml`):
-
-```yaml
-remote:
-  host: gpu-node.example.com
-  user: root
-  ssh_key: ~/.ssh/id_rsa
-  remote_dir: /tmp/obliteratus_run
-  python: python3
-  sync_results: true
-  gpus: "0,1,2,3"
-```
-
-The remote machine needs CUDA-capable GPUs and a Python environment. Obliteratus will be auto-installed from GitHub if not already present.
-
 ### 5. Python API (full programmatic control)

 For researchers who want to integrate OBLITERATUS into their own pipelines:
@@ -647,8 +647,6 @@ class AbliterationPipeline:
        max_seq_length: int | None = None,
        # Verify stage sample size
        verify_sample_size: int | None = None,
-        # Data parallelism
-        data_parallel: bool = False,
        on_stage: Callable[[StageResult], None] | None = None,
        on_log: Callable[[str], None] | None = None,
    ):
@@ -752,7 +750,6 @@ class AbliterationPipeline:
        # refusal rate measurement.  Default 30 gives ~3.3% resolution;
        # increase for tighter confidence intervals (reviewer feedback).
        self.verify_sample_size = verify_sample_size if verify_sample_size is not None else 30
-        self.data_parallel = data_parallel

        # Large model mode: conservative defaults for 120B+ models.
        # Reduces memory footprint by limiting SAE features, directions,
@@ -831,22 +828,6 @@ class AbliterationPipeline:
        """Release unused GPU/accelerator memory between pipeline stages."""
        dev.free_gpu_memory()

-    def _can_data_parallel(self) -> bool:
-        """Check if data parallelism is feasible.
-
-        Returns True when:
-        - data_parallel was requested
-        - CUDA is available with >1 GPUs
-        - The model is NOT already sharded across devices (device_map="auto")
-        """
-        if not self.data_parallel:
-            return False
-        if not torch.cuda.is_available() or torch.cuda.device_count() < 2:
-            return False
-        if self.handle and hasattr(self.handle.model, "hf_device_map"):
-            return False  # already sharded, can't replicate
-        return True
-
    @staticmethod
    def _get_model_device(model: nn.Module) -> torch.device:
        """Return the correct input device for a model.
@@ -1471,24 +1452,8 @@ class AbliterationPipeline:

        device = self._get_model_device(model)

-        # ── Data parallelism: wrap model to split batches across GPUs ──
-        # DataParallel replicates the model on each GPU and scatters the
-        # input batch.  Hooks fire on each replica (shared via shallow copy
-        # of _forward_hooks), and since they .detach().cpu().float() the
-        # activations, all results land in the same `activations` dict on
-        # CPU.  list.append is GIL-protected so thread-safe.  Order within
-        # a batch is nondeterministic across replicas, but that's fine —
-        # we only compute means and SVD over the collected activations.
-        use_dp = self._can_data_parallel()
-        n_gpus = torch.cuda.device_count() if use_dp else 1
-        if use_dp:
-            model = nn.DataParallel(model)
-            self.log(f"  Data parallel: splitting batches across {n_gpus} GPUs")
-
        # Batch prompts for throughput — hooks unbatch per-prompt activations
        batch_size = 16 if free_gb > _tight_gb else 8 if free_gb > _low_gb else 1
-        if use_dp:
-            batch_size *= n_gpus
        # Left-pad so position -1 is always the last real token in every batch element
        orig_padding_side = getattr(tokenizer, "padding_side", "right")
        if batch_size > 1:
@@ -23,7 +23,7 @@ _BANNER = r"""


 def _add_gpu_args(parser):
-    """Add --gpus and --data-parallel flags for multi-GPU control."""
+    """Add --gpus flag for multi-GPU control."""
    gpu_group = parser.add_argument_group("GPU selection")
    gpu_group.add_argument(
        "--gpus", type=str, default=None, metavar="IDS",
@@ -33,15 +33,6 @@ def _add_gpu_args(parser):
            "Models are automatically split across selected GPUs via accelerate."
        ),
    )
-    gpu_group.add_argument(
-        "--data-parallel", action="store_true", default=False,
-        help=(
-            "Use data parallelism to split prompt batches across GPUs during "
-            "activation collection (PROBE stage). Only effective when the model "
-            "fits on a single GPU and multiple GPUs are available. For models "
-            "already sharded across GPUs (device_map='auto'), this has no effect."
-        ),
-    )


 def _add_remote_args(parser):
@@ -723,7 +714,6 @@ def _cmd_abliterate(args):
        quantization=args.quantization,
        large_model_mode=getattr(args, "large_model", False),
        verify_sample_size=getattr(args, "verify_sample_size", None),
-        data_parallel=getattr(args, "data_parallel", False),
        on_stage=on_stage,
        on_log=on_log,
    )
@@ -817,8 +807,6 @@ def _cmd_remote_abliterate(args):
        kwargs["large_model"] = True
    if getattr(args, "verify_sample_size", None) is not None:
        kwargs["verify_sample_size"] = args.verify_sample_size
-    if getattr(args, "data_parallel", False):
-        kwargs["data_parallel"] = True

    result_path = runner.run_obliterate(
        model=args.model,
@@ -237,7 +237,6 @@ class RemoteRunner:
        refinement_passes: int | None = None,
        large_model: bool = False,
        verify_sample_size: int | None = None,
-        data_parallel: bool = False,
    ) -> str:
        """Build the remote obliteratus CLI command."""
        remote_output = output_dir or f"{self.config.remote_dir}/output/{model.replace('/', '_')}"
@@ -264,8 +263,6 @@ class RemoteRunner:
            parts.append("--large-model")
        if verify_sample_size is not None:
            parts.extend(["--verify-sample-size", str(verify_sample_size)])
-        if data_parallel:
-            parts.append("--data-parallel")

        return " ".join(parts)