From a2bb748f1be44f73ddd15247a2c58f9bf4820a3b Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Fri, 13 Mar 2026 16:54:31 -0400 Subject: [PATCH] Revert "Add data parallel support for PROBE stage" This reverts commit 1a6e2577bb4d22f139730b6aa5d425292add1a1c. --- README.md | 71 --------------------------------------- obliteratus/abliterate.py | 35 ------------------- obliteratus/cli.py | 14 +------- obliteratus/remote.py | 3 -- 4 files changed, 1 insertion(+), 122 deletions(-) diff --git a/README.md b/README.md index 0ae7bd8..045ae5f 100644 --- a/README.md +++ b/README.md @@ -197,77 +197,6 @@ obliteratus aggregate --format summary obliteratus aggregate --format latex --metric refusal_rate --min-runs 3 ``` -#### Multi-GPU support - -OBLITERATUS supports two multi-GPU modes: - -**Model parallelism** (automatic): Models that exceed single-GPU VRAM are automatically sharded across all available GPUs using accelerate's `device_map="auto"`, which places different layers on different devices. Only one GPU computes at a time during the forward pass — the benefit is fitting larger models (e.g., a 117B model across 8x A100-80GB), not faster processing. - -**Data parallelism** (`--data-parallel`): When the model fits on a single GPU, `--data-parallel` replicates it across all available GPUs and splits the prompt batches across them during the PROBE stage (activation collection). This provides a near-linear speedup for the most time-consuming pipeline stage. If the model is already sharded via `device_map="auto"`, data parallelism is not available (the model can't be replicated). - -```bash -# Data parallel: replicate model across GPUs, split batches (faster PROBE stage) -obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --data-parallel - -# Combine with GPU selection -obliteratus obliterate mistralai/Mistral-7B-Instruct-v0.3 --data-parallel --gpus 0,1,2,3 -``` - -To select specific GPUs instead of using all available: - -```bash -# Use only GPUs 0, 1, 2, 3 -obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct --gpus 0,1,2,3 - -# Works with any command -obliteratus run my_study.yaml --gpus 0,1 -obliteratus tourney some-model --gpus 2,3 -``` - -The `--gpus` flag sets `CUDA_VISIBLE_DEVICES` before CUDA initialization. When omitted, all GPUs are used. - -#### Remote execution via SSH - -Run any obliteratus command on a remote GPU machine from your laptop. Obliteratus handles SSH connectivity, auto-installation on the remote, GPU detection, live log streaming, and copying results back via SCP. - -```bash -# Abliterate a model on a remote GPU node -obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct \ - --remote user@gpu-node \ - --ssh-key ~/.ssh/id_rsa - -# All remote flags -obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct \ - --remote root@gpu-node \ - --ssh-key ~/.ssh/id_rsa \ - --ssh-port 22 \ - --remote-dir /tmp/obliteratus_run \ - --remote-python /opt/conda/bin/python3 \ - --gpus 0,1,2,3 \ - --no-sync # keep results on remote, don't copy back - -# Run a YAML study remotely -obliteratus run my_study.yaml --remote root@gpu-node --ssh-key ~/.ssh/id_rsa - -# Tournaments work too -obliteratus tourney meta-llama/Llama-3.1-8B-Instruct --remote root@gpu-node -``` - -Remote execution can also be configured via the YAML `remote:` section (see `examples/remote_gpu_node.yaml`): - -```yaml -remote: - host: gpu-node.example.com - user: root - ssh_key: ~/.ssh/id_rsa - remote_dir: /tmp/obliteratus_run - python: python3 - sync_results: true - gpus: "0,1,2,3" -``` - -The remote machine needs CUDA-capable GPUs and a Python environment. Obliteratus will be auto-installed from GitHub if not already present. - ### 5. Python API (full programmatic control) For researchers who want to integrate OBLITERATUS into their own pipelines: diff --git a/obliteratus/abliterate.py b/obliteratus/abliterate.py index ef5efcf..9f56395 100644 --- a/obliteratus/abliterate.py +++ b/obliteratus/abliterate.py @@ -647,8 +647,6 @@ class AbliterationPipeline: max_seq_length: int | None = None, # Verify stage sample size verify_sample_size: int | None = None, - # Data parallelism - data_parallel: bool = False, on_stage: Callable[[StageResult], None] | None = None, on_log: Callable[[str], None] | None = None, ): @@ -752,7 +750,6 @@ class AbliterationPipeline: # refusal rate measurement. Default 30 gives ~3.3% resolution; # increase for tighter confidence intervals (reviewer feedback). self.verify_sample_size = verify_sample_size if verify_sample_size is not None else 30 - self.data_parallel = data_parallel # Large model mode: conservative defaults for 120B+ models. # Reduces memory footprint by limiting SAE features, directions, @@ -831,22 +828,6 @@ class AbliterationPipeline: """Release unused GPU/accelerator memory between pipeline stages.""" dev.free_gpu_memory() - def _can_data_parallel(self) -> bool: - """Check if data parallelism is feasible. - - Returns True when: - - data_parallel was requested - - CUDA is available with >1 GPUs - - The model is NOT already sharded across devices (device_map="auto") - """ - if not self.data_parallel: - return False - if not torch.cuda.is_available() or torch.cuda.device_count() < 2: - return False - if self.handle and hasattr(self.handle.model, "hf_device_map"): - return False # already sharded, can't replicate - return True - @staticmethod def _get_model_device(model: nn.Module) -> torch.device: """Return the correct input device for a model. @@ -1471,24 +1452,8 @@ class AbliterationPipeline: device = self._get_model_device(model) - # ── Data parallelism: wrap model to split batches across GPUs ── - # DataParallel replicates the model on each GPU and scatters the - # input batch. Hooks fire on each replica (shared via shallow copy - # of _forward_hooks), and since they .detach().cpu().float() the - # activations, all results land in the same `activations` dict on - # CPU. list.append is GIL-protected so thread-safe. Order within - # a batch is nondeterministic across replicas, but that's fine — - # we only compute means and SVD over the collected activations. - use_dp = self._can_data_parallel() - n_gpus = torch.cuda.device_count() if use_dp else 1 - if use_dp: - model = nn.DataParallel(model) - self.log(f" Data parallel: splitting batches across {n_gpus} GPUs") - # Batch prompts for throughput — hooks unbatch per-prompt activations batch_size = 16 if free_gb > _tight_gb else 8 if free_gb > _low_gb else 1 - if use_dp: - batch_size *= n_gpus # Left-pad so position -1 is always the last real token in every batch element orig_padding_side = getattr(tokenizer, "padding_side", "right") if batch_size > 1: diff --git a/obliteratus/cli.py b/obliteratus/cli.py index d8d2189..aa37c94 100644 --- a/obliteratus/cli.py +++ b/obliteratus/cli.py @@ -23,7 +23,7 @@ _BANNER = r""" def _add_gpu_args(parser): - """Add --gpus and --data-parallel flags for multi-GPU control.""" + """Add --gpus flag for multi-GPU control.""" gpu_group = parser.add_argument_group("GPU selection") gpu_group.add_argument( "--gpus", type=str, default=None, metavar="IDS", @@ -33,15 +33,6 @@ def _add_gpu_args(parser): "Models are automatically split across selected GPUs via accelerate." ), ) - gpu_group.add_argument( - "--data-parallel", action="store_true", default=False, - help=( - "Use data parallelism to split prompt batches across GPUs during " - "activation collection (PROBE stage). Only effective when the model " - "fits on a single GPU and multiple GPUs are available. For models " - "already sharded across GPUs (device_map='auto'), this has no effect." - ), - ) def _add_remote_args(parser): @@ -723,7 +714,6 @@ def _cmd_abliterate(args): quantization=args.quantization, large_model_mode=getattr(args, "large_model", False), verify_sample_size=getattr(args, "verify_sample_size", None), - data_parallel=getattr(args, "data_parallel", False), on_stage=on_stage, on_log=on_log, ) @@ -817,8 +807,6 @@ def _cmd_remote_abliterate(args): kwargs["large_model"] = True if getattr(args, "verify_sample_size", None) is not None: kwargs["verify_sample_size"] = args.verify_sample_size - if getattr(args, "data_parallel", False): - kwargs["data_parallel"] = True result_path = runner.run_obliterate( model=args.model, diff --git a/obliteratus/remote.py b/obliteratus/remote.py index c2702ed..4254ad3 100644 --- a/obliteratus/remote.py +++ b/obliteratus/remote.py @@ -237,7 +237,6 @@ class RemoteRunner: refinement_passes: int | None = None, large_model: bool = False, verify_sample_size: int | None = None, - data_parallel: bool = False, ) -> str: """Build the remote obliteratus CLI command.""" remote_output = output_dir or f"{self.config.remote_dir}/output/{model.replace('/', '_')}" @@ -264,8 +263,6 @@ class RemoteRunner: parts.append("--large-model") if verify_sample_size is not None: parts.extend(["--verify-sample-size", str(verify_sample_size)]) - if data_parallel: - parts.append("--data-parallel") return " ".join(parts)