Revert "Add data parallel support for PROBE stage"

This reverts commit 1a6e2577bb.
This commit is contained in:
Stella Biderman
2026-03-13 16:54:31 -04:00
parent 1a6e2577bb
commit a2bb748f1b
4 changed files with 1 addition and 122 deletions
-71
View File
@@ -197,77 +197,6 @@ obliteratus aggregate --format summary
obliteratus aggregate --format latex --metric refusal_rate --min-runs 3
```
#### Multi-GPU support
OBLITERATUS supports two multi-GPU modes:
**Model parallelism** (automatic): Models that exceed single-GPU VRAM are automatically sharded across all available GPUs using accelerate's `device_map="auto"`, which places different layers on different devices. Only one GPU computes at a time during the forward pass — the benefit is fitting larger models (e.g., a 117B model across 8x A100-80GB), not faster processing.
**Data parallelism** (`--data-parallel`): When the model fits on a single GPU, `--data-parallel` replicates it across all available GPUs and splits the prompt batches across them during the PROBE stage (activation collection). This provides a near-linear speedup for the most time-consuming pipeline stage. If the model is already sharded via `device_map="auto"`, the flag has no effect: a model split across devices can't be replicated onto each GPU, so data parallelism is not available.
```bash
# Data parallel: replicate model across GPUs, split batches (faster PROBE stage)
obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --data-parallel
# Combine with GPU selection
obliteratus obliterate mistralai/Mistral-7B-Instruct-v0.3 --data-parallel --gpus 0,1,2,3
```
To select specific GPUs instead of using all available:
```bash
# Use only GPUs 0, 1, 2, 3
obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct --gpus 0,1,2,3
# Works with any command
obliteratus run my_study.yaml --gpus 0,1
obliteratus tourney some-model --gpus 2,3
```
The `--gpus` flag sets `CUDA_VISIBLE_DEVICES` before CUDA initialization. When `--gpus` is omitted, all GPUs are used.
#### Remote execution via SSH
Run any obliteratus command on a remote GPU machine from your laptop. Obliteratus handles SSH connectivity, auto-installation on the remote, GPU detection, live log streaming, and copying results back via SCP.
```bash
# Abliterate a model on a remote GPU node
obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct \
--remote user@gpu-node \
--ssh-key ~/.ssh/id_rsa
# All remote flags
obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct \
--remote root@gpu-node \
--ssh-key ~/.ssh/id_rsa \
--ssh-port 22 \
--remote-dir /tmp/obliteratus_run \
--remote-python /opt/conda/bin/python3 \
--gpus 0,1,2,3 \
--no-sync # keep results on remote, don't copy back
# Run a YAML study remotely
obliteratus run my_study.yaml --remote root@gpu-node --ssh-key ~/.ssh/id_rsa
# Tournaments work too
obliteratus tourney meta-llama/Llama-3.1-8B-Instruct --remote root@gpu-node
```
Remote execution can also be configured via the YAML `remote:` section (see `examples/remote_gpu_node.yaml`):
```yaml
remote:
host: gpu-node.example.com
user: root
ssh_key: ~/.ssh/id_rsa
remote_dir: /tmp/obliteratus_run
python: python3
sync_results: true
gpus: "0,1,2,3"
```
The remote machine needs CUDA-capable GPUs and a Python environment. Obliteratus will be auto-installed from GitHub if not already present.
### 5. Python API (full programmatic control)
For researchers who want to integrate OBLITERATUS into their own pipelines:
-35
View File
@@ -647,8 +647,6 @@ class AbliterationPipeline:
max_seq_length: int | None = None,
# Verify stage sample size
verify_sample_size: int | None = None,
# Data parallelism
data_parallel: bool = False,
on_stage: Callable[[StageResult], None] | None = None,
on_log: Callable[[str], None] | None = None,
):
@@ -752,7 +750,6 @@ class AbliterationPipeline:
# refusal rate measurement. Default 30 gives ~3.3% resolution;
# increase for tighter confidence intervals (reviewer feedback).
self.verify_sample_size = verify_sample_size if verify_sample_size is not None else 30
self.data_parallel = data_parallel
# Large model mode: conservative defaults for 120B+ models.
# Reduces memory footprint by limiting SAE features, directions,
@@ -831,22 +828,6 @@ class AbliterationPipeline:
"""Release unused GPU/accelerator memory between pipeline stages."""
dev.free_gpu_memory()
def _can_data_parallel(self) -> bool:
"""Check if data parallelism is feasible.
Returns True when:
- data_parallel was requested
- CUDA is available with >1 GPUs
- The model is NOT already sharded across devices (device_map="auto")
"""
if not self.data_parallel:
return False
if not torch.cuda.is_available() or torch.cuda.device_count() < 2:
return False
if self.handle and hasattr(self.handle.model, "hf_device_map"):
return False # already sharded, can't replicate
return True
@staticmethod
def _get_model_device(model: nn.Module) -> torch.device:
"""Return the correct input device for a model.
@@ -1471,24 +1452,8 @@ class AbliterationPipeline:
device = self._get_model_device(model)
# ── Data parallelism: wrap model to split batches across GPUs ──
# DataParallel replicates the model on each GPU and scatters the
# input batch. Hooks fire on each replica (shared via shallow copy
# of _forward_hooks), and since they .detach().cpu().float() the
# activations, all results land in the same `activations` dict on
# CPU. list.append is GIL-protected so thread-safe. Order within
# a batch is nondeterministic across replicas, but that's fine —
# we only compute means and SVD over the collected activations.
use_dp = self._can_data_parallel()
n_gpus = torch.cuda.device_count() if use_dp else 1
if use_dp:
model = nn.DataParallel(model)
self.log(f" Data parallel: splitting batches across {n_gpus} GPUs")
# Batch prompts for throughput — hooks unbatch per-prompt activations
batch_size = 16 if free_gb > _tight_gb else 8 if free_gb > _low_gb else 1
if use_dp:
batch_size *= n_gpus
# Left-pad so position -1 is always the last real token in every batch element
orig_padding_side = getattr(tokenizer, "padding_side", "right")
if batch_size > 1:
+1 -13
View File
@@ -23,7 +23,7 @@ _BANNER = r"""
def _add_gpu_args(parser):
"""Add --gpus and --data-parallel flags for multi-GPU control."""
"""Add --gpus flag for multi-GPU control."""
gpu_group = parser.add_argument_group("GPU selection")
gpu_group.add_argument(
"--gpus", type=str, default=None, metavar="IDS",
@@ -33,15 +33,6 @@ def _add_gpu_args(parser):
"Models are automatically split across selected GPUs via accelerate."
),
)
gpu_group.add_argument(
"--data-parallel", action="store_true", default=False,
help=(
"Use data parallelism to split prompt batches across GPUs during "
"activation collection (PROBE stage). Only effective when the model "
"fits on a single GPU and multiple GPUs are available. For models "
"already sharded across GPUs (device_map='auto'), this has no effect."
),
)
def _add_remote_args(parser):
@@ -723,7 +714,6 @@ def _cmd_abliterate(args):
quantization=args.quantization,
large_model_mode=getattr(args, "large_model", False),
verify_sample_size=getattr(args, "verify_sample_size", None),
data_parallel=getattr(args, "data_parallel", False),
on_stage=on_stage,
on_log=on_log,
)
@@ -817,8 +807,6 @@ def _cmd_remote_abliterate(args):
kwargs["large_model"] = True
if getattr(args, "verify_sample_size", None) is not None:
kwargs["verify_sample_size"] = args.verify_sample_size
if getattr(args, "data_parallel", False):
kwargs["data_parallel"] = True
result_path = runner.run_obliterate(
model=args.model,
-3
View File
@@ -237,7 +237,6 @@ class RemoteRunner:
refinement_passes: int | None = None,
large_model: bool = False,
verify_sample_size: int | None = None,
data_parallel: bool = False,
) -> str:
"""Build the remote obliteratus CLI command."""
remote_output = output_dir or f"{self.config.remote_dir}/output/{model.replace('/', '_')}"
@@ -264,8 +263,6 @@ class RemoteRunner:
parts.append("--large-model")
if verify_sample_size is not None:
parts.extend(["--verify-sample-size", str(verify_sample_size)])
if data_parallel:
parts.append("--data-parallel")
return " ".join(parts)