mirror of
https://github.com/elder-plinius/OBLITERATUS.git
synced 2026-06-07 14:53:53 +02:00
Revert "Add data parallel support for PROBE stage"
This reverts commit 1a6e2577bb.
This commit is contained in:
@@ -197,77 +197,6 @@ obliteratus aggregate --format summary
|
||||
obliteratus aggregate --format latex --metric refusal_rate --min-runs 3
|
||||
```
|
||||
|
||||
#### Multi-GPU support
|
||||
|
||||
OBLITERATUS supports two multi-GPU modes:
|
||||
|
||||
**Model parallelism** (automatic): Models that exceed single-GPU VRAM are automatically sharded across all available GPUs using accelerate's `device_map="auto"`, which places different layers on different devices. Only one GPU computes at a time during the forward pass — the benefit is fitting larger models (e.g., a 117B model across 8x A100-80GB), not faster processing.
|
||||
|
||||
**Data parallelism** (`--data-parallel`): When the model fits on a single GPU, `--data-parallel` replicates it across all available GPUs and splits the prompt batches across them during the PROBE stage (activation collection). This provides a near-linear speedup for the most time-consuming pipeline stage. If the model is already sharded via `device_map="auto"`, data parallelism is not available (a sharded model can't be replicated onto each GPU), so the flag has no effect.
|
||||
|
||||
```bash
|
||||
# Data parallel: replicate model across GPUs, split batches (faster PROBE stage)
|
||||
obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --data-parallel
|
||||
|
||||
# Combine with GPU selection
|
||||
obliteratus obliterate mistralai/Mistral-7B-Instruct-v0.3 --data-parallel --gpus 0,1,2,3
|
||||
```
|
||||
|
||||
To select specific GPUs instead of using all available:
|
||||
|
||||
```bash
|
||||
# Use only GPUs 0, 1, 2, 3
|
||||
obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct --gpus 0,1,2,3
|
||||
|
||||
# Works with any command
|
||||
obliteratus run my_study.yaml --gpus 0,1
|
||||
obliteratus tourney some-model --gpus 2,3
|
||||
```
|
||||
|
||||
The `--gpus` flag sets `CUDA_VISIBLE_DEVICES` before CUDA initialization. When omitted, all GPUs are used.
|
||||
|
||||
#### Remote execution via SSH
|
||||
|
||||
Run any `obliteratus` command on a remote GPU machine from your laptop. OBLITERATUS handles SSH connectivity, auto-installation on the remote, GPU detection, live log streaming, and copying results back via SCP.
|
||||
|
||||
```bash
|
||||
# Abliterate a model on a remote GPU node
|
||||
obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct \
|
||||
--remote user@gpu-node \
|
||||
--ssh-key ~/.ssh/id_rsa
|
||||
|
||||
# All remote flags
|
||||
obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct \
|
||||
--remote root@gpu-node \
|
||||
--ssh-key ~/.ssh/id_rsa \
|
||||
--ssh-port 22 \
|
||||
--remote-dir /tmp/obliteratus_run \
|
||||
--remote-python /opt/conda/bin/python3 \
|
||||
--gpus 0,1,2,3 \
|
||||
--no-sync # keep results on remote, don't copy back
|
||||
|
||||
# Run a YAML study remotely
|
||||
obliteratus run my_study.yaml --remote root@gpu-node --ssh-key ~/.ssh/id_rsa
|
||||
|
||||
# Tournaments work too
|
||||
obliteratus tourney meta-llama/Llama-3.1-8B-Instruct --remote root@gpu-node
|
||||
```
|
||||
|
||||
Remote execution can also be configured via the YAML `remote:` section (see `examples/remote_gpu_node.yaml`):
|
||||
|
||||
```yaml
|
||||
remote:
|
||||
host: gpu-node.example.com
|
||||
user: root
|
||||
ssh_key: ~/.ssh/id_rsa
|
||||
remote_dir: /tmp/obliteratus_run
|
||||
python: python3
|
||||
sync_results: true
|
||||
gpus: "0,1,2,3"
|
||||
```
|
||||
|
||||
The remote machine needs CUDA-capable GPUs and a Python environment. OBLITERATUS will be auto-installed from GitHub if it is not already present.
|
||||
|
||||
### 5. Python API (full programmatic control)
|
||||
|
||||
For researchers who want to integrate OBLITERATUS into their own pipelines:
|
||||
|
||||
@@ -647,8 +647,6 @@ class AbliterationPipeline:
|
||||
max_seq_length: int | None = None,
|
||||
# Verify stage sample size
|
||||
verify_sample_size: int | None = None,
|
||||
# Data parallelism
|
||||
data_parallel: bool = False,
|
||||
on_stage: Callable[[StageResult], None] | None = None,
|
||||
on_log: Callable[[str], None] | None = None,
|
||||
):
|
||||
@@ -752,7 +750,6 @@ class AbliterationPipeline:
|
||||
# refusal rate measurement. Default 30 gives ~3.3% resolution;
|
||||
# increase for tighter confidence intervals (reviewer feedback).
|
||||
self.verify_sample_size = verify_sample_size if verify_sample_size is not None else 30
|
||||
self.data_parallel = data_parallel
|
||||
|
||||
# Large model mode: conservative defaults for 120B+ models.
|
||||
# Reduces memory footprint by limiting SAE features, directions,
|
||||
@@ -831,22 +828,6 @@ class AbliterationPipeline:
|
||||
"""Release unused GPU/accelerator memory between pipeline stages."""
|
||||
dev.free_gpu_memory()
|
||||
|
||||
def _can_data_parallel(self) -> bool:
|
||||
"""Check if data parallelism is feasible.
|
||||
|
||||
Returns True when:
|
||||
- data_parallel was requested
|
||||
- CUDA is available with >1 GPUs
|
||||
- The model is NOT already sharded across devices (device_map="auto")
|
||||
"""
|
||||
if not self.data_parallel:
|
||||
return False
|
||||
if not torch.cuda.is_available() or torch.cuda.device_count() < 2:
|
||||
return False
|
||||
if self.handle and hasattr(self.handle.model, "hf_device_map"):
|
||||
return False # already sharded, can't replicate
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def _get_model_device(model: nn.Module) -> torch.device:
|
||||
"""Return the correct input device for a model.
|
||||
@@ -1471,24 +1452,8 @@ class AbliterationPipeline:
|
||||
|
||||
device = self._get_model_device(model)
|
||||
|
||||
# ── Data parallelism: wrap model to split batches across GPUs ──
|
||||
# DataParallel replicates the model on each GPU and scatters the
|
||||
# input batch. Hooks fire on each replica (shared via shallow copy
|
||||
# of _forward_hooks), and since they .detach().cpu().float() the
|
||||
# activations, all results land in the same `activations` dict on
|
||||
# CPU. list.append is GIL-protected, so it is thread-safe. Order within
|
||||
# a batch is nondeterministic across replicas, but that's fine —
|
||||
# we only compute means and SVD over the collected activations.
|
||||
use_dp = self._can_data_parallel()
|
||||
n_gpus = torch.cuda.device_count() if use_dp else 1
|
||||
if use_dp:
|
||||
model = nn.DataParallel(model)
|
||||
self.log(f" Data parallel: splitting batches across {n_gpus} GPUs")
|
||||
|
||||
# Batch prompts for throughput — hooks unbatch per-prompt activations
|
||||
batch_size = 16 if free_gb > _tight_gb else 8 if free_gb > _low_gb else 1
|
||||
if use_dp:
|
||||
batch_size *= n_gpus
|
||||
# Left-pad so position -1 is always the last real token in every batch element
|
||||
orig_padding_side = getattr(tokenizer, "padding_side", "right")
|
||||
if batch_size > 1:
|
||||
|
||||
+1
-13
@@ -23,7 +23,7 @@ _BANNER = r"""
|
||||
|
||||
|
||||
def _add_gpu_args(parser):
|
||||
"""Add --gpus and --data-parallel flags for multi-GPU control."""
|
||||
"""Add --gpus flag for multi-GPU control."""
|
||||
gpu_group = parser.add_argument_group("GPU selection")
|
||||
gpu_group.add_argument(
|
||||
"--gpus", type=str, default=None, metavar="IDS",
|
||||
@@ -33,15 +33,6 @@ def _add_gpu_args(parser):
|
||||
"Models are automatically split across selected GPUs via accelerate."
|
||||
),
|
||||
)
|
||||
gpu_group.add_argument(
|
||||
"--data-parallel", action="store_true", default=False,
|
||||
help=(
|
||||
"Use data parallelism to split prompt batches across GPUs during "
|
||||
"activation collection (PROBE stage). Only effective when the model "
|
||||
"fits on a single GPU and multiple GPUs are available. For models "
|
||||
"already sharded across GPUs (device_map='auto'), this has no effect."
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def _add_remote_args(parser):
|
||||
@@ -723,7 +714,6 @@ def _cmd_abliterate(args):
|
||||
quantization=args.quantization,
|
||||
large_model_mode=getattr(args, "large_model", False),
|
||||
verify_sample_size=getattr(args, "verify_sample_size", None),
|
||||
data_parallel=getattr(args, "data_parallel", False),
|
||||
on_stage=on_stage,
|
||||
on_log=on_log,
|
||||
)
|
||||
@@ -817,8 +807,6 @@ def _cmd_remote_abliterate(args):
|
||||
kwargs["large_model"] = True
|
||||
if getattr(args, "verify_sample_size", None) is not None:
|
||||
kwargs["verify_sample_size"] = args.verify_sample_size
|
||||
if getattr(args, "data_parallel", False):
|
||||
kwargs["data_parallel"] = True
|
||||
|
||||
result_path = runner.run_obliterate(
|
||||
model=args.model,
|
||||
|
||||
@@ -237,7 +237,6 @@ class RemoteRunner:
|
||||
refinement_passes: int | None = None,
|
||||
large_model: bool = False,
|
||||
verify_sample_size: int | None = None,
|
||||
data_parallel: bool = False,
|
||||
) -> str:
|
||||
"""Build the remote obliteratus CLI command."""
|
||||
remote_output = output_dir or f"{self.config.remote_dir}/output/{model.replace('/', '_')}"
|
||||
@@ -264,8 +263,6 @@ class RemoteRunner:
|
||||
parts.append("--large-model")
|
||||
if verify_sample_size is not None:
|
||||
parts.extend(["--verify-sample-size", str(verify_sample_size)])
|
||||
if data_parallel:
|
||||
parts.append("--data-parallel")
|
||||
|
||||
return " ".join(parts)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user