diff --git a/README.md b/README.md index 275be60..fa6654e 100644 --- a/README.md +++ b/README.md @@ -442,6 +442,49 @@ obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct --gpus 2,5 This sets `CUDA_VISIBLE_DEVICES` before CUDA initializes. The model is then sharded across the selected GPUs. +### Precision and quantization + +The `--dtype` flag controls the precision of model weights, which directly determines how much VRAM you need. Lower precision means smaller memory footprint at the cost of some numerical fidelity: + +| Dtype | Bytes/param | 7B model | 70B model | 405B model | +|-------|-----------|---------|----------|-----------| +| `float32` | 4 | 28 GB | 280 GB | 1620 GB | +| `float16` / `bfloat16` | 2 | 14 GB | 140 GB | 810 GB | +| `int8` (via `--quantization bitsandbytes-8bit`) | 1 | 7 GB | 70 GB | 405 GB | +| `int4` (via `--quantization bitsandbytes-4bit`) | 0.5 | 3.5 GB | 35 GB | 203 GB | + +```bash +# Default: bfloat16 +obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct + +# 8-bit quantization — fits on fewer GPUs +obliteratus obliterate meta-llama/Llama-3.1-70B-Instruct \ + --quantization bitsandbytes-8bit + +# 4-bit quantization — Llama-405B on 4x A100-80GB +obliteratus obliterate meta-llama/Llama-3.1-405B-Instruct \ + --quantization bitsandbytes-4bit --dtype float16 +``` + +Quantization roughly halves the GPU count at each step down. A 70B model that needs 3x A100-80GB in bf16 fits on 2 in int8 or 1 in int4. + +### GPU calculator + +Not sure how many GPUs you need? The `gpu-calc` command estimates the minimum GPU count for any model, accounting for weight memory, activation overhead, and CUDA context: + +```bash +# Auto-detect from HuggingFace model name +obliteratus gpu-calc meta-llama/Llama-3.1-70B-Instruct --gpu-mem 24 + +# Manual: specify params and precision +obliteratus gpu-calc --params 70 --dtype bfloat16 --gpu-mem 80 + +# MoE models: specify active params separately +obliteratus gpu-calc --params 117 --active-params 13 --dtype bfloat16 --gpu-mem 80 +``` + +The calculator fetches the model config from HuggingFace to estimate parameter counts (including MoE expert structure), then shows a table of GPU configurations with headroom estimates. For MoE models, activation overhead is computed from the active parameter count rather than total parameters. + ### Pipeline parallel benchmarks We benchmarked the full abliteration pipeline across varying numbers of A100-80GB GPUs on two large models. @@ -553,9 +596,11 @@ The remote runner: | Scenario | Recommendation | |----------|---------------| | Model fits on 1 GPU | Use 1 GPU. Adding more won't help and may slow things down. | +| Model almost fits on 1 GPU | Try `--quantization bitsandbytes-8bit` or `bitsandbytes-4bit` to reduce memory. Halving precision roughly halves VRAM. | | Model fits on 1 GPU, PROBE is slow (many prompts) | Try `data-parallel-prereplication` branch. Only helps if model fits on each GPU with room for activations. | -| Model doesn't fit on 1 GPU | Use `--gpus` with the **minimum** number of GPUs that fits. E.g., a 70B model in fp16 (~140 GB) needs 2x A100-80GB — don't use 4. | -| Model needs 4+ GPUs | Pipeline parallel via `device_map="auto"` is the only option. Expect I/O-dominated runtimes for very large models. | +| Model doesn't fit on 1 GPU | Use `--gpus` with the **minimum** number of GPUs that fits. Run `obliteratus gpu-calc` to find that number. | +| Model needs 4+ GPUs | Pipeline parallel via `device_map="auto"` is the only option. Expect I/O-dominated runtimes for very large models. Consider quantization first — int4 can cut the GPU count by 4x. | +| Not sure how many GPUs you need | Run `obliteratus gpu-calc --gpu-mem ` for an estimate. | | No local GPUs | Use `--remote user@gpu-node` to run on a remote machine, or use HuggingFace Spaces / Colab. | ## 10 study presets diff --git a/obliteratus/cli.py b/obliteratus/cli.py index aa37c94..b0767ab 100644 --- a/obliteratus/cli.py +++ b/obliteratus/cli.py @@ -266,12 +266,42 @@ def main(argv: list[str] | None = None): help="Also show global cross-architecture insights", ) + # --- gpu-calc --- + calc_parser = subparsers.add_parser( + "gpu-calc", + help="Estimate minimum GPUs needed for a model", + ) + calc_parser.add_argument( + "model", type=str, nargs="?", default=None, + help="HuggingFace model name/path (auto-fetches param counts)", + ) + calc_parser.add_argument( + "--params", type=float, default=None, metavar="B", + help="Total parameters in billions (overrides auto-detection)", + ) + calc_parser.add_argument( + "--active-params", type=float, default=None, metavar="B", + help="Active parameters in billions (for MoE models; defaults to --params)", + ) + calc_parser.add_argument( + "--dtype", type=str, default="bfloat16", + choices=["float32", "float16", "bfloat16", "int8", "int4"], + help="Data type for model weights (default: bfloat16)", + ) + calc_parser.add_argument( + "--gpu-mem", type=float, default=80.0, metavar="GB", + help="VRAM per GPU in GB (default: 80 for A100-80GB)", + ) + args = parser.parse_args(argv) # Apply GPU selection early (before any CUDA init) _apply_gpu_selection(args) - if args.command == "run": + if args.command == "gpu-calc": + _cmd_gpu_calc(args) + return + elif args.command == "run": if getattr(args, "remote", None): _cmd_remote_run(args) else: @@ -765,6 +795,215 @@ def _cmd_abliterate(args): ) +def _cmd_gpu_calc(args): + import math + + from rich.panel import Panel + from rich.table import Table + + BYTES_PER_PARAM = { + "float32": 4, + "float16": 2, + "bfloat16": 2, + "int8": 1, + "int4": 0.5, + } + + # Resolve param counts + total_params_b = args.params + active_params_b = args.active_params + + if total_params_b is None: + if args.model is None: + console.print("[red]Provide either a model name or --params.[/]") + raise SystemExit(1) + console.print(f"Fetching config for [cyan]{args.model}[/]...") + try: + from transformers import AutoConfig + config = AutoConfig.from_pretrained(args.model, trust_remote_code=True) + except Exception as e: + console.print(f"[red]Could not load config: {e}[/]") + raise SystemExit(1) + + # Total params: prefer explicit num_parameters, else estimate from config + total_params_b = _estimate_total_params_b(config) + + # Active params for MoE + if active_params_b is None: + active_params_b = _estimate_active_params_b(config, total_params_b) + + if active_params_b is None: + active_params_b = total_params_b + + bpp = BYTES_PER_PARAM[args.dtype] + gpu_mem_gb = args.gpu_mem + + # Model weight memory (use base-10 GB to match HF/nvidia conventions) + weight_gb = total_params_b * bpp + + # Activation overhead during forward passes (PROBE/VERIFY). + # Scales with active params, not total. Empirical from benchmarks: + # - DeepSeek-70B (149GB): failed at 160GB (2 GPUs), OK at 240GB (3 GPUs) + # - GPT-OSS-120B (234GB): failed at 240GB (3 GPUs), OK at 320GB (4 GPUs) + # This implies ~15-35% overhead. We use 20% as a reasonable middle ground. + active_weight_gb = active_params_b * bpp + activation_overhead_gb = active_weight_gb * 0.20 + + # CUDA context + fragmentation overhead: ~1.5 GB per GPU (fixed cost) + cuda_overhead_per_gpu = 1.5 + + # Total memory needed (before splitting across GPUs) + total_needed_gb = weight_gb + activation_overhead_gb + + # Find minimum GPUs: we need total_needed / (gpu_mem - cuda_overhead) GPUs + usable_per_gpu = gpu_mem_gb - cuda_overhead_per_gpu + if usable_per_gpu <= 0: + console.print("[red]GPU memory too small after CUDA overhead.[/]") + raise SystemExit(1) + + min_gpus = math.ceil(total_needed_gb / usable_per_gpu) + min_gpus = max(min_gpus, 1) + + # Show results for a range of GPU counts + is_moe = active_params_b < total_params_b * 0.99 + + table = Table(title="GPU Configurations", show_edge=True) + table.add_column("GPUs", justify="right", style="cyan") + table.add_column("VRAM/GPU", justify="right") + table.add_column("Total VRAM", justify="right") + table.add_column("Headroom", justify="right") + table.add_column("Verdict", min_width=20) + + # Show from min_gpus-1 (to show why it fails) up to 8 + low = max(1, min_gpus - 1) + high = max(min_gpus + 3, 8) + for n in range(low, high + 1): + total_vram = n * gpu_mem_gb + usable_vram = n * usable_per_gpu + headroom = usable_vram - total_needed_gb + headroom_pct = headroom / total_needed_gb * 100 + vram_per = total_needed_gb / n + + if headroom < 0: + verdict = "[red]INSUFFICIENT[/]" + elif headroom_pct < 15: + verdict = "[yellow]TIGHT — may fail[/]" + elif n == min_gpus: + verdict = "[bold green]MINIMUM (recommended)[/]" + else: + verdict = "[green]OK[/] [dim](more GPUs = slower)[/]" + + table.add_row( + str(n), + f"{vram_per:.1f} GB", + f"{total_vram:.0f} GB", + f"{headroom:+.1f} GB ({headroom_pct:+.0f}%)", + verdict, + ) + + model_label = args.model or f"{total_params_b:.1f}B params" + moe_line = "" + if is_moe: + moe_line = f"\n Active params: [cyan]{active_params_b:.1f}B[/] ({active_params_b/total_params_b*100:.0f}% of total — MoE)" + + console.print(Panel( + f" Model: [cyan]{model_label}[/]\n" + f" Total params: [cyan]{total_params_b:.1f}B[/]" + f"{moe_line}\n" + f" Dtype: [cyan]{args.dtype}[/] ({bpp} bytes/param)\n" + f" Weight memory: [cyan]{weight_gb:.1f} GB[/]\n" + f" Activation est: [cyan]{activation_overhead_gb:.1f} GB[/]\n" + f" Total needed: [bold]{total_needed_gb:.1f} GB[/]\n" + f" GPU VRAM: [cyan]{gpu_mem_gb:.0f} GB[/] per device", + title="[bold]GPU Calculator[/]", + border_style="cyan", + )) + console.print(table) + console.print( + f"\n [bold green]Minimum GPUs: {min_gpus}[/]" + f" ({min_gpus} x {gpu_mem_gb:.0f} GB = {min_gpus * gpu_mem_gb:.0f} GB)\n" + ) + console.print( + "[dim]Note: fewer GPUs = faster (pipeline parallel has cross-device overhead).\n" + "Estimates are conservative. Actual memory may vary with sequence length\n" + "and model architecture. See 'obliteratus obliterate --help' for runtime options.[/]\n" + ) + + +def _estimate_total_params_b(config) -> float: + """Estimate total parameter count in billions from a HuggingFace config.""" + # Some configs have explicit param counts + for attr in ("num_parameters", "n_params"): + val = getattr(config, attr, None) + if val and val > 1000: + return val / 1e9 + + # Estimate from architecture dimensions + h = getattr(config, "hidden_size", 0) + L = getattr(config, "num_hidden_layers", 0) + V = getattr(config, "vocab_size", 0) + i = getattr(config, "intermediate_size", h * 4) + + if h == 0 or L == 0: + console.print("[red]Cannot determine model size from config. Use --params.[/]") + raise SystemExit(1) + + n_heads = getattr(config, "num_attention_heads", None) or (h // 128) + head_dim = getattr(config, "head_dim", None) or (h // n_heads if n_heads else 128) + kv_heads = getattr(config, "num_key_value_heads", None) or n_heads + + # Attention: Q + K + V projections + output projection + attn_params = h * (n_heads * head_dim) + h * (kv_heads * head_dim) * 2 + (n_heads * head_dim) * h + + # FFN (MoE or dense) + n_experts = getattr(config, "num_local_experts", getattr(config, "num_experts", 1)) or 1 + # MoE models often have a separate intermediate size for expert FFNs + moe_i = getattr(config, "moe_intermediate_size", i) + # gate + up + down projections per expert + ffn_per_expert = h * moe_i * 3 + ffn_params = ffn_per_expert * n_experts + # Some architectures (Qwen, DeepSeek) also have a shared/dense FFN per layer + if n_experts > 1 and hasattr(config, "moe_intermediate_size"): + # The dense FFN uses the main intermediate_size + ffn_params += h * i * 3 + # Router + if n_experts > 1: + ffn_params += h * n_experts + + # Per-layer: attention + FFN + layernorms + layer_params = attn_params + ffn_params + h * 4 # 2 layernorms, 2 params each + + # Embedding + LM head + embed_params = V * h * 2 # input + output embeddings (may be tied but counts for memory) + + total = L * layer_params + embed_params + return total / 1e9 + + +def _estimate_active_params_b(config, total_params_b: float) -> float: + """For MoE models, estimate active parameters per forward pass.""" + n_experts = getattr(config, "num_local_experts", getattr(config, "num_experts", 1)) or 1 + if n_experts <= 1: + return total_params_b + + top_k = getattr(config, "num_experts_per_tok", getattr(config, "top_k", 2)) or 2 + + h = getattr(config, "hidden_size", 0) + i = getattr(config, "intermediate_size", h * 4) + moe_i = getattr(config, "moe_intermediate_size", i) + L = getattr(config, "num_hidden_layers", 0) + + # FFN per expert (uses moe_intermediate_size if available) + ffn_per_expert = h * moe_i * 3 + # Active FFN = top_k experts instead of all n_experts + ffn_all = ffn_per_expert * n_experts * L + ffn_active = ffn_per_expert * top_k * L + # Non-FFN params (includes any shared/dense FFN) + non_ffn = total_params_b * 1e9 - ffn_all + active = non_ffn + ffn_active + return max(active / 1e9, 0.1) + + def _make_remote_runner(args): """Create a RemoteRunner from CLI --remote flags.""" from obliteratus.remote import RemoteConfig, RemoteRunner