OBLITERATUS/scripts/run_benchmark_remote.sh

#!/usr/bin/env bash
# ─────────────────────────────────────────────────────────────────────────────
# OBLITERATUS Remote Benchmark Runner
#
# One-command benchmark on your HuggingFace Space GPU.
#
# Usage:
#   ./scripts/run_benchmark_remote.sh                  # defaults: Qwen 0.5B, all methods
#   ./scripts/run_benchmark_remote.sh --model Qwen/Qwen2.5-1.5B-Instruct
#   ./scripts/run_benchmark_remote.sh --model openai/gpt-oss-20b
#   ./scripts/run_benchmark_remote.sh --models "Qwen/Qwen2.5-0.5B-Instruct openai/gpt-oss-20b"
#   ./scripts/run_benchmark_remote.sh --methods "basic advanced surgical"
#   ./scripts/run_benchmark_remote.sh --prompts 33     # use 33/66/99 prompts per side
#   ./scripts/run_benchmark_remote.sh --dry-run        # print the command, don't execute
#   ./scripts/run_benchmark_remote.sh --verbose        # show SSH debug output
# ─────────────────────────────────────────────────────────────────────────────
set -euo pipefail

# ── Defaults ─────────────────────────────────────────────────────────────────
# Each setting falls back to an OBLITERATUS_* environment variable when one is
# set; the command-line flags parsed below take precedence over both.
SSH_KEY="${OBLITERATUS_SSH_KEY:-$HOME/.ssh/hf_obliteratus}"
SSH_HOST="${OBLITERATUS_SSH_HOST:-}"
MODEL="${OBLITERATUS_MODEL:-Qwen/Qwen2.5-0.5B-Instruct}"
MODELS=""
METHODS="${OBLITERATUS_METHODS:-basic advanced aggressive surgical inverted nuclear}"
PROMPTS="${OBLITERATUS_PROMPTS:-33}"
DRY_RUN=false
VERBOSE=false

# ── Parse args ───────────────────────────────────────────────────────────────
#######################################
# Fail with a readable message when a value-taking option has no value.
# Without this check `set -u` aborts on "$2" with a bare "unbound variable".
# Arguments: $1 - option name
#            $2 - number of words left on the command line (option included)
# Outputs:   error message on stderr when the value is missing
# Returns:   0 when a value is present; exits 1 otherwise
#######################################
require_value() {
  if (( $2 < 2 )); then
    echo "ERROR: $1 requires a value" >&2
    exit 1
  fi
}

#######################################
# Parse command-line flags into the configuration globals.
# Globals:   MODEL, MODELS, METHODS, PROMPTS, SSH_KEY, SSH_HOST,
#            DRY_RUN, VERBOSE (all written)
# Arguments: the script's command-line words
# Outputs:   usage excerpt on stdout for -h/--help; errors on stderr
# Returns:   0 on success; exits 0 after --help, 1 on a bad argument
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      # --model clears MODELS so a later single model wins over an earlier list.
      --model)    require_value "$1" "$#"; MODEL="$2"; MODELS=""; shift 2 ;;
      --models)   require_value "$1" "$#"; MODELS="$2";           shift 2 ;;
      --methods)  require_value "$1" "$#"; METHODS="$2";          shift 2 ;;
      --prompts)  require_value "$1" "$#"; PROMPTS="$2";          shift 2 ;;
      --key)      require_value "$1" "$#"; SSH_KEY="$2";          shift 2 ;;
      --host)     require_value "$1" "$#"; SSH_HOST="$2";         shift 2 ;;
      --dry-run)  DRY_RUN=true;          shift ;;
      --verbose|-v) VERBOSE=true;        shift ;;
      -h|--help)
        # Print a slice of this file's own header comment as the help text.
        head -15 "$0" | tail -11
        exit 0
        ;;
      *)
        echo "Unknown arg: $1" >&2; exit 1 ;;
    esac
  done
}

parse_args "$@"

# If --models not set, use single --model
if [[ -z "$MODELS" ]]; then
  MODELS="$MODEL"
fi

# ── Validate SSH host ──────────────────────────────────────────────────────
# Diagnostics go to stderr so they stay visible (and out of the data stream)
# when stdout is captured or piped; this matches the other error paths in
# this script, which already use >&2.
if [[ -z "$SSH_HOST" ]]; then
  {
    echo "ERROR: SSH_HOST not configured."
    echo ""
    echo "Set your HF Space SSH host:"
    echo "  1. export OBLITERATUS_SSH_HOST=your-username-spacename@ssh.hf.space"
    echo "  2. Or pass --host your-username-spacename@ssh.hf.space"
  } >&2
  exit 1
fi

# ── Validate SSH key ────────────────────────────────────────────────────────
# -f requires the key path to exist and be a regular file.
if [[ ! -f "$SSH_KEY" ]]; then
  {
    echo "ERROR: SSH key not found at $SSH_KEY"
    echo ""
    echo "Either:"
    echo "  1. Place your HF Space SSH key at ~/.ssh/hf_obliteratus"
    echo "  2. Set OBLITERATUS_SSH_KEY=/path/to/key"
    echo "  3. Pass --key /path/to/key"
  } >&2
  exit 1
fi

# Summary of the effective configuration (normal output, stays on stdout).
echo "╔══════════════════════════════════════════════════════════════╗"
echo "║          OBLITERATUS — Remote GPU Benchmark                 ║"
echo "╠══════════════════════════════════════════════════════════════╣"
echo "║  Host:     $SSH_HOST"
echo "║  Models:   $MODELS"
echo "║  Methods:  $METHODS"
echo "║  Prompts:  $PROMPTS per side"
echo "║  SSH key:  $SSH_KEY"
echo "╚══════════════════════════════════════════════════════════════╝"
echo ""

# ── Build the Python benchmark script to run remotely ────────────────────────
# `read -r -d ''` reads the whole heredoc into REMOTE_SCRIPT in one go. It
# returns non-zero because it reaches end-of-input without finding a NUL
# delimiter, so `|| true` keeps `set -e` from aborting the script here.
# The quoted 'PYEOF' delimiter disables shell expansion: everything up to the
# closing PYEOF is taken literally as Python source.
read -r -d '' REMOTE_SCRIPT << 'PYEOF' || true
import json, sys, time, shutil, gc, os
# setdefault leaves any value already present in the remote environment
# untouched. Both variables are set before torch is imported below.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")

import torch
import torch.nn as nn

# Add app dir to path (HF Space layout: /home/user/app)
# APP_DIR in the remote environment overrides the default location.
sys.path.insert(0, os.environ.get("APP_DIR", "/home/user/app"))

# ── Hotpatch: fix device detection for accelerate device_map="auto" ──────
# The deployed Space code uses next(model.parameters()).device which is
# unreliable when accelerate distributes params across devices.
import obliteratus.abliterate as _abl

@staticmethod
def _get_model_device(model):
    """Find the correct input device (embedding layer) for accelerate models."""
    if hasattr(model, "hf_device_map"):
        try:
            embed = model.get_input_embeddings()
            return next(embed.parameters()).device
        except (StopIteration, AttributeError):
            for p in model.parameters():
                if p.device.type != "meta":
                    return p.device
            return torch.device("cpu")
    return next(model.parameters()).device

_abl.AbliterationPipeline._get_model_device = _get_model_device

# Patch _collect_activations to use the fixed device detection
_orig_collect = _abl.AbliterationPipeline._collect_activations.__code__
import types

def _patched_collect(self, layer_modules, prompts, label):
    """Run every prompt through the model and record last-token activations.

    A forward hook on each entry of ``layer_modules`` stores the final
    position of that layer's output (moved to CPU as float32). Returns a dict
    mapping layer index to the list of recorded tensors, one per prompt.
    Inputs are placed on the device reported by ``self._get_model_device``.
    """
    layer_count = len(layer_modules)
    activations = {layer_idx: [] for layer_idx in range(layer_count)}

    def _recorder(layer_idx):
        # Bind layer_idx per hook so each layer appends to its own list.
        def _on_forward(module, input, output):
            hidden = output[0] if isinstance(output, tuple) else output
            activations[layer_idx].append(hidden[:, -1, :].detach().cpu().float())
        return _on_forward

    hook_handles = [
        layer_modules[layer_idx].register_forward_hook(_recorder(layer_idx))
        for layer_idx in range(layer_count)
    ]

    model = self.handle.model
    tokenizer = self.handle.tokenizer

    # Shorten the token window when free GPU memory (summed over all visible
    # devices) is below thresholds scaled by model size.
    max_length = 256
    if torch.cuda.is_available():
        bytes_per_gb = 1024 ** 3
        free_gb = sum(
            torch.cuda.mem_get_info(dev_idx)[0] / bytes_per_gb
            for dev_idx in range(torch.cuda.device_count())
        )
        # Scale thresholds by model size (baseline: 7B with hidden=4096, 32 layers)
        hidden_size = self.handle.hidden_size if self.handle else 4096
        depth = layer_count if layer_count else 32
        size_factor = (hidden_size / 4096) * (depth / 32)
        tight_gb = max(4.0 * size_factor, 0.5)
        low_gb = max(2.0 * size_factor, 0.25)
        if free_gb < low_gb:
            max_length = 64
            self.log(f"  Low GPU memory ({free_gb:.1f} GB free, threshold {low_gb:.1f} GB), using max_length={max_length}")
        elif free_gb < tight_gb:
            max_length = 128
            self.log(f"  Tight GPU memory ({free_gb:.1f} GB free, threshold {tight_gb:.1f} GB), using max_length={max_length}")

    device = self._get_model_device(model)

    try:
        for position, prompt in enumerate(prompts, start=1):
            self.log(f"  [{label}] prompt {position}/{len(prompts)}")
            batch = tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            )
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            with torch.no_grad():
                model(**batch)
            # Drop the device tensors before asking the pipeline to free memory.
            del batch
            self._free_gpu_memory()
    finally:
        # Always detach the hooks, even when a forward pass fails.
        for handle in hook_handles:
            handle.remove()

    return activations

_abl.AbliterationPipeline._collect_activations = _patched_collect
print("[hotpatch] Device detection fix applied")
# ── End hotpatch ─────────────────────────────────────────────────────────

# ── Hotpatch: nuclear mode tuning ─────────────────────────────────────────
# The deployed Space code has stale nuclear defaults.  Override them here
# so the benchmark exercises the latest tuning without redeploying.
import math as _math

# 1. Updated method configs (read at __init__ time)
_abl.METHODS["nuclear"].update({
    "n_directions": 4,
    "reflection_strength": 1.25,
    "embed_regularization": 0.50,
    "steering_strength": 0.15,
    "safety_neuron_masking": False,
})
_abl.METHODS["inverted"]["safety_neuron_masking"] = False

# 2. Cap layers for inversion modes (40% of total) — post-distill
_orig_distill = _abl.AbliterationPipeline._distill_refusal_subspace
def _patched_distill(self):
    """Run the original distillation, then trim its output for inversion runs.

    Only when ``self.invert_refusal`` is set and at least one strong layer was
    found:
      * ``self._strong_layers`` is cut to its first ``max(3, 40% of layers)``
        entries;
      * every entry of ``self._sae_directions`` is cut to its first ``n_sae``
        rows (4 when ``reflection_strength < 2.0``, otherwise 6).
    """
    _orig_distill(self)
    if self.invert_refusal and self._strong_layers:
        try:
            n_total = len(_abl.get_layer_modules(self.handle))
        except Exception:
            # Layer count unavailable: fall back to a fixed assumption.
            n_total = 24
        max_layers = max(3, int(n_total * 0.40))
        if len(self._strong_layers) > max_layers:
            old_count = len(self._strong_layers)
            self._strong_layers = self._strong_layers[:max_layers]
            self.log(f"  [hotpatch] Capped {old_count} -> {max_layers} layers for inversion (40% of {n_total})")
        # Truncate SAE directions: 4 features for nuclear, 6 for inverted
        n_sae = 4 if self.reflection_strength < 2.0 else 6
        for idx in list(self._sae_directions.keys()):
            dirs = self._sae_directions[idx]
            if dirs.shape[0] > n_sae:
                self._sae_directions[idx] = dirs[:n_sae]
        if self._sae_directions:
            self.log(f"  [hotpatch] SAE features capped to {n_sae} per layer")
_abl.AbliterationPipeline._distill_refusal_subspace = _patched_distill

# This is a plain string, not a %-format template, so the percent sign must be
# written once; "%%" would be printed literally as two characters.
print("[hotpatch] Nuclear tuning: 4 dirs, 1.25x reflect, no neuron mask, 40% layer cap, 4 SAE features")
# ── End nuclear hotpatch ──────────────────────────────────────────────────

from obliteratus.abliterate import AbliterationPipeline, METHODS, HARMFUL_PROMPTS, HARMLESS_PROMPTS

# Run parameters arrive as whitespace-separated environment variables set on
# the remote command line.
MODELS_LIST = os.environ["BENCH_MODELS"].split()
METHODS_LIST = os.environ["BENCH_METHODS"].split()
N_PROMPTS = int(os.environ["BENCH_PROMPTS"])

_banner_rule = "=" * 60

print(f"\n{_banner_rule}")
print("OBLITERATUS BENCHMARK")
print(_banner_rule)
print(f"Models:   {MODELS_LIST}")
print(f"Methods:  {METHODS_LIST}")
print(f"Prompts:  {N_PROMPTS} per side")
if not torch.cuda.is_available():
    print("GPU:      NONE (CPU only)")
else:
    # Report device 0 only; sizes are shown in decimal gigabytes (1e9 bytes).
    gpu_name = torch.cuda.get_device_name(0)
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    free_gb = torch.cuda.mem_get_info(0)[0] / 1e9
    print(f"GPU:      {gpu_name} ({total_gb:.1f} GB total, {free_gb:.1f} GB free)")
print(f"{_banner_rule}\n")

# Use the first N_PROMPTS entries of each prompt set.
harmful = HARMFUL_PROMPTS[:N_PROMPTS]
harmless = HARMLESS_PROMPTS[:N_PROMPTS]

all_results = []


def _reclaim_memory(reset_peak=False):
    """Collect garbage and release cached CUDA memory.

    With ``reset_peak`` the CUDA peak-memory counter is also reset so the next
    run's peak is measured from a clean baseline.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        if reset_peak:
            torch.cuda.reset_peak_memory_stats()


def _print_model_summary(model_name, rows):
    """Print the per-model results table, one line per benchmarked method."""
    print(f"\n{'=' * 60}")
    print(f"RESULTS: {model_name}")
    print(f"{'Method':<12} {'Time':>8} {'PPL':>10} {'Coher':>8} {'Refusal':>8} {'GPU MB':>8}")
    print(" ".join("─" * width for width in (12, 8, 10, 8, 8, 8)))
    for row in rows:
        lead = f"{row['method']:<12} {row['time_seconds']:>7.1f}s "
        if "error" in row:
            print(f"{lead}{'FAILED':>10}")
            continue
        quality = row.get("quality", {})
        ppl = quality.get("perplexity")
        coh = quality.get("coherence")
        ref = quality.get("refusal_rate")
        gpu_mb = row.get("peak_gpu_mb")
        # Missing metrics are rendered as N/A rather than failing the table.
        ppl_cell = f"{ppl:.2f}" if ppl is not None else "N/A"
        coh_cell = f"{coh:.0%}" if coh is not None else "N/A"
        ref_cell = f"{ref:.0%}" if ref is not None else "N/A"
        gpu_cell = gpu_mb if gpu_mb is not None else "N/A"
        print(f"{lead}{ppl_cell:>10} {coh_cell:>8} {ref_cell:>8} {gpu_cell:>8}")
    print("=" * 60)


for model_name in MODELS_LIST:
    _model_rule = "═" * 60
    print(f"\n{_model_rule}")
    print(f"MODEL: {model_name}")
    print(_model_rule)

    model_results = []

    for method in METHODS_LIST:
        if method not in METHODS:
            print(f"SKIP unknown method: {method}")
            continue

        method_label = METHODS[method]["label"]
        _method_rule = "─" * 60
        print(f"\n{_method_rule}")
        print(f"METHOD: {method} — {method_label}")
        print(_method_rule)

        # Clean slate
        _reclaim_memory(reset_peak=True)

        outdir = f"/tmp/obliteratus_bench_{method}"
        started = time.time()
        pipeline = None

        try:
            pipeline = AbliterationPipeline(
                model_name=model_name,
                output_dir=outdir,
                device="auto",
                dtype="float16",
                method=method,
                harmful_prompts=harmful,
                harmless_prompts=harmless,
                on_log=lambda msg: print(f"  {msg}"),
            )
            pipeline.run()
            elapsed = time.time() - started

            record = {
                "model": model_name,
                "method": method,
                "label": method_label,
                "time_seconds": round(elapsed, 1),
                "quality": pipeline._quality_metrics,
                "strong_layers": pipeline._strong_layers,
                "n_strong_layers": len(pipeline._strong_layers),
                "n_directions": pipeline.n_directions,
            }
            if torch.cuda.is_available():
                record["peak_gpu_mb"] = round(torch.cuda.max_memory_allocated() / 1e6, 1)

            model_results.append(record)

            print(f"\n  ✓ {method} complete in {elapsed:.1f}s")
            print(f"    Quality: {json.dumps(pipeline._quality_metrics, default=str)}")

        except Exception as exc:
            # A failing method is recorded and reported; the benchmark moves on
            # to the next method instead of aborting.
            elapsed = time.time() - started
            model_results.append({
                "model": model_name,
                "method": method,
                "label": METHODS.get(method, {}).get("label", method),
                "time_seconds": round(elapsed, 1),
                "error": str(exc),
            })
            print(f"\n  ✗ {method} FAILED after {elapsed:.1f}s: {exc}")
            import traceback
            traceback.print_exc()

        # Cleanup saved model to free disk
        shutil.rmtree(outdir, ignore_errors=True)

        # Force cleanup between runs
        if pipeline is not None:
            del pipeline
        _reclaim_memory()

    all_results.extend(model_results)

    # Summary table for this model
    _print_model_summary(model_name, model_results)

# Final JSON dump
print(f"\n\n{'=' * 60}")
print("ALL BENCHMARK RESULTS (JSON)")
print("=" * 60)
print("```json")
print(json.dumps(all_results, indent=2, default=str))
print("```")
PYEOF

# ── SSH options ──────────────────────────────────────────────────────────────
# Options shared by the pre-flight check and the real run. Host-key checking
# is disabled and no known_hosts file is kept; keep-alives are sent every 60s
# and the connection is dropped after 10 unanswered ones.
SSH_OPTS=(
  -i "$SSH_KEY"
  -o StrictHostKeyChecking=no
  -o UserKnownHostsFile=/dev/null
  -o ConnectTimeout=30
  -o ServerAliveInterval=60
  -o ServerAliveCountMax=10
)

if $VERBOSE; then
  SSH_OPTS+=( -v )
fi

# ── Pre-flight: verify SSH connectivity ─────────────────────────────────────
echo "Checking SSH connectivity..."
# Capture ssh's stderr in a private, unpredictably named file: a fixed name in
# /tmp can collide between concurrent runs and can be pre-created by another
# local user.
SSH_DEBUG_LOG=$(mktemp "${TMPDIR:-/tmp}/obliteratus_ssh_debug.XXXXXX")
if ! ssh "${SSH_OPTS[@]}" "$SSH_HOST" "echo 'SSH_OK'" 2>"$SSH_DEBUG_LOG"; then
  # Diagnostics go to stderr, like the script's other error paths.
  {
    echo ""
    echo "ERROR: SSH connection failed!"
    echo ""
    echo "Debug output:"
    cat "$SSH_DEBUG_LOG"
    echo ""
    echo "Troubleshooting checklist:"
    echo "  1. Is Dev Mode enabled on your HF Space?"
    echo "     → Check your Space's Settings tab (Dev Mode must be ON)"
    echo "  2. Is the Space awake (not sleeping/building)?"
    echo "     → Visit the Space URL and wait for the UI to load"
    echo "  3. Is your SSH public key added to your HF profile?"
    echo "     → https://huggingface.co/settings/keys"
    echo "     → Run: cat ${SSH_KEY}.pub"
    echo "  4. Are key permissions correct?"
    echo "     → Run: chmod 600 $SSH_KEY"
    echo "  5. Try manually:"
    echo "     → ssh -v -i $SSH_KEY $SSH_HOST echo hello"
    echo ""
  } >&2
  rm -f -- "$SSH_DEBUG_LOG"
  exit 1
fi
rm -f -- "$SSH_DEBUG_LOG"
echo "SSH connection verified ✓"
echo ""

# ── Build SSH command ────────────────────────────────────────────────────────
# Write the Python script to a temp file and feed it to the remote python3 on
# stdin, instead of passing via -c (avoids command-line length limits and
# shell escaping issues).
# The template ends in X's with no suffix: BSD/macOS mktemp only substitutes
# trailing X's, so a ".py" suffix would yield a fixed, predictable file name.
REMOTE_SCRIPT_FILE=$(mktemp "${TMPDIR:-/tmp}/obliteratus_bench_XXXXXX")
printf '%s\n' "$REMOTE_SCRIPT" > "$REMOTE_SCRIPT_FILE"

# Remove the temp script on any exit path. The path is read when the trap
# fires, so it is quoted safely regardless of its contents.
cleanup_remote_script() {
  rm -f -- "$REMOTE_SCRIPT_FILE"
}
trap cleanup_remote_script EXIT

if $DRY_RUN; then
  echo "[DRY RUN] Would execute:"
  echo "  ssh ${SSH_OPTS[*]} $SSH_HOST 'BENCH_MODELS=... python3 -u -' < script.py"
  echo ""
  echo "Script saved to: $REMOTE_SCRIPT_FILE"
  # Keep the file we just advertised: without this the EXIT trap would delete
  # it before the user could inspect it.
  trap - EXIT
  exit 0
fi

echo "Running benchmark on Space..."
echo ""

# Sanitize inputs: reject values containing shell metacharacters to prevent
# command injection on the remote host. The values are embedded in single
# quotes in the remote command line below, so quotes in particular must never
# get through.
for _var_name in MODELS METHODS PROMPTS; do
  _val="${!_var_name}"
  if [[ "$_val" =~ [\'\"\;\&\|\`\$\(\)\{\}\<\>\\] ]]; then
    echo "ERROR: ${_var_name} contains unsafe characters: $_val" >&2
    exit 1
  fi
done

# "python3 -u -" runs the program read from stdin with unbuffered output, so
# progress lines stream back as they are printed.
ssh "${SSH_OPTS[@]}" "$SSH_HOST" \
  "BENCH_MODELS='$MODELS' BENCH_METHODS='$METHODS' BENCH_PROMPTS='$PROMPTS' python3 -u -" \
  < "$REMOTE_SCRIPT_FILE"