# Mirror of https://github.com/elder-plinius/OBLITERATUS.git
# Synced 2026-04-23 19:56:15 +02:00
# 363 lines · 12 KiB · Python
#!/usr/bin/env python3
"""OBLITERATUS GPT-OSS 20B Benchmark — Full Method Comparison.

Runs all abliteration methods on openai/gpt-oss-20b and produces a
comprehensive comparison table with:

- Refusal rate (primary metric)
- KL divergence / perplexity (capability preservation)
- Capability probes (knowledge, truthfulness, math reasoning)
- MoE-specific metrics (EGA expert directions, router stability)
- Timing and GPU memory usage

Usage:
    python scripts/benchmark_gpt_oss_20b.py
    python scripts/benchmark_gpt_oss_20b.py --methods basic surgical optimized nuclear
    python scripts/benchmark_gpt_oss_20b.py --prompts 50 --output results.json
    python scripts/benchmark_gpt_oss_20b.py --quick  # fast mode: 20 prompts, skip slow methods

Designed for T4 16GB (auto 4-bit quantization) or A10G+ (float16).
"""

from __future__ import annotations

import argparse
import gc
import json
import os
import shutil
import sys
import time
from pathlib import Path

# Must be set before torch first touches CUDA: reduces allocator
# fragmentation across the many load/unload cycles of this benchmark.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch

# Ensure the project root is on sys.path so `obliteratus` resolves when
# this script is executed directly from the scripts/ directory.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))

from obliteratus.abliterate import (  # noqa: E402
    AbliterationPipeline,
    METHODS,
    HARMFUL_PROMPTS,
    HARMLESS_PROMPTS,
)
from obliteratus.evaluation.benchmarks import BenchmarkRunner, format_benchmark_report  # noqa: E402
|
|
|
|
|
|
def parse_args():
    """Parse and return the benchmark's command-line options."""
    parser = argparse.ArgumentParser(description="OBLITERATUS GPT-OSS 20B Benchmark")
    add = parser.add_argument

    add("--model", default="openai/gpt-oss-20b",
        help="Model to benchmark (default: openai/gpt-oss-20b)")
    add("--methods", nargs="+",
        default=["basic", "advanced", "surgical", "optimized", "inverted", "nuclear"],
        help="Methods to compare")
    add("--prompts", type=int, default=33,
        help="Number of prompts per side (harmful/harmless)")
    add("--output", type=str, default=None,
        help="Save results JSON to this path")
    add("--quick", action="store_true",
        help="Quick mode: 20 prompts, skip aggressive/inverted")
    add("--skip-benchmarks", action="store_true",
        help="Skip capability benchmark probes (faster)")
    add("--output-dir", default="/tmp/obliteratus_bench",
        help="Directory for temporary model outputs")
    add("--bayesian-trials", type=int, default=30,
        help="Number of Bayesian optimization trials for 'optimized' method")

    return parser.parse_args()
|
|
|
|
|
|
def gpu_info() -> dict:
    """Return the active GPU's name and memory stats, or a CPU-only stub."""
    if not torch.cuda.is_available():
        return {"gpu": "CPU only", "total_gb": 0, "free_gb": 0}

    total_bytes = torch.cuda.get_device_properties(0).total_memory
    free_bytes = torch.cuda.mem_get_info(0)[0]  # (free, total) — take free
    return {
        "gpu": torch.cuda.get_device_name(0),
        "total_gb": round(total_bytes / 1e9, 1),
        "free_gb": round(free_bytes / 1e9, 1),
    }
|
|
|
|
|
|
def cleanup():
    """Release Python garbage and, when CUDA is present, cached GPU memory."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    # Reset so the next method's peak-memory measurement starts from zero.
    torch.cuda.reset_peak_memory_stats()
|
|
|
|
|
|
def run_single_method(
    model_name: str,
    method: str,
    harmful: list[str],
    harmless: list[str],
    output_dir: str,
    run_benchmarks: bool = True,
    bayesian_trials: int = 30,
) -> dict:
    """Run a single abliteration method and collect metrics.

    Loads the model, runs the abliteration pipeline for ``method``,
    optionally runs the capability benchmark probes, and gathers timing,
    quality, MoE-specific, and GPU-memory metrics. The saved model output
    directory is removed afterwards and GPU memory is released.

    Args:
        model_name: Model identifier passed to AbliterationPipeline.
        method: Method key; its display label is looked up in METHODS.
        harmful: Harmful prompts (refusal-direction contrast set).
        harmless: Harmless prompts (the other side of the contrast).
        output_dir: Parent directory for the method's temporary output.
        run_benchmarks: When True, run knowledge/truthfulness/math probes.
        bayesian_trials: Trial budget applied to the 'optimized' method.

    Returns:
        A metrics dict; on failure it contains an "error" key instead of
        the quality/benchmark fields.
    """
    cleanup()

    outdir = f"{output_dir}/{method}"
    t0 = time.time()
    pipeline = None
    result = {
        "model": model_name,
        "method": method,
        "label": METHODS.get(method, {}).get("label", method),
    }

    # FIX: the original patched METHODS["optimized"]["bayesian_trials"]
    # "temporarily" but never restored it, leaking the override into every
    # later use of the global METHODS config. Track the prior value with a
    # sentinel so the `finally` block below can restore it exactly.
    _unset = object()
    prior_trials = _unset

    try:
        # For the optimized method, control the Bayesian trial count by
        # patching the method config (restored in `finally`).
        if method == "optimized":
            prior_trials = METHODS["optimized"].get("bayesian_trials", _unset)
            METHODS["optimized"]["bayesian_trials"] = bayesian_trials

        pipeline = AbliterationPipeline(
            model_name=model_name,
            output_dir=outdir,
            device="auto",
            dtype="float16",
            method=method,
            harmful_prompts=harmful,
            harmless_prompts=harmless,
            on_log=lambda msg: print(f" {msg}"),
        )
        pipeline.run()
        elapsed = time.time() - t0

        result.update({
            "time_seconds": round(elapsed, 1),
            "quality": dict(pipeline._quality_metrics),
            "strong_layers": pipeline._strong_layers,
            "n_strong_layers": len(pipeline._strong_layers),
            "n_directions": pipeline.n_directions,
        })

        # MoE-specific metrics — only recorded when the pipeline computed them.
        if pipeline._expert_directions:
            n_expert_dirs = sum(len(d) for d in pipeline._expert_directions.values())
            result["ega_expert_dirs"] = n_expert_dirs
            result["ega_layers"] = len(pipeline._expert_directions)

        if pipeline._expert_safety_scores:
            result["expert_classified_layers"] = len(pipeline._expert_safety_scores)

        if pipeline._cot_preserve_directions:
            result["cot_preserved_layers"] = len(pipeline._cot_preserve_directions)

        if pipeline._float_layer_weights:
            result["float_layer_weights"] = {
                str(k): round(v, 3) for k, v in pipeline._float_layer_weights.items()
            }

        if pipeline._kl_contributions:
            result["kl_contributions"] = {
                str(k): round(v, 6) for k, v in pipeline._kl_contributions.items()
            }

        if pipeline._lora_adapters:
            result["lora_adapters"] = len(pipeline._lora_adapters)

        if pipeline._steering_hooks:
            result["steering_hooks"] = len(pipeline._steering_hooks)

        # Peak GPU memory since the reset performed by cleanup() above.
        if torch.cuda.is_available():
            result["peak_gpu_mb"] = round(torch.cuda.max_memory_allocated() / 1e6, 1)

        # Capability benchmarks (optional)
        if run_benchmarks:
            print("\n Running capability benchmarks...")
            try:
                runner = BenchmarkRunner(
                    pipeline.handle.model,
                    pipeline.handle.tokenizer,
                )
                bench_results = runner.run_all()
                result["benchmarks"] = {
                    name: {
                        "score": round(br.score, 3),
                        "n_correct": br.n_correct,
                        "n_total": br.n_total,
                        "per_category": {
                            k: round(v, 3) for k, v in br.per_category.items()
                        },
                    }
                    for name, br in bench_results.items()
                }
                report = format_benchmark_report(bench_results)
                print(f"\n{report}")
            except Exception as e:
                # Probes are best-effort: record the failure, keep the run.
                print(f" Benchmark probes failed: {e}")
                result["benchmarks"] = {"error": str(e)}

        print(f"\n === {method} complete in {elapsed:.1f}s ===")
        print(f" Quality: {json.dumps(pipeline._quality_metrics, default=str)}")

    except Exception as e:
        # Record the failure but let the caller continue with other methods.
        elapsed = time.time() - t0
        result.update({
            "time_seconds": round(elapsed, 1),
            "error": str(e),
        })
        print(f"\n === {method} FAILED after {elapsed:.1f}s: {e} ===")
        import traceback
        traceback.print_exc()

    finally:
        # Restore the global METHODS config patched above so later runs
        # (and other callers in the same process) see the original value.
        if prior_trials is not _unset:
            METHODS["optimized"]["bayesian_trials"] = prior_trials
        elif method == "optimized":
            METHODS.get("optimized", {}).pop("bayesian_trials", None)

    # Cleanup saved model to free disk
    shutil.rmtree(outdir, ignore_errors=True)

    if pipeline is not None:
        del pipeline
    cleanup()

    return result
|
|
|
|
|
|
def print_summary_table(results: list[dict]):
    """Print a formatted comparison table for the per-method results."""
    bar = "=" * 90
    print(f"\n{bar}")
    print("BENCHMARK RESULTS SUMMARY")
    print(bar)

    # Column header — widths must match the row format below.
    header = (
        f"{'Method':<12} {'Time':>7} {'PPL':>8} {'Coher':>7} "
        f"{'Refusal':>8} {'Know':>6} {'Truth':>6} {'Math':>6} "
        f"{'EGA':>5} {'CoT':>4} {'GPU MB':>7}"
    )
    print(header)
    print("-" * len(header))

    def fmt(value, spec):
        # Format a metric, or "N/A" when it was never recorded.
        return format(value, spec) if value is not None else "N/A"

    for entry in results:
        name = entry["method"]
        duration = f"{entry['time_seconds']:.0f}s" if "time_seconds" in entry else "N/A"

        if "error" in entry:
            print(f"{name:<12} {duration:>7} {'FAILED':>8}")
            continue

        quality = entry.get("quality", {})
        bench = entry.get("benchmarks", {})

        ppl_s = fmt(quality.get("perplexity"), ".1f")
        coh_s = fmt(quality.get("coherence"), ".0%")
        ref_s = fmt(quality.get("refusal_rate"), ".0%")
        know_s = fmt(bench.get("knowledge", {}).get("score"), ".0%")
        truth_s = fmt(bench.get("truthfulness", {}).get("score"), ".0%")
        math_s = fmt(bench.get("math_reasoning", {}).get("score"), ".0%")
        gpu_s = fmt(entry.get("peak_gpu_mb"), ".0f")

        # MoE metrics print "-" when absent (or zero).
        ega = entry.get("ega_expert_dirs", "")
        cot = entry.get("cot_preserved_layers", "")
        ega_s = str(ega) if ega else "-"
        cot_s = str(cot) if cot else "-"

        print(
            f"{name:<12} {duration:>7} {ppl_s:>8} {coh_s:>7} "
            f"{ref_s:>8} {know_s:>6} {truth_s:>6} {math_s:>6} "
            f"{ega_s:>5} {cot_s:>4} {gpu_s:>7}"
        )

    print(bar)

    print("\nLegend:")
    for legend_line in (
        " PPL = Perplexity (lower = better capability preservation)",
        " Coher = Coherence score (higher = more coherent text)",
        " Refusal = Refusal rate on harmful prompts (lower = more abliterated)",
        " Know = MMLU-style knowledge probe",
        " Truth = TruthfulQA-style truthfulness probe",
        " Math = GSM8K-style math reasoning probe",
        " EGA = Expert-Granular Abliteration directions computed",
        " CoT = Layers where CoT reasoning was preserved",
        " GPU MB = Peak GPU memory usage",
    ):
        print(legend_line)
|
|
|
|
|
|
def main():
    """Entry point: benchmark every requested method, then summarize and save."""
    args = parse_args()

    if args.quick:
        # Quick mode: fewer prompts, drop the slowest methods, smaller search.
        args.prompts = 20
        args.methods = [m for m in args.methods if m not in ("aggressive", "inverted")]
        args.bayesian_trials = 15

    device = gpu_info()
    harmful = HARMFUL_PROMPTS[:args.prompts]
    harmless = HARMLESS_PROMPTS[:args.prompts]

    banner = "=" * 60
    print(banner)
    print(" OBLITERATUS GPT-OSS 20B BENCHMARK")
    print(banner)
    print(f" Model: {args.model}")
    print(f" Methods: {args.methods}")
    print(f" Prompts: {args.prompts} per side")
    print(f" GPU: {device['gpu']} ({device['total_gb']} GB total, {device['free_gb']} GB free)")
    print(f" Benchmarks: {'skip' if args.skip_benchmarks else 'enabled'}")
    if "optimized" in args.methods:
        print(f" Bayesian: {args.bayesian_trials} trials")
    print(banner)

    all_results = []
    rule = "━" * 60
    for method in args.methods:
        if method not in METHODS:
            print(f"\nSKIP: unknown method '{method}'")
            continue

        print(f"\n{rule}")
        print(f" METHOD: {method} — {METHODS[method]['label']}")
        print(f"{rule}")

        all_results.append(
            run_single_method(
                model_name=args.model,
                method=method,
                harmful=harmful,
                harmless=harmless,
                output_dir=args.output_dir,
                run_benchmarks=not args.skip_benchmarks,
                bayesian_trials=args.bayesian_trials,
            )
        )

    # Summary table on stdout.
    print_summary_table(all_results)

    # Persist the raw results; default filename is timestamped so repeated
    # runs never clobber each other.
    output_path = args.output or f"benchmark_gpt_oss_{int(time.time())}.json"
    with open(output_path, "w") as f:
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nFull results saved to: {output_path}")


if __name__ == "__main__":
    main()
|