# Mirror of https://github.com/elder-plinius/OBLITERATUS.git
# Synced 2026-04-23 19:56:15 +02:00
# 363 lines · 12 KiB · Python
#!/usr/bin/env python3
"""OBLITERATUS GPT-OSS 20B Benchmark — Full Method Comparison.

Runs all abliteration methods on openai/gpt-oss-20b and produces a
comprehensive comparison table with:

- Refusal rate (primary metric)
- KL divergence / perplexity (capability preservation)
- Capability probes (knowledge, truthfulness, math reasoning)
- MoE-specific metrics (EGA expert directions, router stability)
- Timing and GPU memory usage

Usage:
    python scripts/benchmark_gpt_oss_20b.py
    python scripts/benchmark_gpt_oss_20b.py --methods basic surgical optimized nuclear
    python scripts/benchmark_gpt_oss_20b.py --prompts 50 --output results.json
    python scripts/benchmark_gpt_oss_20b.py --quick  # fast mode: 20 prompts, skip slow methods

Designed for T4 16GB (auto 4-bit quantization) or A10G+ (float16).
"""

from __future__ import annotations

import argparse
import gc
import json
import os
import shutil
import sys
import time
from pathlib import Path

# Must be set before torch first touches CUDA: reduces allocator
# fragmentation across the many load/unload cycles of this benchmark.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch

# Ensure the project root is on sys.path so `obliteratus` resolves when
# this script is executed directly from the scripts/ directory.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))

from obliteratus.abliterate import (  # noqa: E402
    AbliterationPipeline,
    METHODS,
    HARMFUL_PROMPTS,
    HARMLESS_PROMPTS,
)
from obliteratus.evaluation.benchmarks import BenchmarkRunner, format_benchmark_report  # noqa: E402
|
|
|
|
|
|
def parse_args():
    """Parse and return the benchmark's command-line options."""
    parser = argparse.ArgumentParser(description="OBLITERATUS GPT-OSS 20B Benchmark")
    add = parser.add_argument

    add("--model", default="openai/gpt-oss-20b",
        help="Model to benchmark (default: openai/gpt-oss-20b)")
    add("--methods", nargs="+",
        default=["basic", "advanced", "surgical", "optimized", "inverted", "nuclear"],
        help="Methods to compare")
    add("--prompts", type=int, default=33,
        help="Number of prompts per side (harmful/harmless)")
    add("--output", type=str, default=None,
        help="Save results JSON to this path")
    add("--quick", action="store_true",
        help="Quick mode: 20 prompts, skip aggressive/inverted")
    add("--skip-benchmarks", action="store_true",
        help="Skip capability benchmark probes (faster)")
    add("--output-dir", default="/tmp/obliteratus_bench",
        help="Directory for temporary model outputs")
    add("--bayesian-trials", type=int, default=30,
        help="Number of Bayesian optimization trials for 'optimized' method")

    return parser.parse_args()
|
|
|
|
|
|
def gpu_info() -> dict:
    """Return the active GPU's name and memory stats, or a CPU-only stub."""
    if not torch.cuda.is_available():
        return {"gpu": "CPU only", "total_gb": 0, "free_gb": 0}

    total_bytes = torch.cuda.get_device_properties(0).total_memory
    free_bytes = torch.cuda.mem_get_info(0)[0]  # (free, total) — take free
    return {
        "gpu": torch.cuda.get_device_name(0),
        "total_gb": round(total_bytes / 1e9, 1),
        "free_gb": round(free_bytes / 1e9, 1),
    }
|
|
|
|
|
|
def cleanup():
    """Release Python garbage and, when CUDA is present, cached GPU memory."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    # Reset so the next method's peak-memory measurement starts from zero.
    torch.cuda.reset_peak_memory_stats()
|
|
|
|
|
|
def run_single_method(
    model_name: str,
    method: str,
    harmful: list[str],
    harmless: list[str],
    output_dir: str,
    run_benchmarks: bool = True,
    bayesian_trials: int = 30,
) -> dict:
    """Run a single abliteration method and collect metrics.

    Loads the model, runs the abliteration pipeline for ``method``,
    optionally runs the capability benchmark probes, and gathers timing,
    quality, MoE-specific, and GPU-memory metrics. The saved model output
    directory is removed afterwards and GPU memory is released.

    Args:
        model_name: Model identifier passed to AbliterationPipeline.
        method: Method key; its display label is looked up in METHODS.
        harmful: Harmful prompts (refusal-direction contrast set).
        harmless: Harmless prompts (the other side of the contrast).
        output_dir: Parent directory for the method's temporary output.
        run_benchmarks: When True, run knowledge/truthfulness/math probes.
        bayesian_trials: Trial budget applied to the 'optimized' method.

    Returns:
        A metrics dict; on failure it contains an "error" key instead of
        the quality/benchmark fields.
    """
    cleanup()

    outdir = f"{output_dir}/{method}"
    t0 = time.time()
    pipeline = None
    result = {
        "model": model_name,
        "method": method,
        "label": METHODS.get(method, {}).get("label", method),
    }

    # FIX: the original patched METHODS["optimized"]["bayesian_trials"]
    # "temporarily" but never restored it, leaking the override into every
    # later use of the global METHODS config. Track the prior value with a
    # sentinel so the `finally` block below can restore it exactly.
    _unset = object()
    prior_trials = _unset

    try:
        # For the optimized method, control the Bayesian trial count by
        # patching the method config (restored in `finally`).
        if method == "optimized":
            prior_trials = METHODS["optimized"].get("bayesian_trials", _unset)
            METHODS["optimized"]["bayesian_trials"] = bayesian_trials

        pipeline = AbliterationPipeline(
            model_name=model_name,
            output_dir=outdir,
            device="auto",
            dtype="float16",
            method=method,
            harmful_prompts=harmful,
            harmless_prompts=harmless,
            on_log=lambda msg: print(f" {msg}"),
        )
        pipeline.run()
        elapsed = time.time() - t0

        result.update({
            "time_seconds": round(elapsed, 1),
            "quality": dict(pipeline._quality_metrics),
            "strong_layers": pipeline._strong_layers,
            "n_strong_layers": len(pipeline._strong_layers),
            "n_directions": pipeline.n_directions,
        })

        # MoE-specific metrics — only recorded when the pipeline computed them.
        if pipeline._expert_directions:
            n_expert_dirs = sum(len(d) for d in pipeline._expert_directions.values())
            result["ega_expert_dirs"] = n_expert_dirs
            result["ega_layers"] = len(pipeline._expert_directions)

        if pipeline._expert_safety_scores:
            result["expert_classified_layers"] = len(pipeline._expert_safety_scores)

        if pipeline._cot_preserve_directions:
            result["cot_preserved_layers"] = len(pipeline._cot_preserve_directions)

        if pipeline._float_layer_weights:
            result["float_layer_weights"] = {
                str(k): round(v, 3) for k, v in pipeline._float_layer_weights.items()
            }

        if pipeline._kl_contributions:
            result["kl_contributions"] = {
                str(k): round(v, 6) for k, v in pipeline._kl_contributions.items()
            }

        if pipeline._lora_adapters:
            result["lora_adapters"] = len(pipeline._lora_adapters)

        if pipeline._steering_hooks:
            result["steering_hooks"] = len(pipeline._steering_hooks)

        # Peak GPU memory since the reset performed by cleanup() above.
        if torch.cuda.is_available():
            result["peak_gpu_mb"] = round(torch.cuda.max_memory_allocated() / 1e6, 1)

        # Capability benchmarks (optional)
        if run_benchmarks:
            print("\n Running capability benchmarks...")
            try:
                runner = BenchmarkRunner(
                    pipeline.handle.model,
                    pipeline.handle.tokenizer,
                )
                bench_results = runner.run_all()
                result["benchmarks"] = {
                    name: {
                        "score": round(br.score, 3),
                        "n_correct": br.n_correct,
                        "n_total": br.n_total,
                        "per_category": {
                            k: round(v, 3) for k, v in br.per_category.items()
                        },
                    }
                    for name, br in bench_results.items()
                }
                report = format_benchmark_report(bench_results)
                print(f"\n{report}")
            except Exception as e:
                # Probes are best-effort: record the failure, keep the run.
                print(f" Benchmark probes failed: {e}")
                result["benchmarks"] = {"error": str(e)}

        print(f"\n === {method} complete in {elapsed:.1f}s ===")
        print(f" Quality: {json.dumps(pipeline._quality_metrics, default=str)}")

    except Exception as e:
        # Record the failure but let the caller continue with other methods.
        elapsed = time.time() - t0
        result.update({
            "time_seconds": round(elapsed, 1),
            "error": str(e),
        })
        print(f"\n === {method} FAILED after {elapsed:.1f}s: {e} ===")
        import traceback
        traceback.print_exc()

    finally:
        # Restore the global METHODS config patched above so later runs
        # (and other callers in the same process) see the original value.
        if prior_trials is not _unset:
            METHODS["optimized"]["bayesian_trials"] = prior_trials
        elif method == "optimized":
            METHODS.get("optimized", {}).pop("bayesian_trials", None)

    # Cleanup saved model to free disk
    shutil.rmtree(outdir, ignore_errors=True)

    if pipeline is not None:
        del pipeline
    cleanup()

    return result
|
|
|
|
|
|
def print_summary_table(results: list[dict]):
    """Print a formatted comparison table for the per-method results."""
    bar = "=" * 90
    print(f"\n{bar}")
    print("BENCHMARK RESULTS SUMMARY")
    print(bar)

    # Column header — widths must match the row format below.
    header = (
        f"{'Method':<12} {'Time':>7} {'PPL':>8} {'Coher':>7} "
        f"{'Refusal':>8} {'Know':>6} {'Truth':>6} {'Math':>6} "
        f"{'EGA':>5} {'CoT':>4} {'GPU MB':>7}"
    )
    print(header)
    print("-" * len(header))

    def fmt(value, spec):
        # Format a metric, or "N/A" when it was never recorded.
        return format(value, spec) if value is not None else "N/A"

    for entry in results:
        name = entry["method"]
        duration = f"{entry['time_seconds']:.0f}s" if "time_seconds" in entry else "N/A"

        if "error" in entry:
            print(f"{name:<12} {duration:>7} {'FAILED':>8}")
            continue

        quality = entry.get("quality", {})
        bench = entry.get("benchmarks", {})

        ppl_s = fmt(quality.get("perplexity"), ".1f")
        coh_s = fmt(quality.get("coherence"), ".0%")
        ref_s = fmt(quality.get("refusal_rate"), ".0%")
        know_s = fmt(bench.get("knowledge", {}).get("score"), ".0%")
        truth_s = fmt(bench.get("truthfulness", {}).get("score"), ".0%")
        math_s = fmt(bench.get("math_reasoning", {}).get("score"), ".0%")
        gpu_s = fmt(entry.get("peak_gpu_mb"), ".0f")

        # MoE metrics print "-" when absent (or zero).
        ega = entry.get("ega_expert_dirs", "")
        cot = entry.get("cot_preserved_layers", "")
        ega_s = str(ega) if ega else "-"
        cot_s = str(cot) if cot else "-"

        print(
            f"{name:<12} {duration:>7} {ppl_s:>8} {coh_s:>7} "
            f"{ref_s:>8} {know_s:>6} {truth_s:>6} {math_s:>6} "
            f"{ega_s:>5} {cot_s:>4} {gpu_s:>7}"
        )

    print(bar)

    print("\nLegend:")
    for legend_line in (
        " PPL = Perplexity (lower = better capability preservation)",
        " Coher = Coherence score (higher = more coherent text)",
        " Refusal = Refusal rate on harmful prompts (lower = more abliterated)",
        " Know = MMLU-style knowledge probe",
        " Truth = TruthfulQA-style truthfulness probe",
        " Math = GSM8K-style math reasoning probe",
        " EGA = Expert-Granular Abliteration directions computed",
        " CoT = Layers where CoT reasoning was preserved",
        " GPU MB = Peak GPU memory usage",
    ):
        print(legend_line)
|
|
|
|
|
|
def main():
    """Entry point: benchmark every requested method, then summarize and save."""
    args = parse_args()

    if args.quick:
        # Quick mode: fewer prompts, drop the slowest methods, smaller search.
        args.prompts = 20
        args.methods = [m for m in args.methods if m not in ("aggressive", "inverted")]
        args.bayesian_trials = 15

    device = gpu_info()
    harmful = HARMFUL_PROMPTS[:args.prompts]
    harmless = HARMLESS_PROMPTS[:args.prompts]

    banner = "=" * 60
    print(banner)
    print(" OBLITERATUS GPT-OSS 20B BENCHMARK")
    print(banner)
    print(f" Model: {args.model}")
    print(f" Methods: {args.methods}")
    print(f" Prompts: {args.prompts} per side")
    print(f" GPU: {device['gpu']} ({device['total_gb']} GB total, {device['free_gb']} GB free)")
    print(f" Benchmarks: {'skip' if args.skip_benchmarks else 'enabled'}")
    if "optimized" in args.methods:
        print(f" Bayesian: {args.bayesian_trials} trials")
    print(banner)

    all_results = []
    rule = "━" * 60
    for method in args.methods:
        if method not in METHODS:
            print(f"\nSKIP: unknown method '{method}'")
            continue

        print(f"\n{rule}")
        print(f" METHOD: {method} — {METHODS[method]['label']}")
        print(f"{rule}")

        all_results.append(
            run_single_method(
                model_name=args.model,
                method=method,
                harmful=harmful,
                harmless=harmless,
                output_dir=args.output_dir,
                run_benchmarks=not args.skip_benchmarks,
                bayesian_trials=args.bayesian_trials,
            )
        )

    # Summary table on stdout.
    print_summary_table(all_results)

    # Persist the raw results; default filename is timestamped so repeated
    # runs never clobber each other.
    output_path = args.output or f"benchmark_gpt_oss_{int(time.time())}.json"
    with open(output_path, "w") as f:
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nFull results saved to: {output_path}")


if __name__ == "__main__":
    main()
|