mirror of
https://github.com/elder-plinius/OBLITERATUS.git
synced 2026-05-01 15:37:52 +02:00
Add files via upload
This commit is contained in:
@@ -98,6 +98,51 @@ def _is_quota_error(exc: BaseException) -> bool:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _load_model_to_device(
|
||||
pretrained_path: str,
|
||||
*,
|
||||
torch_dtype=None,
|
||||
trust_remote_code: bool = False,
|
||||
quantization_config=None,
|
||||
offload_folder: str | None = None,
|
||||
low_cpu_mem_usage: bool = False,
|
||||
token: str | None = None,
|
||||
) -> AutoModelForCausalLM:
|
||||
"""Load a causal LM onto the best available device, MPS-safe.
|
||||
|
||||
Accelerate's ``device_map="auto"`` is not supported on MPS — models
|
||||
silently land on CPU. This helper skips ``device_map`` on non-CUDA
|
||||
backends and explicitly moves the model to the best device after loading.
|
||||
On CUDA the behaviour is identical to ``device_map="auto"``.
|
||||
"""
|
||||
kwargs: dict = {}
|
||||
if torch_dtype is not None:
|
||||
kwargs["torch_dtype"] = torch_dtype
|
||||
if trust_remote_code:
|
||||
kwargs["trust_remote_code"] = True
|
||||
if quantization_config is not None:
|
||||
kwargs["quantization_config"] = quantization_config
|
||||
if offload_folder is not None:
|
||||
kwargs["offload_folder"] = offload_folder
|
||||
if low_cpu_mem_usage:
|
||||
kwargs["low_cpu_mem_usage"] = True
|
||||
if token is not None:
|
||||
kwargs["token"] = token
|
||||
|
||||
if dev.supports_device_map_auto():
|
||||
kwargs["device_map"] = "auto"
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(pretrained_path, **kwargs)
|
||||
|
||||
# On MPS / CPU: model loaded without device_map, move to best device
|
||||
if not dev.supports_device_map_auto():
|
||||
target = dev.get_device()
|
||||
model = model.to(target)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Global state
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -164,7 +209,7 @@ def _recover_sessions_from_disk() -> None:
|
||||
"""
|
||||
global _last_obliterated_label, _obliterate_counter
|
||||
found_any = False
|
||||
for pattern in ("obliterated_*", "obliterated", "bench_*"):
|
||||
for pattern in ("obliterated_*", "obliterated", "bench_*", "obliteratus_tourney/r*"):
|
||||
for p in Path("/tmp").glob(pattern):
|
||||
if not p.is_dir():
|
||||
continue
|
||||
@@ -291,6 +336,11 @@ METHODS = {
|
||||
"optimized (bayesian auto-tuned)": "optimized",
|
||||
"inverted (semantic refusal inversion)": "inverted",
|
||||
"nuclear (maximum force combo)": "nuclear",
|
||||
# Baseline reproductions for benchmarking
|
||||
"failspy (FailSpy/abliterator baseline)": "failspy",
|
||||
"gabliteration (Gülmez 2026 baseline)": "gabliteration",
|
||||
"heretic (p-e-w 2025-2026 baseline)": "heretic",
|
||||
"rdo (Wollschlager ICML 2025 baseline)": "rdo",
|
||||
}
|
||||
|
||||
# ── Community Hub push ────────────────────────────────────────────────
|
||||
@@ -316,6 +366,7 @@ def _get_preset_defaults(method_display: str):
|
||||
cfg = _PRESET_CONFIGS.get(method_key, _PRESET_CONFIGS["advanced"])
|
||||
return {
|
||||
"n_directions": cfg.get("n_directions", 4),
|
||||
"direction_method": cfg.get("direction_method", "svd"),
|
||||
"regularization": cfg.get("regularization", 0.3),
|
||||
"refinement_passes": cfg.get("refinement_passes", 2),
|
||||
"norm_preserve": cfg.get("norm_preserve", True),
|
||||
@@ -341,6 +392,17 @@ def _get_preset_defaults(method_display: str):
|
||||
"spectral_cascade": cfg.get("spectral_cascade", False),
|
||||
"spectral_bands": cfg.get("spectral_bands", 3),
|
||||
"spectral_threshold": cfg.get("spectral_threshold", 0.05),
|
||||
# Baseline-specific parameters
|
||||
"layer_selection": cfg.get("layer_selection", "all"),
|
||||
"winsorize_activations": cfg.get("winsorize_activations", False),
|
||||
"winsorize_percentile": cfg.get("winsorize_percentile", 1.0),
|
||||
"use_kl_optimization": cfg.get("use_kl_optimization", False),
|
||||
"kl_budget": cfg.get("kl_budget", 0.5),
|
||||
"float_layer_interpolation": cfg.get("float_layer_interpolation", False),
|
||||
"rdo_refinement": cfg.get("rdo_refinement", False),
|
||||
"cot_aware": cfg.get("cot_aware", False),
|
||||
"bayesian_trials": cfg.get("bayesian_trials", 50),
|
||||
"n_sae_features": cfg.get("n_sae_features", 64),
|
||||
}
|
||||
|
||||
def _on_method_change(method_display: str):
|
||||
@@ -348,6 +410,7 @@ def _on_method_change(method_display: str):
|
||||
d = _get_preset_defaults(method_display)
|
||||
return (
|
||||
d["n_directions"],
|
||||
d["direction_method"],
|
||||
d["regularization"],
|
||||
d["refinement_passes"],
|
||||
d["reflection_strength"],
|
||||
@@ -374,6 +437,16 @@ def _on_method_change(method_display: str):
|
||||
d["expert_transplant"],
|
||||
d["use_wasserstein_optimal"],
|
||||
d["spectral_cascade"],
|
||||
d["layer_selection"],
|
||||
d["winsorize_activations"],
|
||||
d["winsorize_percentile"],
|
||||
d["use_kl_optimization"],
|
||||
d["kl_budget"],
|
||||
d["float_layer_interpolation"],
|
||||
d["rdo_refinement"],
|
||||
d["cot_aware"],
|
||||
d["bayesian_trials"],
|
||||
d["n_sae_features"],
|
||||
)
|
||||
|
||||
def _on_dataset_change(dataset_label: str):
|
||||
@@ -1731,8 +1804,9 @@ def _format_multi_model_results(results: list[dict], context: dict | None = None
|
||||
def obliterate(model_choice: str, method_choice: str,
|
||||
prompt_volume_choice: str, dataset_source_choice: str,
|
||||
custom_harmful: str, custom_harmless: str,
|
||||
# Advanced params (sliders)
|
||||
adv_n_directions: int, adv_regularization: float,
|
||||
# Advanced params (sliders + radio)
|
||||
adv_n_directions: int, adv_direction_method: str,
|
||||
adv_regularization: float,
|
||||
adv_refinement_passes: int, adv_reflection_strength: float,
|
||||
adv_embed_regularization: float, adv_steering_strength: float,
|
||||
adv_transplant_blend: float,
|
||||
@@ -1748,6 +1822,12 @@ def obliterate(model_choice: str, method_choice: str,
|
||||
adv_project_embeddings: bool, adv_activation_steering: bool,
|
||||
adv_expert_transplant: bool, adv_wasserstein_optimal: bool,
|
||||
adv_spectral_cascade: bool,
|
||||
adv_layer_selection: str, adv_winsorize: bool,
|
||||
adv_winsorize_percentile: float,
|
||||
adv_kl_optimization: bool, adv_kl_budget: float,
|
||||
adv_float_layer_interp: bool, adv_rdo_refinement: bool,
|
||||
adv_cot_aware: bool,
|
||||
adv_bayesian_trials: int, adv_n_sae_features: int,
|
||||
progress=gr.Progress()):
|
||||
"""Run the full obliteration pipeline, streaming log updates to the UI.
|
||||
|
||||
@@ -1906,6 +1986,7 @@ def obliterate(model_choice: str, method_choice: str,
|
||||
on_log=on_log,
|
||||
# Advanced overrides from UI
|
||||
n_directions=int(adv_n_directions),
|
||||
direction_method=adv_direction_method,
|
||||
regularization=float(adv_regularization),
|
||||
refinement_passes=int(adv_refinement_passes),
|
||||
norm_preserve=adv_norm_preserve,
|
||||
@@ -1932,6 +2013,15 @@ def obliterate(model_choice: str, method_choice: str,
|
||||
spectral_bands=int(adv_spectral_bands),
|
||||
spectral_threshold=float(adv_spectral_threshold),
|
||||
verify_sample_size=int(adv_verify_sample_size),
|
||||
layer_selection=adv_layer_selection,
|
||||
winsorize_activations=adv_winsorize,
|
||||
winsorize_percentile=float(adv_winsorize_percentile),
|
||||
use_kl_optimization=adv_kl_optimization,
|
||||
kl_budget=float(adv_kl_budget),
|
||||
float_layer_interpolation=adv_float_layer_interp,
|
||||
rdo_refinement=adv_rdo_refinement,
|
||||
cot_aware=adv_cot_aware,
|
||||
n_sae_features=int(adv_n_sae_features),
|
||||
)
|
||||
pipeline_ref[0] = pipeline
|
||||
pipeline.run()
|
||||
@@ -2103,10 +2193,9 @@ def obliterate(model_choice: str, method_choice: str,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
llm_int8_enable_fp32_cpu_offload=True,
|
||||
)
|
||||
model_reloaded = AutoModelForCausalLM.from_pretrained(
|
||||
model_reloaded = _load_model_to_device(
|
||||
save_dir,
|
||||
quantization_config=bnb_cfg,
|
||||
device_map="auto",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
tokenizer_reloaded = AutoTokenizer.from_pretrained(
|
||||
@@ -2144,9 +2233,8 @@ def obliterate(model_choice: str, method_choice: str,
|
||||
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
|
||||
try:
|
||||
offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
|
||||
model_reloaded = AutoModelForCausalLM.from_pretrained(
|
||||
model_reloaded = _load_model_to_device(
|
||||
save_dir,
|
||||
device_map="auto",
|
||||
offload_folder=offload_dir,
|
||||
torch_dtype=torch.float16,
|
||||
trust_remote_code=True,
|
||||
@@ -2307,8 +2395,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
|
||||
if checkpoint and Path(checkpoint).exists():
|
||||
try:
|
||||
is_preset = (_state.get("model_name") or "") in MODELS
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
checkpoint, device_map="auto", torch_dtype=torch.float16,
|
||||
model = _load_model_to_device(
|
||||
checkpoint, torch_dtype=torch.float16,
|
||||
trust_remote_code=is_preset,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
@@ -2498,8 +2586,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
|
||||
if checkpoint and Path(checkpoint).exists():
|
||||
is_preset = (_state.get("model_name") or "") in MODELS
|
||||
try:
|
||||
model_loaded = AutoModelForCausalLM.from_pretrained(
|
||||
checkpoint, device_map="auto", torch_dtype=torch.float16,
|
||||
model_loaded = _load_model_to_device(
|
||||
checkpoint, torch_dtype=torch.float16,
|
||||
trust_remote_code=is_preset,
|
||||
)
|
||||
tokenizer_loaded = AutoTokenizer.from_pretrained(
|
||||
@@ -2559,9 +2647,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
|
||||
|
||||
is_preset = cfg["model_choice"] in MODELS
|
||||
try:
|
||||
model_loaded = AutoModelForCausalLM.from_pretrained(
|
||||
model_loaded = _load_model_to_device(
|
||||
checkpoint_dir,
|
||||
device_map="auto",
|
||||
torch_dtype=torch.float16,
|
||||
trust_remote_code=is_preset,
|
||||
)
|
||||
@@ -2595,10 +2682,9 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
|
||||
)
|
||||
yield f"**Loading {choice}** in 4-bit (model too large for fp16)...", ""
|
||||
progress(0.5, desc="Loading 4-bit...")
|
||||
model_loaded = AutoModelForCausalLM.from_pretrained(
|
||||
model_loaded = _load_model_to_device(
|
||||
checkpoint_dir,
|
||||
quantization_config=bnb_cfg,
|
||||
device_map="auto",
|
||||
trust_remote_code=is_preset,
|
||||
)
|
||||
tokenizer_loaded = AutoTokenizer.from_pretrained(
|
||||
@@ -2740,8 +2826,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
|
||||
if checkpoint and Path(checkpoint).exists():
|
||||
try:
|
||||
is_preset = (model_name or "") in MODELS
|
||||
abliterated_model = AutoModelForCausalLM.from_pretrained(
|
||||
checkpoint, device_map="auto", torch_dtype=torch.float16,
|
||||
abliterated_model = _load_model_to_device(
|
||||
checkpoint, torch_dtype=torch.float16,
|
||||
trust_remote_code=is_preset,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
@@ -2866,10 +2952,9 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
|
||||
is_preset = model_name in MODELS
|
||||
original_response = ""
|
||||
try:
|
||||
from transformers import AutoModelForCausalLM as AMCLM
|
||||
original_model = AMCLM.from_pretrained(
|
||||
original_model = _load_model_to_device(
|
||||
model_id, torch_dtype=torch.float16,
|
||||
device_map="auto", trust_remote_code=is_preset,
|
||||
trust_remote_code=is_preset,
|
||||
low_cpu_mem_usage=True,
|
||||
token=os.environ.get("HF_TOKEN") or None,
|
||||
)
|
||||
@@ -3026,6 +3111,9 @@ def strength_sweep(model_choice: str, method_choice: str,
|
||||
entry["perplexity"] = metrics.get("perplexity")
|
||||
entry["refusal_rate"] = metrics.get("refusal_rate")
|
||||
entry["coherence"] = metrics.get("coherence")
|
||||
entry["kl_divergence"] = metrics.get("kl_divergence")
|
||||
entry["spectral_cert"] = metrics.get("spectral_certification") or ""
|
||||
entry["direction_method"] = getattr(pipe, "direction_method", "")
|
||||
entry["strong_layers"] = len(pipe._strong_layers)
|
||||
if hasattr(pipe, "handle") and pipe.handle is not None:
|
||||
pipe.handle.model = None
|
||||
@@ -3115,17 +3203,21 @@ def _format_sweep_results(results: list[dict]) -> str:
|
||||
return "*No results yet.*"
|
||||
|
||||
lines = ["### Strength Sweep Results", "",
|
||||
"| Reg | Time | Perplexity | Refusal Rate | Coherence | Error |",
|
||||
"|-----|------|-----------|-------------|-----------|-------|"]
|
||||
"| Reg | Dir | Time | PPL | Refusal | Coherence | KL Div | Cert | Error |",
|
||||
"|-----|-----|------|-----|---------|-----------|--------|------|-------|"]
|
||||
|
||||
for r in results:
|
||||
reg = f"{r['regularization']:.3f}"
|
||||
ppl = f"{r['perplexity']:.2f}" if r.get("perplexity") is not None else "—"
|
||||
ref = f"{r['refusal_rate']:.0%}" if r.get("refusal_rate") is not None else "—"
|
||||
coh = f"{r['coherence']:.0%}" if r.get("coherence") is not None else "—"
|
||||
kl_val = r.get("kl_divergence")
|
||||
kl_str = f"{kl_val:.4f}" if kl_val is not None else "—"
|
||||
cert = r.get("spectral_cert", "") or "—"
|
||||
dir_m = r.get("direction_method", "") or "—"
|
||||
err = r.get("error", "")
|
||||
err_short = (err[:25] + "...") if err and len(err) > 25 else (err or "")
|
||||
lines.append(f"| {reg} | {r['time_s']}s | {ppl} | {ref} | {coh} | {err_short} |")
|
||||
lines.append(f"| {reg} | {dir_m} | {r['time_s']}s | {ppl} | {ref} | {coh} | {kl_str} | {cert} | {err_short} |")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
@@ -3173,8 +3265,8 @@ def _tourney_gpu_wrapper(fn, *args, **kwargs):
|
||||
return _tourney_gpu_run(fn, *args, **kwargs)
|
||||
|
||||
|
||||
def run_tourney(model_choice, dataset, quantization):
|
||||
"""Run an elimination tournament across all abliteration methods.
|
||||
def run_tourney(model_choice, selected_methods, dataset, quantization):
|
||||
"""Run an elimination tournament across selected abliteration methods.
|
||||
|
||||
Each individual method is run inside its own ``@spaces.GPU`` allocation
|
||||
(up to 5 minutes per method) so the full tournament is not constrained
|
||||
@@ -3187,6 +3279,10 @@ def run_tourney(model_choice, dataset, quantization):
|
||||
yield "**Error:** Select a model first.", "", ""
|
||||
return
|
||||
|
||||
if not selected_methods or len(selected_methods) < 3:
|
||||
yield "**Error:** Select at least 3 methods for a tournament.", "", ""
|
||||
return
|
||||
|
||||
from obliteratus.tourney import (
|
||||
TourneyRunner, render_bracket_html,
|
||||
_load_checkpoint, _checkpoint_matches,
|
||||
@@ -3218,6 +3314,7 @@ def run_tourney(model_choice, dataset, quantization):
|
||||
hub_repo=None,
|
||||
dataset_key=dataset_key,
|
||||
quantization=quant,
|
||||
methods=list(selected_methods),
|
||||
on_log=logger,
|
||||
resume=resume,
|
||||
)
|
||||
@@ -3322,18 +3419,27 @@ def run_tourney(model_choice, dataset, quantization):
|
||||
_ts = datetime.now().strftime("%H:%M")
|
||||
_short = model_id.split("/")[-1] if "/" in model_id else model_id
|
||||
_label = f"tourney winner ({winner.method}) on {_short} ({_ts})"
|
||||
_winner_meta = {
|
||||
"model_id": model_id,
|
||||
"model_choice": model_choice,
|
||||
"method": winner.method,
|
||||
"dataset_key": dataset_key,
|
||||
"prompt_volume": 0,
|
||||
"output_dir": winner.output_dir,
|
||||
"source": "tourney",
|
||||
"tourney_score": winner.score,
|
||||
"tourney_metrics": winner.metrics,
|
||||
}
|
||||
with _lock:
|
||||
_session_models[_label] = {
|
||||
"model_id": model_id,
|
||||
"model_choice": model_choice,
|
||||
"method": winner.method,
|
||||
"dataset_key": dataset_key,
|
||||
"prompt_volume": 0,
|
||||
"output_dir": winner.output_dir,
|
||||
"source": "tourney",
|
||||
"tourney_score": winner.score,
|
||||
"tourney_metrics": winner.metrics,
|
||||
}
|
||||
_session_models[_label] = _winner_meta
|
||||
# Persist so the winner survives ZeroGPU process restarts
|
||||
_persist_session_meta(winner.output_dir, _label, {
|
||||
"model_id": model_id,
|
||||
"model_choice": model_choice,
|
||||
"method": winner.method,
|
||||
"dataset_key": dataset_key,
|
||||
"source": "tourney",
|
||||
})
|
||||
yield (
|
||||
f"**Champion: `{winner.method}`** "
|
||||
f"(score: {winner.score:.4f})\n"
|
||||
@@ -3930,7 +4036,13 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
|
||||
with gr.Row():
|
||||
adv_n_directions = gr.Slider(
|
||||
1, 8, value=_defaults["n_directions"], step=1,
|
||||
label="Directions", info="Number of refusal directions to extract via SVD",
|
||||
label="Directions", info="Number of refusal directions to extract",
|
||||
)
|
||||
adv_direction_method = gr.Radio(
|
||||
choices=["diff_means", "svd", "leace"],
|
||||
value=_defaults["direction_method"],
|
||||
label="Direction Method",
|
||||
info="diff_means: simple & robust, svd: multi-direction, leace: optimal erasure",
|
||||
)
|
||||
adv_regularization = gr.Slider(
|
||||
0.0, 1.0, value=_defaults["regularization"], step=0.05,
|
||||
@@ -3996,10 +4108,52 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
|
||||
with gr.Row():
|
||||
adv_spectral_cascade = gr.Checkbox(value=_defaults["spectral_cascade"], label="Spectral Cascade",
|
||||
info="DCT frequency decomposition for precision refusal targeting")
|
||||
gr.Markdown("**Layer Selection & Baseline Options**")
|
||||
with gr.Row():
|
||||
adv_layer_selection = gr.Dropdown(
|
||||
choices=["knee_cosmic", "all", "all_except_first", "middle60", "top_k", "knee"],
|
||||
value=_defaults["layer_selection"],
|
||||
label="Layer Selection",
|
||||
info="Which layers to project refusal directions from",
|
||||
)
|
||||
adv_winsorize_percentile = gr.Slider(
|
||||
0.0, 1.0, value=_defaults["winsorize_percentile"], step=0.01,
|
||||
label="Winsorize Percentile",
|
||||
info="Activation clamping quantile (1.0 = disabled, 0.01 = 99th pctile)",
|
||||
)
|
||||
adv_kl_budget = gr.Slider(
|
||||
0.0, 2.0, value=_defaults["kl_budget"], step=0.1,
|
||||
label="KL Budget",
|
||||
info="Max KL divergence from base model (Heretic/optimized)",
|
||||
)
|
||||
with gr.Row():
|
||||
adv_winsorize = gr.Checkbox(value=_defaults["winsorize_activations"], label="Winsorize Activations",
|
||||
info="Clamp outlier activations before direction extraction")
|
||||
adv_kl_optimization = gr.Checkbox(value=_defaults["use_kl_optimization"], label="KL Optimization",
|
||||
info="Optimize projection strength to stay within KL budget")
|
||||
adv_float_layer_interp = gr.Checkbox(value=_defaults["float_layer_interpolation"], label="Float Layer Interpolation",
|
||||
info="Interpolate between adjacent layers' directions (Heretic)")
|
||||
adv_rdo_refinement = gr.Checkbox(value=_defaults["rdo_refinement"], label="RDO Refinement",
|
||||
info="Gradient-based direction refinement (Wollschlager et al.)")
|
||||
with gr.Row():
|
||||
adv_cot_aware = gr.Checkbox(value=_defaults["cot_aware"], label="CoT-Aware",
|
||||
info="Preserve chain-of-thought reasoning during abliteration")
|
||||
with gr.Row():
|
||||
adv_bayesian_trials = gr.Slider(
|
||||
10, 200, value=_defaults["bayesian_trials"], step=10,
|
||||
label="Bayesian Trials",
|
||||
info="Optuna TPE optimization trials (Heretic/optimized methods)",
|
||||
)
|
||||
adv_n_sae_features = gr.Slider(
|
||||
16, 256, value=_defaults["n_sae_features"], step=16,
|
||||
label="SAE Features",
|
||||
info="Number of SAE features to target (inverted/nuclear methods)",
|
||||
)
|
||||
|
||||
# List of all advanced controls (order must match _on_method_change return)
|
||||
_adv_controls = [
|
||||
adv_n_directions, adv_regularization, adv_refinement_passes,
|
||||
adv_n_directions, adv_direction_method,
|
||||
adv_regularization, adv_refinement_passes,
|
||||
adv_reflection_strength, adv_embed_regularization,
|
||||
adv_steering_strength, adv_transplant_blend,
|
||||
adv_spectral_bands, adv_spectral_threshold,
|
||||
@@ -4011,6 +4165,12 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
|
||||
adv_project_embeddings, adv_activation_steering,
|
||||
adv_expert_transplant, adv_wasserstein_optimal,
|
||||
adv_spectral_cascade,
|
||||
adv_layer_selection, adv_winsorize,
|
||||
adv_winsorize_percentile,
|
||||
adv_kl_optimization, adv_kl_budget,
|
||||
adv_float_layer_interp, adv_rdo_refinement,
|
||||
adv_cot_aware,
|
||||
adv_bayesian_trials, adv_n_sae_features,
|
||||
]
|
||||
|
||||
obliterate_btn = gr.Button(
|
||||
@@ -4181,7 +4341,8 @@ result = client.predict(
|
||||
mm_method = gr.Dropdown(
|
||||
choices=["basic", "advanced", "aggressive",
|
||||
"spectral_cascade", "informed", "surgical",
|
||||
"optimized", "inverted", "nuclear"],
|
||||
"optimized", "inverted", "nuclear",
|
||||
"failspy", "gabliteration", "heretic", "rdo"],
|
||||
value="surgical",
|
||||
label="Abliteration Method",
|
||||
)
|
||||
@@ -4550,11 +4711,11 @@ tradeoff point where refusal is minimized with minimal capability damage.
|
||||
|
||||
# ── Tab 6: Tourney ────────────────────────────────────────────────
|
||||
with gr.Tab("Tourney", id="tourney"):
|
||||
gr.Markdown("""### March Madness Tournament
|
||||
Pit **all abliteration methods** against each other in elimination rounds.
|
||||
gr.Markdown("""### Tourney Mode
|
||||
Pit abliteration methods against each other in elimination rounds.
|
||||
The winner is saved locally — push it to HuggingFace Hub from the **Push to Hub** tab.
|
||||
|
||||
**Round 1 — Qualifiers:** All methods, reduced prompts. Bottom half eliminated.
|
||||
**Round 1 — Qualifiers:** Selected methods, reduced prompts. Bottom half eliminated.
|
||||
**Round 2 — Semifinals:** Survivors, full prompts. Bottom half eliminated.
|
||||
**Round 3 — Finals:** Top contenders, maximum prompts. Champion crowned.
|
||||
""")
|
||||
@@ -4566,6 +4727,14 @@ The winner is saved locally — push it to HuggingFace Hub from the **Push to Hu
|
||||
allow_custom_value=True,
|
||||
)
|
||||
|
||||
from obliteratus.tourney import TOURNEY_METHODS as _ALL_TOURNEY_METHODS
|
||||
tourney_methods_cb = gr.CheckboxGroup(
|
||||
choices=_ALL_TOURNEY_METHODS,
|
||||
value=_ALL_TOURNEY_METHODS,
|
||||
label="Methods to Compete",
|
||||
info="Pick at least 3 methods. All selected by default.",
|
||||
)
|
||||
|
||||
with gr.Accordion("Advanced Settings", open=False):
|
||||
with gr.Row():
|
||||
tourney_dataset_dd = gr.Dropdown(
|
||||
@@ -4595,9 +4764,16 @@ The winner is saved locally — push it to HuggingFace Hub from the **Push to Hu
|
||||
|
||||
tourney_btn.click(
|
||||
fn=run_tourney,
|
||||
inputs=[tourney_model_dd,
|
||||
inputs=[tourney_model_dd, tourney_methods_cb,
|
||||
tourney_dataset_dd, tourney_quant_dd],
|
||||
outputs=[tourney_status, tourney_bracket, tourney_log],
|
||||
).then(
|
||||
fn=lambda: (
|
||||
gr.update(choices=_get_session_model_choices()),
|
||||
gr.update(choices=_get_session_model_choices()),
|
||||
_get_vram_html(),
|
||||
),
|
||||
outputs=[session_model_dd, ab_session_model_dd, vram_display],
|
||||
)
|
||||
|
||||
# ── Tab 7: Export ─────────────────────────────────────────────────
|
||||
|
||||
+58
-4
@@ -1317,11 +1317,56 @@
|
||||
<span class="method-label">AGGRESSIVE</span>
|
||||
<span class="method-desc">Full Gabliteration + 3-pass refine</span>
|
||||
</label>
|
||||
<label class="method-radio" id="method-spectral_cascade" onclick="setAblMethod('spectral_cascade')">
|
||||
<input type="radio" name="abl-method" value="spectral_cascade">
|
||||
<span class="method-label">SPECTRAL</span>
|
||||
<span class="method-desc">DCT frequency-selective decomposition</span>
|
||||
</label>
|
||||
<label class="method-radio" id="method-informed" onclick="setAblMethod('informed')" style="border-color:var(--cyan)">
|
||||
<input type="radio" name="abl-method" value="informed">
|
||||
<span class="method-label" style="color:var(--cyan)">INFORMED</span>
|
||||
<span class="method-desc">Analysis-guided auto-config + Ouroboros</span>
|
||||
</label>
|
||||
<label class="method-radio" id="method-surgical" onclick="setAblMethod('surgical')">
|
||||
<input type="radio" name="abl-method" value="surgical">
|
||||
<span class="method-label">SURGICAL</span>
|
||||
<span class="method-desc">Precision MoE-aware head surgery</span>
|
||||
</label>
|
||||
<label class="method-radio" id="method-optimized" onclick="setAblMethod('optimized')">
|
||||
<input type="radio" name="abl-method" value="optimized">
|
||||
<span class="method-label">OPTIMIZED</span>
|
||||
<span class="method-desc">Bayesian auto-tuned + KL-optimized</span>
|
||||
</label>
|
||||
<label class="method-radio" id="method-inverted" onclick="setAblMethod('inverted')">
|
||||
<input type="radio" name="abl-method" value="inverted">
|
||||
<span class="method-label">INVERTED</span>
|
||||
<span class="method-desc">Semantic refusal inversion</span>
|
||||
</label>
|
||||
<label class="method-radio" id="method-nuclear" onclick="setAblMethod('nuclear')">
|
||||
<input type="radio" name="abl-method" value="nuclear">
|
||||
<span class="method-label">NUCLEAR</span>
|
||||
<span class="method-desc">Maximum force combo</span>
|
||||
</label>
|
||||
<label class="method-radio" id="method-failspy" onclick="setAblMethod('failspy')">
|
||||
<input type="radio" name="abl-method" value="failspy">
|
||||
<span class="method-label">FAILSPY</span>
|
||||
<span class="method-desc">FailSpy/abliterator baseline</span>
|
||||
</label>
|
||||
<label class="method-radio" id="method-gabliteration" onclick="setAblMethod('gabliteration')">
|
||||
<input type="radio" name="abl-method" value="gabliteration">
|
||||
<span class="method-label">GABLIT</span>
|
||||
<span class="method-desc">Gabliteration (Gülmez 2026) baseline</span>
|
||||
</label>
|
||||
<label class="method-radio" id="method-heretic" onclick="setAblMethod('heretic')">
|
||||
<input type="radio" name="abl-method" value="heretic">
|
||||
<span class="method-label">HERETIC</span>
|
||||
<span class="method-desc">Heretic/p-e-w Bayesian baseline</span>
|
||||
</label>
|
||||
<label class="method-radio" id="method-rdo" onclick="setAblMethod('rdo')">
|
||||
<input type="radio" name="abl-method" value="rdo">
|
||||
<span class="method-label">RDO</span>
|
||||
<span class="method-desc">Refusal Direction Optimization baseline</span>
|
||||
</label>
|
||||
</div>
|
||||
<div id="method-details" style="margin-top:10px; font-size:0.7rem; color:var(--text-dim); padding:8px; border:1px solid rgba(188,19,254,0.2); border-radius:4px">
|
||||
4 SVD directions • norm-preserving • 30% regularization • 2 refinement passes • 32 prompt pairs
|
||||
@@ -1941,10 +1986,19 @@ function startAbliterateFromLibrary(hfId) {
|
||||
|
||||
let ablMethod = 'advanced';
|
||||
const METHOD_INFO = {
|
||||
basic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'1 direction • standard projection • 1 pass • 32 prompt pairs'},
|
||||
advanced: {dirs:4, norm:true, reg:0.3, passes:2, desc:'4 SVD directions • norm-preserving • 30% regularization • 2 refinement passes • 32 prompt pairs'},
|
||||
aggressive: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions • norm-preserving • full orthogonalization • 3 refinement passes • 32 prompt pairs'},
|
||||
informed: {dirs:'auto', norm:true, reg:'auto', passes:'auto', desc:'<span style="color:var(--cyan)">Analysis-guided</span> • auto directions • auto regularization • Ouroboros-compensated • cone/alignment/cluster/defense analysis'},
|
||||
basic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'1 direction • standard projection • 1 pass'},
|
||||
advanced: {dirs:4, norm:true, reg:0.3, passes:2, desc:'4 SVD directions • norm-preserving • 30% regularization • 2 refinement passes'},
|
||||
aggressive: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions • norm-preserving • full orthogonalization • 3 refinement passes'},
|
||||
spectral_cascade: {dirs:6, norm:true, reg:0.15, passes:1, desc:'6 whitened-SVD directions • DCT frequency decomposition • coherence-weighted • adaptive bands'},
|
||||
informed: {dirs:'auto', norm:true, reg:'auto', passes:'auto', desc:'<span style="color:var(--cyan)">Analysis-guided</span> • auto directions • auto regularization • Ouroboros-compensated • cone/alignment/cluster analysis'},
|
||||
surgical: {dirs:4, norm:true, reg:0.2, passes:2, desc:'4 SVD directions • attention head surgery • SAE features • safety neuron masking • per-expert MoE'},
|
||||
optimized: {dirs:4, norm:true, reg:0.2, passes:2, desc:'4 SVD directions • Bayesian auto-tuned • CoT-aware • KL co-optimized • winsorized activations'},
|
||||
inverted: {dirs:4, norm:true, reg:0.1, passes:2, desc:'4 SVD directions • semantic inversion (2x reflection) • SAE feature targeting'},
|
||||
nuclear: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions • all techniques combined • maximum force • head surgery + SAE + steering + transplant'},
|
||||
failspy: {dirs:1, norm:false, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> • 1 diff-means direction • all layers except first • FailSpy/abliterator reproduction'},
|
||||
gabliteration: {dirs:4, norm:false, reg:0.231, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> • 4 SVD directions • ridge reg (alpha=0.3) • top-k layer selection • Gülmez 2026'},
|
||||
heretic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> • 1 diff-means • Bayesian (Optuna TPE) • KL-optimized • float layer interpolation • p-e-w'},
|
||||
rdo: {dirs:4, norm:true, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> • 4 SVD directions • gradient-refined (RDO) • linear probe classifier • Wollschlager ICML 2025'},
|
||||
};
|
||||
|
||||
function getAblCmd() {
|
||||
|
||||
+162
-49
@@ -63,6 +63,7 @@ METHODS = {
|
||||
"label": "Basic (Arditi et al.)",
|
||||
"description": "Single refusal direction via difference-in-means",
|
||||
"n_directions": 1,
|
||||
"direction_method": "diff_means",
|
||||
"norm_preserve": False,
|
||||
"regularization": 0.0,
|
||||
"refinement_passes": 1,
|
||||
@@ -75,6 +76,7 @@ METHODS = {
|
||||
"label": "Advanced (Multi-direction + Norm-preserving)",
|
||||
"description": "SVD-based multi-direction extraction with norm preservation",
|
||||
"n_directions": 4,
|
||||
"direction_method": "svd",
|
||||
"norm_preserve": True,
|
||||
"regularization": 0.3,
|
||||
"embed_regularization": 0.5,
|
||||
@@ -97,6 +99,7 @@ METHODS = {
|
||||
"Zero regularization for maximum refusal removal."
|
||||
),
|
||||
"n_directions": 8,
|
||||
"direction_method": "svd",
|
||||
"norm_preserve": True,
|
||||
"regularization": 0.0,
|
||||
"refinement_passes": 3,
|
||||
@@ -124,6 +127,7 @@ METHODS = {
|
||||
"separating trained-in refusal patterns from per-layer artifacts."
|
||||
),
|
||||
"n_directions": 6,
|
||||
"direction_method": "svd",
|
||||
"norm_preserve": True,
|
||||
"regularization": 0.0,
|
||||
"refinement_passes": 2,
|
||||
@@ -146,25 +150,31 @@ METHODS = {
|
||||
"Uses InformedAbliterationPipeline for the full feedback loop. "
|
||||
"Auto-detects alignment method (DPO/RLHF/CAI/SFT), maps concept "
|
||||
"cone geometry, performs cluster-aware layer selection, and gates "
|
||||
"projection by safety-capability entanglement. Includes spectral "
|
||||
"certification of abliteration completeness and Wasserstein-optimal "
|
||||
"primary direction extraction."
|
||||
"projection by safety-capability entanglement. Defaults to single "
|
||||
"diff-of-means direction + Bayesian optimization (Heretic-style). "
|
||||
"LEACE available via direction_method='leace'."
|
||||
),
|
||||
"n_directions": 4,
|
||||
"n_directions": 1,
|
||||
"direction_method": "diff_means",
|
||||
"norm_preserve": True,
|
||||
"regularization": 0.0,
|
||||
"refinement_passes": 2,
|
||||
"project_biases": True,
|
||||
"use_chat_template": True,
|
||||
"use_whitened_svd": True,
|
||||
"use_whitened_svd": False,
|
||||
"true_iterative_refinement": True,
|
||||
"use_jailbreak_contrast": False,
|
||||
"layer_adaptive_strength": False,
|
||||
"layer_adaptive_strength": True,
|
||||
"safety_neuron_masking": False,
|
||||
"per_expert_directions": False,
|
||||
"attention_head_surgery": False,
|
||||
"use_sae_features": False,
|
||||
"use_wasserstein_optimal": True,
|
||||
"use_wasserstein_optimal": False,
|
||||
"use_kl_optimization": True,
|
||||
"kl_budget": 0.5,
|
||||
"float_layer_interpolation": True,
|
||||
"winsorize_activations": True,
|
||||
"winsorize_percentile": 0.01,
|
||||
},
|
||||
"surgical": {
|
||||
"label": "Surgical (Full SOTA MoE-Aware)",
|
||||
@@ -176,6 +186,7 @@ METHODS = {
|
||||
"minimizing capability damage via precision targeting."
|
||||
),
|
||||
"n_directions": 8,
|
||||
"direction_method": "svd",
|
||||
"norm_preserve": True,
|
||||
"regularization": 0.0,
|
||||
"refinement_passes": 2,
|
||||
@@ -204,6 +215,7 @@ METHODS = {
|
||||
"techniques plus the inversion layer."
|
||||
),
|
||||
"n_directions": 8,
|
||||
"direction_method": "svd",
|
||||
"norm_preserve": True,
|
||||
"regularization": 0.0,
|
||||
"refinement_passes": 2,
|
||||
@@ -234,6 +246,7 @@ METHODS = {
|
||||
"Best for maximizing quality when compute budget allows ~50 trials."
|
||||
),
|
||||
"n_directions": 4,
|
||||
"direction_method": "svd",
|
||||
"norm_preserve": True,
|
||||
"regularization": 0.0,
|
||||
"refinement_passes": 1,
|
||||
@@ -275,6 +288,7 @@ METHODS = {
|
||||
"runtime overhead except lightweight steering hooks."
|
||||
),
|
||||
"n_directions": 4,
|
||||
"direction_method": "svd",
|
||||
"norm_preserve": True,
|
||||
"regularization": 0.0,
|
||||
"refinement_passes": 2,
|
||||
@@ -314,12 +328,14 @@ METHODS = {
|
||||
"description": (
|
||||
"Faithful reproduction of the FailSpy/abliterator library — the "
|
||||
"most widely used community tool. Single direction via difference-"
|
||||
"in-means (Arditi et al.), middle 60%% layer heuristic (layers "
|
||||
"20%%-80%%), no regularization, no norm preservation. Uses chat "
|
||||
"template for instruct models. This is what most HuggingFace "
|
||||
"abliterated models were created with."
|
||||
"in-means (Arditi et al.), applied to all layers except layer 0 "
|
||||
"(matching FailSpy source: range(1, n_layers)). Projects both "
|
||||
"W_O (attention output) and MLP W_out. No regularization, no "
|
||||
"norm preservation. Uses chat template for instruct models. "
|
||||
"This is what most HuggingFace abliterated models were created with."
|
||||
),
|
||||
"n_directions": 1,
|
||||
"direction_method": "diff_means",
|
||||
"norm_preserve": False,
|
||||
"regularization": 0.0,
|
||||
"refinement_passes": 1,
|
||||
@@ -334,7 +350,7 @@ METHODS = {
|
||||
"attention_head_surgery": False,
|
||||
"use_sae_features": False,
|
||||
"invert_refusal": False,
|
||||
"layer_selection": "middle60",
|
||||
"layer_selection": "all_except_first",
|
||||
},
|
||||
"gabliteration": {
|
||||
"label": "Gabliteration (Gülmez 2026 Baseline)",
|
||||
@@ -347,6 +363,7 @@ METHODS = {
|
||||
"whitened SVD, no iterative refinement."
|
||||
),
|
||||
"n_directions": 4,
|
||||
"direction_method": "svd",
|
||||
"norm_preserve": False,
|
||||
# Ridge alpha=0.3 → effective reg = alpha/(1+alpha) = 0.3/1.3 ≈ 0.231
|
||||
# For orthonormal V: P_V^alpha = 1/(1+alpha) * VV^T = 0.769 * VV^T
|
||||
@@ -367,19 +384,26 @@ METHODS = {
|
||||
"layer_selection": "top_k",
|
||||
},
|
||||
"heretic": {
|
||||
"label": "Heretic / p-e-w (2025 Baseline)",
|
||||
"label": "Heretic / p-e-w (2025-2026 Baseline)",
|
||||
"description": (
|
||||
"Faithful reproduction of Heretic's core algorithm (p-e-w, 2025). "
|
||||
"Bayesian optimization via Optuna TPE with parametric bell curve "
|
||||
"kernel. Uses 1-2 directions (float interpolation between top SVD "
|
||||
"components), component-specific scaling (attention vs MLP), "
|
||||
"activation winsorization (1%% tails). No whitened SVD, no SAE "
|
||||
"features, no jailbreak contrast. The key innovation is replacing "
|
||||
"Faithful reproduction of Heretic's core algorithm (p-e-w, 2025-2026). "
|
||||
"Bayesian optimization via Optuna TPE with linear bell curve layer "
|
||||
"weighting (NOT Gaussian — linear interpolation between max_weight and "
|
||||
"min_weight over min_weight_distance). One diff-of-means direction per "
|
||||
"layer; direction_scope is sampled ('global' selects a float layer index "
|
||||
"with lerp between adjacent layers' directions, 'per layer' uses each "
|
||||
"layer's own direction). LoRA-based ablation (delta W = -lambda * v * "
|
||||
"(v^T W)), never modifies base weights directly. Row normalization "
|
||||
"defaults to NONE (PRE and FULL are options). Activation winsorization "
|
||||
"via symmetric quantile clamping. The key innovation is replacing "
|
||||
"manual hyperparameter selection with automated Pareto optimization "
|
||||
"over the (refusal_rate, KL_divergence) frontier."
|
||||
"over the (refusal_count, KL_divergence) frontier."
|
||||
),
|
||||
"n_directions": 2,
|
||||
"norm_preserve": True,
|
||||
"n_directions": 1,
|
||||
"direction_method": "diff_means",
|
||||
# Heretic default row_normalization is NONE; PRE/FULL are optional.
|
||||
# OBLITERATUS norm_preserve=False matches Heretic's default behavior.
|
||||
"norm_preserve": False,
|
||||
"regularization": 0.0,
|
||||
"refinement_passes": 1,
|
||||
"project_biases": False,
|
||||
@@ -387,14 +411,21 @@ METHODS = {
|
||||
"use_whitened_svd": False,
|
||||
"true_iterative_refinement": False,
|
||||
"use_jailbreak_contrast": False,
|
||||
"layer_adaptive_strength": True,
|
||||
# Heretic uses its own bell curve weighting (linear, not Gaussian),
|
||||
# not OBLITERATUS's norm-based layer_adaptive_strength.
|
||||
"layer_adaptive_strength": False,
|
||||
"safety_neuron_masking": False,
|
||||
"per_expert_directions": False,
|
||||
"attention_head_surgery": False,
|
||||
"use_sae_features": False,
|
||||
"invert_refusal": False,
|
||||
"winsorize_activations": True,
|
||||
"winsorize_percentile": 0.01,
|
||||
# Heretic default winsorization_quantile is 1.0 (disabled by default).
|
||||
# For faithful baseline reproduction we match the source default.
|
||||
"winsorize_activations": False,
|
||||
"winsorize_percentile": 1.0,
|
||||
# Heretic's float direction index interpolates between adjacent LAYERS'
|
||||
# directions (not SVD components). OBLITERATUS float_layer_interpolation
|
||||
# provides the bell-curve layer weighting aspect.
|
||||
"float_layer_interpolation": True,
|
||||
"cot_aware": False,
|
||||
"use_kl_optimization": True,
|
||||
@@ -414,6 +445,7 @@ METHODS = {
|
||||
"boundary rather than the statistical activation difference."
|
||||
),
|
||||
"n_directions": 4,
|
||||
"direction_method": "svd",
|
||||
"norm_preserve": True,
|
||||
"regularization": 0.0,
|
||||
"refinement_passes": 1,
|
||||
@@ -566,6 +598,7 @@ class AbliterationPipeline:
|
||||
hub_token: str | None = None,
|
||||
hub_community_org: str | None = None,
|
||||
n_directions: int | None = None,
|
||||
direction_method: str | None = None,
|
||||
norm_preserve: bool | None = None,
|
||||
regularization: float | None = None,
|
||||
refinement_passes: int | None = None,
|
||||
@@ -659,6 +692,7 @@ class AbliterationPipeline:
|
||||
method_cfg = METHODS[method]
|
||||
self.method = method
|
||||
self.n_directions = n_directions if n_directions is not None else method_cfg["n_directions"]
|
||||
self.direction_method = direction_method if direction_method is not None else method_cfg.get("direction_method", "svd")
|
||||
self.norm_preserve = norm_preserve if norm_preserve is not None else method_cfg["norm_preserve"]
|
||||
self.regularization = regularization if regularization is not None else method_cfg["regularization"]
|
||||
self.refinement_passes = refinement_passes if refinement_passes is not None else method_cfg["refinement_passes"]
|
||||
@@ -936,7 +970,7 @@ class AbliterationPipeline:
|
||||
self.log(f"Loading model: {self.model_name}")
|
||||
self.log(f"Device: {self.device} | Dtype: {self.dtype}")
|
||||
self.log(f"Method: {method_label}")
|
||||
self.log(f" Directions: {self.n_directions} | Norm-preserve: {self.norm_preserve}")
|
||||
self.log(f" Directions: {self.n_directions} ({self.direction_method}) | Norm-preserve: {self.norm_preserve}")
|
||||
self.log(f" Regularization: {self.regularization} | Refinement passes: {self.refinement_passes}")
|
||||
|
||||
self.handle = load_model(
|
||||
@@ -1400,18 +1434,26 @@ class AbliterationPipeline:
|
||||
else:
|
||||
max_length = 384 if collect_multi_pos else 256
|
||||
free_gb = dev.get_total_free_gb()
|
||||
# Scale memory thresholds by model size — a 1.2B model needs far
|
||||
# less KV-cache memory per token than a 7B model. Baseline
|
||||
# thresholds (4 / 2 GB) were tuned for 7B (hidden=4096, layers=32).
|
||||
_h = self.handle.hidden_size if self.handle else 4096
|
||||
_l = n_layers if n_layers else 32
|
||||
_mem_scale = (_h / 4096) * (_l / 32)
|
||||
_tight_gb = max(4.0 * _mem_scale, 0.5)
|
||||
_low_gb = max(2.0 * _mem_scale, 0.25)
|
||||
if dev.is_gpu_available():
|
||||
if self.max_seq_length is None and free_gb < 2.0:
|
||||
if self.max_seq_length is None and free_gb < _low_gb:
|
||||
max_length = 64
|
||||
self.log(f" Low GPU memory ({free_gb:.1f} GB free), using max_length={max_length}")
|
||||
elif self.max_seq_length is None and free_gb < 4.0:
|
||||
self.log(f" Low GPU memory ({free_gb:.1f} GB free, threshold {_low_gb:.1f} GB), using max_length={max_length}")
|
||||
elif self.max_seq_length is None and free_gb < _tight_gb:
|
||||
max_length = 128
|
||||
self.log(f" Tight GPU memory ({free_gb:.1f} GB free), using max_length={max_length}")
|
||||
self.log(f" Tight GPU memory ({free_gb:.1f} GB free, threshold {_tight_gb:.1f} GB), using max_length={max_length}")
|
||||
|
||||
device = self._get_model_device(model)
|
||||
|
||||
# Batch prompts for throughput — hooks unbatch per-prompt activations
|
||||
batch_size = 16 if free_gb > 4.0 else 8 if free_gb > 2.0 else 1
|
||||
batch_size = 16 if free_gb > _tight_gb else 8 if free_gb > _low_gb else 1
|
||||
# Left-pad so position -1 is always the last real token in every batch element
|
||||
orig_padding_side = getattr(tokenizer, "padding_side", "right")
|
||||
if batch_size > 1:
|
||||
@@ -1498,9 +1540,16 @@ class AbliterationPipeline:
|
||||
wasserstein_extractor = WassersteinOptimalExtractor()
|
||||
self.log("Using Wasserstein-optimal direction extraction (cost-minimizing GEP)")
|
||||
|
||||
# Optionally use LEACE for theoretically optimal concept erasure
|
||||
leace_extractor = None
|
||||
if self.direction_method == "leace":
|
||||
from obliteratus.analysis.leace import LEACEExtractor
|
||||
leace_extractor = LEACEExtractor()
|
||||
self.log("Using LEACE (closed-form optimal concept erasure) for direction extraction")
|
||||
|
||||
# Optionally use whitened SVD for cleaner direction extraction
|
||||
whitened_extractor = None
|
||||
if self.use_whitened_svd and n_dirs > 1 and not self.use_wasserstein_optimal:
|
||||
if self.use_whitened_svd and n_dirs > 1 and not self.use_wasserstein_optimal and leace_extractor is None:
|
||||
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
|
||||
whitened_extractor = WhitenedSVDExtractor()
|
||||
self.log("Using whitened SVD (covariance-normalized) for direction extraction")
|
||||
@@ -1547,6 +1596,30 @@ class AbliterationPipeline:
|
||||
if idx < 5:
|
||||
self.log(f" layer {idx}: Wasserstein extraction failed ({e}), falling back to SVD")
|
||||
|
||||
if leace_extractor is not None:
|
||||
# LEACE: closed-form optimal concept erasure direction
|
||||
if idx in self._harmful_acts and idx in self._harmless_acts:
|
||||
try:
|
||||
l_result = leace_extractor.extract(
|
||||
self._harmful_acts[idx],
|
||||
self._harmless_acts[idx],
|
||||
layer_idx=idx,
|
||||
)
|
||||
self.refusal_directions[idx] = l_result.direction
|
||||
self.refusal_subspaces[idx] = l_result.direction.unsqueeze(0)
|
||||
norms[idx] = l_result.generalized_eigenvalue
|
||||
|
||||
if idx < 5 or idx == n_layers - 1:
|
||||
self.log(
|
||||
f" layer {idx}: LEACE eigenvalue={l_result.generalized_eigenvalue:.4f}, "
|
||||
f"erasure_loss={l_result.erasure_loss:.4f}, "
|
||||
f"cond={l_result.within_class_condition:.0f}"
|
||||
)
|
||||
continue
|
||||
except Exception as e:
|
||||
if idx < 5:
|
||||
self.log(f" layer {idx}: LEACE failed ({e}), falling back to diff-of-means")
|
||||
|
||||
if n_dirs == 1:
|
||||
# Classic single-direction: difference-in-means
|
||||
diff = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze(0)
|
||||
@@ -1630,7 +1703,8 @@ class AbliterationPipeline:
|
||||
# Supports multiple algorithms for baseline comparison:
|
||||
# knee_cosmic: OBLITERATUS default (knee detection + COSMIC fusion)
|
||||
# knee: knee detection only (simplified OBLITERATUS)
|
||||
# middle60: FailSpy/abliterator heuristic (layers 20%-80%)
|
||||
# middle60: legacy heuristic (layers 20%-80%)
|
||||
# all_except_first: FailSpy/abliterator (all layers except layer 0)
|
||||
# all: all layers (for Bayesian optimization / Heretic)
|
||||
# top_k: top-k by refusal strength (Gabliteration-style)
|
||||
sorted_layers = sorted(norms.items(), key=lambda x: x[1], reverse=True)
|
||||
@@ -1643,8 +1717,14 @@ class AbliterationPipeline:
|
||||
|
||||
selection_method = self.layer_selection
|
||||
|
||||
if selection_method == "middle60":
|
||||
# FailSpy/abliterator heuristic: middle 60% of layers
|
||||
if selection_method == "all_except_first":
|
||||
# FailSpy/abliterator: all layers except layer 0
|
||||
# Source: range(1, self.model.cfg.n_layers) in FailSpy/abliterator
|
||||
self._strong_layers = list(range(1, n_layers))
|
||||
self.log(f"Layer selection: all-except-first ({len(self._strong_layers)} layers)")
|
||||
|
||||
elif selection_method == "middle60":
|
||||
# Legacy heuristic: middle 60% of layers (layers 20%-80%)
|
||||
self._strong_layers = self._select_layers_middle60(n_layers)
|
||||
self.log(f"Layer selection: middle-60% ({len(self._strong_layers)} layers)")
|
||||
|
||||
@@ -2300,14 +2380,14 @@ class AbliterationPipeline:
|
||||
|
||||
@staticmethod
|
||||
def _select_layers_middle60(n_layers: int) -> list[int]:
|
||||
"""Select the middle 60% of layers (FailSpy/abliterator heuristic).
|
||||
"""Select the middle 60% of layers (legacy heuristic).
|
||||
|
||||
The original abliterator library by FailSpy selects layers from index
|
||||
n_layers*0.2 to n_layers*0.8, based on the empirical observation that
|
||||
refusal concentrates in middle layers (not early embedding layers or
|
||||
late unembedding layers).
|
||||
Selects layers from index n_layers*0.2 to n_layers*0.8.
|
||||
|
||||
Reference: FailSpy/abliterator (2024), GitHub.
|
||||
NOTE: This does NOT match FailSpy/abliterator's actual layer selection.
|
||||
FailSpy uses all layers except layer 0 (range(1, n_layers)). Use
|
||||
layer_selection="all_except_first" for faithful FailSpy reproduction.
|
||||
This method is retained for backward compatibility only.
|
||||
"""
|
||||
start = int(n_layers * 0.2)
|
||||
end = int(n_layers * 0.8)
|
||||
@@ -3589,9 +3669,18 @@ class AbliterationPipeline:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Use LEACE when enabled (matching main _distill)
|
||||
leace_extractor = None
|
||||
if self.direction_method == "leace":
|
||||
try:
|
||||
from obliteratus.analysis.leace import LEACEExtractor
|
||||
leace_extractor = LEACEExtractor()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Use whitened SVD when enabled (matching main _distill)
|
||||
whitened_extractor = None
|
||||
if self.use_whitened_svd and n_dirs > 1 and wasserstein_extractor is None:
|
||||
if self.use_whitened_svd and n_dirs > 1 and wasserstein_extractor is None and leace_extractor is None:
|
||||
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
|
||||
whitened_extractor = WhitenedSVDExtractor()
|
||||
|
||||
@@ -3624,6 +3713,22 @@ class AbliterationPipeline:
|
||||
except Exception:
|
||||
pass # Fall through to SVD
|
||||
|
||||
# LEACE path (matching main _distill)
|
||||
if leace_extractor is not None:
|
||||
if idx in self._harmful_acts and idx in self._harmless_acts:
|
||||
try:
|
||||
l_result = leace_extractor.extract(
|
||||
self._harmful_acts[idx],
|
||||
self._harmless_acts[idx],
|
||||
layer_idx=idx,
|
||||
)
|
||||
self.refusal_directions[idx] = l_result.direction
|
||||
self.refusal_subspaces[idx] = l_result.direction.unsqueeze(0)
|
||||
norms[idx] = l_result.generalized_eigenvalue
|
||||
continue
|
||||
except Exception:
|
||||
pass # Fall through to diff-of-means
|
||||
|
||||
if n_dirs == 1:
|
||||
diff = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze(0)
|
||||
norm = diff.norm()
|
||||
@@ -3667,7 +3772,9 @@ class AbliterationPipeline:
|
||||
|
||||
# Respect configured layer_selection (matching _distill)
|
||||
selection_method = self.layer_selection
|
||||
if selection_method == "middle60":
|
||||
if selection_method == "all_except_first":
|
||||
self._strong_layers = list(range(1, n_layers))
|
||||
elif selection_method == "middle60":
|
||||
self._strong_layers = self._select_layers_middle60(n_layers)
|
||||
elif selection_method == "all":
|
||||
self._strong_layers = self._select_layers_all(n_layers)
|
||||
@@ -5663,8 +5770,9 @@ class AbliterationPipeline:
|
||||
cert_n = min(20, len(self.harmful_prompts), len(self.harmless_prompts))
|
||||
cert_harmful = self._maybe_apply_chat_template(self.harmful_prompts[:cert_n])
|
||||
cert_harmless = self._maybe_apply_chat_template(self.harmless_prompts[:cert_n])
|
||||
cert_h_acts = self._collect_activations(layers, cert_harmful, "cert_harmful")
|
||||
cert_b_acts = self._collect_activations(layers, cert_harmless, "cert_harmless")
|
||||
cert_layer_modules = get_layer_modules(self.handle)
|
||||
cert_h_acts = self._collect_activations(cert_layer_modules, cert_harmful, "cert_harmful")
|
||||
cert_b_acts = self._collect_activations(cert_layer_modules, cert_harmless, "cert_harmless")
|
||||
|
||||
cert_results = []
|
||||
for layer_idx in cert_layers:
|
||||
@@ -5741,6 +5849,7 @@ class AbliterationPipeline:
|
||||
"method": self.method,
|
||||
"method_config": {
|
||||
"n_directions": self.n_directions,
|
||||
"direction_method": self.direction_method,
|
||||
"norm_preserve": self.norm_preserve,
|
||||
"regularization": self.regularization,
|
||||
"refinement_passes": self.refinement_passes,
|
||||
@@ -5868,10 +5977,11 @@ class AbliterationPipeline:
|
||||
param_bytes = sum(v.numel() * v.element_size() for v in state_dict.values())
|
||||
self.log(f"State dict: {len(state_dict)} tensors, {param_bytes / 1e9:.1f} GB")
|
||||
|
||||
# 3. NOW it's safe to clean up the offload dir — all weights are in memory.
|
||||
self._cleanup_offload_dir()
|
||||
|
||||
# 4. Save model + tokenizer + metadata
|
||||
# 3. Save model + tokenizer + metadata
|
||||
# NOTE: offload dir cleanup is deferred until AFTER save_pretrained
|
||||
# completes, because accelerate's dispatch hooks may still access
|
||||
# the offload dir during serialization (even when state_dict is
|
||||
# explicitly provided).
|
||||
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.log(f"Saving model to {self.output_dir}/")
|
||||
|
||||
@@ -5940,6 +6050,9 @@ class AbliterationPipeline:
|
||||
del state_dict
|
||||
self._free_gpu_memory()
|
||||
|
||||
# NOW it's safe to clean up the offload dir — save_pretrained is done.
|
||||
self._cleanup_offload_dir()
|
||||
|
||||
self.handle.tokenizer.save_pretrained(self.output_dir)
|
||||
|
||||
(self.output_dir / "abliteration_metadata.json").write_text(
|
||||
|
||||
@@ -269,7 +269,7 @@ class ConditionalAbliterator:
|
||||
) -> torch.Tensor | None:
|
||||
"""Extract category-specific refusal direction.
|
||||
|
||||
Uses Fisher's Linear Discriminant (whitened difference-of-means)
|
||||
Uses difference-of-means (category_mean - harmless_mean)
|
||||
and then orthogonalizes against previously extracted directions
|
||||
to ensure category independence.
|
||||
"""
|
||||
|
||||
@@ -0,0 +1,239 @@
|
||||
"""LEACE-inspired direction extraction for refusal concept erasure.
|
||||
|
||||
This module implements Fisher's Linear Discriminant (FLD) direction for
|
||||
concept erasure, inspired by LEACE (Belrose et al. 2023).
|
||||
|
||||
IMPORTANT: This is NOT a faithful implementation of LEACE as described in
|
||||
the paper. Key difference:
|
||||
|
||||
- **True LEACE** uses the *total* covariance Sigma_X for whitening:
|
||||
P* = I - W^{-1} P_{W Sigma_XZ} W where W = Sigma_X^{-1/2}
|
||||
For binary concepts, this yields: v = Sigma_X^{-1} delta
|
||||
|
||||
- **This implementation** uses *within-class* covariance S_w:
|
||||
v = S_w^{-1} delta
|
||||
This is Fisher's Linear Discriminant direction, which maximizes
|
||||
class separability relative to within-class spread.
|
||||
|
||||
For binary concepts, Sigma_X = S_w + p(1-p) * delta @ delta^T,
|
||||
so the two directions differ when the between-class scatter is
|
||||
non-negligible relative to within-class scatter. In high-dimensional
|
||||
settings (d >> 1) with moderate class separation, the difference
|
||||
is typically small but non-zero.
|
||||
|
||||
The FLD direction is still a strong choice for refusal erasure — it
|
||||
handles rogue dimensions (high-variance but non-discriminative) better
|
||||
than plain diff-of-means, and is a closed-form solution with no
|
||||
iterative optimization.
|
||||
|
||||
Advantages over SVD:
|
||||
- Within-class normalization prevents high-variance but
|
||||
non-discriminative dimensions from dominating
|
||||
- No hyperparameters beyond regularization epsilon
|
||||
- Closed-form solution (no iterative optimization)
|
||||
|
||||
References:
|
||||
- Belrose et al. (2023): LEACE: Perfect linear concept erasure in
|
||||
closed form. NeurIPS 2023.
|
||||
- Ravfogel et al. (2022): RLACE: Adversarial concept erasure
|
||||
(iterative precursor to LEACE).
|
||||
- Fisher (1936): The use of multiple measurements in taxonomic
|
||||
problems. Annals of Eugenics.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
@dataclass
class LEACEResult:
    """Result of LEACE direction extraction for a single layer."""

    layer_idx: int
    direction: torch.Tensor  # (hidden_dim,) unit vector
    generalized_eigenvalue: float  # lambda from GEP (discriminability)
    within_class_condition: float  # condition number of regularized S_w
    mean_diff_norm: float  # ||mu_1 - mu_0||
    erasure_loss: float  # expected squared distortion from erasure


class LEACEExtractor:
    """Extract refusal directions via Fisher's Linear Discriminant.

    Finds the direction that maximally separates harmful from harmless
    activations relative to within-class variance (v = S_w^{-1} delta).
    See module docstring for how this relates to true LEACE.
    """

    def __init__(
        self,
        regularization_eps: float = 1e-4,
        shrinkage: float = 0.0,
    ):
        """
        Args:
            regularization_eps: Tikhonov regularization for S_w inversion.
                Larger values produce more conservative (but stable) results.
            shrinkage: Ledoit-Wolf shrinkage toward identity (0..1).
                0 = no shrinkage, 1 = full shrinkage to scaled identity.
                Useful when n_samples < hidden_dim.
        """
        self.regularization_eps = regularization_eps
        self.shrinkage = shrinkage

    def extract(
        self,
        harmful_activations: list[torch.Tensor],
        harmless_activations: list[torch.Tensor],
        layer_idx: int = 0,
    ) -> LEACEResult:
        """Extract the FLD (LEACE-style) direction for a single layer.

        Args:
            harmful_activations: List of (hidden_dim,) tensors from harmful prompts.
            harmless_activations: List of (hidden_dim,) tensors from harmless prompts.
            layer_idx: Layer index (for metadata).

        Returns:
            LEACEResult with the optimal erasure direction.

        Raises:
            ValueError: If either activation list is empty.
        """
        # Fail fast with a clear message rather than the opaque RuntimeError
        # torch.stack raises on an empty list.
        if not harmful_activations or not harmless_activations:
            raise ValueError(
                "extract() requires non-empty harmful and harmless activation lists"
            )

        H = torch.stack(harmful_activations).float()  # (n_h, d)
        B = torch.stack(harmless_activations).float()  # (n_b, d)

        # Tolerate (1, d) per-sample tensors: stacking them yields (n, 1, d).
        if H.dim() == 3:
            H = H.squeeze(1)
        if B.dim() == 3:
            B = B.squeeze(1)

        n_h, d = H.shape
        n_b = B.shape[0]

        # Class-conditional means
        mu_h = H.mean(dim=0)  # (d,)
        mu_b = B.mean(dim=0)  # (d,)

        # Mean difference (between-class direction)
        delta = mu_h - mu_b  # (d,)
        delta_norm = delta.norm().item()

        # Within-class covariance: S_w = (S_h + S_b) / 2
        # where S_h = (H - mu_h)^T (H - mu_h) / (n_h - 1) etc.
        H_centered = H - mu_h.unsqueeze(0)
        B_centered = B - mu_b.unsqueeze(0)

        S_h = (H_centered.T @ H_centered) / max(n_h - 1, 1)
        S_b = (B_centered.T @ B_centered) / max(n_b - 1, 1)
        S_w = (S_h + S_b) / 2.0  # (d, d)

        # Build the identity once, matching S_w's device AND dtype so the
        # shrinkage/regularization terms never trigger a dtype promotion.
        eye = torch.eye(d, device=S_w.device, dtype=S_w.dtype)

        # Apply Ledoit-Wolf shrinkage if requested
        if self.shrinkage > 0:
            trace_S_w = S_w.trace().item()
            S_w = (1 - self.shrinkage) * S_w + self.shrinkage * (trace_S_w / d) * eye

        # Regularize S_w for numerical stability
        S_w_reg = S_w + self.regularization_eps * eye

        # Condition number of the *regularized* S_w (diagnostics only).
        try:
            eigs_w = torch.linalg.eigvalsh(S_w_reg)
            eigs_w = eigs_w.clamp(min=0)
            pos_eigs = eigs_w[eigs_w > eigs_w.max() * 1e-10]
            condition = (pos_eigs.max() / pos_eigs.min()).item() if pos_eigs.numel() > 0 else float('inf')
        except Exception:
            condition = float('inf')

        # LEACE direction via S_w^{-1} @ delta
        # The generalized eigenvector for rank-1 S_between = delta @ delta^T
        # reduces to: v = S_w^{-1} @ delta (up to normalization)
        try:
            # Use solve for numerical stability (avoids explicit inverse)
            v = torch.linalg.solve(S_w_reg, delta)  # (d,)
        except torch.linalg.LinAlgError:
            # Fallback: least-squares pseudo-solution
            v = torch.linalg.lstsq(S_w_reg, delta.unsqueeze(1)).solution.squeeze(1)

        # Normalize to unit length
        v_norm = v.norm()
        if v_norm > 1e-8:
            direction = v / v_norm
        else:
            # Degenerate case: fall back to normalized mean difference
            direction = delta / max(delta_norm, 1e-8)

        # Generalized eigenvalue: lambda = delta^T @ S_w^{-1} @ delta
        # This measures how discriminable the classes are after whitening
        gen_eigenvalue = (delta @ v).item()

        # Erasure loss: expected squared distortion E[||x - x'||^2]
        # For rank-1 projection: loss = v^T @ S_total @ v where S_total
        # is the total (pooled) covariance
        all_acts = torch.cat([H, B], dim=0)
        mu_total = all_acts.mean(dim=0)
        centered_total = all_acts - mu_total.unsqueeze(0)
        S_total = (centered_total.T @ centered_total) / max(all_acts.shape[0] - 1, 1)
        erasure_loss = (direction @ S_total @ direction).item()

        return LEACEResult(
            layer_idx=layer_idx,
            direction=direction,
            generalized_eigenvalue=gen_eigenvalue,
            within_class_condition=condition,
            mean_diff_norm=delta_norm,
            erasure_loss=erasure_loss,
        )

    def extract_all_layers(
        self,
        harmful_acts: dict[int, list[torch.Tensor]],
        harmless_acts: dict[int, list[torch.Tensor]],
    ) -> dict[int, LEACEResult]:
        """Extract LEACE directions for all layers.

        Layers present only in ``harmful_acts`` (no harmless counterpart)
        are silently skipped.

        Args:
            harmful_acts: {layer_idx: [activations]} from activation collection.
            harmless_acts: {layer_idx: [activations]} from activation collection.

        Returns:
            {layer_idx: LEACEResult} for each layer present in both dicts.
        """
        results = {}
        for idx in sorted(harmful_acts.keys()):
            if idx not in harmless_acts:
                continue
            results[idx] = self.extract(
                harmful_acts[idx],
                harmless_acts[idx],
                layer_idx=idx,
            )
        return results

    @staticmethod
    def compare_with_diff_of_means(
        leace_result: LEACEResult,
        harmful_mean: torch.Tensor,
        harmless_mean: torch.Tensor,
    ) -> dict[str, float]:
        """Compare LEACE direction with simple diff-of-means.

        Returns cosine similarity and diagnostic metrics showing how much
        the within-class normalization rotates the direction.
        """
        diff = harmful_mean.squeeze() - harmless_mean.squeeze()
        diff_norm = diff.norm()
        if diff_norm > 1e-8:
            diff_normalized = diff / diff_norm
        else:
            diff_normalized = diff

        cosine_sim = (leace_result.direction @ diff_normalized).abs().item()

        return {
            "cosine_similarity": cosine_sim,
            "leace_eigenvalue": leace_result.generalized_eigenvalue,
            "leace_erasure_loss": leace_result.erasure_loss,
            "within_class_condition": leace_result.within_class_condition,
            "mean_diff_norm": leace_result.mean_diff_norm,
        }
|
||||
@@ -428,8 +428,15 @@ class RiemannianManifoldAnalyzer:
|
||||
geodesic triangle with area A satisfies:
|
||||
sum(angles) = pi + K * A (Gauss-Bonnet for small triangles)
|
||||
|
||||
We approximate geodesics with straight lines (valid for small K)
|
||||
and use angle excess to estimate K.
|
||||
IMPORTANT LIMITATION: This method uses Euclidean chords and angles
|
||||
in ambient space, NOT geodesics on the manifold. In flat Euclidean
|
||||
space, the angle sum of any triangle is exactly pi, so this method
|
||||
will yield K ≈ 0 (up to numerical noise) regardless of the actual
|
||||
manifold curvature. The results are only meaningful when the data
|
||||
lies on an approximately low-dimensional curved submanifold and
|
||||
triangles are sufficiently small relative to the curvature radius.
|
||||
For rigorous curvature estimates, use methods based on local PCA
|
||||
eigenvalue decay or Jacobian-based Riemannian metric computation.
|
||||
"""
|
||||
# Compute sides
|
||||
ab = (b - a).float()
|
||||
@@ -613,8 +620,12 @@ class RiemannianManifoldAnalyzer:
|
||||
return torch.zeros_like(activation)
|
||||
v = v / norm
|
||||
|
||||
# Correction magnitude: K * proj_magnitude^2 / 2
|
||||
correction_magnitude = curvature * proj_magnitude ** 2 / 2.0
|
||||
# Second-order geodesic correction: K * proj_magnitude^2 / 6
|
||||
# From Jacobi field estimate: deviation of geodesic from straight
|
||||
# line over distance L with curvature K is ≈ K * L^2 / 6.
|
||||
# Note: the residual bound in analyze() uses K * ||x||^2 / 8
|
||||
# which is a looser upper bound including higher-order terms.
|
||||
correction_magnitude = curvature * proj_magnitude ** 2 / 6.0
|
||||
|
||||
# Clamp to prevent instability
|
||||
correction_magnitude = max(-0.1, min(0.1, correction_magnitude))
|
||||
|
||||
@@ -94,7 +94,7 @@ class SparseAutoencoder(nn.Module):
|
||||
|
||||
@property
|
||||
def decoder_weight(self) -> torch.Tensor:
|
||||
"""Return the decoder weight matrix (n_features x hidden_dim for untied, or encoder.weight.T)."""
|
||||
"""Return the decoder weight matrix (hidden_dim x n_features for untied, or encoder.weight.T)."""
|
||||
if self.tied_weights:
|
||||
return self.encoder.weight.T
|
||||
return self.decoder.weight
|
||||
|
||||
@@ -175,10 +175,11 @@ class SpectralCertifier:
|
||||
harmful_centered = harmful_activations - harmful_mean
|
||||
harmless_centered = harmless_activations - harmless_mean
|
||||
|
||||
# Pooled within-class covariance
|
||||
# Pooled within-class covariance (standard formula: sum of scatter
|
||||
# matrices divided by total degrees of freedom)
|
||||
cov_h = harmful_centered.T @ harmful_centered / max(n_h - 1, 1)
|
||||
cov_b = harmless_centered.T @ harmless_centered / max(n_b - 1, 1)
|
||||
pooled_cov = (cov_h * n_h + cov_b * n_b) / max(n - 2, 1)
|
||||
pooled_cov = (cov_h * (n_h - 1) + cov_b * (n_b - 1)) / max(n - 2, 1)
|
||||
|
||||
# Step 2: Estimate noise variance (median eigenvalue method)
|
||||
noise_var = self._estimate_noise_variance(pooled_cov, n, d)
|
||||
@@ -374,8 +375,13 @@ class SpectralCertifier:
|
||||
# Correct for MP bias: median of MP distribution
|
||||
gamma = d / max(n, 1)
|
||||
if gamma < 1:
|
||||
# MP median approximation (from Bai & Silverstein)
|
||||
mp_median_ratio = (1 + math.sqrt(gamma)) ** 2 * 0.5
|
||||
# MP median approximation. The exact MP median requires
|
||||
# numerical inversion of the MP CDF; we use the empirical
|
||||
# approximation median ≈ (1 - sqrt(gamma))^2 + gamma^(1/3)
|
||||
# which is more accurate than the naive 0.5 * upper_edge
|
||||
# for small gamma. Falls back to the simpler formula when
|
||||
# gamma is very small.
|
||||
mp_median_ratio = (1 - math.sqrt(gamma)) ** 2 + gamma ** (1.0 / 3.0)
|
||||
noise_var = median_eig / max(mp_median_ratio, 1e-10)
|
||||
else:
|
||||
noise_var = median_eig
|
||||
|
||||
@@ -58,7 +58,7 @@ class WassersteinDirectionResult:
|
||||
direction: torch.Tensor # (hidden_dim,) optimal direction
|
||||
wasserstein_cost: float # W_2^2 cost for this direction
|
||||
mean_shift_component: float # (r^T m)^2 portion
|
||||
bures_component: float # r^T Sigma r portion (upper bound)
|
||||
bures_component: float # r^T Sigma r portion (exact when r is eigenvector of Sigma, lower bound otherwise)
|
||||
refusal_projection: float # (r^T d)^2
|
||||
cost_effectiveness_ratio: float # W_2^2 / (r^T d)^2
|
||||
|
||||
|
||||
@@ -142,28 +142,35 @@ def _parametric_layer_weight(
|
||||
min_weight: float,
|
||||
spread: float,
|
||||
) -> float:
|
||||
"""Compute ablation weight for a layer using a parametric bell curve.
|
||||
"""Compute ablation weight for a layer using a piecewise-linear tent kernel.
|
||||
|
||||
This is the Heretic-style parametric kernel:
|
||||
- max_weight: peak ablation strength (0..1)
|
||||
- peak_position: normalized position of peak (0..1 maps to layer 0..n_layers-1)
|
||||
- min_weight: minimum ablation weight at the tails
|
||||
- spread: controls width of the bell curve (higher = wider)
|
||||
Faithful reproduction of Heretic's parametric kernel (p-e-w/heretic):
|
||||
- max_weight: peak ablation strength at peak_position
|
||||
- peak_position: normalized position of peak (0..1)
|
||||
- min_weight: weight at the edges of the tent
|
||||
- spread: normalized distance from peak to tent edge (min_weight_distance)
|
||||
|
||||
Returns a value in [min_weight, max_weight] representing how strongly
|
||||
to ablate this layer (1.0 = full projection, 0.0 = no projection).
|
||||
Layers beyond ``spread`` from the peak get weight 0 (skipped entirely).
|
||||
Within the tent, weight drops linearly from max_weight to min_weight.
|
||||
This matches Heretic's actual formula::
|
||||
|
||||
distance = abs(layer_index - max_weight_position)
|
||||
if distance > min_weight_distance: skip
|
||||
weight = max_weight + (distance / min_weight_distance) * (min_weight - max_weight)
|
||||
"""
|
||||
if n_layers <= 1:
|
||||
return max_weight
|
||||
|
||||
normalized_pos = layer_idx / (n_layers - 1)
|
||||
peak = peak_position
|
||||
# Gaussian-shaped kernel
|
||||
dist = abs(normalized_pos - peak)
|
||||
sigma = max(spread, 0.01)
|
||||
gauss = math.exp(-0.5 * (dist / sigma) ** 2)
|
||||
dist = abs(normalized_pos - peak_position)
|
||||
min_weight_distance = max(spread, 0.01)
|
||||
|
||||
return min_weight + (max_weight - min_weight) * gauss
|
||||
# Hard cutoff: layers outside the tent get 0 weight (Heretic skips them)
|
||||
if dist > min_weight_distance:
|
||||
return 0.0
|
||||
|
||||
# Linear interpolation: max_weight at peak → min_weight at edges
|
||||
return max_weight + (dist / min_weight_distance) * (min_weight - max_weight)
|
||||
|
||||
|
||||
def _interpolate_direction(
|
||||
@@ -171,37 +178,56 @@ def _interpolate_direction(
|
||||
layer_idx: int,
|
||||
float_dir_idx: float,
|
||||
) -> torch.Tensor:
|
||||
"""Get an interpolated refusal direction from a float-valued index.
|
||||
"""Get an interpolated refusal direction from a float-valued layer index.
|
||||
|
||||
Non-integer values interpolate between adjacent SVD directions in the
|
||||
refusal subspace, unlocking a continuous space of directions beyond
|
||||
the discrete top-k.
|
||||
Faithful reproduction of Heretic's direction interpolation: the index
|
||||
selects which *layer's* diff-of-means direction to use, with float
|
||||
values interpolating between adjacent layers' directions. This is
|
||||
fundamentally different from interpolating between SVD components
|
||||
within a single layer — it searches across the layer axis.
|
||||
|
||||
From Heretic source (model.py)::
|
||||
|
||||
weight, index = math.modf(direction_index + 1)
|
||||
refusal_direction = F.normalize(
|
||||
refusal_directions[int(index)].lerp(
|
||||
refusal_directions[int(index) + 1], weight), p=2, dim=0)
|
||||
|
||||
Args:
|
||||
pipeline: Pipeline with extracted refusal subspaces.
|
||||
layer_idx: Which layer's subspace to use.
|
||||
float_dir_idx: Continuous direction index (e.g., 0.7 interpolates
|
||||
between direction 0 and direction 1).
|
||||
pipeline: Pipeline with extracted refusal directions per layer.
|
||||
layer_idx: The layer being projected (used as fallback).
|
||||
float_dir_idx: Continuous direction index — selects which layer's
|
||||
direction to use (e.g., 5.3 interpolates 70% layer-5 + 30% layer-6).
|
||||
|
||||
Returns:
|
||||
Normalized direction tensor.
|
||||
"""
|
||||
subspace = pipeline.refusal_subspaces.get(layer_idx)
|
||||
if subspace is None or subspace.shape[0] == 0:
|
||||
# Build sorted list of layer indices that have refusal directions
|
||||
sorted_layers = sorted(pipeline.refusal_directions.keys())
|
||||
if not sorted_layers:
|
||||
return pipeline.refusal_directions.get(layer_idx, torch.zeros(1))
|
||||
|
||||
n_dirs = subspace.shape[0]
|
||||
# Clamp to valid range
|
||||
float_dir_idx = max(0.0, min(float_dir_idx, n_dirs - 1))
|
||||
n_layers_with_dirs = len(sorted_layers)
|
||||
|
||||
# Heretic uses direction_index + 1 offset; we map float_dir_idx into
|
||||
# the sorted layer list, clamped to valid range.
|
||||
float_dir_idx = max(0.0, min(float_dir_idx, n_layers_with_dirs - 1))
|
||||
|
||||
lo = int(float_dir_idx)
|
||||
hi = min(lo + 1, n_dirs - 1)
|
||||
hi = min(lo + 1, n_layers_with_dirs - 1)
|
||||
|
||||
lo_layer = sorted_layers[lo]
|
||||
hi_layer = sorted_layers[hi]
|
||||
|
||||
d_lo = pipeline.refusal_directions[lo_layer]
|
||||
d_hi = pipeline.refusal_directions[hi_layer]
|
||||
|
||||
if lo == hi:
|
||||
d = subspace[lo]
|
||||
d = d_lo
|
||||
else:
|
||||
# Linear interpolation between adjacent layers' directions
|
||||
alpha = float_dir_idx - lo
|
||||
d = (1.0 - alpha) * subspace[lo] + alpha * subspace[hi]
|
||||
d = (1.0 - alpha) * d_lo + alpha * d_hi
|
||||
|
||||
norm = d.norm()
|
||||
if norm > 1e-8:
|
||||
@@ -342,9 +368,14 @@ def run_bayesian_optimization(
|
||||
for live_data, saved_clone in original_params: # noqa: F821
|
||||
live_data.copy_(saved_clone.to(live_data.device))
|
||||
|
||||
# Warm-start values for the parametric kernel
|
||||
# Estimate peak position from strongest layer
|
||||
if pipeline._strong_layers:
|
||||
# Warm-start values for the parametric kernel.
|
||||
# If the informed pipeline provided analysis-derived warm-start params,
|
||||
# use those (they're much better than the default heuristic).
|
||||
informed_warm = getattr(pipeline, "_informed_warm_start", None)
|
||||
if informed_warm:
|
||||
warm_peak = informed_warm.get("peak_position", 0.5)
|
||||
pipeline.log(f" Using analysis-informed warm-start (peak={warm_peak:.2f})")
|
||||
elif pipeline._strong_layers:
|
||||
peak_layer = pipeline._strong_layers[0]
|
||||
warm_peak = peak_layer / max(n_total_layers - 1, 1)
|
||||
else:
|
||||
@@ -356,56 +387,56 @@ def run_bayesian_optimization(
|
||||
# Suppress Optuna's verbose logging
|
||||
optuna.logging.set_verbosity(optuna.logging.WARNING)
|
||||
|
||||
# Max SVD directions available (for float direction interpolation)
|
||||
max_n_dirs = max(
|
||||
(pipeline.refusal_subspaces[idx].shape[0]
|
||||
for idx in pipeline._strong_layers
|
||||
if idx in pipeline.refusal_subspaces),
|
||||
default=1,
|
||||
)
|
||||
# Max layers with directions (for float direction interpolation)
|
||||
n_layers_with_dirs = len([
|
||||
idx for idx in pipeline._strong_layers
|
||||
if idx in pipeline.refusal_directions
|
||||
])
|
||||
|
||||
# ── Phase 1: Parametric kernel optimization (compact search space) ──
|
||||
# Heretic uses SEPARATE kernel parameters for attention and MLP,
|
||||
# allowing them to peak at different layers (8 params + 1 dir_idx = 9).
|
||||
|
||||
def objective(trial: optuna.Trial) -> tuple[float, float]:
|
||||
"""Multi-objective: minimize (refusal_rate, kl_divergence)."""
|
||||
_restore_all()
|
||||
|
||||
# Parametric kernel: 4 params describe the entire layer weighting
|
||||
max_weight = trial.suggest_float("max_weight", 0.5, 1.0)
|
||||
peak_position = trial.suggest_float("peak_position", 0.1, 0.9)
|
||||
min_weight = trial.suggest_float("min_weight", 0.0, 0.3)
|
||||
spread = trial.suggest_float("spread", 0.1, 0.6)
|
||||
# Attention kernel: 4 params
|
||||
attn_max = trial.suggest_float("attn_max_weight", 0.5, 1.0)
|
||||
attn_peak = trial.suggest_float("attn_peak_position", 0.1, 0.9)
|
||||
attn_min = trial.suggest_float("attn_min_weight", 0.0, 0.3)
|
||||
attn_spread = trial.suggest_float("attn_spread", 0.1, 0.6)
|
||||
|
||||
# Component-specific scaling (Heretic insight: MLP more damaging)
|
||||
attn_scale = trial.suggest_float("attn_scale", 0.5, 1.0)
|
||||
mlp_scale = trial.suggest_float("mlp_scale", 0.3, 1.0)
|
||||
# MLP kernel: 4 params (separate — can peak at a different layer)
|
||||
mlp_max = trial.suggest_float("mlp_max_weight", 0.3, 1.0)
|
||||
mlp_peak = trial.suggest_float("mlp_peak_position", 0.1, 0.9)
|
||||
mlp_min = trial.suggest_float("mlp_min_weight", 0.0, 0.3)
|
||||
mlp_spread = trial.suggest_float("mlp_spread", 0.1, 0.6)
|
||||
|
||||
# Float direction index (continuous interpolation between SVD dirs)
|
||||
dir_idx = trial.suggest_float("dir_idx", 0.0, max(max_n_dirs - 1, 0.0))
|
||||
# Float direction index (cross-layer interpolation, Heretic-style)
|
||||
dir_idx = trial.suggest_float("dir_idx", 0.0, max(n_layers_with_dirs - 1, 0.0))
|
||||
|
||||
# Compute per-layer regularization from parametric kernel
|
||||
layer_regs: dict[int, float] = {}
|
||||
# Compute per-layer, per-component regularization from kernels
|
||||
attn_regs: dict[int, float] = {}
|
||||
mlp_regs: dict[int, float] = {}
|
||||
for idx in pipeline._strong_layers:
|
||||
weight = _parametric_layer_weight(
|
||||
idx, n_total_layers, max_weight, peak_position, min_weight, spread,
|
||||
)
|
||||
# Convert weight to regularization (weight=1 → reg=0, weight=0 → reg=1)
|
||||
layer_regs[idx] = 1.0 - weight
|
||||
attn_w = _parametric_layer_weight(idx, n_total_layers, attn_max, attn_peak, attn_min, attn_spread)
|
||||
mlp_w = _parametric_layer_weight(idx, n_total_layers, mlp_max, mlp_peak, mlp_min, mlp_spread)
|
||||
attn_regs[idx] = 1.0 - attn_w
|
||||
mlp_regs[idx] = 1.0 - mlp_w
|
||||
|
||||
# Apply projection with trial's parameters
|
||||
for idx in pipeline._strong_layers:
|
||||
if idx not in pipeline.refusal_subspaces:
|
||||
if idx not in pipeline.refusal_directions:
|
||||
continue
|
||||
|
||||
# Use interpolated direction
|
||||
# Use cross-layer interpolated direction
|
||||
direction = _interpolate_direction(pipeline, idx, dir_idx)
|
||||
d_col = direction.to(device=next(layer_modules[idx].parameters()).device)
|
||||
d_col = d_col.unsqueeze(-1) if d_col.dim() == 1 else d_col
|
||||
|
||||
reg = layer_regs[idx]
|
||||
|
||||
# Attention projection (with attn_scale)
|
||||
attn_reg = 1.0 - (1.0 - reg) * attn_scale
|
||||
# Attention projection (with per-component kernel)
|
||||
attn_reg = attn_regs[idx]
|
||||
try:
|
||||
attn = get_attention_module(layer_modules[idx], arch)
|
||||
pipeline._project_out_advanced(
|
||||
@@ -416,8 +447,8 @@ def run_bayesian_optimization(
|
||||
except (AttributeError, RuntimeError):
|
||||
pass
|
||||
|
||||
# MLP/FFN projection (with mlp_scale)
|
||||
mlp_reg = 1.0 - (1.0 - reg) * mlp_scale
|
||||
# MLP/FFN projection (with per-component kernel)
|
||||
mlp_reg = mlp_regs[idx]
|
||||
try:
|
||||
ffn = get_ffn_module(layer_modules[idx], arch)
|
||||
count = pipeline._project_out_advanced(
|
||||
@@ -439,18 +470,20 @@ def run_bayesian_optimization(
|
||||
refusal = _measure_refusal_rate(pipeline, n_prompts=n_refusal_prompts)
|
||||
kl = _measure_kl_divergence(pipeline, reference_logits, kl_prompts)
|
||||
|
||||
# Track best combined score
|
||||
# Track best combined score (use average of attn/mlp regs for layer_regs)
|
||||
nonlocal best_score, best_result
|
||||
combined = refusal + 0.5 * kl
|
||||
if combined < best_score:
|
||||
best_score = combined
|
||||
best_result = dict(layer_regs)
|
||||
best_result = {
|
||||
idx: (attn_regs[idx] + mlp_regs[idx]) / 2.0
|
||||
for idx in pipeline._strong_layers
|
||||
}
|
||||
|
||||
pipeline.log(
|
||||
f" Trial {trial.number + 1}/{n_trials}: "
|
||||
f"refusal={refusal:.0%}, KL={kl:.4f} "
|
||||
f"(peak={peak_position:.2f}, spread={spread:.2f}, "
|
||||
f"attn={attn_scale:.2f}, mlp={mlp_scale:.2f}, dir={dir_idx:.2f})"
|
||||
f"(attn_peak={attn_peak:.2f}, mlp_peak={mlp_peak:.2f}, dir={dir_idx:.2f})"
|
||||
)
|
||||
|
||||
return refusal, kl
|
||||
@@ -462,16 +495,33 @@ def run_bayesian_optimization(
|
||||
study_name="obliteratus_parametric_optimization",
|
||||
)
|
||||
|
||||
# Enqueue warm-start trial with analysis-derived estimates
|
||||
warm_params = {
|
||||
"max_weight": 0.9,
|
||||
"peak_position": warm_peak,
|
||||
"min_weight": 0.05,
|
||||
"spread": 0.3,
|
||||
"attn_scale": 0.8,
|
||||
"mlp_scale": 0.6,
|
||||
"dir_idx": 0.0,
|
||||
}
|
||||
# Enqueue warm-start trial with analysis-derived estimates.
|
||||
# Translate informed pipeline params to the new per-component format.
|
||||
if informed_warm:
|
||||
iw = informed_warm
|
||||
warm_params = {
|
||||
"attn_max_weight": iw.get("max_weight", 0.9),
|
||||
"attn_peak_position": iw.get("peak_position", warm_peak),
|
||||
"attn_min_weight": iw.get("min_weight", 0.05),
|
||||
"attn_spread": iw.get("spread", 0.3),
|
||||
"mlp_max_weight": iw.get("max_weight", 0.9) * iw.get("mlp_scale", 0.6),
|
||||
"mlp_peak_position": iw.get("peak_position", warm_peak),
|
||||
"mlp_min_weight": iw.get("min_weight", 0.05),
|
||||
"mlp_spread": iw.get("spread", 0.3),
|
||||
"dir_idx": iw.get("dir_idx", 0.0),
|
||||
}
|
||||
else:
|
||||
warm_params = {
|
||||
"attn_max_weight": 0.9,
|
||||
"attn_peak_position": warm_peak,
|
||||
"attn_min_weight": 0.05,
|
||||
"attn_spread": 0.3,
|
||||
"mlp_max_weight": 0.6,
|
||||
"mlp_peak_position": warm_peak,
|
||||
"mlp_min_weight": 0.05,
|
||||
"mlp_spread": 0.3,
|
||||
"dir_idx": 0.0,
|
||||
}
|
||||
study.enqueue_trial(warm_params)
|
||||
|
||||
pipeline.log(f"Bayesian optimization: running {n_trials} trials (parametric kernel)...")
|
||||
@@ -490,25 +540,32 @@ def run_bayesian_optimization(
|
||||
p = best_trial.params
|
||||
best_result = {}
|
||||
for idx in pipeline._strong_layers:
|
||||
weight = _parametric_layer_weight(
|
||||
attn_w = _parametric_layer_weight(
|
||||
idx, n_total_layers,
|
||||
p["max_weight"], p["peak_position"],
|
||||
p["min_weight"], p["spread"],
|
||||
p["attn_max_weight"], p["attn_peak_position"],
|
||||
p["attn_min_weight"], p["attn_spread"],
|
||||
)
|
||||
best_result[idx] = 1.0 - weight
|
||||
mlp_w = _parametric_layer_weight(
|
||||
idx, n_total_layers,
|
||||
p["mlp_max_weight"], p["mlp_peak_position"],
|
||||
p["mlp_min_weight"], p["mlp_spread"],
|
||||
)
|
||||
best_result[idx] = (attn_w + mlp_w) / 2.0 # average for layer-level reg
|
||||
best_result[idx] = 1.0 - best_result[idx]
|
||||
|
||||
pipeline.log(
|
||||
f" Best trial: refusal={best_trial.values[0]:.0%}, "
|
||||
f"KL={best_trial.values[1]:.4f}"
|
||||
)
|
||||
pipeline.log(
|
||||
f" Kernel: peak={p['peak_position']:.2f}, spread={p['spread']:.2f}, "
|
||||
f"max={p['max_weight']:.2f}, min={p['min_weight']:.2f}"
|
||||
f" Attn kernel: peak={p['attn_peak_position']:.2f}, "
|
||||
f"spread={p['attn_spread']:.2f}, max={p['attn_max_weight']:.2f}"
|
||||
)
|
||||
pipeline.log(
|
||||
f" Components: attn={p['attn_scale']:.2f}, mlp={p['mlp_scale']:.2f}, "
|
||||
f"dir_idx={p['dir_idx']:.2f}"
|
||||
f" MLP kernel: peak={p['mlp_peak_position']:.2f}, "
|
||||
f"spread={p['mlp_spread']:.2f}, max={p['mlp_max_weight']:.2f}"
|
||||
)
|
||||
pipeline.log(f" dir_idx={p['dir_idx']:.2f}")
|
||||
|
||||
# Store the best direction index for use during EXCISE
|
||||
best_dir_idx = p.get("dir_idx", 0.0)
|
||||
@@ -518,9 +575,9 @@ def run_bayesian_optimization(
|
||||
new_dir = _interpolate_direction(pipeline, idx, best_dir_idx)
|
||||
pipeline.refusal_directions[idx] = new_dir
|
||||
|
||||
# Store component scales for use in EXCISE
|
||||
pipeline._bayesian_attn_scale = p.get("attn_scale", 1.0)
|
||||
pipeline._bayesian_mlp_scale = p.get("mlp_scale", 1.0)
|
||||
# Store component scales for use in EXCISE (backward compat)
|
||||
pipeline._bayesian_attn_scale = p.get("attn_max_weight", 1.0)
|
||||
pipeline._bayesian_mlp_scale = p.get("mlp_max_weight", 1.0)
|
||||
|
||||
elif best_result:
|
||||
pipeline.log(f" Using best combined score: {best_score:.4f}")
|
||||
|
||||
+7
-1
@@ -109,7 +109,12 @@ def main(argv: list[str] | None = None):
|
||||
],
|
||||
help="Liberation method (default: advanced)",
|
||||
)
|
||||
p.add_argument("--n-directions", type=int, default=None, help="Override: number of SVD directions to extract")
|
||||
p.add_argument("--n-directions", type=int, default=None, help="Override: number of refusal directions to extract")
|
||||
p.add_argument(
|
||||
"--direction-method", type=str, default=None,
|
||||
choices=["diff_means", "svd", "leace"],
|
||||
help="Direction extraction method: diff_means (simple, robust), svd (multi-direction), leace (optimal erasure)",
|
||||
)
|
||||
p.add_argument("--regularization", type=float, default=None, help="Override: fraction to preserve (0.0-1.0)")
|
||||
p.add_argument("--refinement-passes", type=int, default=None, help="Override: number of iterative passes")
|
||||
p.add_argument(
|
||||
@@ -591,6 +596,7 @@ def _cmd_abliterate(args):
|
||||
dtype=args.dtype,
|
||||
method=method,
|
||||
n_directions=args.n_directions,
|
||||
direction_method=getattr(args, "direction_method", None),
|
||||
regularization=args.regularization,
|
||||
refinement_passes=args.refinement_passes,
|
||||
quantization=args.quantization,
|
||||
|
||||
@@ -334,19 +334,20 @@ def _load_harmbench_classifier():
|
||||
bnb_4bit_quant_type="nf4",
|
||||
llm_int8_enable_fp32_cpu_offload=True,
|
||||
)
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id,
|
||||
quantization_config=bnb_cfg,
|
||||
device_map="auto",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
load_kwargs = dict(quantization_config=bnb_cfg, torch_dtype=torch.float16)
|
||||
if dev.supports_device_map_auto():
|
||||
load_kwargs["device_map"] = "auto"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
|
||||
except Exception:
|
||||
logger.info("4-bit quantization unavailable for classifier, loading in float16")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id,
|
||||
device_map="auto",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
load_kwargs = dict(torch_dtype=torch.float16)
|
||||
if dev.supports_device_map_auto():
|
||||
load_kwargs["device_map"] = "auto"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
|
||||
|
||||
# On MPS/CPU: move model to best available device
|
||||
if not dev.supports_device_map_auto():
|
||||
model = model.to(dev.get_device())
|
||||
|
||||
model.eval()
|
||||
_HARMBENCH_CLASSIFIER = (model, tokenizer)
|
||||
|
||||
@@ -73,15 +73,17 @@ INFORMED_METHOD = {
|
||||
"description": (
|
||||
"Runs analysis modules between PROBE and DISTILL to auto-configure "
|
||||
"direction extraction, layer selection, and projection strategy based "
|
||||
"on the model's actual refusal geometry."
|
||||
"on the model's actual refusal geometry. Defaults to single diff-of-means "
|
||||
"direction + Bayesian optimization (Heretic-style)."
|
||||
),
|
||||
"n_directions": 4, # overridden by analysis
|
||||
"n_directions": 1, # overridden by analysis
|
||||
"direction_method": "diff_means", # overridden by analysis; "leace" also available
|
||||
"norm_preserve": True,
|
||||
"regularization": 0.0, # overridden by analysis
|
||||
"refinement_passes": 2, # overridden by analysis
|
||||
"project_biases": True,
|
||||
"use_chat_template": True,
|
||||
"use_whitened_svd": True, # overridden by analysis
|
||||
"use_whitened_svd": False, # overridden by analysis
|
||||
"true_iterative_refinement": True,
|
||||
}
|
||||
|
||||
@@ -126,7 +128,8 @@ class AnalysisInsights:
|
||||
clean_layers: list[int] = field(default_factory=list)
|
||||
|
||||
# Derived configuration
|
||||
recommended_n_directions: int = 4
|
||||
recommended_n_directions: int = 1
|
||||
recommended_direction_method: str = "diff_means"
|
||||
recommended_regularization: float = 0.0
|
||||
recommended_refinement_passes: int = 2
|
||||
recommended_layers: list[int] = field(default_factory=list)
|
||||
@@ -217,12 +220,19 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
hub_token=hub_token,
|
||||
hub_community_org=hub_community_org,
|
||||
quantization=quantization,
|
||||
# Set informed defaults
|
||||
# Set informed defaults: single direction + Bayesian opt
|
||||
n_directions=1,
|
||||
direction_method="diff_means",
|
||||
norm_preserve=True,
|
||||
project_biases=True,
|
||||
use_chat_template=True,
|
||||
use_whitened_svd=True,
|
||||
use_whitened_svd=False,
|
||||
true_iterative_refinement=True,
|
||||
use_kl_optimization=True,
|
||||
float_layer_interpolation=True,
|
||||
layer_adaptive_strength=True,
|
||||
winsorize_activations=True,
|
||||
winsorize_percentile=0.01,
|
||||
)
|
||||
self.method = "informed"
|
||||
|
||||
@@ -311,7 +321,11 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
if self._run_defense:
|
||||
self._analyze_defense_robustness()
|
||||
|
||||
# 5. Derive configuration from insights
|
||||
# 5. Sparse Surgery Analysis (RSI computation)
|
||||
if self._run_sparse:
|
||||
self._analyze_sparsity()
|
||||
|
||||
# 6. Derive configuration from insights
|
||||
self._derive_configuration()
|
||||
|
||||
elapsed = time.time() - t0
|
||||
@@ -392,6 +406,7 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
sample_layers = candidate_layers[::step]
|
||||
|
||||
polyhedral_count = 0
|
||||
all_results = []
|
||||
best_cone_result = None
|
||||
best_strength = 0.0
|
||||
|
||||
@@ -405,34 +420,43 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
layer_idx=layer_idx,
|
||||
)
|
||||
|
||||
all_results.append(result)
|
||||
if result.is_polyhedral:
|
||||
polyhedral_count += 1
|
||||
|
||||
# Track the strongest layer's cone analysis
|
||||
# Track the strongest layer's cone analysis for per-category directions
|
||||
general_strength = result.general_direction.norm().item() if result.general_direction.numel() > 1 else 0
|
||||
if general_strength > best_strength:
|
||||
best_strength = general_strength
|
||||
best_cone_result = result
|
||||
|
||||
if best_cone_result is not None:
|
||||
self._insights.cone_is_polyhedral = best_cone_result.is_polyhedral
|
||||
self._insights.cone_dimensionality = best_cone_result.cone_dimensionality
|
||||
self._insights.mean_pairwise_cosine = best_cone_result.mean_pairwise_cosine
|
||||
if all_results:
|
||||
# Aggregate cone geometry across sampled layers (majority vote +
|
||||
# mean dimensionality) instead of relying on a single layer.
|
||||
n_sampled = len(all_results)
|
||||
is_polyhedral = polyhedral_count > n_sampled / 2
|
||||
avg_dimensionality = sum(r.cone_dimensionality for r in all_results) / n_sampled
|
||||
avg_pairwise_cos = sum(r.mean_pairwise_cosine for r in all_results) / n_sampled
|
||||
|
||||
# Store per-category directions for category-aware excision
|
||||
for cd in best_cone_result.category_directions:
|
||||
self._insights.per_category_directions[cd.category] = cd.direction
|
||||
self._insights.direction_specificity[cd.category] = cd.specificity
|
||||
self._insights.cone_is_polyhedral = is_polyhedral
|
||||
self._insights.cone_dimensionality = avg_dimensionality
|
||||
self._insights.mean_pairwise_cosine = avg_pairwise_cos
|
||||
|
||||
cone_type = "POLYHEDRAL" if best_cone_result.is_polyhedral else "LINEAR"
|
||||
self.log(f" Cone type: {cone_type}")
|
||||
self.log(f" Dimensionality: {best_cone_result.cone_dimensionality:.2f}")
|
||||
self.log(f" Mean pairwise cosine: {best_cone_result.mean_pairwise_cosine:.3f}")
|
||||
self.log(f" Categories detected: {best_cone_result.category_count}")
|
||||
self.log(f" Polyhedral at {polyhedral_count}/{len(sample_layers)} sampled layers")
|
||||
# Store per-category directions from the strongest layer
|
||||
if best_cone_result is not None:
|
||||
for cd in best_cone_result.category_directions:
|
||||
self._insights.per_category_directions[cd.category] = cd.direction
|
||||
self._insights.direction_specificity[cd.category] = cd.specificity
|
||||
|
||||
for cd in sorted(best_cone_result.category_directions, key=lambda x: -x.strength)[:5]:
|
||||
self.log(f" {cd.category:15s} DSI={cd.specificity:.3f} str={cd.strength:.3f}")
|
||||
cone_type = "POLYHEDRAL" if is_polyhedral else "LINEAR"
|
||||
self.log(f" Cone type: {cone_type} (majority vote: {polyhedral_count}/{n_sampled} layers)")
|
||||
self.log(f" Avg dimensionality: {avg_dimensionality:.2f}")
|
||||
self.log(f" Avg pairwise cosine: {avg_pairwise_cos:.3f}")
|
||||
if best_cone_result is not None:
|
||||
self.log(f" Categories detected: {best_cone_result.category_count}")
|
||||
|
||||
for cd in sorted(best_cone_result.category_directions, key=lambda x: -x.strength)[:5]:
|
||||
self.log(f" {cd.category:15s} DSI={cd.specificity:.3f} str={cd.strength:.3f}")
|
||||
else:
|
||||
self.log(" No cone results — using default linear assumption")
|
||||
|
||||
@@ -517,6 +541,71 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
self.log(f" Most entangled layers: {emap.most_entangled_layers}")
|
||||
self.log(f" Cleanest layers: {emap.least_entangled_layers}")
|
||||
|
||||
def _analyze_sparsity(self):
|
||||
"""Compute Refusal Sparsity Index to decide sparse vs dense excision."""
|
||||
self.log("\n[5/5] Refusal Sparsity Analysis")
|
||||
self.log("-" * 40)
|
||||
|
||||
from obliteratus.analysis.sparse_surgery import SparseDirectionSurgeon
|
||||
from obliteratus.strategies.utils import (
|
||||
get_ffn_module,
|
||||
get_layer_modules,
|
||||
)
|
||||
|
||||
# Need refusal directions — use quick diff-in-means
|
||||
quick_directions = {}
|
||||
for idx in sorted(self._harmful_means.keys()):
|
||||
diff = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze()
|
||||
norm = diff.norm().item()
|
||||
if norm > 1e-10:
|
||||
quick_directions[idx] = diff / diff.norm()
|
||||
|
||||
if not quick_directions:
|
||||
self.log(" No refusal directions — skipping sparsity analysis")
|
||||
return
|
||||
|
||||
# Gather FFN output weights for representative layers (sample for speed)
|
||||
layers = get_layer_modules(self.handle)
|
||||
arch = self.handle.architecture
|
||||
n_layers = len(layers)
|
||||
sample_idxs = sorted(quick_directions.keys())
|
||||
step = max(1, len(sample_idxs) // 8)
|
||||
sample_idxs = sample_idxs[::step]
|
||||
|
||||
weights = {}
|
||||
sampled_dirs = {}
|
||||
for idx in sample_idxs:
|
||||
if idx >= n_layers:
|
||||
continue
|
||||
try:
|
||||
ffn = get_ffn_module(layers[idx], arch)
|
||||
for name in ["down_proj", "c_proj", "dense_4h_to_h", "fc_out", "fc2", "w2"]:
|
||||
proj = getattr(ffn, name, None)
|
||||
if proj is not None and hasattr(proj, "weight"):
|
||||
W = proj.weight.data
|
||||
d = quick_directions[idx]
|
||||
if W.shape[-1] == d.shape[0]:
|
||||
weights[idx] = W
|
||||
sampled_dirs[idx] = d
|
||||
break
|
||||
except (AttributeError, RuntimeError):
|
||||
continue
|
||||
|
||||
if not weights:
|
||||
self.log(" Could not access FFN weights — skipping sparsity analysis")
|
||||
return
|
||||
|
||||
surgeon = SparseDirectionSurgeon(auto_sparsity=True)
|
||||
plan = surgeon.plan_surgery(weights, sampled_dirs)
|
||||
|
||||
self._insights.mean_refusal_sparsity_index = plan.mean_refusal_sparsity_index
|
||||
self._insights.recommended_sparsity = plan.recommended_sparsity
|
||||
|
||||
self.log(f" Mean RSI: {plan.mean_refusal_sparsity_index:.3f}")
|
||||
self.log(f" Recommended sparsity: {plan.recommended_sparsity:.1%}")
|
||||
self.log(f" Most sparse layer: {plan.most_sparse_layer}")
|
||||
self.log(f" Most dense layer: {plan.most_dense_layer}")
|
||||
|
||||
# ── Configuration Derivation ─────────────────────────────────────
|
||||
|
||||
def _derive_configuration(self):
|
||||
@@ -528,18 +617,32 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
self.log("-" * 50)
|
||||
insights = self._insights
|
||||
|
||||
# 1. n_directions: based on cone geometry
|
||||
if insights.cone_is_polyhedral:
|
||||
# Polyhedral cone → need more directions to capture all facets
|
||||
# 1. n_directions + direction_method: based on cone geometry
|
||||
# Default: single direction via diff-of-means (proven most robust).
|
||||
# Only escalate to multi-direction when analysis confirms polyhedral geometry.
|
||||
if insights.cone_is_polyhedral and insights.cone_dimensionality > 2.0:
|
||||
# Clearly polyhedral cone → use multiple directions via SVD
|
||||
n_dirs = max(4, min(8, int(insights.cone_dimensionality * 2)))
|
||||
self.direction_method = "svd"
|
||||
self.use_whitened_svd = True
|
||||
self.log(f" Polyhedral cone (dim={insights.cone_dimensionality:.1f}) "
|
||||
f"→ n_directions={n_dirs}")
|
||||
f"→ n_directions={n_dirs}, method=svd (whitened)")
|
||||
elif insights.cone_is_polyhedral:
|
||||
# Mildly polyhedral → LEACE gives better single-direction erasure
|
||||
n_dirs = 1
|
||||
self.direction_method = "leace"
|
||||
self.use_whitened_svd = False
|
||||
self.log(f" Mildly polyhedral (dim={insights.cone_dimensionality:.1f}) "
|
||||
f"→ n_directions=1, method=leace")
|
||||
else:
|
||||
# Linear cone → fewer directions suffice
|
||||
n_dirs = max(1, min(4, int(insights.cone_dimensionality + 1)))
|
||||
# Linear cone → single direction via diff-of-means (simplest, most robust)
|
||||
n_dirs = 1
|
||||
self.direction_method = "diff_means"
|
||||
self.use_whitened_svd = False
|
||||
self.log(f" Linear cone (dim={insights.cone_dimensionality:.1f}) "
|
||||
f"→ n_directions={n_dirs}")
|
||||
f"→ n_directions=1, method=diff_means")
|
||||
insights.recommended_n_directions = n_dirs
|
||||
insights.recommended_direction_method = self.direction_method
|
||||
self.n_directions = n_dirs
|
||||
|
||||
# 2. regularization: based on alignment method + entanglement
|
||||
@@ -586,15 +689,22 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
|
||||
# 4. Layer selection: cluster-aware + entanglement-gated
|
||||
if insights.cluster_representative_layers:
|
||||
# Start from cluster representatives
|
||||
# Start from cluster representatives (strongest per cluster)
|
||||
base_layers = list(insights.cluster_representative_layers)
|
||||
|
||||
# Expand: add all layers from clusters that have strong signals
|
||||
all_cluster_layers = []
|
||||
# Conservative expansion: for each cluster, add at most the top-2
|
||||
# strongest layers (by refusal norm) beyond the representative,
|
||||
# to avoid over-modifying weak layers in large clusters.
|
||||
norms = {}
|
||||
for idx in self._harmful_means:
|
||||
if idx in self._harmless_means:
|
||||
norms[idx] = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze().norm().item()
|
||||
for cluster in insights.direction_clusters:
|
||||
all_cluster_layers.extend(cluster)
|
||||
if all_cluster_layers:
|
||||
base_layers = sorted(set(all_cluster_layers))
|
||||
ranked = sorted(cluster, key=lambda ly: norms.get(ly, 0), reverse=True)
|
||||
# Add up to 2 additional strong layers per cluster
|
||||
for ly in ranked[:3]: # representative + up to 2 more
|
||||
base_layers.append(ly)
|
||||
base_layers = sorted(set(base_layers))
|
||||
|
||||
# Gate: remove highly entangled layers
|
||||
skip = set()
|
||||
@@ -621,13 +731,9 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
self.log(f" RSI={insights.mean_refusal_sparsity_index:.2f} "
|
||||
f"→ standard dense projection")
|
||||
|
||||
# 6. Whitened SVD: always use for multi-direction, skip for single
|
||||
if n_dirs > 1:
|
||||
self.use_whitened_svd = True
|
||||
self.log(f" Multi-direction ({n_dirs}) → whitened SVD enabled")
|
||||
else:
|
||||
self.use_whitened_svd = False
|
||||
self.log(" Single direction → standard diff-in-means")
|
||||
# 6. Direction method summary (already set in step 1)
|
||||
self.log(f" Direction method: {self.direction_method} "
|
||||
f"(whitened_svd={'on' if self.use_whitened_svd else 'off'})")
|
||||
|
||||
# ── Informed DISTILL ─────────────────────────────────────────────
|
||||
|
||||
@@ -650,7 +756,38 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
n_layers = len(self._harmful_means)
|
||||
norms: dict[int, float] = {}
|
||||
|
||||
if self.use_whitened_svd and self.n_directions > 1:
|
||||
# ── Small-model direction cap (matching base _distill) ────────
|
||||
# On small models, each SVD direction removes a proportionally
|
||||
# larger fraction of weight energy. Cap to prevent over-ablation.
|
||||
hidden_size = self.handle.hidden_size if self.handle else 0
|
||||
total_params = getattr(self.handle, 'total_params', 0) if self.handle else 0
|
||||
if total_params == 0 and self.handle:
|
||||
try:
|
||||
total_params = sum(p.numel() for p in self.handle.model.parameters())
|
||||
except Exception:
|
||||
pass
|
||||
if self.n_directions > 1 and (
|
||||
(0 < hidden_size < 2048)
|
||||
or (0 < total_params < 2_000_000_000)
|
||||
or n_layers <= 16
|
||||
):
|
||||
max_dirs = max(1, min(self.n_directions, 2))
|
||||
if max_dirs < self.n_directions:
|
||||
self.log(
|
||||
f"Capped n_directions from {self.n_directions} to {max_dirs} "
|
||||
f"for small model (hidden={hidden_size}, "
|
||||
f"params={total_params / 1e9:.1f}B, layers={n_layers})"
|
||||
)
|
||||
self.n_directions = max_dirs
|
||||
|
||||
# LEACE extractor for optimal concept erasure
|
||||
leace_extractor = None
|
||||
if self.direction_method == "leace":
|
||||
from obliteratus.analysis.leace import LEACEExtractor
|
||||
leace_extractor = LEACEExtractor()
|
||||
self.log(f"Using LEACE (closed-form optimal concept erasure)")
|
||||
|
||||
if self.use_whitened_svd and self.n_directions > 1 and leace_extractor is None:
|
||||
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
|
||||
whitened_extractor = WhitenedSVDExtractor()
|
||||
self.log(f"Using whitened SVD with {self.n_directions} directions")
|
||||
@@ -658,6 +795,29 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
whitened_extractor = None
|
||||
|
||||
for idx in range(n_layers):
|
||||
# LEACE path: theoretically optimal single-direction erasure
|
||||
if leace_extractor is not None:
|
||||
if idx in self._harmful_acts and idx in self._harmless_acts:
|
||||
try:
|
||||
l_result = leace_extractor.extract(
|
||||
self._harmful_acts[idx],
|
||||
self._harmless_acts[idx],
|
||||
layer_idx=idx,
|
||||
)
|
||||
self.refusal_directions[idx] = l_result.direction
|
||||
self.refusal_subspaces[idx] = l_result.direction.unsqueeze(0)
|
||||
norms[idx] = l_result.generalized_eigenvalue
|
||||
|
||||
if idx < 5 or idx == n_layers - 1:
|
||||
self.log(
|
||||
f" layer {idx}: LEACE eigenvalue={l_result.generalized_eigenvalue:.4f}, "
|
||||
f"erasure_loss={l_result.erasure_loss:.4f}"
|
||||
)
|
||||
continue
|
||||
except Exception as e:
|
||||
if idx < 5:
|
||||
self.log(f" layer {idx}: LEACE failed ({e}), falling back")
|
||||
|
||||
if self.n_directions == 1:
|
||||
diff = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze(0)
|
||||
norm = diff.norm().item()
|
||||
@@ -691,6 +851,41 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
self.refusal_directions[idx] = primary / primary.norm()
|
||||
norms[idx] = S[:k].sum().item()
|
||||
|
||||
# Enrich subspaces with per-category cone directions when available.
|
||||
# This uses the actual refusal cone generators instead of purely
|
||||
# data-agnostic SVD components.
|
||||
cat_dirs = self._insights.per_category_directions
|
||||
if cat_dirs and self._insights.cone_is_polyhedral and self.n_directions > 1:
|
||||
cat_tensors = list(cat_dirs.values())
|
||||
# Stack and orthogonalize category directions
|
||||
cat_stack = torch.stack(cat_tensors) # (n_cats, hidden)
|
||||
cat_norms = cat_stack.norm(dim=1, keepdim=True).clamp(min=1e-8)
|
||||
cat_stack = cat_stack / cat_norms
|
||||
# Blend into strong-signal layers: replace later SVD components
|
||||
# with category directions (which are geometrically meaningful)
|
||||
n_cat = cat_stack.shape[0]
|
||||
for idx in norms:
|
||||
sub = self.refusal_subspaces.get(idx)
|
||||
if sub is None or sub.shape[0] <= 1:
|
||||
continue
|
||||
# Keep the first SVD direction (strongest), replace remaining
|
||||
# with category directions projected to be orthogonal to it
|
||||
primary = sub[0:1] # (1, hidden)
|
||||
# Project category directions orthogonal to primary
|
||||
cos = (cat_stack @ primary.squeeze(0)) # (n_cat,)
|
||||
ortho_cats = cat_stack - cos.unsqueeze(1) * primary
|
||||
ortho_norms = ortho_cats.norm(dim=1)
|
||||
# Keep only directions that survived orthogonalization
|
||||
valid = ortho_norms > 0.1
|
||||
if valid.sum() > 0:
|
||||
ortho_cats = ortho_cats[valid]
|
||||
ortho_cats = ortho_cats / ortho_cats.norm(dim=1, keepdim=True)
|
||||
# Take up to (n_directions - 1) category directions
|
||||
n_take = min(self.n_directions - 1, ortho_cats.shape[0])
|
||||
new_sub = torch.cat([primary, ortho_cats[:n_take]], dim=0)
|
||||
self.refusal_subspaces[idx] = new_sub
|
||||
self.log(f"Enriched subspaces with {n_cat} per-category cone directions")
|
||||
|
||||
# Layer selection: use analysis-recommended layers if available,
|
||||
# otherwise fall back to knee detection
|
||||
if self._insights.recommended_layers:
|
||||
@@ -728,15 +923,117 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
def _excise_informed(self):
|
||||
"""Excise refusal directions with analysis-informed strategy.
|
||||
|
||||
Uses sparse surgery if analysis recommends it, otherwise falls
|
||||
back to the standard projection with analysis-tuned parameters.
|
||||
Uses Bayesian optimization (when available) with analysis-derived
|
||||
warm-start parameters, falling back to sparse surgery or standard
|
||||
projection. This is the key integration: analysis maps the geometry,
|
||||
Bayesian optimization finds the optimal projection strength.
|
||||
"""
|
||||
if self._insights.use_sparse_surgery:
|
||||
self._excise_sparse()
|
||||
return
|
||||
|
||||
# Enable Bayesian optimization using analysis insights for warm-start.
|
||||
# The analysis provides much better initial parameters than the default
|
||||
# heuristic (strongest-layer-based peak), dramatically narrowing the
|
||||
# search space and improving convergence.
|
||||
self._configure_bayesian_warm_start()
|
||||
self._excise()
|
||||
|
||||
def _configure_bayesian_warm_start(self):
|
||||
"""Configure Bayesian optimization with analysis-derived warm-start.
|
||||
|
||||
Translates analysis insights into a much tighter search space:
|
||||
- peak_position from cluster representative layers
|
||||
- spread from cluster structure (narrow clusters → narrow spread)
|
||||
- component scaling from entanglement analysis
|
||||
- KL budget from alignment method detection
|
||||
"""
|
||||
insights = self._insights
|
||||
|
||||
# Enable Bayesian optimization (50 trials default, same as heretic)
|
||||
self._bayesian_trials = 50
|
||||
|
||||
# Also set heretic-compatible flags on the pipeline so the base
|
||||
# _excise_inner() picks them up during Bayesian optimization.
|
||||
self.layer_adaptive_strength = True
|
||||
self.float_layer_interpolation = True
|
||||
self.use_kl_optimization = True
|
||||
|
||||
# KL budget: tighter for methods that are fragile (CAI, RLHF),
|
||||
# looser for concentrated methods (DPO, SFT).
|
||||
method = insights.detected_alignment_method
|
||||
if method == "dpo":
|
||||
self.kl_budget = 0.5
|
||||
elif method == "rlhf":
|
||||
self.kl_budget = 0.3
|
||||
elif method == "cai":
|
||||
self.kl_budget = 0.2
|
||||
elif method == "sft":
|
||||
self.kl_budget = 0.4
|
||||
else:
|
||||
# Standard excision with analysis-tuned parameters
|
||||
# (regularization, norm_preserve, etc. already configured)
|
||||
self._excise()
|
||||
self.kl_budget = 0.35
|
||||
|
||||
self.log(f"Bayesian optimization enabled (50 trials, KL budget={self.kl_budget})")
|
||||
self.log("Analysis insights will warm-start the optimizer")
|
||||
|
||||
# Compute analysis-derived warm-start for the parametric kernel.
|
||||
# The Bayesian optimizer reads these from the pipeline if present.
|
||||
n_layers = len(self._harmful_means) if self._harmful_means else 32
|
||||
if insights.cluster_representative_layers and n_layers > 1:
|
||||
# Peak position: normalized position of the strongest cluster rep
|
||||
norms = {}
|
||||
for idx in self._harmful_means:
|
||||
if idx in self._harmless_means:
|
||||
norms[idx] = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze().norm().item()
|
||||
reps = insights.cluster_representative_layers
|
||||
if norms:
|
||||
best_rep = max(reps, key=lambda ly: norms.get(ly, 0))
|
||||
else:
|
||||
best_rep = reps[len(reps) // 2]
|
||||
warm_peak = best_rep / max(n_layers - 1, 1)
|
||||
|
||||
# Spread: narrow if clusters are tight, wide if clusters span many layers
|
||||
if insights.direction_clusters:
|
||||
cluster_widths = [
|
||||
(max(c) - min(c)) / max(n_layers - 1, 1)
|
||||
for c in insights.direction_clusters if len(c) > 1
|
||||
]
|
||||
warm_spread = max(0.1, min(0.6, sum(cluster_widths) / len(cluster_widths) if cluster_widths else 0.3))
|
||||
else:
|
||||
warm_spread = 0.3
|
||||
|
||||
# Min weight: higher if high persistence (refusal spread across all layers)
|
||||
warm_min = min(0.3, max(0.0, insights.direction_persistence * 0.2))
|
||||
|
||||
# Attn/MLP scaling: reduce MLP scaling if entanglement is high
|
||||
# (MLP projections cause more capability damage)
|
||||
if insights.entanglement_score > 0.5:
|
||||
warm_mlp = 0.4
|
||||
warm_attn = 0.7
|
||||
else:
|
||||
warm_mlp = 0.6
|
||||
warm_attn = 0.8
|
||||
else:
|
||||
warm_peak = 0.5
|
||||
warm_spread = 0.3
|
||||
warm_min = 0.05
|
||||
warm_mlp = 0.6
|
||||
warm_attn = 0.8
|
||||
|
||||
# Store warm-start params for the Bayesian optimizer to pick up
|
||||
self._informed_warm_start = {
|
||||
"max_weight": 0.9,
|
||||
"peak_position": warm_peak,
|
||||
"min_weight": warm_min,
|
||||
"spread": warm_spread,
|
||||
"attn_scale": warm_attn,
|
||||
"mlp_scale": warm_mlp,
|
||||
"dir_idx": 0.0,
|
||||
}
|
||||
self.log(
|
||||
f" Warm-start: peak={warm_peak:.2f}, spread={warm_spread:.2f}, "
|
||||
f"min={warm_min:.2f}, attn={warm_attn:.2f}, mlp={warm_mlp:.2f}"
|
||||
)
|
||||
|
||||
def _excise_sparse(self):
|
||||
"""Sparse direction surgery — only modifies high-projection rows."""
|
||||
@@ -825,14 +1122,22 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
1. Residual refusal signal (via activation probing)
|
||||
2. Self-repair / Ouroboros effect (via defense robustness)
|
||||
3. Triggers additional targeted passes at compensating layers
|
||||
|
||||
KL-gated: stops early if model damage (KL divergence) is getting
|
||||
worse even though refusal persists. This prevents the death spiral
|
||||
where each pass damages the model without removing refusal.
|
||||
"""
|
||||
# Run standard verification first
|
||||
self._verify()
|
||||
|
||||
# Check if Ouroboros compensation is needed
|
||||
refusal_rate = self._quality_metrics.get("refusal_rate", 0.0)
|
||||
prev_kl = self._quality_metrics.get("kl_divergence", 0.0)
|
||||
ouroboros_pass = 0
|
||||
|
||||
# KL budget: stop if KL exceeds this threshold (model too damaged)
|
||||
kl_ceiling = getattr(self, "kl_budget", 0.5) * 2.0 # 2x budget as hard ceiling
|
||||
|
||||
while (refusal_rate > self._ouroboros_threshold
|
||||
and ouroboros_pass < self._max_ouroboros_passes):
|
||||
ouroboros_pass += 1
|
||||
@@ -849,9 +1154,9 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
self._distill_inner()
|
||||
self.log(f"Found {len(self._strong_layers)} layers with residual refusal")
|
||||
|
||||
# Re-excise at the new strong layers
|
||||
# Re-excise at the new strong layers using informed strategy
|
||||
if self._strong_layers:
|
||||
self._excise()
|
||||
self._excise_informed()
|
||||
else:
|
||||
self.log("No strong layers found — stopping Ouroboros compensation")
|
||||
break
|
||||
@@ -859,7 +1164,24 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
# Re-verify
|
||||
self._verify()
|
||||
refusal_rate = self._quality_metrics.get("refusal_rate", 0.0)
|
||||
self.log(f"After Ouroboros pass {ouroboros_pass}: refusal rate = {refusal_rate:.0%}")
|
||||
current_kl = self._quality_metrics.get("kl_divergence", 0.0)
|
||||
self.log(f"After Ouroboros pass {ouroboros_pass}: refusal={refusal_rate:.0%}, KL={current_kl:.4f}")
|
||||
|
||||
# KL-gated early stopping: if KL is rising and exceeds ceiling,
|
||||
# the model is being damaged faster than refusal is being removed.
|
||||
if current_kl > kl_ceiling:
|
||||
self.log(
|
||||
f"KL divergence {current_kl:.4f} exceeds ceiling {kl_ceiling:.4f} — "
|
||||
f"stopping to prevent further model damage"
|
||||
)
|
||||
break
|
||||
if ouroboros_pass > 1 and current_kl > prev_kl * 1.5 and refusal_rate > 0.3:
|
||||
self.log(
|
||||
f"KL rising sharply ({prev_kl:.4f} → {current_kl:.4f}) with "
|
||||
f"refusal still at {refusal_rate:.0%} — stopping (diminishing returns)"
|
||||
)
|
||||
break
|
||||
prev_kl = current_kl
|
||||
|
||||
self._report.ouroboros_passes = ouroboros_pass
|
||||
self._report.final_refusal_rate = refusal_rate
|
||||
@@ -903,6 +1225,7 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
},
|
||||
"derived_config": {
|
||||
"n_directions": insights.recommended_n_directions,
|
||||
"direction_method": insights.recommended_direction_method,
|
||||
"regularization": insights.recommended_regularization,
|
||||
"refinement_passes": insights.recommended_refinement_passes,
|
||||
"layers_used": insights.recommended_layers,
|
||||
@@ -981,6 +1304,7 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
||||
|
||||
lines.append("Derived Configuration:")
|
||||
lines.append(f" n_directions: {insights.recommended_n_directions}")
|
||||
lines.append(f" direction_method: {insights.recommended_direction_method}")
|
||||
lines.append(f" regularization: {insights.recommended_regularization}")
|
||||
lines.append(f" refinement_passes: {insights.recommended_refinement_passes}")
|
||||
lines.append(f" sparse surgery: {insights.use_sparse_surgery}")
|
||||
|
||||
+71
-12
@@ -55,26 +55,45 @@ def composite_score(metrics: dict[str, Any]) -> float:
|
||||
"""Score an abliteration run on [0, 1]. Higher is better.
|
||||
|
||||
Weights:
|
||||
40% refusal removal — the whole point
|
||||
30% coherence — model must still be useful
|
||||
35% refusal removal — the whole point
|
||||
25% coherence — model must still be useful
|
||||
20% KL divergence — minimal capability damage
|
||||
10% perplexity — fluency preservation
|
||||
5% spectral cert — formal completeness guarantee
|
||||
5% degenerate penalty — penalize broken output
|
||||
"""
|
||||
rr = metrics.get("refusal_rate")
|
||||
co = metrics.get("coherence")
|
||||
kl = metrics.get("kl_divergence")
|
||||
pp = metrics.get("perplexity")
|
||||
spec = metrics.get("spectral_certification")
|
||||
degen = metrics.get("degenerate_count", 0) or 0
|
||||
|
||||
refusal_score = (1.0 - rr) if rr is not None else 0.0
|
||||
coherence_score = co if co is not None else 0.0
|
||||
kl_score = 1.0 / (1.0 + kl) if kl is not None else 0.5
|
||||
ppl_score = 1.0 / (1.0 + pp / 100.0) if pp is not None else 0.5
|
||||
|
||||
# Spectral certification: GREEN=1.0, YELLOW=0.5, RED=0.0, None=0.5 (neutral)
|
||||
if spec == "GREEN":
|
||||
spec_score = 1.0
|
||||
elif spec == "YELLOW":
|
||||
spec_score = 0.5
|
||||
elif spec == "RED":
|
||||
spec_score = 0.0
|
||||
else:
|
||||
spec_score = 0.5 # not measured → neutral
|
||||
|
||||
# Degenerate penalty: any broken outputs reduce score
|
||||
degen_score = 1.0 / (1.0 + degen) if degen > 0 else 1.0
|
||||
|
||||
return (
|
||||
refusal_score * 0.4
|
||||
+ coherence_score * 0.3
|
||||
+ kl_score * 0.2
|
||||
+ ppl_score * 0.1
|
||||
refusal_score * 0.35
|
||||
+ coherence_score * 0.25
|
||||
+ kl_score * 0.20
|
||||
+ ppl_score * 0.10
|
||||
+ spec_score * 0.05
|
||||
+ degen_score * 0.05
|
||||
)
|
||||
|
||||
|
||||
@@ -94,6 +113,8 @@ class Contender:
|
||||
time_s: float = 0.0
|
||||
error: str | None = None
|
||||
round_eliminated: int = 0 # 0 = still alive / winner
|
||||
direction_method: str = "" # which direction extraction was used
|
||||
spectral_cert: str = "" # GREEN/YELLOW/RED/""
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -140,6 +161,8 @@ class TourneyResult:
|
||||
"metrics": c.metrics,
|
||||
"time_s": c.time_s,
|
||||
"error": c.error,
|
||||
"direction_method": c.direction_method,
|
||||
"spectral_cert": c.spectral_cert,
|
||||
}
|
||||
for c in sorted(r.contenders, key=lambda x: x.score, reverse=True)
|
||||
],
|
||||
@@ -197,6 +220,8 @@ def _save_checkpoint(
|
||||
"time_s": c.time_s,
|
||||
"error": c.error,
|
||||
"round_eliminated": c.round_eliminated,
|
||||
"direction_method": c.direction_method,
|
||||
"spectral_cert": c.spectral_cert,
|
||||
}
|
||||
for c in r.contenders
|
||||
],
|
||||
@@ -218,6 +243,8 @@ def _save_checkpoint(
|
||||
"time_s": c.time_s,
|
||||
"error": c.error,
|
||||
"round_eliminated": c.round_eliminated,
|
||||
"direction_method": c.direction_method,
|
||||
"spectral_cert": c.spectral_cert,
|
||||
}
|
||||
for c in completed_methods
|
||||
],
|
||||
@@ -286,6 +313,8 @@ def _restore_rounds(checkpoint: dict) -> tuple[TourneyResult, list[Contender], l
|
||||
time_s=c_data.get("time_s", 0.0),
|
||||
error=c_data.get("error"),
|
||||
round_eliminated=c_data.get("round_eliminated", 0),
|
||||
direction_method=c_data.get("direction_method", ""),
|
||||
spectral_cert=c_data.get("spectral_cert", ""),
|
||||
))
|
||||
result.rounds.append(rnd)
|
||||
|
||||
@@ -328,14 +357,14 @@ def render_bracket(result: TourneyResult) -> str:
|
||||
lines.append(f"## Round {rnd.round_num}: {rnd.name}")
|
||||
lines.append(f"*{len(rnd.contenders)} contenders, {rnd.prompt_volume} prompt pairs*")
|
||||
lines.append("")
|
||||
lines.append("| Rank | Method | Score | Refusal | Coherence | KL Div | Perplexity | Time |")
|
||||
lines.append("|------|--------|-------|---------|-----------|--------|------------|------|")
|
||||
lines.append("| Rank | Method | Dir | Score | Refusal | Coherence | KL Div | PPL | Cert | Time |")
|
||||
lines.append("|------|--------|-----|-------|---------|-----------|--------|-----|------|------|")
|
||||
|
||||
sorted_contenders = sorted(rnd.contenders, key=lambda x: x.score, reverse=True)
|
||||
for i, c in enumerate(sorted_contenders, 1):
|
||||
if c.error:
|
||||
lines.append(
|
||||
f"| {i} | {c.method} | ERROR | — | — | — | — | {c.time_s:.0f}s |"
|
||||
f"| {i} | {c.method} | — | ERROR | — | — | — | — | — | {c.time_s:.0f}s |"
|
||||
)
|
||||
continue
|
||||
m = c.metrics
|
||||
@@ -351,9 +380,11 @@ def render_bracket(result: TourneyResult) -> str:
|
||||
kl_val = m.get('kl_divergence')
|
||||
kl_str = f"{kl_val:.4f}" if kl_val is not None else "—"
|
||||
pp = f"{m.get('perplexity', 0):.1f}" if m.get('perplexity') is not None else "—"
|
||||
dir_m = c.direction_method or m.get("direction_method", "—")
|
||||
cert = c.spectral_cert or "—"
|
||||
lines.append(
|
||||
f"| {i} | **{c.method}**{marker} | {c.score:.4f} "
|
||||
f"| {rr} | {co} | {kl_str} | {pp} | {c.time_s:.0f}s |"
|
||||
f"| {i} | **{c.method}**{marker} | {dir_m} | {c.score:.4f} "
|
||||
f"| {rr} | {co} | {kl_str} | {pp} | {cert} | {c.time_s:.0f}s |"
|
||||
)
|
||||
lines.append("")
|
||||
|
||||
@@ -572,9 +603,12 @@ def render_bracket_html(result: TourneyResult) -> str:
|
||||
f'<span class="trophy">🏆</span> '
|
||||
f'<span class="champ-name">{html_mod.escape(w.method)}</span>'
|
||||
)
|
||||
dir_m = w.direction_method or "—"
|
||||
cert = w.spectral_cert or "—"
|
||||
header_parts.append(
|
||||
f'<div class="champ-score">'
|
||||
f'Score: {w.score:.4f} | Refusal: {rr} | Coherence: {co}'
|
||||
f'Score: {w.score:.4f} | Refusal: {rr} | '
|
||||
f'Coherence: {co} | Dir: {html_mod.escape(dir_m)} | Cert: {html_mod.escape(cert)}'
|
||||
f'</div>'
|
||||
)
|
||||
header_parts.append("</div>")
|
||||
@@ -632,6 +666,11 @@ def render_bracket_html(result: TourneyResult) -> str:
|
||||
m = c.metrics or {}
|
||||
metric_spans = []
|
||||
if not c.error:
|
||||
dm = c.direction_method or m.get("direction_method", "")
|
||||
if dm:
|
||||
metric_spans.append(
|
||||
f'<span class="metric">dir <span class="val">{html_mod.escape(dm)}</span></span>'
|
||||
)
|
||||
rr = m.get("refusal_rate")
|
||||
if rr is not None:
|
||||
metric_spans.append(
|
||||
@@ -642,6 +681,12 @@ def render_bracket_html(result: TourneyResult) -> str:
|
||||
metric_spans.append(
|
||||
f'<span class="metric">coh <span class="val">{co:.3f}</span></span>'
|
||||
)
|
||||
sc = c.spectral_cert or m.get("spectral_certification", "")
|
||||
if sc:
|
||||
cert_color = {"GREEN": "#4ecca3", "YELLOW": "#f0c040", "RED": "#cc4444"}.get(sc, "#777")
|
||||
metric_spans.append(
|
||||
f'<span class="metric">cert <span class="val" style="color:{cert_color}">{html_mod.escape(sc)}</span></span>'
|
||||
)
|
||||
kl = m.get("kl_divergence")
|
||||
if kl is not None:
|
||||
metric_spans.append(
|
||||
@@ -705,10 +750,12 @@ in elimination rounds.
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Composite Score | **{w.score:.4f}** |
|
||||
| Direction Method | {w.direction_method or 'N/A'} |
|
||||
| Refusal Rate | {f'{w.metrics["refusal_rate"]:.1%}' if w.metrics.get('refusal_rate') is not None else 'N/A'} |
|
||||
| Coherence | {f'{w.metrics["coherence"]:.3f}' if w.metrics.get('coherence') is not None else 'N/A'} |
|
||||
| KL Divergence | {f'{w.metrics["kl_divergence"]:.4f}' if w.metrics.get('kl_divergence') is not None else 'N/A'} |
|
||||
| Perplexity | {f'{w.metrics["perplexity"]:.1f}' if w.metrics.get('perplexity') is not None else 'N/A'} |
|
||||
| Spectral Cert | {w.spectral_cert or 'N/A'} |
|
||||
|
||||
## How to Use
|
||||
|
||||
@@ -866,6 +913,8 @@ class TourneyRunner:
|
||||
contender.metrics = dict(pipeline._quality_metrics)
|
||||
contender.score = composite_score(contender.metrics)
|
||||
contender.output_dir = save_dir
|
||||
contender.direction_method = getattr(pipeline, "direction_method", "")
|
||||
contender.spectral_cert = contender.metrics.get("spectral_certification", "") or ""
|
||||
|
||||
# Free pipeline to reclaim GPU
|
||||
del pipeline
|
||||
@@ -1048,6 +1097,11 @@ class TourneyRunner:
|
||||
result.winner = winner
|
||||
result.total_time_s = time.time() - t_start
|
||||
|
||||
# Clean up non-winner finalist dirs to free disk
|
||||
for c in ranked[1:]:
|
||||
if c.output_dir and Path(c.output_dir).exists():
|
||||
shutil.rmtree(c.output_dir, ignore_errors=True)
|
||||
|
||||
self.log("")
|
||||
self.log("=" * 60)
|
||||
if winner:
|
||||
@@ -1352,6 +1406,11 @@ class TourneyRunner:
|
||||
result.winner = winner
|
||||
result.total_time_s = time.time() - t_start
|
||||
|
||||
# Clean up non-winner finalist dirs to free disk
|
||||
for c in ranked[1:]:
|
||||
if c.output_dir and Path(c.output_dir).exists():
|
||||
shutil.rmtree(c.output_dir, ignore_errors=True)
|
||||
|
||||
self.log("")
|
||||
self.log("=" * 60)
|
||||
if winner:
|
||||
|
||||
Reference in New Issue
Block a user