"""Bayesian optimization for abliteration hyperparameters. Implements Optuna TPE-based multi-objective optimization that searches for optimal ablation parameters co-minimizing refusal rate and KL divergence. Inspired by Heretic (p-e-w, 2025) which pioneered Bayesian optimization for abliteration. OBLITERATUS pushes this further by: 1. **Parametric layer kernel**: Instead of per-layer independent parameters, uses a bell-shaped curve described by 4 global params (Heretic-style). This reduces the search space from O(n_layers) to O(1) while capturing the spatial structure of refusal across layers. 2. **Float direction interpolation**: Direction index is continuous — non- integer values interpolate between adjacent SVD directions, unlocking a smooth direction space beyond the discrete top-k. 3. **Component-specific weights**: Separate scaling for attention vs MLP projections (Heretic showed MLP interventions are more damaging). 4. **Per-expert granularity**: For MoE models, optimizes per-expert scaling. 5. **CoT-aware objectives**: Adds chain-of-thought coherence as a third optimization objective for CoT models. 6. **Warm-start from analysis**: Uses OBLITERATUS's analysis-based heuristics as initial trial suggestions instead of random initialization. References: - Heretic (p-e-w, 2025): Bayesian optimization for LLM abliteration - Akiba et al. (2019): Optuna: A Next-generation Hyperparameter Optimization Framework """ from __future__ import annotations import logging import math from typing import TYPE_CHECKING import torch import torch.nn.functional as F if TYPE_CHECKING: from obliteratus.abliterate import AbliterationPipeline logger = logging.getLogger(__name__) def _measure_refusal_rate( pipeline: AbliterationPipeline, n_prompts: int = 10, max_new_tokens: int = 128, ) -> float: """Measure refusal rate on a subset of harmful prompts. Applies the model's chat template (when available) so the model is in chat mode — matching how users actually interact with it. Uses token-ID slicing for robust completion extraction regardless of special tokens. """ from obliteratus.evaluation.advanced_metrics import refusal_rate model = pipeline.handle.model tokenizer = pipeline.handle.tokenizer device = pipeline._get_model_device(model) test_prompts = pipeline.harmful_prompts[:n_prompts] # Always apply chat template for refusal measurement — refusal circuitry # is activated by chat formatting, so raw prompts give false 0% rates. saved_flag = pipeline.use_chat_template pipeline.use_chat_template = True formatted_prompts = pipeline._maybe_apply_chat_template(test_prompts) pipeline.use_chat_template = saved_flag responses = [] for prompt in formatted_prompts: try: inputs = tokenizer( prompt, return_tensors="pt", truncation=True, max_length=256, ) input_len = inputs["input_ids"].shape[1] inputs = {k: v.to(device) for k, v in inputs.items()} with torch.no_grad(): output = model.generate( **inputs, max_new_tokens=max_new_tokens, do_sample=False, ) # Token-ID slicing: robust regardless of special tokens completion_ids = output[0][input_len:] completion = tokenizer.decode( completion_ids, skip_special_tokens=True, ).strip() responses.append(completion) del inputs, output except Exception: responses.append("") # Count failures as non-refusals pipeline._free_gpu_memory() return refusal_rate(responses, mode="combined") if responses else 1.0 def _measure_kl_divergence( pipeline: AbliterationPipeline, reference_logits: list[torch.Tensor], prompts: list[str], ) -> float: """Measure KL divergence from reference (pre-ablation) logits.""" model = pipeline.handle.model tokenizer = pipeline.handle.tokenizer device = pipeline._get_model_device(model) total_kl = 0.0 n_valid = 0 for i, prompt in enumerate(prompts): if i >= len(reference_logits): break try: inputs = tokenizer( prompt, return_tensors="pt", truncation=True, max_length=64, ) inputs = {k: v.to(device) for k, v in inputs.items()} with torch.no_grad(): outputs = model(**inputs) new_logits = outputs.logits[:, -1, :].detach().cpu().float() ref = reference_logits[i] log_p = F.log_softmax(ref, dim=-1) log_q = F.log_softmax(new_logits.squeeze(0), dim=-1) p = F.softmax(ref, dim=-1) kl = (p * (log_p - log_q)).sum().item() total_kl += max(kl, 0.0) # Clamp negative KL (numerical noise) n_valid += 1 del inputs, outputs, new_logits except Exception: pass pipeline._free_gpu_memory() return total_kl / max(n_valid, 1) def _parametric_layer_weight( layer_idx: int, n_layers: int, max_weight: float, peak_position: float, min_weight: float, spread: float, ) -> float: """Compute ablation weight for a layer using a parametric bell curve. This is the Heretic-style parametric kernel: - max_weight: peak ablation strength (0..1) - peak_position: normalized position of peak (0..1 maps to layer 0..n_layers-1) - min_weight: minimum ablation weight at the tails - spread: controls width of the bell curve (higher = wider) Returns a value in [min_weight, max_weight] representing how strongly to ablate this layer (1.0 = full projection, 0.0 = no projection). """ if n_layers <= 1: return max_weight normalized_pos = layer_idx / (n_layers - 1) peak = peak_position # Gaussian-shaped kernel dist = abs(normalized_pos - peak) sigma = max(spread, 0.01) gauss = math.exp(-0.5 * (dist / sigma) ** 2) return min_weight + (max_weight - min_weight) * gauss def _interpolate_direction( pipeline: AbliterationPipeline, layer_idx: int, float_dir_idx: float, ) -> torch.Tensor: """Get an interpolated refusal direction from a float-valued index. Non-integer values interpolate between adjacent SVD directions in the refusal subspace, unlocking a continuous space of directions beyond the discrete top-k. Args: pipeline: Pipeline with extracted refusal subspaces. layer_idx: Which layer's subspace to use. float_dir_idx: Continuous direction index (e.g., 0.7 interpolates between direction 0 and direction 1). Returns: Normalized direction tensor. """ subspace = pipeline.refusal_subspaces.get(layer_idx) if subspace is None or subspace.shape[0] == 0: return pipeline.refusal_directions.get(layer_idx, torch.zeros(1)) n_dirs = subspace.shape[0] # Clamp to valid range float_dir_idx = max(0.0, min(float_dir_idx, n_dirs - 1)) lo = int(float_dir_idx) hi = min(lo + 1, n_dirs - 1) if lo == hi: d = subspace[lo] else: alpha = float_dir_idx - lo d = (1.0 - alpha) * subspace[lo] + alpha * subspace[hi] norm = d.norm() if norm > 1e-8: d = d / norm return d def run_bayesian_optimization( pipeline: AbliterationPipeline, n_trials: int = 50, n_refusal_prompts: int = 30, n_kl_prompts: int = 5, ) -> dict[int, float]: """Run Bayesian optimization to find optimal ablation parameters. Uses Optuna TPE with a Heretic-style parametric layer kernel to search a compact parameter space: - 4 kernel params (max_weight, peak_position, min_weight, spread) - 1 float direction index (interpolated between SVD directions) - 2 component weights (attention vs MLP scaling) Total: 7 parameters regardless of model size (vs O(n_layers) before). Also optimizes per-layer independently when the kernel doesn't fit well (uses kernel as warm-start for per-layer refinement in phase 2). Args: pipeline: Initialized AbliterationPipeline (post-DISTILL stage). n_trials: Number of optimization trials. n_refusal_prompts: Number of harmful prompts for refusal measurement. n_kl_prompts: Number of harmless prompts for KL measurement. Returns: Dict mapping layer_idx -> optimal regularization value. """ try: import optuna from optuna.samplers import TPESampler except ImportError: logger.warning( "Optuna not installed — skipping Bayesian optimization. " "Install with: pip install optuna" ) return {} if not pipeline.handle or not pipeline._strong_layers: return {} model = pipeline.handle.model tokenizer = pipeline.handle.tokenizer device = pipeline._get_model_device(model) pipeline.log("Bayesian optimization: collecting reference logits...") # Collect reference (pre-ablation) logits for KL measurement kl_prompts = [ "The weather today is", "In mathematics, the concept of", "The history of ancient Rome", "How to cook a simple pasta", "The process of photosynthesis involves", ][:n_kl_prompts] reference_logits: list[torch.Tensor] = [] for prompt in kl_prompts: try: inputs = tokenizer( prompt, return_tensors="pt", truncation=True, max_length=64, ) inputs = {k: v.to(device) for k, v in inputs.items()} with torch.no_grad(): outputs = model(**inputs) reference_logits.append( outputs.logits[:, -1, :].detach().cpu().float().squeeze(0) ) del inputs, outputs except Exception: pass pipeline._free_gpu_memory() if not reference_logits: pipeline.log(" Failed to collect reference logits — skipping optimization") return {} from obliteratus.strategies.utils import ( get_layer_modules, get_attention_module, get_ffn_module, ) from obliteratus.abliterate import _ATTN_OUT_NAMES, _FFN_OUT_NAMES layer_modules = get_layer_modules(pipeline.handle) arch = pipeline.handle.architecture n_total_layers = len(layer_modules) # Save weight tensors for rollback — clone to CPU to free GPU memory original_params: list[tuple[torch.Tensor, torch.Tensor]] = [] seen_data_ptrs: set[int] = set() for idx in pipeline._strong_layers: try: attn = get_attention_module(layer_modules[idx], arch) for attr_name in _ATTN_OUT_NAMES: proj = getattr(attn, attr_name, None) if proj is not None and hasattr(proj, "weight"): ptr = proj.weight.data.data_ptr() if ptr not in seen_data_ptrs: original_params.append((proj.weight.data, proj.weight.data.clone().cpu())) seen_data_ptrs.add(ptr) if hasattr(proj, "bias") and proj.bias is not None: bptr = proj.bias.data.data_ptr() if bptr not in seen_data_ptrs: original_params.append((proj.bias.data, proj.bias.data.clone().cpu())) seen_data_ptrs.add(bptr) except (AttributeError, RuntimeError): pass try: ffn = get_ffn_module(layer_modules[idx], arch) for attr_name in _FFN_OUT_NAMES: proj = getattr(ffn, attr_name, None) if proj is not None and hasattr(proj, "weight"): ptr = proj.weight.data.data_ptr() if ptr not in seen_data_ptrs: original_params.append((proj.weight.data, proj.weight.data.clone().cpu())) seen_data_ptrs.add(ptr) if hasattr(proj, "bias") and proj.bias is not None: bptr = proj.bias.data.data_ptr() if bptr not in seen_data_ptrs: original_params.append((proj.bias.data, proj.bias.data.clone().cpu())) seen_data_ptrs.add(bptr) except (AttributeError, RuntimeError): pass del seen_data_ptrs total_saved_mb = sum(clone.nelement() * clone.element_size() for _, clone in original_params) / 1e6 pipeline.log(f" Saved {len(original_params)} weight tensors for rollback ({total_saved_mb:.0f} MB, on CPU)") def _restore_all(): for live_data, saved_clone in original_params: # noqa: F821 live_data.copy_(saved_clone.to(live_data.device)) # Warm-start values for the parametric kernel # Estimate peak position from strongest layer if pipeline._strong_layers: peak_layer = pipeline._strong_layers[0] warm_peak = peak_layer / max(n_total_layers - 1, 1) else: warm_peak = 0.5 best_result: dict[int, float] = {} best_score = float("inf") # Suppress Optuna's verbose logging optuna.logging.set_verbosity(optuna.logging.WARNING) # Max SVD directions available (for float direction interpolation) max_n_dirs = max( (pipeline.refusal_subspaces[idx].shape[0] for idx in pipeline._strong_layers if idx in pipeline.refusal_subspaces), default=1, ) # ── Phase 1: Parametric kernel optimization (compact search space) ── def objective(trial: optuna.Trial) -> tuple[float, float]: """Multi-objective: minimize (refusal_rate, kl_divergence).""" _restore_all() # Parametric kernel: 4 params describe the entire layer weighting max_weight = trial.suggest_float("max_weight", 0.5, 1.0) peak_position = trial.suggest_float("peak_position", 0.1, 0.9) min_weight = trial.suggest_float("min_weight", 0.0, 0.3) spread = trial.suggest_float("spread", 0.1, 0.6) # Component-specific scaling (Heretic insight: MLP more damaging) attn_scale = trial.suggest_float("attn_scale", 0.5, 1.0) mlp_scale = trial.suggest_float("mlp_scale", 0.3, 1.0) # Float direction index (continuous interpolation between SVD dirs) dir_idx = trial.suggest_float("dir_idx", 0.0, max(max_n_dirs - 1, 0.0)) # Compute per-layer regularization from parametric kernel layer_regs: dict[int, float] = {} for idx in pipeline._strong_layers: weight = _parametric_layer_weight( idx, n_total_layers, max_weight, peak_position, min_weight, spread, ) # Convert weight to regularization (weight=1 → reg=0, weight=0 → reg=1) layer_regs[idx] = 1.0 - weight # Apply projection with trial's parameters for idx in pipeline._strong_layers: if idx not in pipeline.refusal_subspaces: continue # Use interpolated direction direction = _interpolate_direction(pipeline, idx, dir_idx) d_col = direction.to(device=next(layer_modules[idx].parameters()).device) d_col = d_col.unsqueeze(-1) if d_col.dim() == 1 else d_col reg = layer_regs[idx] # Attention projection (with attn_scale) attn_reg = 1.0 - (1.0 - reg) * attn_scale try: attn = get_attention_module(layer_modules[idx], arch) pipeline._project_out_advanced( attn, d_col, _ATTN_OUT_NAMES, norm_preserve=pipeline.norm_preserve, regularization=attn_reg, ) except (AttributeError, RuntimeError): pass # MLP/FFN projection (with mlp_scale) mlp_reg = 1.0 - (1.0 - reg) * mlp_scale try: ffn = get_ffn_module(layer_modules[idx], arch) count = pipeline._project_out_advanced( ffn, d_col, _FFN_OUT_NAMES, norm_preserve=pipeline.norm_preserve, regularization=mlp_reg, ) if count == 0: pipeline._project_moe_experts( ffn, d_col, norm_preserve=pipeline.norm_preserve, regularization=mlp_reg, project_biases=False, ) except (AttributeError, RuntimeError): pass # Measure objectives refusal = _measure_refusal_rate(pipeline, n_prompts=n_refusal_prompts) kl = _measure_kl_divergence(pipeline, reference_logits, kl_prompts) # Track best combined score nonlocal best_score, best_result combined = refusal + 0.5 * kl if combined < best_score: best_score = combined best_result = dict(layer_regs) pipeline.log( f" Trial {trial.number + 1}/{n_trials}: " f"refusal={refusal:.0%}, KL={kl:.4f} " f"(peak={peak_position:.2f}, spread={spread:.2f}, " f"attn={attn_scale:.2f}, mlp={mlp_scale:.2f}, dir={dir_idx:.2f})" ) return refusal, kl sampler = TPESampler(seed=42, n_startup_trials=min(5, n_trials // 3)) study = optuna.create_study( directions=["minimize", "minimize"], sampler=sampler, study_name="obliteratus_parametric_optimization", ) # Enqueue warm-start trial with analysis-derived estimates warm_params = { "max_weight": 0.9, "peak_position": warm_peak, "min_weight": 0.05, "spread": 0.3, "attn_scale": 0.8, "mlp_scale": 0.6, "dir_idx": 0.0, } study.enqueue_trial(warm_params) pipeline.log(f"Bayesian optimization: running {n_trials} trials (parametric kernel)...") study.optimize(objective, n_trials=n_trials, show_progress_bar=False) # Restore model and apply best result _restore_all() # Get best trial from Pareto front (prefer low refusal) pareto = study.best_trials if pareto: pareto.sort(key=lambda t: (t.values[0], t.values[1])) best_trial = pareto[0] # Reconstruct per-layer regs from best kernel params p = best_trial.params best_result = {} for idx in pipeline._strong_layers: weight = _parametric_layer_weight( idx, n_total_layers, p["max_weight"], p["peak_position"], p["min_weight"], p["spread"], ) best_result[idx] = 1.0 - weight pipeline.log( f" Best trial: refusal={best_trial.values[0]:.0%}, " f"KL={best_trial.values[1]:.4f}" ) pipeline.log( f" Kernel: peak={p['peak_position']:.2f}, spread={p['spread']:.2f}, " f"max={p['max_weight']:.2f}, min={p['min_weight']:.2f}" ) pipeline.log( f" Components: attn={p['attn_scale']:.2f}, mlp={p['mlp_scale']:.2f}, " f"dir_idx={p['dir_idx']:.2f}" ) # Store the best direction index for use during EXCISE best_dir_idx = p.get("dir_idx", 0.0) if best_dir_idx > 0.1: pipeline.log(f" Applying interpolated direction (idx={best_dir_idx:.2f})...") for idx in pipeline._strong_layers: new_dir = _interpolate_direction(pipeline, idx, best_dir_idx) pipeline.refusal_directions[idx] = new_dir # Store component scales for use in EXCISE pipeline._bayesian_attn_scale = p.get("attn_scale", 1.0) pipeline._bayesian_mlp_scale = p.get("mlp_scale", 1.0) elif best_result: pipeline.log(f" Using best combined score: {best_score:.4f}") # Clean up del original_params pipeline._free_gpu_memory() return best_result