# Source file: OBLITERATUS/obliteratus/study_presets.py
# Snapshot: 2026-03-04 12:38:18 -08:00 (255 lines, 9.5 KiB, Python)

"""Pre-built ablation presets.
Each preset defines a combination of strategies, evaluation settings, and
a description of when to use it. Users can pick a preset instead of
manually configuring every knob.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
def _default_metrics() -> list[str]:
    """Return a fresh copy of the default metric list for a new preset."""
    return ["perplexity"]


@dataclass
class StudyPreset:
    """A named, reusable ablation recipe.

    Attributes:
        name: Name of the preset.
        key: Short identifier used in CLI / config.
        description: Free-text description of the preset.
        strategies: Strategy entries, each a dict of the form
            ``{"name": ..., "params": {...}}``.
        metrics: Metric names; defaults to ``["perplexity"]``.
        max_samples: Evaluation setting, default 100 (presumably a cap on
            the number of evaluated samples -- confirm against the runner).
        batch_size: Evaluation setting, default 4.
        max_length: Evaluation setting, default 256 (presumably a length
            limit per sample -- confirm against the runner).
        tags: Free-form labels; defaults to an empty list.
    """

    # Required fields (no defaults) -- order is part of the constructor API.
    name: str
    key: str
    description: str
    strategies: list[dict[str, Any]]

    # Optional fields; mutable defaults go through factories so that
    # instances never share the same list object.
    metrics: list[str] = field(default_factory=_default_metrics)
    max_samples: int = 100
    batch_size: int = 4
    max_length: int = 256
    tags: list[str] = field(default_factory=list)
def _strategy(name: str, **params: Any) -> dict[str, Any]:
    """Build one strategy entry in ``{"name": ..., "params": {...}}`` form.

    Every call returns new dict objects, so presets never share entries.
    """
    return {"name": name, "params": params}


# Registry of presets indexed by ``StudyPreset.key``; starts empty and is
# filled from ``_PRESETS_LIST`` further down in this module.
STUDY_PRESETS: dict[str, StudyPreset] = dict()

# Built-in presets, listed in the order they are registered.
_PRESETS_LIST: list[StudyPreset] = [
    # Quick / smoke-test
    StudyPreset(
        key="quick",
        name="Quick Scan",
        description=(
            "Fast sanity check. Removes each layer once and each FFN once. "
            "Good for a first look at any model."
        ),
        strategies=[
            _strategy("layer_removal"),
            _strategy("ffn_ablation"),
        ],
        max_samples=25,
        batch_size=4,
        max_length=128,
        tags=["fast", "general"],
    ),
    # Full sweep
    StudyPreset(
        key="full",
        name="Full Sweep",
        description=(
            "Run every strategy on every component. Layers, heads, FFNs, and "
            "embedding chunks. The most thorough option — can be slow on large models."
        ),
        strategies=[
            _strategy("layer_removal"),
            _strategy("head_pruning"),
            _strategy("ffn_ablation"),
            _strategy("embedding_ablation", chunk_size=48),
        ],
        max_samples=200,
        batch_size=4,
        max_length=256,
        tags=["thorough", "general"],
    ),
    # Attention-focused
    StudyPreset(
        key="attention",
        name="Attention Deep-Dive",
        description=(
            "Focus exclusively on attention heads. Prunes every head individually "
            "to find which heads are critical vs. redundant. Essential for "
            "understanding multi-head attention allocation."
        ),
        strategies=[
            _strategy("head_pruning"),
        ],
        max_samples=100,
        batch_size=4,
        max_length=256,
        tags=["attention", "heads", "focused"],
    ),
    # Layer importance
    StudyPreset(
        key="layers",
        name="Layer Importance",
        description=(
            "Remove each transformer layer one at a time and also ablate each "
            "FFN block. Reveals the depth profile of the model — which layers "
            "carry the most information."
        ),
        strategies=[
            _strategy("layer_removal"),
            _strategy("ffn_ablation"),
        ],
        max_samples=100,
        batch_size=4,
        max_length=256,
        tags=["layers", "depth", "general"],
    ),
    # Knowledge localization
    StudyPreset(
        key="knowledge",
        name="Knowledge Localization",
        description=(
            "Targets the FFN/MLP blocks and embedding dimensions. FFNs are "
            "believed to store factual knowledge — this preset helps identify "
            "where knowledge is concentrated in the model."
        ),
        strategies=[
            _strategy("ffn_ablation"),
            _strategy("embedding_ablation", chunk_size=32),
        ],
        max_samples=150,
        batch_size=4,
        max_length=256,
        tags=["knowledge", "ffn", "embeddings"],
    ),
    # Pruning candidate finder
    StudyPreset(
        key="pruning",
        name="Pruning Candidates",
        description=(
            "Designed for model compression research. Tests every head and every "
            "FFN to find components that can be removed with minimal quality loss. "
            "Use the results to guide structured pruning."
        ),
        strategies=[
            _strategy("head_pruning"),
            _strategy("ffn_ablation"),
        ],
        max_samples=100,
        batch_size=4,
        max_length=256,
        tags=["pruning", "compression", "efficiency"],
    ),
    # Embedding analysis
    StudyPreset(
        key="embeddings",
        name="Embedding Analysis",
        description=(
            "Systematically ablate embedding dimension ranges to understand "
            "which dimensions carry the most semantic signal. Uses fine-grained "
            "16-dim chunks for detailed analysis."
        ),
        strategies=[
            _strategy("embedding_ablation", chunk_size=16),
        ],
        max_samples=100,
        batch_size=4,
        max_length=256,
        tags=["embeddings", "representation"],
    ),
    # Jailbreak / refusal localization
    StudyPreset(
        key="jailbreak",
        name="Jailbreak Analysis",
        description=(
            "Surgical preset for locating refusal-mediating components. "
            "Inspired by 'Refusal in Language Models Is Mediated by a Single "
            "Direction' (Arditi et al.). Uses fine-grained head pruning, FFN "
            "ablation, and 16-dim embedding chunks to pinpoint which specific "
            "components encode refusal behaviors. Best used on instruct/chat "
            "models — compare results against the base model to isolate "
            "RLHF/DPO imprints. Pair with custom safety-probing prompts for "
            "behavioral analysis beyond perplexity."
        ),
        strategies=[
            _strategy("head_pruning"),
            _strategy("ffn_ablation"),
            _strategy("embedding_ablation", chunk_size=16),
        ],
        max_samples=400,
        batch_size=4,
        max_length=512,
        tags=["jailbreak", "refusal", "alignment", "uncensored", "interpretability"],
    ),
    # Guardrail / safety ablation
    StudyPreset(
        key="guardrail",
        name="Guardrail Ablation",
        description=(
            "Systematic removal of components to study where safety and alignment "
            "behaviors are encoded. Ablates every layer, every attention head, "
            "every FFN block, and embedding dimensions. Designed for alignment "
            "researchers studying refusal mechanisms, RLHF imprints, and safety "
            "fine-tuning localization. Use with safety-tuned models for best results."
        ),
        strategies=[
            _strategy("layer_removal"),
            _strategy("head_pruning"),
            _strategy("ffn_ablation"),
            _strategy("embedding_ablation", chunk_size=24),
        ],
        max_samples=300,
        batch_size=4,
        max_length=512,
        tags=["safety", "alignment", "guardrails", "uncensored", "research"],
    ),
    # Robustness test
    StudyPreset(
        key="robustness",
        name="Robustness Test",
        description=(
            "Stress-test the model by ablating layers, heads, and FFNs with "
            "a larger evaluation set. Good for understanding how fragile the "
            "model is and which components are load-bearing."
        ),
        strategies=[
            _strategy("layer_removal"),
            _strategy("head_pruning"),
            _strategy("ffn_ablation"),
        ],
        max_samples=500,
        batch_size=8,
        max_length=512,
        tags=["robustness", "thorough"],
    ),
]
def _register_presets(
    presets: list[StudyPreset], registry: dict[str, StudyPreset]
) -> None:
    """Index *presets* by their ``key`` inside *registry*.

    Presets are inserted in list order, so the registry's iteration order
    matches the order of *presets*.

    Args:
        presets: Presets to register.
        registry: Mapping that receives one ``key -> preset`` entry each.

    Raises:
        ValueError: If a preset's key is already present in *registry*.
            A plain assignment would silently replace the earlier preset,
            hiding a copy/paste mistake in the preset table.
    """
    for preset in presets:
        if preset.key in registry:
            raise ValueError(
                f"Duplicate study preset key {preset.key!r}: "
                f"{registry[preset.key].name!r} and {preset.name!r}"
            )
        registry[preset.key] = preset


# Populate the registry at import time. Doing it inside a function keeps the
# loop variable out of the module namespace.
_register_presets(_PRESETS_LIST, STUDY_PRESETS)
def get_study_preset(key: str) -> StudyPreset:
    """Return the preset registered under *key*.

    Args:
        key: Identifier to look up in ``STUDY_PRESETS``.

    Returns:
        The ``StudyPreset`` stored under *key*.

    Raises:
        KeyError: If *key* is not registered. The message lists every
            available key in sorted order.
    """
    if key in STUDY_PRESETS:
        return STUDY_PRESETS[key]
    known_keys = ", ".join(sorted(STUDY_PRESETS))
    raise KeyError(f"Unknown preset {key!r}. Available: {known_keys}")


# Shorter name bound to the same lookup function.
get_preset = get_study_preset
def list_study_presets() -> list[StudyPreset]:
    """Return every registered preset in display order.

    The result is a new list built from ``STUDY_PRESETS`` in the registry's
    iteration order; changing the list does not change the registry.
    """
    return [*STUDY_PRESETS.values()]


# Shorter name bound to the same listing function.
list_presets = list_study_presets