# Mirror of https://github.com/elder-plinius/OBLITERATUS.git
# (synced 2026-06-03 12:58:06 +02:00; 255 lines, 9.5 KiB, Python)
"""Pre-built ablation presets.

Each preset defines a combination of strategies, evaluation settings, and
a description of when to use it. Users can pick a preset instead of
manually configuring every knob.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
|
|
@dataclass
class StudyPreset:
    """A reusable ablation recipe.

    Bundles the strategies to run together with the evaluation settings to
    run them under, so a study can be selected by ``key`` instead of being
    configured field by field.

    Attributes:
        name: Human-readable title of the preset.
        key: Short identifier used in CLI / config.
        description: Free-text guidance on when to use the preset.
        strategies: Strategy specs, each shaped like
            ``{"name": ..., "params": {...}}``.
        metrics: Metric names to evaluate; defaults to ``["perplexity"]``.
        max_samples: Evaluation setting, default 100.
        batch_size: Evaluation setting, default 4.
        max_length: Evaluation setting, default 256.
        tags: Free-form labels; defaults to an empty list.

    NOTE(review): the exact meaning of ``max_samples`` / ``batch_size`` /
    ``max_length`` is decided by whatever consumes the preset (presumably a
    sample cap, an evaluation batch size and a token length limit) —
    confirm against the runner.
    """

    name: str
    key: str  # short identifier used in CLI / config
    description: str
    strategies: list[dict[str, Any]]  # [{name: ..., params: {...}}, ...]
    # Mutable defaults go through default_factory so that every instance
    # gets its own list instead of sharing one object.
    metrics: list[str] = field(default_factory=lambda: ["perplexity"])
    max_samples: int = 100
    batch_size: int = 4
    max_length: int = 256
    tags: list[str] = field(default_factory=list)
|
|
|
|
|
|
# Registry of built-in presets, keyed by ``StudyPreset.key``. It is filled
# from ``_PRESETS_LIST`` below, so its iteration order is the list order.
STUDY_PRESETS: dict[str, StudyPreset] = {}

# Built-in presets in display order.
_PRESETS_LIST = [
    # ── Quick / smoke-test ──────────────────────────────────────────────
    StudyPreset(
        name="Quick Scan",
        key="quick",
        description=(
            "Fast sanity check. Removes each layer once and each FFN once. "
            "Good for a first look at any model."
        ),
        strategies=[
            {"name": "layer_removal", "params": {}},
            {"name": "ffn_ablation", "params": {}},
        ],
        max_samples=25,
        batch_size=4,
        max_length=128,
        tags=["fast", "general"],
    ),

    # ── Full sweep ──────────────────────────────────────────────────────
    StudyPreset(
        name="Full Sweep",
        key="full",
        description=(
            "Run every strategy on every component. Layers, heads, FFNs, and "
            "embedding chunks. The most thorough option — can be slow on large models."
        ),
        strategies=[
            {"name": "layer_removal", "params": {}},
            {"name": "head_pruning", "params": {}},
            {"name": "ffn_ablation", "params": {}},
            {"name": "embedding_ablation", "params": {"chunk_size": 48}},
        ],
        max_samples=200,
        batch_size=4,
        max_length=256,
        tags=["thorough", "general"],
    ),

    # ── Attention-focused ───────────────────────────────────────────────
    StudyPreset(
        name="Attention Deep-Dive",
        key="attention",
        description=(
            "Focus exclusively on attention heads. Prunes every head individually "
            "to find which heads are critical vs. redundant. Essential for "
            "understanding multi-head attention allocation."
        ),
        strategies=[
            {"name": "head_pruning", "params": {}},
        ],
        max_samples=100,
        batch_size=4,
        max_length=256,
        tags=["attention", "heads", "focused"],
    ),

    # ── Layer importance ────────────────────────────────────────────────
    StudyPreset(
        name="Layer Importance",
        key="layers",
        description=(
            "Remove each transformer layer one at a time and also ablate each "
            "FFN block. Reveals the depth profile of the model — which layers "
            "carry the most information."
        ),
        strategies=[
            {"name": "layer_removal", "params": {}},
            {"name": "ffn_ablation", "params": {}},
        ],
        max_samples=100,
        batch_size=4,
        max_length=256,
        tags=["layers", "depth", "general"],
    ),

    # ── Knowledge localization ──────────────────────────────────────────
    StudyPreset(
        name="Knowledge Localization",
        key="knowledge",
        description=(
            "Targets the FFN/MLP blocks and embedding dimensions. FFNs are "
            "believed to store factual knowledge — this preset helps identify "
            "where knowledge is concentrated in the model."
        ),
        strategies=[
            {"name": "ffn_ablation", "params": {}},
            {"name": "embedding_ablation", "params": {"chunk_size": 32}},
        ],
        max_samples=150,
        batch_size=4,
        max_length=256,
        tags=["knowledge", "ffn", "embeddings"],
    ),

    # ── Pruning candidate finder ────────────────────────────────────────
    StudyPreset(
        name="Pruning Candidates",
        key="pruning",
        description=(
            "Designed for model compression research. Tests every head and every "
            "FFN to find components that can be removed with minimal quality loss. "
            "Use the results to guide structured pruning."
        ),
        strategies=[
            {"name": "head_pruning", "params": {}},
            {"name": "ffn_ablation", "params": {}},
        ],
        max_samples=100,
        batch_size=4,
        max_length=256,
        tags=["pruning", "compression", "efficiency"],
    ),

    # ── Embedding analysis ──────────────────────────────────────────────
    StudyPreset(
        name="Embedding Analysis",
        key="embeddings",
        description=(
            "Systematically ablate embedding dimension ranges to understand "
            "which dimensions carry the most semantic signal. Uses fine-grained "
            "16-dim chunks for detailed analysis."
        ),
        strategies=[
            {"name": "embedding_ablation", "params": {"chunk_size": 16}},
        ],
        max_samples=100,
        batch_size=4,
        max_length=256,
        tags=["embeddings", "representation"],
    ),

    # ── Jailbreak / refusal localization ───────────────────────────────
    StudyPreset(
        name="Jailbreak Analysis",
        key="jailbreak",
        description=(
            "Surgical preset for locating refusal-mediating components. "
            "Inspired by 'Refusal in Language Models Is Mediated by a Single "
            "Direction' (Arditi et al.). Uses fine-grained head pruning, FFN "
            "ablation, and 16-dim embedding chunks to pinpoint which specific "
            "components encode refusal behaviors. Best used on instruct/chat "
            "models — compare results against the base model to isolate "
            "RLHF/DPO imprints. Pair with custom safety-probing prompts for "
            "behavioral analysis beyond perplexity."
        ),
        strategies=[
            {"name": "head_pruning", "params": {}},
            {"name": "ffn_ablation", "params": {}},
            {"name": "embedding_ablation", "params": {"chunk_size": 16}},
        ],
        max_samples=400,
        batch_size=4,
        max_length=512,
        tags=["jailbreak", "refusal", "alignment", "uncensored", "interpretability"],
    ),

    # ── Guardrail / safety ablation ────────────────────────────────────
    StudyPreset(
        name="Guardrail Ablation",
        key="guardrail",
        description=(
            "Systematic removal of components to study where safety and alignment "
            "behaviors are encoded. Ablates every layer, every attention head, "
            "every FFN block, and embedding dimensions. Designed for alignment "
            "researchers studying refusal mechanisms, RLHF imprints, and safety "
            "fine-tuning localization. Use with safety-tuned models for best results."
        ),
        strategies=[
            {"name": "layer_removal", "params": {}},
            {"name": "head_pruning", "params": {}},
            {"name": "ffn_ablation", "params": {}},
            {"name": "embedding_ablation", "params": {"chunk_size": 24}},
        ],
        max_samples=300,
        batch_size=4,
        max_length=512,
        tags=["safety", "alignment", "guardrails", "uncensored", "research"],
    ),

    # ── Robustness test ─────────────────────────────────────────────────
    StudyPreset(
        name="Robustness Test",
        key="robustness",
        description=(
            "Stress-test the model by ablating layers, heads, and FFNs with "
            "a larger evaluation set. Good for understanding how fragile the "
            "model is and which components are load-bearing."
        ),
        strategies=[
            {"name": "layer_removal", "params": {}},
            {"name": "head_pruning", "params": {}},
            {"name": "ffn_ablation", "params": {}},
        ],
        max_samples=500,
        batch_size=8,
        max_length=512,
        tags=["robustness", "thorough"],
    ),
]

for p in _PRESETS_LIST:
    # Fail fast at import time: a plain assignment would silently replace an
    # earlier preset if a copy-pasted entry reused its key.
    if p.key in STUDY_PRESETS:
        raise ValueError(f"Duplicate study preset key: {p.key!r}")
    STUDY_PRESETS[p.key] = p
|
|
|
|
|
|
def get_study_preset(key: str) -> StudyPreset:
    """Look up a preset by its key.

    Args:
        key: The ``StudyPreset.key`` the preset is registered under in
            ``STUDY_PRESETS``.

    Returns:
        The registered preset object itself (not a copy).

    Raises:
        KeyError: If no preset is registered under ``key``. The message
            lists the available keys in sorted order.
    """
    try:
        return STUDY_PRESETS[key]
    except KeyError:
        available = ", ".join(sorted(STUDY_PRESETS))
        # ``from None`` drops the bare dict KeyError from the traceback so
        # callers only see the descriptive one.
        raise KeyError(f"Unknown preset {key!r}. Available: {available}") from None
|
|
|
|
|
|
# Convenience alias: a shorter name bound to the same function object.
get_preset = get_study_preset
|
|
|
|
|
|
def list_study_presets() -> list[StudyPreset]:
    """Return all presets in display order.

    Display order is the insertion order of ``STUDY_PRESETS``.

    Returns:
        A new list on every call, so callers may reorder or filter it
        without touching the registry. The preset objects inside it are the
        registered instances, not copies.
    """
    return list(STUDY_PRESETS.values())
|
|
|
|
|
|
# Convenience alias: a shorter name bound to the same function object.
list_presets = list_study_presets
|