Add files via upload

This commit is contained in:
pliny
2026-03-08 12:07:56 -07:00
committed by GitHub
parent 1065809658
commit 69fa63ac43
14 changed files with 1320 additions and 274 deletions
+58 -4
View File
@@ -1317,11 +1317,56 @@
<span class="method-label">AGGRESSIVE</span>
<span class="method-desc">Full Gabliteration + 3-pass refine</span>
</label>
<!--
  Abliteration method selector: one radio option per method.
  Each <label> wraps a radio input in the shared `abl-method` group and calls
  setAblMethod('<key>') when clicked. The same <key> is repeated in three
  places and must be kept in sync: the label id (`method-<key>`), the
  setAblMethod argument, and the input's `value`.
  The `informed` option is visually highlighted with inline cyan styling.
  NOTE(review): the keys used here line up with the entries of METHOD_INFO in
  the page script; presumably setAblMethod looks the key up there - confirm
  when adding or renaming a method.
  NOTE(review): an onclick handler on a <label> that wraps its own input
  normally runs twice per click (the label click, then the click forwarded to
  the input bubbling back up) - confirm setAblMethod is safe to call twice.
-->
<label class="method-radio" id="method-spectral_cascade" onclick="setAblMethod('spectral_cascade')">
<input type="radio" name="abl-method" value="spectral_cascade">
<span class="method-label">SPECTRAL</span>
<span class="method-desc">DCT frequency-selective decomposition</span>
</label>
<label class="method-radio" id="method-informed" onclick="setAblMethod('informed')" style="border-color:var(--cyan)">
<input type="radio" name="abl-method" value="informed">
<span class="method-label" style="color:var(--cyan)">INFORMED</span>
<span class="method-desc">Analysis-guided auto-config + Ouroboros</span>
</label>
<label class="method-radio" id="method-surgical" onclick="setAblMethod('surgical')">
<input type="radio" name="abl-method" value="surgical">
<span class="method-label">SURGICAL</span>
<span class="method-desc">Precision MoE-aware head surgery</span>
</label>
<label class="method-radio" id="method-optimized" onclick="setAblMethod('optimized')">
<input type="radio" name="abl-method" value="optimized">
<span class="method-label">OPTIMIZED</span>
<span class="method-desc">Bayesian auto-tuned + KL-optimized</span>
</label>
<label class="method-radio" id="method-inverted" onclick="setAblMethod('inverted')">
<input type="radio" name="abl-method" value="inverted">
<span class="method-label">INVERTED</span>
<span class="method-desc">Semantic refusal inversion</span>
</label>
<label class="method-radio" id="method-nuclear" onclick="setAblMethod('nuclear')">
<input type="radio" name="abl-method" value="nuclear">
<span class="method-label">NUCLEAR</span>
<span class="method-desc">Maximum force combo</span>
</label>
<label class="method-radio" id="method-failspy" onclick="setAblMethod('failspy')">
<input type="radio" name="abl-method" value="failspy">
<span class="method-label">FAILSPY</span>
<span class="method-desc">FailSpy/abliterator baseline</span>
</label>
<label class="method-radio" id="method-gabliteration" onclick="setAblMethod('gabliteration')">
<input type="radio" name="abl-method" value="gabliteration">
<span class="method-label">GABLIT</span>
<span class="method-desc">Gabliteration (G&uuml;lmez 2026) baseline</span>
</label>
<label class="method-radio" id="method-heretic" onclick="setAblMethod('heretic')">
<input type="radio" name="abl-method" value="heretic">
<span class="method-label">HERETIC</span>
<span class="method-desc">Heretic/p-e-w Bayesian baseline</span>
</label>
<label class="method-radio" id="method-rdo" onclick="setAblMethod('rdo')">
<input type="radio" name="abl-method" value="rdo">
<span class="method-label">RDO</span>
<span class="method-desc">Refusal Direction Optimization baseline</span>
</label>
</div>
<div id="method-details" style="margin-top:10px; font-size:0.7rem; color:var(--text-dim); padding:8px; border:1px solid rgba(188,19,254,0.2); border-radius:4px">
4 SVD directions &bull; norm-preserving &bull; 30% regularization &bull; 2 refinement passes &bull; 32 prompt pairs
@@ -1941,10 +1986,19 @@ function startAbliterateFromLibrary(hfId) {
// Key of the currently selected method; starts on 'advanced'.
// Presumably reassigned by setAblMethod (defined outside this excerpt).
let ablMethod = 'advanced';

/**
 * Per-method summary table, keyed by method identifier.
 *
 * The keys mirror the `value` attributes of the `abl-method` radio inputs and
 * are listed in the same order as those radios.
 *
 * Fields of each entry, as far as the `desc` strings indicate:
 *   dirs   - number of directions extracted, or 'auto'
 *   norm   - whether the method is norm-preserving
 *   reg    - regularization strength in [0, 1], or 'auto'
 *   passes - number of refinement passes, or 'auto'
 *   desc   - HTML fragment (entities and inline <span> markup) summarising
 *            the method; presumably rendered into #method-details - confirm
 *            against setAblMethod.
 *
 * Every key must appear exactly once. A repeated key in an object literal is
 * not an error: the last occurrence silently replaces the earlier value while
 * the property keeps the position of its first occurrence.
 */
const METHOD_INFO = {
  basic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'1 direction &bull; standard projection &bull; 1 pass'},
  advanced: {dirs:4, norm:true, reg:0.3, passes:2, desc:'4 SVD directions &bull; norm-preserving &bull; 30% regularization &bull; 2 refinement passes'},
  aggressive: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions &bull; norm-preserving &bull; full orthogonalization &bull; 3 refinement passes'},
  spectral_cascade: {dirs:6, norm:true, reg:0.15, passes:1, desc:'6 whitened-SVD directions &bull; DCT frequency decomposition &bull; coherence-weighted &bull; adaptive bands'},
  informed: {dirs:'auto', norm:true, reg:'auto', passes:'auto', desc:'<span style="color:var(--cyan)">Analysis-guided</span> &bull; auto directions &bull; auto regularization &bull; Ouroboros-compensated &bull; cone/alignment/cluster analysis'},
  surgical: {dirs:4, norm:true, reg:0.2, passes:2, desc:'4 SVD directions &bull; attention head surgery &bull; SAE features &bull; safety neuron masking &bull; per-expert MoE'},
  optimized: {dirs:4, norm:true, reg:0.2, passes:2, desc:'4 SVD directions &bull; Bayesian auto-tuned &bull; CoT-aware &bull; KL co-optimized &bull; winsorized activations'},
  inverted: {dirs:4, norm:true, reg:0.1, passes:2, desc:'4 SVD directions &bull; semantic inversion (2x reflection) &bull; SAE feature targeting'},
  nuclear: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions &bull; all techniques combined &bull; maximum force &bull; head surgery + SAE + steering + transplant'},
  // Baseline reproductions of previously published methods.
  failspy: {dirs:1, norm:false, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> &bull; 1 diff-means direction &bull; all layers except first &bull; FailSpy/abliterator reproduction'},
  gabliteration: {dirs:4, norm:false, reg:0.231, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> &bull; 4 SVD directions &bull; ridge reg (alpha=0.3) &bull; top-k layer selection &bull; G&uuml;lmez 2026'},
  heretic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> &bull; 1 diff-means &bull; Bayesian (Optuna TPE) &bull; KL-optimized &bull; float layer interpolation &bull; p-e-w'},
  rdo: {dirs:4, norm:true, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> &bull; 4 SVD directions &bull; gradient-refined (RDO) &bull; linear probe classifier &bull; Wollschlager ICML 2025'},
};
function getAblCmd() {