From 238c702b068338ccef2adb64d8fa6b668c0c1b54 Mon Sep 17 00:00:00 2001 From: Alexander Panfilov <39771221+kotekjedi@users.noreply.github.com> Date: Fri, 10 Apr 2026 13:53:34 +0300 Subject: [PATCH] Default filter_ascii to False to match legacy benchmark behavior (#3) Pre-2026-03-20 the token filter defaulted to off (filter_ids=False, no forbidden tokens). The subsequent split into filter_ascii/filter_special/ filter_retok introduced filter_ascii=True as the new default, silently narrowing the optimization vocabulary by ~50% for Qwen and invalidating comparisons against historical numbers (verified on claude_v63: avg loss 0.98 with filter vs 0.49 without, with ~12/20 samples matching bit-exact to the legacy hmcGCG numbers once the filter is disabled). Revert default to False so fresh runs reproduce the earlier leaderboard out of the box; presets that want ASCII filtering can still opt in explicitly. Assisted-by: Claude --- claudini/bench.py | 2 +- claudini/run_bench.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/claudini/bench.py b/claudini/bench.py index ef245ff..d08ea68 100644 --- a/claudini/bench.py +++ b/claudini/bench.py @@ -47,7 +47,7 @@ class BenchmarkConfig: input_spec: InputSpec = field(default_factory=InputSpec.default) # Token filtering and final evaluation mode - filter_ascii: bool = True # block non-ASCII / non-printable tokens + filter_ascii: bool = False # block non-ASCII / non-printable tokens filter_special: bool = False # block special / control / added tokens filter_retok: bool = False # decode->re-encode retokenization round-trip filter final_input: str = "tokens" diff --git a/claudini/run_bench.py b/claudini/run_bench.py index d9b95ec..c85c24c 100644 --- a/claudini/run_bench.py +++ b/claudini/run_bench.py @@ -115,7 +115,7 @@ def run_bench( resolved_max_time = preset_cfg.get("max_time", None) # Token filtering - resolved_filter_ascii = preset_cfg.get("filter_ascii", True) + resolved_filter_ascii = preset_cfg.get("filter_ascii", False) resolved_filter_special = preset_cfg.get("filter_special", False) resolved_filter_retok = preset_cfg.get("filter_retok", False) resolved_final_input = preset_cfg.get("final_input", "tokens")