Default filter_ascii to False to match legacy benchmark behavior (#3)

Pre-2026-03-20 the token filter defaulted to off (filter_ids=False, no
forbidden tokens). The subsequent split into filter_ascii/filter_special/
filter_retok introduced filter_ascii=True as the new default, silently
narrowing the optimization vocabulary by ~50% for Qwen and invalidating
comparisons against historical numbers (verified on claude_v63: avg loss
0.98 with the filter vs 0.49 without, and ~12/20 samples match the legacy
hmcGCG numbers bit-exactly once the filter is disabled). Revert the default
to False so fresh runs reproduce the earlier leaderboard out of the box;
presets that want ASCII filtering can still opt in explicitly.

Assisted-by: Claude <noreply@anthropic.com>
This commit is contained in:
Alexander Panfilov
2026-04-10 13:53:34 +03:00
committed by GitHub
parent 8221f0afb8
commit 238c702b06
2 changed files with 2 additions and 2 deletions
+1 -1
View File
@@ -47,7 +47,7 @@ class BenchmarkConfig:
input_spec: InputSpec = field(default_factory=InputSpec.default)
# Token filtering and final evaluation mode
-filter_ascii: bool = True # block non-ASCII / non-printable tokens
+filter_ascii: bool = False # block non-ASCII / non-printable tokens
filter_special: bool = False # block special / control / added tokens
filter_retok: bool = False # decode->re-encode retokenization round-trip filter
final_input: str = "tokens"
+1 -1
View File
@@ -115,7 +115,7 @@ def run_bench(
resolved_max_time = preset_cfg.get("max_time", None)
# Token filtering
-resolved_filter_ascii = preset_cfg.get("filter_ascii", True)
+resolved_filter_ascii = preset_cfg.get("filter_ascii", False)
resolved_filter_special = preset_cfg.get("filter_special", False)
resolved_filter_retok = preset_cfg.get("filter_retok", False)
resolved_final_input = preset_cfg.get("final_input", "tokens")