Add leaderboard generation script (#2)

Add a `claudini.leaderboard` module that scans benchmark result files and generates per-track, per-model leaderboard JSONs ranking methods by average loss. Output: `results/loss_leaderboard/<preset>/<model_tag>.json`.
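For illustration, a generated leaderboard file might look roughly like the dict below, which the script serializes with `json.dump(..., indent=2)`. The shape follows the `Leaderboard`/`MethodEntry` TypedDicts in the new module; the second method name and all numbers here are made up:

```python
# Hypothetical contents of results/loss_leaderboard/random_valid/Qwen2.5-7B-Instruct.json
{
    "preset": "random_valid",
    "model": "Qwen/Qwen2.5-7B-Instruct",
    "model_tag": "Qwen2.5-7B-Instruct",
    "max_flops": 9.87e15,
    "leaderboard": [
        {"rank": 1, "method": "gcg", "num_runs": 30, "avg_loss": 0.4123,
         "std_loss": 0.0911, "min_loss": 0.2104, "max_flops": 9.87e15},
        {"rank": 2, "method": "random_search", "num_runs": 30, "avg_loss": 0.6571,
         "std_loss": 0.1342, "min_loss": 0.3318, "max_flops": 9.12e15},
    ],
}
```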

Also: rename `_build_input_spec` -> `build_input_spec` in `run_bench.py`.

Assisted-by: Claude <noreply@anthropic.com>
Co-authored-by: Alexander Panfilov <apanfilov@g003.internal.cluster.is.localnet>
Co-authored-by: Peter Romov <peter@romov.com>
Authored by Alexander Panfilov on 2026-04-06 18:55:32 +03:00, committed via GitHub
commit 8221f0afb8 (parent 59106bdf3c)
3 changed files with 189 additions and 2 deletions
+2
@@ -70,6 +70,8 @@ We consider white-box GCG-style attacks that search directly over the model's vo
See [`CLAUDE.md`](CLAUDE.md) for how to implement a new method.

+**Leaderboard.** Run `uv run -m claudini.leaderboard results/` to generate per-track, per-model leaderboards ranking all methods by average loss. Results are saved to `results/loss_leaderboard/<preset>/<model_tag>.json`.

## Citation

```bibtex
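Not part of the diff, but for context: the command scans benchmark outputs laid out as documented in `discover_results` below (`<results_dir>/<method>/<preset>/<model_tag>/sample_*_seed_*.json`). A sketch of the expected layout, with hypothetical method and file names:

```
results/
├── gcg/
│   └── random_valid/
│       └── Qwen2.5-7B-Instruct/
│           ├── sample_0_seed_0.json
│           └── sample_1_seed_1.json
└── claude_v82/
    └── random_valid/
        └── Qwen2.5-7B-Instruct/
            └── sample_0_seed_0.json
```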
+185
@@ -0,0 +1,185 @@
"""
Generate loss leaderboards from benchmark results.
Usage:
uv run -m claudini.leaderboard results/
uv run -m claudini.leaderboard results/ --preset random_valid
uv run -m claudini.leaderboard results/ --preset random_valid --model-tag Qwen2.5-7B-Instruct
"""
import json
import logging
from pathlib import Path
from typing import Annotated, TypedDict
import typer
logger = logging.getLogger("claudini")
app = typer.Typer(add_completion=False)
class MethodEntry(TypedDict):
    rank: Annotated[int, "1-indexed position, sorted by avg_loss ascending"]
    method: Annotated[str, "Method name, e.g. 'gcg', 'claude_v82'"]
    num_runs: Annotated[int, "Number of runs (examples x seeds) for this method"]
    avg_loss: Annotated[float, "Mean of best_loss over all runs"]
    std_loss: Annotated[float, "Population stddev of best_loss over all runs"]
    min_loss: Annotated[float, "Minimum best_loss over all runs"]
    max_flops: Annotated[float, "Maximum FLOPs consumed over all runs"]


class Leaderboard(TypedDict):
    preset: Annotated[str, "Config preset name, e.g. 'random_valid'"]
    model: Annotated[str, "Full HuggingFace model ID, e.g. 'Qwen/Qwen2.5-7B-Instruct'"]
    model_tag: Annotated[str, "Short model directory name, e.g. 'Qwen2.5-7B-Instruct'"]
    max_flops: Annotated[float | None, "Maximum FLOPs consumed over all runs (should be <= the FLOP budget)"]
    leaderboard: list[MethodEntry]


def discover_results(results_dir: Path) -> dict[tuple[str, str, str], list[Path]]:
"""Scan results_dir and group result files by (preset, model_tag, method).
Expected layout: results_dir/<method>/<preset>/<model_tag>/sample_*_seed_*.json
"""
groups: dict[tuple[str, str, str], list[Path]] = {}
for path in results_dir.rglob("sample_*_seed_*.json"):
parts = path.relative_to(results_dir).parts
if len(parts) != 4:
continue
method, preset, model_tag, _ = parts
groups.setdefault((preset, model_tag, method), []).append(path)
return groups
def build_leaderboard(
    groups: dict[tuple[str, str, str], list[Path]],
    preset: str,
    model_tag: str,
) -> Leaderboard:
    """Build a leaderboard dict for one (preset, model_tag) combination."""
    methods = []
    for (p, m, method_name), paths in sorted(groups.items()):
        if p != preset or m != model_tag:
            continue

        losses = []
        total_flops_list = []
        for path in paths:
            try:
                with open(path) as f:
                    d = json.load(f)
            except Exception:
                logger.warning("Failed to load %s", path)
                continue
            # Respect eval_on: use soft loss only when the method says to evaluate on soft
            if d.get("eval_on") == "soft" and d.get("best_soft_loss") is not None:
                loss = d["best_soft_loss"]
            else:
                loss = d["best_loss"]
            losses.append(loss)
            total_flops_list.append(d.get("total_flops", 0))

        if not losses:
            continue

        avg_loss = sum(losses) / len(losses)
        std_loss = (sum((x - avg_loss) ** 2 for x in losses) / len(losses)) ** 0.5
        min_loss = min(losses)
        methods.append(
            MethodEntry(
                rank=0,  # filled after sorting
                method=method_name,
                avg_loss=round(avg_loss, 4),
                std_loss=round(std_loss, 4),
                min_loss=round(min_loss, 4),
                num_runs=len(losses),
                max_flops=round(max(total_flops_list), 2),
            )
        )

    methods.sort(key=lambda m: m["avg_loss"])
    for i, m in enumerate(methods, 1):
        m["rank"] = i

    # Extract model_name from first result file for metadata
    model_name = None
    for (p, m, _), paths in groups.items():
        if p == preset and m == model_tag and paths:
            try:
                with open(paths[0]) as f:
                    d = json.load(f)
                model_name = d.get("model_name")
            except Exception:
                pass
            break

    all_flops = [entry["max_flops"] for entry in methods if entry["max_flops"] > 0]
    max_flops = round(max(all_flops), 2) if all_flops else None

    return Leaderboard(
        preset=preset,
        model=model_name or model_tag,
        model_tag=model_tag,
        max_flops=max_flops,
        leaderboard=methods,
    )


@app.command()
def leaderboard(
    results_dir: Annotated[str, typer.Argument(help="Path to results directory")] = "results",
    preset: Annotated[str | None, typer.Option(help="Filter to a specific preset")] = None,
    model_tag: Annotated[str | None, typer.Option(help="Filter to a specific model tag")] = None,
    output_dir: Annotated[
        str | None, typer.Option(help="Output directory (default: <results_dir>/loss_leaderboard)")
    ] = None,
):
    """Generate loss leaderboards from benchmark result files."""
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    results_path = Path(results_dir)
    if not results_path.is_dir():
        raise typer.BadParameter(f"Results directory not found: {results_dir}")
    out_path = Path(output_dir) if output_dir else results_path / "loss_leaderboard"

    groups = discover_results(results_path)
    if not groups:
        logger.info("No result files found in %s", results_dir)
        raise typer.Exit()

    # Collect all (preset, model_tag) combinations
    combos = sorted({(p, m) for p, m, _ in groups})
    if preset:
        combos = [(p, m) for p, m in combos if p == preset]
    if model_tag:
        combos = [(p, m) for p, m in combos if m == model_tag]
    if not combos:
        logger.info("No matching results for preset=%s model_tag=%s", preset, model_tag)
        raise typer.Exit()

    generated = 0
    for p, m in combos:
        board = build_leaderboard(groups, p, m)
        if not board["leaderboard"]:
            continue
        dest = out_path / p / f"{m}.json"
        dest.parent.mkdir(parents=True, exist_ok=True)
        with open(dest, "w") as f:
            json.dump(board, f, indent=2)
            f.write("\n")
        n_methods = len(board["leaderboard"])
        logger.info(" %s/%s.json (%d methods)", p, m, n_methods)
        generated += 1

    logger.info("Generated %d leaderboard(s) in %s", generated, out_path)


if __name__ == "__main__":
    app()
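Not part of the diff, but for reference: the two helpers can also be driven from Python. A minimal sketch, assuming the module is importable as `claudini.leaderboard` (as the usage strings above suggest) and that matching result files already exist under `results/`:

```python
from pathlib import Path

from claudini.leaderboard import build_leaderboard, discover_results

# Group result files by (preset, model_tag, method), then rank methods for one combination.
groups = discover_results(Path("results"))
board = build_leaderboard(groups, preset="random_valid", model_tag="Qwen2.5-7B-Instruct")
for entry in board["leaderboard"]:
    print(entry["rank"], entry["method"], entry["avg_loss"])
```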
+2 -2
@@ -43,7 +43,7 @@ logger = logging.getLogger("claudini")
app = typer.Typer(add_completion=False)
-def _build_input_spec(preset_cfg: dict) -> InputSpec:
+def build_input_spec(preset_cfg: dict) -> InputSpec:
"""Build InputSpec from preset YAML config."""
if "input_spec" in preset_cfg:
return InputSpec.from_dict(preset_cfg["input_spec"])
@@ -125,7 +125,7 @@ def run_bench(
    model_name = model if model is not None else preset_cfg.get("model", "gpt2")
    # Build InputSpec from preset config
-    input_spec = _build_input_spec(preset_cfg)
+    input_spec = build_input_spec(preset_cfg)
    method_kwargs = preset_cfg.get("method_kwargs", {})
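Not part of the diff: with the leading underscore dropped, the helper can be imported by other tooling. A minimal sketch, where the import path `claudini.run_bench`, the PyYAML dependency, and the config path are illustrative assumptions:

```python
from pathlib import Path

import yaml  # assumes PyYAML is available

from claudini.run_bench import build_input_spec  # hypothetical import path for run_bench.py

# Load a preset YAML (path hypothetical) and build the InputSpec the same way run_bench does.
preset_cfg = yaml.safe_load(Path("configs/random_valid.yaml").read_text())
input_spec = build_input_spec(preset_cfg)
```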