mirror of https://github.com/romovpa/claudini.git, synced 2026-04-28 18:45:59 +02:00
Add leaderboard generation script (#2)

Add `claudini.leaderboard` module that scans benchmark result files and generates per-track, per-model leaderboard JSONs ranking methods by average loss. Output: results/loss_leaderboard/<preset>/<model_tag>.json

Also: rename _build_input_spec -> build_input_spec in run_bench.py.

Assisted-by: Claude <noreply@anthropic.com>
Co-authored-by: Alexander Panfilov <apanfilov@g003.internal.cluster.is.localnet>
Co-authored-by: Peter Romov <peter@romov.com>
committed by GitHub · parent 59106bdf3c · commit 8221f0afb8
@@ -70,6 +70,8 @@ We consider white-box GCG-style attacks that search directly over the model's vo
 
 See [`CLAUDE.md`](CLAUDE.md) for how to implement a new method.
 
+**Leaderboard.** Run `uv run -m claudini.leaderboard results/` to generate per-track, per-model leaderboards ranking all methods by average loss. Results are saved to `results/loss_leaderboard/<preset>/<model_tag>.json`.
+
 ## Citation
 
 ```bibtex
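Each generated file is a JSON object with the `Leaderboard` shape defined by the new module below. A sketch of the serialized dict with purely hypothetical method names and values:

```python
# Hypothetical contents of results/loss_leaderboard/random_valid/Qwen2.5-7B-Instruct.json,
# shown as the Python dict the module serializes; all values are illustrative.
{
    "preset": "random_valid",
    "model": "Qwen/Qwen2.5-7B-Instruct",
    "model_tag": "Qwen2.5-7B-Instruct",
    "max_flops": 1.2e15,  # highest FLOPs seen across all runs
    "leaderboard": [
        {"rank": 1, "method": "gcg", "num_runs": 20,
         "avg_loss": 0.4321, "std_loss": 0.0512, "min_loss": 0.3107,
         "max_flops": 1.2e15},
    ],
}
```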
@@ -0,0 +1,185 @@
"""
Generate loss leaderboards from benchmark results.

Usage:
    uv run -m claudini.leaderboard results/
    uv run -m claudini.leaderboard results/ --preset random_valid
    uv run -m claudini.leaderboard results/ --preset random_valid --model-tag Qwen2.5-7B-Instruct
"""

import json
import logging
from pathlib import Path
from typing import Annotated, TypedDict

import typer

logger = logging.getLogger("claudini")

app = typer.Typer(add_completion=False)


class MethodEntry(TypedDict):
    rank: Annotated[int, "1-indexed position, sorted by avg_loss ascending"]
    method: Annotated[str, "Method name, e.g. 'gcg', 'claude_v82'"]
    num_runs: Annotated[int, "Number of runs (examples x seeds) for this method"]
    avg_loss: Annotated[float, "Mean of best_loss over all runs"]
    std_loss: Annotated[float, "Population stddev of best_loss over all runs"]
    min_loss: Annotated[float, "Minimum best_loss over all runs"]
    max_flops: Annotated[float, "Maximum FLOPs consumed over all runs"]


class Leaderboard(TypedDict):
    preset: Annotated[str, "Config preset name, e.g. 'random_valid'"]
    model: Annotated[str, "Full HuggingFace model ID, e.g. 'Qwen/Qwen2.5-7B-Instruct'"]
    model_tag: Annotated[str, "Short model directory name, e.g. 'Qwen2.5-7B-Instruct'"]
    max_flops: Annotated[float | None, "Maximum FLOPs consumed over all runs (should be <= the FLOP budget)"]
    leaderboard: list[MethodEntry]


def discover_results(results_dir: Path) -> dict[tuple[str, str, str], list[Path]]:
    """Scan results_dir and group result files by (preset, model_tag, method).

    Expected layout: results_dir/<method>/<preset>/<model_tag>/sample_*_seed_*.json
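
    For example, a hypothetical file following the layout above,
    results/gcg/random_valid/Qwen2.5-7B-Instruct/sample_0_seed_1.json,
    is grouped under the key ("random_valid", "Qwen2.5-7B-Instruct", "gcg").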
    """
    groups: dict[tuple[str, str, str], list[Path]] = {}
    for path in results_dir.rglob("sample_*_seed_*.json"):
        parts = path.relative_to(results_dir).parts
        if len(parts) != 4:
            continue
        method, preset, model_tag, _ = parts
        groups.setdefault((preset, model_tag, method), []).append(path)
    return groups


def build_leaderboard(
    groups: dict[tuple[str, str, str], list[Path]],
    preset: str,
    model_tag: str,
) -> Leaderboard:
    """Build a leaderboard dict for one (preset, model_tag) combination."""
    methods = []
    for (p, m, method_name), paths in sorted(groups.items()):
        if p != preset or m != model_tag:
            continue

        losses = []
        total_flops_list = []
        for path in paths:
            try:
                with open(path) as f:
                    d = json.load(f)
            except Exception:
                logger.warning("Failed to load %s", path)
                continue
            # Respect eval_on: use soft loss only when the method says to evaluate on soft
            if d.get("eval_on") == "soft" and d.get("best_soft_loss") is not None:
                loss = d["best_soft_loss"]
            else:
                loss = d["best_loss"]
            losses.append(loss)
            total_flops_list.append(d.get("total_flops", 0))
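            # A missing total_flops counts as 0 here; zero-valued maxima are
            # later excluded from the top-level max_flops aggregation.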

        if not losses:
            continue

        avg_loss = sum(losses) / len(losses)
        std_loss = (sum((x - avg_loss) ** 2 for x in losses) / len(losses)) ** 0.5
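        # Population standard deviation (divides by N, not N - 1).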
        min_loss = min(losses)

        methods.append(
            MethodEntry(
                rank=0,  # filled after sorting
                method=method_name,
                avg_loss=round(avg_loss, 4),
                std_loss=round(std_loss, 4),
                min_loss=round(min_loss, 4),
                num_runs=len(losses),
                max_flops=round(max(total_flops_list), 2),
            )
        )

    methods.sort(key=lambda m: m["avg_loss"])
    for i, m in enumerate(methods, 1):
        m["rank"] = i

    # Extract model_name from first result file for metadata
    model_name = None
    for (p, m, _), paths in groups.items():
        if p == preset and m == model_tag and paths:
            try:
                with open(paths[0]) as f:
                    d = json.load(f)
                model_name = d.get("model_name")
            except Exception:
                pass
            break

    all_flops = [entry["max_flops"] for entry in methods if entry["max_flops"] > 0]
    max_flops = round(max(all_flops), 2) if all_flops else None

    return Leaderboard(
        preset=preset,
        model=model_name or model_tag,
        model_tag=model_tag,
        max_flops=max_flops,
        leaderboard=methods,
    )


@app.command()
def leaderboard(
    results_dir: Annotated[str, typer.Argument(help="Path to results directory")] = "results",
    preset: Annotated[str | None, typer.Option(help="Filter to a specific preset")] = None,
    model_tag: Annotated[str | None, typer.Option(help="Filter to a specific model tag")] = None,
    output_dir: Annotated[
        str | None, typer.Option(help="Output directory (default: <results_dir>/loss_leaderboard)")
    ] = None,
):
    """Generate loss leaderboards from benchmark result files."""
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    results_path = Path(results_dir)
    if not results_path.is_dir():
        raise typer.BadParameter(f"Results directory not found: {results_dir}")

    out_path = Path(output_dir) if output_dir else results_path / "loss_leaderboard"

    groups = discover_results(results_path)
    if not groups:
        logger.info("No result files found in %s", results_dir)
        raise typer.Exit()

    # Collect all (preset, model_tag) combinations
    combos = sorted({(p, m) for p, m, _ in groups})

    if preset:
        combos = [(p, m) for p, m in combos if p == preset]
    if model_tag:
        combos = [(p, m) for p, m in combos if m == model_tag]

    if not combos:
        logger.info("No matching results for preset=%s model_tag=%s", preset, model_tag)
        raise typer.Exit()

    generated = 0
    for p, m in combos:
        board = build_leaderboard(groups, p, m)
        if not board["leaderboard"]:
            continue

        dest = out_path / p / f"{m}.json"
        dest.parent.mkdir(parents=True, exist_ok=True)
        with open(dest, "w") as f:
            json.dump(board, f, indent=2)
            f.write("\n")

        n_methods = len(board["leaderboard"])
        logger.info(" %s/%s.json (%d methods)", p, m, n_methods)
        generated += 1

    logger.info("Generated %d leaderboard(s) in %s", generated, out_path)


if __name__ == "__main__":
    app()
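
Besides the CLI, the two helpers compose directly. A minimal programmatic sketch, assuming a local `results/` directory and the preset/model tag from the docstring examples:

```python
from pathlib import Path

from claudini.leaderboard import build_leaderboard, discover_results

# Group result files on disk, then build one leaderboard in memory
# instead of writing JSON files.
groups = discover_results(Path("results"))
board = build_leaderboard(groups, preset="random_valid", model_tag="Qwen2.5-7B-Instruct")
for entry in board["leaderboard"]:
    print(entry["rank"], entry["method"], entry["avg_loss"])
```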

@@ -43,7 +43,7 @@ logger = logging.getLogger("claudini")
 app = typer.Typer(add_completion=False)
 
 
-def _build_input_spec(preset_cfg: dict) -> InputSpec:
+def build_input_spec(preset_cfg: dict) -> InputSpec:
     """Build InputSpec from preset YAML config."""
     if "input_spec" in preset_cfg:
         return InputSpec.from_dict(preset_cfg["input_spec"])
@@ -125,7 +125,7 @@ def run_bench(
     model_name = model if model is not None else preset_cfg.get("model", "gpt2")
 
     # Build InputSpec from preset config
-    input_spec = _build_input_spec(preset_cfg)
+    input_spec = build_input_spec(preset_cfg)
 
     method_kwargs = preset_cfg.get("method_kwargs", {})