diff --git a/README.md b/README.md
index 3105dc1..fd20162 100644
--- a/README.md
+++ b/README.md
@@ -70,6 +70,8 @@ We consider white-box GCG-style attacks that search directly over the model's vo
 
 See [`CLAUDE.md`](CLAUDE.md) for how to implement a new method.
 
+**Leaderboard.** Run `uv run -m claudini.leaderboard results/` to generate per-track, per-model leaderboards ranking all methods by average loss. Results are saved to `results/loss_leaderboard/<preset>/<model_tag>.json`.
+
 ## Citation
 
 ```bibtex
diff --git a/claudini/leaderboard.py b/claudini/leaderboard.py
new file mode 100644
index 0000000..2fe0de5
--- /dev/null
+++ b/claudini/leaderboard.py
@@ -0,0 +1,185 @@
+"""
+Generate loss leaderboards from benchmark results.
+
+Usage:
+    uv run -m claudini.leaderboard results/
+    uv run -m claudini.leaderboard results/ --preset random_valid
+    uv run -m claudini.leaderboard results/ --preset random_valid --model-tag Qwen2.5-7B-Instruct
+"""
+
+import json
+import logging
+from pathlib import Path
+from typing import Annotated, TypedDict
+
+import typer
+
+logger = logging.getLogger("claudini")
+
+app = typer.Typer(add_completion=False)
+
+
+class MethodEntry(TypedDict):
+    rank: Annotated[int, "1-indexed position, sorted by avg_loss ascending"]
+    method: Annotated[str, "Method name, e.g. 'gcg', 'claude_v82'"]
+    num_runs: Annotated[int, "Number of runs (examples x seeds) for this method"]
+    avg_loss: Annotated[float, "Mean of best_loss over all runs"]
+    std_loss: Annotated[float, "Population stddev of best_loss over all runs"]
+    min_loss: Annotated[float, "Minimum best_loss over all runs"]
+    max_flops: Annotated[float, "Maximum FLOPs consumed over all runs"]
+
+
+class Leaderboard(TypedDict):
+    preset: Annotated[str, "Config preset name, e.g. 'random_valid'"]
+    model: Annotated[str, "Full HuggingFace model ID, e.g. 'Qwen/Qwen2.5-7B-Instruct'"]
+    model_tag: Annotated[str, "Short model directory name, e.g. 'Qwen2.5-7B-Instruct'"]
+    max_flops: Annotated[float | None, "Maximum FLOPs consumed over all runs (should be <= the FLOP budget)"]
+    leaderboard: list[MethodEntry]
+
+
+def discover_results(results_dir: Path) -> dict[tuple[str, str, str], list[Path]]:
+    """Scan results_dir and group result files by (preset, model_tag, method).
+
+    Expected layout: results_dir/<method>/<preset>/<model_tag>/sample_*_seed_*.json
+    """
+    groups: dict[tuple[str, str, str], list[Path]] = {}
+    for path in results_dir.rglob("sample_*_seed_*.json"):
+        parts = path.relative_to(results_dir).parts
+        if len(parts) != 4:
+            continue
+        method, preset, model_tag, _ = parts
+        groups.setdefault((preset, model_tag, method), []).append(path)
+    return groups
+
+
+def build_leaderboard(
+    groups: dict[tuple[str, str, str], list[Path]],
+    preset: str,
+    model_tag: str,
+) -> Leaderboard:
+    """Build a leaderboard dict for one (preset, model_tag) combination."""
+    methods = []
+    for (p, m, method_name), paths in sorted(groups.items()):
+        if p != preset or m != model_tag:
+            continue
+
+        losses = []
+        total_flops_list = []
+        for path in paths:
+            try:
+                with open(path) as f:
+                    d = json.load(f)
+            except Exception:
+                logger.warning("Failed to load %s", path)
+                continue
+            # Respect eval_on: use soft loss only when the method says to evaluate on soft
+            if d.get("eval_on") == "soft" and d.get("best_soft_loss") is not None:
+                loss = d["best_soft_loss"]
+            else:
+                loss = d["best_loss"]
+            losses.append(loss)
+            total_flops_list.append(d.get("total_flops", 0))
+
+        if not losses:
+            continue
+
+        avg_loss = sum(losses) / len(losses)
+        std_loss = (sum((x - avg_loss) ** 2 for x in losses) / len(losses)) ** 0.5
+        min_loss = min(losses)
+
+        methods.append(
+            MethodEntry(
+                rank=0,  # filled after sorting
+                method=method_name,
+                avg_loss=round(avg_loss, 4),
+                std_loss=round(std_loss, 4),
+                min_loss=round(min_loss, 4),
+                num_runs=len(losses),
+                max_flops=round(max(total_flops_list), 2),
+            )
+        )
+
+    methods.sort(key=lambda m: m["avg_loss"])
+    for i, m in enumerate(methods, 1):
+        m["rank"] = i
+
+    # Extract model_name from first result file for metadata
+    model_name = None
+    for (p, m, _), paths in groups.items():
+        if p == preset and m == model_tag and paths:
+            try:
+                with open(paths[0]) as f:
+                    d = json.load(f)
+                model_name = d.get("model_name")
+            except Exception:
+                pass
+            break
+
+    all_flops = [entry["max_flops"] for entry in methods if entry["max_flops"] > 0]
+    max_flops = round(max(all_flops), 2) if all_flops else None
+
+    return Leaderboard(
+        preset=preset,
+        model=model_name or model_tag,
+        model_tag=model_tag,
+        max_flops=max_flops,
+        leaderboard=methods,
+    )
+
+
+@app.command()
+def leaderboard(
+    results_dir: Annotated[str, typer.Argument(help="Path to results directory")] = "results",
+    preset: Annotated[str | None, typer.Option(help="Filter to a specific preset")] = None,
+    model_tag: Annotated[str | None, typer.Option(help="Filter to a specific model tag")] = None,
+    output_dir: Annotated[
+        str | None, typer.Option(help="Output directory (default: <results_dir>/loss_leaderboard)")
+    ] = None,
+):
+    """Generate loss leaderboards from benchmark result files."""
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    results_path = Path(results_dir)
+    if not results_path.is_dir():
+        raise typer.BadParameter(f"Results directory not found: {results_dir}")
+
+    out_path = Path(output_dir) if output_dir else results_path / "loss_leaderboard"
+
+    groups = discover_results(results_path)
+    if not groups:
+        logger.info("No result files found in %s", results_dir)
+        raise typer.Exit()
+
+    # Collect all (preset, model_tag) combinations
+    combos = sorted({(p, m) for p, m, _ in groups})
+
+    if preset:
+        combos = [(p, m) for p, m in combos if p == preset]
+    if model_tag:
+        combos = [(p, m) for p, m in combos if m == model_tag]
+
+    if not combos:
+        logger.info("No matching results for preset=%s model_tag=%s", preset, model_tag)
+        raise typer.Exit()
+
+    generated = 0
+    for p, m in combos:
+        board = build_leaderboard(groups, p, m)
+        if not board["leaderboard"]:
+            continue
+
+        dest = out_path / p / f"{m}.json"
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        with open(dest, "w") as f:
+            json.dump(board, f, indent=2)
+            f.write("\n")
+
+        n_methods = len(board["leaderboard"])
+        logger.info(" %s/%s.json (%d methods)", p, m, n_methods)
+        generated += 1
+
+    logger.info("Generated %d leaderboard(s) in %s", generated, out_path)
+
+
+if __name__ == "__main__":
+    app()
diff --git a/claudini/run_bench.py b/claudini/run_bench.py
index c799509..d9b95ec 100644
--- a/claudini/run_bench.py
+++ b/claudini/run_bench.py
@@ -43,7 +43,7 @@ logger = logging.getLogger("claudini")
 app = typer.Typer(add_completion=False)
 
 
-def _build_input_spec(preset_cfg: dict) -> InputSpec:
+def build_input_spec(preset_cfg: dict) -> InputSpec:
     """Build InputSpec from preset YAML config."""
     if "input_spec" in preset_cfg:
         return InputSpec.from_dict(preset_cfg["input_spec"])
@@ -125,7 +125,7 @@ def run_bench(
 
     model_name = model if model is not None else preset_cfg.get("model", "gpt2")
 
     # Build InputSpec from preset config
-    input_spec = _build_input_spec(preset_cfg)
+    input_spec = build_input_spec(preset_cfg)
 
     method_kwargs = preset_cfg.get("method_kwargs", {})
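
For illustration, a minimal sketch of reading one generated leaderboard file downstream. It assumes `uv run -m claudini.leaderboard results/` has already been run and that a board exists for the `random_valid` preset and `Qwen2.5-7B-Instruct` model tag (the example values from the module docstring); the field names follow the `Leaderboard` and `MethodEntry` TypedDicts above.

```python
# Consumer sketch: load one generated leaderboard JSON and print the ranking.
# The preset/model-tag pair below is hypothetical; adjust to your own results.
import json
from pathlib import Path

board_path = Path("results/loss_leaderboard/random_valid/Qwen2.5-7B-Instruct.json")
board = json.loads(board_path.read_text())

print(f"{board['preset']} on {board['model']} (max FLOPs: {board['max_flops']})")
for entry in board["leaderboard"]:
    # Entries arrive pre-sorted by avg_loss ascending, with rank already filled in.
    print(
        f"{entry['rank']:>3}. {entry['method']:<24}"
        f" avg={entry['avg_loss']:.4f}"
        f" std={entry['std_loss']:.4f}"
        f" min={entry['min_loss']:.4f}"
        f" (n={entry['num_runs']})"
    )
```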