"""Tests for the community-standard (Heretics/Arditi) evaluation protocol."""

from __future__ import annotations

import pytest
import torch

from obliteratus.evaluation.heretic_eval import (
    ARDITI_REFUSAL_SUBSTRINGS,
    _DEFAULT_HARMLESS_PROMPTS,
    _HARMBENCH_CLS_TEMPLATE,
    _JAILBREAKBENCH_BUILTIN,
    _extract_score,
    _last_real_token_indices,
    _sanitize_model_path,
    arditi_refusal_rate,
    first_token_kl_on_prompts,
    format_comparison_table,
    HereticComparisonResult,
    LM_EVAL_BENCHMARKS,
    load_jailbreakbench_prompts,
    unload_harmbench_classifier,
)


# ---------------------------------------------------------------------------
# JailbreakBench Prompts
# ---------------------------------------------------------------------------

class TestJailbreakBenchPrompts:
    def test_builtin_has_100_prompts(self):
        assert len(_JAILBREAKBENCH_BUILTIN) == 100

    def test_builtin_all_non_empty(self):
        for i, p in enumerate(_JAILBREAKBENCH_BUILTIN):
            assert isinstance(p, str) and len(p.strip()) > 10, f"Prompt {i} is too short"

    def test_builtin_no_duplicates(self):
        assert len(set(_JAILBREAKBENCH_BUILTIN)) == len(_JAILBREAKBENCH_BUILTIN)

    def test_load_returns_list(self):
        prompts = load_jailbreakbench_prompts()
        assert isinstance(prompts, list)
        assert len(prompts) >= 100

    def test_load_caches(self):
        p1 = load_jailbreakbench_prompts()
        p2 = load_jailbreakbench_prompts()
        assert p1 == p2

    def test_load_returns_copy(self):
        """Loading should return a copy, not the cached list itself."""
        p1 = load_jailbreakbench_prompts()
        p1.append("mutated")
        p2 = load_jailbreakbench_prompts()
        assert "mutated" not in p2


# ---------------------------------------------------------------------------
# Arditi Refusal Rate
# ---------------------------------------------------------------------------

class TestArditiRefusalRate:
    def test_canonical_substring_count(self):
        """Arditi/Zou list has exactly 29 substrings."""
        assert len(ARDITI_REFUSAL_SUBSTRINGS) == 29

    def test_all_refusals(self):
        responses = [
            "I'm sorry, but I cannot help with that.",
            "I apologize for the inconvenience.",
            "As an AI, I don't do that.",
        ]
        assert arditi_refusal_rate(responses) == 1.0

    def test_no_refusals(self):
        responses = [
            "Sure, here is a step-by-step guide.",
            "The answer to your question is 42.",
            "Here you go, this should work.",
        ]
        assert arditi_refusal_rate(responses) == 0.0

    def test_empty_responses_not_refusals(self):
        """Per Zou et al. logic, empty responses have no substring match = not refusals."""
        responses = ["", " ", ""]
        assert arditi_refusal_rate(responses) == 0.0

    def test_empty_list(self):
        assert arditi_refusal_rate([]) == 0.0

    def test_case_insensitive(self):
        responses = ["i'm sorry, but I can't help."]
        assert arditi_refusal_rate(responses) == 1.0

    def test_mixed(self):
        responses = [
            "I cannot do that.",
            "Sure, here's the info.",
            "My apologies, I'm unable to help.",
            "The capital of France is Paris.",
        ]
        rate = arditi_refusal_rate(responses)
        assert rate == 0.5  # 2/4

    def test_subtle_refusal_not_caught(self):
        """Arditi list is deliberately limited — subtle refusals should not match."""
        responses = [
            "I'd rather not help with that specific request.",
            "Let me redirect you to a more appropriate resource.",
        ]
        # These are NOT in the Arditi 29-substring list
        assert arditi_refusal_rate(responses) == 0.0
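

# Illustrative sketch only (an assumption about the matching logic the tests
# above exercise, not the library's implementation): a response counts as a
# refusal if any canonical Arditi/Zou substring appears in it, compared
# case-insensitively, and the rate is the fraction of refusing responses
# (0.0 for an empty list).
def _reference_refusal_rate(responses: list[str]) -> float:
    if not responses:
        return 0.0
    hits = sum(
        1
        for response in responses
        if any(s.lower() in response.lower() for s in ARDITI_REFUSAL_SUBSTRINGS)
    )
    return hits / len(responses)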


# ---------------------------------------------------------------------------
# First-Token KL Divergence
# ---------------------------------------------------------------------------

class TestFirstTokenKL:
    @pytest.fixture
    def simple_models(self):
        """Create two tiny 'models' with controllable logit distributions."""

        class FakeModel(torch.nn.Module):
            def __init__(self, peak_idx: int = 0):
                super().__init__()
                # A single parameter so next(model.parameters()).device works
                self._param = torch.nn.Parameter(torch.zeros(1))
                self._peak_idx = peak_idx

            def __call__(self, **kwargs):
                batch_size = kwargs["input_ids"].shape[0]
                seq_len = kwargs["input_ids"].shape[1]
                vocab_size = 10
                # Create a non-uniform distribution peaked at _peak_idx
                base = torch.zeros(vocab_size)
                base[self._peak_idx] = 5.0
                logits = base.unsqueeze(0).unsqueeze(0).expand(
                    batch_size, seq_len, vocab_size
                ).clone()
                return type("Output", (), {"logits": logits})()

        class FakeTokenizer:
            pad_token_id = 0

            def __call__(self, texts, return_tensors="pt", **kwargs):
                batch_size = len(texts) if isinstance(texts, list) else 1
                input_ids = torch.ones(batch_size, 5, dtype=torch.long)
                return {"input_ids": input_ids, "attention_mask": torch.ones_like(input_ids)}

        return FakeModel, FakeTokenizer

    def test_identical_models_zero_kl(self, simple_models):
        FakeModel, FakeTokenizer = simple_models
        model_a = FakeModel(peak_idx=0)
        model_b = FakeModel(peak_idx=0)
        tokenizer = FakeTokenizer()

        result = first_token_kl_on_prompts(
            model_a, model_b, tokenizer,
            ["hello", "world"],
        )
        assert abs(result["mean_kl"]) < 1e-5
        assert result["interpretation"] == "excellent (minimal collateral damage)"

    def test_different_models_positive_kl(self, simple_models):
        FakeModel, FakeTokenizer = simple_models
        model_a = FakeModel(peak_idx=0)  # peaked at vocab position 0
        model_b = FakeModel(peak_idx=5)  # peaked at vocab position 5
        tokenizer = FakeTokenizer()

        result = first_token_kl_on_prompts(
            model_a, model_b, tokenizer,
            ["test prompt"],
        )
        assert result["mean_kl"] > 0

    def test_returns_per_prompt_kl(self, simple_models):
        FakeModel, FakeTokenizer = simple_models
        model_a = FakeModel(peak_idx=0)
        model_b = FakeModel(peak_idx=3)
        tokenizer = FakeTokenizer()

        result = first_token_kl_on_prompts(
            model_a, model_b, tokenizer,
            ["a", "b", "c"],
        )
        assert len(result["per_prompt_kl"]) == 3
        assert result["std_kl"] >= 0


# ---------------------------------------------------------------------------
# HereticComparisonResult
# ---------------------------------------------------------------------------

class TestHereticComparisonResult:
    def test_dataclass_fields(self):
        r = HereticComparisonResult(
            model_name="test-model",
            method="OBLITERATUS",
            refusal_rate_arditi=0.05,
            refusal_rate_obliteratus=0.03,
            harmbench_asr=0.85,
            n_jailbreakbench=100,
            n_refusals_remaining=5,
            first_token_kl=0.15,
            kl_interpretation="excellent",
        )
        assert r.model_name == "test-model"
        assert r.method == "OBLITERATUS"
        assert r.refusal_rate_arditi == 0.05
        assert r.harmbench_asr == 0.85
        assert r.first_token_kl == 0.15

    def test_optional_fields_default_none(self):
        r = HereticComparisonResult(
            model_name="test",
            method="test",
            refusal_rate_arditi=0.0,
            refusal_rate_obliteratus=0.0,
            harmbench_asr=None,
            n_jailbreakbench=100,
            n_refusals_remaining=0,
        )
        assert r.mmlu is None
        assert r.gsm8k is None
        assert r.perplexity is None
        assert r.harmbench_per_item == []
        assert r.kl_per_prompt == []


# ---------------------------------------------------------------------------
# Comparison Table Formatting
# ---------------------------------------------------------------------------

class TestComparisonTable:
    def test_format_single_result(self):
        r = HereticComparisonResult(
            model_name="Llama-2-7B",
            method="OBLITERATUS",
            refusal_rate_arditi=0.05,
            refusal_rate_obliteratus=0.03,
            harmbench_asr=0.85,
            n_jailbreakbench=100,
            n_refusals_remaining=5,
            first_token_kl=0.15,
            kl_interpretation="excellent",
            mmlu=0.518,
            gsm8k=0.313,
        )
        table = format_comparison_table([r])
        assert "OBLITERATUS" in table
        assert "REFUSAL REMOVAL" in table
        assert "CAPABILITY PRESERVATION" in table
        assert "DISTRIBUTION QUALITY" in table
        assert "5.0%" in table  # arditi refusal rate
        assert "85.0%" in table  # harmbench asr
        assert "5/100" in table  # JBB refusals
        assert "0.1500" in table  # KL divergence

    def test_format_multiple_results(self):
        results = [
            HereticComparisonResult(
                model_name="test", method="OBLITERATUS",
                refusal_rate_arditi=0.05, refusal_rate_obliteratus=0.03,
                harmbench_asr=0.85, n_jailbreakbench=100, n_refusals_remaining=5,
            ),
            HereticComparisonResult(
                model_name="test", method="Heretic",
                refusal_rate_arditi=0.03, refusal_rate_obliteratus=0.03,
                harmbench_asr=0.90, n_jailbreakbench=100, n_refusals_remaining=3,
            ),
        ]
        table = format_comparison_table(results)
        assert "OBLITERATUS" in table
        assert "Heretic" in table

    def test_heretic_reference_numbers_present(self):
        """The comparison table should include Heretic's published reference numbers."""
        table = format_comparison_table([
            HereticComparisonResult(
                model_name="test", method="test",
                refusal_rate_arditi=0.0, refusal_rate_obliteratus=0.0,
                harmbench_asr=None, n_jailbreakbench=100, n_refusals_remaining=0,
            ),
        ])
        assert "p-e-w/heretic" in table
        assert "97/100" in table  # Original model refusal count
        assert "0.16" in table  # Heretic's KL divergence


# ---------------------------------------------------------------------------
# LM-Eval Benchmark Config
# ---------------------------------------------------------------------------

class TestLmEvalBenchmarks:
    def test_five_standard_benchmarks(self):
        assert len(LM_EVAL_BENCHMARKS) == 5

    def test_required_keys(self):
        for key, config in LM_EVAL_BENCHMARKS.items():
            assert "task" in config
            assert "num_fewshot" in config
            assert "description" in config

    def test_expected_benchmarks(self):
        expected = {"mmlu", "gsm8k", "arc_challenge", "hellaswag", "truthfulqa_mc2"}
        assert set(LM_EVAL_BENCHMARKS.keys()) == expected

    def test_fewshot_settings_match_arditi(self):
        """Few-shot settings should match Arditi et al. / Open LLM Leaderboard."""
        assert LM_EVAL_BENCHMARKS["mmlu"]["num_fewshot"] == 5
        assert LM_EVAL_BENCHMARKS["gsm8k"]["num_fewshot"] == 5
        assert LM_EVAL_BENCHMARKS["truthfulqa_mc2"]["num_fewshot"] == 0


# ---------------------------------------------------------------------------
# HarmBench Template
# ---------------------------------------------------------------------------

class TestHarmBenchTemplate:
    def test_template_has_sys_tags(self):
        """Official template requires <<SYS>> / <</SYS>> delimiters."""
        assert "<<SYS>>" in _HARMBENCH_CLS_TEMPLATE
        assert "<</SYS>>" in _HARMBENCH_CLS_TEMPLATE

    def test_template_has_seven_rules(self):
        """Official HarmBench template has exactly 7 rules."""
        rule_count = _HARMBENCH_CLS_TEMPLATE.count("\n- ")
        assert rule_count == 7

    def test_template_has_behavior_placeholder(self):
        assert "{behavior}" in _HARMBENCH_CLS_TEMPLATE

    def test_template_has_generation_placeholder(self):
        assert "{generation}" in _HARMBENCH_CLS_TEMPLATE

    def test_template_ends_with_inst_tag(self):
        assert _HARMBENCH_CLS_TEMPLATE.strip().endswith("[/INST]")

    def test_template_answer_format(self):
        """Official template uses lowercase 'yes' or 'no'."""
        assert '"yes" or "no"' in _HARMBENCH_CLS_TEMPLATE


# ---------------------------------------------------------------------------
# Score Extraction (C3 fix)
# ---------------------------------------------------------------------------

class TestExtractScore:
    def test_normal_score(self):
        assert _extract_score({"acc,none": 0.75}) == 0.75

    def test_zero_score_not_skipped(self):
        """A legitimate score of 0.0 should be returned, not treated as falsy."""
        assert _extract_score({"acc,none": 0.0}) == 0.0

    def test_fallback_to_next_key(self):
        assert _extract_score({"acc_norm,none": 0.65}) == 0.65

    def test_mc2_key(self):
        assert _extract_score({"mc2,none": 0.42}) == 0.42

    def test_no_matching_key(self):
        assert _extract_score({"unknown_metric": 0.99}) == 0.0

    def test_priority_order(self):
        """acc,none should take priority over acc_norm,none."""
        result = _extract_score({"acc,none": 0.5, "acc_norm,none": 0.9})
        assert result == 0.5
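

# Illustrative sketch only (a hypothetical reference consistent with the tests
# above, not the implementation under test): metric keys are tried in priority
# order, a present key wins even when its value is 0.0, and 0.0 is returned
# only when no known key exists.
def _reference_extract_score(metrics: dict) -> float:
    for key in ("acc,none", "acc_norm,none", "mc2,none"):
        if key in metrics:
            return float(metrics[key])
    return 0.0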


# ---------------------------------------------------------------------------
# Padding-Aware Last-Token Indices
# ---------------------------------------------------------------------------

class TestLastRealTokenIndices:
    def test_no_padding(self):
        mask = torch.ones(3, 5, dtype=torch.long)
        indices = _last_real_token_indices(mask)
        assert indices.tolist() == [4, 4, 4]

    def test_with_padding(self):
        mask = torch.tensor([
            [1, 1, 1, 1, 1],  # length 5, last real = index 4
            [1, 1, 1, 0, 0],  # length 3, last real = index 2
            [1, 0, 0, 0, 0],  # length 1, last real = index 0
        ])
        indices = _last_real_token_indices(mask)
        assert indices.tolist() == [4, 2, 0]

    def test_single_token(self):
        mask = torch.tensor([[1]])
        indices = _last_real_token_indices(mask)
        assert indices.tolist() == [0]
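

# Illustrative sketch only (an assumption about the arithmetic, not the tested
# helper itself): with right-padding, the row-wise sum of the attention mask is
# each prompt's real length, so subtracting one yields the index of the last
# real token in every row.
def _reference_last_real_token_indices(attention_mask: torch.Tensor) -> torch.Tensor:
    return attention_mask.sum(dim=1) - 1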


# ---------------------------------------------------------------------------
# Model Path Sanitization
# ---------------------------------------------------------------------------

class TestSanitizeModelPath:
    def test_normal_path(self):
        assert _sanitize_model_path("/tmp/my-model") == "/tmp/my-model"

    def test_hf_model_id(self):
        assert _sanitize_model_path("meta-llama/Llama-2-7b-hf") == "meta-llama/Llama-2-7b-hf"

    def test_rejects_commas(self):
        with pytest.raises(ValueError, match="commas"):
            _sanitize_model_path("evil,trust_remote_code=True")


# ---------------------------------------------------------------------------
# Classifier Unload
# ---------------------------------------------------------------------------

class TestClassifierUnload:
    def test_unload_when_not_loaded(self):
        """Unloading when nothing is loaded should not raise."""
        unload_harmbench_classifier()  # should be a no-op


# ---------------------------------------------------------------------------
# Default Harmless Prompts
# ---------------------------------------------------------------------------

class TestDefaultHarmlessPrompts:
    def test_has_100_unique_prompts(self):
        assert len(_DEFAULT_HARMLESS_PROMPTS) == 100

    def test_no_duplicates(self):
        assert len(set(_DEFAULT_HARMLESS_PROMPTS)) == len(_DEFAULT_HARMLESS_PROMPTS)

    def test_all_non_empty(self):
        for i, p in enumerate(_DEFAULT_HARMLESS_PROMPTS):
            assert isinstance(p, str) and len(p) > 10, f"Prompt {i} is too short"


# ---------------------------------------------------------------------------
# KL Divergence Non-Negativity
# ---------------------------------------------------------------------------

class TestKLNonNegativity:
    @pytest.fixture
    def models_and_tokenizer(self):
        class FakeModel(torch.nn.Module):
            def __init__(self, peak_idx: int = 0):
                super().__init__()
                self._param = torch.nn.Parameter(torch.zeros(1))
                self._peak_idx = peak_idx

            def __call__(self, **kwargs):
                batch_size = kwargs["input_ids"].shape[0]
                seq_len = kwargs["input_ids"].shape[1]
                vocab_size = 10
                base = torch.zeros(vocab_size)
                base[self._peak_idx] = 5.0
                logits = base.unsqueeze(0).unsqueeze(0).expand(
                    batch_size, seq_len, vocab_size
                ).clone()
                return type("Output", (), {"logits": logits})()

        class FakeTokenizer:
            pad_token_id = 0

            def __call__(self, texts, return_tensors="pt", **kwargs):
                batch_size = len(texts) if isinstance(texts, list) else 1
                input_ids = torch.ones(batch_size, 5, dtype=torch.long)
                return {"input_ids": input_ids, "attention_mask": torch.ones_like(input_ids)}

        return FakeModel, FakeTokenizer

    def test_all_kl_values_non_negative(self, models_and_tokenizer):
        FakeModel, FakeTokenizer = models_and_tokenizer
        model_a = FakeModel(peak_idx=0)
        model_b = FakeModel(peak_idx=3)
        tokenizer = FakeTokenizer()

        result = first_token_kl_on_prompts(
            model_a, model_b, tokenizer,
            ["a", "b", "c", "d", "e"],
        )
        for val in result["per_prompt_kl"]:
            assert val >= 0.0, f"KL value {val} is negative"