"""Tests for lightweight benchmark harnesses."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import MagicMock
|
|
|
|
import torch
|
|
|
|
from obliteratus.evaluation.benchmarks import (
|
|
KNOWLEDGE_ITEMS,
|
|
TRUTHFULNESS_ITEMS,
|
|
MATH_REASONING_ITEMS,
|
|
BenchmarkRunner,
|
|
BenchmarkResult,
|
|
format_benchmark_report,
|
|
)
|
|
|
|
|
|
def _make_mock_model_and_tokenizer(vocab_size=1000, hidden_dim=64):
|
|
"""Create mock model and tokenizer for benchmark testing."""
|
|
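    # Interface the mocks assume of the real objects (as exercised below):
    # model(input_ids=...) returns an object with .logits, model.generate(...)
    # returns token IDs, model.parameters() supports device detection, and the
    # tokenizer is callable plus provides .decode / .encode.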
    model = MagicMock()

    # The model returns logits when called. A MagicMock routes calls through
    # side_effect; assigning model.__call__ directly would have no effect,
    # since dunder lookup happens on the type rather than the instance.
    def mock_forward(**kwargs):
        input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10)))
        batch_size, seq_len = input_ids.shape
        result = MagicMock()
        result.logits = torch.randn(batch_size, seq_len, vocab_size)
        return result

    model.side_effect = mock_forward

    # model.generate returns the prompt with extra "generated" tokens appended.
    def mock_generate(**kwargs):
        input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10)))
        gen_tokens = torch.randint(0, vocab_size, (1, 20))
        return torch.cat([input_ids, gen_tokens], dim=1)

    model.generate = mock_generate

    # model.parameters() is used for device detection. A side_effect builds a
    # fresh iterator per call; return_value=iter([param]) would be exhausted
    # after the first call and silently yield nothing afterwards.
    param = torch.nn.Parameter(torch.randn(1))
    model.parameters = MagicMock(side_effect=lambda: iter([param]))

    # side_effect takes precedence over return_value on a MagicMock, so one
    # side_effect is enough to configure every tokenizer(text, ...) call.
    tokenizer = MagicMock()
    tokenizer.side_effect = lambda text, **kwargs: {
        "input_ids": torch.randint(0, vocab_size, (1, 15)),
        "attention_mask": torch.ones(1, 15, dtype=torch.long),
    }

    def mock_decode(ids, **kwargs):
        # Fixed response regardless of ids; it contains "42" so the math
        # probe's number extraction always has something to parse.
        return "The answer is 42. This is a generated response about the topic."

    def mock_encode(text, **kwargs):
        # Give the multiple-choice letters distinct, stable token IDs (their
        # ASCII codes); hash anything else into the vocab range.
        choice_ids = {"A": [65], "B": [66], "C": [67], "D": [68]}
        if text in choice_ids:
            return choice_ids[text]
        return [hash(text) % vocab_size]

    tokenizer.decode = mock_decode
    tokenizer.encode = mock_encode

    return model, tokenizer


class TestBenchmarkItems:
    """Schema and coverage checks for the built-in benchmark item sets."""

    def test_knowledge_items_have_required_fields(self):
        for item in KNOWLEDGE_ITEMS:
            assert "q" in item
            assert "choices" in item
            assert "answer" in item
            assert "category" in item
            assert 0 <= item["answer"] < len(item["choices"])

    def test_knowledge_items_count(self):
        assert len(KNOWLEDGE_ITEMS) >= 20

    def test_knowledge_categories(self):
        categories = {item["category"] for item in KNOWLEDGE_ITEMS}
        assert len(categories) >= 4  # spans multiple categories

    def test_truthfulness_items_have_required_fields(self):
        for item in TRUTHFULNESS_ITEMS:
            assert "q" in item
            assert "true_answer" in item
            assert "common_false" in item
            assert "category" in item

    def test_truthfulness_items_count(self):
        assert len(TRUTHFULNESS_ITEMS) >= 10

    def test_math_items_have_required_fields(self):
        for item in MATH_REASONING_ITEMS:
            assert "q" in item
            assert "answer" in item
            assert "category" in item
            assert isinstance(item["answer"], (int, float))

    def test_math_items_count(self):
        assert len(MATH_REASONING_ITEMS) >= 10


class TestBenchmarkRunner:
    """End-to-end checks of BenchmarkRunner against the mocked model."""

    def test_knowledge_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_knowledge_probe()

        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "knowledge_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(KNOWLEDGE_ITEMS)
        assert result.n_correct >= 0
        assert len(result.per_category) > 0
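
    # Together these assertions sketch BenchmarkResult's implied surface:
    # a benchmark_name, a score in [0, 1], n_correct / n_total counts, and a
    # per_category breakdown (inferred from usage here, not from the dataclass).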

    def test_truthfulness_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_truthfulness_probe()

        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "truthfulness_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(TRUTHFULNESS_ITEMS)

    def test_math_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_math_reasoning_probe()

        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "math_reasoning_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(MATH_REASONING_ITEMS)

    def test_run_all(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        results = runner.run_all()

        assert "knowledge" in results
        assert "truthfulness" in results
        assert "math_reasoning" in results

    def test_format_report(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        results = runner.run_all()
        report = format_benchmark_report(results)

        assert "Capability" in report
        assert "knowledge" in report
        assert "truthfulness" in report
        assert "math" in report

    def test_per_category_scores_bounded(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_knowledge_probe()

        for score in result.per_category.values():
            assert 0 <= score <= 1.0

    def test_extract_number(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")

        assert runner._extract_number("The answer is 42.") == 42.0
        assert runner._extract_number("$20.50 is the price") == 20.50
        assert runner._extract_number("Result: -3.14") == -3.14
        assert runner._extract_number("No numbers here") is None