Files
EvoSynth/jailbreak_toolbox/evaluators/implementations/grouped_evaluator.py
T
dongdongunique f3af94df0b first commit
2025-12-10 00:54:02 +08:00

131 lines
4.6 KiB
Python

# jailbreak_toolbox/evaluators/implementations/grouped_evaluator.py
from typing import List, Dict, Any, Optional, Callable
from collections import defaultdict
from ..multi_thread_evaluator import MultiThreadedEvaluator
from ...attacks.base_attack import AttackResult
from jailbreak_toolbox.evaluators.base_evaluator import EvaluationMetrics
from ...core.registry import evaluator_registry
from jailbreak_toolbox.judges.base_judge import BaseJudge
@evaluator_registry.register("grouped")
class GroupedEvaluator(MultiThreadedEvaluator):
    """
    Evaluator that groups results by a specified attribute (e.g., attack method).

    Each group is evaluated separately through the parent evaluator, which
    allows comparing the effectiveness of different attack methods.
    """

    def __init__(
        self,
        group_by: str = "method",
        judge: Optional[BaseJudge] = None,
        max_workers: int = 5,
        **kwargs
    ):
        """
        Initialize the grouped evaluator.

        Args:
            group_by: Name of the attack-result attribute (or history /
                original-record key) to group by.
            judge: Judge used to score responses. Required, despite the
                ``None`` default kept for signature compatibility.
            max_workers: Maximum number of worker threads, forwarded to the
                parent evaluator.
            **kwargs: Extra keyword arguments forwarded to the parent
                evaluator.

        Raises:
            ValueError: If no judge is provided.
        """
        super().__init__(max_workers=max_workers, **kwargs)
        self.group_by = group_by
        self.judge = judge
        # Explicit raise instead of ``assert``: asserts are stripped when
        # Python runs with -O, which would silently skip this validation.
        if self.judge is None:
            raise ValueError("Judge must be provided")

    def evaluate(self, results: List[AttackResult]) -> Dict[str, Any]:
        """
        Evaluate attack results grouped by the configured attribute.

        Args:
            results: List of attack results to evaluate.

        Returns:
            Dictionary containing:
                - overall: EvaluationMetrics whose success rate is the
                  size-weighted combination of the per-group rates. Its
                  average_attack_score is always 0.0 (not computed).
                - by_group: Mapping of group identifier to the metrics
                  returned by the parent evaluator for that group.
        """
        if not results:
            return {
                "overall": EvaluationMetrics(attack_success_rate=0.0, average_attack_score=0.0),
                "by_group": {}
            }

        # Bucket the results by their group identifier.
        grouped_results = defaultdict(list)
        for result in results:
            grouped_results[self._get_group(result)].append(result)

        # Evaluate each group with the parent implementation and recover the
        # number of successes from each group's rate, so the overall rate
        # does not require judging any result a second time.
        group_metrics = {}
        weighted_successes = 0.0
        for group, group_results in grouped_results.items():
            metrics = super().evaluate(group_results)
            group_metrics[group] = metrics
            weighted_successes += metrics.attack_success_rate * len(group_results)

        overall_success_rate = weighted_successes / len(results)
        return {
            "overall": EvaluationMetrics(
                attack_success_rate=overall_success_rate,
                average_attack_score=0.0  # We don't calculate this for overall
            ),
            "by_group": group_metrics
        }

    def _get_group(self, result: AttackResult) -> str:
        """
        Get the group identifier for a result.

        Lookup order, all keyed on ``self.group_by``:
            1. a truthy attribute of that name on the result itself,
            2. that key in the first history entry (when it is a dict),
            3. that key in ``result.original_record``.

        Args:
            result: Attack result.

        Returns:
            Group identifier as a string, or "unknown" when none is found.
        """
        # Honour the configured attribute name rather than a hard-coded
        # "method", so a non-default ``group_by`` is actually respected.
        direct_value = getattr(result, self.group_by, None)
        if direct_value:
            return str(direct_value)

        # Fall back to the first history entry.
        history = getattr(result, "history", None)
        if history:
            first_entry = history[0]
            if isinstance(first_entry, dict) and self.group_by in first_entry:
                return str(first_entry[self.group_by])

        # Finally try the original record; it may be missing or None.
        original_record = getattr(result, "original_record", None)
        if original_record and self.group_by in original_record:
            return str(original_record[self.group_by])

        # Default to unknown group
        return "unknown"

    def _evaluate_single(self, result: AttackResult) -> tuple:
        """
        Evaluate a single attack result with the configured judge.

        The result's own success flag is deliberately not consulted; the
        judge's verdict decides success.

        Args:
            result: Attack result to evaluate.

        Returns:
            Tuple of (success, score). Without a usable judge this is
            (False, 0.0); otherwise the score is whatever the judge returned,
            or 0.0 when that value is falsy.
        """
        if not self.judge:
            return (False, 0.0)

        scores, _ = self.judge.judge_prompt([result.target], result.output_text)
        success = self.judge.is_success(scores)
        score = scores if scores else 0.0
        return (success, score)