mirror of
https://github.com/dongdongunique/EvoSynth.git
synced 2026-05-31 20:31:35 +02:00
131 lines
4.6 KiB
Python
131 lines
4.6 KiB
Python
# jailbreak_toolbox/evaluators/implementations/grouped_evaluator.py
|
|
from typing import List, Dict, Any, Optional, Callable
|
|
from collections import defaultdict
|
|
from ..multi_thread_evaluator import MultiThreadedEvaluator
|
|
from ...attacks.base_attack import AttackResult
|
|
from jailbreak_toolbox.evaluators.base_evaluator import EvaluationMetrics
|
|
from ...core.registry import evaluator_registry
|
|
from jailbreak_toolbox.judges.base_judge import BaseJudge
|
|
|
|
@evaluator_registry.register("grouped")
class GroupedEvaluator(MultiThreadedEvaluator):
    """
    Evaluator that groups results by a specified attribute (e.g., attack method).

    This allows comparing the effectiveness of different attack methods:
    every group is evaluated separately through the parent evaluator, and an
    overall attack success rate is derived from the per-group rates.
    """

    def __init__(
        self,
        group_by: str = "method",
        judge: Optional[BaseJudge] = None,
        max_workers: int = 5,
        **kwargs
    ):
        """
        Initialize the grouped evaluator.

        Args:
            group_by: Name of the attack result attribute (or history /
                original-record key) to group by
            judge: Judge to evaluate responses; required despite the
                ``None`` default
            max_workers: Maximum number of worker threads
            **kwargs: Forwarded unchanged to the parent evaluator

        Raises:
            ValueError: If no judge is provided
        """
        # Validate explicitly instead of with `assert`, which is stripped when
        # Python runs with -O and would let a judge-less evaluator through.
        if judge is None:
            raise ValueError("Judge must be provided")

        super().__init__(max_workers=max_workers, **kwargs)
        self.group_by = group_by
        self.judge = judge

    def evaluate(self, results: List[AttackResult]) -> Dict[str, Any]:
        """
        Evaluate attack results grouped by the specified attribute.

        Args:
            results: List of attack results to evaluate

        Returns:
            Dictionary containing:
            - overall: Overall evaluation metrics. Only the success rate is
              computed; ``average_attack_score`` is always 0.0.
            - by_group: Mapping of group identifier to the metrics returned
              by the parent evaluator for that group
        """
        if not results:
            return {
                "overall": EvaluationMetrics(
                    attack_success_rate=0.0,
                    average_attack_score=0.0,
                ),
                "by_group": {},
            }

        # Bucket the results by their group identifier.
        grouped_results = defaultdict(list)
        for result in results:
            grouped_results[self._get_group(result)].append(result)

        # Evaluate each group with the parent implementation. A plain loop is
        # used on purpose: zero-argument super() is not reliable inside
        # comprehensions on older Python versions.
        group_metrics = {}
        overall_success_count = 0
        for group, group_results in grouped_results.items():
            metrics = super().evaluate(group_results)
            group_metrics[group] = metrics
            # Turn the group's rate back into a (possibly fractional) count so
            # the overall rate is weighted by group size.
            overall_success_count += (
                metrics.attack_success_rate * len(group_results)
            )

        overall_success_rate = overall_success_count / len(results)

        return {
            "overall": EvaluationMetrics(
                attack_success_rate=overall_success_rate,
                average_attack_score=0.0,  # Not calculated for the overall view
            ),
            "by_group": group_metrics,
        }

    def _get_group(self, result: AttackResult) -> str:
        """
        Get the group identifier for a result.

        The ``group_by`` name is looked up, in order, as:
        1. an attribute of the result itself (used only when truthy),
        2. a key of the first history entry, when that entry is a dict,
        3. a key of the result's ``original_record``, when present.

        Args:
            result: Attack result

        Returns:
            Group identifier as a string, or "unknown" if none was found
        """
        # Honour the configured attribute rather than a hard-coded "method";
        # for the default group_by="method" this is the same lookup.
        direct_value = getattr(result, self.group_by, None)
        if direct_value:
            return str(direct_value)

        # Fall back to the first history entry.
        history = getattr(result, "history", None)
        if history:
            first_entry = history[0]
            if isinstance(first_entry, dict) and self.group_by in first_entry:
                return str(first_entry[self.group_by])

        # Fall back to the original record; it may be missing or None.
        record = getattr(result, "original_record", None)
        if record is not None and self.group_by in record:
            return str(record[self.group_by])

        # Default to unknown group
        return "unknown"

    def _evaluate_single(self, result: AttackResult) -> tuple:
        """
        Evaluate a single attack result.

        The result's own success flag is not consulted; the judge's verdict
        is the only source of success.

        Args:
            result: Attack result to evaluate

        Returns:
            Tuple of (success, score). ``score`` is whatever the judge
            returned as scores, or 0.0 when that value is falsy.
        """
        # Without a usable judge nothing can be scored.
        if not self.judge:
            return (False, 0.0)

        # NOTE(review): the exact shape of `scores` depends on the judge
        # implementation - confirm against BaseJudge.judge_prompt.
        scores, _ = self.judge.judge_prompt([result.target], result.output_text)
        success = self.judge.is_success(scores)
        score = scores if scores else 0.0
        return (success, score)