EvoSynth/jailbreak_toolbox/evaluators/implementations/grouped_evaluator.py

# jailbreak_toolbox/evaluators/implementations/grouped_evaluator.py
from typing import List, Dict, Any, Optional, Callable
from collections import defaultdict
from ..multi_thread_evaluator import MultiThreadedEvaluator
from ...attacks.base_attack import AttackResult
from jailbreak_toolbox.evaluators.base_evaluator import EvaluationMetrics
from ...core.registry import evaluator_registry
from jailbreak_toolbox.judges.base_judge import BaseJudge

@evaluator_registry.register("grouped")
class GroupedEvaluator(MultiThreadedEvaluator):
    """
    Evaluator that groups results by a specified attribute (e.g., attack method).

    This allows comparing the effectiveness of different attack methods.
    """

    def __init__(
        self,
        group_by: str = "method",
        judge: BaseJudge = None,
        max_workers: int = 5,
        **kwargs
    ):
        """
        Initialize the grouped evaluator.

        Args:
            group_by: Attack result attribute to group by
            judge: Judge to evaluate responses
            max_workers: Maximum number of worker threads
        """
        super().__init__(max_workers=max_workers, **kwargs)
        self.group_by = group_by
        self.judge = judge
        assert self.judge is not None, "Judge must be provided"

    def evaluate(self, results: List[AttackResult]) -> Dict[str, Any]:
        """
        Evaluate attack results grouped by the specified attribute.

        Args:
            results: List of attack results to evaluate

        Returns:
            Dictionary containing:
            - overall: Overall evaluation metrics
            - by_group: Metrics broken down by group
        """
        if not results:
            return {
                "overall": EvaluationMetrics(attack_success_rate=0.0, average_attack_score=0.0),
                "by_group": {}
            }

        # Group results
        grouped_results = defaultdict(list)
        for result in results:
            group = self._get_group(result)
            grouped_results[group].append(result)

        # Evaluate each group
        group_metrics = {}
        overall_success_count=0
        for group, group_results in grouped_results.items():
            metrics = super().evaluate(group_results)
            group_metrics[group] = metrics
            overall_success_count+= metrics.attack_success_rate*len(group_results)

        # Calculate overall metrics
        # overall_success_count = sum(1 for result in results if result.success)
        overall_success_rate = overall_success_count / len(results)

        return {
            "overall": EvaluationMetrics(
                attack_success_rate=overall_success_rate,
                average_attack_score=0.0  # We don't calculate this for overall
            ),
            "by_group": group_metrics
        }

    def _get_group(self, result: AttackResult) -> str:
        """
        Get the group identifier for a result.

        Args:
            result: Attack result

        Returns:
            Group identifier
        """
        # First, check if the method attribute is directly set on the AttackResult
        if hasattr(result, "method") and result.method:
            return str(result.method)

        # If not, try to get the group from history
        if result.history and len(result.history) > 0:
            history_item = result.history[0]
            if isinstance(history_item, dict) and self.group_by in history_item:
                return str(history_item[self.group_by])

        # Try to get from original_record if available
        if hasattr(result, "original_record") and self.group_by in result.original_record:
            return str(result.original_record[self.group_by])

        # Default to unknown group
        return "unknown"

    def _evaluate_single(self, result: AttackResult) -> tuple:
        """
        Evaluate a single attack result.

        Args:
            result: Attack result to evaluate

        Returns:
            Tuple of (success, score)
        """
        # success = result.success
        success = False

        # If we have a judge, use it to evaluate
        if self.judge and not success:
            scores, _ = self.judge.judge_prompt([result.target], result.output_text)
            success = self.judge.is_success(scores)
            score = scores if scores else 0.0
        else:
            score = 5.0 if success else 0.0

        # print(success, score)
        return (success, score)