EvoSynth/jailbreak_toolbox/datasets/implementations/jsonl_dataset.py

# jailbreak_toolbox/datasets/implementations/jsonl_dataset.py
import json
import os
from typing import Iterator, Any, List, Dict, Optional
from ..base_dataset import BaseDataset
from ...core.registry import dataset_registry

@dataset_registry.register("jsonl")
class JSONLDataset(BaseDataset):
    """
    Dataset implementation that loads data from JSONL files.

    Supports extraction of prompts and associated images from JSONL records.
    """

    def __init__(
        self,
        jsonl_path: str,
        prompt_field: str = "jailbreak_prompt",
        image_field: Optional[str] = "image",
        image_prefix: Optional[str] = None,
        filter_field: Optional[str] = None,
        filter_value: Optional[str] = None,
        **kwargs
    ):
        """
        Initialize the JSONL dataset.

        Args:
            jsonl_path: Path to the JSONL file
            prompt_field: Field name in JSON containing the prompt to use
            image_field: Field name in JSON containing image path (if applicable)
            image_prefix: Prefix path to prepend to image paths
            filter_field: Optional field to filter records on
            filter_value: Value that filter_field must match
        """
        super().__init__(**kwargs)
        self.jsonl_path = jsonl_path
        self.prompt_field = prompt_field
        self.image_field = image_field
        self.image_prefix = image_prefix
        self.filter_field = filter_field
        self.filter_value = filter_value

        self.items = self._load_jsonl()

    def _load_jsonl(self) -> List[Dict[str, Any]]:
        """
        Load data from JSONL file.

        Returns:
            List of processed records
        """
        if not os.path.exists(self.jsonl_path):
            raise FileNotFoundError(f"JSONL file not found: {self.jsonl_path}")

        items = []
        with open(self.jsonl_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue

                try:
                    record = json.loads(line.strip())

                    # Apply filtering if specified
                    if self.filter_field and self.filter_value:
                        if record.get(self.filter_field) != self.filter_value:
                            continue

                    # Process the record
                    item = self._process_record(record)
                    if item:
                        items.append(item)

                except json.JSONDecodeError:
                    continue  # Skip invalid JSON lines

        return items

    def _process_record(self, record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Process a single JSONL record.

        Args:
            record: The JSON record to process

        Returns:
            Processed record or None if invalid
        """
        # Check if required prompt field exists
        if self.prompt_field not in record:
            return None

        prompt = record[self.prompt_field]

        # Process image field if applicable
        image_path = None
        if self.image_field and self.image_field in record and record[self.image_field]:
            image_filename = record[self.image_field]
            if self.image_prefix:
                image_path = os.path.join(self.image_prefix, image_filename)
            else:
                image_path = image_filename

        # Return processed item
        return {
            "prompt": prompt,
            "image_path": image_path,
            "method": record.get("jailbreak_method", "unknown"),
            "original_record": record  # Keep the original record for reference
        }

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Iterate through dataset items."""
        return iter(self.items)

    def __len__(self) -> int:
        """Return the number of items in the dataset."""
        return len(self.items)

    def __getitem__(self, index: int) -> Dict[str, Any]:
        """Support indexing for dataset access"""
        return self.items[index]