Files
EvoSynth/jailbreak_toolbox/datasets/implementations/jsonl_dataset.py
T
dongdongunique f3af94df0b first commit
2025-12-10 00:54:02 +08:00

124 lines
4.2 KiB
Python

# jailbreak_toolbox/datasets/implementations/jsonl_dataset.py
import json
import os
from typing import Iterator, Any, List, Dict, Optional
from ..base_dataset import BaseDataset
from ...core.registry import dataset_registry
@dataset_registry.register("jsonl")
class JSONLDataset(BaseDataset):
    """
    Dataset implementation that loads data from JSONL files.

    Each non-blank line of the file is parsed as one JSON value. Lines that
    are not valid JSON, or whose value is not a JSON object, are skipped.
    The remaining records are optionally filtered on a single field and then
    normalised into dictionaries holding the prompt and an optional image
    path. The file is read eagerly during construction and the result is
    kept in ``self.items``.
    """

    def __init__(
        self,
        jsonl_path: str,
        prompt_field: str = "jailbreak_prompt",
        image_field: Optional[str] = "image",
        image_prefix: Optional[str] = None,
        filter_field: Optional[str] = None,
        filter_value: Optional[str] = None,
        **kwargs
    ):
        """
        Initialize the JSONL dataset and load all records immediately.

        Args:
            jsonl_path: Path to the JSONL file
            prompt_field: Field name in JSON containing the prompt to use
            image_field: Field name in JSON containing image path (if applicable)
            image_prefix: Prefix path to prepend to image paths
            filter_field: Optional field to filter records on
            filter_value: Value that filter_field must match. Filtering is
                only applied when both filter_field and filter_value are
                truthy.
            **kwargs: Forwarded unchanged to the base class constructor.

        Raises:
            FileNotFoundError: If jsonl_path does not exist.
        """
        super().__init__(**kwargs)
        self.jsonl_path = jsonl_path
        self.prompt_field = prompt_field
        self.image_field = image_field
        self.image_prefix = image_prefix
        self.filter_field = filter_field
        self.filter_value = filter_value
        self.items = self._load_jsonl()

    def _load_jsonl(self) -> List[Dict[str, Any]]:
        """
        Load data from JSONL file.

        Blank lines, lines that fail to parse as JSON, and lines whose JSON
        value is not an object are skipped silently (best-effort loading).

        Returns:
            List of processed records

        Raises:
            FileNotFoundError: If the configured path does not exist.
        """
        if not os.path.exists(self.jsonl_path):
            raise FileNotFoundError(f"JSONL file not found: {self.jsonl_path}")

        # The filter is active only when both settings are truthy; this is
        # loop-invariant, so evaluate it once.
        filtering = bool(self.filter_field and self.filter_value)

        items: List[Dict[str, Any]] = []
        with open(self.jsonl_path, 'r', encoding='utf-8') as f:
            for line in f:
                text = line.strip()
                if not text:
                    continue

                # Keep the try body limited to the call that can actually
                # raise JSONDecodeError.
                try:
                    record = json.loads(text)
                except json.JSONDecodeError:
                    continue  # Skip invalid JSON lines

                # json.loads may return any JSON type (number, string, list,
                # null). Only objects can carry the named fields; anything
                # else would raise AttributeError/TypeError below and abort
                # the whole load, so skip it like a malformed line.
                if not isinstance(record, dict):
                    continue

                # Apply filtering if specified
                if filtering and record.get(self.filter_field) != self.filter_value:
                    continue

                # Process the record
                item = self._process_record(record)
                if item:
                    items.append(item)
        return items

    def _process_record(self, record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Process a single JSONL record.

        Args:
            record: The JSON record to process

        Returns:
            None if the record lacks the prompt field; otherwise a dict with
            keys "prompt", "image_path" (None when the record has no usable
            image value), "method" (the record's "jailbreak_method", or
            "unknown" when absent) and "original_record".
        """
        # Check if required prompt field exists
        if self.prompt_field not in record:
            return None
        prompt = record[self.prompt_field]

        # Process image field if applicable; a missing or falsy image value
        # leaves image_path as None.
        image_path = None
        if self.image_field and self.image_field in record and record[self.image_field]:
            image_filename = record[self.image_field]
            if self.image_prefix:
                image_path = os.path.join(self.image_prefix, image_filename)
            else:
                image_path = image_filename

        # Return processed item
        return {
            "prompt": prompt,
            "image_path": image_path,
            "method": record.get("jailbreak_method", "unknown"),
            "original_record": record  # Keep the original record for reference
        }

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Iterate through dataset items."""
        return iter(self.items)

    def __len__(self) -> int:
        """Return the number of items in the dataset."""
        return len(self.items)

    def __getitem__(self, index: int) -> Dict[str, Any]:
        """Support indexing for dataset access"""
        return self.items[index]