mirror of
https://github.com/dongdongunique/EvoSynth.git
synced 2026-05-31 20:31:35 +02:00
124 lines
4.2 KiB
Python
124 lines
4.2 KiB
Python
# jailbreak_toolbox/datasets/implementations/jsonl_dataset.py
|
|
import json
|
|
import os
|
|
from typing import Iterator, Any, List, Dict, Optional
|
|
from ..base_dataset import BaseDataset
|
|
from ...core.registry import dataset_registry
|
|
|
|
@dataset_registry.register("jsonl")
class JSONLDataset(BaseDataset):
    """
    Dataset implementation that loads data from JSONL files.

    Supports extraction of prompts and associated images from JSONL records.
    The file is read once, eagerly, during construction; the processed
    records are kept in ``self.items`` and served by the iteration, length
    and indexing methods.
    """

    def __init__(
        self,
        jsonl_path: str,
        prompt_field: str = "jailbreak_prompt",
        image_field: Optional[str] = "image",
        image_prefix: Optional[str] = None,
        filter_field: Optional[str] = None,
        filter_value: Optional[str] = None,
        **kwargs
    ):
        """
        Initialize the JSONL dataset.

        Args:
            jsonl_path: Path to the JSONL file
            prompt_field: Field name in JSON containing the prompt to use
            image_field: Field name in JSON containing image path (if applicable)
            image_prefix: Prefix path to prepend to image paths
            filter_field: Optional field to filter records on
            filter_value: Value that filter_field must match. Filtering is
                applied only when filter_field is set and filter_value is
                not None.
            **kwargs: Forwarded unchanged to the base class initializer.

        Raises:
            FileNotFoundError: If jsonl_path does not exist.
        """
        super().__init__(**kwargs)
        self.jsonl_path = jsonl_path
        self.prompt_field = prompt_field
        self.image_field = image_field
        self.image_prefix = image_prefix
        self.filter_field = filter_field
        self.filter_value = filter_value

        self.items = self._load_jsonl()

    def _load_jsonl(self) -> List[Dict[str, Any]]:
        """
        Load data from JSONL file.

        Blank lines, lines that are not valid JSON, and lines whose JSON
        value is not an object are skipped. Records that do not match the
        configured filter, or that ``_process_record`` rejects, are skipped
        as well.

        Returns:
            List of processed records

        Raises:
            FileNotFoundError: If ``self.jsonl_path`` does not exist.
        """
        if not os.path.exists(self.jsonl_path):
            raise FileNotFoundError(f"JSONL file not found: {self.jsonl_path}")

        # Compare against None rather than truthiness so that falsy but
        # legitimate filter values ("", 0, False) still filter.
        apply_filter = bool(self.filter_field) and self.filter_value is not None

        items = []
        # 'utf-8-sig' strips a leading byte-order mark when present; without
        # it json.loads rejects the first line and that record is lost.
        with open(self.jsonl_path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                text = line.strip()
                if not text:
                    continue

                # Keep the try body minimal: only parsing may raise here.
                try:
                    record = json.loads(text)
                except json.JSONDecodeError:
                    continue  # Skip invalid JSON lines

                # A valid JSON line may still be a number, string, list or
                # null; only objects can carry the prompt/filter fields.
                if not isinstance(record, dict):
                    continue

                # Apply filtering if specified
                if apply_filter and record.get(self.filter_field) != self.filter_value:
                    continue

                # Process the record
                item = self._process_record(record)
                if item is not None:
                    items.append(item)

        return items

    def _process_record(self, record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Process a single JSONL record.

        Args:
            record: The JSON record to process

        Returns:
            Processed record or None if invalid. A processed record is a
            dict with the keys ``prompt``, ``image_path`` (None when the
            record has no usable image field), ``method`` (the record's
            ``jailbreak_method`` or ``"unknown"``) and ``original_record``.
        """
        # Check if required prompt field exists
        if self.prompt_field not in record:
            return None

        prompt = record[self.prompt_field]

        # Process image field if applicable; an absent or empty value
        # leaves image_path as None.
        image_path = None
        if self.image_field and record.get(self.image_field):
            image_filename = record[self.image_field]
            if self.image_prefix:
                image_path = os.path.join(self.image_prefix, image_filename)
            else:
                image_path = image_filename

        # Return processed item
        return {
            "prompt": prompt,
            "image_path": image_path,
            "method": record.get("jailbreak_method", "unknown"),
            "original_record": record  # Keep the original record for reference
        }

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Iterate through dataset items."""
        return iter(self.items)

    def __len__(self) -> int:
        """Return the number of items in the dataset."""
        return len(self.items)

    def __getitem__(self, index: int) -> Dict[str, Any]:
        """Support indexing for dataset access"""
        return self.items[index]