mirror of
https://github.com/msoedov/agentic_security.git
synced 2026-06-24 14:19:55 +02:00
refactor(data module):
This commit is contained in:
+281
-390
@@ -1,7 +1,8 @@
|
||||
import io
|
||||
import os
|
||||
import random
|
||||
from functools import lru_cache
|
||||
from collections.abc import Callable, Iterator
|
||||
from functools import partial
|
||||
|
||||
import httpx
|
||||
import pandas as pd
|
||||
@@ -19,281 +20,200 @@ from agentic_security.probe_data.modules import (
|
||||
rl_model,
|
||||
)
|
||||
|
||||
# Type aliases for clarity
|
||||
FilterFn = Callable[[pd.Series], bool]
|
||||
ColumnMappings = dict[str, str]
|
||||
DatasetLoader = Callable[[], ProbeDataset]
|
||||
|
||||
@cache_to_disk()
|
||||
def load_dataset_general(
|
||||
dataset_name,
|
||||
dataset_split="train",
|
||||
column_mappings=None,
|
||||
filter_fn=None,
|
||||
custom_url=None,
|
||||
additional_metadata=None,
|
||||
):
|
||||
"""
|
||||
Generalized function to load datasets with flexible configurations.
|
||||
|
||||
:param dataset_name: Name of the dataset or URL for custom CSVs
|
||||
:param dataset_split: Split to load from the dataset (e.g., "train")
|
||||
:param column_mappings: Dictionary mapping dataset columns to expected keys, e.g., {'prompt': 'query'}
|
||||
:param filter_fn: A filtering function that takes a row and returns True/False
|
||||
:param custom_url: URL for custom CSV datasets
|
||||
:param additional_metadata: Additional metadata to include in the ProbeDataset
|
||||
:return: A ProbeDataset object with the processed data
|
||||
"""
|
||||
if custom_url:
|
||||
logger.info(f"Loading custom CSV dataset from {custom_url}")
|
||||
r = httpx.get(custom_url)
|
||||
content = r.content
|
||||
df = pd.read_csv(io.StringIO(content.decode("utf-8")))
|
||||
else:
|
||||
logger.info(f"Loading dataset {dataset_name} from Hugging Face datasets")
|
||||
dataset = load_dataset(dataset_name)
|
||||
df = pd.DataFrame(dataset[dataset_split])
|
||||
# Core data loading utilities
|
||||
def fetch_csv_content(url: str) -> str:
|
||||
"""Fetch CSV content from a URL."""
|
||||
response = httpx.get(url)
|
||||
return response.content.decode("utf-8")
|
||||
|
||||
# Apply column mappings if provided
|
||||
if column_mappings:
|
||||
df.rename(columns=column_mappings, inplace=True)
|
||||
|
||||
# Filter rows if filter_fn is provided
|
||||
def load_df_from_source(source: str, is_url: bool = False) -> pd.DataFrame:
|
||||
"""Load DataFrame from either URL or Hugging Face dataset."""
|
||||
if is_url:
|
||||
logger.info(f"Loading custom CSV from {source}")
|
||||
return pd.read_csv(io.StringIO(fetch_csv_content(source)))
|
||||
logger.info(f"Loading dataset {source} from Hugging Face")
|
||||
return pd.DataFrame(load_dataset(source)["train"])
|
||||
|
||||
|
||||
def transform_df(
|
||||
df: pd.DataFrame,
|
||||
mappings: ColumnMappings | None = None,
|
||||
filter_fn: FilterFn | None = None,
|
||||
) -> pd.DataFrame:
|
||||
"""Apply transformations to DataFrame."""
|
||||
result = df.copy()
|
||||
if mappings:
|
||||
result.rename(columns=mappings, inplace=True)
|
||||
if filter_fn:
|
||||
df = df[df.apply(filter_fn, axis=1)]
|
||||
|
||||
# Extract prompts
|
||||
prompts = df[column_mappings.get("prompt", "prompt")].tolist()
|
||||
|
||||
return ProbeDataset(
|
||||
dataset_name=dataset_name,
|
||||
metadata=additional_metadata or {},
|
||||
prompts=prompts,
|
||||
tokens=count_words_in_list(prompts),
|
||||
approx_cost=0.0,
|
||||
)
|
||||
result = result[result.apply(filter_fn, axis=1)]
|
||||
return result
|
||||
|
||||
|
||||
def count_words_in_list(str_list):
|
||||
"""Calculate the total number of words in a given list of strings.
|
||||
|
||||
:param str_list: List of strings
|
||||
:return: Total number of words across all strings in the list
|
||||
"""
|
||||
total_words = sum(len(str(s).split()) for s in str_list)
|
||||
return total_words
|
||||
|
||||
|
||||
@cache_to_disk()
|
||||
def load_dataset_v1():
|
||||
return load_dataset_general(
|
||||
dataset_name="ShawnMenz/DAN_jailbreak",
|
||||
column_mappings={"prompt": "prompt", "jailbreak": "jailbreak"},
|
||||
filter_fn=lambda row: row["jailbreak"],
|
||||
)
|
||||
|
||||
|
||||
@cache_to_disk()
|
||||
def load_dataset_v2():
|
||||
return load_dataset_general(
|
||||
dataset_name="deepset/prompt-injections",
|
||||
column_mappings={"prompt": "text", "jailbreak": "label"},
|
||||
filter_fn=lambda row: row["label"],
|
||||
)
|
||||
|
||||
|
||||
@cache_to_disk()
|
||||
def load_dataset_v4():
|
||||
return load_dataset_general(
|
||||
dataset_name="notrichardren/refuse-to-answer-prompts",
|
||||
column_mappings={"prompt": "claim", "jailbreak": "label"},
|
||||
filter_fn=lambda row: row["label"],
|
||||
)
|
||||
|
||||
|
||||
@cache_to_disk()
|
||||
def load_dataset_v3():
|
||||
return load_dataset_general(
|
||||
dataset_name="rubend18/ChatGPT-Jailbreak-Prompts",
|
||||
column_mappings={"prompt": "Prompt"},
|
||||
filter_fn=lambda row: row["label"],
|
||||
)
|
||||
|
||||
|
||||
@cache_to_disk()
|
||||
def load_dataset_v6():
|
||||
dataset = load_dataset("markush1/LLM-Jailbreak-Classifier")
|
||||
bad_prompts = [
|
||||
p
|
||||
for p, c in zip(dataset["train"]["prompt"], dataset["train"]["classification"])
|
||||
if c == "jailbreak"
|
||||
]
|
||||
return ProbeDataset(
|
||||
dataset_name="markush1/LLM-Jailbreak-Classifier",
|
||||
metadata={},
|
||||
prompts=bad_prompts,
|
||||
tokens=count_words_in_list(bad_prompts),
|
||||
approx_cost=0.0,
|
||||
)
|
||||
|
||||
|
||||
@cache_to_disk()
|
||||
def load_dataset_v7():
|
||||
splits = {
|
||||
"mini_JailBreakV_28K": "JailBreakV_28K/mini_JailBreakV_28K.csv",
|
||||
"JailBreakV_28K": "JailBreakV_28K/JailBreakV_28K.csv",
|
||||
}
|
||||
df = pd.read_csv(
|
||||
"hf://datasets/JailbreakV-28K/JailBreakV-28k/" + splits["JailBreakV_28K"]
|
||||
)
|
||||
bad_prompts = df["jailbreak_query"].tolist()
|
||||
print(df.shape)
|
||||
return ProbeDataset(
|
||||
dataset_name="JailbreakV-28K/JailBreakV-28k",
|
||||
metadata={},
|
||||
prompts=bad_prompts,
|
||||
tokens=count_words_in_list(bad_prompts),
|
||||
approx_cost=0.0,
|
||||
)
|
||||
|
||||
|
||||
@cache_to_disk()
|
||||
def load_dataset_v8():
|
||||
df = pd.read_csv(
|
||||
"hf://datasets/ShawnMenz/jailbreak_sft_rm_ds/jailbreak_sft_rm_ds.csv",
|
||||
names=["jailbreak", "prompt"],
|
||||
)
|
||||
filtered = df[df["jailbreak"] == "jailbreak"]["prompt"].tolist()
|
||||
return ProbeDataset(
|
||||
dataset_name="JailbreakV-28K/JailBreakV-28k",
|
||||
metadata={},
|
||||
prompts=filtered,
|
||||
tokens=count_words_in_list(filtered),
|
||||
approx_cost=0.0,
|
||||
)
|
||||
|
||||
|
||||
@cache_to_disk()
|
||||
def load_dataset_v5():
|
||||
ds = []
|
||||
for c in [
|
||||
"AdvBench",
|
||||
"ForbiddenQuestion",
|
||||
"MJP",
|
||||
"MaliciousInstruct",
|
||||
"QuestionList",
|
||||
]:
|
||||
dataset = load_dataset("Lemhf14/EasyJailbreak_Datasets", c)
|
||||
bad_prompts = dataset["train"]["query"]
|
||||
ds.extend(bad_prompts)
|
||||
|
||||
return ProbeDataset(
|
||||
dataset_name="Lemhf14/EasyJailbreak_Datasets",
|
||||
metadata={},
|
||||
prompts=ds,
|
||||
tokens=count_words_in_list(ds),
|
||||
approx_cost=0.0,
|
||||
)
|
||||
|
||||
|
||||
@cache_to_disk()
|
||||
def load_generic_csv(url, name, column="prompt", predicator=None):
|
||||
r = httpx.get(url)
|
||||
content = r.content
|
||||
df = pd.read_csv(io.StringIO(content.decode("utf-8")))
|
||||
logger.info(f"Loaded {len(df)} prompts from {url}")
|
||||
filtered_prompts = df[df.apply(predicator, axis=1)][column].tolist()
|
||||
def create_probe_dataset(
|
||||
name: str, prompts: list[str], metadata: dict = None
|
||||
) -> ProbeDataset:
|
||||
"""Create a ProbeDataset from prompts."""
|
||||
metadata = metadata or {}
|
||||
return ProbeDataset(
|
||||
dataset_name=name,
|
||||
metadata={},
|
||||
prompts=filtered_prompts,
|
||||
tokens=count_words_in_list(filtered_prompts),
|
||||
metadata=metadata,
|
||||
prompts=prompts,
|
||||
tokens=sum(len(str(s).split()) for s in prompts),
|
||||
approx_cost=0.0,
|
||||
)
|
||||
|
||||
|
||||
def prepare_prompts(dataset_names, budget, tools_inbox=None, options=[]):
|
||||
# ## Datasets used and cleaned:
|
||||
# markush1/LLM-Jailbreak-Classifier
|
||||
# 1. Open-Orca/OpenOrca
|
||||
# 2. ShawnMenz/DAN_jailbreak
|
||||
# 3. EddyLuo/JailBreakV_28K
|
||||
# 4. https://raw.githubusercontent.com/verazuo/jailbreak_llms/main/data/jailbreak_prompts.csv
|
||||
|
||||
dataset_map = {
|
||||
"ShawnMenz/DAN_jailbreak": load_dataset_v1,
|
||||
"deepset/prompt-injections": load_dataset_v2,
|
||||
"notrichardren/refuse-to-answer-prompts": load_dataset_v4,
|
||||
"rubend18/ChatGPT-Jailbreak-Prompts": load_dataset_v3,
|
||||
"Lemhf14/EasyJailbreak_Datasets": load_dataset_v5,
|
||||
"markush1/LLM-Jailbreak-Classifier": load_dataset_v6,
|
||||
"JailbreakV-28K/JailBreakV-28k": load_dataset_v7,
|
||||
"ShawnMenz/jailbreak_sft_rm_ds": load_dataset_v8,
|
||||
"verazuo/jailbreak_llms/2023_05_07": lambda: load_generic_csv(
|
||||
url="https://raw.githubusercontent.com/verazuo/jailbreak_llms/main/data/prompts/jailbreak_prompts_2023_05_07.csv",
|
||||
name="verazuo/jailbreak_llms/2023_05_07",
|
||||
column="prompt",
|
||||
predicator=lambda x: bool(x["jailbreak"]),
|
||||
),
|
||||
"verazuo/jailbreak_llms/2023_12_25.csv": lambda: load_generic_csv(
|
||||
url="https://raw.githubusercontent.com/verazuo/jailbreak_llms/main/data/prompts/jailbreak_prompts_2023_12_25.csv.csv",
|
||||
name="verazuo/jailbreak_llms/2023_12_25.csv",
|
||||
column="prompt",
|
||||
predicator=lambda x: bool(x["jailbreak"]),
|
||||
),
|
||||
"Custom CSV": load_local_csv,
|
||||
}
|
||||
dataset_map.update(dataset_map_generics)
|
||||
|
||||
group = []
|
||||
for dataset_name in dataset_names:
|
||||
if dataset_name in dataset_map:
|
||||
logger.info(f"Loading {dataset_name}")
|
||||
try:
|
||||
group.append(dataset_map[dataset_name]())
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading {dataset_name}: {e}")
|
||||
|
||||
dynamic_datasets = {
|
||||
"AgenticBackend": lambda opts: dataset_from_iterator(
|
||||
"AgenticBackend",
|
||||
fine_tuned.Module(group, tools_inbox=tools_inbox, opts=opts).apply(),
|
||||
lazy=True,
|
||||
),
|
||||
"Steganography": lambda opts: Stenography(group),
|
||||
"llm-adaptive-attacks": lambda opts: dataset_from_iterator(
|
||||
"llm-adaptive-attacks",
|
||||
adaptive_attacks.Module(group, tools_inbox=tools_inbox, opts=opts).apply(),
|
||||
),
|
||||
"Garak": lambda opts: dataset_from_iterator(
|
||||
"Garak",
|
||||
garak_tool.Module(group, tools_inbox=tools_inbox, opts=opts).apply(),
|
||||
lazy=True,
|
||||
),
|
||||
"Reinforcement Learning Optimization": lambda opts: dataset_from_iterator(
|
||||
"Reinforcement Learning Optimization",
|
||||
rl_model.Module(group, tools_inbox=tools_inbox, opts=opts).apply(),
|
||||
lazy=True,
|
||||
),
|
||||
"InspectAI": lambda opts: dataset_from_iterator(
|
||||
"InspectAI",
|
||||
inspect_ai_tool.Module(group, tools_inbox=tools_inbox).apply(),
|
||||
lazy=True,
|
||||
),
|
||||
"GPT fuzzer": lambda opts: [],
|
||||
}
|
||||
|
||||
dynamic_groups = []
|
||||
options = options or [{} for _ in dataset_names]
|
||||
for dataset_name, opts in zip(dataset_names, options):
|
||||
if dataset_name in dynamic_datasets:
|
||||
logger.info(f"Loading {dataset_name}")
|
||||
|
||||
ds = dynamic_datasets[dataset_name](opts)
|
||||
|
||||
for g in ds:
|
||||
dynamic_groups.append(g)
|
||||
return group + dynamic_groups
|
||||
# Generalized dataset loader
|
||||
@cache_to_disk()
|
||||
def load_dataset_generic(
|
||||
name: str,
|
||||
mappings: ColumnMappings | None = None,
|
||||
filter_fn: FilterFn | None = None,
|
||||
url: str | None = None,
|
||||
metadata: dict | None = None,
|
||||
) -> ProbeDataset:
|
||||
"""Load and process a dataset with flexible configuration."""
|
||||
df = load_df_from_source(url or name, is_url=bool(url))
|
||||
transformed_df = transform_df(df, mappings, filter_fn)
|
||||
prompt_col = mappings.get("prompt", "prompt") if mappings else "prompt"
|
||||
prompts = transformed_df[prompt_col].tolist()
|
||||
return create_probe_dataset(name, prompts, metadata)
|
||||
|
||||
|
||||
class Stenography:
|
||||
fn_library = {
|
||||
# Dataset-specific configurations
|
||||
DATASET_CONFIGS = {
|
||||
"ShawnMenz/DAN_jailbreak": {
|
||||
"mappings": {"prompt": "prompt"},
|
||||
"filter_fn": lambda row: row["jailbreak"],
|
||||
},
|
||||
"deepset/prompt-injections": {
|
||||
"mappings": {"prompt": "text"},
|
||||
"filter_fn": lambda row: row["label"],
|
||||
},
|
||||
"notrichardren/refuse-to-answer-prompts": {
|
||||
"mappings": {"prompt": "claim"},
|
||||
"filter_fn": lambda row: row["label"],
|
||||
},
|
||||
"rubend18/ChatGPT-Jailbreak-Prompts": {
|
||||
"mappings": {"prompt": "Prompt"},
|
||||
"filter_fn": lambda row: row["label"],
|
||||
},
|
||||
"markush1/LLM-Jailbreak-Classifier": {
|
||||
"mappings": {"prompt": "prompt"},
|
||||
"filter_fn": lambda row: row["classification"] == "jailbreak",
|
||||
},
|
||||
"ShawnMenz/jailbreak_sft_rm_ds": {
|
||||
"url": "hf://datasets/ShawnMenz/jailbreak_sft_rm_ds/jailbreak_sft_rm_ds.csv",
|
||||
"mappings": {"prompt": "prompt"},
|
||||
"filter_fn": lambda row: row["jailbreak"] == "jailbreak",
|
||||
},
|
||||
"verazuo/jailbreak_llms/2023_05_07": {
|
||||
"url": "https://raw.githubusercontent.com/verazuo/jailbreak_llms/main/data/prompts/jailbreak_prompts_2023_05_07.csv",
|
||||
"mappings": {"prompt": "prompt"},
|
||||
"filter_fn": lambda row: bool(row["jailbreak"]),
|
||||
},
|
||||
"verazuo/jailbreak_llms/2023_12_25": {
|
||||
"url": "https://raw.githubusercontent.com/verazuo/jailbreak_llms/main/data/prompts/jailbreak_prompts_2023_12_25.csv",
|
||||
"mappings": {"prompt": "prompt"},
|
||||
"filter_fn": lambda row: bool(row["jailbreak"]),
|
||||
},
|
||||
}
|
||||
|
||||
# Additional generic dataset configurations
|
||||
DATASET_CONFIGS_GENERICS = {
|
||||
"simonycl/aya-23-8B_advbench_jailbreak": {"mappings": {"prompt": "prompt"}},
|
||||
"acmc/jailbreaks_dataset_with_perplexity_bigcode_starcoder2-3b_bigcode_starcoder2-7b": {},
|
||||
"karanxa/dolphin-jailbreak-finetuning-dataset": {"mappings": {"prompt": "text"}},
|
||||
"karanxa/llama-2-jailbreak-dataset": {"mappings": {"prompt": "text"}},
|
||||
"karanxa/llama2-uncensored-jailbreak-dataset-finetuning": {
|
||||
"mappings": {"prompt": "text"}
|
||||
},
|
||||
"liuyanchen1015/Llama-3.2-1B_jailbreak_responses": {
|
||||
"mappings": {"prompt": "jailbreak_prompt_text"}
|
||||
},
|
||||
"liuyanchen1015/Llama-3.2-1B-Instruct_jailbreak_responses": {
|
||||
"mappings": {"prompt": "jailbreak_prompt_text"}
|
||||
},
|
||||
"liuyanchen1015/Llama-3.2-1B-Instruct_jailbreak_responses_with_judgment": {
|
||||
"mappings": {"prompt": "jailbreak_prompt_text"}
|
||||
},
|
||||
"jackhhao/jailbreak-classification": {"mappings": {"prompt": "prompt"}},
|
||||
"walledai/JailbreakBench": {"mappings": {"prompt": "prompt"}},
|
||||
"walledai/JailbreakHub": {"mappings": {"prompt": "prompt"}},
|
||||
"Granther/evil-jailbreak": {"mappings": {"prompt": "text"}},
|
||||
"sevdeawesome/jailbreak_success": {"mappings": {"prompt": "jailbreak_prompt_text"}},
|
||||
"IDA-SERICS/Disaster-tweet-jailbreaking": {"mappings": {"prompt": "prompt_attack"}},
|
||||
"GeorgeDaDude/Jailbreak_Complete_DS_labeled": {"mappings": {"prompt": "question"}},
|
||||
"dayone3nder/jailbreak_prompt_JBB_sft_trainset": {"mappings": {"prompt": "prompt"}},
|
||||
"dayone3nder/general_safe_mix_jailbreak_prompt_JBB_trainset": {
|
||||
"mappings": {"prompt": "prompt"}
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# Dataset factory
|
||||
def create_dataset_loader(name: str, config: dict) -> DatasetLoader:
|
||||
"""Create a dataset loader from configuration."""
|
||||
return partial(
|
||||
load_dataset_generic,
|
||||
name=name,
|
||||
mappings=config.get("mappings"),
|
||||
filter_fn=config.get("filter_fn"),
|
||||
url=config.get("url"),
|
||||
)
|
||||
|
||||
|
||||
# Specialized loaders
|
||||
@cache_to_disk()
|
||||
def load_multi_dataset(name: str, sub_datasets: list[str]) -> ProbeDataset:
|
||||
"""Load and combine multiple sub-datasets."""
|
||||
prompts = []
|
||||
for sub in sub_datasets:
|
||||
dataset = load_dataset(name, sub)
|
||||
prompts.extend(dataset["train"]["query"])
|
||||
return create_probe_dataset(f"{name}_combined", prompts)
|
||||
|
||||
|
||||
@cache_to_disk()
|
||||
def load_jailbreak_v28k() -> ProbeDataset:
|
||||
"""Load JailBreakV-28K dataset."""
|
||||
df = pd.read_csv("hf://datasets/JailbreakV-28K/JailBreakV-28k/JailBreakV_28K.csv")
|
||||
prompts = df["jailbreak_query"].tolist()
|
||||
return create_probe_dataset("JailbreakV-28K/JailBreakV-28k", prompts)
|
||||
|
||||
|
||||
@cache_to_disk()
|
||||
def load_local_csv() -> ProbeDataset:
|
||||
"""Load prompts from local CSV files."""
|
||||
csv_files = [f for f in os.listdir(".") if f.endswith(".csv")]
|
||||
logger.info(f"Found {len(csv_files)} CSV files: {csv_files}")
|
||||
|
||||
prompts = []
|
||||
for file in csv_files:
|
||||
try:
|
||||
df = pd.read_csv(file)
|
||||
if "prompt" in df.columns:
|
||||
prompts.extend(df["prompt"].tolist())
|
||||
else:
|
||||
logger.warning(f"File {file} lacks 'prompt' column")
|
||||
except Exception as e:
|
||||
logger.error(f"Error reading {file}: {e}")
|
||||
|
||||
return create_probe_dataset("Local CSV", prompts, {"src": str(csv_files)})
|
||||
|
||||
|
||||
# Stenography transformer
|
||||
class StenographyTransformer:
|
||||
"""Apply stenography transformations to datasets."""
|
||||
|
||||
TRANSFORMATIONS = {
|
||||
"rot5": stenography_fn.rot5,
|
||||
"rot13": stenography_fn.rot13,
|
||||
"base64": stenography_fn.base64_encode,
|
||||
@@ -306,71 +226,32 @@ class Stenography:
|
||||
"zigzag_obfuscation": stenography_fn.zigzag_obfuscation,
|
||||
}
|
||||
|
||||
def __init__(self, prompt_groups: [ProbeDataset]):
|
||||
self.prompt_groups = prompt_groups
|
||||
def __init__(self, datasets: list[ProbeDataset]):
|
||||
self.datasets = datasets
|
||||
|
||||
def __iter__(self):
|
||||
return self.apply()
|
||||
def transform(self) -> Iterator[ProbeDataset]:
|
||||
"""Generate transformed datasets."""
|
||||
for dataset in self.datasets:
|
||||
size = len(dataset.prompts)
|
||||
sample_size = int(size * 0.25)
|
||||
|
||||
def apply(self):
|
||||
for prompt_group in self.prompt_groups:
|
||||
size = len(prompt_group.prompts)
|
||||
for name, fn in self.fn_library.items():
|
||||
logger.info(f"Applying {name} to {prompt_group.dataset_name}")
|
||||
selected_prompts = random.sample(prompt_group.prompts, int(size * 0.25))
|
||||
prompts = [
|
||||
f"Follow the instructions in {name}: {fn(p)}"
|
||||
for p in selected_prompts
|
||||
for name, fn in self.TRANSFORMATIONS.items():
|
||||
logger.info(f"Applying {name} to {dataset.dataset_name}")
|
||||
sampled = random.sample(dataset.prompts, sample_size)
|
||||
transformed = [
|
||||
f"Follow the instructions in {name}: {fn(p)}" for p in sampled
|
||||
]
|
||||
yield ProbeDataset(
|
||||
dataset_name=f"stenography.{name}({prompt_group.dataset_name})",
|
||||
metadata={},
|
||||
prompts=prompts,
|
||||
tokens=count_words_in_list(prompts),
|
||||
approx_cost=0.0,
|
||||
yield create_probe_dataset(
|
||||
f"stenography.{name}({dataset.dataset_name})", transformed
|
||||
)
|
||||
|
||||
|
||||
@lru_cache
|
||||
def load_local_csv() -> ProbeDataset:
|
||||
csv_files = [f for f in os.listdir(".") if f.endswith(".csv")]
|
||||
logger.info(f"Found {len(csv_files)} CSV files")
|
||||
logger.info(f"CSV files: {csv_files}")
|
||||
prompt_list = []
|
||||
|
||||
for file in csv_files:
|
||||
try:
|
||||
df = pd.read_csv(file)
|
||||
except Exception as e:
|
||||
logger.error(f"Error reading {file}: {e}")
|
||||
continue
|
||||
# Check if 'prompt' column exists
|
||||
if "prompt" in df.columns:
|
||||
prompt_list.extend(df["prompt"].tolist())
|
||||
else:
|
||||
logger.warning(f"File {file} does not contain a 'prompt' column")
|
||||
return ProbeDataset(
|
||||
dataset_name="Local CSV",
|
||||
metadata={"src": str(csv_files)},
|
||||
prompts=prompt_list,
|
||||
tokens=count_words_in_list(prompt_list),
|
||||
approx_cost=0.0,
|
||||
)
|
||||
|
||||
|
||||
def dataset_from_iterator(name: str, iterator, lazy=False) -> list:
|
||||
"""Convert an iterator into a list of prompts and create a ProbeDataset
|
||||
object.
|
||||
|
||||
Args:
|
||||
name (str): The name of the dataset.
|
||||
iterator (iterator): An iterator that generates prompts.
|
||||
|
||||
Returns:
|
||||
list: A list containing a single ProbeDataset object.
|
||||
"""
|
||||
def dataset_from_iterator(
|
||||
name: str, iterator, lazy: bool = False
|
||||
) -> list[ProbeDataset]:
|
||||
"""Convert an iterator into a list of ProbeDataset objects."""
|
||||
prompts = list(iterator) if not lazy else iterator
|
||||
tokens = count_words_in_list(prompts) if not lazy else 0
|
||||
tokens = sum(len(str(s).split()) for s in prompts) if not lazy else 0
|
||||
dataset = ProbeDataset(
|
||||
dataset_name=name,
|
||||
metadata={},
|
||||
@@ -382,75 +263,85 @@ def dataset_from_iterator(name: str, iterator, lazy=False) -> list:
|
||||
return [dataset]
|
||||
|
||||
|
||||
# TODO: refactor this abstraction
|
||||
# Main dataset preparation
|
||||
def prepare_prompts(
|
||||
dataset_names: list[str],
|
||||
budget: float,
|
||||
tools_inbox=None,
|
||||
options: list[dict] = None,
|
||||
) -> list[ProbeDataset]:
|
||||
"""Prepare datasets based on names and options."""
|
||||
# Base dataset loaders
|
||||
dataset_loaders = {
|
||||
**{k: create_dataset_loader(k, v) for k, v in DATASET_CONFIGS.items()},
|
||||
**{k: create_dataset_loader(k, v) for k, v in DATASET_CONFIGS_GENERICS.items()},
|
||||
"Lemhf14/EasyJailbreak_Datasets": partial(
|
||||
load_multi_dataset,
|
||||
name="Lemhf14/EasyJailbreak_Datasets",
|
||||
sub_datasets=[
|
||||
"AdvBench",
|
||||
"ForbiddenQuestion",
|
||||
"MJP",
|
||||
"MaliciousInstruct",
|
||||
"QuestionList",
|
||||
],
|
||||
),
|
||||
"JailbreakV-28K/JailBreakV-28k": load_jailbreak_v28k,
|
||||
"Local CSV": load_local_csv,
|
||||
}
|
||||
|
||||
dataset_map_generics = {
|
||||
"simonycl/aya-23-8B_advbench_jailbreak": lambda: load_dataset_general(
|
||||
dataset_name="simonycl/aya-23-8B_advbench_jailbreak",
|
||||
column_mappings={"prompt": "prompt"},
|
||||
),
|
||||
"acmc/jailbreaks_dataset_with_perplexity_bigcode_starcoder2-3b_bigcode_starcoder2-7b": lambda: load_dataset_general(
|
||||
dataset_name="acmc/jailbreaks_dataset_with_perplexity_bigcode_starcoder2-3b_bigcode_starcoder2-7b"
|
||||
),
|
||||
"karanxa/dolphin-jailbreak-finetuning-dataset": lambda: load_dataset_general(
|
||||
dataset_name="karanxa/dolphin-jailbreak-finetuning-dataset",
|
||||
column_mappings={"prompt": "text"},
|
||||
),
|
||||
"karanxa/llama-2-jailbreak-dataset": lambda: load_dataset_general(
|
||||
dataset_name="karanxa/llama-2-jailbreak-dataset",
|
||||
column_mappings={"prompt": "text"},
|
||||
),
|
||||
"karanxa/llama2-uncensored-jailbreak-dataset-finetuning": lambda: load_dataset_general(
|
||||
dataset_name="karanxa/llama2-uncensored-jailbreak-dataset-finetuning",
|
||||
column_mappings={"prompt": "text"},
|
||||
),
|
||||
"liuyanchen1015/Llama-3.2-1B_jailbreak_responses": lambda: load_dataset_general(
|
||||
dataset_name="liuyanchen1015/Llama-3.2-1B_jailbreak_responses",
|
||||
column_mappings={"prompt": "jailbreak_prompt_text"},
|
||||
),
|
||||
"liuyanchen1015/Llama-3.2-1B-Instruct_jailbreak_responses": lambda: load_dataset_general(
|
||||
dataset_name="liuyanchen1015/Llama-3.2-1B-Instruct_jailbreak_responses",
|
||||
column_mappings={"prompt": "jailbreak_prompt_text"},
|
||||
),
|
||||
"liuyanchen1015/Llama-3.2-1B-Instruct_jailbreak_responses_with_judgment": lambda: load_dataset_general(
|
||||
dataset_name="liuyanchen1015/Llama-3.2-1B-Instruct_jailbreak_responses_with_judgment",
|
||||
column_mappings={"prompt": "jailbreak_prompt_text"},
|
||||
),
|
||||
"jackhhao/jailbreak-classification": lambda: load_dataset_general(
|
||||
dataset_name="jackhhao/jailbreak-classification",
|
||||
column_mappings={"prompt": "prompt"},
|
||||
),
|
||||
"markush1/LLM-Jailbreak-Classifier": lambda: load_dataset_general(
|
||||
dataset_name="markush1/LLM-Jailbreak-Classifier",
|
||||
column_mappings={"prompt": "prompt"},
|
||||
),
|
||||
"walledai/JailbreakBench": lambda: load_dataset_general(
|
||||
dataset_name="walledai/JailbreakBench", column_mappings={"prompt": "prompt"}
|
||||
),
|
||||
"walledai/JailbreakHub": lambda: load_dataset_general(
|
||||
dataset_name="walledai/JailbreakHub", column_mappings={"prompt": "prompt"}
|
||||
),
|
||||
"Granther/evil-jailbreak": lambda: load_dataset_general(
|
||||
dataset_name="Granther/evil-jailbreak", column_mappings={"prompt": "text"}
|
||||
),
|
||||
"sevdeawesome/jailbreak_success": lambda: load_dataset_general(
|
||||
dataset_name="sevdeawesome/jailbreak_success",
|
||||
column_mappings={"prompt": "jailbreak_prompt_text"},
|
||||
),
|
||||
"IDA-SERICS/Disaster-tweet-jailbreaking": lambda: load_dataset_general(
|
||||
dataset_name="IDA-SERICS/Disaster-tweet-jailbreaking",
|
||||
column_mappings={"prompt": "prompt_attack"},
|
||||
),
|
||||
"GeorgeDaDude/Jailbreak_Complete_DS_labeled": lambda: load_dataset_general(
|
||||
dataset_name="GeorgeDaDude/Jailbreak_Complete_DS_labeled",
|
||||
column_mappings={"prompt": "question"},
|
||||
),
|
||||
"dayone3nder/jailbreak_prompt_JBB_sft_trainset": lambda: load_dataset_general(
|
||||
dataset_name="dayone3nder/jailbreak_prompt_JBB_sft_trainset",
|
||||
column_mappings={"prompt": "prompt"},
|
||||
),
|
||||
"dayone3nder/general_safe_mix_jailbreak_prompt_JBB_trainset": lambda: load_dataset_general(
|
||||
dataset_name="dayone3nder/general_safe_mix_jailbreak_prompt_JBB_trainset",
|
||||
column_mappings={"prompt": "prompt"},
|
||||
),
|
||||
}
|
||||
# Dynamic dataset loaders
|
||||
dynamic_loaders = {
|
||||
"AgenticBackend": lambda opts: dataset_from_iterator(
|
||||
"AgenticBackend",
|
||||
fine_tuned.Module([], tools_inbox=tools_inbox, opts=opts).apply(),
|
||||
lazy=True,
|
||||
),
|
||||
"Steganography": lambda opts: list(StenographyTransformer([]).transform()),
|
||||
"llm-adaptive-attacks": lambda opts: dataset_from_iterator(
|
||||
"llm-adaptive-attacks",
|
||||
adaptive_attacks.Module([], tools_inbox=tools_inbox, opts=opts).apply(),
|
||||
),
|
||||
"Garak": lambda opts: dataset_from_iterator(
|
||||
"Garak",
|
||||
garak_tool.Module([], tools_inbox=tools_inbox, opts=opts).apply(),
|
||||
lazy=True,
|
||||
),
|
||||
"Reinforcement Learning Optimization": lambda opts: dataset_from_iterator(
|
||||
"Reinforcement Learning Optimization",
|
||||
rl_model.Module([], tools_inbox=tools_inbox, opts=opts).apply(),
|
||||
lazy=True,
|
||||
),
|
||||
"InspectAI": lambda opts: dataset_from_iterator(
|
||||
"InspectAI",
|
||||
inspect_ai_tool.Module([], tools_inbox=tools_inbox).apply(),
|
||||
lazy=True,
|
||||
),
|
||||
"GPT fuzzer": lambda opts: [],
|
||||
}
|
||||
|
||||
options = options or [{} for _ in dataset_names]
|
||||
datasets = []
|
||||
|
||||
# Load base datasets
|
||||
for name, opts in zip(dataset_names, options):
|
||||
if name in dataset_loaders:
|
||||
logger.info(f"Loading base dataset {name}")
|
||||
try:
|
||||
datasets.append(dataset_loaders[name]())
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading {name}: {e}")
|
||||
|
||||
# Load dynamic datasets and apply transformations
|
||||
for name, opts in zip(dataset_names, options):
|
||||
if name in dynamic_loaders:
|
||||
logger.info(f"Loading dynamic dataset {name}")
|
||||
try:
|
||||
dynamic_result = dynamic_loaders[name](opts)
|
||||
datasets.extend(dynamic_result)
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading dynamic {name}: {e}")
|
||||
elif name == "Steganography":
|
||||
datasets.extend(list(StenographyTransformer(datasets).transform()))
|
||||
|
||||
return datasets
|
||||
|
||||
Reference in New Issue
Block a user