mirror of
https://github.com/msoedov/agentic_security.git
synced 2026-06-24 06:09:55 +02:00
5e5469a1a7
msj_data.py contained a full copy of the ProbeDataset dataclass that
was already defined canonically in probe_data/models.py, violating DRY
and leaving a stale TODO comment in the source.
Changes:
- probe_data/msj_data.py: delete the 19-line duplicate ProbeDataset
definition and the now-unused 'from dataclasses import dataclass'
import; replace with a single re-export:
from agentic_security.probe_data.models import ProbeDataset
All call-sites inside the file (load_dataset_generic, prepare_prompts)
continue to work unchanged because the field signatures are identical.
The TODO comment is removed as the refactor is now complete.
No changes required in consumers (fuzzer.py, test_msj_data.py) because
they access ProbeDataset through msj_data's re-export.
36 lines
1.0 KiB
Python
36 lines
1.0 KiB
Python
from cache_to_disk import cache_to_disk # noqa
|
|
|
|
from agentic_security.probe_data.models import ProbeDataset
|
|
|
|
|
|
# @cache_to_disk(n_days_to_cache=1)
|
|
def load_dataset_generic(name, getter=lambda x: x["train"]["prompt"]):
    """Load the dataset called ``name`` and wrap its prompts in a ProbeDataset.

    Args:
        name: Dataset identifier handed unchanged to ``datasets.load_dataset``.
        getter: Callable that receives the loaded dataset object and returns
            the prompts to store. The default reads ``["train"]["prompt"]``.

    Returns:
        A ``ProbeDataset`` whose ``dataset_name`` is ``name``, whose
        ``prompts`` is whatever ``getter`` returned, with empty ``metadata``
        and ``tokens`` / ``approx_cost`` set to ``0`` / ``0.0``.
    """
    # Function-scope import: `datasets` is only imported when this is called.
    from datasets import load_dataset

    extracted_prompts = getter(load_dataset(name))
    probe_fields = {
        "dataset_name": name,
        "metadata": {},
        "prompts": extracted_prompts,
        "tokens": 0,
        "approx_cost": 0.0,
    }
    return ProbeDataset(**probe_fields)
|
|
|
|
|
|
def prepare_prompts(
    dataset_names=None, budget=-1, tools_inbox=None
) -> list[ProbeDataset]:
    """Load the built-in prompt datasets and return them as a list.

    Args:
        dataset_names: Accepted for interface compatibility; this function
            does not read it and always loads the full built-in set. The
            default is ``None`` rather than a shared mutable list.
        budget: Accepted for interface compatibility; not read here.
        tools_inbox: Accepted for interface compatibility; not read here.

    Returns:
        One ``ProbeDataset`` per built-in dataset name, in the order the
        names are listed below.
    """
    # Dataset names noted alongside this loader:
    # fka/awesome-chatgpt-prompts
    # data-is-better-together/10k_prompts_ranked
    # alespalla/chatbot_instruction_prompts
    builtin_names = (
        "data-is-better-together/10k_prompts_ranked",
        "fka/awesome-chatgpt-prompts",
    )
    # Every dataset is loaded eagerly, one call per name, in listed order.
    return [load_dataset_generic(name) for name in builtin_names]
|