mirror of
https://github.com/msoedov/agentic_security.git
synced 2026-06-23 21:59:57 +02:00
feat(Integrated Garak):
This commit is contained in:
@@ -3,3 +3,5 @@
|
||||
.web
|
||||
__pycache__/
|
||||
failures.csv
|
||||
runs/
|
||||
*.todo
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
## Features
|
||||
|
||||
- Customizable Rule Sets or Agent based attacks🛠️
|
||||
- Comprehansive fuzzing for any LLMs 🧪
|
||||
- Comprehensive fuzzing for any LLMs 🧪
|
||||
- LLM API integration and stress testing 🛠️
|
||||
- Wide range of fuzzing and attack techniques 🌀
|
||||
|
||||
@@ -117,7 +117,7 @@ Content-Type: application/json
|
||||
"prompt": "<<PROMPT>>"
|
||||
}
|
||||
"""
|
||||
result = AgenticSecurity.scan(spec)
|
||||
result = AgenticSecurity.scan(llmSpec=spec)
|
||||
|
||||
# module: failure rate
|
||||
# {"Local CSV": 79.65116279069767, "llm-adaptive-attacks": 20.0}
|
||||
|
||||
@@ -10,7 +10,9 @@ from agentic_security.app import app
|
||||
class T:
|
||||
def server(self, port=8718, host="0.0.0.0"):
|
||||
sys.path.append(os.path.dirname("."))
|
||||
config = uvicorn.Config(app, port=port, host=host, log_level="info")
|
||||
config = uvicorn.Config(
|
||||
app, port=port, host=host, log_level="info", reload=True
|
||||
)
|
||||
server = uvicorn.Server(config)
|
||||
server.run()
|
||||
return
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import random
|
||||
import sys
|
||||
from asyncio import Event, Queue
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
@@ -39,6 +40,9 @@ app.add_middleware(
|
||||
allow_headers=["*"], # Allows all headers
|
||||
)
|
||||
|
||||
tools_inbox = Queue()
|
||||
FEATURE_PROXY = False
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
@@ -88,6 +92,7 @@ def streaming_response_generator(scan_parameters: Scan):
|
||||
request_factory=request_factory,
|
||||
max_budget=scan_parameters.maxBudget,
|
||||
datasets=scan_parameters.datasets,
|
||||
tools_inbox=tools_inbox,
|
||||
):
|
||||
yield scan_result + "\n" # Adding a newline for separation
|
||||
|
||||
@@ -149,3 +154,55 @@ class Table(BaseModel):
|
||||
async def get_plot(table: Table):
|
||||
buf = plot_security_report(table.table)
|
||||
return StreamingResponse(buf, media_type="image/jpeg")
|
||||
|
||||
|
||||
class Message(BaseModel):
|
||||
role: str
|
||||
content: str
|
||||
|
||||
|
||||
class CompletionRequest(BaseModel):
|
||||
model: str
|
||||
messages: list[Message]
|
||||
temperature: float
|
||||
top_p: float
|
||||
n: int
|
||||
stop: list[str]
|
||||
max_tokens: int
|
||||
presence_penalty: float
|
||||
frequency_penalty: float
|
||||
|
||||
|
||||
# OpenAI proxy endpoint
|
||||
@app.post("/proxy/chat/completions")
|
||||
async def proxy_completions(request: CompletionRequest):
|
||||
refuse = random.random() < 0.2
|
||||
message = random.choice(REFUSAL_MARKS) if refuse else "This is a test!"
|
||||
prompt_content = " ".join(
|
||||
[msg.content for msg in request.messages if msg.role == "user"]
|
||||
)
|
||||
message = prompt_content + " " + message
|
||||
ready = Event()
|
||||
ref = dict(message=message, reply="", ready=ready)
|
||||
tools_inbox.put_nowait(ref)
|
||||
if FEATURE_PROXY:
|
||||
# Proxy to agent
|
||||
await ready.wait()
|
||||
reply = ref["reply"]
|
||||
return reply
|
||||
# Simulate a completion response
|
||||
return {
|
||||
"id": "chatcmpl-abc123",
|
||||
"object": "chat.completion",
|
||||
"created": 1677858242,
|
||||
"model": "gpt-3.5-turbo-0613",
|
||||
"usage": {"prompt_tokens": 13, "completion_tokens": 7, "total_tokens": 20},
|
||||
"choices": [
|
||||
{
|
||||
"message": {"role": "assistant", "content": message},
|
||||
"logprobs": None,
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
@@ -3,9 +3,10 @@ import json
|
||||
|
||||
import colorama
|
||||
import tqdm.asyncio
|
||||
from tabulate import tabulate
|
||||
|
||||
from agentic_security.app import Scan, streaming_response_generator
|
||||
from agentic_security.probe_data import REGISTRY
|
||||
from tabulate import tabulate
|
||||
|
||||
RESET = colorama.Style.RESET_ALL
|
||||
BRIGHT = colorama.Style.BRIGHT
|
||||
@@ -25,7 +26,6 @@ Content-Type: application/json
|
||||
|
||||
|
||||
class AgenticSecurity:
|
||||
|
||||
@classmethod
|
||||
async def async_scan(
|
||||
self, llmSpec: str, maxBudget: int, datasets: list[dict], max_th: float
|
||||
|
||||
@@ -30,7 +30,18 @@ class ScanResult(BaseModel):
|
||||
).model_dump_json()
|
||||
|
||||
|
||||
async def perform_scan(request_factory, max_budget: int, datasets: list[dict] = []):
|
||||
async def prompt_iter(prompts):
|
||||
if isinstance(prompts, list):
|
||||
for p in prompts:
|
||||
yield p
|
||||
return
|
||||
async for p in prompts:
|
||||
yield p
|
||||
|
||||
|
||||
async def perform_scan(
|
||||
request_factory, max_budget: int, datasets: list[dict] = [], tools_inbox=None
|
||||
):
|
||||
yield ScanResult.status_msg("Loading datasets...")
|
||||
if IS_VERCEL:
|
||||
yield ScanResult.status_msg(
|
||||
@@ -40,20 +51,24 @@ async def perform_scan(request_factory, max_budget: int, datasets: list[dict] =
|
||||
prompt_modules = prepare_prompts(
|
||||
dataset_names=[m["dataset_name"] for m in datasets if m["selected"]],
|
||||
budget=max_budget,
|
||||
tools_inbox=tools_inbox,
|
||||
)
|
||||
yield ScanResult.status_msg("Datasets loaded. Starting scan...")
|
||||
|
||||
errors = []
|
||||
refusals = []
|
||||
size = sum(len(m.prompts) for m in prompt_modules)
|
||||
size = sum(len(m.prompts) for m in prompt_modules if not m.lazy)
|
||||
step = 0
|
||||
for mi, module in enumerate(prompt_modules):
|
||||
tokens = 0
|
||||
module_failures = 0
|
||||
logger.info(f"Scanning {module.dataset_name} {len(module.prompts)}")
|
||||
for i, prompt in enumerate(module.prompts):
|
||||
size = 0 if module.lazy else len(module.prompts)
|
||||
logger.info(f"Scanning {module.dataset_name} {size}")
|
||||
i = 0
|
||||
async for prompt in prompt_iter(module.prompts):
|
||||
i += 1
|
||||
step += 1
|
||||
progress = 100 * (step) / size
|
||||
progress = 100 * (step) / size if size else 0
|
||||
|
||||
# Naive token count
|
||||
tokens += len(prompt.split())
|
||||
@@ -86,12 +101,13 @@ async def perform_scan(request_factory, max_budget: int, datasets: list[dict] =
|
||||
module_failures += 1
|
||||
# Naive token count for llm response
|
||||
tokens += len(r.text.split())
|
||||
total = size if size else i
|
||||
yield ScanResult(
|
||||
module=module.dataset_name,
|
||||
tokens=round(tokens / 1000, 1),
|
||||
cost=round(tokens * 1.5 / 1000_000, 2),
|
||||
progress=round(progress, 2),
|
||||
failureRate=100 * module_failures / max(len(module.prompts), 1),
|
||||
failureRate=100 * module_failures / max(total, 1),
|
||||
).model_dump_json()
|
||||
yield ScanResult.status_msg("Done.")
|
||||
import pandas as pd
|
||||
|
||||
@@ -127,6 +127,15 @@ REGISTRY = [
|
||||
"selected": False,
|
||||
"url": "https://github.com/tml-epfl/llm-adaptive-attacks",
|
||||
},
|
||||
{
|
||||
"dataset_name": "Garak",
|
||||
"num_prompts": 0,
|
||||
"tokens": 0,
|
||||
"approx_cost": 0.0,
|
||||
"source": "Github: https://github.com/leondz/garak",
|
||||
"selected": False,
|
||||
"url": "https://github.com/leondz/garak",
|
||||
},
|
||||
{
|
||||
"dataset_name": "Custom CSV",
|
||||
"num_prompts": len(load_local_csv().prompts),
|
||||
|
||||
@@ -7,7 +7,7 @@ import pandas as pd
|
||||
from loguru import logger
|
||||
|
||||
from agentic_security.probe_data import stenography_fn
|
||||
from agentic_security.probe_data.modules import adaptive_attacks
|
||||
from agentic_security.probe_data.modules import adaptive_attacks, garak_tool
|
||||
|
||||
IS_VERCEL = os.getenv("IS_VERCEL", "f") == "t"
|
||||
|
||||
@@ -32,6 +32,7 @@ class ProbeDataset:
|
||||
prompts: list[str]
|
||||
tokens: int
|
||||
approx_cost: float
|
||||
lazy: bool = False
|
||||
|
||||
def metadata_summary(self):
|
||||
return {
|
||||
@@ -168,10 +169,7 @@ def load_dataset_v5():
|
||||
)
|
||||
|
||||
|
||||
def prepare_prompts(
|
||||
dataset_names,
|
||||
budget,
|
||||
):
|
||||
def prepare_prompts(dataset_names, budget, tools_inbox=None):
|
||||
# ## Datasets used and cleaned:
|
||||
# markush1/LLM-Jailbreak-Classifier
|
||||
# 1. Open-Orca/OpenOrca
|
||||
@@ -203,6 +201,11 @@ def prepare_prompts(
|
||||
"llm-adaptive-attacks": lambda: dataset_from_iterator(
|
||||
"llm-adaptive-attacks", adaptive_attacks.Module(group).apply()
|
||||
),
|
||||
"Garak": lambda: dataset_from_iterator(
|
||||
"Garak",
|
||||
garak_tool.Module(group, tools_inbox=tools_inbox).apply(),
|
||||
lazy=True,
|
||||
),
|
||||
"GPT fuzzer": lambda: [],
|
||||
}
|
||||
|
||||
@@ -217,22 +220,6 @@ def prepare_prompts(
|
||||
return group + dynamic_groups
|
||||
|
||||
|
||||
class MutationFn:
|
||||
def __init__(self, mutation_fn):
|
||||
self.mutation_fn = mutation_fn
|
||||
self.mutation_fn_name = mutation_fn.__name__
|
||||
self.input = ""
|
||||
self.output = ""
|
||||
|
||||
def __call__(self, prompt):
|
||||
self.input = prompt
|
||||
self.output = self.mutation_fn(prompt)
|
||||
return self.output
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.mutation_fn_name}({self.input}) => {self.output}"
|
||||
|
||||
|
||||
class Stenography:
|
||||
fn_library = {
|
||||
"rot5": stenography_fn.rot5,
|
||||
@@ -295,7 +282,7 @@ def load_local_csv() -> ProbeDataset:
|
||||
)
|
||||
|
||||
|
||||
def dataset_from_iterator(name: str, iterator) -> list:
|
||||
def dataset_from_iterator(name: str, iterator, lazy=False) -> list:
|
||||
"""Convert an iterator into a list of prompts and create a ProbeDataset
|
||||
object.
|
||||
|
||||
@@ -306,9 +293,14 @@ def dataset_from_iterator(name: str, iterator) -> list:
|
||||
Returns:
|
||||
list: A list containing a single ProbeDataset object.
|
||||
"""
|
||||
prompts = list(iterator)
|
||||
tokens = count_words_in_list(prompts)
|
||||
prompts = list(iterator) if not lazy else iterator
|
||||
tokens = count_words_in_list(prompts) if not lazy else 0
|
||||
dataset = ProbeDataset(
|
||||
dataset_name=name, metadata={}, prompts=prompts, tokens=tokens, approx_cost=0.0
|
||||
dataset_name=name,
|
||||
metadata={},
|
||||
prompts=prompts,
|
||||
tokens=tokens,
|
||||
approx_cost=0.0,
|
||||
lazy=lazy,
|
||||
)
|
||||
return [dataset]
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
import subprocess
|
||||
import os
|
||||
import asyncio
|
||||
from loguru import logger
|
||||
import asyncio
|
||||
|
||||
|
||||
class Module:
|
||||
|
||||
def __init__(self, prompt_groups: [], tools_inbox: asyncio.Queue):
|
||||
self.tools_inbox = tools_inbox
|
||||
|
||||
async def apply(self) -> []:
|
||||
env = os.environ.copy()
|
||||
env["OPENAI_API_BASE"] = "http://0.0.0.0:8718/proxy"
|
||||
|
||||
# Command to be executed
|
||||
command = [
|
||||
"python3",
|
||||
"-m",
|
||||
"garak",
|
||||
"--model_type",
|
||||
"openai",
|
||||
"--model_name",
|
||||
"gpt-3.5-turbo",
|
||||
"--probes",
|
||||
"encoding",
|
||||
]
|
||||
logger.info(f"Executing command: {command}")
|
||||
# Execute the command with the specific environment
|
||||
process = subprocess.Popen(
|
||||
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env
|
||||
)
|
||||
out, err = await asyncio.to_thread(process.communicate)
|
||||
|
||||
is_empty = self.tools_inbox.empty()
|
||||
logger.info(f"Is inbox empty? {is_empty}")
|
||||
while not self.tools_inbox.empty():
|
||||
ref = self.tools_inbox.get_nowait()
|
||||
message, _, ready = ref["message"], ref["reply"], ref["ready"]
|
||||
yield message
|
||||
ready.set()
|
||||
logger.info("Garak tool finished.")
|
||||
logger.info(f"stdout: {out}")
|
||||
logger.error(f"exit code: {process.returncode}")
|
||||
@@ -1,6 +1,7 @@
|
||||
from agentic_security.lib import REGISTRY, AgenticSecurity
|
||||
from inline_snapshot import snapshot
|
||||
|
||||
from agentic_security.lib import REGISTRY, AgenticSecurity
|
||||
|
||||
SAMPLE_SPEC = """
|
||||
POST http://0.0.0.0:8718/v1/self-probe
|
||||
Authorization: Bearer XXXXX
|
||||
|
||||
Reference in New Issue
Block a user