feat(Integrated Garak):

This commit is contained in:
Alexander Myasoedov
2024-04-27 21:44:38 +03:00
parent fa209684d9
commit 74461efaa0
10 changed files with 161 additions and 37 deletions
+2
View File
@@ -3,3 +3,5 @@
.web
__pycache__/
failures.csv
runs/
*.todo
+2 -2
View File
@@ -24,7 +24,7 @@
## Features
- Customizable rule sets or agent-based attacks 🛠️
- Comprehansive fuzzing for any LLMs 🧪
- Comprehensive fuzzing for any LLMs 🧪
- LLM API integration and stress testing 🛠️
- Wide range of fuzzing and attack techniques 🌀
@@ -117,7 +117,7 @@ Content-Type: application/json
"prompt": "<<PROMPT>>"
}
"""
result = AgenticSecurity.scan(spec)
result = AgenticSecurity.scan(llmSpec=spec)
# module: failure rate
# {"Local CSV": 79.65116279069767, "llm-adaptive-attacks": 20.0}
+3 -1
View File
@@ -10,7 +10,9 @@ from agentic_security.app import app
class T:
def server(self, port=8718, host="0.0.0.0"):
sys.path.append(os.path.dirname("."))
config = uvicorn.Config(app, port=port, host=host, log_level="info")
config = uvicorn.Config(
app, port=port, host=host, log_level="info", reload=True
)
server = uvicorn.Server(config)
server.run()
return
+57
View File
@@ -1,5 +1,6 @@
import random
import sys
from asyncio import Event, Queue
from datetime import datetime
from pathlib import Path
@@ -39,6 +40,9 @@ app.add_middleware(
allow_headers=["*"], # Allows all headers
)
tools_inbox = Queue()
FEATURE_PROXY = False
@app.get("/")
async def root():
@@ -88,6 +92,7 @@ def streaming_response_generator(scan_parameters: Scan):
request_factory=request_factory,
max_budget=scan_parameters.maxBudget,
datasets=scan_parameters.datasets,
tools_inbox=tools_inbox,
):
yield scan_result + "\n" # Adding a newline for separation
@@ -149,3 +154,55 @@ class Table(BaseModel):
async def get_plot(table: Table):
buf = plot_security_report(table.table)
return StreamingResponse(buf, media_type="image/jpeg")
class Message(BaseModel):
    """A single chat message: the speaker's role and the text it carries."""

    # Role label of the author; the proxy handler selects messages whose
    # role equals "user".
    role: str
    # Text of the message.
    content: str
class CompletionRequest(BaseModel):
    """Request body accepted by the ``/proxy/chat/completions`` endpoint.

    NOTE(review): none of the fields declares a default, so presumably every
    one of them must be present in the incoming payload - confirm against the
    clients that call this endpoint.
    """

    # Model identifier requested by the client.
    model: str
    # Conversation history, in the order sent by the client.
    messages: list[Message]
    # Sampling / generation parameters. The field names look like the OpenAI
    # chat-completions parameters - TODO confirm the intended semantics.
    temperature: float
    top_p: float
    n: int
    stop: list[str]
    max_tokens: int
    presence_penalty: float
    frequency_penalty: float
# OpenAI proxy endpoint
@app.post("/proxy/chat/completions")
async def proxy_completions(request: CompletionRequest):
    """Mock chat-completions endpoint that records every incoming prompt.

    The contents of all ``user`` messages are joined into one prompt and a
    canned answer is appended (a random refusal marker about 20% of the time,
    otherwise a fixed test string). The combined text is queued on
    ``tools_inbox`` together with an ``Event`` the consumer may set.

    When ``FEATURE_PROXY`` is enabled the handler waits for that event and
    answers with the consumer-provided ``reply``; otherwise it answers right
    away with the combined text.

    Returns:
        A chat-completion shaped dict. In proxy mode a non-string ``reply``
        is returned unchanged, on the assumption that the consumer already
        built a full response body.
    """
    refuse = random.random() < 0.2
    canned = random.choice(REFUSAL_MARKS) if refuse else "This is a test!"
    prompt_content = " ".join(
        msg.content for msg in request.messages if msg.role == "user"
    )
    message = prompt_content + " " + canned

    # Hand the prompt to whoever drains the inbox; ``ready`` lets that
    # consumer signal back once it has dealt with the entry.
    ready = Event()
    ref = dict(message=message, reply="", ready=ready)
    tools_inbox.put_nowait(ref)

    content = message
    if FEATURE_PROXY:
        # Proxy to agent: block until the consumer has processed the entry.
        await ready.wait()
        reply = ref["reply"]
        if not isinstance(reply, str):
            return reply
        # A bare string is not a valid chat-completion body, so wrap it in
        # the same envelope used for the simulated answer.
        content = reply

    # Simulate a completion response
    return {
        "id": "chatcmpl-abc123",
        "object": "chat.completion",
        "created": 1677858242,
        "model": "gpt-3.5-turbo-0613",
        "usage": {"prompt_tokens": 13, "completion_tokens": 7, "total_tokens": 20},
        "choices": [
            {
                "message": {"role": "assistant", "content": content},
                "logprobs": None,
                "finish_reason": "stop",
                "index": 0,
            }
        ],
    }
+2 -2
View File
@@ -3,9 +3,10 @@ import json
import colorama
import tqdm.asyncio
from tabulate import tabulate
from agentic_security.app import Scan, streaming_response_generator
from agentic_security.probe_data import REGISTRY
from tabulate import tabulate
RESET = colorama.Style.RESET_ALL
BRIGHT = colorama.Style.BRIGHT
@@ -25,7 +26,6 @@ Content-Type: application/json
class AgenticSecurity:
@classmethod
async def async_scan(
self, llmSpec: str, maxBudget: int, datasets: list[dict], max_th: float
+22 -6
View File
@@ -30,7 +30,18 @@ class ScanResult(BaseModel):
).model_dump_json()
async def perform_scan(request_factory, max_budget: int, datasets: list[dict] = []):
async def prompt_iter(prompts):
    """Yield prompts one at a time from a sync or async source.

    Args:
        prompts: Either an asynchronous iterable (anything exposing
            ``__aiter__``, e.g. an async generator) or a regular iterable
            such as a list, tuple or generator.

    Yields:
        Each item of ``prompts`` in its original order.
    """
    # Dispatch on the async-iteration protocol rather than on ``list`` so
    # that tuples, generators and other plain iterables work as well.
    if hasattr(prompts, "__aiter__"):
        async for p in prompts:
            yield p
        return
    for p in prompts:
        yield p
async def perform_scan(
request_factory, max_budget: int, datasets: list[dict] = [], tools_inbox=None
):
yield ScanResult.status_msg("Loading datasets...")
if IS_VERCEL:
yield ScanResult.status_msg(
@@ -40,20 +51,24 @@ async def perform_scan(request_factory, max_budget: int, datasets: list[dict] =
prompt_modules = prepare_prompts(
dataset_names=[m["dataset_name"] for m in datasets if m["selected"]],
budget=max_budget,
tools_inbox=tools_inbox,
)
yield ScanResult.status_msg("Datasets loaded. Starting scan...")
errors = []
refusals = []
size = sum(len(m.prompts) for m in prompt_modules)
size = sum(len(m.prompts) for m in prompt_modules if not m.lazy)
step = 0
for mi, module in enumerate(prompt_modules):
tokens = 0
module_failures = 0
logger.info(f"Scanning {module.dataset_name} {len(module.prompts)}")
for i, prompt in enumerate(module.prompts):
size = 0 if module.lazy else len(module.prompts)
logger.info(f"Scanning {module.dataset_name} {size}")
i = 0
async for prompt in prompt_iter(module.prompts):
i += 1
step += 1
progress = 100 * (step) / size
progress = 100 * (step) / size if size else 0
# Naive token count
tokens += len(prompt.split())
@@ -86,12 +101,13 @@ async def perform_scan(request_factory, max_budget: int, datasets: list[dict] =
module_failures += 1
# Naive token count for llm response
tokens += len(r.text.split())
total = size if size else i
yield ScanResult(
module=module.dataset_name,
tokens=round(tokens / 1000, 1),
cost=round(tokens * 1.5 / 1000_000, 2),
progress=round(progress, 2),
failureRate=100 * module_failures / max(len(module.prompts), 1),
failureRate=100 * module_failures / max(total, 1),
).model_dump_json()
yield ScanResult.status_msg("Done.")
import pandas as pd
+9
View File
@@ -127,6 +127,15 @@ REGISTRY = [
"selected": False,
"url": "https://github.com/tml-epfl/llm-adaptive-attacks",
},
{
"dataset_name": "Garak",
"num_prompts": 0,
"tokens": 0,
"approx_cost": 0.0,
"source": "Github: https://github.com/leondz/garak",
"selected": False,
"url": "https://github.com/leondz/garak",
},
{
"dataset_name": "Custom CSV",
"num_prompts": len(load_local_csv().prompts),
+17 -25
View File
@@ -7,7 +7,7 @@ import pandas as pd
from loguru import logger
from agentic_security.probe_data import stenography_fn
from agentic_security.probe_data.modules import adaptive_attacks
from agentic_security.probe_data.modules import adaptive_attacks, garak_tool
IS_VERCEL = os.getenv("IS_VERCEL", "f") == "t"
@@ -32,6 +32,7 @@ class ProbeDataset:
prompts: list[str]
tokens: int
approx_cost: float
lazy: bool = False
def metadata_summary(self):
return {
@@ -168,10 +169,7 @@ def load_dataset_v5():
)
def prepare_prompts(
dataset_names,
budget,
):
def prepare_prompts(dataset_names, budget, tools_inbox=None):
# ## Datasets used and cleaned:
# markush1/LLM-Jailbreak-Classifier
# 1. Open-Orca/OpenOrca
@@ -203,6 +201,11 @@ def prepare_prompts(
"llm-adaptive-attacks": lambda: dataset_from_iterator(
"llm-adaptive-attacks", adaptive_attacks.Module(group).apply()
),
"Garak": lambda: dataset_from_iterator(
"Garak",
garak_tool.Module(group, tools_inbox=tools_inbox).apply(),
lazy=True,
),
"GPT fuzzer": lambda: [],
}
@@ -217,22 +220,6 @@ def prepare_prompts(
return group + dynamic_groups
class MutationFn:
    """Callable wrapper that remembers the last input and output of a mutation.

    Attributes are updated on every call so that ``str(instance)`` describes
    the most recent transformation.
    """

    def __init__(self, mutation_fn):
        """Wrap ``mutation_fn``, a callable taking a prompt and returning one."""
        self.mutation_fn = mutation_fn
        # Not every callable has ``__name__`` (e.g. ``functools.partial`` or
        # instances defining ``__call__``); fall back to the type name.
        self.mutation_fn_name = getattr(
            mutation_fn, "__name__", type(mutation_fn).__name__
        )
        self.input = ""
        self.output = ""

    def __call__(self, prompt):
        """Apply the wrapped function to ``prompt`` and return its result."""
        # Record the input first so it is available even if the call raises.
        self.input = prompt
        self.output = self.mutation_fn(prompt)
        return self.output

    def __str__(self):
        return f"{self.mutation_fn_name}({self.input}) => {self.output}"
class Stenography:
fn_library = {
"rot5": stenography_fn.rot5,
@@ -295,7 +282,7 @@ def load_local_csv() -> ProbeDataset:
)
def dataset_from_iterator(name: str, iterator) -> list:
def dataset_from_iterator(name: str, iterator, lazy=False) -> list:
"""Convert an iterator into a list of prompts and create a ProbeDataset
object.
@@ -306,9 +293,14 @@ def dataset_from_iterator(name: str, iterator) -> list:
Returns:
list: A list containing a single ProbeDataset object.
"""
prompts = list(iterator)
tokens = count_words_in_list(prompts)
prompts = list(iterator) if not lazy else iterator
tokens = count_words_in_list(prompts) if not lazy else 0
dataset = ProbeDataset(
dataset_name=name, metadata={}, prompts=prompts, tokens=tokens, approx_cost=0.0
dataset_name=name,
metadata={},
prompts=prompts,
tokens=tokens,
approx_cost=0.0,
lazy=lazy,
)
return [dataset]
@@ -0,0 +1,45 @@
import subprocess
import os
import asyncio
from loguru import logger
import asyncio
class Module:
    """Prompt source that collects prompts produced by a Garak run.

    Garak is started as a subprocess with ``OPENAI_API_BASE`` pointing at a
    local proxy URL. After the process exits, every entry found in
    ``tools_inbox`` is drained and its ``message`` is yielded as a prompt.
    Inbox entries are expected to be mappings with ``message``, ``reply`` and
    ``ready`` keys, where ``ready`` exposes a ``set()`` method.
    """

    def __init__(self, prompt_groups: list, tools_inbox: "asyncio.Queue | None" = None):
        """Store the inbox to drain.

        Args:
            prompt_groups: Accepted for call-site compatibility; not used here.
            tools_inbox: Queue holding the captured requests. May be ``None``,
                in which case ``apply`` yields nothing.
        """
        self.tools_inbox = tools_inbox

    async def apply(self):
        """Run Garak and yield every prompt captured in the inbox.

        Yields:
            The ``message`` value of each queued entry, in queue order. The
            entry's ``ready`` event is set after the consumer has resumed the
            generator past that item.
        """
        if self.tools_inbox is None:
            # Without an inbox there is nowhere to read prompts from, so
            # launching Garak would only waste time.
            logger.warning("Garak tool has no inbox to read from; skipping.")
            return

        env = os.environ.copy()
        env["OPENAI_API_BASE"] = "http://0.0.0.0:8718/proxy"

        # Command to be executed
        command = [
            "python3",
            "-m",
            "garak",
            "--model_type",
            "openai",
            "--model_name",
            "gpt-3.5-turbo",
            "--probes",
            "encoding",
        ]
        logger.info(f"Executing command: {command}")

        # Execute the command with the specific environment
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env
        )
        # ``communicate`` blocks, so run it in a worker thread to keep the
        # event loop free while Garak is running.
        out, err = await asyncio.to_thread(process.communicate)

        is_empty = self.tools_inbox.empty()
        logger.info(f"Is inbox empty? {is_empty}")
        while not self.tools_inbox.empty():
            ref = self.tools_inbox.get_nowait()
            yield ref["message"]
            # Signal the producer only after the prompt has been consumed.
            ref["ready"].set()

        logger.info("Garak tool finished.")
        logger.info(f"stdout: {out}")
        if err:
            logger.warning(f"stderr: {err}")
        # Reserve the error level for real failures.
        if process.returncode:
            logger.error(f"exit code: {process.returncode}")
        else:
            logger.info(f"exit code: {process.returncode}")
+2 -1
View File
@@ -1,6 +1,7 @@
from agentic_security.lib import REGISTRY, AgenticSecurity
from inline_snapshot import snapshot
from agentic_security.lib import REGISTRY, AgenticSecurity
SAMPLE_SPEC = """
POST http://0.0.0.0:8718/v1/self-probe
Authorization: Bearer XXXXX