From 74461efaa005a1bbc1e3e69e33b3835322c44f14 Mon Sep 17 00:00:00 2001 From: Alexander Myasoedov Date: Sat, 27 Apr 2024 21:44:38 +0300 Subject: [PATCH] feat(Integrated Garak): --- .gitignore | 2 + Readme.md | 4 +- agentic_security/__main__.py | 4 +- agentic_security/app.py | 57 +++++++++++++++++++ agentic_security/lib.py | 4 +- agentic_security/probe_actor/fuzzer.py | 28 +++++++-- agentic_security/probe_data/__init__.py | 9 +++ agentic_security/probe_data/data.py | 42 ++++++-------- .../probe_data/modules/garak_tool.py | 45 +++++++++++++++ agentic_security/test_lib.py | 3 +- 10 files changed, 161 insertions(+), 37 deletions(-) create mode 100644 agentic_security/probe_data/modules/garak_tool.py diff --git a/.gitignore b/.gitignore index bb0374b..8526644 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ .web __pycache__/ failures.csv +runs/ +*.todo diff --git a/Readme.md b/Readme.md index 8bae3c8..a983c20 100644 --- a/Readme.md +++ b/Readme.md @@ -24,7 +24,7 @@ ## Features - Customizable Rule Sets or Agent based attacks🛠️ -- Comprehansive fuzzing for any LLMs 🧪 +- Comprehensive fuzzing for any LLMs 🧪 - LLM API integration and stress testing 🛠️ - Wide range of fuzzing and attack techniques 🌀 @@ -117,7 +117,7 @@ Content-Type: application/json "prompt": "<>" } """ -result = AgenticSecurity.scan(spec) +result = AgenticSecurity.scan(llmSpec=spec) # module: failure rate # {"Local CSV": 79.65116279069767, "llm-adaptive-attacks": 20.0} diff --git a/agentic_security/__main__.py b/agentic_security/__main__.py index f2c5f9d..aaa0078 100644 --- a/agentic_security/__main__.py +++ b/agentic_security/__main__.py @@ -10,7 +10,9 @@ from agentic_security.app import app class T: def server(self, port=8718, host="0.0.0.0"): sys.path.append(os.path.dirname(".")) - config = uvicorn.Config(app, port=port, host=host, log_level="info") + config = uvicorn.Config( + app, port=port, host=host, log_level="info", reload=True + ) server = uvicorn.Server(config) server.run() return diff --git a/agentic_security/app.py b/agentic_security/app.py index 36bdea5..d3f40c8 100644 --- a/agentic_security/app.py +++ b/agentic_security/app.py @@ -1,5 +1,6 @@ import random import sys +from asyncio import Event, Queue from datetime import datetime from pathlib import Path @@ -39,6 +40,9 @@ app.add_middleware( allow_headers=["*"], # Allows all headers ) +tools_inbox = Queue() +FEATURE_PROXY = False + @app.get("/") async def root(): @@ -88,6 +92,7 @@ def streaming_response_generator(scan_parameters: Scan): request_factory=request_factory, max_budget=scan_parameters.maxBudget, datasets=scan_parameters.datasets, + tools_inbox=tools_inbox, ): yield scan_result + "\n" # Adding a newline for separation @@ -149,3 +154,55 @@ class Table(BaseModel): async def get_plot(table: Table): buf = plot_security_report(table.table) return StreamingResponse(buf, media_type="image/jpeg") + + +class Message(BaseModel): + role: str + content: str + + +class CompletionRequest(BaseModel): + model: str + messages: list[Message] + temperature: float + top_p: float + n: int + stop: list[str] + max_tokens: int + presence_penalty: float + frequency_penalty: float + + +# OpenAI proxy endpoint +@app.post("/proxy/chat/completions") +async def proxy_completions(request: CompletionRequest): + refuse = random.random() < 0.2 + message = random.choice(REFUSAL_MARKS) if refuse else "This is a test!" + prompt_content = " ".join( + [msg.content for msg in request.messages if msg.role == "user"] + ) + message = prompt_content + " " + message + ready = Event() + ref = dict(message=message, reply="", ready=ready) + tools_inbox.put_nowait(ref) + if FEATURE_PROXY: + # Proxy to agent + await ready.wait() + reply = ref["reply"] + return reply + # Simulate a completion response + return { + "id": "chatcmpl-abc123", + "object": "chat.completion", + "created": 1677858242, + "model": "gpt-3.5-turbo-0613", + "usage": {"prompt_tokens": 13, "completion_tokens": 7, "total_tokens": 20}, + "choices": [ + { + "message": {"role": "assistant", "content": message}, + "logprobs": None, + "finish_reason": "stop", + "index": 0, + } + ], + } diff --git a/agentic_security/lib.py b/agentic_security/lib.py index 2875091..51a2563 100644 --- a/agentic_security/lib.py +++ b/agentic_security/lib.py @@ -3,9 +3,10 @@ import json import colorama import tqdm.asyncio +from tabulate import tabulate + from agentic_security.app import Scan, streaming_response_generator from agentic_security.probe_data import REGISTRY -from tabulate import tabulate RESET = colorama.Style.RESET_ALL BRIGHT = colorama.Style.BRIGHT @@ -25,7 +26,6 @@ Content-Type: application/json class AgenticSecurity: - @classmethod async def async_scan( self, llmSpec: str, maxBudget: int, datasets: list[dict], max_th: float diff --git a/agentic_security/probe_actor/fuzzer.py b/agentic_security/probe_actor/fuzzer.py index 664e97d..b8e9fc4 100644 --- a/agentic_security/probe_actor/fuzzer.py +++ b/agentic_security/probe_actor/fuzzer.py @@ -30,7 +30,18 @@ class ScanResult(BaseModel): ).model_dump_json() -async def perform_scan(request_factory, max_budget: int, datasets: list[dict] = []): +async def prompt_iter(prompts): + if isinstance(prompts, list): + for p in prompts: + yield p + return + async for p in prompts: + yield p + + +async def perform_scan( + request_factory, max_budget: int, datasets: list[dict] = [], tools_inbox=None +): yield ScanResult.status_msg("Loading datasets...") if IS_VERCEL: yield ScanResult.status_msg( @@ -40,20 +51,24 @@ async def perform_scan(request_factory, max_budget: int, datasets: list[dict] = prompt_modules = prepare_prompts( dataset_names=[m["dataset_name"] for m in datasets if m["selected"]], budget=max_budget, + tools_inbox=tools_inbox, ) yield ScanResult.status_msg("Datasets loaded. Starting scan...") errors = [] refusals = [] - size = sum(len(m.prompts) for m in prompt_modules) + size = sum(len(m.prompts) for m in prompt_modules if not m.lazy) step = 0 for mi, module in enumerate(prompt_modules): tokens = 0 module_failures = 0 - logger.info(f"Scanning {module.dataset_name} {len(module.prompts)}") - for i, prompt in enumerate(module.prompts): + size = 0 if module.lazy else len(module.prompts) + logger.info(f"Scanning {module.dataset_name} {size}") + i = 0 + async for prompt in prompt_iter(module.prompts): + i += 1 step += 1 - progress = 100 * (step) / size + progress = 100 * (step) / size if size else 0 # Naive token count tokens += len(prompt.split()) @@ -86,12 +101,13 @@ async def perform_scan(request_factory, max_budget: int, datasets: list[dict] = module_failures += 1 # Naive token count for llm response tokens += len(r.text.split()) + total = size if size else i yield ScanResult( module=module.dataset_name, tokens=round(tokens / 1000, 1), cost=round(tokens * 1.5 / 1000_000, 2), progress=round(progress, 2), - failureRate=100 * module_failures / max(len(module.prompts), 1), + failureRate=100 * module_failures / max(total, 1), ).model_dump_json() yield ScanResult.status_msg("Done.") import pandas as pd diff --git a/agentic_security/probe_data/__init__.py b/agentic_security/probe_data/__init__.py index 95023e6..8fe4283 100644 --- a/agentic_security/probe_data/__init__.py +++ b/agentic_security/probe_data/__init__.py @@ -127,6 +127,15 @@ REGISTRY = [ "selected": False, "url": "https://github.com/tml-epfl/llm-adaptive-attacks", }, + { + "dataset_name": "Garak", + "num_prompts": 0, + "tokens": 0, + "approx_cost": 0.0, + "source": "Github: https://github.com/leondz/garak", + "selected": False, + "url": "https://github.com/leondz/garak", + }, { "dataset_name": "Custom CSV", "num_prompts": len(load_local_csv().prompts), diff --git a/agentic_security/probe_data/data.py b/agentic_security/probe_data/data.py index e2541aa..a3ea64e 100644 --- a/agentic_security/probe_data/data.py +++ b/agentic_security/probe_data/data.py @@ -7,7 +7,7 @@ import pandas as pd from loguru import logger from agentic_security.probe_data import stenography_fn -from agentic_security.probe_data.modules import adaptive_attacks +from agentic_security.probe_data.modules import adaptive_attacks, garak_tool IS_VERCEL = os.getenv("IS_VERCEL", "f") == "t" @@ -32,6 +32,7 @@ class ProbeDataset: prompts: list[str] tokens: int approx_cost: float + lazy: bool = False def metadata_summary(self): return { @@ -168,10 +169,7 @@ def load_dataset_v5(): ) -def prepare_prompts( - dataset_names, - budget, -): +def prepare_prompts(dataset_names, budget, tools_inbox=None): # ## Datasets used and cleaned: # markush1/LLM-Jailbreak-Classifier # 1. Open-Orca/OpenOrca @@ -203,6 +201,11 @@ def prepare_prompts( "llm-adaptive-attacks": lambda: dataset_from_iterator( "llm-adaptive-attacks", adaptive_attacks.Module(group).apply() ), + "Garak": lambda: dataset_from_iterator( + "Garak", + garak_tool.Module(group, tools_inbox=tools_inbox).apply(), + lazy=True, + ), "GPT fuzzer": lambda: [], } @@ -217,22 +220,6 @@ def prepare_prompts( return group + dynamic_groups -class MutationFn: - def __init__(self, mutation_fn): - self.mutation_fn = mutation_fn - self.mutation_fn_name = mutation_fn.__name__ - self.input = "" - self.output = "" - - def __call__(self, prompt): - self.input = prompt - self.output = self.mutation_fn(prompt) - return self.output - - def __str__(self): - return f"{self.mutation_fn_name}({self.input}) => {self.output}" - - class Stenography: fn_library = { "rot5": stenography_fn.rot5, @@ -295,7 +282,7 @@ def load_local_csv() -> ProbeDataset: ) -def dataset_from_iterator(name: str, iterator) -> list: +def dataset_from_iterator(name: str, iterator, lazy=False) -> list: """Convert an iterator into a list of prompts and create a ProbeDataset object. @@ -306,9 +293,14 @@ def dataset_from_iterator(name: str, iterator) -> list: Returns: list: A list containing a single ProbeDataset object. """ - prompts = list(iterator) - tokens = count_words_in_list(prompts) + prompts = list(iterator) if not lazy else iterator + tokens = count_words_in_list(prompts) if not lazy else 0 dataset = ProbeDataset( - dataset_name=name, metadata={}, prompts=prompts, tokens=tokens, approx_cost=0.0 + dataset_name=name, + metadata={}, + prompts=prompts, + tokens=tokens, + approx_cost=0.0, + lazy=lazy, ) return [dataset] diff --git a/agentic_security/probe_data/modules/garak_tool.py b/agentic_security/probe_data/modules/garak_tool.py new file mode 100644 index 0000000..a65560d --- /dev/null +++ b/agentic_security/probe_data/modules/garak_tool.py @@ -0,0 +1,45 @@ +import subprocess +import os +import asyncio +from loguru import logger +import asyncio + + +class Module: + + def __init__(self, prompt_groups: [], tools_inbox: asyncio.Queue): + self.tools_inbox = tools_inbox + + async def apply(self) -> []: + env = os.environ.copy() + env["OPENAI_API_BASE"] = "http://0.0.0.0:8718/proxy" + + # Command to be executed + command = [ + "python3", + "-m", + "garak", + "--model_type", + "openai", + "--model_name", + "gpt-3.5-turbo", + "--probes", + "encoding", + ] + logger.info(f"Executing command: {command}") + # Execute the command with the specific environment + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env + ) + out, err = await asyncio.to_thread(process.communicate) + + is_empty = self.tools_inbox.empty() + logger.info(f"Is inbox empty? {is_empty}") + while not self.tools_inbox.empty(): + ref = self.tools_inbox.get_nowait() + message, _, ready = ref["message"], ref["reply"], ref["ready"] + yield message + ready.set() + logger.info("Garak tool finished.") + logger.info(f"stdout: {out}") + logger.error(f"exit code: {process.returncode}") diff --git a/agentic_security/test_lib.py b/agentic_security/test_lib.py index b4dd322..c733fe1 100644 --- a/agentic_security/test_lib.py +++ b/agentic_security/test_lib.py @@ -1,6 +1,7 @@ -from agentic_security.lib import REGISTRY, AgenticSecurity from inline_snapshot import snapshot +from agentic_security.lib import REGISTRY, AgenticSecurity + SAMPLE_SPEC = """ POST http://0.0.0.0:8718/v1/self-probe Authorization: Bearer XXXXX