From 74461efaa005a1bbc1e3e69e33b3835322c44f14 Mon Sep 17 00:00:00 2001
From: Alexander Myasoedov <msoedov@gmail.com>
Date: Sat, 27 Apr 2024 21:44:38 +0300
Subject: [PATCH] feat(garak): integrate Garak as a lazy probe dataset

---
 .gitignore                                    |  2 +
 Readme.md                                     |  4 +-
 agentic_security/__main__.py                  |  4 +-
 agentic_security/app.py                       | 57 +++++++++++++++++++
 agentic_security/lib.py                       |  4 +-
 agentic_security/probe_actor/fuzzer.py        | 28 +++++++--
 agentic_security/probe_data/__init__.py       |  9 +++
 agentic_security/probe_data/data.py           | 42 ++++++--------
 .../probe_data/modules/garak_tool.py          | 45 +++++++++++++++
 agentic_security/test_lib.py                  |  3 +-
 10 files changed, 161 insertions(+), 37 deletions(-)
 create mode 100644 agentic_security/probe_data/modules/garak_tool.py
diff --git a/.gitignore b/.gitignore
index bb0374b..8526644 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@
 .web
 __pycache__/
 failures.csv
+runs/
+*.todo
diff --git a/Readme.md b/Readme.md
index 8bae3c8..a983c20 100644
--- a/Readme.md
+++ b/Readme.md
@@ -24,7 +24,7 @@
 ## Features
 
 - Customizable Rule Sets or Agent based attacks🛠️
-- Comprehansive fuzzing for any LLMs 🧪
+- Comprehensive fuzzing for any LLMs 🧪
 - LLM API integration and stress testing 🛠️
 - Wide range of fuzzing and attack techniques 🌀
 
@@ -117,7 +117,7 @@ Content-Type: application/json
     "prompt": "<<PROMPT>>"
 }
 """
-result = AgenticSecurity.scan(spec)
+result = AgenticSecurity.scan(llmSpec=spec)
 
 # module: failure rate
 # {"Local CSV": 79.65116279069767, "llm-adaptive-attacks": 20.0}
diff --git a/agentic_security/__main__.py b/agentic_security/__main__.py
index f2c5f9d..aaa0078 100644
--- a/agentic_security/__main__.py
+++ b/agentic_security/__main__.py
@@ -10,7 +10,9 @@ from agentic_security.app import app
 class T:
     def server(self, port=8718, host="0.0.0.0"):
         sys.path.append(os.path.dirname("."))
-        config = uvicorn.Config(app, port=port, host=host, log_level="info")
+        config = uvicorn.Config(
+            app, port=port, host=host, log_level="info", reload=True
+        )  # NOTE(review): reload likely inert with an app object; confirm
         server = uvicorn.Server(config)
         server.run()
         return
diff --git a/agentic_security/app.py b/agentic_security/app.py
index 36bdea5..d3f40c8 100644
--- a/agentic_security/app.py
+++ b/agentic_security/app.py
@@ -1,5 +1,6 @@
 import random
 import sys
+from asyncio import Event, Queue
 from datetime import datetime
 from pathlib import Path
 
@@ -39,6 +40,9 @@ app.add_middleware(
     allow_headers=["*"],  # Allows all headers
 )
 
+tools_inbox = Queue()  # refs queued by proxy_completions for the scanner
+FEATURE_PROXY = False  # True: proxy_completions awaits ref["ready"] first
+
 
 @app.get("/")
 async def root():
@@ -88,6 +92,7 @@ def streaming_response_generator(scan_parameters: Scan):
             request_factory=request_factory,
             max_budget=scan_parameters.maxBudget,
             datasets=scan_parameters.datasets,
+            tools_inbox=tools_inbox,
         ):
             yield scan_result + "\n"  # Adding a newline for separation
 
@@ -149,3 +154,55 @@ class Table(BaseModel):
 async def get_plot(table: Table):
     buf = plot_security_report(table.table)
     return StreamingResponse(buf, media_type="image/jpeg")
+
+
+class Message(BaseModel):  # one chat message inside a CompletionRequest
+    role: str  # proxy_completions only uses messages whose role is "user"
+    content: str
+
+
+class CompletionRequest(BaseModel):  # body of POST /proxy/chat/completions
+    model: str
+    messages: list[Message]  # the only field proxy_completions reads
+    temperature: float  # NOTE(review): no defaults below; do clients always send these?
+    top_p: float
+    n: int
+    stop: list[str]
+    max_tokens: int
+    presence_penalty: float
+    frequency_penalty: float
+
+
+# OpenAI-compatible chat completions endpoint (stub / proxy)
+@app.post("/proxy/chat/completions")
+async def proxy_completions(request: CompletionRequest):
+    """Queue the user prompt on tools_inbox and answer the caller.
+
+    Returns a canned completion unless FEATURE_PROXY is set, in which case
+    the handler waits for the entry's event and returns its "reply" value.
+    """
+    suffix = "This is a test!"
+    if random.random() < 0.2:
+        suffix = random.choice(REFUSAL_MARKS)
+    user_text = " ".join(m.content for m in request.messages if m.role == "user")
+    content = f"{user_text} {suffix}"
+    entry = {"message": content, "reply": "", "ready": Event()}
+    tools_inbox.put_nowait(entry)
+    if FEATURE_PROXY:
+        await entry["ready"].wait()
+        return entry["reply"]
+    choice = {
+        "message": {"role": "assistant", "content": content},
+        "logprobs": None,
+        "finish_reason": "stop",
+        "index": 0,
+    }
+    # Simulated completion: id, timestamp, model and usage are fixed values
+    return {
+        "id": "chatcmpl-abc123",
+        "object": "chat.completion",
+        "created": 1677858242,
+        "model": "gpt-3.5-turbo-0613",
+        "usage": {"prompt_tokens": 13, "completion_tokens": 7, "total_tokens": 20},
+        "choices": [choice],
+    }
diff --git a/agentic_security/lib.py b/agentic_security/lib.py
index 2875091..51a2563 100644
--- a/agentic_security/lib.py
+++ b/agentic_security/lib.py
@@ -3,9 +3,10 @@ import json
 
 import colorama
 import tqdm.asyncio
+from tabulate import tabulate
+
 from agentic_security.app import Scan, streaming_response_generator
 from agentic_security.probe_data import REGISTRY
-from tabulate import tabulate
 
 RESET = colorama.Style.RESET_ALL
 BRIGHT = colorama.Style.BRIGHT
@@ -25,7 +26,6 @@ Content-Type: application/json
 
 
 class AgenticSecurity:
-
     @classmethod
     async def async_scan(
         self, llmSpec: str, maxBudget: int, datasets: list[dict], max_th: float
diff --git a/agentic_security/probe_actor/fuzzer.py b/agentic_security/probe_actor/fuzzer.py
index 664e97d..b8e9fc4 100644
--- a/agentic_security/probe_actor/fuzzer.py
+++ b/agentic_security/probe_actor/fuzzer.py
@@ -30,7 +30,18 @@ class ScanResult(BaseModel):
         ).model_dump_json()
 
 
-async def perform_scan(request_factory, max_budget: int, datasets: list[dict] = []):
+async def prompt_iter(prompts):  # uniform async view of sync/async prompts
+    if hasattr(prompts, "__aiter__"):  # async generator (lazy datasets)
+        async for p in prompts:
+            yield p
+    else:  # any plain iterable: list, tuple, generator, ...
+        for p in prompts:
+            yield p
+
+
+async def perform_scan(
+    request_factory, max_budget: int, datasets: list[dict] = [], tools_inbox=None
+):
     yield ScanResult.status_msg("Loading datasets...")
     if IS_VERCEL:
         yield ScanResult.status_msg(
@@ -40,20 +51,24 @@ async def perform_scan(request_factory, max_budget: int, datasets: list[dict] =
     prompt_modules = prepare_prompts(
         dataset_names=[m["dataset_name"] for m in datasets if m["selected"]],
         budget=max_budget,
+        tools_inbox=tools_inbox,
     )
     yield ScanResult.status_msg("Datasets loaded. Starting scan...")
 
     errors = []
     refusals = []
-    size = sum(len(m.prompts) for m in prompt_modules)
+    size = sum(len(m.prompts) for m in prompt_modules if not m.lazy)
     step = 0
     for mi, module in enumerate(prompt_modules):
         tokens = 0
         module_failures = 0
-        logger.info(f"Scanning {module.dataset_name} {len(module.prompts)}")
-        for i, prompt in enumerate(module.prompts):
+        module_size = 0 if module.lazy else len(module.prompts)  # lazy: unknown
+        logger.info(f"Scanning {module.dataset_name} {module_size}")
+        i = 0
+        async for prompt in prompt_iter(module.prompts):
+            i += 1
             step += 1
-            progress = 100 * (step) / size
+            progress = 100 * (step) / size if size else 0  # size = scan total
 
             # Naive token count
             tokens += len(prompt.split())
@@ -86,12 +101,13 @@ async def perform_scan(request_factory, max_budget: int, datasets: list[dict] =
                 module_failures += 1
             # Naive token count for llm response
             tokens += len(r.text.split())
+            total = module_size if module_size else i
             yield ScanResult(
                 module=module.dataset_name,
                 tokens=round(tokens / 1000, 1),
                 cost=round(tokens * 1.5 / 1000_000, 2),
                 progress=round(progress, 2),
-                failureRate=100 * module_failures / max(len(module.prompts), 1),
+                failureRate=100 * module_failures / max(total, 1),
             ).model_dump_json()
     yield ScanResult.status_msg("Done.")
     import pandas as pd
diff --git a/agentic_security/probe_data/__init__.py b/agentic_security/probe_data/__init__.py
index 95023e6..8fe4283 100644
--- a/agentic_security/probe_data/__init__.py
+++ b/agentic_security/probe_data/__init__.py
@@ -127,6 +127,15 @@ REGISTRY = [
         "selected": False,
         "url": "https://github.com/tml-epfl/llm-adaptive-attacks",
     },
+    {
+        "dataset_name": "Garak",
+        "num_prompts": 0,
+        "tokens": 0,
+        "approx_cost": 0.0,
+        "source": "Github: https://github.com/leondz/garak",
+        "selected": False,
+        "url": "https://github.com/leondz/garak",
+    },
     {
         "dataset_name": "Custom CSV",
         "num_prompts": len(load_local_csv().prompts),
diff --git a/agentic_security/probe_data/data.py b/agentic_security/probe_data/data.py
index e2541aa..a3ea64e 100644
--- a/agentic_security/probe_data/data.py
+++ b/agentic_security/probe_data/data.py
@@ -7,7 +7,7 @@ import pandas as pd
 from loguru import logger
 
 from agentic_security.probe_data import stenography_fn
-from agentic_security.probe_data.modules import adaptive_attacks
+from agentic_security.probe_data.modules import adaptive_attacks, garak_tool
 
 IS_VERCEL = os.getenv("IS_VERCEL", "f") == "t"
 
@@ -32,6 +32,7 @@ class ProbeDataset:
     prompts: list[str]
     tokens: int
     approx_cost: float
+    lazy: bool = False
 
     def metadata_summary(self):
         return {
@@ -168,10 +169,7 @@ def load_dataset_v5():
     )
 
 
-def prepare_prompts(
-    dataset_names,
-    budget,
-):
+def prepare_prompts(dataset_names, budget, tools_inbox=None):
     # ## Datasets used and cleaned:
     # markush1/LLM-Jailbreak-Classifier
     # 1. Open-Orca/OpenOrca
@@ -203,6 +201,11 @@ def prepare_prompts(
         "llm-adaptive-attacks": lambda: dataset_from_iterator(
             "llm-adaptive-attacks", adaptive_attacks.Module(group).apply()
         ),
+        "Garak": lambda: dataset_from_iterator(
+            "Garak",
+            garak_tool.Module(group, tools_inbox=tools_inbox).apply(),
+            lazy=True,
+        ),
         "GPT fuzzer": lambda: [],
     }
 
@@ -217,22 +220,6 @@ def prepare_prompts(
     return group + dynamic_groups
 
 
-class MutationFn:
-    def __init__(self, mutation_fn):
-        self.mutation_fn = mutation_fn
-        self.mutation_fn_name = mutation_fn.__name__
-        self.input = ""
-        self.output = ""
-
-    def __call__(self, prompt):
-        self.input = prompt
-        self.output = self.mutation_fn(prompt)
-        return self.output
-
-    def __str__(self):
-        return f"{self.mutation_fn_name}({self.input}) => {self.output}"
-
-
 class Stenography:
     fn_library = {
         "rot5": stenography_fn.rot5,
@@ -295,7 +282,7 @@ def load_local_csv() -> ProbeDataset:
     )
 
 
-def dataset_from_iterator(name: str, iterator) -> list:
+def dataset_from_iterator(name: str, iterator, lazy=False) -> list:
     """Convert an iterator into a list of prompts and create a ProbeDataset
     object.
 
@@ -306,9 +293,14 @@ def dataset_from_iterator(name: str, iterator) -> list:
     Returns:
         list: A list containing a single ProbeDataset object.
     """
-    prompts = list(iterator)
-    tokens = count_words_in_list(prompts)
+    prompts = list(iterator) if not lazy else iterator
+    tokens = count_words_in_list(prompts) if not lazy else 0
     dataset = ProbeDataset(
-        dataset_name=name, metadata={}, prompts=prompts, tokens=tokens, approx_cost=0.0
+        dataset_name=name,
+        metadata={},
+        prompts=prompts,
+        tokens=tokens,
+        approx_cost=0.0,
+        lazy=lazy,
     )
     return [dataset]
diff --git a/agentic_security/probe_data/modules/garak_tool.py b/agentic_security/probe_data/modules/garak_tool.py
new file mode 100644
index 0000000..a65560d
--- /dev/null
+++ b/agentic_security/probe_data/modules/garak_tool.py
@@ -0,0 +1,45 @@
+import asyncio
+import os
+import subprocess
+
+from loguru import logger
+
+
+class Module:
+    """Run garak against the local proxy and replay the prompts it sent."""
+
+    def __init__(self, prompt_groups: list, tools_inbox: asyncio.Queue):
+        self.tools_inbox = tools_inbox
+
+    async def apply(self):
+        """Yield every prompt message garak posted to the proxy endpoint.
+
+        garak runs to completion in a worker thread first; the inbox is
+        drained only afterwards.
+        """
+        # Point garak's OpenAI client at this app's /proxy endpoints.
+        env = {**os.environ, "OPENAI_API_BASE": "http://0.0.0.0:8718/proxy"}
+        command = [
+            "python3",
+            "-m",
+            "garak",
+            "--model_type",
+            "openai",
+            "--model_name",
+            "gpt-3.5-turbo",
+            "--probes",
+            "encoding",
+        ]
+        logger.info(f"Executing command: {command}")
+        result = await asyncio.to_thread(
+            subprocess.run, command, capture_output=True, text=True, env=env
+        )
+        logger.info(f"Is inbox empty? {self.tools_inbox.empty()}")
+        while not self.tools_inbox.empty():
+            ref = self.tools_inbox.get_nowait()
+            yield ref["message"]
+            ref["ready"].set()  # release the proxy handler waiting on this ref
+        logger.info("Garak tool finished.")
+        logger.info(f"stdout: {result.stdout}")
+        if result.returncode != 0:
+            logger.error(f"exit code: {result.returncode}, stderr: {result.stderr}")
diff --git a/agentic_security/test_lib.py b/agentic_security/test_lib.py
index b4dd322..c733fe1 100644
--- a/agentic_security/test_lib.py
+++ b/agentic_security/test_lib.py
@@ -1,6 +1,7 @@
-from agentic_security.lib import REGISTRY, AgenticSecurity
 from inline_snapshot import snapshot
 
+from agentic_security.lib import REGISTRY, AgenticSecurity
+
 SAMPLE_SPEC = """
 POST http://0.0.0.0:8718/v1/self-probe
 Authorization: Bearer XXXXX