feat(Integrated Garak):

2026-06-23 21:59:57 +02:00 · 2024-04-27 21:44:38 +03:00
parent fa209684d9
commit 74461efaa0
10 changed files with 161 additions and 37 deletions
@@ -3,3 +3,5 @@
 .web
 __pycache__/
 failures.csv
+runs/
+*.todo
@@ -24,7 +24,7 @@
 ## Features

 - Customizable Rule Sets or Agent based attacks🛠️
- Comprehansive fuzzing for any LLMs 🧪
+- Comprehensive fuzzing for any LLMs 🧪
 - LLM API integration and stress testing 🛠️
 - Wide range of fuzzing and attack techniques 🌀

@@ -117,7 +117,7 @@ Content-Type: application/json
    "prompt": "<<PROMPT>>"
 }
 """
-result = AgenticSecurity.scan(spec)
+result = AgenticSecurity.scan(llmSpec=spec)

 # module: failure rate
 # {"Local CSV": 79.65116279069767, "llm-adaptive-attacks": 20.0}
@@ -10,7 +10,9 @@ from agentic_security.app import app
 class T:
    def server(self, port=8718, host="0.0.0.0"):
        sys.path.append(os.path.dirname("."))
-        config = uvicorn.Config(app, port=port, host=host, log_level="info")
+        config = uvicorn.Config(
+            app, port=port, host=host, log_level="info", reload=True
+        )
        server = uvicorn.Server(config)
        server.run()
        return
@@ -1,5 +1,6 @@
 import random
 import sys
+from asyncio import Event, Queue
 from datetime import datetime
 from pathlib import Path

@@ -39,6 +40,9 @@ app.add_middleware(
    allow_headers=["*"],  # Allows all headers
 )

+# Shared queue between the proxy endpoint and the scanner: proxy_completions
+# enqueues one entry per incoming request, and the same queue object is passed
+# into the scan started by streaming_response_generator.
+tools_inbox = Queue()
+# When True, proxy_completions waits for the entry's Event to be set and
+# returns the entry's "reply" value instead of the canned completion payload.
+FEATURE_PROXY = False
+

@app.get("/")
 async def root():
@@ -88,6 +92,7 @@ def streaming_response_generator(scan_parameters: Scan):
            request_factory=request_factory,
            max_budget=scan_parameters.maxBudget,
            datasets=scan_parameters.datasets,
+            tools_inbox=tools_inbox,
        ):
            yield scan_result + "\n"  # Adding a newline for separation

@@ -149,3 +154,55 @@ class Table(BaseModel):
 async def get_plot(table: Table):
    buf = plot_security_report(table.table)
    return StreamingResponse(buf, media_type="image/jpeg")
+
+
+# One chat message inside a CompletionRequest.
+class Message(BaseModel):
+    # Speaker of the message; proxy_completions only picks out entries whose
+    # role equals "user".
+    role: str
+    # Text of the message.
+    content: str
+
+
+# Request body accepted by POST /proxy/chat/completions.
+# No field declares a default, so a client has to send every one of them.
+# NOTE(review): the field names follow the OpenAI chat-completions request
+# format; confirm that every client of this endpoint sends all of them.
+class CompletionRequest(BaseModel):
+    model: str
+    # Conversation so far; this is the only field proxy_completions reads.
+    messages: list[Message]
+    # The remaining fields are validated but not used by the handler.
+    temperature: float
+    top_p: float
+    n: int
+    stop: list[str]
+    max_tokens: int
+    presence_penalty: float
+    frequency_penalty: float
+
+# OpenAI proxy endpoint
+@app.post("/proxy/chat/completions")
+async def proxy_completions(request: CompletionRequest):
+    """Answer a chat-completion request with a simulated assistant message.
+
+    The text of all "user" messages is joined with spaces and followed by
+    either a random entry of REFUSAL_MARKS (when random.random() < 0.2) or
+    the fixed string "This is a test!". The resulting message is always
+    queued on tools_inbox together with an empty reply slot and an Event.
+
+    Returns:
+        With FEATURE_PROXY enabled: the entry's "reply" value, read once the
+        Event has been set. Otherwise: a dict shaped like a chat.completion
+        response whose single choice carries the message.
+    """
+    # Draw order matters for reproducibility: random() first, choice() only
+    # on the refusal branch.
+    if random.random() < 0.2:
+        suffix = random.choice(REFUSAL_MARKS)
+    else:
+        suffix = "This is a test!"
+    user_text = " ".join(m.content for m in request.messages if m.role == "user")
+    message = user_text + " " + suffix
+
+    done = Event()
+    ref = {"message": message, "reply": "", "ready": done}
+    tools_inbox.put_nowait(ref)
+
+    if FEATURE_PROXY:
+        # Proxy to agent: block until whoever consumed the entry signals it.
+        await done.wait()
+        return ref["reply"]
+
+    # Simulate a completion response
+    choice = {
+        "message": {"role": "assistant", "content": message},
+        "logprobs": None,
+        "finish_reason": "stop",
+        "index": 0,
+    }
+    usage = {"prompt_tokens": 13, "completion_tokens": 7, "total_tokens": 20}
+    return {
+        "id": "chatcmpl-abc123",
+        "object": "chat.completion",
+        "created": 1677858242,
+        "model": "gpt-3.5-turbo-0613",
+        "usage": usage,
+        "choices": [choice],
+    }
@@ -3,9 +3,10 @@ import json

 import colorama
 import tqdm.asyncio
+from tabulate import tabulate
+
 from agentic_security.app import Scan, streaming_response_generator
 from agentic_security.probe_data import REGISTRY
-from tabulate import tabulate

 RESET = colorama.Style.RESET_ALL
 BRIGHT = colorama.Style.BRIGHT
@@ -25,7 +26,6 @@ Content-Type: application/json


 class AgenticSecurity:
-
    @classmethod
    async def async_scan(
        self, llmSpec: str, maxBudget: int, datasets: list[dict], max_th: float
@@ -30,7 +30,18 @@ class ScanResult(BaseModel):
        ).model_dump_json()


-async def perform_scan(request_factory, max_budget: int, datasets: list[dict] = []):
+async def prompt_iter(prompts):
+    """Yield prompts one at a time from a synchronous or asynchronous source.
+
+    Args:
+        prompts: Either an asynchronous iterable (anything exposing
+            ``__aiter__``, e.g. an async generator) or a regular iterable
+            such as a list, tuple or generator.
+
+    Yields:
+        Each item of ``prompts`` in order.
+
+    Raises:
+        TypeError: If ``prompts`` is neither iterable nor async-iterable.
+    """
+    # Dispatch on the async-iteration protocol rather than on the concrete
+    # ``list`` type, so tuples and plain generators work as well.
+    if hasattr(prompts, "__aiter__"):
+        async for p in prompts:
+            yield p
+        return
+    for p in prompts:
+        yield p
+
+
+async def perform_scan(
+    request_factory, max_budget: int, datasets: list[dict] = [], tools_inbox=None
+):
    yield ScanResult.status_msg("Loading datasets...")
    if IS_VERCEL:
        yield ScanResult.status_msg(
@@ -40,20 +51,24 @@ async def perform_scan(request_factory, max_budget: int, datasets: list[dict] =
    prompt_modules = prepare_prompts(
        dataset_names=[m["dataset_name"] for m in datasets if m["selected"]],
        budget=max_budget,
+        tools_inbox=tools_inbox,
    )
    yield ScanResult.status_msg("Datasets loaded. Starting scan...")

    errors = []
    refusals = []
-    size = sum(len(m.prompts) for m in prompt_modules)
+    size = sum(len(m.prompts) for m in prompt_modules if not m.lazy)
    step = 0
    for mi, module in enumerate(prompt_modules):
        tokens = 0
        module_failures = 0
-        logger.info(f"Scanning {module.dataset_name} {len(module.prompts)}")
-        for i, prompt in enumerate(module.prompts):
+        size = 0 if module.lazy else len(module.prompts)
+        logger.info(f"Scanning {module.dataset_name} {size}")
+        i = 0
+        async for prompt in prompt_iter(module.prompts):
+            i += 1
            step += 1
-            progress = 100 * (step) / size
+            progress = 100 * (step) / size if size else 0

            # Naive token count
            tokens += len(prompt.split())
@@ -86,12 +101,13 @@ async def perform_scan(request_factory, max_budget: int, datasets: list[dict] =
                module_failures += 1
            # Naive token count for llm response
            tokens += len(r.text.split())
+            total = size if size else i
            yield ScanResult(
                module=module.dataset_name,
                tokens=round(tokens / 1000, 1),
                cost=round(tokens * 1.5 / 1000_000, 2),
                progress=round(progress, 2),
-                failureRate=100 * module_failures / max(len(module.prompts), 1),
+                failureRate=100 * module_failures / max(total, 1),
            ).model_dump_json()
    yield ScanResult.status_msg("Done.")
    import pandas as pd
@@ -127,6 +127,15 @@ REGISTRY = [
        "selected": False,
        "url": "https://github.com/tml-epfl/llm-adaptive-attacks",
    },
+    {
+        "dataset_name": "Garak",
+        "num_prompts": 0,
+        "tokens": 0,
+        "approx_cost": 0.0,
+        "source": "Github: https://github.com/leondz/garak",
+        "selected": False,
+        "url": "https://github.com/leondz/garak",
+    },
    {
        "dataset_name": "Custom CSV",
        "num_prompts": len(load_local_csv().prompts),
@@ -7,7 +7,7 @@ import pandas as pd
 from loguru import logger

 from agentic_security.probe_data import stenography_fn
-from agentic_security.probe_data.modules import adaptive_attacks
+from agentic_security.probe_data.modules import adaptive_attacks, garak_tool

 IS_VERCEL = os.getenv("IS_VERCEL", "f") == "t"

@@ -32,6 +32,7 @@ class ProbeDataset:
    prompts: list[str]
    tokens: int
    approx_cost: float
+    lazy: bool = False

    def metadata_summary(self):
        return {
@@ -168,10 +169,7 @@ def load_dataset_v5():
    )


-def prepare_prompts(
-    dataset_names,
-    budget,
-):
+def prepare_prompts(dataset_names, budget, tools_inbox=None):
    # ## Datasets used and cleaned:
    # markush1/LLM-Jailbreak-Classifier
    # 1. Open-Orca/OpenOrca
@@ -203,6 +201,11 @@ def prepare_prompts(
        "llm-adaptive-attacks": lambda: dataset_from_iterator(
            "llm-adaptive-attacks", adaptive_attacks.Module(group).apply()
        ),
+        "Garak": lambda: dataset_from_iterator(
+            "Garak",
+            garak_tool.Module(group, tools_inbox=tools_inbox).apply(),
+            lazy=True,
+        ),
        "GPT fuzzer": lambda: [],
    }

@@ -217,22 +220,6 @@ def prepare_prompts(
    return group + dynamic_groups


-class MutationFn:
-    def __init__(self, mutation_fn):
-        self.mutation_fn = mutation_fn
-        self.mutation_fn_name = mutation_fn.__name__
-        self.input = ""
-        self.output = ""
-
-    def __call__(self, prompt):
-        self.input = prompt
-        self.output = self.mutation_fn(prompt)
-        return self.output
-
-    def __str__(self):
-        return f"{self.mutation_fn_name}({self.input}) => {self.output}"
-
-
 class Stenography:
    fn_library = {
        "rot5": stenography_fn.rot5,
@@ -295,7 +282,7 @@ def load_local_csv() -> ProbeDataset:
    )


-def dataset_from_iterator(name: str, iterator) -> list:
+def dataset_from_iterator(name: str, iterator, lazy=False) -> list:
    """Convert an iterator into a list of prompts and create a ProbeDataset
    object.

@@ -306,9 +293,14 @@ def dataset_from_iterator(name: str, iterator) -> list:
    Returns:
        list: A list containing a single ProbeDataset object.
    """
-    prompts = list(iterator)
-    tokens = count_words_in_list(prompts)
+    prompts = list(iterator) if not lazy else iterator
+    tokens = count_words_in_list(prompts) if not lazy else 0
    dataset = ProbeDataset(
-        dataset_name=name, metadata={}, prompts=prompts, tokens=tokens, approx_cost=0.0
+        dataset_name=name,
+        metadata={},
+        prompts=prompts,
+        tokens=tokens,
+        approx_cost=0.0,
+        lazy=lazy,
    )
    return [dataset]
@@ -0,0 +1,45 @@
+import subprocess
+import os
+import asyncio
+from loguru import logger
+import asyncio
+
+
+class Module:
+    """Prompt source that runs the garak CLI and relays what lands in the inbox.
+
+    The garak subprocess is started with OPENAI_API_BASE set to
+    "http://0.0.0.0:8718/proxy". Entries found on ``tools_inbox`` after the
+    process has exited are yielded as prompts.
+    NOTE(review): presumably the entries are the ones queued by the
+    /proxy/chat/completions handler of the app - confirm host and port match
+    the running server.
+    """
+
+    def __init__(self, prompt_groups: list, tools_inbox: asyncio.Queue):
+        """Store the shared inbox.
+
+        Args:
+            prompt_groups: Accepted but not used by this module.
+            tools_inbox: Queue of dicts with "message", "reply" and "ready"
+                keys, where "ready" exposes a ``set()`` method.
+        """
+        self.tools_inbox = tools_inbox
+
+    async def apply(self):
+        """Run garak to completion, then yield every queued message.
+
+        This is an async generator. After each message is yielded and the
+        consumer resumes the generator, the entry's "ready" event is set.
+
+        Yields:
+            The "message" value of each entry currently on the inbox.
+        """
+        env = os.environ.copy()
+        env["OPENAI_API_BASE"] = "http://0.0.0.0:8718/proxy"
+
+        # Command to be executed
+        command = [
+            "python3",
+            "-m",
+            "garak",
+            "--model_type",
+            "openai",
+            "--model_name",
+            "gpt-3.5-turbo",
+            "--probes",
+            "encoding",
+        ]
+        logger.info(f"Executing command: {command}")
+        # Execute the command with the specific environment
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env
+        )
+        # communicate() blocks until the child exits, so run it in a worker
+        # thread to keep the event loop free while garak is running.
+        out, err = await asyncio.to_thread(process.communicate)
+
+        is_empty = self.tools_inbox.empty()
+        logger.info(f"Is inbox empty? {is_empty}")
+        while not self.tools_inbox.empty():
+            ref = self.tools_inbox.get_nowait()
+            yield ref["message"]
+            # Signal the producer only after the consumer asked for the next item.
+            ref["ready"].set()
+        logger.info("Garak tool finished.")
+        logger.info(f"stdout: {out}")
+        # stderr is captured above; surface it instead of dropping it.
+        if err:
+            logger.warning(f"stderr: {err}")
+        # Only a non-zero exit status is an error condition.
+        if process.returncode:
+            logger.error(f"exit code: {process.returncode}")
+        else:
+            logger.info(f"exit code: {process.returncode}")
@@ -1,6 +1,7 @@
-from agentic_security.lib import REGISTRY, AgenticSecurity
 from inline_snapshot import snapshot

+from agentic_security.lib import REGISTRY, AgenticSecurity
+
 SAMPLE_SPEC = """
 POST http://0.0.0.0:8718/v1/self-probe
 Authorization: Bearer XXXXX