From ffd7d710f10ea888e5fbfce080adc81b3888d89f Mon Sep 17 00:00:00 2001 From: Alexander Myasoedov Date: Fri, 3 Jan 2025 00:07:10 +0200 Subject: [PATCH] feat(Add backend): --- agentic_security/probe_actor/fuzzer.py | 7 +- agentic_security/probe_actor/refusal.py | 3 + agentic_security/probe_data/__init__.py | 28 ++++--- agentic_security/probe_data/data.py | 8 +- .../probe_data/modules/fine_tuned.py | 84 +++++++++++++++++++ agentic_security/static/main.js | 6 +- agentic_security/test_lib.py | 26 ++++++ 7 files changed, 145 insertions(+), 17 deletions(-) create mode 100644 agentic_security/probe_data/modules/fine_tuned.py diff --git a/agentic_security/probe_actor/fuzzer.py b/agentic_security/probe_actor/fuzzer.py index 237a37c..b5e1a20 100644 --- a/agentic_security/probe_actor/fuzzer.py +++ b/agentic_security/probe_actor/fuzzer.py @@ -32,9 +32,14 @@ async def process_prompt( ): try: response = await request_factory.fn(prompt=prompt) + if response.status_code == 422: + logger.error(f"Invalid prompt: {prompt}, error=422") + errors.append((module_name, prompt, 422, "Invalid prompt")) + return tokens, True + if response.status_code >= 400: raise httpx.HTTPStatusError( - f"HTTP {response.status_code}", + f"HTTP {response.status_code} {response.content=}", request=response.request, response=response, ) diff --git a/agentic_security/probe_actor/refusal.py b/agentic_security/probe_actor/refusal.py index d99991c..ec3020f 100644 --- a/agentic_security/probe_actor/refusal.py +++ b/agentic_security/probe_actor/refusal.py @@ -48,6 +48,9 @@ def check_refusal(response: str, refusal_phrases: list = REFUSAL_MARKS) -> bool: return any(phrase in response for phrase in refusal_phrases) +# TODO: add plugin system for multiple classifiers of the output + + def refusal_heuristic(request_json): # TODO: improve this heuristic request = str(request_json) diff --git a/agentic_security/probe_data/__init__.py b/agentic_security/probe_data/__init__.py index 8d81f64..abde7de 100644 --- 
a/agentic_security/probe_data/__init__.py +++ b/agentic_security/probe_data/__init__.py @@ -1,6 +1,21 @@ from .data import load_local_csv REGISTRY = [ + { + "dataset_name": "AgenticBackend", + "num_prompts": 2000, + "tokens": 0, + "approx_cost": 0.0, + "source": "Fine-tuned cloud hosted model", + "selected": True, + "url": "Cloud", + "dynamic": False, + "opts": { + "port": 8718, + "modules": ["encoding"], + }, + "modality": "text", + }, { "dataset_name": "ShawnMenz/DAN_jailbreak", "num_prompts": 666, @@ -73,7 +88,7 @@ REGISTRY = [ "tokens": 1975800, "approx_cost": 0.0, "source": "Hugging Face Datasets", - "selected": True, + "selected": False, "dynamic": False, "url": "https://huggingface.co/JailbreakV-28K/JailBreakV-28k", "modality": "text", @@ -111,17 +126,6 @@ REGISTRY = [ "url": "", "modality": "text", }, - { - "dataset_name": "Agentic Security", - "num_prompts": 0, - "tokens": 0, - "approx_cost": 0.0, - "source": "Local dataset", - "selected": False, - "dynamic": True, - "url": "", - "modality": "text", - }, { "dataset_name": "jailbreak_llms/2023_05_07", "num_prompts": 0, diff --git a/agentic_security/probe_data/data.py b/agentic_security/probe_data/data.py index 60b5039..946f691 100644 --- a/agentic_security/probe_data/data.py +++ b/agentic_security/probe_data/data.py @@ -12,6 +12,7 @@ from agentic_security.probe_data import stenography_fn from agentic_security.probe_data.models import ProbeDataset from agentic_security.probe_data.modules import ( adaptive_attacks, + fine_tuned, garak_tool, inspect_ai_tool, ) @@ -23,7 +24,7 @@ def count_words_in_list(str_list): :param str_list: List of strings :return: Total number of words across all strings in the list """ - total_words = sum(len(s.split()) for s in str_list) + total_words = sum(len(str(s).split()) for s in str_list) return total_words @@ -237,6 +238,11 @@ def prepare_prompts(dataset_names, budget, tools_inbox=None, options=[]): logger.error(f"Error loading {dataset_name}: {e}") dynamic_datasets = { + 
"AgenticBackend": lambda opts: dataset_from_iterator( + "AgenticBackend", + fine_tuned.Module(group, tools_inbox=tools_inbox, opts=opts).apply(), + lazy=True, + ), "Steganography": lambda opts: Stenography(group), "llm-adaptive-attacks": lambda opts: dataset_from_iterator( "llm-adaptive-attacks", diff --git a/agentic_security/probe_data/modules/fine_tuned.py b/agentic_security/probe_data/modules/fine_tuned.py new file mode 100644 index 0000000..a12ce8f --- /dev/null +++ b/agentic_security/probe_data/modules/fine_tuned.py @@ -0,0 +1,84 @@ +import asyncio +import httpx +from typing import List +import uuid as U +from loguru import logger + + +class Module: + def __init__( + self, prompt_groups: List[str], tools_inbox: asyncio.Queue, opts: dict = {} + ): + self.tools_inbox = tools_inbox + self.opts = opts + self.prompt_groups = prompt_groups + self.max_prompts = self.opts.get("max_prompts", 2000) # Default max M prompts + self.run_id = U.uuid4().hex + self.batch_size = self.opts.get("batch_size", 500) + + async def apply(self): + + for _ in range(self.max_prompts // self.batch_size): + # Fetch prompts from the API + prompts = await self.fetch_prompts() + + if not prompts: + logger.error("No prompts retrieved from the API.") + return + + logger.info(f"Retrieved {len(prompts)} prompts.") + + for i, prompt in enumerate( + prompts[: self.max_prompts] + ): # Limit to max_prompts + logger.info(f"Processing prompt {i+1}/{len(prompts)}: {prompt}") + # response = await self.post_prompt(prompt) + # logger.info(f"Response: {response}") + yield prompt + + while not self.tools_inbox.empty(): + ref = await self.tools_inbox.get() + message, _, ready = ref["message"], ref["reply"], ref["ready"] + yield message + ready.set() + + async def post_prompt(self, prompt: str): + port = self.opts.get("port", 8718) + uri = f"http://0.0.0.0:{port}/proxy/chat/completions" + headers = {"Content-Type": "application/json"} + data = { + "model": "gpt-4", + "messages": [{"role": "user", "content": 
prompt}], + "max_tokens": 1050, + "temperature": 0.7, + } + + async with httpx.AsyncClient() as client: + try: + response = await client.post(uri, headers=headers, json=data) + response.raise_for_status() + return response.json() + except httpx.RequestError as e: + logger.error(f"Failed to post prompt: {e}") + return {} + + async def fetch_prompts(self) -> List[str]: + api_url = "https://msoedov--agesec-backend-fastapi-app.modal.run/infer" + headers = { + "Authorization": "Bearer gh0-5f4a8ed2-37c6-4bd7-a0cf-7070eae8115b", + "Content-Type": "application/json", + } + + async with httpx.AsyncClient() as client: + try: + response = await client.post( + api_url, + headers=headers, + json={"batch_size": self.batch_size, "run_id": self.run_id}, + ) + response.raise_for_status() + data = response.json() + return data.get("prompts", []) + except httpx.RequestError as e: + logger.error(f"Failed to fetch prompts: {e}") + return [] diff --git a/agentic_security/static/main.js b/agentic_security/static/main.js index 1f760c5..30e8719 100644 --- a/agentic_security/static/main.js +++ b/agentic_security/static/main.js @@ -78,10 +78,10 @@ var app = new Vue({ enableChartDiagram: this.enableChartDiagram, enableMultiStepAttack: this.enableMultiStepAttack, }; - localStorage.setItem('appState', JSON.stringify(state)); + localStorage.setItem('appState:v1', JSON.stringify(state)); }, loadStateFromLocalStorage() { - const savedState = localStorage.getItem('appState'); + const savedState = localStorage.getItem('appState:v1'); console.log('Loading state from local storage:', savedState); if (savedState) { const state = JSON.parse(savedState); @@ -94,7 +94,7 @@ var app = new Vue({ } }, resetState() { - localStorage.removeItem('appState'); + localStorage.removeItem('appState:v1'); this.modelSpec = LLM_SPECS[0]; this.budget = 50; this.dataConfig.forEach(config => config.selected = false); diff --git a/agentic_security/test_lib.py b/agentic_security/test_lib.py index 0dde2e3..504893b 100644 --- 
def test_backend(self, test_server):
    """Scan with only the AgenticBackend dataset and check the result shape.

    The scan result must be a dict holding at most one entry.
    """
    backend_opts = {
        "port": 9094,
        "modules": ["encoding"],
    }
    backend_dataset = {
        "dataset_name": "AgenticBackend",
        "num_prompts": 0,
        "tokens": 0,
        "approx_cost": 0.0,
        "source": "Fine-tuned cloud hosted model",
        "selected": True,
        "url": "",
        "dynamic": True,
        "opts": backend_opts,
        "modality": "text",
    }
    spec = test_spec_assets.SAMPLE_SPEC
    budget = 1000000
    failure_threshold = 0.3

    result = AgenticSecurity.scan(spec, budget, [backend_dataset], failure_threshold)

    assert isinstance(result, dict)
    print(result)
    assert len(result) in (0, 1)