mirror of
https://github.com/msoedov/agentic_security.git
synced 2026-06-24 06:09:55 +02:00
feat(Add backend):
This commit is contained in:
@@ -32,9 +32,14 @@ async def process_prompt(
|
||||
):
|
||||
try:
|
||||
response = await request_factory.fn(prompt=prompt)
|
||||
if response.status_code == 422:
|
||||
logger.error(f"Invalid prompt: {prompt}, error=422")
|
||||
errors.append((module_name, prompt, 422, "Invalid prompt"))
|
||||
return tokens, True
|
||||
|
||||
if response.status_code >= 400:
|
||||
raise httpx.HTTPStatusError(
|
||||
f"HTTP {response.status_code}",
|
||||
f"HTTP {response.status_code} {response.content=}",
|
||||
request=response.request,
|
||||
response=response,
|
||||
)
|
||||
|
||||
@@ -48,6 +48,9 @@ def check_refusal(response: str, refusal_phrases: list = REFUSAL_MARKS) -> bool:
|
||||
return any(phrase in response for phrase in refusal_phrases)
|
||||
|
||||
|
||||
# TODO: add plugin system for multiple classifiers of the output
|
||||
|
||||
|
||||
def refusal_heuristic(request_json):
|
||||
# TODO: improve this heuristic
|
||||
request = str(request_json)
|
||||
|
||||
@@ -1,6 +1,21 @@
|
||||
from .data import load_local_csv
|
||||
|
||||
REGISTRY = [
|
||||
{
|
||||
"dataset_name": "AgenticBackend",
|
||||
"num_prompts": 2000,
|
||||
"tokens": 0,
|
||||
"approx_cost": 0.0,
|
||||
"source": "Fine-tuned cloud hosted model",
|
||||
"selected": True,
|
||||
"url": "Cloud",
|
||||
"dynamic": False,
|
||||
"opts": {
|
||||
"port": 8718,
|
||||
"modules": ["encoding"],
|
||||
},
|
||||
"modality": "text",
|
||||
},
|
||||
{
|
||||
"dataset_name": "ShawnMenz/DAN_jailbreak",
|
||||
"num_prompts": 666,
|
||||
@@ -73,7 +88,7 @@ REGISTRY = [
|
||||
"tokens": 1975800,
|
||||
"approx_cost": 0.0,
|
||||
"source": "Hugging Face Datasets",
|
||||
"selected": True,
|
||||
"selected": False,
|
||||
"dynamic": False,
|
||||
"url": "https://huggingface.co/JailbreakV-28K/JailBreakV-28k",
|
||||
"modality": "text",
|
||||
@@ -111,17 +126,6 @@ REGISTRY = [
|
||||
"url": "",
|
||||
"modality": "text",
|
||||
},
|
||||
{
|
||||
"dataset_name": "Agentic Security",
|
||||
"num_prompts": 0,
|
||||
"tokens": 0,
|
||||
"approx_cost": 0.0,
|
||||
"source": "Local dataset",
|
||||
"selected": False,
|
||||
"dynamic": True,
|
||||
"url": "",
|
||||
"modality": "text",
|
||||
},
|
||||
{
|
||||
"dataset_name": "jailbreak_llms/2023_05_07",
|
||||
"num_prompts": 0,
|
||||
|
||||
@@ -12,6 +12,7 @@ from agentic_security.probe_data import stenography_fn
|
||||
from agentic_security.probe_data.models import ProbeDataset
|
||||
from agentic_security.probe_data.modules import (
|
||||
adaptive_attacks,
|
||||
fine_tuned,
|
||||
garak_tool,
|
||||
inspect_ai_tool,
|
||||
)
|
||||
@@ -23,7 +24,7 @@ def count_words_in_list(str_list):
|
||||
:param str_list: List of strings
|
||||
:return: Total number of words across all strings in the list
|
||||
"""
|
||||
total_words = sum(len(s.split()) for s in str_list)
|
||||
total_words = sum(len(str(s).split()) for s in str_list)
|
||||
return total_words
|
||||
|
||||
|
||||
@@ -237,6 +238,11 @@ def prepare_prompts(dataset_names, budget, tools_inbox=None, options=[]):
|
||||
logger.error(f"Error loading {dataset_name}: {e}")
|
||||
|
||||
dynamic_datasets = {
|
||||
"AgenticBackend": lambda opts: dataset_from_iterator(
|
||||
"AgenticBackend",
|
||||
fine_tuned.Module(group, tools_inbox=tools_inbox, opts=opts).apply(),
|
||||
lazy=True,
|
||||
),
|
||||
"Steganography": lambda opts: Stenography(group),
|
||||
"llm-adaptive-attacks": lambda opts: dataset_from_iterator(
|
||||
"llm-adaptive-attacks",
|
||||
|
||||
@@ -0,0 +1,84 @@
|
||||
import asyncio
|
||||
import httpx
|
||||
from typing import List
|
||||
import uuid as U
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class Module:
|
||||
def __init__(
|
||||
self, prompt_groups: List[str], tools_inbox: asyncio.Queue, opts: dict = {}
|
||||
):
|
||||
self.tools_inbox = tools_inbox
|
||||
self.opts = opts
|
||||
self.prompt_groups = prompt_groups
|
||||
self.max_prompts = self.opts.get("max_prompts", 2000) # Default max M prompts
|
||||
self.run_id = U.uuid4().hex
|
||||
self.batch_size = self.opts.get("batch_size", 500)
|
||||
|
||||
async def apply(self):
|
||||
|
||||
for _ in range(self.max_prompts // self.batch_size):
|
||||
# Fetch prompts from the API
|
||||
prompts = await self.fetch_prompts()
|
||||
|
||||
if not prompts:
|
||||
logger.error("No prompts retrieved from the API.")
|
||||
return
|
||||
|
||||
logger.info(f"Retrieved {len(prompts)} prompts.")
|
||||
|
||||
for i, prompt in enumerate(
|
||||
prompts[: self.max_prompts]
|
||||
): # Limit to max_prompts
|
||||
logger.info(f"Processing prompt {i+1}/{len(prompts)}: {prompt}")
|
||||
# response = await self.post_prompt(prompt)
|
||||
# logger.info(f"Response: {response}")
|
||||
yield prompt
|
||||
|
||||
while not self.tools_inbox.empty():
|
||||
ref = await self.tools_inbox.get()
|
||||
message, _, ready = ref["message"], ref["reply"], ref["ready"]
|
||||
yield message
|
||||
ready.set()
|
||||
|
||||
async def post_prompt(self, prompt: str):
|
||||
port = self.opts.get("port", 8718)
|
||||
uri = f"http://0.0.0.0:{port}/proxy/chat/completions"
|
||||
headers = {"Content-Type": "application/json"}
|
||||
data = {
|
||||
"model": "gpt-4",
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"max_tokens": 1050,
|
||||
"temperature": 0.7,
|
||||
}
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
try:
|
||||
response = await client.post(uri, headers=headers, json=data)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
except httpx.RequestError as e:
|
||||
logger.error(f"Failed to post prompt: {e}")
|
||||
return {}
|
||||
|
||||
async def fetch_prompts(self) -> List[str]:
|
||||
api_url = "https://msoedov--agesec-backend-fastapi-app.modal.run/infer"
|
||||
headers = {
|
||||
"Authorization": "Bearer gh0-5f4a8ed2-37c6-4bd7-a0cf-7070eae8115b",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
try:
|
||||
response = await client.post(
|
||||
api_url,
|
||||
headers=headers,
|
||||
json={"batch_size": self.batch_size, "run_id": self.run_id},
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
return data.get("prompts", [])
|
||||
except httpx.RequestError as e:
|
||||
logger.error(f"Failed to fetch prompts: {e}")
|
||||
return []
|
||||
@@ -78,10 +78,10 @@ var app = new Vue({
|
||||
enableChartDiagram: this.enableChartDiagram,
|
||||
enableMultiStepAttack: this.enableMultiStepAttack,
|
||||
};
|
||||
localStorage.setItem('appState', JSON.stringify(state));
|
||||
localStorage.setItem('appState:v1', JSON.stringify(state));
|
||||
},
|
||||
loadStateFromLocalStorage() {
|
||||
const savedState = localStorage.getItem('appState');
|
||||
const savedState = localStorage.getItem('appState:v1');
|
||||
console.log('Loading state from local storage:', savedState);
|
||||
if (savedState) {
|
||||
const state = JSON.parse(savedState);
|
||||
@@ -94,7 +94,7 @@ var app = new Vue({
|
||||
}
|
||||
},
|
||||
resetState() {
|
||||
localStorage.removeItem('appState');
|
||||
localStorage.removeItem('appState:v1');
|
||||
this.modelSpec = LLM_SPECS[0];
|
||||
this.budget = 50;
|
||||
this.dataConfig.forEach(config => config.selected = false);
|
||||
|
||||
@@ -95,3 +95,29 @@ class TestAS:
|
||||
assert isinstance(result, dict)
|
||||
print(result)
|
||||
assert len(result) in [0, 1]
|
||||
|
||||
def test_backend(self, test_server):
|
||||
llmSpec = test_spec_assets.SAMPLE_SPEC
|
||||
maxBudget = 1000000
|
||||
max_th = 0.3
|
||||
datasets = [
|
||||
{
|
||||
"dataset_name": "AgenticBackend",
|
||||
"num_prompts": 0,
|
||||
"tokens": 0,
|
||||
"approx_cost": 0.0,
|
||||
"source": "Fine-tuned cloud hosted model",
|
||||
"selected": True,
|
||||
"url": "",
|
||||
"dynamic": True,
|
||||
"opts": {
|
||||
"port": 9094,
|
||||
"modules": ["encoding"],
|
||||
},
|
||||
"modality": "text",
|
||||
},
|
||||
]
|
||||
result = AgenticSecurity.scan(llmSpec, maxBudget, datasets, max_th)
|
||||
assert isinstance(result, dict)
|
||||
print(result)
|
||||
assert len(result) in [0, 1]
|
||||
|
||||
Reference in New Issue
Block a user