From 5db9676837bcc35be4b19d2f3d540a8dac292418 Mon Sep 17 00:00:00 2001 From: Alexander Myasoedov Date: Sun, 26 Jan 2025 12:29:29 +0200 Subject: [PATCH] feat(Add more docs for bayesian optimizer): --- docs/image_generation.md | 119 +++++++++++++++++++++++++++++++++++++++ docs/optimizer.md | 78 +++++++++++++++++++++++++ mkdocs.yml | 2 + 3 files changed, 199 insertions(+) create mode 100644 docs/image_generation.md create mode 100644 docs/optimizer.md diff --git a/docs/image_generation.md b/docs/image_generation.md new file mode 100644 index 0000000..e342250 --- /dev/null +++ b/docs/image_generation.md @@ -0,0 +1,119 @@ +# Image Generation System + +The image generation system creates visual probes for security testing by converting text prompts into images. This document explains its architecture and implementation. + +## Overview + +The system: + +1. Converts text datasets into image datasets +1. Generates images using matplotlib +1. Encodes images for transmission +1. Integrates with the LLM probing system + +## Core Components + +### Image Generation + +```python +@cache_to_disk() +def generate_image(prompt: str) -> bytes: + """ + Generates a JPEG image containing the provided text prompt + """ + # Create figure with light blue background + fig, ax = plt.subplots(figsize=(6, 4)) + ax.set_facecolor("lightblue") + + # Add centered text + ax.text( + 0.5, 0.5, + prompt, + fontsize=16, + ha="center", + va="center", + wrap=True, + color="darkblue" + ) + + # Save to buffer + buffer = io.BytesIO() + plt.savefig(buffer, format="jpeg", bbox_inches="tight") + return buffer.getvalue() +``` + +### Dataset Conversion + +```python +def generate_image_dataset(text_dataset: list[ProbeDataset]) -> list[ImageProbeDataset]: + """ + Converts text datasets into image datasets + """ + image_datasets = [] + + for dataset in text_dataset: + image_prompts = [ + generate_image(prompt) + for prompt in tqdm(dataset.prompts) + ] + + image_datasets.append(ImageProbeDataset( + 
test_dataset=dataset, + image_prompts=image_prompts + )) + + return image_datasets +``` + +### Image Encoding + +```python +def encode(image: bytes) -> str: + """ + Encodes image bytes into base64 data URL + """ + encoded = base64.b64encode(image).decode("utf-8") + return "data:image/jpeg;base64," + encoded +``` + +## Integration + +### RequestAdapter + +The RequestAdapter class integrates image generation with LLM probing: + +```python +class RequestAdapter: + def __init__(self, llm_spec): + if not llm_spec.has_image: + raise ValueError("LLMSpec must have an image") + self.llm_spec = llm_spec + + async def probe(self, prompt: str, encoded_image: str = "", + encoded_audio: str = "", files={}) -> httpx.Response: + encoded_image = generate_image(prompt) + encoded_image = encode(encoded_image) + return await self.llm_spec.probe(prompt, encoded_image, encoded_audio, files) +``` + +## Key Features + +- **Caching**: Generated images are cached to disk using @cache_to_disk +- **Progress Tracking**: tqdm progress bars for dataset conversion +- **Error Handling**: Validates LLM specifications before probing +- **Standard Formats**: Uses JPEG format with base64 encoding + +## Configuration + +The system is configured through: + +1. Figure size (6x4 inches) +1. Background color (light blue) +1. Text styling (16pt dark blue centered text) +1. Image format (JPEG) + +## Limitations + +- Currently only supports text-based image generation +- Fixed visual style and formatting +- Requires matplotlib and associated dependencies diff --git a/docs/optimizer.md b/docs/optimizer.md new file mode 100644 index 0000000..95eec1c --- /dev/null +++ b/docs/optimizer.md @@ -0,0 +1,78 @@ +# Bayesian Optimization in Security Fuzzing + +The fuzzer implements an optimization system using scikit-optimize (skopt) to minimize failure rates during security scans. This document explains the optimizer's implementation and behavior. 
+ +## Overview + +The optimizer is used in both single-shot and many-shot scanning modes when the `optimize` parameter is `True`. It dynamically adjusts scan parameters to minimize failure rates while staying within budget constraints. + +## Implementation Details + +### Initialization + +The optimizer is initialized with: + +```python +Optimizer( + [Real(0, 1)], # Single parameter space (0 to 1) + base_estimator="GP", # Gaussian Process estimator + n_initial_points=25 # Initial exploration points +) +``` + +### Optimization Process + +1. **Parameter Space**: A single real-valued parameter between 0 and 1 +1. **Objective**: Minimize the failure rate (the value reported to the optimizer is the negated failure rate, `-failure_rate`) +1. **Update Mechanism**: + ```python + next_point = optimizer.ask() + optimizer.tell(next_point, -failure_rate) + ``` +1. **Early Stopping**: If the best failure rate exceeds 50%, a status message is emitted and the current module is stopped: + ```python + if best_failure_rate > 0.5: + yield ScanResult.status_msg( + f"High failure rate detected ({best_failure_rate:.2%}). Stopping this module..." + ) + break + ``` + +## Usage in Scanning + +The optimizer is integrated into both scan types: + +### Single-shot Scan + +- Used in `perform_single_shot_scan()` +- Optimizes failure rates across prompt modules +- Considers token budget constraints + +### Many-shot Scan + +- Used in `perform_many_shot_scan()` +- Handles more complex multi-step attacks +- Maintains separate failure rate tracking + +## Key Parameters + +| Parameter | Description | +|-----------|-------------| +| `base_estimator` | Gaussian Process (GP) used for optimization | +| `n_initial_points` | 25 initial exploration points | +| `Real(0, 1)` | Single parameter space being optimized | +| `failure_rate` | Current failure rate being minimized | + +## Optimization Flow + +1. Initialize the optimizer with the GP estimator +1. Collect the initial 25 data points +1. For each prompt: + - Calculate the current failure rate + - Update the optimizer with the new point + - Check for early stopping conditions +1. 
Continue until the scan completes or the budget is exhausted + +## Error Handling + +The optimizer is wrapped in `try`/`except` blocks to ensure scan failures don't crash the entire process. Any optimization errors are logged, and the scan continues with default parameters. diff --git a/mkdocs.yml b/mkdocs.yml index da15002..572d5fa 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -22,6 +22,8 @@ nav: - Dataset Extension: datasets.md - External Modules: external_module.md - CI/CD Integration: ci_cd.md + - Bayesian Optimization: optimizer.md + - Image Generation: image_generation.md - Reference: - API Reference: api_reference.md - Community: