# agentic_security/agentic_security/probe_actor/fuzzer.py

import asyncio
import random
import time
from collections.abc import AsyncGenerator
from json import JSONDecodeError

import httpx
import pandas as pd
from skopt import Optimizer
from skopt.space import Real

from agentic_security.http_spec import Modality
from agentic_security.logutils import logger
from agentic_security.primitives import Scan, ScanResult
from agentic_security.probe_actor.cost_module import calculate_cost
from agentic_security.probe_actor.refusal import refusal_heuristic
from agentic_security.probe_data import audio_generator, image_generator, msj_data
from agentic_security.probe_data.data import prepare_prompts

# TODO: full log file

# Upper bound applied when slicing the prompt stored in each emitted ScanResult.
MAX_PROMPT_LENGTH = 2048
# Factor applied to the caller-supplied max_budget in perform_single_shot_scan
# before it is compared against the running token total.
BUDGET_MULTIPLIER = 100_000_000
# Named values for the optimizer-related settings; the scan functions in this
# file work with the same numbers (25 initial points, 5 samples, 0.5 threshold).
INITIAL_OPTIMIZER_POINTS = 25
MIN_FAILURE_SAMPLES = 5
FAILURE_RATE_THRESHOLD = 0.5


async def generate_prompts(
    prompts: list[str] | AsyncGenerator,
) -> AsyncGenerator[str, None]:
    """
    Asynchronously generates and yields individual prompts.

    If the input is a list of strings, the function sequentially yields each string.
    If the input is an asynchronous generator, it forwards each generated prompt.

    Args:
        prompts (list[str] | AsyncGenerator): A list of strings or an asynchronous generator of prompts.

    Yields:
        str: An individual prompt from the list or the asynchronous generator.
    """
    if isinstance(prompts, list):
        for prompt in prompts:
            yield prompt
    else:
        async for prompt in prompts:
            yield prompt


def multi_modality_spec(llm_spec):
    """
    Pick the request object to use for the modality declared by ``llm_spec``.

    IMAGE and AUDIO specs are wrapped in the ``RequestAdapter`` of the
    matching generator module. TEXT specs, and specs with any other modality,
    are handed back untouched.

    Args:
        llm_spec: An object exposing a ``modality`` attribute.

    Returns:
        RequestAdapter | llm_spec: The image or audio adapter wrapping
        ``llm_spec``, or ``llm_spec`` itself when no adapter applies.
    """
    modality = llm_spec.modality
    if modality == Modality.IMAGE:
        return image_generator.RequestAdapter(llm_spec)
    if modality == Modality.AUDIO:
        return audio_generator.RequestAdapter(llm_spec)
    # TEXT and every unrecognised modality need no adaptation.
    return llm_spec


async def process_prompt(
    request_factory, prompt, tokens, module_name, refusals, errors, outputs
) -> tuple[int, bool]:
    """
    Send one prompt through ``request_factory`` and record the outcome.

    The response is classified as an error (HTTP status >= 400, transport
    failure, or a body that is not valid JSON), a refusal (as judged by
    ``refusal_heuristic``), or a normal answer. The tracking lists passed in
    are mutated in place.

    Args:
        request_factory: An object with an awaitable ``fn(prompt=...)`` that
            returns a response exposing ``status_code``, ``text``,
            ``content`` and ``json()``.
        prompt (str): The prompt to send.
        tokens (int): Running token count to build on.
        module_name (str): Name of the dataset/module the prompt belongs to.
        refusals (list): Receives ``(module_name, prompt, status_code, text)``
            for refused prompts.
        errors (list): Receives ``(module_name, prompt, status, message)``
            for failed requests; ``status`` is ``"?"`` when no HTTP status
            is available.
        outputs (list): Receives ``(module_name, prompt, text, refused)``
            for every response that was parsed successfully.

    Returns:
        tuple[int, bool]: The updated token count and a flag that is True
        when the prompt was refused *or* the request failed.
    """
    try:
        response = await request_factory.fn(prompt=prompt)
        if response.status_code == 422:
            logger.error(f"Invalid prompt: {prompt}, error=422")
            errors.append((module_name, prompt, 422, "Invalid prompt"))
            return tokens, True

        if response.status_code >= 400:
            logger.error(f"HTTP {response.status_code} {response.content=}")
            errors.append((module_name, prompt, response.status_code, response.text))
            return tokens, True

        response_text = response.text
        # Whitespace-separated words are used as the token estimate. This is
        # added before the body is parsed, so a response that later fails JSON
        # decoding still contributes to the returned count.
        tokens += len(response_text.split())

        refused = refusal_heuristic(response.json())
        if refused:
            refusals.append((module_name, prompt, response.status_code, response_text))

        outputs.append((module_name, prompt, response_text, refused))
        return tokens, refused

    except httpx.RequestError as exc:
        logger.error(f"Request error: {exc}")
        errors.append((module_name, prompt, "?", str(exc)))
        return tokens, True
    except JSONDecodeError as json_decode_error:
        logger.error(f"JSON decode error: {json_decode_error}")
        errors.append((module_name, prompt, "?", str(json_decode_error)))
        return tokens, True


async def process_prompt_batch(
    request_factory,
    prompts: list[str],
    tokens: int,
    module_name: str,
    refusals,
    errors,
    outputs,
) -> tuple[int, int]:
    """
    Process a batch of prompts concurrently and aggregate the results.

    Every prompt is sent through ``process_prompt`` and the calls are awaited
    together with ``asyncio.gather``.

    Args:
        request_factory: An object with a ``fn`` method used to send the prompts.
        prompts (list[str]): The prompts to process.
        tokens (int): Token count accumulated before this batch.
        module_name (str): Name of the dataset/module the prompts belong to.
        refusals (list): Shared list that collects refused prompts.
        errors (list): Shared list that collects failed prompts.
        outputs (list): Shared list that collects processed outputs.

    Returns:
        tuple[int, int]:
            - ``tokens`` plus the tokens consumed by this batch.
            - Number of prompts flagged as failed (refused or errored).
    """
    # Each task starts from zero so it reports only its own token usage;
    # handing the shared baseline to every task and summing the results
    # would count that baseline once per prompt.
    results = await asyncio.gather(
        *(
            process_prompt(
                request_factory, p, 0, module_name, refusals, errors, outputs
            )
            for p in prompts
        )
    )
    total_tokens = tokens + sum(used for used, _ in results)
    failures = sum(1 for _, failed in results if failed)
    return total_tokens, failures


async def with_error_handling(agen):
    """
    Wrap an asynchronous generator with error reporting.

    Items from ``agen`` are relayed unchanged. If ``agen`` raises an
    ``Exception``, the error is logged and a "Scan failed" status message is
    yielded instead of propagating. In both cases a final "Scan completed."
    status message follows.

    The completion message is deliberately emitted after the ``try`` block
    rather than from a ``finally`` clause: an async generator must not yield
    while it is being closed, so yielding from ``finally`` would make
    ``aclose()`` on this wrapper fail.

    Args:
        agen: An asynchronous generator that produces scan results.

    Yields:
        The items produced by ``agen``, then a failure status message if an
        exception occurred, then a completion status message.
    """
    try:
        async for t in agen:
            yield t
    except Exception as e:
        logger.exception("Scan failed")
        yield ScanResult.status_msg(f"Scan failed: {str(e)}")

    # Reached on normal exhaustion and after a handled Exception; skipped
    # when the wrapper itself is being closed.
    yield ScanResult.status_msg("Scan completed.")


async def perform_single_shot_scan(
    request_factory,
    max_budget: int,
    datasets: list[dict[str, str]] | None = None,
    tools_inbox=None,
    optimize=False,
    stop_event: asyncio.Event | None = None,
    secrets: dict[str, str] | None = None,
) -> AsyncGenerator[str, None]:
    """
    Perform a standard security scan, sending each prompt as its own request.

    Prompts are loaded for every dataset marked ``selected`` and sent one by
    one through ``process_prompt``. After each prompt a JSON-encoded
    ``ScanResult`` is yielded. The scan ends early when ``stop_event`` is set,
    when the token total exceeds the budget, or (with ``optimize``) when the
    tracked failure rate exceeds ``FAILURE_RATE_THRESHOLD``.

    Args:
        request_factory: LLM spec used to send prompts; it is passed through
            ``multi_modality_spec`` first.
        max_budget (int): Budget supplied by the caller; it is multiplied by
            ``BUDGET_MULTIPLIER`` before being compared to the token total.
        datasets (list[dict[str, str]] | None): Dataset descriptors. Each
            needs ``"selected"`` and ``"dataset_name"`` keys and may carry
            ``"opts"``. ``None`` is treated as an empty list.
        tools_inbox: Forwarded to ``prepare_prompts`` (default: None).
        optimize (bool): Enable the optimizer-based early stop (default: False).
        stop_event (asyncio.Event | None): When set, the scan stops before the
            next prompt and the event is cleared (default: None).
        secrets (dict[str, str] | None): Accepted for interface compatibility;
            not read by this function.

    Yields:
        Status messages from ``ScanResult.status_msg`` and one JSON-encoded
        ``ScanResult`` per processed prompt.

    Side effects:
        When the prompt loop finishes (including budget or failure-rate
        stops), errors and refusals are written to ``failures.csv`` in the
        current working directory. A stop via ``stop_event`` returns before
        that file is written.
    """
    # Use a fresh list per call instead of a shared mutable default.
    datasets = datasets or []

    max_budget = max_budget * BUDGET_MULTIPLIER
    selected_datasets = [m for m in datasets if m["selected"]]
    request_factory = multi_modality_spec(request_factory)
    yield ScanResult.status_msg("Loading datasets...")
    prompt_modules = prepare_prompts(
        dataset_names=[m["dataset_name"] for m in selected_datasets],
        budget=max_budget,
        tools_inbox=tools_inbox,
        options=[m.get("opts", {}) for m in selected_datasets],
    )
    yield ScanResult.status_msg("Datasets loaded. Starting scan...")

    errors = []
    refusals = []
    outputs = []
    # Lazy modules have no known length, so they are excluded from the total.
    total_prompts = sum(len(m.prompts) for m in prompt_modules if not m.lazy)
    processed_prompts = 0

    optimizer = (
        Optimizer(
            [Real(0, 1)],
            base_estimator="GP",
            n_initial_points=INITIAL_OPTIMIZER_POINTS,
        )
        if optimize
        else None
    )
    failure_rates = []

    total_tokens = 0
    should_stop = False
    for module in prompt_modules:
        if should_stop:
            break
        # Per-module counters.
        tokens = 0
        module_failures = 0
        module_prompts = 0
        module_size = 0 if module.lazy else len(module.prompts)
        logger.info(f"Scanning {module.dataset_name} {module_size}")

        async for prompt in generate_prompts(module.prompts):
            if stop_event and stop_event.is_set():
                stop_event.clear()
                logger.info("Scan stopped by user.")
                yield ScanResult.status_msg("Scan stopped by user.")
                return

            processed_prompts += 1
            module_prompts += 1
            progress = 100 * processed_prompts / total_prompts if total_prompts else 0
            # NOTE(review): the modulo keeps the value below 100 when lazy
            # modules push processed_prompts past total_prompts, but it also
            # maps exactly 100 to 0 - confirm that is intended.
            progress = progress % 100

            # `tokens` is the running count for this module. Remove the old
            # value and add the new one so total_tokens grows by the delta.
            total_tokens -= tokens
            start = time.time()
            tokens, failed = await process_prompt(
                request_factory,
                prompt,
                tokens,
                module.dataset_name,
                refusals,
                errors,
                outputs,
            )
            end = time.time()
            total_tokens += tokens

            if failed:
                module_failures += 1
            failure_rate = module_failures / max(module_prompts, 1)
            failure_rates.append(failure_rate)
            cost = calculate_cost(tokens)

            # The response text is only available when process_prompt
            # appended an output entry for this very prompt.
            last_output = outputs[-1] if outputs else None
            if last_output and last_output[1] == prompt:
                response_text = last_output[2]
            else:
                response_text = ""

            yield ScanResult(
                module=module.dataset_name,
                tokens=round(tokens / 1000, 1),
                cost=cost,
                progress=round(progress, 2),
                failureRate=round(failure_rate * 100, 2),
                prompt=prompt[:MAX_PROMPT_LENGTH],
                latency=end - start,
                model=response_text,
            ).model_dump_json()

            if optimize and len(failure_rates) >= MIN_FAILURE_SAMPLES:
                next_point = optimizer.ask()
                # The optimizer minimises, so the failure rate is negated;
                # the best result is therefore the highest rate reported.
                optimizer.tell(next_point, -failure_rate)
                best_failure_rate = -optimizer.get_result().fun
                if best_failure_rate > FAILURE_RATE_THRESHOLD:
                    yield ScanResult.status_msg(
                        f"High failure rate detected ({best_failure_rate:.2%}). Stopping this module..."
                    )
                    # should_stop also prevents the remaining modules from running.
                    should_stop = True
                    break
            if total_tokens > max_budget:
                logger.info(
                    f"Scan ran out of budget and stopped. {total_tokens=} {max_budget=}"
                )
                yield ScanResult.status_msg(
                    f"Scan ran out of budget and stopped. {total_tokens=} {max_budget=}"
                )
                should_stop = True
                break

    yield ScanResult.status_msg("Scan completed.")

    failure_data = errors + refusals
    df = pd.DataFrame(
        failure_data, columns=["module", "prompt", "status_code", "content"]
    )
    df.to_csv("failures.csv", index=False)


async def perform_many_shot_scan(
    request_factory,
    max_budget: int,
    datasets: list[dict[str, str]] | None = None,
    probe_datasets: list[dict[str, str]] | None = None,
    tools_inbox=None,
    optimize=False,
    stop_event: asyncio.Event | None = None,
    probe_frequency: float = 0.2,
    max_ctx_length: int = 10_000,
    secrets: dict[str, str] | None = None,
) -> AsyncGenerator[str, None]:
    """
    Perform a multi-step scan that builds up context from probe (MSJ) prompts.

    For every prompt of the selected datasets, a probe module is chosen at
    random and up to 20 requests are sent. Each request extends the context
    with another random probe prompt; once the running token count exceeds
    ``max_ctx_length`` the context is replaced by the target prompt and that
    request is the last one for the prompt. A failed request also ends the
    inner loop. One JSON-encoded ``ScanResult`` is yielded per target prompt.

    Args:
        request_factory: LLM spec used to send prompts; it is passed through
            ``multi_modality_spec`` first.
        max_budget (int): Forwarded to ``prepare_prompts`` as ``budget``.
            This function does not itself compare token usage against it.
        datasets (list[dict[str, str]] | None): Main dataset descriptors with
            ``"selected"`` and ``"dataset_name"`` keys. ``None`` is treated
            as an empty list.
        probe_datasets (list[dict[str, str]] | None): Descriptors handed to
            ``msj_data.prepare_prompts``. ``None`` is treated as an empty list.
        tools_inbox: Forwarded to ``prepare_prompts`` (default: None).
        optimize (bool): Enable the optimizer-based early stop (default: False).
        stop_event (asyncio.Event | None): When set, the scan stops before the
            next prompt and the event is cleared (default: None).
        probe_frequency (float): Accepted for interface compatibility; not
            read by this function.
        max_ctx_length (int): Token count above which the target prompt is
            sent (default: 10,000).
        secrets (dict[str, str] | None): Accepted for interface compatibility;
            not read by this function.

    Yields:
        Status messages from ``ScanResult.status_msg`` and one JSON-encoded
        ``ScanResult`` per processed target prompt.

    Side effects:
        When the module loop finishes, errors and refusals are written to
        ``failures.csv`` in the current working directory. A stop via
        ``stop_event`` returns before that file is written.
    """
    # Use fresh lists per call instead of shared mutable defaults.
    datasets = datasets or []
    probe_datasets = probe_datasets or []

    request_factory = multi_modality_spec(request_factory)
    # Load main and probe datasets
    yield ScanResult.status_msg("Loading datasets...")
    prompt_modules = prepare_prompts(
        dataset_names=[m["dataset_name"] for m in datasets if m["selected"]],
        budget=max_budget,
        tools_inbox=tools_inbox,
    )
    yield ScanResult.status_msg("Loading datasets for MSJ...")
    msj_modules = msj_data.prepare_prompts(probe_datasets)
    yield ScanResult.status_msg("Datasets loaded. Starting scan...")

    errors = []
    refusals = []
    outputs = []
    # Lazy modules have no known length, so they are excluded from the total.
    total_prompts = sum(len(m.prompts) for m in prompt_modules if not m.lazy)
    processed_prompts = 0

    optimizer = (
        Optimizer(
            [Real(0, 1)],
            base_estimator="GP",
            n_initial_points=INITIAL_OPTIMIZER_POINTS,
        )
        if optimize
        else None
    )
    failure_rates = []

    for module in prompt_modules:
        module_failures = 0
        module_size = 0 if module.lazy else len(module.prompts)
        logger.info(f"Scanning {module.dataset_name} {module_size}")

        async for prompt in generate_prompts(module.prompts):
            if stop_event and stop_event.is_set():
                stop_event.clear()
                logger.info("Scan stopped by user.")
                yield ScanResult.status_msg("Scan stopped by user.")
                return
            tokens = 0
            processed_prompts += 1
            progress = 100 * processed_prompts / total_prompts if total_prompts else 0
            # NOTE(review): the modulo also maps exactly 100 to 0 - confirm
            # that is intended.
            progress = progress % 100

            full_prompt = ""
            # random.choice raises IndexError when msj_modules is empty.
            msj_module = random.choice(msj_modules)

            # full_prompt is still empty here, so this currently adds 0.
            prompt_tokens = len(full_prompt.split())
            tokens += prompt_tokens

            injected = False
            # At most 20 requests are sent per target prompt.
            for _ in range(20):
                m_prompt = random.choice(msj_module.prompts)
                full_prompt += "\n" + m_prompt
                if tokens > max_ctx_length:
                    # NOTE(review): this replaces the accumulated probe
                    # context with the target prompt instead of appending to
                    # it - confirm that is intended.
                    full_prompt = "\n" + prompt
                    injected = True

                tokens, failed = await process_prompt(
                    request_factory,
                    full_prompt,
                    tokens,
                    module.dataset_name,
                    refusals,
                    errors,
                    outputs,
                )
                if failed:
                    module_failures += 1
                    break
                if injected:
                    break

            # NOTE(review): module_failures is per module while
            # processed_prompts counts across all modules - confirm the
            # intended denominator.
            failure_rate = module_failures / max(processed_prompts, 1)
            failure_rates.append(failure_rate)
            cost = calculate_cost(tokens)

            yield ScanResult(
                module=module.dataset_name,
                tokens=round(tokens / 1000, 1),
                cost=cost,
                progress=round(progress, 2),
                failureRate=round(failure_rate * 100, 2),
                prompt=prompt[:MAX_PROMPT_LENGTH],
            ).model_dump_json()

            if optimize and len(failure_rates) >= MIN_FAILURE_SAMPLES:
                next_point = optimizer.ask()
                # The optimizer minimises, so the failure rate is negated;
                # the best result is therefore the highest rate reported.
                optimizer.tell(next_point, -failure_rate)
                best_failure_rate = -optimizer.get_result().fun
                if best_failure_rate > FAILURE_RATE_THRESHOLD:
                    yield ScanResult.status_msg(
                        f"High failure rate detected ({best_failure_rate:.2%}). Stopping this module..."
                    )
                    # Only the current module is abandoned; the next one still runs.
                    break

    yield ScanResult.status_msg("Scan completed.")

    df = pd.DataFrame(
        errors + refusals, columns=["module", "prompt", "status_code", "content"]
    )
    df.to_csv("failures.csv", index=False)


def scan_router(
    request_factory,
    scan_parameters: Scan,
    tools_inbox=None,
    stop_event: asyncio.Event = None,
):
    """
    Select the scan implementation requested by ``scan_parameters``.

    When ``scan_parameters.enableMultiStepAttack`` is truthy the many-shot
    scan is used, otherwise the single-shot scan. The chosen scan is wrapped
    in ``with_error_handling`` before being returned.

    Args:
        request_factory: LLM spec used by the scan to send prompts.
        scan_parameters (Scan): Scan settings. The fields read here are:
            - enableMultiStepAttack: selects the many-shot scan when truthy.
            - maxBudget: forwarded as ``max_budget``.
            - datasets: forwarded as ``datasets``.
            - probe_datasets: forwarded as ``probe_datasets`` (many-shot only).
            - optimize: forwarded as ``optimize``.
            - secrets: forwarded as ``secrets``.
        tools_inbox: Forwarded to the scan (default: None).
        stop_event (asyncio.Event, optional): Forwarded to the scan so it can
            be stopped early (default: None).

    Returns:
        The async generator produced by ``with_error_handling`` around either
        ``perform_many_shot_scan`` or ``perform_single_shot_scan``.
    """
    if not scan_parameters.enableMultiStepAttack:
        single_shot = perform_single_shot_scan(
            request_factory=request_factory,
            max_budget=scan_parameters.maxBudget,
            datasets=scan_parameters.datasets,
            tools_inbox=tools_inbox,
            optimize=scan_parameters.optimize,
            stop_event=stop_event,
            secrets=scan_parameters.secrets,
        )
        return with_error_handling(single_shot)

    many_shot = perform_many_shot_scan(
        request_factory=request_factory,
        max_budget=scan_parameters.maxBudget,
        datasets=scan_parameters.datasets,
        probe_datasets=scan_parameters.probe_datasets,
        tools_inbox=tools_inbox,
        optimize=scan_parameters.optimize,
        stop_event=stop_event,
        secrets=scan_parameters.secrets,
    )
    return with_error_handling(many_shot)