Files
fuzzforge_ai/backend/toolbox/modules/fuzzer/atheris_fuzzer.py
tduhamel42 fccd8f32ab refactor: Update all modules to use new create_finding signature
Updated 10 modules to use the new create_finding() signature with required rule_id and found_by parameters:

- llm_analyzer.py: Added FoundBy and LLMContext for AI-detected findings
- bandit_analyzer.py: Added tool attribution and moved CWE/confidence to proper fields
- security_analyzer.py: Updated all three finding types (secrets, SQL injection, dangerous functions)
- mypy_analyzer.py: Added tool attribution and moved column info to column_start
- mobsf_scanner.py: Updated all 6 finding types (permissions, manifest, code analysis, behavior) with proper line number handling
- opengrep_android.py: Added tool attribution, proper CWE/OWASP formatting, and confidence mapping
- dependency_scanner.py: Added pip-audit attribution for CVE findings
- file_scanner.py: Updated both sensitive file and enumeration findings
- cargo_fuzzer.py: Added fuzzer type attribution for crash findings
- atheris_fuzzer.py: Added fuzzer type attribution for Python crash findings

All modules now properly track:
- Finding source (module, tool name, version, type)
- Confidence levels (high/medium/low)
- CWE and OWASP mappings where applicable
- LLM context for AI-detected issues
2025-11-14 10:51:38 +01:00

620 lines
23 KiB
Python

"""
Atheris Fuzzer Module
Reusable module for fuzzing Python code using Atheris.
Discovers and fuzzes user-provided Python targets with TestOneInput() function.
"""
import asyncio
import base64
import importlib.util
import logging
import multiprocessing
import os
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, List, Optional, Callable
import uuid
import httpx
from modules.base import BaseModule, ModuleMetadata, ModuleResult, ModuleFinding, FoundBy
# Module-level logger named after this module; AtherisFuzzer uses it for
# progress, warning and error messages.
logger = logging.getLogger(__name__)
def _run_atheris_in_subprocess(
    target_path_str: str,
    corpus_dir_str: str,
    max_iterations: int,
    timeout_seconds: int,
    shared_crashes: Any,
    exec_counter: multiprocessing.Value,
    crash_counter: multiprocessing.Value,
    coverage_counter: multiprocessing.Value
):
    """
    Run atheris.Fuzz() in a separate process to isolate os._exit() calls.

    This function is the subprocess entry point: it loads the target module,
    wraps its TestOneInput() so that executions and crashes are recorded,
    configures Atheris and starts fuzzing. Results are communicated to the
    parent exclusively through the shared objects passed in, because the
    process may be ended by os._exit() before any cleanup code can run.

    A fatal error that happens outside the fuzzing callback (the target
    cannot be imported, it has no TestOneInput(), Atheris setup fails) is
    printed to stderr and the process exits with status 1, so the parent can
    tell a broken run apart from a clean run that found nothing.

    Args:
        target_path_str: String path to target file
        corpus_dir_str: String path to corpus directory
        max_iterations: Maximum fuzzing iterations (passed as -runs)
        timeout_seconds: Timeout in seconds (passed as -max_total_time)
        shared_crashes: Manager().list() for storing crash details; only
            its append() method is used here
        exec_counter: Shared counter for executions
        crash_counter: Shared counter for crashes
        coverage_counter: Shared counter for coverage edges (accepted for
            interface compatibility; this function never updates it)

    Raises:
        SystemExit: With code 1 when a fatal error occurs before or outside
            the fuzzing callback.
    """
    # Imports are local so the function is self-contained when it is the
    # entry point of a freshly spawned interpreter.
    import atheris
    import importlib.util
    import sys
    import traceback
    import uuid
    from pathlib import Path

    target_path = Path(target_path_str)
    total_executions = 0

    # NOTE: Crash details are written directly to shared_crashes so they can
    # be read by the parent after this process exits. A local list would be
    # lost because os._exit() prevents cleanup code from running.
    try:
        # Load the target under a unique module name.
        module_name = f"fuzz_target_{uuid.uuid4().hex[:8]}"
        spec = importlib.util.spec_from_file_location(module_name, target_path)
        if spec is None or spec.loader is None:
            raise ImportError(f"Could not load module from {target_path}")
        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        spec.loader.exec_module(module)

        if not hasattr(module, "TestOneInput"):
            raise AttributeError("Module does not have TestOneInput() function")
        test_one_input = module.TestOneInput

        def fuzz_wrapper(data):
            """Call the target once, recording the execution and any crash."""
            nonlocal total_executions
            total_executions += 1
            # Update shared counter for live stats
            with exec_counter.get_lock():
                exec_counter.value += 1
            try:
                test_one_input(data)
            except Exception as e:
                crash_info = {
                    "input": bytes(data),  # Convert to bytes for serialization
                    "exception_type": type(e).__name__,
                    "exception_message": str(e),
                    "stack_trace": traceback.format_exc(),
                    "execution": total_executions
                }
                # Record first, then count, then re-raise so Atheris still
                # sees the failure.
                shared_crashes.append(crash_info)
                with crash_counter.get_lock():
                    crash_counter.value += 1
                raise

        # Use the first dictionary file found next to the target, if any.
        dict_args = []
        target_dir = target_path.parent
        for dict_name in ["fuzz.dict", "fuzzing.dict", "dict.txt"]:
            dict_path = target_dir / dict_name
            if dict_path.exists():
                dict_args.append(f"-dict={dict_path}")
                break

        # Configure Atheris; the corpus directory is the positional argument.
        atheris_args = [
            "atheris_fuzzer",
            f"-runs={max_iterations}",
            f"-max_total_time={timeout_seconds}",
            "-print_final_stats=1"
        ] + dict_args + [corpus_dir_str]
        atheris.Setup(atheris_args, fuzz_wrapper)

        # Run fuzzing (this will call os._exit() when done)
        atheris.Fuzz()
    except SystemExit:
        # Atheris exits when done - this is normal.
        # Crash details were already written to shared_crashes.
        pass
    except Exception:
        # Only exceptions raised inside fuzz_wrapper are recorded in
        # shared_crashes; anything else reaching this point would otherwise
        # vanish and the run would look like a clean one with 0 executions.
        # Report it and exit non-zero so the parent logs the failure.
        traceback.print_exc()
        sys.exit(1)
class AtherisFuzzer(BaseModule):
    """
    Atheris fuzzing module - discovers and fuzzes Python code.

    This module can be used by any workflow to fuzz Python targets. A target
    is a Python file exposing ``TestOneInput(data)``. Fuzzing itself runs in
    a spawned subprocess (``_run_atheris_in_subprocess``); every crash the
    subprocess records is converted into a finding.
    """

    def __init__(self):
        """Initialise per-run state; all of it is reset or filled by execute()."""
        super().__init__()
        self.crashes = []            # crash dicts collected by _run_fuzzing()
        self.total_executions = 0    # final execution count of the last run
        self.start_time = None       # wall-clock start (time.time()) of the run
        self.last_stats_time = 0     # not read anywhere in this class
        self.run_id = None           # optional run identifier from the config

    def get_metadata(self) -> ModuleMetadata:
        """Return module metadata"""
        return ModuleMetadata(
            name="atheris_fuzzer",
            version="1.0.0",
            description="Python fuzzing using Atheris - discovers and fuzzes TestOneInput() functions",
            author="FuzzForge Team",
            category="fuzzer",
            tags=["fuzzing", "atheris", "python", "coverage"],
            input_schema={
                "type": "object",
                "properties": {
                    "target_file": {
                        "type": "string",
                        "description": "Python file with TestOneInput() function (auto-discovered if not specified)"
                    },
                    "max_iterations": {
                        "type": "integer",
                        "description": "Maximum fuzzing iterations",
                        "default": 100000
                    },
                    "timeout_seconds": {
                        "type": "integer",
                        "description": "Fuzzing timeout in seconds",
                        "default": 300
                    },
                    "stats_callback": {
                        "description": "Optional callback for real-time statistics"
                    }
                }
            },
            requires_workspace=True
        )

    def validate_config(self, config: Dict[str, Any]) -> bool:
        """
        Validate fuzzing configuration.

        Args:
            config: Fuzzing configuration; ``max_iterations`` and
                ``timeout_seconds`` are checked, using their defaults when
                absent

        Returns:
            True when the configuration is valid

        Raises:
            ValueError: If either value is not a positive integer
        """
        for key, default in (("max_iterations", 100000), ("timeout_seconds", 300)):
            value = config.get(key, default)
            # bool is a subclass of int, so True/False must be rejected
            # explicitly; they would otherwise be accepted as 1/0.
            if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
                raise ValueError(f"{key} must be positive integer, got: {value}")
        return True

    async def execute(self, config: Dict[str, Any], workspace: Path) -> ModuleResult:
        """
        Execute Atheris fuzzing on user code.

        Configuration and workspace are validated first (validation errors
        propagate to the caller). Any error raised after that point is
        logged and reported as a result with status "failed".

        Args:
            config: Fuzzing configuration (``target_file``,
                ``max_iterations``, ``timeout_seconds``, ``stats_callback``,
                ``run_id``)
            workspace: Path to user's uploaded code

        Returns:
            ModuleResult with crash findings
        """
        # start_timer()/get_execution_time()/validate_workspace()/
        # create_result()/_metadata are not defined in this class;
        # presumably they come from BaseModule.
        self.start_timer()
        self.start_time = time.time()

        # Validate configuration
        self.validate_config(config)
        self.validate_workspace(workspace)

        # Extract config
        target_file = config.get("target_file")
        max_iterations = config.get("max_iterations", 100000)
        timeout_seconds = config.get("timeout_seconds", 300)
        stats_callback = config.get("stats_callback")
        self.run_id = config.get("run_id")

        logger.info(
            f"Starting Atheris fuzzing (max_iterations={max_iterations}, "
            f"timeout={timeout_seconds}s, target={target_file or 'auto-discover'})"
        )

        try:
            # Step 1: Discover or load target
            target_path = self._discover_target(workspace, target_file)
            logger.info(f"Using fuzz target: {target_path}")

            # Step 2: Load target module (fails early if TestOneInput is missing)
            test_one_input = self._load_target_module(target_path)
            logger.info(f"Loaded TestOneInput function from {target_path}")

            # Step 3: Run fuzzing
            await self._run_fuzzing(
                test_one_input=test_one_input,
                target_path=target_path,
                workspace=workspace,
                max_iterations=max_iterations,
                timeout_seconds=timeout_seconds,
                stats_callback=stats_callback
            )

            # Step 4: Generate findings from crashes
            findings = await self._generate_findings(target_path)
            logger.info(
                f"Fuzzing completed: {self.total_executions} executions, "
                f"{len(self.crashes)} crashes found"
            )

            # Generate SARIF report (always, even with no findings)
            from modules.reporter import SARIFReporter
            reporter = SARIFReporter()
            reporter_config = {
                "findings": findings,
                "tool_name": "Atheris Fuzzer",
                "tool_version": self._metadata.version
            }
            reporter_result = await reporter.execute(reporter_config, workspace)
            sarif_report = reporter_result.sarif

            return ModuleResult(
                module=self._metadata.name,
                version=self._metadata.version,
                status="success",
                execution_time=self.get_execution_time(),
                findings=findings,
                summary={
                    "total_executions": self.total_executions,
                    "crashes_found": len(self.crashes),
                    "execution_time": self.get_execution_time(),
                    "target_file": str(target_path.relative_to(workspace))
                },
                metadata={
                    "max_iterations": max_iterations,
                    "timeout_seconds": timeout_seconds
                },
                sarif=sarif_report
            )
        except Exception as e:
            logger.error(f"Fuzzing failed: {e}", exc_info=True)
            return self.create_result(
                findings=[],
                status="failed",
                error=str(e)
            )

    def _discover_target(self, workspace: Path, target_file: Optional[str]) -> Path:
        """
        Discover fuzz target in workspace.

        With an explicit ``target_file`` the path is taken relative to the
        workspace and must be a regular file that resolves inside it.
        Otherwise the workspace is searched recursively for ``fuzz_*.py``,
        ``*_fuzz.py`` and ``fuzz_target.py`` (in that priority order); the
        first match is used and a warning is logged when there are several.

        Args:
            workspace: Path to workspace
            target_file: Explicit target file or None for auto-discovery

        Returns:
            Path to target file

        Raises:
            FileNotFoundError: If the explicit target is not an existing
                file, or auto-discovery finds nothing
            ValueError: If the explicit target resolves outside the workspace
        """
        if target_file:
            # Use specified target
            target_path = workspace / target_file
            if not target_path.is_file():
                raise FileNotFoundError(f"Target file not found: {target_file}")
            # target_file comes from user configuration: refuse absolute
            # paths and ".." segments that escape the workspace.
            try:
                target_path.resolve().relative_to(workspace.resolve())
            except ValueError:
                raise ValueError(
                    f"Target file is outside the workspace: {target_file}"
                ) from None
            return target_path

        # Auto-discover: look for fuzz_*.py or *_fuzz.py
        logger.info("Auto-discovering fuzz targets...")
        candidates: List[Path] = []
        seen = set()
        # rglob searches all subdirectories. Matches are sorted so the chosen
        # target does not depend on filesystem order, and de-duplicated
        # because one file can match several patterns (fuzz_target.py also
        # matches fuzz_*.py).
        for pattern in ["fuzz_*.py", "*_fuzz.py", "fuzz_target.py"]:
            for match in sorted(workspace.rglob(pattern)):
                if match not in seen and match.is_file():
                    seen.add(match)
                    candidates.append(match)

        if not candidates:
            raise FileNotFoundError(
                "No fuzz targets found. Expected files matching: fuzz_*.py, *_fuzz.py, or fuzz_target.py"
            )

        # Use first candidate
        target = candidates[0]
        if len(candidates) > 1:
            logger.warning(
                f"Multiple fuzz targets found: {[str(c) for c in candidates]}. "
                f"Using: {target.name}"
            )
        return target

    def _load_target_module(self, target_path: Path) -> Callable:
        """
        Load target module and get TestOneInput function.

        The target's directory is prepended to ``sys.path`` (if not already
        present) before the module is executed. Note that this executes the
        target file's top-level code in the current process.

        Args:
            target_path: Path to Python file with TestOneInput

        Returns:
            TestOneInput function

        Raises:
            ImportError: If no import spec/loader can be built for the file
            AttributeError: If the module has no ``TestOneInput`` attribute
        """
        # Add target directory to sys.path
        target_dir = target_path.parent
        if str(target_dir) not in sys.path:
            sys.path.insert(0, str(target_dir))

        # Load module dynamically
        module_name = target_path.stem
        spec = importlib.util.spec_from_file_location(module_name, target_path)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load module from {target_path}")
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # Get TestOneInput function
        if not hasattr(module, "TestOneInput"):
            raise AttributeError(
                f"Module {module_name} does not have TestOneInput() function. "
                "Atheris requires a TestOneInput(data: bytes) function."
            )
        return module.TestOneInput

    async def _run_fuzzing(
        self,
        test_one_input: Callable,
        target_path: Path,
        workspace: Path,
        max_iterations: int,
        timeout_seconds: int,
        stats_callback: Optional[Callable] = None
    ):
        """
        Run Atheris fuzzing with real-time monitoring.

        Fuzzing runs in a subprocess created with the 'spawn' start method.
        Executions, crash count and crash details are read back from shared
        objects once the subprocess has exited and stored in
        ``self.total_executions`` and ``self.crashes``. While it runs, and
        once more at the end, ``stats_callback`` (if given) is awaited with a
        statistics dict.

        The subprocess, the monitoring task and the manager process are
        always cleaned up, including when this coroutine is cancelled.

        Args:
            test_one_input: TestOneInput function to fuzz (not used, loaded in subprocess)
            target_path: Path to target file
            workspace: Path to workspace directory
            max_iterations: Max iterations
            timeout_seconds: Timeout in seconds
            stats_callback: Optional async callback for stats
        """
        self.crashes = []
        self.total_executions = 0

        # Create corpus directory in workspace
        corpus_dir = workspace / ".fuzzforge_corpus"
        corpus_dir.mkdir(exist_ok=True)
        logger.info(f"Using corpus directory: {corpus_dir}")
        logger.info(f"Starting Atheris fuzzer in subprocess (max_runs={max_iterations}, timeout={timeout_seconds}s)...")

        def count_corpus_files() -> int:
            """Return the number of entries in the corpus directory (0 on error)."""
            try:
                return len(list(corpus_dir.iterdir())) if corpus_dir.exists() else 0
            except Exception:
                return 0

        # Create shared memory for subprocess communication
        ctx = multiprocessing.get_context('spawn')
        manager = ctx.Manager()
        process = None
        monitor_task = None
        try:
            shared_crashes = manager.list()        # Shared list for crash details
            exec_counter = ctx.Value('i', 0)       # Shared execution counter
            crash_counter = ctx.Value('i', 0)      # Shared crash counter
            coverage_counter = ctx.Value('i', 0)   # Shared coverage counter

            process = ctx.Process(
                target=_run_atheris_in_subprocess,
                args=(
                    str(target_path), str(corpus_dir), max_iterations,
                    timeout_seconds, shared_crashes, exec_counter,
                    crash_counter, coverage_counter
                )
            )

            async def monitor_stats():
                """Monitor and report stats every 0.5 seconds"""
                while True:
                    await asyncio.sleep(0.5)
                    elapsed = time.time() - self.start_time
                    # Read from shared counters
                    current_execs = exec_counter.value
                    current_crashes = crash_counter.value
                    current_coverage = coverage_counter.value
                    execs_per_sec = current_execs / elapsed if elapsed > 0 else 0
                    corpus_size = count_corpus_files()
                    # TODO: Get real coverage from Atheris
                    # For now use corpus_size as proxy
                    coverage_value = current_coverage if current_coverage > 0 else corpus_size
                    await stats_callback({
                        "total_execs": current_execs,
                        "execs_per_sec": execs_per_sec,
                        "crashes": current_crashes,
                        "corpus_size": corpus_size,
                        "coverage": coverage_value,  # Using corpus as coverage proxy
                        "elapsed_time": int(elapsed)
                    })

            # Start monitoring task (only when somebody listens)
            if stats_callback:
                monitor_task = asyncio.create_task(monitor_stats())

            # Start subprocess
            process.start()
            logger.info(f"Fuzzing subprocess started (PID: {process.pid})")

            # Wait for subprocess to complete without blocking the event loop
            while process.is_alive():
                await asyncio.sleep(0.1)

            # NOTE: We cannot use result_queue because Atheris calls os._exit()
            # which terminates immediately without putting results in the queue.
            # Instead, we rely on shared memory (Manager().list() and Value counters).
            self.total_executions = exec_counter.value
            total_crashes = crash_counter.value

            # Read crash details from shared memory and convert to our format
            self.crashes = []
            for crash_data in shared_crashes:
                # Reconstruct crash info with exception object
                self.crashes.append({
                    "input": crash_data["input"],
                    "exception": Exception(crash_data["exception_message"]),
                    "exception_type": crash_data["exception_type"],
                    "stack_trace": crash_data["stack_trace"],
                    "execution": crash_data["execution"]
                })
                logger.warning(
                    f"Crash found (execution {crash_data['execution']}): "
                    f"{crash_data['exception_type']}: {crash_data['exception_message']}"
                )

            logger.info(f"Fuzzing completed: {self.total_executions} executions, {total_crashes} crashes found")

            # Send final stats update
            if stats_callback:
                elapsed = time.time() - self.start_time
                execs_per_sec = self.total_executions / elapsed if elapsed > 0 else 0
                final_corpus_size = count_corpus_files()
                # TODO: Parse coverage from Atheris output
                # For now, use corpus size as proxy (corpus grows with coverage)
                # libFuzzer writes coverage to stdout but sys.stdout redirection
                # doesn't work because it writes to FD 1 directly from C++
                final_coverage = coverage_counter.value if coverage_counter.value > 0 else final_corpus_size
                await stats_callback({
                    "total_execs": self.total_executions,
                    "execs_per_sec": execs_per_sec,
                    "crashes": total_crashes,
                    "corpus_size": final_corpus_size,
                    "coverage": final_coverage,
                    "elapsed_time": int(elapsed)
                })

            # Wait for process to fully terminate
            process.join(timeout=5)
            if process.exitcode is not None and process.exitcode != 0:
                logger.warning(f"Subprocess exited with code: {process.exitcode}")
        except Exception as e:
            logger.error(f"Fuzzing execution error: {e}")
            raise
        finally:
            try:
                # Done in finally rather than in the except clause because
                # asyncio.CancelledError is not an Exception subclass: a
                # cancelled run must not leave the fuzzer running.
                if process is not None and process.is_alive():
                    logger.warning("Terminating fuzzing subprocess...")
                    process.terminate()
                    process.join(timeout=5)
                    if process.is_alive():
                        process.kill()
                        process.join(timeout=5)

                # Stop monitoring
                if monitor_task:
                    monitor_task.cancel()
                    try:
                        await monitor_task
                    except asyncio.CancelledError:
                        pass
            finally:
                # The manager runs its own server process; stop it so it
                # does not outlive this run. Crash data was copied above.
                manager.shutdown()

    async def _generate_findings(self, target_path: Path) -> List[ModuleFinding]:
        """
        Generate ModuleFinding objects from crashes.

        One finding is created per entry in ``self.crashes``. When
        ``self.run_id`` is set, each crash is additionally POSTed to the
        backend (``BACKEND_URL``, default ``http://backend:8000``); a failed
        POST is logged at debug level and does not affect the findings.

        Args:
            target_path: Path to target file

        Returns:
            List of findings
        """
        findings = []
        backend_url = os.getenv("BACKEND_URL", "http://backend:8000")

        for idx, crash in enumerate(self.crashes):
            # Encode crash input for storage
            crash_input_b64 = base64.b64encode(crash["input"]).decode()

            # Create FoundBy attribution
            found_by = FoundBy(
                module="atheris_fuzzer",
                tool_name="Atheris",
                tool_version="unknown",
                type="fuzzer"
            )

            finding = self.create_finding(
                rule_id=f"fuzzer_crash_{crash['exception_type'].lower().replace(' ', '_')}",
                title=f"Crash: {crash['exception_type']}",
                description=(
                    f"Atheris found crash during fuzzing:\n"
                    f"Exception: {crash['exception_type']}\n"
                    f"Message: {str(crash['exception'])}\n"
                    f"Execution: {crash['execution']}"
                ),
                severity="critical",
                category="crash",
                found_by=found_by,
                confidence="high",  # Fuzzer-found crashes are highly reliable
                file_path=str(target_path),
                metadata={
                    "crash_input_base64": crash_input_b64,
                    "crash_input_hex": crash["input"].hex(),
                    "exception_type": crash["exception_type"],
                    "stack_trace": crash["stack_trace"],
                    "execution_number": crash["execution"]
                },
                recommendation=(
                    "Review the crash stack trace and input to identify the vulnerability. "
                    "The crash input is provided in base64 and hex formats for reproduction."
                )
            )
            findings.append(finding)

            # Report crash to backend for real-time monitoring
            if self.run_id:
                try:
                    crash_report = {
                        "run_id": self.run_id,
                        "crash_id": f"crash_{idx + 1}",
                        "timestamp": datetime.utcnow().isoformat(),
                        "crash_type": crash["exception_type"],
                        "stack_trace": crash["stack_trace"],
                        "input_file": crash_input_b64,
                        "severity": "critical",
                        "exploitability": "unknown"
                    }
                    async with httpx.AsyncClient(timeout=5.0) as client:
                        await client.post(
                            f"{backend_url}/fuzzing/{self.run_id}/crash",
                            json=crash_report
                        )
                    logger.debug(f"Crash report sent to backend: {crash_report['crash_id']}")
                except Exception as e:
                    # Best effort: reporting must never break finding generation.
                    logger.debug(f"Failed to post crash report to backend: {e}")

        return findings