Files
fuzzforge_ai/backend/toolbox/modules/fuzzing/atheris.py
Tanguy Duhamel 323a434c73 Initial commit
2025-09-29 21:26:41 +02:00

774 lines
29 KiB
Python

"""
Atheris Fuzzing Module
This module uses Atheris for fuzzing Python code to find bugs and security
vulnerabilities in Python applications and libraries.
"""
# Copyright (c) 2025 FuzzingLabs
#
# Licensed under the Business Source License 1.1 (BSL). See the LICENSE file
# at the root of this repository for details.
#
# After the Change Date (four years from publication), this version of the
# Licensed Work will be made available under the Apache License, Version 2.0.
# See the LICENSE-APACHE file or http://www.apache.org/licenses/LICENSE-2.0
#
# Additional attribution and requirements are provided in the NOTICE file.
import asyncio
import json
import logging
import os
import re
import subprocess
import sys
import traceback
from pathlib import Path
from typing import Any, Dict, List, Optional

from ..base import BaseModule, ModuleFinding, ModuleMetadata, ModuleResult
from . import register_module
# Module-level logger, named after this module's dotted import path.
logger = logging.getLogger(__name__)
@register_module
class AtherisModule(BaseModule):
    """Atheris Python fuzzing module.

    Generates a self-contained wrapper script around a user-supplied fuzz
    target, runs it under Atheris (coverage-guided, libFuzzer-based fuzzing
    for Python), and converts observed exceptions and crash artifacts into
    ``ModuleFinding`` objects.
    """

    def get_metadata(self) -> ModuleMetadata:
        """Get module metadata, including input/output JSON schemas.

        Note: schema defaults use Python ``True``/``False`` — the previous
        JSON-style ``true``/``false`` raised ``NameError`` at call time.
        """
        return ModuleMetadata(
            name="atheris",
            version="2.3.0",
            description="Coverage-guided Python fuzzing engine for finding bugs in Python code",
            author="FuzzForge Team",
            category="fuzzing",
            tags=["python", "coverage-guided", "native", "sanitizers", "libfuzzer"],
            input_schema={
                "type": "object",
                "properties": {
                    "target_script": {
                        "type": "string",
                        "description": "Path to the Python script containing the fuzz target function"
                    },
                    "target_function": {
                        "type": "string",
                        "default": "TestOneInput",
                        "description": "Name of the target function to fuzz"
                    },
                    "corpus_dir": {
                        "type": "string",
                        "description": "Directory containing initial corpus files"
                    },
                    "dict_file": {
                        "type": "string",
                        "description": "Dictionary file for fuzzing keywords"
                    },
                    "max_total_time": {
                        "type": "integer",
                        "default": 600,
                        "description": "Maximum total time to run fuzzing (seconds)"
                    },
                    "max_len": {
                        "type": "integer",
                        "default": 4096,
                        "description": "Maximum length of test input"
                    },
                    "timeout": {
                        "type": "integer",
                        "default": 25,
                        "description": "Timeout for individual test cases (seconds)"
                    },
                    "runs": {
                        "type": "integer",
                        "default": -1,
                        "description": "Number of individual test runs (-1 for unlimited)"
                    },
                    "jobs": {
                        "type": "integer",
                        "default": 1,
                        "description": "Number of fuzzing jobs to run in parallel"
                    },
                    "print_final_stats": {
                        "type": "boolean",
                        "default": True,
                        "description": "Print final statistics"
                    },
                    "print_pcs": {
                        "type": "boolean",
                        "default": False,
                        "description": "Print newly covered PCs"
                    },
                    "print_coverage": {
                        "type": "boolean",
                        "default": True,
                        "description": "Print coverage information"
                    },
                    "artifact_prefix": {
                        "type": "string",
                        "default": "crash-",
                        "description": "Prefix for artifact files"
                    },
                    "seed": {
                        "type": "integer",
                        "description": "Random seed for reproducibility"
                    },
                    "python_path": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Additional Python paths to add to sys.path"
                    },
                    "enable_sanitizers": {
                        "type": "boolean",
                        "default": True,
                        "description": "Enable Python-specific sanitizers and checks"
                    },
                    "detect_leaks": {
                        "type": "boolean",
                        "default": True,
                        "description": "Detect memory leaks in native extensions"
                    },
                    "detect_stack_use_after_return": {
                        "type": "boolean",
                        "default": False,
                        "description": "Detect stack use-after-return"
                    },
                    "setup_code": {
                        "type": "string",
                        "description": "Python code to execute before fuzzing starts"
                    },
                    "enable_value_profile": {
                        "type": "boolean",
                        "default": False,
                        "description": "Enable value profiling for better mutation"
                    },
                    "shrink": {
                        "type": "boolean",
                        "default": True,
                        "description": "Try to shrink the corpus"
                    },
                    "only_ascii": {
                        "type": "boolean",
                        "default": False,
                        "description": "Only generate ASCII inputs"
                    }
                }
            },
            output_schema={
                "type": "object",
                "properties": {
                    "findings": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "exception_type": {"type": "string"},
                                "exception_message": {"type": "string"},
                                "stack_trace": {"type": "string"},
                                "crash_input": {"type": "string"}
                            }
                        }
                    }
                }
            }
        )

    def validate_config(self, config: Dict[str, Any]) -> bool:
        """Validate configuration.

        Raises:
            ValueError: if ``target_script`` is missing or ``max_total_time``
                is not a positive number of seconds.
        """
        if not config.get("target_script"):
            raise ValueError("target_script is required for Atheris")
        if config.get("max_total_time", 600) <= 0:
            raise ValueError("max_total_time must be positive")
        return True

    async def execute(self, config: Dict[str, Any], workspace: Path) -> ModuleResult:
        """Execute Atheris Python fuzzing.

        Returns a "success" ModuleResult carrying the findings, or a
        "failed" result with the error message — this method never raises.
        """
        self.start_timer()
        try:
            self.validate_config(config)
            self.validate_workspace(workspace)
            logger.info("Running Atheris Python fuzzing")

            # Fail fast if atheris is not importable by this interpreter.
            await self._check_atheris_installation()

            target_script = workspace / config["target_script"]
            if not target_script.exists():
                raise FileNotFoundError(f"Target script not found: {target_script}")

            findings = await self._run_atheris_fuzzing(target_script, config, workspace)
            summary = self._create_summary(findings)
            logger.info(f"Atheris found {len(findings)} issues")
            return self.create_result(
                findings=findings,
                status="success",
                summary=summary
            )
        except Exception as e:
            logger.error(f"Atheris module failed: {e}")
            return self.create_result(
                findings=[],
                status="failed",
                error=str(e)
            )

    async def _check_atheris_installation(self):
        """Check that Atheris is importable; log its version.

        Raises:
            RuntimeError: if atheris cannot be imported or the check itself
                fails (e.g. the subprocess cannot be spawned).
        """
        try:
            process = await asyncio.create_subprocess_exec(
                sys.executable, "-c", "import atheris; print(atheris.__version__)",
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            stdout, _stderr = await process.communicate()
        except Exception as e:
            raise RuntimeError(f"Atheris installation check failed: {e}")
        # Raise outside the try-block so this message is not re-wrapped.
        if process.returncode != 0:
            raise RuntimeError("Atheris not installed. Install with: pip install atheris")
        version = stdout.decode().strip()
        logger.info(f"Using Atheris version: {version}")

    async def _run_atheris_fuzzing(self, target_script: Path, config: Dict[str, Any], workspace: Path) -> List[ModuleFinding]:
        """Run Atheris via a generated wrapper script and collect findings.

        Best-effort: any error is logged and whatever findings were gathered
        so far are returned.
        """
        findings: List[ModuleFinding] = []
        try:
            # Directory for libFuzzer artifacts and our crash_*.txt reports.
            output_dir = workspace / "atheris_output"
            output_dir.mkdir(exist_ok=True)

            wrapper_script = await self._create_atheris_wrapper(target_script, config, workspace, output_dir)

            cmd = [sys.executable, str(wrapper_script)]

            # An existing corpus directory is passed as a positional argument.
            corpus_dir = config.get("corpus_dir")
            if corpus_dir:
                corpus_path = workspace / corpus_dir
                if corpus_path.exists():
                    cmd.append(str(corpus_path))

            env = self._setup_atheris_environment(config)
            logger.debug(f"Running command: {' '.join(cmd)}")

            max_total_time = config.get("max_total_time", 600)
            try:
                process = await asyncio.create_subprocess_exec(
                    *cmd,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE,
                    cwd=workspace,
                    env=env
                )
                # Default to empty output so a timeout still lets us parse
                # the crash files the wrapper managed to write (the previous
                # code left stdout/stderr unbound on timeout and lost all
                # findings to an UnboundLocalError).
                stdout, stderr = b"", b""
                try:
                    # The wrapper passes -max_total_time to libFuzzer, so the
                    # process should exit on its own; allow a grace period
                    # before we forcibly stop it.
                    stdout, stderr = await asyncio.wait_for(
                        process.communicate(), timeout=max_total_time + 30
                    )
                except asyncio.TimeoutError:
                    logger.info(f"Atheris fuzzing timed out after {max_total_time} seconds")
                    process.terminate()
                    try:
                        await asyncio.wait_for(process.wait(), timeout=10)
                    except asyncio.TimeoutError:
                        process.kill()
                        await process.wait()

                # Fuzzer output may contain arbitrary bytes; never let a
                # decode error discard the run.
                findings = self._parse_atheris_output(
                    stdout.decode(errors="replace"),
                    stderr.decode(errors="replace"),
                    output_dir,
                    workspace
                )
                findings.extend(self._parse_crash_files(output_dir, workspace))
            except Exception as e:
                logger.warning(f"Error running Atheris process: {e}")
        except Exception as e:
            logger.warning(f"Error in Atheris fuzzing: {e}")
        return findings

    async def _create_atheris_wrapper(self, target_script: Path, config: Dict[str, Any], workspace: Path, output_dir: Path) -> Path:
        """Create a self-contained wrapper script for Atheris fuzzing.

        All configuration values are interpolated at generation time: the
        previous version emitted ``config.get(...)`` expressions into the
        wrapper, where no ``config`` variable exists, so the wrapper crashed
        with NameError before fuzzing started.  The libFuzzer options are
        also placed on argv *before* ``atheris.Setup`` so they take effect.
        """
        wrapper_path = workspace / "atheris_wrapper.py"

        # Build the libFuzzer option list with concrete values baked in.
        options: List[str] = [
            f"-max_total_time={config.get('max_total_time', 600)}",
            f"-max_len={config.get('max_len', 4096)}",
            f"-timeout={config.get('timeout', 25)}",
            f"-runs={config.get('runs', -1)}",
        ]
        if config.get("jobs", 1) > 1:
            options.append(f"-jobs={config.get('jobs', 1)}")
        options.append("-print_final_stats=1" if config.get("print_final_stats", True) else "-print_final_stats=0")
        if config.get("print_pcs", False):
            options.append("-print_pcs=1")
        if config.get("print_coverage", True):
            options.append("-print_coverage=1")
        artifact_prefix = config.get("artifact_prefix", "crash-")
        options.append(f"-artifact_prefix={output_dir}/{artifact_prefix}")
        seed = config.get("seed")
        if seed is not None:
            options.append(f"-seed={seed}")
        if config.get("enable_value_profile", False):
            options.append("-use_value_profile=1")
        if config.get("shrink", True):
            options.append("-shrink=1")
        if config.get("only_ascii", False):
            options.append("-only_ascii=1")
        dict_file = config.get("dict_file", "")
        if dict_file:
            dict_path = workspace / dict_file
            if dict_path.exists():
                options.append(f"-dict={dict_path}")

        # repr()-embedding (!r) yields valid Python literals and keeps
        # Windows path backslashes intact.
        wrapper_code = f'''#!/usr/bin/env python3
import sys
import os
import atheris
import traceback

# Extra import paths requested by the module configuration
python_paths = {config.get("python_path", [])!r}
for path in python_paths:
    if path not in sys.path:
        sys.path.insert(0, path)

# Make the workspace and the target script importable
sys.path.insert(0, {str(workspace)!r})
sys.path.insert(0, {str(target_script.parent)!r})

# Optional user setup code (trusted module configuration, executed verbatim)
setup_code = {config.get("setup_code", "")!r}
if setup_code:
    exec(setup_code)

# Import target script
try:
    target_module = __import__({target_script.stem!r})
    target_function = getattr(target_module, {config.get("target_function", "TestOneInput")!r})
except Exception as e:
    print(f"Failed to import target: {{e}}")
    sys.exit(1)

original_target = target_function

def wrapped_target(data):
    """Invoke the target and record any exception before re-raising."""
    try:
        return original_target(data)
    except Exception as e:
        preview = data[:1000].hex() if isinstance(data, bytes) else str(data)[:1000]
        crash_file = os.path.join({str(output_dir)!r}, "crash_" + type(e).__name__ + ".txt")
        with open(crash_file, "a") as f:
            f.write(f"Exception: {{type(e).__name__}}\\n")
            f.write(f"Message: {{str(e)}}\\n")
            f.write(f"Stack trace:\\n{{traceback.format_exc()}}\\n")
            f.write(f"Input data (first 1000 chars/bytes): {{preview}}\\n")
            f.write("-" * 80 + "\\n")
        # Re-raise to let Atheris handle it
        raise

if __name__ == "__main__":
    # Options must be on argv before Setup() so Atheris forwards them
    # to libFuzzer.
    sys.argv.extend({options!r})
    atheris.Setup(sys.argv, wrapped_target)
    atheris.Fuzz()
'''
        wrapper_path.write_text(wrapper_code)
        return wrapper_path

    def _setup_atheris_environment(self, config: Dict[str, Any]) -> Dict[str, str]:
        """Setup environment variables for Atheris.

        Builds ASAN_OPTIONS as a colon-joined list, preserving any value
        inherited from the environment (the previous version produced a
        leading ':' and duplicated detect_leaks=1).
        """
        env = os.environ.copy()
        asan_opts: List[str] = [env["ASAN_OPTIONS"]] if env.get("ASAN_OPTIONS") else []
        if config.get("enable_sanitizers", True):
            asan_opts.append("halt_on_error=1")
        if config.get("detect_leaks", True):
            asan_opts.append("detect_leaks=1")
        if config.get("detect_stack_use_after_return", False):
            asan_opts.append("detect_stack_use_after_return=1")
        if asan_opts:
            env["ASAN_OPTIONS"] = ":".join(asan_opts)
        return env

    def _parse_atheris_output(self, stdout: str, stderr: str, output_dir: Path, workspace: Path) -> List[ModuleFinding]:
        """Parse combined Atheris stdout/stderr for Python exceptions."""
        findings: List[ModuleFinding] = []
        try:
            full_output = stdout + "\n" + stderr
            # First pattern: plain Python tracebacks; second: the structured
            # entries written by the wrapper's crash files.
            exception_patterns = [
                r"Traceback \(most recent call last\):(.*?)(?=\n\w|\nDONE|\n=|\Z)",
                r"Exception: (\w+).*?\nMessage: (.*?)\nStack trace:\n(.*?)(?=\n-{20,}|\Z)"
            ]
            for pattern in exception_patterns:
                for match in re.findall(pattern, full_output, re.DOTALL | re.MULTILINE):
                    finding = self._create_exception_finding(match, full_output, output_dir)
                    if finding:
                        findings.append(finding)
        except Exception as e:
            logger.warning(f"Error parsing Atheris output: {e}")
        return findings

    def _parse_crash_files(self, output_dir: Path, workspace: Path) -> List[ModuleFinding]:
        """Collect findings from wrapper crash reports and libFuzzer artifacts."""
        findings: List[ModuleFinding] = []
        try:
            # crash_*.txt files are written by the generated wrapper.
            for crash_file in output_dir.glob("crash_*.txt"):
                findings.extend(self._parse_crash_file(crash_file, workspace))
            # crash-* files are raw reproducer inputs written by libFuzzer.
            for artifact_file in output_dir.glob("crash-*"):
                finding = self._create_artifact_finding(artifact_file, workspace)
                if finding:
                    findings.append(finding)
        except Exception as e:
            logger.warning(f"Error parsing crash files: {e}")
        return findings

    def _parse_crash_file(self, crash_file: Path, workspace: Path) -> List[ModuleFinding]:
        """Parse one crash report file, which may hold several entries."""
        findings: List[ModuleFinding] = []
        try:
            content = crash_file.read_text()
            # Entries are separated by the 80-dash line the wrapper writes.
            for entry in content.split("-" * 80):
                if not entry.strip():
                    continue
                finding = self._parse_crash_entry(entry, crash_file, workspace)
                if finding:
                    findings.append(finding)
        except Exception as e:
            logger.warning(f"Error parsing crash file {crash_file}: {e}")
        return findings

    def _parse_crash_entry(self, entry: str, crash_file: Path, workspace: Path) -> Optional[ModuleFinding]:
        """Parse a single crash entry; return None if it has no exception type."""
        try:
            exception_type = ""
            exception_message = ""
            input_data = ""
            current_section = None
            stack_lines: List[str] = []
            for line in entry.strip().split('\n'):
                if line.startswith("Exception: "):
                    exception_type = line.replace("Exception: ", "")
                elif line.startswith("Message: "):
                    exception_message = line.replace("Message: ", "")
                elif line.startswith("Stack trace:"):
                    current_section = "stack"
                elif line.startswith("Input data"):
                    current_section = "input"
                    input_data = line.split(":", 1)[1].strip() if ":" in line else ""
                elif current_section == "stack":
                    stack_lines.append(line)
            stack_trace = '\n'.join(stack_lines)

            if not exception_type:
                return None

            severity = self._get_exception_severity(exception_type)

            # Report the crash file relative to the workspace when possible.
            try:
                file_path = str(crash_file.relative_to(workspace))
            except ValueError:
                file_path = str(crash_file)

            return self.create_finding(
                title=f"Atheris Exception: {exception_type}",
                description=f"Atheris discovered a Python exception: {exception_type}{': ' + exception_message if exception_message else ''}",
                severity=severity,
                category=self._get_exception_category(exception_type),
                file_path=file_path,
                recommendation=self._get_exception_recommendation(exception_type, exception_message),
                metadata={
                    "exception_type": exception_type,
                    "exception_message": exception_message,
                    "stack_trace": stack_trace[:2000] if stack_trace else "",  # Limit size
                    "crash_input_preview": input_data[:500] if input_data else "",
                    "fuzzer": "atheris"
                }
            )
        except Exception as e:
            logger.warning(f"Error parsing crash entry: {e}")
            return None

    def _create_exception_finding(self, match, full_output: str, output_dir: Path) -> Optional[ModuleFinding]:
        """Create a finding from a regex match against fuzzer output.

        ``match`` is either a 3-tuple (type, message, trace) from the
        structured pattern, or a 1-tuple/string holding a raw traceback.
        """
        try:
            if isinstance(match, tuple) and len(match) >= 1:
                if len(match) == 3:  # structured Exception/Message/Stack format
                    exception_type, exception_message, stack_trace = match
                else:
                    stack_trace = match[0]
                    exception_type = "Unknown"
                    exception_message = ""
            else:
                stack_trace = str(match)
                exception_type = "Unknown"
                exception_message = ""

            # For raw tracebacks, recover the exception type from the last
            # "SomeError: message" line.
            if not exception_type or exception_type == "Unknown":
                for line in reversed(stack_trace.split('\n')):
                    if ':' in line and any(exc in line for exc in ['Error', 'Exception', 'Warning']):
                        exception_type = line.split(':')[0].strip()
                        exception_message = line.split(':', 1)[1].strip() if ':' in line else ""
                        break

            severity = self._get_exception_severity(exception_type)
            return self.create_finding(
                title=f"Atheris Exception: {exception_type}",
                description=f"Atheris discovered a Python exception during fuzzing: {exception_type}",
                severity=severity,
                category=self._get_exception_category(exception_type),
                file_path=None,
                recommendation=self._get_exception_recommendation(exception_type, exception_message),
                metadata={
                    "exception_type": exception_type,
                    "exception_message": exception_message,
                    "stack_trace": stack_trace[:2000] if stack_trace else "",
                    "fuzzer": "atheris"
                }
            )
        except Exception as e:
            logger.warning(f"Error creating exception finding: {e}")
            return None

    def _create_artifact_finding(self, artifact_file: Path, workspace: Path) -> Optional[ModuleFinding]:
        """Create a finding from a libFuzzer crash artifact file."""
        try:
            # Hex-dump at most the first 1000 bytes of the reproducer input.
            artifact_content = ""
            try:
                artifact_content = artifact_file.read_bytes()[:1000].hex()
            except Exception:
                pass

            try:
                file_path = str(artifact_file.relative_to(workspace))
            except ValueError:
                file_path = str(artifact_file)

            return self.create_finding(
                title="Atheris Crash Artifact",
                description=f"Atheris generated a crash artifact file: {artifact_file.name}",
                severity="medium",
                category="program_crash",
                file_path=file_path,
                recommendation="Analyze the crash artifact to reproduce and debug the issue. The artifact contains the input that caused the crash.",
                metadata={
                    "artifact_type": "crash",
                    "artifact_file": artifact_file.name,
                    "artifact_content_hex": artifact_content,
                    "fuzzer": "atheris"
                }
            )
        except Exception as e:
            logger.warning(f"Error creating artifact finding: {e}")
            return None

    def _get_exception_severity(self, exception_type: str) -> str:
        """Map an exception type name to a severity level (default 'medium')."""
        if not exception_type:
            return "medium"
        exception_lower = exception_type.lower()
        # Memory-safety issues in native extensions are critical.
        if any(term in exception_lower for term in ["segmentationfault", "accessviolation", "memoryerror"]):
            return "critical"
        # Common logic errors that often indicate exploitable input handling.
        elif any(term in exception_lower for term in ["attributeerror", "typeerror", "indexerror", "keyerror", "valueerror"]):
            return "high"
        elif any(term in exception_lower for term in ["assertionerror", "runtimeerror", "ioerror", "oserror"]):
            return "medium"
        elif any(term in exception_lower for term in ["warning", "deprecation"]):
            return "low"
        else:
            return "medium"

    def _get_exception_category(self, exception_type: str) -> str:
        """Map an exception type name to a finding category."""
        if not exception_type:
            return "python_exception"
        exception_lower = exception_type.lower()
        if any(term in exception_lower for term in ["memory", "segmentation", "access"]):
            return "memory_corruption"
        elif any(term in exception_lower for term in ["attribute", "type"]):
            return "type_error"
        elif any(term in exception_lower for term in ["index", "key", "value"]):
            return "data_error"
        elif any(term in exception_lower for term in ["io", "os", "file"]):
            return "io_error"
        elif any(term in exception_lower for term in ["assertion"]):
            return "assertion_failure"
        else:
            return "python_exception"

    def _get_exception_recommendation(self, exception_type: str, exception_message: str) -> str:
        """Generate a remediation recommendation for the exception type."""
        if not exception_type:
            return "Analyze the exception and fix the underlying code issue."
        exception_lower = exception_type.lower()
        if "attributeerror" in exception_lower:
            return "Fix AttributeError by ensuring objects have the expected attributes before accessing them. Add proper error handling and validation."
        elif "typeerror" in exception_lower:
            return "Fix TypeError by ensuring correct data types are used. Add type checking and validation for function parameters."
        elif "indexerror" in exception_lower:
            return "Fix IndexError by adding bounds checking before accessing list/array elements. Validate indices are within valid range."
        elif "keyerror" in exception_lower:
            return "Fix KeyError by checking if keys exist in dictionaries before accessing them. Use .get() method or proper key validation."
        elif "valueerror" in exception_lower:
            return "Fix ValueError by validating input values before processing. Add proper input sanitization and validation."
        elif "memoryerror" in exception_lower:
            return "Fix MemoryError by optimizing memory usage, processing data in chunks, or increasing available memory."
        elif "assertionerror" in exception_lower:
            return "Fix AssertionError by reviewing assertion conditions and ensuring they properly validate the expected state."
        else:
            return f"Fix the {exception_type} exception by analyzing the root cause and implementing appropriate error handling and validation."

    def _create_summary(self, findings: List[ModuleFinding]) -> Dict[str, Any]:
        """Create an aggregate summary over the findings list."""
        severity_counts = {"critical": 0, "high": 0, "medium": 0, "low": 0, "info": 0}
        category_counts: Dict[str, int] = {}
        exception_counts: Dict[str, int] = {}
        for finding in findings:
            # .get() tolerates severities outside the pre-seeded set.
            severity_counts[finding.severity] = severity_counts.get(finding.severity, 0) + 1
            category_counts[finding.category] = category_counts.get(finding.category, 0) + 1
            exception_type = finding.metadata.get("exception_type", "unknown")
            exception_counts[exception_type] = exception_counts.get(exception_type, 0) + 1
        return {
            "total_findings": len(findings),
            "severity_counts": severity_counts,
            "category_counts": category_counts,
            "exception_counts": exception_counts,
            "unique_exceptions": len(exception_counts),
            "python_specific_issues": sum(category_counts.get(cat, 0) for cat in ["type_error", "data_error", "python_exception"])
        }