Files
fuzzforge_ai/sdk/src/fuzzforge_sdk/utils.py
T
Tanguy Duhamel 323a434c73 Initial commit
2025-09-29 21:26:41 +02:00

432 lines
12 KiB
Python

"""
Utility functions for the FuzzForge SDK.
Provides helper functions for path validation, SARIF processing,
volume mount creation, and other common operations.
"""
# Copyright (c) 2025 FuzzingLabs
#
# Licensed under the Business Source License 1.1 (BSL). See the LICENSE file
# at the root of this repository for details.
#
# After the Change Date (four years from publication), this version of the
# Licensed Work will be made available under the Apache License, Version 2.0.
# See the LICENSE-APACHE file or http://www.apache.org/licenses/LICENSE-2.0
#
# Additional attribution and requirements are provided in the NOTICE file.
import os
import json
from pathlib import Path
from typing import Dict, Any, List, Optional, Union
from datetime import datetime
from .models import VolumeMount, ResourceLimits, WorkflowSubmission
from .exceptions import ValidationError
def validate_absolute_path(path: Union[str, Path]) -> Path:
    """
    Check that a path is absolute and present on the filesystem.

    Args:
        path: Filesystem path, given as a string or a Path

    Returns:
        The same path converted to a Path object

    Raises:
        ValidationError: If the path is relative or does not exist
    """
    candidate = Path(path)

    # Absoluteness is tested first, so a relative path is always reported
    # as "must be absolute" even when it also happens not to exist.
    if candidate.is_absolute():
        if candidate.exists():
            return candidate
        raise ValidationError(f"Path does not exist: {path}")

    raise ValidationError(f"Path must be absolute: {path}")
def create_volume_mount(
    host_path: Union[str, Path],
    container_path: str,
    mode: str = "ro"
) -> VolumeMount:
    """
    Build a VolumeMount after checking both paths and the mount mode.

    Args:
        host_path: Path on the host to mount (must be absolute and exist)
        container_path: Destination path inside the container (must start with "/")
        mode: Mount mode, either "ro" or "rw"

    Returns:
        VolumeMount object

    Raises:
        ValidationError: If the host path, container path or mode is invalid
    """
    # Checks run in a fixed order: host path, container path, then mode.
    host = validate_absolute_path(host_path)

    container_is_absolute = container_path.startswith('/')
    if not container_is_absolute:
        raise ValidationError(f"Container path must be absolute: {container_path}")

    allowed_modes = ("ro", "rw")
    if mode not in allowed_modes:
        raise ValidationError(f"Mode must be 'ro' or 'rw': {mode}")

    mount_fields = {
        "host_path": str(host),
        "container_path": container_path,
        "mode": mode,  # type: ignore
    }
    return VolumeMount(**mount_fields)
# Memory suffixes accepted by the validator; two-letter binary suffixes come
# first so that "1Ki" is stripped as "Ki" rather than being left untouched.
_MEMORY_SUFFIXES = ('Ki', 'Mi', 'Gi', 'Ti', 'K', 'M', 'G', 'T')


def _is_valid_cpu_quantity(value: str) -> bool:
    """Return True for whole cores ("2") or whole millicores ("500m")."""
    magnitude = value[:-1] if value.endswith('m') else value
    # An empty magnitude (a bare "m") fails isdigit() and is rejected.
    return magnitude.isdigit()


def _is_valid_memory_quantity(value: str) -> bool:
    """Return True for plain byte counts ("1024") or suffixed sizes ("1.5Gi")."""
    for suffix in _MEMORY_SUFFIXES:
        if value.endswith(suffix):
            magnitude = value[:-len(suffix)]
            # Suffixed sizes may carry a single decimal point ("1.5Gi").
            return magnitude.replace('.', '', 1).isdigit()
    # Without a suffix only a plain integer is accepted.
    return value.isdigit()


def create_resource_limits(
    cpu_limit: Optional[str] = None,
    memory_limit: Optional[str] = None,
    cpu_request: Optional[str] = None,
    memory_request: Optional[str] = None
) -> ResourceLimits:
    """
    Create resource limits with validation.

    Each provided value must consist of a numeric magnitude plus an optional
    unit suffix. Values that contain only a suffix or a non-numeric magnitude
    (for example "m", "foom" or "xyzGi") are rejected.

    Args:
        cpu_limit: CPU limit (e.g., "2", "500m")
        memory_limit: Memory limit (e.g., "1Gi", "512Mi")
        cpu_request: CPU request (guaranteed)
        memory_request: Memory request (guaranteed)

    Returns:
        ResourceLimits object

    Raises:
        ValidationError: If resource specifications are invalid
    """
    # (label used in the error message, value, validator); the order matches
    # the order in which problems are reported: CPU first, then memory.
    checks = (
        ("CPU limit", cpu_limit, _is_valid_cpu_quantity),
        ("CPU request", cpu_request, _is_valid_cpu_quantity),
        ("memory limit", memory_limit, _is_valid_memory_quantity),
        ("memory request", memory_request, _is_valid_memory_quantity),
    )
    for label, value, is_valid in checks:
        # None means "not specified" and is passed through unvalidated.
        if value is not None and not is_valid(value):
            raise ValidationError(f"Invalid {label} format: {value}")

    return ResourceLimits(
        cpu_limit=cpu_limit,
        memory_limit=memory_limit,
        cpu_request=cpu_request,
        memory_request=memory_request
    )
def create_workflow_submission(
    target_path: Union[str, Path],
    volume_mode: str = "ro",
    parameters: Optional[Dict[str, Any]] = None,
    timeout: Optional[int] = None,
    resource_limits: Optional[ResourceLimits] = None,
    additional_volumes: Optional[List[VolumeMount]] = None
) -> WorkflowSubmission:
    """
    Build a WorkflowSubmission after validating its inputs.

    Args:
        target_path: Path to analyze (must be absolute and exist)
        volume_mode: Mount mode for the target path, "ro" or "rw"
        parameters: Workflow-specific parameters; an empty dict is used when omitted
        timeout: Execution timeout in seconds (1 to 604800 when given)
        resource_limits: Resource limits for the container
        additional_volumes: Additional volume mounts; an empty list is used when omitted

    Returns:
        WorkflowSubmission object

    Raises:
        ValidationError: If the target path, volume mode or timeout is invalid
    """
    max_timeout_seconds = 604800  # 7 days

    target = validate_absolute_path(target_path)

    if volume_mode not in ("ro", "rw"):
        raise ValidationError(f"Volume mode must be 'ro' or 'rw': {volume_mode}")

    # A missing timeout is allowed; only an explicit value is range-checked.
    if timeout is not None and (timeout < 1 or timeout > max_timeout_seconds):
        raise ValidationError(f"Timeout must be between 1 and 604800 seconds: {timeout}")

    submission_fields = {
        "target_path": str(target),
        "volume_mode": volume_mode,  # type: ignore
        "parameters": parameters or {},
        "timeout": timeout,
        "resource_limits": resource_limits,
        "additional_volumes": additional_volumes or [],
    }
    return WorkflowSubmission(**submission_fields)
def extract_sarif_results(sarif_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Collect the result entries of every run in a SARIF document.

    Runs that are not dictionaries, and runs whose "results" value is not a
    list, are silently ignored.

    Args:
        sarif_data: SARIF formatted data

    Returns:
        Flat list of result objects, in run order

    Raises:
        ValidationError: If the data is not a dictionary or "runs" is not a list
    """
    if not isinstance(sarif_data, dict):
        raise ValidationError("SARIF data must be a dictionary")

    runs = sarif_data.get("runs", [])
    if not isinstance(runs, list):
        raise ValidationError("SARIF runs must be a list")

    # One "results" value per well-formed run; a missing key yields [].
    per_run = [run.get("results", []) for run in runs if isinstance(run, dict)]

    # Flatten, keeping only the values that really are lists.
    return [item for batch in per_run if isinstance(batch, list) for item in batch]
def count_sarif_severity_levels(sarif_data: Dict[str, Any]) -> Dict[str, int]:
    """
    Count findings by severity level in SARIF data.

    A result without a "level" key, or with a level that is not one of the
    counted keys, is counted as "warning". Entries in a run's results list
    that are not dictionaries are skipped instead of raising.

    Args:
        sarif_data: SARIF formatted data

    Returns:
        Dictionary with the keys "error", "warning", "note" and "info"
        mapped to the number of findings at that level

    Raises:
        ValidationError: Propagated from extract_sarif_results when the
            SARIF data is malformed at the top level
    """
    severity_counts: Dict[str, int] = {"error": 0, "warning": 0, "note": 0, "info": 0}

    for result in extract_sarif_results(sarif_data):
        # A results list may contain malformed (non-dict) entries; they carry
        # no level information, so they are not counted as findings.
        if not isinstance(result, dict):
            continue

        level = result.get("level", "warning")
        # The isinstance guard keeps an unhashable level (e.g. a list) from
        # raising TypeError in the membership test; it falls back to warning
        # like any other unknown level.
        if not isinstance(level, str) or level not in severity_counts:
            level = "warning"
        severity_counts[level] += 1

    return severity_counts
def format_sarif_summary(sarif_data: Dict[str, Any]) -> str:
    """
    Render SARIF findings as a one-line, human-readable summary.

    Args:
        sarif_data: SARIF formatted data

    Returns:
        "No findings detected." when nothing was counted; otherwise the total
        followed by each non-zero severity count, separated by " | "
    """
    counts = count_sarif_severity_levels(sarif_data)
    total = sum(counts.values())

    if total == 0:
        return "No findings detected."

    # Only levels with at least one finding are listed, in dict order.
    breakdown = [
        f"{name.title()}: {amount}"
        for name, amount in counts.items()
        if amount > 0
    ]
    return " | ".join([f"Total findings: {total}", *breakdown])
def save_sarif_to_file(sarif_data: Dict[str, Any], file_path: Union[str, Path]) -> None:
    """
    Save SARIF data to a JSON file.

    Missing parent directories are created. The file is written as UTF-8
    with two-space indentation and non-ASCII characters left unescaped.

    Args:
        sarif_data: SARIF formatted data
        file_path: Path to save the file

    Raises:
        ValidationError: If the file cannot be written or the data cannot be
            serialized to JSON
    """
    try:
        path_obj = Path(file_path)
        # Create parent directories if they don't exist
        path_obj.parent.mkdir(parents=True, exist_ok=True)
        with open(path_obj, 'w', encoding='utf-8') as f:
            json.dump(sarif_data, f, indent=2, ensure_ascii=False)
    except (OSError, TypeError, ValueError) as e:
        # The json module defines no JSONEncodeError: serialization failures
        # surface as TypeError (unserializable object) or ValueError (e.g.
        # circular reference). OSError covers filesystem problems.
        raise ValidationError(f"Failed to save SARIF file: {e}") from e
def format_duration(seconds: int) -> str:
    """
    Render a duration given in seconds as a compact string.

    Args:
        seconds: Duration in seconds

    Returns:
        "<s>s" below one minute, "<m>m <s>s" below one hour,
        otherwise "<h>h <m>m <s>s"
    """
    # Under a minute: seconds only.
    if seconds < 60:
        return f"{seconds}s"

    # Under an hour: minutes and seconds.
    if seconds < 3600:
        whole_minutes, leftover_seconds = divmod(seconds, 60)
        return f"{whole_minutes}m {leftover_seconds}s"

    # An hour or more: split off hours, then break down the remainder.
    whole_hours, below_hour = divmod(seconds, 3600)
    whole_minutes, leftover_seconds = divmod(below_hour, 60)
    return f"{whole_hours}h {whole_minutes}m {leftover_seconds}s"
def format_execution_rate(executions_per_sec: float) -> str:
    """
    Render an execution rate for display.

    Args:
        executions_per_sec: Executions per second

    Returns:
        Two decimals below 1 exec/s, one decimal below 1000 exec/s, and a
        "k"-scaled value with one decimal from 1000 exec/s upwards
    """
    rate = executions_per_sec

    if rate < 1:
        shown = f"{rate:.2f}"
    elif rate < 1000:
        shown = f"{rate:.1f}"
    else:
        # Thousands are abbreviated with a "k" suffix.
        shown = f"{rate/1000:.1f}k"

    return f"{shown} exec/s"
def format_memory_size(size_bytes: int) -> str:
    """
    Render a byte count with a 1024-based unit.

    Args:
        size_bytes: Size in bytes

    Returns:
        The size with one decimal and a unit from B to TB; anything that
        still reaches 1024 after TB is expressed in PB
    """
    unit_names = ('B', 'KB', 'MB', 'GB', 'TB')
    scaled = size_bytes
    position = 0

    # Step up one unit at a time until the value drops below 1024.
    while position < len(unit_names):
        if scaled < 1024.0:
            return f"{scaled:.1f} {unit_names[position]}"
        scaled /= 1024.0
        position += 1

    # Every listed unit was exhausted; report the remainder in petabytes.
    return f"{scaled:.1f} PB"
def get_project_files(
    project_path: Union[str, Path],
    extensions: Optional[List[str]] = None,
    exclude_dirs: Optional[List[str]] = None
) -> List[Path]:
    """
    Walk a project directory and return the files found in it.

    Args:
        project_path: Path to project directory (must be absolute and exist)
        extensions: File name endings to keep (e.g., ['.py', '.js']);
            every file is kept when omitted or empty
        exclude_dirs: Directory names to skip; when omitted or empty the
            defaults '.git', '__pycache__', 'node_modules' and
            '.pytest_cache' are used

    Returns:
        Sorted list of file paths

    Raises:
        ValidationError: If project path is invalid or not a directory
    """
    root_dir = validate_absolute_path(project_path)
    if not root_dir.is_dir():
        raise ValidationError(f"Project path must be a directory: {project_path}")

    skipped_dirs = exclude_dirs or ['.git', '__pycache__', 'node_modules', '.pytest_cache']
    wanted_endings = extensions or []

    collected = []
    for current_dir, subdirs, names in os.walk(root_dir):
        # Assigning through the slice prunes the walk in place, so os.walk
        # never descends into the skipped directories.
        subdirs[:] = [name for name in subdirs if name not in skipped_dirs]

        base = Path(current_dir)
        collected.extend(
            base / name
            for name in names
            # With no endings requested, every file passes the filter.
            if not wanted_endings or any(name.endswith(ext) for ext in wanted_endings)
        )

    return sorted(collected)
def estimate_analysis_time(
    project_path: Union[str, Path],
    workflow_type: str = "static"
) -> int:
    """
    Produce a rough analysis-time estimate from project size and workflow type.

    Args:
        project_path: Path to project directory
        workflow_type: Type of workflow ("static", "dynamic", "fuzzing");
            any other value uses a generic estimate

    Returns:
        Estimated time in seconds

    Raises:
        ValidationError: If project path is invalid
    """
    files = get_project_files(project_path)

    # Sum the sizes of the files that still exist when they are inspected.
    total_size = 0
    for entry in files:
        if entry.exists():
            total_size += entry.stat().st_size

    mebibyte = 1024 * 1024

    # Base estimates (very rough)
    if workflow_type == "fuzzing":
        # Fuzzing is not size-driven here: a flat one-hour default.
        base_time = 3600
    elif workflow_type == "static":
        # One second per MiB, never less than 30 seconds.
        base_time = max(30, total_size // mebibyte)
    elif workflow_type == "dynamic":
        # One second per 512 KiB, never less than 60 seconds.
        base_time = max(60, total_size // (512 * 1024))
    else:
        # Unknown workflow type: one second per MiB, at least 60 seconds.
        base_time = max(60, total_size // mebibyte)

    # The estimate is multiplied by one step per full hundred files.
    file_factor = max(1, len(files) // 100)
    return base_time * file_factor