#!/usr/bin/env python3
"""
NeuroSploit Benchmark Runner & Coverage Analyzer

Real execution of 104 CTF-style validation benchmarks:
- Builds and starts each Docker challenge
- Runs the NeuroSploit autonomous agent against the target
- Checks whether the agent extracts the flag
- Generates pass/fail results with detailed reporting

Usage:
    python3 tools/benchmark_runner.py run                            # Run all benchmarks
    python3 tools/benchmark_runner.py run --ids XBEN-050-24          # Run specific benchmark
    python3 tools/benchmark_runner.py run --tags xss sqli --level 1  # Filter by tag/level
    python3 tools/benchmark_runner.py run --dry-run                  # Show what would run
    python3 tools/benchmark_runner.py run --mode auto_pentest        # Thorough mode
    python3 tools/benchmark_runner.py coverage                       # Static coverage analysis
    python3 tools/benchmark_runner.py coverage --json                # Coverage as JSON
"""
import asyncio
import json
import os
import re
import signal
import subprocess
import sys
import time
import traceback
from collections import Counter, defaultdict
from dataclasses import dataclass, field, asdict
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Set, Tuple

import yaml

# Add project root to path
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))


# ===========================================================================
# Docker Compose command detection
# ===========================================================================

def _detect_compose_cmd() -> List[str]:
    """Detect whether to use 'docker compose' (plugin) or 'docker-compose' (standalone)."""
    # Try plugin first: docker compose version
    try:
        r = subprocess.run(["docker", "compose", "version"], capture_output=True, timeout=5)
        if r.returncode == 0:
            return ["docker", "compose"]
    except Exception:
        pass
    # Try standalone: docker-compose version
    try:
        r = subprocess.run(["docker-compose", "version"], capture_output=True, timeout=5)
        if r.returncode == 0:
            return ["docker-compose"]
    except Exception:
        pass
    return []  # neither found


COMPOSE_CMD = _detect_compose_cmd()


# ===========================================================================
# Data Classes
# ===========================================================================

@dataclass
class BenchmarkConfig:
    """Parsed benchmark metadata."""
    id: str
    name: str
    description: str
    level: int
    tags: List[str]
    dir: str
    expected_flag: str
    service_name: str
    container_port: int
    is_fixed_port: bool
    fixed_host_port: Optional[int] = None


@dataclass
class BenchmarkResult:
    """Per-benchmark execution result."""
    benchmark_id: str
    benchmark_name: str
    level: int
    tags: List[str]
    status: str  # pass, fail, error, timeout, skip
    flag_found: bool = False
    flag_value: Optional[str] = None
    expected_flag: str = ""
    flag_source: str = "none"  # finding_evidence, finding_response, etc.
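    # Illustrative note on the flag fields (values drawn from the detection code
    # in this file, example flag shape hypothetical): `flag_source` records which
    # detection layer produced the flag — "finding_evidence", "finding_response",
    # "finding_payload", "finding_poc", "finding_description", "finding_request",
    # "agent_log", "report", "http_crawl", or "none". Benchmark flags look like
    #   flag{xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx}   (lowercase-hex UUID body)
    # or FLAG{<64 hex chars>}, matching FLAG_PATTERN / FLAG_PATTERN_SHA below.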
findings_count: int = 0 findings_summary: List[Dict] = field(default_factory=list) duration_seconds: float = 0.0 docker_build_time: float = 0.0 docker_startup_time: float = 0.0 agent_run_time: float = 0.0 error_message: Optional[str] = None agent_logs: List[str] = field(default_factory=list) # =========================================================================== # Flag Detection # =========================================================================== # Matches flag{uuid-v4} format used by the benchmarks FLAG_PATTERN = re.compile(r'flag\{[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}\}') # Also match FLAG{sha256} format just in case FLAG_PATTERN_SHA = re.compile(r'FLAG\{[a-f0-9]{64}\}') class FlagDetector: """Multi-layer flag search in agent output.""" @staticmethod def search_findings(findings, expected_flag: str) -> Tuple[bool, Optional[str], str]: """Search agent findings for the flag. Returns (found, flag_value, source_field). """ search_fields = [ ("evidence", "finding_evidence"), ("response", "finding_response"), ("payload", "finding_payload"), ("poc_code", "finding_poc"), ("description", "finding_description"), ("request", "finding_request"), ] for finding in findings: for attr, source in search_fields: text = getattr(finding, attr, "") or "" if not text: continue # Exact match first if expected_flag and expected_flag in text: return True, expected_flag, source # Pattern match m = FLAG_PATTERN.search(text) if m: return True, m.group(0), source m = FLAG_PATTERN_SHA.search(text) if m: return True, m.group(0), source return False, None, "none" @staticmethod def search_logs(logs: List[str], expected_flag: str) -> Tuple[bool, Optional[str]]: """Search agent log messages for the flag.""" full_text = "\n".join(logs) if expected_flag and expected_flag in full_text: return True, expected_flag m = FLAG_PATTERN.search(full_text) if m: return True, m.group(0) m = FLAG_PATTERN_SHA.search(full_text) if m: return True, m.group(0) return False, None @staticmethod def search_report(report: Dict, expected_flag: str) -> Tuple[bool, Optional[str]]: """Search the full report dict for the flag.""" try: text = json.dumps(report, default=str) except Exception: text = str(report) if expected_flag and expected_flag in text: return True, expected_flag m = FLAG_PATTERN.search(text) if m: return True, m.group(0) m = FLAG_PATTERN_SHA.search(text) if m: return True, m.group(0) return False, None @staticmethod async def http_crawl_for_flag(target_url: str, expected_flag: str) -> Tuple[bool, Optional[str]]: """Fallback: direct HTTP requests to common flag endpoints.""" import aiohttp endpoints = [ "/", "/flag", "/flag.txt", "/FLAG.txt", "/FLAG", "/api/flag", "/secret", "/secret.txt", "/admin", "/robots.txt", "/sitemap.xml", ] try: connector = aiohttp.TCPConnector(ssl=False) timeout = aiohttp.ClientTimeout(total=10) async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: for ep in endpoints: try: url = target_url.rstrip("/") + ep async with session.get(url) as resp: text = await resp.text() if expected_flag and expected_flag in text: return True, expected_flag m = FLAG_PATTERN.search(text) if m: return True, m.group(0) m = FLAG_PATTERN_SHA.search(text) if m: return True, m.group(0) except Exception: continue except Exception: pass return False, None # =========================================================================== # Docker Orchestrator # =========================================================================== class DockerOrchestrator: """Manages 
Docker Compose lifecycle for a single benchmark.""" def __init__(self, benchmark_dir: str, project_name: str): self.benchmark_dir = benchmark_dir self.project_name = project_name self._started = False def _run(self, args: List[str], timeout: int = 120, capture: bool = True) -> subprocess.CompletedProcess: """Run a docker compose command.""" cmd = COMPOSE_CMD + ["-p", self.project_name] + args return subprocess.run( cmd, cwd=self.benchmark_dir, timeout=timeout, capture_output=capture, text=True, ) def build(self, timeout: int = 300) -> Tuple[bool, str]: """Build the benchmark containers.""" try: result = self._run(["build"], timeout=timeout) if result.returncode != 0: err = (result.stderr or result.stdout or "unknown error")[-500:] return False, f"Build failed (exit {result.returncode}): {err}" return True, "" except subprocess.TimeoutExpired: return False, f"Build timed out after {timeout}s" except Exception as e: return False, str(e) def start(self, timeout: int = 180) -> Tuple[bool, str]: """Start containers and wait for healthchecks.""" try: result = self._run(["up", "-d", "--wait"], timeout=timeout) if result.returncode != 0: err = (result.stderr or result.stdout or "unknown error")[-500:] return False, f"Start failed (exit {result.returncode}): {err}" self._started = True return True, "" except subprocess.TimeoutExpired: self._started = True # might be partially started return False, f"Start timed out after {timeout}s (healthcheck may have failed)" except Exception as e: return False, str(e) def get_target_url(self, service_name: str, container_port: int, is_fixed: bool, fixed_host_port: Optional[int]) -> Optional[str]: """Resolve the actual URL to test.""" if is_fixed and fixed_host_port: return f"http://localhost:{fixed_host_port}" # Dynamic port: use docker compose port try: result = self._run(["port", service_name, str(container_port)], timeout=10) if result.returncode == 0 and result.stdout.strip(): # Output format: "0.0.0.0:55432" or ":::55432" addr = result.stdout.strip() if ":" in addr: port = addr.rsplit(":", 1)[-1] return f"http://localhost:{port}" except Exception: pass # Fallback: try to inspect containers for the port try: result = subprocess.run( COMPOSE_CMD + ["-p", self.project_name, "ps", "--format", "json"], cwd=self.benchmark_dir, capture_output=True, text=True, timeout=10, ) if result.returncode == 0 and result.stdout.strip(): # Parse JSON output - may be one JSON per line for line in result.stdout.strip().split("\n"): try: info = json.loads(line) publishers = info.get("Publishers") or [] for pub in publishers: if pub.get("TargetPort") == container_port and pub.get("PublishedPort"): return f"http://localhost:{pub['PublishedPort']}" except (json.JSONDecodeError, TypeError): continue except Exception: pass return None def cleanup(self, remove_images: bool = True) -> None: """Stop and remove all containers/volumes/images for this benchmark.""" try: # --rmi all: also removes images built by this compose project rmi_flag = ["--rmi", "all"] if remove_images else [] self._run(["down", "-v", "--remove-orphans", "-t", "10"] + rmi_flag, timeout=120) except Exception: pass # Force remove any lingering containers with the project prefix try: subprocess.run( COMPOSE_CMD + ["-p", self.project_name, "rm", "-f", "-s", "-v"], cwd=self.benchmark_dir, capture_output=True, timeout=30, ) except Exception: pass def get_logs(self, service_name: str) -> str: """Get container logs for debugging.""" try: result = self._run(["logs", service_name, "--tail", "100"], timeout=10) return 
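# Illustrative sketch of the `docker compose ps --format json` output that the
# get_target_url() fallback above parses (one JSON object per line; only the
# Publishers/TargetPort/PublishedPort fields are consumed, values hypothetical):
#   {"Publishers": [{"TargetPort": 80, "PublishedPort": 55432, "Protocol": "tcp"}], ...}
# For the first publisher whose TargetPort matches the configured container port,
# the fallback returns "http://localhost:<PublishedPort>".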
result.stdout or "" except Exception: return "" # =========================================================================== # Benchmark Config Loader # =========================================================================== HTTP_PORTS = {80, 443, 3000, 4567, 5000, 5003, 8000, 8001, 8002, 8080, 8081, 8082, 9000} def _parse_port_spec(port_spec) -> Tuple[int, bool, Optional[int]]: """Parse a docker-compose port specification. Returns: (container_port, is_fixed, fixed_host_port) """ if isinstance(port_spec, int): return port_spec, False, None s = str(port_spec).strip().strip('"').strip("'").rstrip() if ":" in s: parts = s.split(":") # Could be "host:container" or "ip:host:container" if len(parts) == 3: host_port = int(parts[1]) container_port = int(parts[2]) else: host_port = int(parts[0]) container_port = int(parts[1]) return container_port, True, host_port else: return int(s), False, None def _find_web_service(compose_data: Dict) -> Tuple[str, int, bool, Optional[int]]: """Find the web service from docker-compose data. Returns: (service_name, container_port, is_fixed, fixed_host_port) """ services = compose_data.get("services", {}) candidates = [] for svc_name, svc_config in services.items(): ports = svc_config.get("ports", []) if not ports: continue # Parse all ports for this service for port_spec in ports: container_port, is_fixed, fixed_host_port = _parse_port_spec(port_spec) has_http_healthcheck = False healthcheck = svc_config.get("healthcheck", {}) if healthcheck: test = healthcheck.get("test", []) test_str = str(test) if isinstance(test, list) else str(test) has_http_healthcheck = "curl" in test_str or "http" in test_str candidates.append({ "service_name": svc_name, "container_port": container_port, "is_fixed": is_fixed, "fixed_host_port": fixed_host_port, "has_http_healthcheck": has_http_healthcheck, "is_http_port": container_port in HTTP_PORTS, }) if not candidates: # Fallback: pick first service first_svc = next(iter(services.keys()), "web") return first_svc, 80, False, None # Priority: HTTP healthcheck > HTTP port > first candidate candidates.sort(key=lambda c: ( not c["has_http_healthcheck"], not c["is_http_port"], )) best = candidates[0] return best["service_name"], best["container_port"], best["is_fixed"], best["fixed_host_port"] def _read_expected_flag(benchmark_dir: str) -> str: """Read FLAG from .env file.""" env_path = os.path.join(benchmark_dir, ".env") if not os.path.exists(env_path): return "" try: with open(env_path) as f: for line in f: line = line.strip() if line.startswith("FLAG="): value = line[5:].strip().strip('"').strip("'") return value except Exception: pass return "" def load_benchmark_configs(benchmarks_dir: str, tag_filter: Optional[List[str]] = None, level_filter: Optional[List[int]] = None, id_filter: Optional[List[str]] = None) -> List[BenchmarkConfig]: """Load and parse all benchmark configurations with optional filtering.""" configs = [] bench_path = Path(benchmarks_dir) for d in sorted(bench_path.iterdir()): if not d.is_dir() or not d.name.startswith("XBEN-"): continue config_file = d / "benchmark.json" compose_file = d / "docker-compose.yml" if not config_file.exists() or not compose_file.exists(): continue # Apply ID filter early if id_filter and d.name not in id_filter: continue try: # Load benchmark.json with open(config_file) as f: meta = json.load(f) name = meta.get("name", d.name) description = meta.get("description", "") level = int(meta.get("level", 1)) tags = meta.get("tags", []) # Apply filters if level_filter and level not in 
level_filter: continue if tag_filter and not any(t in tags for t in tag_filter): continue # Read expected flag expected_flag = _read_expected_flag(str(d)) # Parse docker-compose.yml with open(compose_file) as f: compose_data = yaml.safe_load(f) service_name, container_port, is_fixed, fixed_host_port = _find_web_service(compose_data) configs.append(BenchmarkConfig( id=d.name, name=name, description=description, level=level, tags=tags, dir=str(d), expected_flag=expected_flag, service_name=service_name, container_port=container_port, is_fixed_port=is_fixed, fixed_host_port=fixed_host_port, )) except Exception as e: print(f" [WARN] Failed to load {d.name}: {e}") continue return configs # =========================================================================== # Report Generator # =========================================================================== class ReportGenerator: """Generates JSON and Markdown benchmark reports.""" @staticmethod def generate_json(results: List[BenchmarkResult], output_path: str) -> None: """Write full results as JSON.""" data = { "generated_at": datetime.utcnow().isoformat(), "total_benchmarks": len(results), "summary": ReportGenerator._compute_summary(results), "results": [asdict(r) for r in results], } # Don't include full agent_logs in main JSON (too large) - just count for entry in data["results"]: log_count = len(entry.get("agent_logs", [])) entry["agent_log_count"] = log_count entry.pop("agent_logs", None) os.makedirs(os.path.dirname(output_path), exist_ok=True) with open(output_path, "w") as f: json.dump(data, f, indent=2, default=str) @staticmethod def _compute_summary(results: List[BenchmarkResult]) -> Dict: """Compute summary statistics.""" total = len(results) passed = sum(1 for r in results if r.status == "pass") failed = sum(1 for r in results if r.status == "fail") errors = sum(1 for r in results if r.status == "error") timeouts = sum(1 for r in results if r.status == "timeout") skipped = sum(1 for r in results if r.status == "skip") # Level breakdown level_stats = defaultdict(lambda: {"total": 0, "passed": 0}) for r in results: level_stats[r.level]["total"] += 1 if r.status == "pass": level_stats[r.level]["passed"] += 1 # Tag breakdown tag_stats = defaultdict(lambda: {"total": 0, "passed": 0}) for r in results: for tag in r.tags: tag_stats[tag]["total"] += 1 if r.status == "pass": tag_stats[tag]["passed"] += 1 # Flag source distribution source_counts = Counter(r.flag_source for r in results if r.flag_found) # Timing run_results = [r for r in results if r.status in ("pass", "fail")] avg_duration = (sum(r.duration_seconds for r in run_results) / len(run_results)) if run_results else 0 total_duration = sum(r.duration_seconds for r in results) return { "pass_rate": f"{passed}/{total} ({passed/total*100:.1f}%)" if total else "0/0", "passed": passed, "failed": failed, "errors": errors, "timeouts": timeouts, "skipped": skipped, "level_breakdown": dict(level_stats), "tag_breakdown": dict(tag_stats), "flag_source_distribution": dict(source_counts), "avg_duration_seconds": round(avg_duration, 1), "total_duration_seconds": round(total_duration, 1), "total_findings": sum(r.findings_count for r in results), } @staticmethod def generate_markdown(results: List[BenchmarkResult], output_path: str) -> None: """Write formatted Markdown summary.""" summary = ReportGenerator._compute_summary(results) total = len(results) passed = summary["passed"] lines = [ "# NeuroSploit Benchmark Results", "", f"**Date**: {datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')}", 
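# Sketch of the summary dict produced by _compute_summary() and consumed by the
# Markdown/console report writers (keys from the code above; values hypothetical):
#   {"pass_rate": "42/104 (40.4%)", "passed": 42, "failed": 55, "errors": 4,
#    "timeouts": 2, "skipped": 1,
#    "level_breakdown": {1: {"total": 50, "passed": 30}, ...},
#    "tag_breakdown": {"xss": {"total": 20, "passed": 9}, ...},
#    "flag_source_distribution": {"finding_evidence": 25, "http_crawl": 10, ...},
#    "avg_duration_seconds": 181.4, "total_duration_seconds": 18864.0,
#    "total_findings": 390}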
f"**Total Benchmarks**: {total}", f"**Total Duration**: {summary['total_duration_seconds']:.0f}s ({summary['total_duration_seconds']/60:.1f}min)", "", "## Overall Results", "", f"| Metric | Value |", f"|--------|-------|", f"| **Pass Rate** | **{passed}/{total} ({passed/total*100:.1f}%)** |" if total else "", f"| Passed | {summary['passed']} |", f"| Failed | {summary['failed']} |", f"| Errors | {summary['errors']} |", f"| Timeouts | {summary['timeouts']} |", f"| Skipped | {summary['skipped']} |", f"| Total Findings | {summary['total_findings']} |", f"| Avg Duration | {summary['avg_duration_seconds']:.1f}s |", "", "## Level Breakdown", "", "| Level | Passed | Total | Rate |", "|-------|--------|-------|------|", ] for level in sorted(summary["level_breakdown"].keys()): stats = summary["level_breakdown"][level] label = {1: "Easy", 2: "Medium", 3: "Hard"}.get(level, str(level)) rate = f"{stats['passed']/stats['total']*100:.1f}%" if stats["total"] else "N/A" lines.append(f"| {label} (L{level}) | {stats['passed']} | {stats['total']} | {rate} |") lines += [ "", "## Tag Breakdown", "", "| Tag | Passed | Total | Rate |", "|-----|--------|-------|------|", ] sorted_tags = sorted(summary["tag_breakdown"].items(), key=lambda x: -x[1]["total"]) for tag, stats in sorted_tags: rate = f"{stats['passed']/stats['total']*100:.1f}%" if stats["total"] else "N/A" lines.append(f"| {tag} | {stats['passed']} | {stats['total']} | {rate} |") if summary["flag_source_distribution"]: lines += [ "", "## Flag Source Distribution", "", "| Source | Count |", "|--------|-------|", ] for source, count in sorted(summary["flag_source_distribution"].items(), key=lambda x: -x[1]): lines.append(f"| {source} | {count} |") lines += [ "", "## Per-Benchmark Results", "", "| # | ID | Name | Level | Tags | Status | Flag | Findings | Duration |", "|---|-----|------|-------|------|--------|------|----------|----------|", ] for i, r in enumerate(results, 1): status_icon = { "pass": "PASS", "fail": "FAIL", "error": "ERR", "timeout": "T/O", "skip": "SKIP" }.get(r.status, r.status) flag_icon = "YES" if r.flag_found else "NO" tags_str = ", ".join(r.tags) name_short = r.benchmark_name[:40] lines.append( f"| {i} | {r.benchmark_id} | {name_short} | L{r.level} | {tags_str} | " f"{status_icon} | {flag_icon} | {r.findings_count} | {r.duration_seconds:.1f}s |" ) # Error summary error_results = [r for r in results if r.error_message] if error_results: lines += [ "", "## Errors", "", ] for r in error_results: lines.append(f"- **{r.benchmark_id}** ({r.status}): {r.error_message}") lines.append("") os.makedirs(os.path.dirname(output_path), exist_ok=True) with open(output_path, "w") as f: f.write("\n".join(lines)) @staticmethod def print_summary(results: List[BenchmarkResult]) -> None: """Print console summary.""" summary = ReportGenerator._compute_summary(results) total = len(results) passed = summary["passed"] print() print("=" * 70) print(" NEUROSPLOIT BENCHMARK RESULTS") print("=" * 70) print() print(f" Pass Rate: {passed}/{total} ({passed/total*100:.1f}%)" if total else " No results") print(f" Passed: {summary['passed']} | Failed: {summary['failed']} | " f"Errors: {summary['errors']} | Timeouts: {summary['timeouts']}") print(f" Total Findings: {summary['total_findings']}") print(f" Total Duration: {summary['total_duration_seconds']:.0f}s " f"({summary['total_duration_seconds']/60:.1f}min)") print() print(" --- LEVEL BREAKDOWN ---") for level in sorted(summary["level_breakdown"].keys()): stats = summary["level_breakdown"][level] label = {1: "Easy", 
2: "Medium", 3: "Hard"}.get(level, str(level)) rate = f"{stats['passed']/stats['total']*100:.1f}%" if stats["total"] else "N/A" print(f" Level {level} ({label}): {stats['passed']}/{stats['total']} ({rate})") print() print(" --- PER-BENCHMARK ---") for r in results: icon = {"pass": "+", "fail": "-", "error": "!", "timeout": "T", "skip": "S"}.get(r.status, "?") flag = "FLAG" if r.flag_found else " " print(f" [{icon}] {r.benchmark_id} L{r.level} {flag} " f"findings={r.findings_count} {r.duration_seconds:.1f}s " f"tags={','.join(r.tags)}") print() print("=" * 70) # =========================================================================== # Benchmark Runner (Main Orchestrator) # =========================================================================== class BenchmarkRunner: """Orchestrates real benchmark execution.""" def __init__( self, benchmarks_dir: str = "/tmp/xbow-benchmarks/benchmarks", results_dir: str = "reports/benchmark_results", agent_mode: str = "full_auto", per_benchmark_timeout: int = 300, build_timeout: int = 300, startup_timeout: int = 180, tag_filter: Optional[List[str]] = None, level_filter: Optional[List[int]] = None, id_filter: Optional[List[str]] = None, resume: bool = True, dry_run: bool = False, keep_images: bool = False, ): self.benchmarks_dir = benchmarks_dir self.results_dir = os.path.join(PROJECT_ROOT, results_dir) self.logs_dir = os.path.join(self.results_dir, "logs") self.agent_mode = agent_mode self.per_benchmark_timeout = per_benchmark_timeout self.build_timeout = build_timeout self.startup_timeout = startup_timeout self.tag_filter = tag_filter self.level_filter = level_filter self.id_filter = id_filter self.resume = resume self.dry_run = dry_run self.keep_images = keep_images self._interrupted = False def _check_docker(self) -> bool: """Verify Docker Compose is available.""" return len(COMPOSE_CMD) > 0 def _progress_path(self) -> str: return os.path.join(self.results_dir, "progress.json") def _load_progress(self) -> Dict[str, Dict]: """Load previous results for resume.""" path = self._progress_path() if not os.path.exists(path): return {} try: with open(path) as f: data = json.load(f) return data.get("completed", {}) except Exception: return {} def _save_progress(self, completed: Dict[str, Dict]) -> None: """Save progress checkpoint.""" os.makedirs(self.results_dir, exist_ok=True) data = { "run_id": datetime.utcnow().isoformat(), "agent_mode": self.agent_mode, "completed": completed, } with open(self._progress_path(), "w") as f: json.dump(data, f, indent=2, default=str) def _save_benchmark_logs(self, benchmark_id: str, logs: List[str]) -> None: """Save per-benchmark agent logs.""" os.makedirs(self.logs_dir, exist_ok=True) log_path = os.path.join(self.logs_dir, f"{benchmark_id}.log") with open(log_path, "w") as f: f.write("\n".join(logs)) async def run_all(self) -> List[BenchmarkResult]: """Sequential execution of all benchmarks.""" # Check Docker if not self._check_docker(): print("ERROR: Docker Compose is not available.") print(" Install Docker Desktop or Docker Engine with compose plugin.") sys.exit(1) # Load configs print(f"\nLoading benchmarks from {self.benchmarks_dir}...") configs = load_benchmark_configs( self.benchmarks_dir, tag_filter=self.tag_filter, level_filter=self.level_filter, id_filter=self.id_filter, ) print(f"Found {len(configs)} benchmarks" + (f" (filtered)" if self.tag_filter or self.level_filter or self.id_filter else "")) if not configs: print("No benchmarks match the filters.") return [] # Dry run mode if self.dry_run: 
print(f"\n{'='*70}") print(f" DRY RUN - {len(configs)} benchmarks would be executed") print(f"{'='*70}\n") for i, cfg in enumerate(configs, 1): print(f" {i:3d}. {cfg.id} L{cfg.level} svc={cfg.service_name}:{cfg.container_port} " f"{'fixed' if cfg.is_fixed_port else 'dynamic'} " f"flag={'YES' if cfg.expected_flag else 'NO'} " f"tags={','.join(cfg.tags)}") print(f"\n Agent mode: {self.agent_mode}") print(f" Per-benchmark timeout: {self.per_benchmark_timeout}s") print(f" Build timeout: {self.build_timeout}s") return [] # Resume support completed = {} if self.resume: completed = self._load_progress() if completed: print(f"Resuming: {len(completed)} benchmarks already completed") # Setup signal handler original_sigint = signal.getsignal(signal.SIGINT) def handle_interrupt(signum, frame): self._interrupted = True print("\n\n [INTERRUPTED] Finishing current benchmark, then saving progress...") signal.signal(signal.SIGINT, handle_interrupt) # Ensure output dirs exist os.makedirs(self.results_dir, exist_ok=True) os.makedirs(self.logs_dir, exist_ok=True) results: List[BenchmarkResult] = [] # Include previously completed results for cfg in configs: if cfg.id in completed: prev = completed[cfg.id] results.append(BenchmarkResult( benchmark_id=prev["benchmark_id"], benchmark_name=prev["benchmark_name"], level=prev["level"], tags=prev["tags"], status=prev["status"], flag_found=prev.get("flag_found", False), flag_value=prev.get("flag_value"), expected_flag=prev.get("expected_flag", ""), flag_source=prev.get("flag_source", "none"), findings_count=prev.get("findings_count", 0), findings_summary=prev.get("findings_summary", []), duration_seconds=prev.get("duration_seconds", 0), error_message=prev.get("error_message"), )) remaining = [cfg for cfg in configs if cfg.id not in completed] total_remaining = len(remaining) run_idx = 0 print(f"\n{'='*70}") print(f" NEUROSPLOIT BENCHMARK RUNNER") print(f" Mode: {self.agent_mode} | Timeout: {self.per_benchmark_timeout}s/benchmark") print(f" Running {total_remaining}/{len(configs)} benchmarks") if completed: print(f" Skipping {len(completed)} already completed (resume)") print(f"{'='*70}\n") for cfg in remaining: if self._interrupted: # Mark remaining as skipped results.append(BenchmarkResult( benchmark_id=cfg.id, benchmark_name=cfg.name, level=cfg.level, tags=cfg.tags, status="skip", expected_flag=cfg.expected_flag, error_message="Interrupted by user", )) continue run_idx += 1 print(f"\n[{run_idx}/{total_remaining}] {cfg.id} - {cfg.name[:50]}") print(f" Level: {cfg.level} | Tags: {', '.join(cfg.tags)} | " f"Service: {cfg.service_name}:{cfg.container_port}") result = await self._run_single_benchmark(cfg) results.append(result) # Save logs if result.agent_logs: self._save_benchmark_logs(cfg.id, result.agent_logs) # Update progress completed[cfg.id] = asdict(result) # Don't save massive logs in progress file if cfg.id in completed: completed[cfg.id].pop("agent_logs", None) self._save_progress(completed) # Print result icon = {"pass": "PASS", "fail": "FAIL", "error": "ERR", "timeout": "T/O", "skip": "SKIP"}.get(result.status, "???") flag_str = f"flag={result.flag_source}" if result.flag_found else "no flag" print(f" Result: [{icon}] {flag_str} | " f"{result.findings_count} findings | {result.duration_seconds:.1f}s") if result.error_message: print(f" Error: {result.error_message[:100]}") # Restore signal handler signal.signal(signal.SIGINT, original_sigint) # Generate reports timestamp = datetime.utcnow().strftime("%Y-%m-%d_%H%M%S") json_path = 
os.path.join(self.results_dir, f"results_{timestamp}.json") md_path = os.path.join(self.results_dir, f"results_{timestamp}.md") ReportGenerator.generate_json(results, json_path) ReportGenerator.generate_markdown(results, md_path) ReportGenerator.print_summary(results) print(f"\n Reports saved:") print(f" JSON: {json_path}") print(f" Markdown: {md_path}") print(f" Logs: {self.logs_dir}/") # Final Docker cleanup: prune dangling images, volumes, build cache if not self.keep_images: self._docker_prune() return results async def _run_single_benchmark(self, config: BenchmarkConfig) -> BenchmarkResult: """Execute a single benchmark: build → start → agent → check → cleanup.""" orchestrator = DockerOrchestrator( config.dir, f"nsb-{config.id.lower()}" ) agent_logs: List[str] = [] start_time = time.time() try: # Phase 1: Build print(f" [1/5] Building...") build_start = time.time() success, err = orchestrator.build(timeout=self.build_timeout) build_time = time.time() - build_start if not success: return BenchmarkResult( benchmark_id=config.id, benchmark_name=config.name, level=config.level, tags=config.tags, status="error", expected_flag=config.expected_flag, duration_seconds=time.time() - start_time, docker_build_time=build_time, error_message=err, agent_logs=agent_logs, ) print(f" [2/5] Starting (waiting for healthcheck)...") startup_start = time.time() success, err = orchestrator.start(timeout=self.startup_timeout) startup_time = time.time() - startup_start if not success: return BenchmarkResult( benchmark_id=config.id, benchmark_name=config.name, level=config.level, tags=config.tags, status="error", expected_flag=config.expected_flag, duration_seconds=time.time() - start_time, docker_build_time=build_time, docker_startup_time=startup_time, error_message=err, agent_logs=agent_logs, ) # Phase 3: Resolve URL print(f" [3/5] Resolving target URL...") target_url = orchestrator.get_target_url( config.service_name, config.container_port, config.is_fixed_port, config.fixed_host_port, ) if not target_url: return BenchmarkResult( benchmark_id=config.id, benchmark_name=config.name, level=config.level, tags=config.tags, status="error", expected_flag=config.expected_flag, duration_seconds=time.time() - start_time, docker_build_time=build_time, docker_startup_time=startup_time, error_message=f"Could not resolve target URL for {config.service_name}:{config.container_port}", agent_logs=agent_logs, ) print(f" Target: {target_url}") # Phase 4: Run Agent print(f" [4/5] Running agent ({self.agent_mode})...") agent_start = time.time() try: report, findings = await asyncio.wait_for( self._run_agent(target_url, agent_logs), timeout=self.per_benchmark_timeout, ) except asyncio.TimeoutError: agent_time = time.time() - agent_start print(f" Agent timed out after {agent_time:.0f}s") # Check partial results in logs found, flag_val = FlagDetector.search_logs(agent_logs, config.expected_flag) return BenchmarkResult( benchmark_id=config.id, benchmark_name=config.name, level=config.level, tags=config.tags, status="pass" if found else "timeout", flag_found=found, flag_value=flag_val, expected_flag=config.expected_flag, flag_source="agent_log" if found else "none", duration_seconds=time.time() - start_time, docker_build_time=build_time, docker_startup_time=startup_time, agent_run_time=agent_time, error_message=f"Agent timed out after {self.per_benchmark_timeout}s", agent_logs=agent_logs, ) except Exception as e: agent_time = time.time() - agent_start return BenchmarkResult( benchmark_id=config.id, benchmark_name=config.name, 
level=config.level, tags=config.tags, status="error", expected_flag=config.expected_flag, duration_seconds=time.time() - start_time, docker_build_time=build_time, docker_startup_time=startup_time, agent_run_time=agent_time, error_message=f"Agent error: {str(e)[:200]}", agent_logs=agent_logs, ) agent_time = time.time() - agent_start # Phase 5: Flag Detection print(f" [5/5] Checking for flag...") found = False flag_val = None flag_source = "none" # Layer 1: Findings found, flag_val, flag_source = FlagDetector.search_findings( findings, config.expected_flag ) # Layer 2: Agent logs if not found: found, flag_val = FlagDetector.search_logs(agent_logs, config.expected_flag) if found: flag_source = "agent_log" # Layer 3: Full report if not found and report: found, flag_val = FlagDetector.search_report(report, config.expected_flag) if found: flag_source = "report" # Layer 4: HTTP crawl fallback if not found: found, flag_val = await FlagDetector.http_crawl_for_flag( target_url, config.expected_flag ) if found: flag_source = "http_crawl" # Build findings summary findings_summary = [] for f in findings: findings_summary.append({ "title": f.title, "severity": f.severity, "vulnerability_type": f.vulnerability_type, "endpoint": f.affected_endpoint, }) return BenchmarkResult( benchmark_id=config.id, benchmark_name=config.name, level=config.level, tags=config.tags, status="pass" if found else "fail", flag_found=found, flag_value=flag_val, expected_flag=config.expected_flag, flag_source=flag_source, findings_count=len(findings), findings_summary=findings_summary, duration_seconds=time.time() - start_time, docker_build_time=build_time, docker_startup_time=startup_time, agent_run_time=agent_time, agent_logs=agent_logs, ) except Exception as e: return BenchmarkResult( benchmark_id=config.id, benchmark_name=config.name, level=config.level, tags=config.tags, status="error", expected_flag=config.expected_flag, duration_seconds=time.time() - start_time, error_message=f"Unexpected error: {str(e)[:200]}", agent_logs=agent_logs, ) finally: print(f" Cleaning up{' (removing images)' if not self.keep_images else ''}...") orchestrator.cleanup(remove_images=not self.keep_images) @staticmethod def _docker_prune() -> None: """Remove all dangling images, stopped containers, unused networks, and build cache.""" print("\n Running Docker cleanup (pruning unused data)...") freed_total = 0 for cmd_label, cmd in [ ("containers", ["docker", "container", "prune", "-f"]), ("images", ["docker", "image", "prune", "-f"]), ("volumes", ["docker", "volume", "prune", "-f"]), ("networks", ["docker", "network", "prune", "-f"]), ("build cache", ["docker", "builder", "prune", "-f", "--keep-storage", "1g"]), ]: try: result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) if result.returncode == 0: # Parse reclaimed space from output (e.g., "Total reclaimed space: 2.3GB") for line in (result.stdout or "").splitlines(): if "reclaimed" in line.lower(): print(f" {cmd_label}: {line.strip()}") except Exception: pass print(" Docker cleanup complete.") async def _run_agent(self, target_url: str, agent_logs: List[str]) -> Tuple[Dict, list]: """Run the NeuroSploit autonomous agent against a target.""" from backend.core.autonomous_agent import AutonomousAgent, OperationMode mode_map = { "full_auto": OperationMode.FULL_AUTO, "auto_pentest": OperationMode.AUTO_PENTEST, "recon_only": OperationMode.RECON_ONLY, } mode = mode_map.get(self.agent_mode, OperationMode.FULL_AUTO) async def log_callback(level: str, message: str): timestamp = 
datetime.utcnow().strftime("%H:%M:%S") entry = f"[{timestamp}] [{level.upper()}] {message}" agent_logs.append(entry) agent = AutonomousAgent( target=target_url, mode=mode, log_callback=log_callback, ) async with agent: report = await agent.run() return report, list(agent.findings) # =========================================================================== # Coverage Analysis (preserved from original) # =========================================================================== TAG_TO_NEUROSPLOIT = { "xss": ["xss_reflected", "xss_stored", "xss_dom", "blind_xss", "mutation_xss"], "idor": ["idor", "bola"], "sqli": ["sqli_error", "sqli_union", "sqli_blind", "sqli_time"], "blind_sqli": ["sqli_blind", "sqli_time"], "ssti": ["ssti"], "command_injection": ["command_injection"], "ssrf": ["ssrf", "ssrf_cloud"], "lfi": ["lfi"], "path_traversal": ["path_traversal"], "xxe": ["xxe"], "insecure_deserialization": ["insecure_deserialization"], "csrf": ["csrf"], "jwt": ["jwt_manipulation"], "default_credentials": ["default_credentials"], "brute_force": ["brute_force"], "privilege_escalation": ["privilege_escalation"], "business_logic": ["business_logic"], "information_disclosure": ["information_disclosure", "sensitive_data_exposure"], "arbitrary_file_upload": ["file_upload"], "race_condition": ["race_condition"], "nosqli": ["nosql_injection"], "graphql": ["graphql_injection", "graphql_introspection"], "smuggling_desync": ["http_smuggling"], "http_method_tamper": ["http_methods"], "crypto": ["weak_encryption", "weak_hashing"], "cve": [], "ssh": [], } _HARDCODED_TYPES = { "sqli_error", "sqli_union", "sqli_blind", "sqli_time", "command_injection", "ssti", "nosql_injection", "ldap_injection", "xpath_injection", "graphql_injection", "crlf_injection", "header_injection", "email_injection", "expression_language_injection", "log_injection", "html_injection", "csv_injection", "orm_injection", "xss_reflected", "xss_stored", "xss_dom", "blind_xss", "mutation_xss", "lfi", "rfi", "path_traversal", "xxe", "file_upload", "arbitrary_file_read", "arbitrary_file_delete", "zip_slip", "ssrf", "ssrf_cloud", "csrf", "cors_misconfig", "auth_bypass", "jwt_manipulation", "session_fixation", "weak_password", "default_credentials", "brute_force", "two_factor_bypass", "oauth_misconfiguration", "idor", "bola", "bfla", "privilege_escalation", "mass_assignment", "forced_browsing", "clickjacking", "open_redirect", "dom_clobbering", "postmessage_vulnerability", "websocket_hijacking", "prototype_pollution", "css_injection", "tabnabbing", "security_headers", "ssl_issues", "http_methods", "directory_listing", "debug_mode", "exposed_admin_panel", "exposed_api_docs", "insecure_cookie_flags", "http_smuggling", "cache_poisoning", "race_condition", "business_logic", "rate_limit_bypass", "parameter_pollution", "type_juggling", "insecure_deserialization", "subdomain_takeover", "host_header_injection", "timing_attack", "improper_error_handling", "sensitive_data_exposure", "information_disclosure", "api_key_exposure", "source_code_disclosure", "backup_file_exposure", "version_disclosure", "weak_encryption", "weak_hashing", "weak_random", "cleartext_transmission", "vulnerable_dependency", "outdated_component", "insecure_cdn", "container_escape", "s3_bucket_misconfiguration", "cloud_metadata_exposure", "serverless_misconfiguration", "graphql_introspection", "graphql_dos", "rest_api_versioning", "soap_injection", "api_rate_limiting", "excessive_data_exposure", } CAPABILITY_SCORES = { "sqli_error": 3, "sqli_union": 3, "sqli_blind": 3, "sqli_time": 3, 
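    # Illustrative walk-through of the mapping used by the coverage analysis
    # (derived from TAG_TO_NEUROSPLOIT above and the CAPABILITY_SCORES table here):
    #   benchmark tag "xss" -> TAG_TO_NEUROSPLOIT["xss"] = ["xss_reflected", "xss_stored", ...]
    #   best CAPABILITY_SCORES value among those types = 3 (full tester + payloads + AI),
    #   so an xss-tagged benchmark contributes 3 out of a possible 3 to the
    #   capability-weighted accuracy computed in analyze_coverage() below.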
"command_injection": 3, "ssti": 3, "nosql_injection": 3, "ldap_injection": 3, "xpath_injection": 3, "graphql_injection": 3, "crlf_injection": 3, "header_injection": 3, "email_injection": 2, "expression_language_injection": 3, "log_injection": 2, "html_injection": 3, "csv_injection": 2, "orm_injection": 2, "xss_reflected": 3, "xss_stored": 3, "xss_dom": 2, "blind_xss": 2, "mutation_xss": 2, "lfi": 3, "rfi": 3, "path_traversal": 3, "xxe": 3, "file_upload": 3, "arbitrary_file_read": 2, "arbitrary_file_delete": 2, "zip_slip": 2, "ssrf": 3, "ssrf_cloud": 3, "csrf": 2, "cors_misconfig": 2, "auth_bypass": 2, "jwt_manipulation": 3, "session_fixation": 2, "weak_password": 2, "default_credentials": 2, "brute_force": 2, "two_factor_bypass": 1, "oauth_misconfiguration": 1, "idor": 3, "bola": 2, "bfla": 2, "privilege_escalation": 2, "mass_assignment": 2, "forced_browsing": 2, "clickjacking": 2, "open_redirect": 3, "dom_clobbering": 1, "postmessage_vulnerability": 1, "websocket_hijacking": 1, "prototype_pollution": 2, "css_injection": 1, "tabnabbing": 1, "security_headers": 2, "ssl_issues": 2, "http_methods": 2, "directory_listing": 2, "debug_mode": 2, "exposed_admin_panel": 2, "exposed_api_docs": 2, "insecure_cookie_flags": 2, "http_smuggling": 2, "cache_poisoning": 2, "race_condition": 2, "business_logic": 1, "rate_limit_bypass": 2, "parameter_pollution": 2, "type_juggling": 2, "insecure_deserialization": 2, "subdomain_takeover": 2, "host_header_injection": 2, "timing_attack": 1, "improper_error_handling": 1, "sensitive_data_exposure": 2, "information_disclosure": 2, "api_key_exposure": 2, "source_code_disclosure": 2, "backup_file_exposure": 2, "version_disclosure": 2, "weak_encryption": 1, "weak_hashing": 1, "weak_random": 1, "cleartext_transmission": 1, "vulnerable_dependency": 1, "outdated_component": 1, "insecure_cdn": 1, "container_escape": 1, "s3_bucket_misconfiguration": 2, "cloud_metadata_exposure": 2, "serverless_misconfiguration": 1, "graphql_introspection": 2, "graphql_dos": 1, "rest_api_versioning": 1, "soap_injection": 2, "api_rate_limiting": 1, "excessive_data_exposure": 1, } def load_neurosploit_types() -> Tuple[Set[str], Dict]: """Load NeuroSploit's 100 vulnerability types from registry.""" try: from backend.core.vuln_engine.registry import VulnerabilityRegistry reg = VulnerabilityRegistry() types = set(reg.VULNERABILITY_INFO.keys()) return types, reg.VULNERABILITY_INFO except ImportError: return _HARDCODED_TYPES, {} def load_benchmarks(benchmarks_dir: str) -> List[Dict]: """Load all benchmark configurations (for coverage analysis).""" benchmarks = [] bench_path = Path(benchmarks_dir) for d in sorted(bench_path.iterdir()): if not d.is_dir() or not d.name.startswith("XBEN-"): continue config_file = d / "benchmark.json" if not config_file.exists(): continue try: with open(config_file) as f: config = json.load(f) config["id"] = d.name config["dir"] = str(d) benchmarks.append(config) except (json.JSONDecodeError, KeyError): continue return benchmarks def analyze_coverage(benchmarks: List[Dict], ns_types: Set[str]) -> Dict: """Analyze NeuroSploit coverage of benchmarks.""" tag_counter = Counter() for bench in benchmarks: tags = bench.get("tags", []) for tag in tags: tag_counter[tag] += 1 covered_tags = set() uncovered_tags = set() tag_mapping = {} for tag in tag_counter: ns_mapped = TAG_TO_NEUROSPLOIT.get(tag, []) if ns_mapped: matched = [t for t in ns_mapped if t in ns_types] if matched: covered_tags.add(tag) tag_mapping[tag] = matched else: uncovered_tags.add(tag) tag_mapping[tag] = [] 
else: uncovered_tags.add(tag) tag_mapping[tag] = [] fully_covered = 0 partially_covered = 0 not_covered = 0 benchmark_results = [] for bench in benchmarks: tags = bench.get("tags", []) mapped_tags = [t for t in tags if t in covered_tags] coverage_pct = (len(mapped_tags) / len(tags) * 100) if tags else 0 best_capability = 0 for tag in tags: for ns_type in tag_mapping.get(tag, []): cap = CAPABILITY_SCORES.get(ns_type, 0) if cap > best_capability: best_capability = cap status = "fully_covered" if len(mapped_tags) == len(tags) else ( "partially_covered" if mapped_tags else "not_covered" ) if status == "fully_covered": fully_covered += 1 elif status == "partially_covered": partially_covered += 1 else: not_covered += 1 benchmark_results.append({ "id": bench["id"], "name": bench.get("name", ""), "level": bench.get("level", ""), "tags": tags, "mapped_ns_types": [t for tag in tags for t in tag_mapping.get(tag, [])], "coverage_pct": coverage_pct, "capability_score": best_capability, "status": status, }) total_tags = len(tag_counter) covered_count = len(covered_tags) tag_coverage_pct = (covered_count / total_tags * 100) if total_tags else 0 total_benchmarks = len(benchmarks) benchmark_coverage_pct = (fully_covered / total_benchmarks * 100) if total_benchmarks else 0 benchmark_any_coverage_pct = ((fully_covered + partially_covered) / total_benchmarks * 100) if total_benchmarks else 0 level_stats = defaultdict(lambda: {"total": 0, "covered": 0}) for br in benchmark_results: level = str(br["level"]) level_stats[level]["total"] += 1 if br["status"] in ("fully_covered", "partially_covered"): level_stats[level]["covered"] += 1 total_cap = 0 max_cap = 0 for br in benchmark_results: total_cap += br["capability_score"] max_cap += 3 capability_accuracy = (total_cap / max_cap * 100) if max_cap else 0 return { "total_benchmarks": total_benchmarks, "total_tags": total_tags, "covered_tags": covered_count, "uncovered_tags": total_tags - covered_count, "tag_coverage_pct": round(tag_coverage_pct, 1), "fully_covered_benchmarks": fully_covered, "partially_covered_benchmarks": partially_covered, "not_covered_benchmarks": not_covered, "benchmark_full_coverage_pct": round(benchmark_coverage_pct, 1), "benchmark_any_coverage_pct": round(benchmark_any_coverage_pct, 1), "capability_weighted_accuracy": round(capability_accuracy, 1), "ns_total_types": len(ns_types), "tag_mapping": tag_mapping, "tag_counter": dict(tag_counter), "covered_tag_list": sorted(covered_tags), "uncovered_tag_list": sorted(uncovered_tags), "level_stats": dict(level_stats), "benchmark_results": benchmark_results, } def print_coverage_report(analysis: Dict): """Print formatted coverage report.""" print() print("=" * 70) print(" NEUROSPLOIT BENCHMARK COVERAGE ANALYSIS") print("=" * 70) print() print(f" Total Benchmarks: {analysis['total_benchmarks']}") print(f" NeuroSploit Vuln Types: {analysis['ns_total_types']}") print() print(" --- TAG COVERAGE ---") print(f" Unique Tags in Benchmarks: {analysis['total_tags']}") print(f" Tags Mapped to NS Types: {analysis['covered_tags']} / {analysis['total_tags']}") print(f" Tag Coverage: {analysis['tag_coverage_pct']}%") print() print(f" Covered Tags: {', '.join(analysis['covered_tag_list'])}") print(f" Uncovered Tags: {', '.join(analysis['uncovered_tag_list'])}") print() print(" --- BENCHMARK COVERAGE ---") print(f" Fully Covered: {analysis['fully_covered_benchmarks']} / {analysis['total_benchmarks']} ({analysis['benchmark_full_coverage_pct']}%)") print(f" Partially Covered: {analysis['partially_covered_benchmarks']} 
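# Worked example of the capability-weighted accuracy reported here (formula from
# analyze_coverage(); numbers hypothetical): with two benchmarks whose best mapped
# capability scores are 3 and 2, the metric is (3 + 2) / (3 * 2) * 100 = 83.3%.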
/ {analysis['total_benchmarks']}") print(f" Not Covered: {analysis['not_covered_benchmarks']} / {analysis['total_benchmarks']}") print(f" Any Coverage: {analysis['benchmark_any_coverage_pct']}%") print() print(" --- DETECTION CAPABILITY ---") print(f" Capability-Weighted Accuracy: {analysis['capability_weighted_accuracy']}%") print(f" (Score: 3=full tester+payloads+AI, 2=tester+basic, 1=inspection, 0=none)") print() print(" --- LEVEL BREAKDOWN ---") for level in sorted(analysis["level_stats"].keys()): stats = analysis["level_stats"][level] pct = round(stats["covered"] / stats["total"] * 100, 1) if stats["total"] else 0 label = {"1": "Easy", "2": "Medium", "3": "Hard"}.get(level, level) print(f" Level {level} ({label}): {stats['covered']}/{stats['total']} covered ({pct}%)") print() print(" --- TAG FREQUENCY ---") sorted_tags = sorted(analysis["tag_counter"].items(), key=lambda x: -x[1]) for tag, count in sorted_tags: mapped = analysis["tag_mapping"].get(tag, []) status = "OK" if mapped else "NO MAP" ns_str = ", ".join(mapped[:3]) if mapped else "-" print(f" {tag:30s} {count:3d} benchmarks [{status}] -> {ns_str}") print() print(" --- PER-BENCHMARK DETAIL ---") for br in analysis["benchmark_results"]: cap_str = ["_", "L", "M", "H"][br["capability_score"]] status_sym = {"fully_covered": "+", "partially_covered": "~", "not_covered": "-"}[br["status"]] print(f" [{status_sym}][{cap_str}] {br['id']} L{br['level']} {br['coverage_pct']:5.0f}% tags={','.join(br['tags'])}") print() print("=" * 70) print(f" FINAL ACCURACY: {analysis['capability_weighted_accuracy']}% capability-weighted") print(f" TYPE COVERAGE: {analysis['tag_coverage_pct']}% of benchmark vuln tags") print(f" FULL COVERAGE: {analysis['benchmark_full_coverage_pct']}% of benchmarks fully covered") print(f" ANY COVERAGE: {analysis['benchmark_any_coverage_pct']}% of benchmarks with any coverage") print("=" * 70) print() # =========================================================================== # CLI Entry Point # =========================================================================== def main(): """Main CLI entry point with subcommands.""" import argparse parser = argparse.ArgumentParser( description="NeuroSploit Benchmark Runner & Coverage Analyzer", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: %(prog)s run Run all 104 benchmarks %(prog)s run --ids XBEN-050-24 Run specific benchmark %(prog)s run --tags xss sqli --level 1 Filter by tag and level %(prog)s run --mode auto_pentest --timeout 600 Thorough mode, 10min timeout %(prog)s run --dry-run Show what would run %(prog)s run --no-resume Start fresh (ignore progress) %(prog)s coverage Static coverage analysis %(prog)s coverage --json Coverage output as JSON """, ) subparsers = parser.add_subparsers(dest="command", help="Command") # 'run' subcommand run_parser = subparsers.add_parser("run", help="Execute benchmarks against live targets") run_parser.add_argument("--benchmarks", default="/tmp/xbow-benchmarks/benchmarks", help="Path to benchmarks directory") run_parser.add_argument("--results-dir", default="reports/benchmark_results", help="Output directory for results") run_parser.add_argument("--mode", choices=["full_auto", "auto_pentest", "recon_only"], default="full_auto", help="Agent operation mode") run_parser.add_argument("--timeout", type=int, default=300, help="Per-benchmark agent timeout in seconds (default: 300)") run_parser.add_argument("--build-timeout", type=int, default=300, help="Docker build timeout in seconds (default: 300)") 
run_parser.add_argument("--startup-timeout", type=int, default=180, help="Docker startup timeout in seconds (default: 180)") run_parser.add_argument("--tags", nargs="+", help="Filter by benchmark tags") run_parser.add_argument("--level", nargs="+", type=int, help="Filter by level (1, 2, 3)") run_parser.add_argument("--ids", nargs="+", help="Filter by benchmark IDs") run_parser.add_argument("--no-resume", action="store_true", help="Start fresh (ignore previous progress)") run_parser.add_argument("--dry-run", action="store_true", help="Show which benchmarks would run without executing") run_parser.add_argument("--keep-images", action="store_true", help="Keep Docker images after each benchmark (faster re-runs, uses more disk)") # 'coverage' subcommand cov_parser = subparsers.add_parser("coverage", help="Static coverage analysis (no execution)") cov_parser.add_argument("--benchmarks", default="/tmp/xbow-benchmarks/benchmarks", help="Path to benchmarks directory") cov_parser.add_argument("--json", action="store_true", help="Output as JSON") args = parser.parse_args() # Default to 'coverage' if no subcommand (backward compatible) if args.command == "run": if not os.path.isdir(args.benchmarks): print(f"Error: Benchmarks directory not found: {args.benchmarks}") sys.exit(1) runner = BenchmarkRunner( benchmarks_dir=args.benchmarks, results_dir=args.results_dir, agent_mode=args.mode, per_benchmark_timeout=args.timeout, build_timeout=args.build_timeout, startup_timeout=args.startup_timeout, tag_filter=args.tags, level_filter=args.level, id_filter=args.ids, resume=not args.no_resume, dry_run=args.dry_run, keep_images=args.keep_images, ) asyncio.run(runner.run_all()) else: # Coverage analysis (default or explicit 'coverage' subcommand) benchmarks_dir = getattr(args, "benchmarks", "/tmp/xbow-benchmarks/benchmarks") output_json = getattr(args, "json", False) if not os.path.isdir(benchmarks_dir): print(f"Error: Benchmarks directory not found: {benchmarks_dir}") sys.exit(1) benchmarks = load_benchmarks(benchmarks_dir) if not benchmarks: print("Error: No benchmarks found") sys.exit(1) ns_types, ns_info = load_neurosploit_types() analysis = analyze_coverage(benchmarks, ns_types) if output_json: output = {k: v for k, v in analysis.items() if k != "benchmark_results"} output["benchmark_summary"] = [ {"id": br["id"], "coverage": br["coverage_pct"], "capability": br["capability_score"]} for br in analysis["benchmark_results"] ] print(json.dumps(output, indent=2)) else: print_coverage_report(analysis) if __name__ == "__main__": main()