NeuroSploit v3.2 - Autonomous AI Penetration Testing Platform

116 modules | 100 vuln types | 18 API routes | 18 frontend pages

Major features:
- VulnEngine: 100 vuln types, 526+ payloads, 12 testers, anti-hallucination prompts
- Autonomous Agent: 3-stream auto pentest, multi-session (5 concurrent), pause/resume/stop
- CLI Agent: Claude Code / Gemini CLI / Codex CLI inside Kali containers
- Validation Pipeline: negative controls, proof of execution, confidence scoring, judge
- AI Reasoning: ReACT engine, token budget, endpoint classifier, CVE hunter, deep recon
- Multi-Agent: 5 specialists + orchestrator + researcher AI + vuln type agents
- RAG System: BM25/TF-IDF/ChromaDB vectorstore, few-shot, reasoning templates
- Smart Router: 20 providers (8 CLI OAuth + 12 API), tier failover, token refresh
- Kali Sandbox: container-per-scan, 56 tools, VPN support, on-demand install
- Full AI Testing: methodology-driven comprehensive pentest sessions
- Notifications: Discord, Telegram, WhatsApp/Twilio multi-channel alerts
- Frontend: React/TypeScript with 18 pages, real-time WebSocket updates
This commit is contained in:
CyberSecurityUP
2026-02-22 17:58:12 -03:00
commit e0935793c5
271 changed files with 132462 additions and 0 deletions

View File

@@ -0,0 +1,552 @@
"""
Methodology Loader - Parses external pentest methodology .md files and indexes them
for smart injection into all LLM call sites in the autonomous agent.
Supports FASE-based methodology documents (like pentestcompleto.md) as well as
generic markdown documents. Maps sections to vulnerability types and agent contexts
for targeted injection with per-context character budgets.
"""
import logging
import os
import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── FASE → Vulnerability Type Mapping ───────────────────────────────────────
# Maps each FASE section to the agent's vulnerability type identifiers.
# These match the 100 types in vuln_engine/registry.py.
# FASEs mapped to an empty list are broad/strategic phases: they are injected
# via FASE_CONTEXT_MAP (below) rather than through per-vulnerability lookups.
FASE_VULN_TYPE_MAP: Dict[str, List[str]] = {
    "fase_0": [],  # Recon - broad, no specific vuln types
    "fase_1": [],  # Architecture analysis - broad strategy
    "fase_2": [  # Authentication & session management
        "jwt_manipulation", "session_fixation", "broken_auth", "auth_bypass",
        "insecure_password_reset", "account_takeover", "cookie_manipulation",
        "captcha_bypass", "session_hijacking",
    ],
    "fase_3": [  # Authorization / access control
        "idor", "bola", "bfla", "privilege_escalation", "forced_browsing",
        "auth_bypass", "mass_assignment",
    ],
    "fase_4": [  # Business logic & workflow abuse
        "race_condition", "business_logic", "workflow_bypass",
        "payment_manipulation", "insufficient_anti_automation",
    ],
    "fase_5": [],  # CVE/Zero-day - applies to all types via strategy context
    "fase_6": [  # Cloud & infrastructure misconfiguration
        "ssrf", "cloud_misconfig", "s3_bucket_misconfiguration",
        "cloud_metadata_exposure", "serverless_misconfiguration",
        "kubernetes_misconfig", "iam_misconfig",
    ],
    "fase_7": [],  # OWASP WSTG reference - strategy context
    "fase_8": [  # REST API security
        "bola", "bfla", "mass_assignment", "excessive_data_exposure",
        "api_abuse", "api_rate_limiting", "rest_api_versioning",
        "broken_auth", "ssrf",
    ],
    "fase_9": [  # GraphQL / WebSocket / gRPC
        "graphql_injection", "graphql_introspection", "graphql_dos",
        "websocket_security", "grpc_security",
    ],
    "fase_10": [  # Injection attacks
        "sqli_error", "sqli_union", "sqli_blind", "sqli_time", "sqli_oob",
        "nosql_injection", "ssti", "ldap_injection", "xpath_injection",
        "crlf_injection", "header_injection", "parameter_pollution",
        "command_injection", "email_injection", "expression_language_injection",
        "log_injection", "orm_injection", "ssi_injection", "xslt_injection",
        "csv_injection",
    ],
    "fase_11": [  # Client-side attacks (XSS and friends)
        "xss_reflected", "xss_stored", "xss_dom", "cors_misconfig",
        "csp_bypass", "clickjacking", "open_redirect", "prototype_pollution",
        "html_injection", "css_injection", "dom_clobbering", "postmessage_abuse",
        "dangling_markup",
    ],
    "fase_12": [  # HTTP protocol-level attacks
        "http_request_smuggling", "cache_poisoning", "cache_deception",
        "http2_smuggling", "connection_pool_poisoning", "http_method_tampering",
    ],
    "fase_13": [  # File handling
        "file_upload", "lfi", "rfi", "path_traversal", "zip_slip",
    ],
    "fase_14": [  # Server-side request forgery
        "ssrf", "dns_rebinding", "blind_ssrf",
    ],
    "fase_15": [  # Authentication attack techniques
        "broken_auth", "insecure_password_reset", "brute_force",
        "account_enumeration", "captcha_bypass", "session_fixation",
        "account_takeover", "mfa_bypass",
    ],
    "fase_16": [  # Mass assignment & rate limiting
        "mass_assignment", "rate_limit_bypass", "api_rate_limiting",
        "brute_force",
    ],
    "fase_17": [  # Information disclosure & misconfiguration
        "information_disclosure", "subdomain_takeover", "directory_listing",
        "default_credentials", "security_headers", "ssl_tls",
        "debug_endpoints", "backup_files", "source_code_exposure",
        "sensitive_data_exposure",
    ],
    "fase_18": [  # Deserialization
        "insecure_deserialization",
    ],
    "fase_19": [  # Denial of service
        "denial_of_service", "graphql_dos", "redos", "xml_bomb",
    ],
    "fase_20": [  # XML external entity
        "xxe",
    ],
}
# ─── FASE → Agent Context Mapping ────────────────────────────────────────────
# Maps each FASE to the agent contexts where it should be injected.
# Context names used here are the agent's LLM call sites: "strategy",
# "testing", "verification", "confirmation", and "reporting".
FASE_CONTEXT_MAP: Dict[str, List[str]] = {
    "fase_0": ["strategy"],
    "fase_1": ["strategy"],
    "fase_2": ["testing", "verification", "confirmation"],
    "fase_3": ["testing", "verification", "confirmation"],
    "fase_4": ["testing", "confirmation", "strategy"],
    "fase_5": ["strategy", "testing"],
    "fase_6": ["testing", "verification"],
    "fase_7": ["strategy"],
    "fase_8": ["testing", "verification", "confirmation"],
    "fase_9": ["testing", "verification"],
    "fase_10": ["testing", "verification", "confirmation"],
    "fase_11": ["testing", "verification", "confirmation"],
    "fase_12": ["testing", "verification"],
    "fase_13": ["testing", "verification", "confirmation"],
    "fase_14": ["testing", "verification"],
    "fase_15": ["testing", "verification", "confirmation"],
    "fase_16": ["testing", "confirmation"],
    "fase_17": ["testing", "reporting"],
    "fase_18": ["testing", "verification", "confirmation"],
    "fase_19": ["testing"],
    "fase_20": ["testing", "verification", "confirmation"],
}
# ─── Keyword → Vuln Type Mapping (for non-FASE documents) ───────────────────
# Keys are lowercase substrings searched for in lowercased document text
# (see MethodologyLoader._detect_vuln_types_by_keywords); values are the
# vuln type identifiers attributed to a section when the keyword appears.
KEYWORD_VULN_MAP: Dict[str, List[str]] = {
    "sql injection": ["sqli_error", "sqli_union", "sqli_blind", "sqli_time"],
    "xss": ["xss_reflected", "xss_stored", "xss_dom"],
    "cross-site scripting": ["xss_reflected", "xss_stored", "xss_dom"],
    "ssrf": ["ssrf", "blind_ssrf"],
    "server-side request forgery": ["ssrf", "blind_ssrf"],
    "xxe": ["xxe"],
    "xml external entity": ["xxe"],
    "ssti": ["ssti"],
    "template injection": ["ssti"],
    "idor": ["idor", "bola"],
    "broken access": ["bola", "bfla", "idor"],
    "deserialization": ["insecure_deserialization"],
    "file upload": ["file_upload"],
    "lfi": ["lfi", "path_traversal"],
    "local file inclusion": ["lfi", "path_traversal"],
    "rfi": ["rfi"],
    "remote file inclusion": ["rfi"],
    "command injection": ["command_injection"],
    "cors": ["cors_misconfig"],
    "csrf": ["csrf"],
    "clickjacking": ["clickjacking"],
    "open redirect": ["open_redirect"],
    "jwt": ["jwt_manipulation"],
    "oauth": ["broken_auth", "auth_bypass"],
    "race condition": ["race_condition"],
    "prototype pollution": ["prototype_pollution"],
    "request smuggling": ["http_request_smuggling"],
    "cache poisoning": ["cache_poisoning"],
    "graphql": ["graphql_injection", "graphql_introspection", "graphql_dos"],
    "websocket": ["websocket_security"],
    "nosql": ["nosql_injection"],
    "ldap": ["ldap_injection"],
    "crlf": ["crlf_injection"],
    "mass assignment": ["mass_assignment"],
    "rate limit": ["rate_limit_bypass", "api_rate_limiting"],
}
@dataclass
class MethodologySection:
    """A parsed section from a methodology document."""
    # Section identifier: "fase_N" for FASE documents, "section_N" for
    # generic markdown, or "db_prompt_N" for database-loaded prompts.
    fase_id: str
    # Heading text, e.g. "FASE 10: ...".
    title: str
    # Full markdown body of the section.
    content: str
    # "### heading" title → body text, extracted from content.
    sub_sections: Dict[str, str] = field(default_factory=dict)
    # Vulnerability type identifiers this section covers.
    vuln_types: List[str] = field(default_factory=list)
    # Agent contexts (e.g. "testing", "strategy") where this is injected.
    contexts: List[str] = field(default_factory=list)
    @property
    def char_count(self) -> int:
        """Number of characters in the section body."""
        return len(self.content)
class MethodologyIndex:
    """Indexed methodology for fast retrieval by vuln_type and context.

    Sections are stored by fase_id; two inverted indexes map vulnerability
    types and agent contexts back to the fase_ids that cover them, so LLM
    call sites can pull only the relevant methodology text.
    """

    def __init__(self):
        # fase_id → section
        self.sections: Dict[str, "MethodologySection"] = {}
        self.vuln_type_index: Dict[str, List[str]] = {}  # vuln_type → [fase_ids]
        self.context_index: Dict[str, List[str]] = {}  # context → [fase_ids]

    def add_section(self, section: "MethodologySection") -> None:
        """Register a section and update both inverted indexes.

        NOTE(review): re-adding a fase_id replaces the stored section but
        leaves the previously appended index entries in place.
        """
        self.sections[section.fase_id] = section
        for vt in section.vuln_types:
            self.vuln_type_index.setdefault(vt, []).append(section.fase_id)
        for ctx in section.contexts:
            self.context_index.setdefault(ctx, []).append(section.fase_id)

    @staticmethod
    def _fase_order(fase_id: str) -> tuple:
        """Numeric-aware sort key so "fase_2" precedes "fase_10".

        Plain lexicographic sorting would emit fase_10..fase_19 before
        fase_2, scrambling the methodology's document order. Ids ending in
        digits sort by that number; other ids sort after, alphabetically.
        """
        m = re.search(r"(\d+)$", fase_id)
        if m:
            return (0, int(m.group(1)), fase_id)
        return (1, 0, fase_id)

    def get_for_vuln_and_context(
        self,
        vuln_type: str,
        context: str,
        max_chars: int = 2000,
    ) -> str:
        """Get methodology text relevant to both vuln_type and context.

        Prefers sub-sections that mention the vuln_type for precision and
        truncates the combined output to the max_chars budget. Returns ""
        when nothing relevant is indexed.
        """
        if not self.sections:
            return ""
        candidate_fase_ids: set = set()
        # Find FASEs matching vuln_type
        if vuln_type:
            # Direct match
            for fid in self.vuln_type_index.get(vuln_type, []):
                candidate_fase_ids.add(fid)
            # Fuzzy match: retry with common variant suffixes stripped
            # (e.g. xss_reflected → xss)
            base_vt = vuln_type.replace("_reflected", "").replace("_stored", "").replace("_dom", "")
            base_vt = base_vt.replace("_error", "").replace("_union", "").replace("_blind", "").replace("_time", "")
            if base_vt != vuln_type:
                for fid in self.vuln_type_index.get(base_vt, []):
                    candidate_fase_ids.add(fid)
        # Filter by context
        if context:
            context_fases = set(self.context_index.get(context, []))
            if candidate_fase_ids:
                # Intersect for precision
                filtered = candidate_fase_ids & context_fases
                if filtered:
                    candidate_fase_ids = filtered
                # If intersection is empty, keep vuln_type matches (they're more specific)
            else:
                # No vuln_type specified: use all context matches
                candidate_fase_ids = context_fases
        if not candidate_fase_ids:
            return ""
        # Build output in document order, preferring targeted sub-sections.
        # (Fix: sort numerically via _fase_order, not lexicographically.)
        parts: List[str] = []
        total = 0
        for fase_id in sorted(candidate_fase_ids, key=self._fase_order):
            section = self.sections.get(fase_id)
            if not section:
                continue
            remaining = max_chars - total
            if remaining < 100:
                break
            # Try to find a targeted sub-section first
            best_sub = self._find_best_subsection(section, vuln_type)
            if best_sub:
                title, content = best_sub
                text = f"### {title}\n{content}"
            else:
                # Use full section content, truncated
                text = f"### {section.title}\n{section.content}"
            if len(text) > remaining:
                text = text[:remaining]
            if len(text) < 50:
                continue  # Skip tiny fragments
            parts.append(text)
            total += len(text)
        return "\n\n".join(parts)

    def _find_best_subsection(
        self, section: "MethodologySection", vuln_type: str
    ) -> Optional[tuple]:
        """Find the sub-section most relevant to a vuln_type.

        A title match (score 10) beats a match in the first 500 chars of
        the body (score 5). Returns (title, content) or None.
        """
        if not vuln_type or not section.sub_sections:
            return None
        # Normalize for matching
        vt_variants = set()
        vt_lower = vuln_type.lower()
        vt_variants.add(vt_lower)
        vt_variants.add(vt_lower.replace("_", " "))
        vt_variants.add(vt_lower.replace("_", "-"))
        # Common name mappings
        name_map = {
            "sqli": "sql injection",
            "xss_reflected": "reflected xss",
            "xss_stored": "stored xss",
            "xss_dom": "dom xss",
            "lfi": "lfi",
            "rfi": "rfi",
            "ssrf": "ssrf",
            "ssti": "ssti",
            "xxe": "xxe",
            "nosql_injection": "nosql",
            "crlf_injection": "crlf",
            "cors_misconfig": "cors",
            "insecure_deserialization": "deserialization",
            "http_request_smuggling": "request smuggling",
            "cache_poisoning": "cache poisoning",
            "prototype_pollution": "prototype pollution",
        }
        mapped = name_map.get(vt_lower)
        if mapped:
            vt_variants.add(mapped)
        best_score = 0
        best = None
        for sub_title, sub_content in section.sub_sections.items():
            title_lower = sub_title.lower()
            # Hoisted: lowercase the body prefix once per sub-section,
            # not once per variant.
            content_prefix = sub_content[:500].lower()
            score = 0
            for variant in vt_variants:
                if variant in title_lower:
                    score = 10  # Title match is strongest
                    break
                if variant in content_prefix:
                    score = max(score, 5)  # Content match
            if score > best_score:
                best_score = score
                best = (sub_title, sub_content)
        return best
class MethodologyLoader:
    """Loads and indexes methodology documents from files or DB prompts.

    Produces MethodologyIndex instances; all loading paths are best-effort
    and return an empty index rather than raising on bad input.
    """
    def load_from_file(self, file_path: str) -> MethodologyIndex:
        """Load a .md methodology file and build an index.

        Args:
            file_path: Path to the markdown methodology document.
        Returns:
            A populated MethodologyIndex, or an empty one when the file is
            missing or unreadable (logged, never raised).
        """
        if not os.path.exists(file_path):
            logger.warning(f"Methodology file not found: {file_path}")
            return MethodologyIndex()
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        except Exception as e:
            # Best-effort: a broken file must not crash the agent.
            logger.error(f"Failed to read methodology file: {e}")
            return MethodologyIndex()
        sections = self._parse_markdown_sections(content)
        index = MethodologyIndex()
        for section in sections:
            index.add_section(section)
        logger.info(
            f"[METHODOLOGY] Loaded {len(sections)} sections from {file_path} "
            f"({sum(s.char_count for s in sections)} chars, "
            f"{len(index.vuln_type_index)} vuln types mapped)"
        )
        return index
    def load_from_db_prompts(self, prompts: List[Dict]) -> MethodologyIndex:
        """Index database-loaded custom prompts into a MethodologyIndex.

        Each prompt dict is read for "content" (required), plus optional
        "name" and "parsed_vulnerabilities" (list of {"type": ...} dicts).
        Prompts with empty content are skipped.
        """
        index = MethodologyIndex()
        for i, p in enumerate(prompts):
            content = p.get("content", "")
            if not content:
                continue
            parsed_vulns = p.get("parsed_vulnerabilities", [])
            # Try FASE-based parsing first
            sections = self._parse_markdown_sections(content)
            if not sections:
                # Treat entire content as one section
                vuln_types = [
                    v.get("type", "") for v in parsed_vulns if v.get("type")
                ]
                if not vuln_types:
                    # No pre-parsed types in the DB row: fall back to
                    # keyword scanning of the raw content.
                    vuln_types = self._detect_vuln_types_by_keywords(content)
                # Unstructured prompts are injected into every context.
                section = MethodologySection(
                    fase_id=f"db_prompt_{i}",
                    title=p.get("name", f"Custom Prompt {i}"),
                    content=content,
                    sub_sections={},
                    vuln_types=vuln_types,
                    contexts=["testing", "strategy", "confirmation",
                              "verification", "reporting"],
                )
                sections = [section]
            for section in sections:
                index.add_section(section)
        logger.info(
            f"[METHODOLOGY] Indexed {len(index.sections)} sections from "
            f"{len(prompts)} DB prompts"
        )
        return index
    def merge_indices(self, *indices: MethodologyIndex) -> MethodologyIndex:
        """Merge multiple MethodologyIndex objects into one.

        Earlier indices win on fase_id collisions (first one kept).
        """
        merged = MethodologyIndex()
        for idx in indices:
            for section in idx.sections.values():
                # Avoid duplicate fase_ids
                if section.fase_id not in merged.sections:
                    merged.add_section(section)
        return merged
    def _parse_markdown_sections(self, content: str) -> List[MethodologySection]:
        """Parse a markdown document into indexed sections.
        Looks for FASE headings first, falls back to generic ## headings.
        """
        sections = self._parse_fase_sections(content)
        if sections:
            return sections
        # Fallback: parse generic ## headings
        return self._parse_generic_sections(content)
    def _parse_fase_sections(self, content: str) -> List[MethodologySection]:
        """Parse FASE-structured methodology documents.

        Returns [] when no FASE headings are found, which signals the
        caller to fall back to generic ## parsing.
        """
        # Match ## FASE N: or # FASE N: or ## 🔐 FASE N: (with emoji)
        fase_pattern = re.compile(
            r'^(#{1,2})\s*(?:[^\w]*\s*)?FASE\s+(\d+)\s*[:\-]?\s*(.*?)$',
            re.MULTILINE | re.IGNORECASE,
        )
        matches = list(fase_pattern.finditer(content))
        if not matches:
            return []
        sections: List[MethodologySection] = []
        # Also capture pre-FASE content (e.g. recon steps before FASE 1),
        # but only when it is substantial (> 200 chars before the first
        # heading); short preambles are ignored.
        if matches[0].start() > 200:
            pre_content = content[:matches[0].start()].strip()
            if pre_content:
                pre_subs = self._extract_sub_sections(pre_content)
                sections.append(MethodologySection(
                    fase_id="fase_0",
                    title="Recon & Preparation",
                    content=pre_content,
                    sub_sections=pre_subs,
                    vuln_types=FASE_VULN_TYPE_MAP.get("fase_0", []),
                    contexts=FASE_CONTEXT_MAP.get("fase_0", ["strategy"]),
                ))
        for i, match in enumerate(matches):
            fase_num = match.group(2)
            fase_title = f"FASE {fase_num}: {match.group(3).strip()}"
            # Section body runs from the end of this heading to the start
            # of the next FASE heading (or end of document).
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(content)
            body = content[start:end].strip()
            fase_id = f"fase_{fase_num}"
            sub_sections = self._extract_sub_sections(body)
            vuln_types = FASE_VULN_TYPE_MAP.get(fase_id, [])
            contexts = FASE_CONTEXT_MAP.get(fase_id, ["testing"])
            # If not in our hardcoded map, try keyword detection
            if not vuln_types:
                vuln_types = self._detect_vuln_types_by_keywords(body)
            sections.append(MethodologySection(
                fase_id=fase_id,
                title=fase_title,
                content=body,
                sub_sections=sub_sections,
                vuln_types=vuln_types,
                contexts=contexts,
            ))
        return sections
    def _parse_generic_sections(self, content: str) -> List[MethodologySection]:
        """Parse generic ## heading structured documents.

        Vuln types are inferred from the heading plus the first 1000 chars
        of each body; contexts default to testing + strategy.
        """
        heading_pattern = re.compile(r'^##\s+(.*?)$', re.MULTILINE)
        matches = list(heading_pattern.finditer(content))
        if not matches:
            return []
        sections: List[MethodologySection] = []
        for i, match in enumerate(matches):
            title = match.group(1).strip()
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(content)
            body = content[start:end].strip()
            vuln_types = self._detect_vuln_types_by_keywords(
                title + " " + body[:1000]
            )
            sub_sections = self._extract_sub_sections(body)
            sections.append(MethodologySection(
                fase_id=f"section_{i}",
                title=title,
                content=body,
                sub_sections=sub_sections,
                vuln_types=vuln_types,
                contexts=["testing", "strategy"],
            ))
        return sections
    def _extract_sub_sections(self, body: str) -> Dict[str, str]:
        """Extract ### sub-sections from a section body.

        Returns a mapping of sub-heading title → body text; sub-sections
        with empty bodies are dropped.
        """
        sub_pattern = re.compile(r'^###\s+(.*?)$', re.MULTILINE)
        sub_matches = list(sub_pattern.finditer(body))
        sub_sections: Dict[str, str] = {}
        for j, sub in enumerate(sub_matches):
            sub_title = sub.group(1).strip()
            sub_start = sub.end()
            sub_end = (
                sub_matches[j + 1].start()
                if j + 1 < len(sub_matches)
                else len(body)
            )
            sub_content = body[sub_start:sub_end].strip()
            if sub_content:
                sub_sections[sub_title] = sub_content
        return sub_sections
    def _detect_vuln_types_by_keywords(self, text: str) -> List[str]:
        """Detect vuln types from text content via keyword matching.

        Case-insensitive substring search over KEYWORD_VULN_MAP; the
        result is de-duplicated while preserving first-seen order.
        """
        text_lower = text.lower()
        found: List[str] = []
        seen: set = set()
        for keyword, types in KEYWORD_VULN_MAP.items():
            if keyword in text_lower:
                for vt in types:
                    if vt not in seen:
                        found.append(vt)
                        seen.add(vt)
        return found