# NeuroSploit/backend/core/methodology_loader.py

"""
Methodology Loader - Parses external pentest methodology .md files and indexes them
for smart injection into all LLM call sites in the autonomous agent.

Supports FASE-based methodology documents (like pentestcompleto.md) as well as
generic markdown documents. Maps sections to vulnerability types and agent contexts
for targeted injection with per-context character budgets.
"""

import logging
import os
import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional

# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)


# ─── FASE → Vulnerability Type Mapping ───────────────────────────────────────
# Maps each FASE section to the agent's vulnerability type identifiers.
# These match the 100 types in vuln_engine/registry.py.

# Keys use the "fase_<N>" form that MethodologyLoader._parse_fase_sections
# builds from the number in each FASE heading. Values are the vulnerability
# type identifiers attached to the parsed section; MethodologyIndex.add_section
# indexes the section under every identifier in its list.
# The same identifier may appear under several FASEs (e.g. "ssrf" is listed
# for fase_6, fase_8 and fase_14).
FASE_VULN_TYPE_MAP: Dict[str, List[str]] = {
    "fase_0": [],  # Recon - broad, no specific vuln types
    "fase_1": [],  # Architecture analysis - broad strategy
    "fase_2": [
        "jwt_manipulation", "session_fixation", "broken_auth", "auth_bypass",
        "insecure_password_reset", "account_takeover", "cookie_manipulation",
        "captcha_bypass", "session_hijacking",
    ],
    "fase_3": [
        "idor", "bola", "bfla", "privilege_escalation", "forced_browsing",
        "auth_bypass", "mass_assignment",
    ],
    "fase_4": [
        "race_condition", "business_logic", "workflow_bypass",
        "payment_manipulation", "insufficient_anti_automation",
    ],
    "fase_5": [],  # CVE/Zero-day - applies to all types via strategy context
    "fase_6": [
        "ssrf", "cloud_misconfig", "s3_bucket_misconfiguration",
        "cloud_metadata_exposure", "serverless_misconfiguration",
        "kubernetes_misconfig", "iam_misconfig",
    ],
    "fase_7": [],  # OWASP WSTG reference - strategy context
    "fase_8": [
        "bola", "bfla", "mass_assignment", "excessive_data_exposure",
        "api_abuse", "api_rate_limiting", "rest_api_versioning",
        "broken_auth", "ssrf",
    ],
    "fase_9": [
        "graphql_injection", "graphql_introspection", "graphql_dos",
        "websocket_security", "grpc_security",
    ],
    "fase_10": [
        "sqli_error", "sqli_union", "sqli_blind", "sqli_time", "sqli_oob",
        "nosql_injection", "ssti", "ldap_injection", "xpath_injection",
        "crlf_injection", "header_injection", "parameter_pollution",
        "command_injection", "email_injection", "expression_language_injection",
        "log_injection", "orm_injection", "ssi_injection", "xslt_injection",
        "csv_injection",
    ],
    "fase_11": [
        "xss_reflected", "xss_stored", "xss_dom", "cors_misconfig",
        "csp_bypass", "clickjacking", "open_redirect", "prototype_pollution",
        "html_injection", "css_injection", "dom_clobbering", "postmessage_abuse",
        "dangling_markup",
    ],
    "fase_12": [
        "http_request_smuggling", "cache_poisoning", "cache_deception",
        "http2_smuggling", "connection_pool_poisoning", "http_method_tampering",
    ],
    "fase_13": [
        "file_upload", "lfi", "rfi", "path_traversal", "zip_slip",
    ],
    "fase_14": [
        "ssrf", "dns_rebinding", "blind_ssrf",
    ],
    "fase_15": [
        "broken_auth", "insecure_password_reset", "brute_force",
        "account_enumeration", "captcha_bypass", "session_fixation",
        "account_takeover", "mfa_bypass",
    ],
    "fase_16": [
        "mass_assignment", "rate_limit_bypass", "api_rate_limiting",
        "brute_force",
    ],
    "fase_17": [
        "information_disclosure", "subdomain_takeover", "directory_listing",
        "default_credentials", "security_headers", "ssl_tls",
        "debug_endpoints", "backup_files", "source_code_exposure",
        "sensitive_data_exposure",
    ],
    "fase_18": [
        "insecure_deserialization",
    ],
    "fase_19": [
        "denial_of_service", "graphql_dos", "redos", "xml_bomb",
    ],
    "fase_20": [
        "xxe",
    ],
}


# ─── FASE → Agent Context Mapping ────────────────────────────────────────────
# Maps each FASE to the agent contexts where it should be injected.

# Keys are the same "fase_<N>" ids used by FASE_VULN_TYPE_MAP. The context
# names used in this table are "strategy", "testing", "verification",
# "confirmation" and "reporting". A FASE id that is missing from this table
# falls back to ["testing"] in MethodologyLoader._parse_fase_sections.
FASE_CONTEXT_MAP: Dict[str, List[str]] = {
    "fase_0": ["strategy"],
    "fase_1": ["strategy"],
    "fase_2": ["testing", "verification", "confirmation"],
    "fase_3": ["testing", "verification", "confirmation"],
    "fase_4": ["testing", "confirmation", "strategy"],
    "fase_5": ["strategy", "testing"],
    "fase_6": ["testing", "verification"],
    "fase_7": ["strategy"],
    "fase_8": ["testing", "verification", "confirmation"],
    "fase_9": ["testing", "verification"],
    "fase_10": ["testing", "verification", "confirmation"],
    "fase_11": ["testing", "verification", "confirmation"],
    "fase_12": ["testing", "verification"],
    "fase_13": ["testing", "verification", "confirmation"],
    "fase_14": ["testing", "verification"],
    "fase_15": ["testing", "verification", "confirmation"],
    "fase_16": ["testing", "confirmation"],
    "fase_17": ["testing", "reporting"],
    "fase_18": ["testing", "verification", "confirmation"],
    "fase_19": ["testing"],
    "fase_20": ["testing", "verification", "confirmation"],
}


# ─── Keyword → Vuln Type Mapping (for non-FASE documents) ───────────────────

# Consumed by MethodologyLoader._detect_vuln_types_by_keywords. Keys must be
# lowercase because the scanned text is lowercased before the lookup. The
# detected types are returned in this dict's insertion order, de-duplicated,
# so reordering entries changes the order of the result.
KEYWORD_VULN_MAP: Dict[str, List[str]] = {
    "sql injection": ["sqli_error", "sqli_union", "sqli_blind", "sqli_time"],
    "xss": ["xss_reflected", "xss_stored", "xss_dom"],
    "cross-site scripting": ["xss_reflected", "xss_stored", "xss_dom"],
    "ssrf": ["ssrf", "blind_ssrf"],
    "server-side request forgery": ["ssrf", "blind_ssrf"],
    "xxe": ["xxe"],
    "xml external entity": ["xxe"],
    "ssti": ["ssti"],
    "template injection": ["ssti"],
    "idor": ["idor", "bola"],
    "broken access": ["bola", "bfla", "idor"],
    "deserialization": ["insecure_deserialization"],
    "file upload": ["file_upload"],
    "lfi": ["lfi", "path_traversal"],
    "local file inclusion": ["lfi", "path_traversal"],
    "rfi": ["rfi"],
    "remote file inclusion": ["rfi"],
    "command injection": ["command_injection"],
    "cors": ["cors_misconfig"],
    "csrf": ["csrf"],
    "clickjacking": ["clickjacking"],
    "open redirect": ["open_redirect"],
    "jwt": ["jwt_manipulation"],
    "oauth": ["broken_auth", "auth_bypass"],
    "race condition": ["race_condition"],
    "prototype pollution": ["prototype_pollution"],
    "request smuggling": ["http_request_smuggling"],
    "cache poisoning": ["cache_poisoning"],
    "graphql": ["graphql_injection", "graphql_introspection", "graphql_dos"],
    "websocket": ["websocket_security"],
    "nosql": ["nosql_injection"],
    "ldap": ["ldap_injection"],
    "crlf": ["crlf_injection"],
    "mass assignment": ["mass_assignment"],
    "rate limit": ["rate_limit_bypass", "api_rate_limiting"],
}


@dataclass
class MethodologySection:
    """One indexed chunk of a methodology document.

    Instances are created by the loader for every parsed heading (or for a
    whole DB prompt) and stored in a ``MethodologyIndex`` keyed by
    ``fase_id``.
    """

    # Unique key of the section inside an index, e.g. "fase_2" or "section_0".
    fase_id: str
    # Heading text rendered in front of the content.
    title: str
    # Body text of the section.
    content: str
    # "###" sub-heading title -> sub-section body.
    sub_sections: Dict[str, str] = field(default_factory=dict)
    # Vulnerability type identifiers this section is indexed under.
    vuln_types: List[str] = field(default_factory=list)
    # Agent contexts this section is indexed under.
    contexts: List[str] = field(default_factory=list)

    @property
    def char_count(self) -> int:
        """Return the number of characters in ``content``."""
        body = self.content
        return len(body)


class MethodologyIndex:
    """Indexed methodology for fast retrieval by vuln_type and context.

    Attributes:
        sections: ``fase_id`` -> section object.
        vuln_type_index: vulnerability type -> list of ``fase_id`` values.
        context_index: agent context -> list of ``fase_id`` values.
    """

    # Variant suffixes removed to derive a base vulnerability type
    # (e.g. "sqli_union" -> "sqli", "xss_dom" -> "xss").
    _VARIANT_SUFFIXES = (
        "_reflected", "_stored", "_dom",
        "_error", "_union", "_blind", "_time",
    )

    # Separator inserted between the parts of an assembled methodology text.
    _PART_SEPARATOR = "\n\n"

    def __init__(self):
        self.sections: Dict[str, MethodologySection] = {}
        self.vuln_type_index: Dict[str, List[str]] = {}  # vuln_type → [fase_ids]
        self.context_index: Dict[str, List[str]] = {}    # context → [fase_ids]

    def add_section(self, section: "MethodologySection") -> None:
        """Store ``section`` and index it by its vuln types and contexts.

        Adding a section whose ``fase_id`` is already present replaces the
        old section. The old section's index entries are removed first so
        the indices never point a vuln type or context at content that no
        longer carries it, and never list the same ``fase_id`` twice.
        """
        previous = self.sections.get(section.fase_id)
        if previous is not None:
            self._drop_index_entries(self.vuln_type_index, previous.vuln_types, previous.fase_id)
            self._drop_index_entries(self.context_index, previous.contexts, previous.fase_id)

        self.sections[section.fase_id] = section
        for vt in section.vuln_types:
            ids = self.vuln_type_index.setdefault(vt, [])
            if section.fase_id not in ids:
                ids.append(section.fase_id)
        for ctx in section.contexts:
            ids = self.context_index.setdefault(ctx, [])
            if section.fase_id not in ids:
                ids.append(section.fase_id)

    @staticmethod
    def _drop_index_entries(index: Dict[str, List[str]], keys: List[str], fase_id: str) -> None:
        """Remove ``fase_id`` from ``index[key]`` for every key; drop emptied keys."""
        for key in keys:
            ids = index.get(key)
            if not ids:
                continue
            remaining = [fid for fid in ids if fid != fase_id]
            if remaining:
                index[key] = remaining
            else:
                del index[key]

    @classmethod
    def _base_vuln_type(cls, vuln_type: str) -> str:
        """Strip trailing variant suffixes from ``vuln_type``.

        Only true suffixes are removed, so identifiers that merely contain
        one of the suffix strings in the middle (e.g. "session_timeout")
        are returned unchanged.
        """
        base = vuln_type
        stripped = True
        while stripped:
            stripped = False
            for suffix in cls._VARIANT_SUFFIXES:
                # Never strip the identifier down to an empty string.
                if len(base) > len(suffix) and base.endswith(suffix):
                    base = base[:-len(suffix)]
                    stripped = True
        return base

    @staticmethod
    def _fase_sort_key(fase_id: str) -> tuple:
        """Sort key that orders a trailing number numerically.

        Plain string sorting places "fase_10" before "fase_2"; this key keeps
        sections in document order instead.
        """
        match = re.match(r"^(.*?)(\d+)$", fase_id)
        if match:
            return (match.group(1), int(match.group(2)), fase_id)
        return (fase_id, -1, fase_id)

    def get_for_vuln_and_context(
        self,
        vuln_type: str,
        context: str,
        max_chars: int = 2000,
    ) -> str:
        """Get methodology text relevant to both vuln_type and context.

        Candidate sections are those indexed under ``vuln_type`` (or its base
        type once variant suffixes are stripped). When ``context`` is given,
        candidates are narrowed to sections of that context if the
        intersection is non-empty. If no section matches ``vuln_type`` at
        all, every section of the context is used.

        For each candidate the best matching sub-section is preferred over
        the full section body. Parts are joined with a blank line, and the
        returned string, separators included, never exceeds ``max_chars``.

        Returns:
            The assembled text, or ``""`` when nothing matches.
        """
        if not self.sections:
            return ""

        candidate_fase_ids: set = set()

        # Find FASEs matching vuln_type
        if vuln_type:
            # Direct match
            candidate_fase_ids.update(self.vuln_type_index.get(vuln_type, []))
            # Fuzzy match: retry with the variant suffix removed
            base_vt = self._base_vuln_type(vuln_type)
            if base_vt != vuln_type:
                candidate_fase_ids.update(self.vuln_type_index.get(base_vt, []))

        # Filter by context
        if context:
            context_fases = set(self.context_index.get(context, []))
            if candidate_fase_ids:
                # Intersect for precision; if the intersection is empty the
                # vuln_type matches are kept because they are more specific.
                filtered = candidate_fase_ids & context_fases
                if filtered:
                    candidate_fase_ids = filtered
            else:
                # Nothing matched by vuln_type: use all context matches
                candidate_fase_ids = context_fases

        if not candidate_fase_ids:
            return ""

        # Build output, preferring targeted sub-sections
        parts: List[str] = []
        used = 0

        for fase_id in sorted(candidate_fase_ids, key=self._fase_sort_key):
            section = self.sections.get(fase_id)
            if not section:
                continue

            # The separator in front of every part but the first counts
            # against the budget as well.
            sep_cost = len(self._PART_SEPARATOR) if parts else 0
            remaining = max_chars - used - sep_cost
            if remaining < 100:
                break

            # Try to find a targeted sub-section first
            best_sub = self._find_best_subsection(section, vuln_type)

            if best_sub:
                title, content = best_sub
                text = f"### {title}\n{content}"
            else:
                # Use full section content, truncated below if needed
                text = f"### {section.title}\n{section.content}"

            if len(text) > remaining:
                text = text[:remaining]

            if len(text) < 50:
                continue  # Skip tiny fragments

            parts.append(text)
            used += sep_cost + len(text)

        return self._PART_SEPARATOR.join(parts)

    def _find_best_subsection(
        self, section: "MethodologySection", vuln_type: str
    ) -> Optional[tuple]:
        """Find the sub-section most relevant to a vuln_type.

        Scoring per sub-section (highest wins, first one wins ties):
            10 - a specific name of the vuln type appears in the title
             5 - a specific name appears in the first 500 chars of the body
             4 - only the base type (e.g. "sqli" / "sql injection") appears
                 in the title
             2 - only the base type appears in the first 500 chars

        Returns:
            ``(title, content)`` of the best sub-section, or ``None`` when
            nothing scores above zero.
        """
        if not vuln_type or not section.sub_sections:
            return None

        # Common name mappings
        name_map = {
            "sqli": "sql injection",
            "xss_reflected": "reflected xss",
            "xss_stored": "stored xss",
            "xss_dom": "dom xss",
            "lfi": "lfi",
            "rfi": "rfi",
            "ssrf": "ssrf",
            "ssti": "ssti",
            "xxe": "xxe",
            "nosql_injection": "nosql",
            "crlf_injection": "crlf",
            "cors_misconfig": "cors",
            "insecure_deserialization": "deserialization",
            "http_request_smuggling": "request smuggling",
            "cache_poisoning": "cache poisoning",
            "prototype_pollution": "prototype pollution",
        }

        # Specific spellings of the exact vuln type
        vt_lower = vuln_type.lower()
        specific = {
            vt_lower,
            vt_lower.replace("_", " "),
            vt_lower.replace("_", "-"),
        }
        mapped = name_map.get(vt_lower)
        if mapped:
            specific.add(mapped)

        # Generic spellings derived from the base type. Without these the
        # "sqli" mapping can never apply, because real identifiers always
        # carry a variant suffix such as "sqli_union".
        generic = set()
        base = self._base_vuln_type(vt_lower)
        if base != vt_lower:
            generic.add(base)
            base_mapped = name_map.get(base)
            if base_mapped:
                generic.add(base_mapped)
            generic -= specific

        best_score = 0
        best = None

        for sub_title, sub_content in section.sub_sections.items():
            title_lower = sub_title.lower()
            head_lower = sub_content[:500].lower()

            if any(v in title_lower for v in specific):
                score = 10  # Title match is strongest
            elif any(v in head_lower for v in specific):
                score = 5  # Content match
            elif any(v in title_lower for v in generic):
                score = 4  # Sibling-variant title match
            elif any(v in head_lower for v in generic):
                score = 2  # Sibling-variant content match
            else:
                score = 0

            if score > best_score:
                best_score = score
                best = (sub_title, sub_content)

        return best


class MethodologyLoader:
    """Loads and indexes methodology documents from files or DB prompts.

    Documents are split on FASE headings when present, otherwise on generic
    ``##`` headings. Every resulting section is tagged with vulnerability
    types and agent contexts and stored in a ``MethodologyIndex``.
    """

    # Keywords up to this length are treated as acronyms: they only match
    # when not glued to a preceding letter or digit. Plain substring matching
    # tags unrelated words, e.g. "idor" inside "servidor" or "rfi" inside
    # "perfil" / "superfície".
    _ACRONYM_MAX_LEN = 4

    def load_from_file(self, file_path: str) -> "MethodologyIndex":
        """Load a .md methodology file and build an index.

        Returns an empty index (and logs) when the file is missing or cannot
        be read or decoded as UTF-8; it never raises for those conditions.
        """
        # Open directly instead of checking for existence first, so a file
        # removed between the check and the open cannot slip through.
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        except FileNotFoundError:
            logger.warning(f"Methodology file not found: {file_path}")
            return MethodologyIndex()
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeDecodeError and malformed paths.
            logger.error(f"Failed to read methodology file: {e}")
            return MethodologyIndex()

        sections = self._parse_markdown_sections(content)
        index = MethodologyIndex()
        for section in sections:
            index.add_section(section)

        logger.info(
            f"[METHODOLOGY] Loaded {len(sections)} sections from {file_path} "
            f"({sum(s.char_count for s in sections)} chars, "
            f"{len(index.vuln_type_index)} vuln types mapped)"
        )
        return index

    def load_from_db_prompts(self, prompts: List[Dict]) -> "MethodologyIndex":
        """Index database-loaded custom prompts into a MethodologyIndex.

        Each prompt dict is read for ``content``, ``name`` and
        ``parsed_vulnerabilities`` (a list of dicts with a ``type`` key;
        ``None`` and non-dict entries are tolerated and ignored). Prompts
        without content are skipped. A prompt that contains no parseable
        headings becomes a single section available in every context.

        Section ids produced by different prompts can coincide (for example
        every heading-structured prompt yields "section_0"). A colliding id
        is prefixed with the prompt position so that later prompts do not
        overwrite earlier ones.
        """
        index = MethodologyIndex()

        for i, p in enumerate(prompts):
            content = p.get("content", "")
            if not content:
                continue

            # The stored value may be NULL in the database.
            parsed_vulns = p.get("parsed_vulnerabilities") or []

            # Try FASE-based parsing first
            sections = self._parse_markdown_sections(content)

            if not sections:
                # Treat entire content as one section
                vuln_types = [
                    v["type"] for v in parsed_vulns
                    if isinstance(v, dict) and v.get("type")
                ]
                if not vuln_types:
                    vuln_types = self._detect_vuln_types_by_keywords(content)

                section = MethodologySection(
                    fase_id=f"db_prompt_{i}",
                    title=p.get("name") or f"Custom Prompt {i}",
                    content=content,
                    sub_sections={},
                    vuln_types=vuln_types,
                    contexts=["testing", "strategy", "confirmation",
                              "verification", "reporting"],
                )
                sections = [section]

            for section in sections:
                section.fase_id = self._unique_section_id(index, section.fase_id, i)
                index.add_section(section)

        logger.info(
            f"[METHODOLOGY] Indexed {len(index.sections)} sections from "
            f"{len(prompts)} DB prompts"
        )
        return index

    @staticmethod
    def _unique_section_id(index: "MethodologyIndex", fase_id: str, prompt_pos: int) -> str:
        """Return ``fase_id``, or a prompt-qualified variant if it is taken."""
        if fase_id not in index.sections:
            return fase_id
        candidate = f"db_prompt_{prompt_pos}_{fase_id}"
        suffix = 2
        while candidate in index.sections:
            candidate = f"db_prompt_{prompt_pos}_{fase_id}_{suffix}"
            suffix += 1
        return candidate

    def merge_indices(self, *indices: "MethodologyIndex") -> "MethodologyIndex":
        """Merge multiple MethodologyIndex objects into one.

        Indices are processed in argument order; when the same ``fase_id``
        occurs more than once, the first occurrence wins and later ones are
        dropped.
        """
        merged = MethodologyIndex()
        for idx in indices:
            for section in idx.sections.values():
                # Avoid duplicate fase_ids
                if section.fase_id not in merged.sections:
                    merged.add_section(section)
        return merged

    def _parse_markdown_sections(self, content: str) -> List["MethodologySection"]:
        """Parse a markdown document into indexed sections.

        Looks for FASE headings first, falls back to generic ## headings.
        """
        sections = self._parse_fase_sections(content)
        if sections:
            return sections

        # Fallback: parse generic ## headings
        return self._parse_generic_sections(content)

    def _parse_fase_sections(self, content: str) -> List["MethodologySection"]:
        """Parse FASE-structured methodology documents.

        Returns one section per FASE heading, plus a leading "fase_0"
        section when more than 200 characters precede the first heading.
        Returns ``[]`` when the document has no FASE heading.
        """
        # Match ## FASE N: or # FASE N: or ## 🔐 FASE N: (with emoji).
        # Only spaces/tabs are allowed between the parts: with \s the match
        # could run across the line break, turning the first body line of an
        # untitled "## FASE N" heading into its title.
        # NOTE(review): the non-word run after the leading #'s also accepts
        # further '#' characters, so deeper headings such as "### FASE N"
        # match as well - confirm whether that is intended.
        fase_pattern = re.compile(
            r'^(#{1,2})[^\w\n]*FASE[ \t]+(\d+)[ \t]*[:\-]?[ \t]*(.*?)$',
            re.MULTILINE | re.IGNORECASE,
        )

        matches = list(fase_pattern.finditer(content))
        if not matches:
            return []

        sections = []

        # Also capture pre-FASE content (e.g., recon steps before FASE 1)
        if matches[0].start() > 200:
            pre_content = content[:matches[0].start()].strip()
            if pre_content:
                pre_subs = self._extract_sub_sections(pre_content)
                sections.append(MethodologySection(
                    fase_id="fase_0",
                    title="Recon & Preparation",
                    content=pre_content,
                    sub_sections=pre_subs,
                    # Copies keep sections from aliasing the module tables.
                    vuln_types=list(FASE_VULN_TYPE_MAP.get("fase_0", [])),
                    contexts=list(FASE_CONTEXT_MAP.get("fase_0", ["strategy"])),
                ))

        for i, match in enumerate(matches):
            fase_num = match.group(2)
            fase_title = f"FASE {fase_num}: {match.group(3).strip()}"
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(content)
            body = content[start:end].strip()

            # Normalise the number so "FASE 01" maps to "fase_1" like "FASE 1".
            fase_id = f"fase_{int(fase_num)}"
            sub_sections = self._extract_sub_sections(body)
            contexts = list(FASE_CONTEXT_MAP.get(fase_id, ["testing"]))

            if fase_id in FASE_VULN_TYPE_MAP:
                # A mapped FASE keeps its table entry, including a
                # deliberately empty list for the broad FASEs.
                vuln_types = list(FASE_VULN_TYPE_MAP[fase_id])
            else:
                # If not in our hardcoded map, try keyword detection
                vuln_types = self._detect_vuln_types_by_keywords(body)

            sections.append(MethodologySection(
                fase_id=fase_id,
                title=fase_title,
                content=body,
                sub_sections=sub_sections,
                vuln_types=vuln_types,
                contexts=contexts,
            ))

        return sections

    def _parse_generic_sections(self, content: str) -> List["MethodologySection"]:
        """Parse generic ## heading structured documents.

        Each ``##`` heading starts a section with id "section_<position>".
        Vulnerability types are detected from the title plus the first 1000
        characters of the body. Text before the first heading is not
        included in any section.
        """
        # [ \t]+ rather than \s+ so a bare "##" line cannot absorb the
        # following line as its title.
        heading_pattern = re.compile(r'^##[ \t]+(.*?)$', re.MULTILINE)
        matches = list(heading_pattern.finditer(content))

        if not matches:
            return []

        sections = []

        for i, match in enumerate(matches):
            title = match.group(1).strip()
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(content)
            body = content[start:end].strip()

            vuln_types = self._detect_vuln_types_by_keywords(
                title + " " + body[:1000]
            )
            sub_sections = self._extract_sub_sections(body)

            sections.append(MethodologySection(
                fase_id=f"section_{i}",
                title=title,
                content=body,
                sub_sections=sub_sections,
                vuln_types=vuln_types,
                contexts=["testing", "strategy"],
            ))

        return sections

    def _extract_sub_sections(self, body: str) -> Dict[str, str]:
        """Extract ### sub-sections from a section body.

        Returns a mapping of sub-heading title to stripped body text.
        Sub-sections with an empty body are skipped. When a title occurs
        more than once, the bodies are concatenated (blank line between)
        instead of the later one replacing the earlier one.
        """
        # [ \t]+ rather than \s+ so a bare "###" line cannot absorb the
        # following line as its title.
        sub_pattern = re.compile(r'^###[ \t]+(.*?)$', re.MULTILINE)
        sub_matches = list(sub_pattern.finditer(body))
        sub_sections: Dict[str, str] = {}

        for j, sub in enumerate(sub_matches):
            sub_title = sub.group(1).strip()
            sub_end = (
                sub_matches[j + 1].start()
                if j + 1 < len(sub_matches)
                else len(body)
            )
            sub_content = body[sub.end():sub_end].strip()
            if not sub_content:
                continue
            if sub_title in sub_sections:
                sub_sections[sub_title] += "\n\n" + sub_content
            else:
                sub_sections[sub_title] = sub_content

        return sub_sections

    def _detect_vuln_types_by_keywords(self, text: str) -> List[str]:
        """Detect vuln types from text content via keyword matching.

        Matching is case-insensitive. Short acronym keywords (at most
        ``_ACRONYM_MAX_LEN`` characters) must not directly follow a letter
        or digit; longer keywords match as plain substrings so inflected
        forms ("rate limiting", "MySQL injection") are still found.

        Returns:
            The matched vuln types, de-duplicated, in keyword-table order.
        """
        text_lower = text.lower()
        found: List[str] = []
        seen: set = set()

        for keyword, types in KEYWORD_VULN_MAP.items():
            if len(keyword) <= self._ACRONYM_MAX_LEN:
                hit = re.search(
                    r"(?<![a-z0-9])" + re.escape(keyword), text_lower
                ) is not None
            else:
                hit = keyword in text_lower
            if not hit:
                continue
            for vt in types:
                if vt not in seen:
                    found.append(vt)
                    seen.add(vt)

        return found