# NeuroSploit/legacy/backend_fastapi/core/endpoint_classifier.py

"""
NeuroSploit v3 - Endpoint Classifier

Classifies endpoints by function type (auth, upload, api, admin, etc.)
and assigns risk scores + priority vulnerability types for targeted testing.
Replaces linear endpoint iteration with risk-ranked testing order.
"""

import re
import logging
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse, parse_qs

logger = logging.getLogger(__name__)


@dataclass
class EndpointProfile:
    """Classification result for a single endpoint.

    Instances are built by ``EndpointClassifier.classify`` and carry the
    detected endpoint category, its risk score and the vulnerability types
    that should be tested first.
    """
    # URL of the endpoint, stored as it was handed to the classifier.
    url: str
    # Category name: one of the keys of EndpointClassifier.ENDPOINT_TYPES
    # ("auth", "upload", "admin", "api", "search", "data", "redirect",
    # "download") or "generic" when no category matched.
    endpoint_type: str
    # Risk score in the range 0.0 - 1.0 (classify() rounds it to 2 decimals).
    risk_score: float
    # Vulnerability type identifiers to prioritise for this endpoint.
    priority_vulns: List[str] = field(default_factory=list)
    # HTTP method the endpoint was classified with.
    method: str = "GET"
    # Parameter names considered during classification.
    params: List[str] = field(default_factory=list)
    # True when the endpoint was classified as "auth".
    has_auth_indicators: bool = False
    # True when the endpoint was classified as "upload" or "download".
    has_file_handling: bool = False
    # True when the endpoint was classified as "search", "data" or "api".
    has_user_input: bool = False
    # Technology names (keys of EndpointClassifier.TECH_INDICATORS) whose
    # indicators were found in the response headers.
    tech_hints: List[str] = field(default_factory=list)


class EndpointClassifier:
    """Classifies endpoints by function and assigns risk scores.

    Instead of testing all endpoints equally, this module ranks them
    by type and attack potential, directing more testing effort at
    high-value targets (admin panels, auth endpoints, file uploads).

    Classification is heuristic: the lower-cased URL path is
    substring-matched against each type's ``"indicators"`` and parameter
    names are compared with the type's optional ``"param_indicators"``.
    """

    ENDPOINT_TYPES = {
        "auth": {
            "indicators": [
                "/login", "/auth", "/signin", "/sign-in", "/register",
                "/signup", "/sign-up", "/password", "/forgot", "/reset",
                "/logout", "/signout", "/sign-out", "/oauth", "/sso",
                "/token", "/session", "/2fa", "/mfa", "/verify",
                "/activate", "/confirm", "/callback",
            ],
            "priority_vulns": [
                "auth_bypass", "brute_force", "weak_password",
                "credential_stuffing", "session_fixation",
                "jwt_manipulation", "broken_authentication",
            ],
            "risk_weight": 0.90,
        },
        "upload": {
            "indicators": [
                "/upload", "/file", "/import", "/attachment", "/media",
                "/image", "/avatar", "/photo", "/document", "/asset",
                "/resource", "/blob", "/storage",
            ],
            "priority_vulns": [
                "file_upload", "xxe", "path_traversal",
                "arbitrary_file_read", "rfi", "command_injection",
            ],
            "risk_weight": 0.85,
        },
        "admin": {
            "indicators": [
                "/admin", "/manage", "/dashboard", "/panel", "/console",
                "/control", "/backend", "/cms", "/wp-admin", "/administrator",
                "/phpmyadmin", "/cpanel", "/webadmin", "/sysadmin",
                "/management", "/internal", "/debug", "/monitoring",
            ],
            "priority_vulns": [
                "auth_bypass", "privilege_escalation", "default_credentials",
                "exposed_admin_panel", "idor", "bfla",
            ],
            "risk_weight": 0.95,
        },
        "api": {
            "indicators": [
                "/api/", "/v1/", "/v2/", "/v3/", "/graphql", "/rest/",
                "/json", "/xml", "/soap", "/rpc", "/grpc", "/webhook",
                "/ws/", "/websocket",
            ],
            "priority_vulns": [
                "idor", "bola", "bfla", "jwt_manipulation",
                "mass_assignment", "sqli_error", "nosql_injection",
                "api_rate_limiting", "broken_authentication",
            ],
            "risk_weight": 0.80,
        },
        "search": {
            "indicators": [
                "/search", "/query", "/find", "/lookup", "/filter",
                "/browse", "/explore", "/autocomplete", "/suggest",
            ],
            "param_indicators": ["q", "query", "search", "keyword", "s", "term"],
            "priority_vulns": [
                "sqli_error", "sqli_blind", "xss_reflected",
                "nosql_injection", "ssti", "xss_stored",
            ],
            "risk_weight": 0.75,
        },
        "data": {
            "indicators": [
                "/users", "/accounts", "/orders", "/profile", "/settings",
                "/preferences", "/billing", "/payment", "/invoice",
                "/transaction", "/subscription", "/notification",
                "/message", "/comment", "/post", "/article",
            ],
            "priority_vulns": [
                "idor", "bola", "mass_assignment",
                "data_exposure", "information_disclosure",
                "privilege_escalation",
            ],
            "risk_weight": 0.75,
        },
        "redirect": {
            "indicators": [
                "/redirect", "/goto", "/out", "/external", "/link",
                "/url", "/forward", "/return", "/next", "/continue",
            ],
            "param_indicators": ["url", "redirect", "next", "return", "goto", "dest"],
            "priority_vulns": [
                "open_redirect", "ssrf", "ssrf_cloud",
            ],
            "risk_weight": 0.70,
        },
        "download": {
            "indicators": [
                "/download", "/export", "/report", "/generate",
                "/pdf", "/csv", "/xlsx", "/print",
            ],
            "priority_vulns": [
                "lfi", "path_traversal", "arbitrary_file_read",
                "ssrf", "command_injection", "ssti",
            ],
            "risk_weight": 0.80,
        },
    }

    # Indicators searched for (as substrings) in the lower-cased
    # "name: value" rendering of the response headers.
    TECH_INDICATORS = {
        "php": ["x-powered-by: php", ".php", "phpsessid"],
        "asp": ["x-powered-by: asp", "x-aspnet-version", ".aspx", ".asp"],
        "java": ["x-powered-by: servlet", "jsessionid", ".jsp", ".do", ".action"],
        "python": ["x-powered-by: flask", "x-powered-by: django", "csrftoken"],
        "node": ["x-powered-by: express", "connect.sid"],
        "ruby": ["x-powered-by: phusion", "_session_id", ".rb"],
        "wordpress": ["wp-", "wordpress", "wp-content", "wp-json"],
        "drupal": ["drupal", "sites/default"],
        "joomla": ["joomla", "administrator/index.php"],
    }

    # Fallback classification used when no endpoint type matches.
    _GENERIC_TYPE = "generic"
    _GENERIC_SCORE = 0.30
    _GENERIC_VULNS = ("xss_reflected", "sqli_error")

    # Methods that earn a small risk boost in classify().
    _BOOSTED_METHODS = frozenset({"POST", "PUT", "PATCH", "DELETE"})

    def classify(self, url: str, method: str = "GET",
                 params: Optional[List[str]] = None,
                 response_headers: Optional[Dict] = None) -> "EndpointProfile":
        """Classify a single endpoint and return its profile.

        Args:
            url: Endpoint URL. Only its path and query string are inspected.
            method: HTTP method; compared case-insensitively.
            params: Parameter names. When empty or ``None`` the names are
                taken from the URL's query string instead.
            response_headers: Mapping of response header names to values,
                used only for technology detection.

        Returns:
            An ``EndpointProfile`` with the best-matching type, a risk score
            in ``[0.0, 1.0]`` rounded to two decimals, and tech hints.
        """
        parsed = urlparse(url or "")
        path = parsed.path.lower()
        params = self._resolve_params(parsed.query, params)

        best_type, best_score, best_vulns = self._match_endpoint_type(path, params)

        # Boost for dangerous methods. Normalise the case so that "post"
        # is treated the same as "POST".
        if str(method or "").upper() in self._BOOSTED_METHODS:
            best_score = min(1.0, best_score + 0.05)

        # Boost for parameters (more params = more attack surface),
        # capped at +0.10.
        if params:
            best_score = min(1.0, best_score + min(0.10, len(params) * 0.02))

        return EndpointProfile(
            url=url,
            endpoint_type=best_type,
            risk_score=round(best_score, 2),
            priority_vulns=best_vulns,
            method=method,
            params=params,
            has_auth_indicators=best_type == "auth",
            has_file_handling=best_type in ("upload", "download"),
            has_user_input=best_type in ("search", "data", "api"),
            tech_hints=self._detect_tech_hints(response_headers),
        )

    def _resolve_params(self, query: str, params) -> List[str]:
        """Return explicit *params* as a list, else names parsed from *query*."""
        if params:
            return list(params)
        # keep_blank_values: "?q=" still exposes a parameter named "q";
        # parse_qs would silently drop it by default.
        return list(parse_qs(query, keep_blank_values=True))

    def _match_endpoint_type(self, path: str,
                             params: List[str]) -> Tuple[str, float, List[str]]:
        """Return ``(type, score, priority_vulns)`` for the best-scoring type.

        A path indicator match scores the type's full ``risk_weight``; a
        parameter-name match scores 90% of it. Ties keep the type that
        appears first in ``ENDPOINT_TYPES``.
        """
        best_type = self._GENERIC_TYPE
        best_score = self._GENERIC_SCORE
        best_vulns = list(self._GENERIC_VULNS)

        param_names = [str(p).lower() for p in params]

        for etype, config in self.ENDPOINT_TYPES.items():
            weight = config["risk_weight"]
            score = 0.0

            # Path-based matching
            if any(indicator in path for indicator in config["indicators"]):
                score = max(score, weight)

            # Parameter-based matching
            param_indicators = config.get("param_indicators", [])
            if param_indicators and any(p in param_indicators for p in param_names):
                score = max(score, weight * 0.9)

            if score > best_score:
                best_type = etype
                best_score = score
                best_vulns = list(config["priority_vulns"])

        return best_type, best_score, best_vulns

    def _detect_tech_hints(self, response_headers) -> List[str]:
        """Return technologies whose indicators appear in *response_headers*."""
        if not response_headers:
            return []

        items = getattr(response_headers, "items", None)
        if callable(items):
            # Render as "name: value" lines. The dict repr would put quotes
            # between name and value, so indicators such as
            # "x-powered-by: php" could never match it.
            lines = []
            for name, value in items():
                values = value if isinstance(value, (list, tuple)) else (value,)
                lines.extend(f"{name}: {v}" for v in values)
            haystack = "\n".join(lines)
        else:
            # Not a mapping (e.g. a raw header string): search it as text.
            haystack = str(response_headers)
        haystack = haystack.lower()

        return [
            tech
            for tech, indicators in self.TECH_INDICATORS.items()
            if any(indicator in haystack for indicator in indicators)
        ]

    def rank_endpoints(self, endpoints: List[Dict]) -> List[Tuple[Dict, float]]:
        """Rank a list of endpoints by risk score.

        Each endpoint dict is modified in place: its classification is
        stored under the ``"_profile"`` key.

        Args:
            endpoints: List of dicts with at minimum 'url' (or 'endpoint')
                       key, optionally 'method', 'params', and 'headers'
                       (or 'response_headers').

        Returns:
            List of (endpoint_dict, risk_score) sorted by risk descending.
        """
        ranked = []
        for ep in endpoints:
            # Fall back to "endpoint" when "url" is missing or empty.
            url = ep.get("url") or ep.get("endpoint") or ""
            method = ep.get("method", "GET")
            params = ep.get("params", [])
            headers = ep.get("headers", ep.get("response_headers", {}))

            profile = self.classify(url, method, params, headers)
            ep["_profile"] = profile
            ranked.append((ep, profile.risk_score))

        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked

    def get_endpoint_vuln_priorities(self, url: str, method: str = "GET",
                                      params: Optional[List[str]] = None) -> List[str]:
        """Return vuln types most likely to succeed on this endpoint."""
        profile = self.classify(url, method, params)
        return profile.priority_vulns

    def get_high_risk_endpoints(self, endpoints: List[Dict],
                                 threshold: float = 0.7) -> List[Dict]:
        """Filter endpoints to only high-risk ones.

        Returns the endpoint dicts whose risk score is at least *threshold*,
        ordered by risk descending.
        """
        ranked = self.rank_endpoints(endpoints)
        return [ep for ep, score in ranked if score >= threshold]

    def get_endpoint_test_budget(self, risk_score: float,
                                  base_types: int = 5) -> int:
        """Return how many vuln types to test based on risk score.

        High-risk endpoints get more testing effort. The figures in the
        comments below assume the default ``base_types`` of 5.
        """
        if risk_score >= 0.90:
            return base_types * 3     # 15 types for admin/auth
        elif risk_score >= 0.80:
            return base_types * 2     # 10 types for api/upload
        elif risk_score >= 0.70:
            return int(base_types * 1.5)  # 7 types for search/data
        return base_types              # 5 types for generic