mirror of
https://github.com/CyberSecurityUP/NeuroSploit.git
synced 2026-03-20 09:13:37 +00:00
NeuroSploit v3.2 - Autonomous AI Penetration Testing Platform
116 modules | 100 vuln types | 18 API routes | 18 frontend pages Major features: - VulnEngine: 100 vuln types, 526+ payloads, 12 testers, anti-hallucination prompts - Autonomous Agent: 3-stream auto pentest, multi-session (5 concurrent), pause/resume/stop - CLI Agent: Claude Code / Gemini CLI / Codex CLI inside Kali containers - Validation Pipeline: negative controls, proof of execution, confidence scoring, judge - AI Reasoning: ReACT engine, token budget, endpoint classifier, CVE hunter, deep recon - Multi-Agent: 5 specialists + orchestrator + researcher AI + vuln type agents - RAG System: BM25/TF-IDF/ChromaDB vectorstore, few-shot, reasoning templates - Smart Router: 20 providers (8 CLI OAuth + 12 API), tier failover, token refresh - Kali Sandbox: container-per-scan, 56 tools, VPN support, on-demand install - Full IA Testing: methodology-driven comprehensive pentest sessions - Notifications: Discord, Telegram, WhatsApp/Twilio multi-channel alerts - Frontend: React/TypeScript with 18 pages, real-time WebSocket updates
This commit is contained in:
457
backend/core/cli_output_parser.py
Normal file
457
backend/core/cli_output_parser.py
Normal file
@@ -0,0 +1,457 @@
|
||||
"""
|
||||
CLI Output Parser - 3-tier finding extraction from CLI agent output.
|
||||
|
||||
Tier 1: JSON marker blocks (===FINDING_START=== / ===FINDING_END===)
|
||||
Tier 2: Regex patterns for known tool output formats (nuclei, nmap, sqlmap)
|
||||
Tier 3: AI-assisted extraction via LLM for unstructured text
|
||||
"""
|
||||
import json
|
||||
import re
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Optional, Set
|
||||
|
||||
logger = logging.getLogger(__name__)

# JSON finding markers used in CLI instructions; agents are instructed to wrap
# each structured finding between these sentinels so Tier 1 can extract them.
FINDING_START = "===FINDING_START==="
FINDING_END = "===FINDING_END==="

# Progress markers emitted by the CLI agent to report pentest phase/state.
PHASE_PATTERN = re.compile(r'\[PHASE\]\s*(.+)', re.IGNORECASE)
COMPLETE_PATTERN = re.compile(r'\[COMPLETE\]', re.IGNORECASE)
# NOTE(review): PROGRESS_PATTERN is not referenced in this module — presumably
# consumed by a caller elsewhere; confirm before removing.
PROGRESS_PATTERN = re.compile(r'\[PROGRESS\]\s*(\d+)%?\s*(.*)', re.IGNORECASE)

# Severity keywords for regex extraction (maps aliases to canonical levels).
SEVERITY_MAP = {
    "critical": "critical", "crit": "critical",
    "high": "high",
    "medium": "medium", "med": "medium",
    "low": "low",
    "info": "info", "informational": "info",
}

# Nuclei JSONL output pattern: one JSON object per line that contains both the
# "template-id" and "matched-at" keys ('.' does not cross newlines, so the
# MULTILINE anchors keep each match on a single line).
NUCLEI_JSON_PATTERN = re.compile(r'^\{.*"template-id".*"matched-at".*\}$', re.MULTILINE)

# Generic vulnerability patterns in CLI output.
# NOTE: the capture-group layout differs per pattern — pattern 1 captures
# (title, severity?), pattern 2 captures (parameter,), and pattern 3 captures
# (severity, template-id, url). Consumers must not assume a uniform layout.
VULN_PATTERNS = [
    # [VULNERABILITY] Title - Severity
    re.compile(
        r'\[(?:VULNERABILITY|VULN|FINDING|ALERT)\]\s*(.+?)(?:\s*[-–]\s*(critical|high|medium|low|info))?$',
        re.IGNORECASE | re.MULTILINE
    ),
    # SQLMap style: Parameter 'X' is vulnerable
    re.compile(
        r"(?:Parameter|Param)\s+['\"]?(\w+)['\"]?\s+(?:is|appears)\s+(?:vulnerable|injectable)",
        re.IGNORECASE
    ),
    # Nuclei text: [severity] [template-id] URL
    re.compile(
        r'\[(critical|high|medium|low|info)\]\s*\[([^\]]+)\]\s*(https?://\S+)',
        re.IGNORECASE
    ),
]
|
||||
|
||||
|
||||
@dataclass
class ParsedFinding:
    """A single finding extracted from CLI agent output.

    Produced by any of the three parser tiers (JSON marker blocks, regex
    patterns, AI extraction) and normalized into the platform's finding
    schema via :meth:`to_dict`.
    """
    title: str
    severity: str = "medium"          # critical|high|medium|low|info
    vulnerability_type: str = ""      # canonical type key; inferred from title if empty
    endpoint: str = ""                # affected URL
    parameter: str = ""               # affected parameter, if any
    evidence: str = ""                # raw proof (response snippet, tool output line)
    poc_code: str = ""                # command/request that reproduces the finding
    request: str = ""
    response: str = ""
    impact: str = ""
    cvss_score: Optional[float] = None
    source: str = "cli_agent"

    def to_dict(self) -> Dict:
        """Serialize into the finding schema consumed by the rest of the platform.

        The vulnerability type is inferred from the title when not set
        explicitly; ``cvss_score`` is included only when a score was parsed.
        """
        d = {
            "title": self.title,
            "severity": self.severity,
            "vulnerability_type": self.vulnerability_type or self._infer_vuln_type(),
            "affected_endpoint": self.endpoint,
            "parameter": self.parameter,
            "evidence": self.evidence,
            "poc_code": self.poc_code,
            "request": self.request,
            "response": self.response,
            "impact": self.impact,
            "source": self.source,
            "ai_status": "confirmed",
            "ai_verified": True,
            "confidence_score": 70,
        }
        # `is not None` (not truthiness) so a legitimate CVSS score of 0.0 is kept.
        if self.cvss_score is not None:
            d["cvss_score"] = self.cvss_score
        return d

    def _infer_vuln_type(self) -> str:
        """Infer vulnerability type from title keywords.

        Keywords are matched longest-first so specific phrases beat the
        substrings they contain (e.g. "stored xss" must win over "xss" —
        otherwise the xss_stored/xss_dom entries would be unreachable).
        """
        title_lower = self.title.lower()
        type_map = {
            "sql injection": "sqli_error", "sqli": "sqli_error",
            "xss": "xss_reflected", "cross-site scripting": "xss_reflected",
            "stored xss": "xss_stored", "dom xss": "xss_dom",
            "command injection": "command_injection", "rce": "command_injection",
            "ssrf": "ssrf", "server-side request": "ssrf",
            "lfi": "lfi", "local file": "lfi", "path traversal": "path_traversal",
            "rfi": "rfi", "remote file": "rfi",
            "xxe": "xxe", "xml external": "xxe",
            "ssti": "ssti", "template injection": "ssti",
            "csrf": "csrf", "cross-site request": "csrf",
            "idor": "idor", "insecure direct": "idor",
            "open redirect": "open_redirect",
            "file upload": "file_upload",
            "directory listing": "directory_listing",
            "information disclosure": "information_disclosure",
            "sensitive data": "sensitive_data_exposure",
            "security header": "security_headers",
            "ssl": "ssl_issues", "tls": "ssl_issues",
            "cors": "cors_misconfig",
            "crlf": "crlf_injection",
            "nosql": "nosql_injection",
            "ldap": "ldap_injection",
            "jwt": "jwt_manipulation",
            "auth bypass": "auth_bypass",
            "brute force": "brute_force",
            "rate limit": "rate_limit_bypass",
            "clickjacking": "clickjacking",
            "http smuggling": "http_smuggling",
            "cache poison": "cache_poisoning",
            "deserialization": "insecure_deserialization",
            "prototype pollution": "prototype_pollution",
            "graphql": "graphql_injection",
            "host header": "host_header_injection",
            "race condition": "race_condition",
            "business logic": "business_logic",
        }
        # Longest keyword first: most specific match wins.
        for keyword in sorted(type_map, key=len, reverse=True):
            if keyword in title_lower:
                return type_map[keyword]
        return "unknown"
|
||||
|
||||
|
||||
class CLIOutputParser:
    """3-tier output parser for CLI agent findings.

    Tier 1 extracts JSON blocks wrapped in FINDING_START/FINDING_END markers
    (buffering a partial trailing block across streamed chunks), Tier 2
    applies regex patterns for known tool formats (plus Nuclei JSONL), and
    any chunk that matched nothing is accumulated for Tier 3 AI extraction
    (see ai_extract_findings). The parser also tracks [PHASE]/[COMPLETE]
    progress markers and deduplicates findings by (title, endpoint, severity).
    """

    # Cap on carried-over partial JSON so a missing FINDING_END marker cannot
    # make the buffer grow without bound over a long-running scan.
    _MAX_BUFFER_CHARS = 64 * 1024

    def __init__(self):
        self._seen_finding_hashes: Set[str] = set()
        self._buffer = ""  # Accumulates a partial JSON marker block across chunks
        self._unparsed_chunks: List[str] = []
        self._total_findings = 0
        self._phases_seen: List[str] = []
        self._is_complete = False

    def parse_chunk(self, text: str) -> List[ParsedFinding]:
        """Parse a chunk of CLI output. Returns newly extracted, deduplicated findings."""
        if not text or not text.strip():
            return []

        findings: List[ParsedFinding] = []

        # Track progress markers
        for m in PHASE_PATTERN.finditer(text):
            phase = m.group(1).strip()
            if phase not in self._phases_seen:
                self._phases_seen.append(phase)
                logger.info(f"[CLI-PARSER] Phase: {phase}")

        if COMPLETE_PATTERN.search(text):
            self._is_complete = True

        # Tier 1: JSON marker blocks (prepend the buffered partial block, if any)
        combined = self._buffer + text
        tier1 = self._extract_json_markers(combined)
        findings.extend(tier1)

        # Tier 2: Regex patterns for known tool output formats
        tier2 = self._extract_regex_findings(text)
        findings.extend(tier2)

        # Tier 2b: Nuclei JSONL
        tier2b = self._extract_nuclei_jsonl(text)
        findings.extend(tier2b)

        # Track unparsed text for later AI extraction (Tier 3); skip trivially
        # short chunks that cannot contain a meaningful finding.
        if not tier1 and not tier2 and not tier2b:
            if len(text.strip()) > 50:
                self._unparsed_chunks.append(text)

        # Deduplicate across the whole session by (title, endpoint, severity)
        unique = []
        for f in findings:
            h = f"{f.title}|{f.endpoint}|{f.severity}"
            if h not in self._seen_finding_hashes:
                self._seen_finding_hashes.add(h)
                unique.append(f)
                self._total_findings += 1

        return unique

    def get_unparsed_text(self, clear: bool = True) -> str:
        """Return accumulated unparsed text for Tier 3 AI extraction.

        Args:
            clear: When True (default), the internal accumulator is emptied
                so subsequent calls only see new chunks.
        """
        text = "\n".join(self._unparsed_chunks)
        if clear:
            self._unparsed_chunks = []
        return text

    @property
    def is_complete(self) -> bool:
        """True once a [COMPLETE] marker has been observed."""
        return self._is_complete

    @property
    def phases(self) -> List[str]:
        """Phase names seen so far, in first-seen order."""
        return self._phases_seen

    @property
    def total_findings(self) -> int:
        """Count of unique findings extracted across all chunks."""
        return self._total_findings

    def _extract_json_markers(self, text: str) -> List[ParsedFinding]:
        """Tier 1: Extract findings from ===FINDING_START=== / ===FINDING_END=== blocks.

        A trailing block with no end marker yet is saved to ``self._buffer``
        so the next chunk can complete it (subject to _MAX_BUFFER_CHARS).
        """
        findings = []
        remaining_buffer = ""

        # Find all complete blocks
        parts = text.split(FINDING_START)
        for i, part in enumerate(parts):
            if i == 0:
                continue  # Text before the first marker

            if FINDING_END in part:
                json_text, _after = part.split(FINDING_END, 1)
                json_text = json_text.strip()
                try:
                    data = json.loads(json_text)
                    f = self._json_to_finding(data)
                    if f:
                        findings.append(f)
                except json.JSONDecodeError:
                    # LLM agents frequently emit almost-valid JSON; try to repair it
                    fixed = self._try_fix_json(json_text)
                    if fixed:
                        f = self._json_to_finding(fixed)
                        if f:
                            findings.append(f)
                    else:
                        logger.debug(f"[CLI-PARSER] Invalid JSON in marker block: {json_text[:100]}")
            else:
                # Incomplete block - save to buffer for the next chunk
                remaining_buffer = FINDING_START + part

        # If the partial block exceeds the cap, the end marker is clearly never
        # coming — drop it rather than leak memory for the rest of the scan.
        if len(remaining_buffer) > self._MAX_BUFFER_CHARS:
            logger.debug("[CLI-PARSER] Discarding oversized partial finding block")
            remaining_buffer = ""
        self._buffer = remaining_buffer
        return findings

    def _extract_regex_findings(self, text: str) -> List[ParsedFinding]:
        """Tier 2: Extract findings using regex patterns.

        Each pattern in VULN_PATTERNS has its own capture-group layout, so
        each is handled explicitly rather than through a generic
        (title, severity, endpoint) assumption.
        """
        findings = []

        # Pattern 1: "[VULNERABILITY] Title - Severity" -> groups (title, severity?)
        for match in VULN_PATTERNS[0].finditer(text):
            title = (match.group(1) or "").strip()
            # Skip very short or generic titles
            if len(title) < 5 or title.lower() in ("n/a", "none", "test"):
                continue
            sev = (match.group(2) or "").lower().strip()
            findings.append(ParsedFinding(
                title=title,
                severity=SEVERITY_MAP.get(sev, "medium"),
                evidence=match.group(0),
            ))

        # Pattern 2: SQLMap "Parameter 'X' is vulnerable" -> groups (parameter,).
        # The single group is the parameter name, not a title, so build a
        # descriptive title around it (a bare name like "id" would otherwise be
        # discarded by the length filter).
        for match in VULN_PATTERNS[1].finditer(text):
            param = match.group(1).strip()
            findings.append(ParsedFinding(
                title=f"SQL Injection - parameter '{param}'",
                severity="high",
                vulnerability_type="sqli_error",
                parameter=param,
                evidence=match.group(0),
            ))

        # Pattern 3: Nuclei text "[severity] [template-id] URL" -> groups
        # (severity, template-id, url) — NOT (title, severity, endpoint).
        for match in VULN_PATTERNS[2].finditer(text):
            sev, template_id, url = match.groups()
            findings.append(ParsedFinding(
                title=f"[Nuclei] {template_id.strip()}",
                severity=SEVERITY_MAP.get(sev.lower(), "medium"),
                vulnerability_type=self._nuclei_to_vuln_type(template_id),
                endpoint=url.strip(),
                evidence=match.group(0),
            ))

        return findings

    def _extract_nuclei_jsonl(self, text: str) -> List[ParsedFinding]:
        """Tier 2b: Extract findings from Nuclei JSONL output (one object per line)."""
        findings = []

        for match in NUCLEI_JSON_PATTERN.finditer(text):
            try:
                data = json.loads(match.group(0))
                template_id = data.get("template-id", "")
                matched_at = data.get("matched-at", "")
                info = data.get("info", {})
                severity = info.get("severity", "medium").lower()
                name = info.get("name", template_id)
                description = info.get("description", "")

                findings.append(ParsedFinding(
                    title=f"[Nuclei] {name}",
                    severity=SEVERITY_MAP.get(severity, "medium"),
                    vulnerability_type=self._nuclei_to_vuln_type(template_id),
                    endpoint=matched_at,
                    evidence=f"Template: {template_id}\n{description}",
                    poc_code=f"nuclei -t {template_id} -u {matched_at}",
                ))
            except json.JSONDecodeError:
                # A line that merely looked like Nuclei JSON; ignore it
                continue

        return findings

    def _json_to_finding(self, data: Dict) -> Optional[ParsedFinding]:
        """Convert a JSON dict to ParsedFinding. Returns None if no title."""
        title = data.get("title", "").strip()
        if not title:
            return None

        # Normalize severity through the alias map, then validate the result
        severity = data.get("severity", "medium").lower()
        severity = SEVERITY_MAP.get(severity, severity)
        if severity not in ("critical", "high", "medium", "low", "info"):
            severity = "medium"

        return ParsedFinding(
            title=title,
            severity=severity,
            vulnerability_type=data.get("vulnerability_type", ""),
            endpoint=data.get("endpoint", data.get("affected_endpoint", "")),
            parameter=data.get("parameter", ""),
            evidence=data.get("evidence", ""),
            poc_code=data.get("poc_code", data.get("poc", "")),
            request=data.get("request", ""),
            response=data.get("response", ""),
            impact=data.get("impact", ""),
            cvss_score=data.get("cvss_score"),
        )

    @staticmethod
    def _try_fix_json(text: str) -> Optional[Dict]:
        """Try to repair common JSON issues (trailing commas, missing braces).

        Returns the parsed dict on success, None if the text is unrecoverable.
        """
        # Remove trailing commas before closing brackets
        fixed = re.sub(r',\s*}', '}', text)
        fixed = re.sub(r',\s*]', ']', fixed)
        try:
            return json.loads(fixed)
        except json.JSONDecodeError:
            pass
        # Try wrapping bare key/value pairs in braces
        if not fixed.startswith('{'):
            try:
                return json.loads('{' + fixed + '}')
            except json.JSONDecodeError:
                pass
        return None

    @staticmethod
    def _nuclei_to_vuln_type(template_id: str) -> str:
        """Map a nuclei template ID (substring match) to a vulnerability type key."""
        tid = template_id.lower()
        mappings = {
            "sqli": "sqli_error", "sql-injection": "sqli_error",
            "xss": "xss_reflected", "cross-site-scripting": "xss_reflected",
            "ssrf": "ssrf", "server-side-request": "ssrf",
            "lfi": "lfi", "local-file": "lfi",
            "rfi": "rfi", "remote-file": "rfi",
            "rce": "command_injection", "command-injection": "command_injection",
            "ssti": "ssti", "template-injection": "ssti",
            "xxe": "xxe", "xml-external": "xxe",
            "redirect": "open_redirect",
            "cors": "cors_misconfig",
            "crlf": "crlf_injection",
            "csrf": "csrf",
            "header-injection": "header_injection",
            "directory-listing": "directory_listing",
            "info-disclosure": "information_disclosure",
            "exposure": "sensitive_data_exposure",
            "ssl": "ssl_issues", "tls": "ssl_issues",
            "default-login": "default_credentials",
            "misconfig": "security_headers",
        }
        for key, vtype in mappings.items():
            if key in tid:
                return vtype
        return "unknown"
|
||||
|
||||
|
||||
# AI-assisted extraction prompt template (Tier 3). Filled in with the raw CLI
# output via str.format(output=...); the literal braces in the JSON example are
# doubled ({{ }}) so .format() leaves them intact.
AI_EXTRACT_PROMPT = """Analyze this penetration testing CLI output and extract any CONFIRMED vulnerability findings.

IMPORTANT: Only extract findings where there is clear evidence of a vulnerability (error messages,
data leakage, successful exploitation). Do NOT extract theoretical or untested issues.

CLI Output:
{output}

For each confirmed finding, provide:
- title: concise vulnerability name
- severity: critical|high|medium|low|info
- vulnerability_type: e.g., sqli_error, xss_reflected, ssrf, command_injection, etc.
- endpoint: the affected URL
- parameter: affected parameter (if applicable)
- evidence: the actual proof (HTTP response, error, data leaked)
- poc_code: the command or request that confirmed it

Respond ONLY with valid JSON:
{{"findings": [{{"title": "...", "severity": "...", "vulnerability_type": "...", "endpoint": "...", "parameter": "...", "evidence": "...", "poc_code": "..."}}]}}

If no confirmed findings, respond: {{"findings": []}}"""
|
||||
|
||||
|
||||
async def ai_extract_findings(text: str, llm, max_chars: int = 8000) -> List[ParsedFinding]:
    """Tier 3: AI-assisted extraction of findings from unstructured CLI output.

    Args:
        text: Accumulated unparsed CLI output (see CLIOutputParser.get_unparsed_text).
        llm: Provider exposing an async ``generate(prompt=..., system=..., max_tokens=...)``
            method that returns the model response as a string.
        max_chars: The output is truncated to this many characters before prompting.

    Returns:
        Findings the model marked as confirmed; empty list on any failure —
        extraction is best-effort and must never break the scan pipeline.
    """
    # Too little text to contain a real finding; also skips the LLM call entirely.
    if not text or len(text.strip()) < 100:
        return []

    # Truncate to keep the prompt within the model's context budget
    if len(text) > max_chars:
        text = text[:max_chars] + "\n... [truncated]"

    prompt = AI_EXTRACT_PROMPT.format(output=text)

    try:
        response = await llm.generate(
            prompt=prompt,
            system="You are a security finding extractor. Extract only confirmed vulnerabilities with real evidence.",
            max_tokens=2000,
        )

        if not response:
            return []

        # Extract the JSON object from the (possibly chatty) model response
        json_match = re.search(r'\{.*"findings".*\}', response, re.DOTALL)
        if not json_match:
            return []

        data = json.loads(json_match.group(0))
        findings_data = data.get("findings", [])

        findings = []
        for fd in findings_data:
            if not fd.get("title"):
                continue
            # Normalize/validate severity the same way the Tier 1 parser does,
            # instead of trusting the model's casing and vocabulary verbatim.
            severity = SEVERITY_MAP.get(str(fd.get("severity", "medium")).lower(), "medium")
            findings.append(ParsedFinding(
                title=fd["title"],
                severity=severity,
                vulnerability_type=fd.get("vulnerability_type", ""),
                endpoint=fd.get("endpoint", fd.get("affected_endpoint", "")),
                parameter=fd.get("parameter", ""),
                evidence=fd.get("evidence", ""),
                poc_code=fd.get("poc_code", ""),
            ))

        logger.info(f"[CLI-PARSER] AI extracted {len(findings)} findings")
        return findings

    except Exception as e:
        # Best-effort tier: log and degrade to "no findings" rather than crash.
        logger.warning(f"[CLI-PARSER] AI extraction failed: {e}")
        return []
|
||||
Reference in New Issue
Block a user