NeuroSploit v3.2 - Autonomous AI Penetration Testing Platform

116 modules | 100 vuln types | 18 API routes | 18 frontend pages Major features: - VulnEngine: 100 vuln types, 526+ payloads, 12 testers, anti-hallucination prompts - Autonomous Agent: 3-stream auto pentest, multi-session (5 concurrent), pause/resume/stop - CLI Agent: Claude Code / Gemini CLI / Codex CLI inside Kali containers - Validation Pipeline: negative controls, proof of execution, confidence scoring, judge - AI Reasoning: ReACT engine, token budget, endpoint classifier, CVE hunter, deep recon - Multi-Agent: 5 specialists + orchestrator + researcher AI + vuln type agents - RAG System: BM25/TF-IDF/ChromaDB vectorstore, few-shot, reasoning templates - Smart Router: 20 providers (8 CLI OAuth + 12 API), tier failover, token refresh - Kali Sandbox: container-per-scan, 56 tools, VPN support, on-demand install - Full IA Testing: methodology-driven comprehensive pentest sessions - Notifications: Discord, Telegram, WhatsApp/Twilio multi-channel alerts - Frontend: React/TypeScript with 18 pages, real-time WebSocket updates
2026-06-30 07:15:30 +02:00 · 2026-02-22 17:58:12 -03:00
commit e0935793c5
271 changed files with 132462 additions and 0 deletions
@@ -0,0 +1,444 @@
+"""
+NeuroSploit v3 - Knowledge Processor
+
+Pipeline: Upload → Extract Text → AI Summarize → Index by Vuln Type → Store.
+Processes bug bounty papers, CVE documents, writeups, and lab reports
+into structured knowledge the agent uses during testing.
+"""
+import json
+import re
+import uuid
+import shutil
+from pathlib import Path
+from datetime import datetime
+from typing import List, Dict, Optional, Any
+import logging
+
+logger = logging.getLogger(__name__)
+
+# Optional PDF support
+try:
+    from PyPDF2 import PdfReader
+    HAS_PYPDF2 = True
+except ImportError:
+    HAS_PYPDF2 = False
+
+# Root of the custom knowledge store (a relative path, so it resolves against
+# the process working directory).
+KNOWLEDGE_DIR = Path("data/custom-knowledge")
+# Raw uploaded files; process_upload names them "<doc_id>_<sanitized filename>".
+UPLOADS_DIR = KNOWLEDGE_DIR / "uploads"
+# JSON index holding the document entries and the vuln-type -> doc-id mapping.
+INDEX_FILE = KNOWLEDGE_DIR / "index.json"
+
+# Lowercase file extensions (with leading dot) accepted by process_upload.
+SUPPORTED_FORMATS = {".pdf", ".md", ".txt", ".html", ".htm"}
+
+# Standard vuln type keywords for classification.
+# Keys are the canonical vuln-type identifiers: _ai_analyze accepts only these
+# keys from the LLM, and they are the keys used in the index's vuln_type_index.
+# Values are lowercase phrases; _keyword_analyze assigns a type to a document
+# when any of its phrases is found in the lowercased document text.
+VULN_KEYWORDS = {
+    "xss": ["xss", "cross-site scripting", "cross site scripting", "script injection", "reflected xss", "stored xss", "dom xss"],
+    "sqli": ["sql injection", "sqli", "sql inject", "union select", "blind sql", "boolean-based", "time-based"],
+    "ssrf": ["ssrf", "server-side request forgery", "server side request forgery", "internal request"],
+    "idor": ["idor", "insecure direct object reference", "direct object reference", "horizontal privilege"],
+    "rce": ["rce", "remote code execution", "command injection", "os command", "code execution"],
+    "lfi": ["lfi", "local file inclusion", "file inclusion", "path traversal", "directory traversal"],
+    "rfi": ["rfi", "remote file inclusion"],
+    "csrf": ["csrf", "cross-site request forgery", "cross site request forgery"],
+    "xxe": ["xxe", "xml external entity", "xml injection"],
+    "ssti": ["ssti", "server-side template injection", "template injection"],
+    "auth_bypass": ["auth bypass", "authentication bypass", "login bypass", "2fa bypass", "mfa bypass"],
+    "open_redirect": ["open redirect", "url redirect", "redirect vulnerability"],
+    "race_condition": ["race condition", "toctou", "time of check"],
+    "deserialization": ["deserialization", "deserialize", "insecure deserialization", "pickle", "java serialization"],
+    "graphql": ["graphql", "graphql injection", "introspection"],
+    "nosql": ["nosql", "nosql injection", "mongodb injection"],
+    "jwt": ["jwt", "json web token", "jwt attack", "jwt bypass"],
+    "cors": ["cors", "cross-origin", "access-control-allow-origin"],
+    "crlf": ["crlf", "crlf injection", "header injection"],
+    "upload": ["file upload", "upload bypass", "unrestricted upload", "webshell"],
+    "subdomain_takeover": ["subdomain takeover", "dangling dns"],
+    "information_disclosure": ["information disclosure", "info leak", "data exposure", "sensitive data"],
+    "privilege_escalation": ["privilege escalation", "privesc", "vertical privilege"],
+    "bola": ["bola", "broken object level authorization"],
+    "bfla": ["bfla", "broken function level authorization"],
+    "api": ["api security", "api vulnerability", "rest api", "api abuse"],
+    "websocket": ["websocket", "ws hijacking"],
+    "cache_poisoning": ["cache poisoning", "web cache"],
+    "prototype_pollution": ["prototype pollution", "__proto__"],
+    "clickjacking": ["clickjacking", "ui redressing", "x-frame-options"],
+}
+
+AI_ANALYSIS_PROMPT = """You are a security research analyst. Analyze the following security document and extract structured knowledge for a penetration testing AI agent.
+
+Document filename: {filename}
+
+Document content (truncated):
+{text}
+
+Extract the following as JSON:
+{{
+    "title": "Short descriptive title for this document",
+    "summary": "2-3 sentence summary of the key security findings/methodology",
+    "vuln_types": ["list", "of", "vuln_types"],
+    "knowledge_entries": [
+        {{
+            "vuln_type": "the_vuln_type",
+            "methodology": "Step-by-step attack methodology described in the document",
+            "payloads": ["specific payloads or PoC code mentioned"],
+            "key_insights": "What makes this approach unique or effective",
+            "bypass_techniques": ["any WAF/filter/defense bypasses described"]
+        }}
+    ]
+}}
+
+RULES:
+- vuln_types must use standard identifiers: xss, sqli, ssrf, idor, rce, lfi, csrf, xxe, ssti, auth_bypass, open_redirect, race_condition, deserialization, graphql, nosql, jwt, cors, crlf, upload, subdomain_takeover, information_disclosure, privilege_escalation, bola, bfla, api, websocket, cache_poisoning, prototype_pollution, clickjacking
+- Only extract information EXPLICITLY present in the document
+- Do NOT fabricate payloads or methodologies not described in the text
+- Each knowledge_entry should map to exactly one vuln_type
+- If the document covers multiple vuln types, create separate entries for each
+"""
+
+
+class KnowledgeProcessor:
+    """Processes uploaded security documents into indexed knowledge."""
+
+    def __init__(self, llm_client=None):
+        self.llm_client = llm_client
+        self._index = self._load_index()
+        KNOWLEDGE_DIR.mkdir(parents=True, exist_ok=True)
+        UPLOADS_DIR.mkdir(parents=True, exist_ok=True)
+
+    def _load_index(self) -> dict:
+        """Load or initialize the knowledge index."""
+        if INDEX_FILE.exists():
+            try:
+                return json.loads(INDEX_FILE.read_text())
+            except Exception as e:
+                logger.warning(f"Failed to load knowledge index: {e}")
+        return {"documents": [], "vuln_type_index": {}, "version": "1.0"}
+
+    def _save_index(self):
+        """Persist index to disk."""
+        self._index["updated_at"] = datetime.utcnow().isoformat()
+        INDEX_FILE.write_text(json.dumps(self._index, indent=2))
+
+    async def process_upload(self, file_bytes: bytes, filename: str) -> dict:
+        """Full pipeline for a single file upload."""
+        ext = Path(filename).suffix.lower()
+        if ext not in SUPPORTED_FORMATS:
+            raise ValueError(f"Unsupported format: {ext}. Supported: {', '.join(SUPPORTED_FORMATS)}")
+
+        # Generate unique ID
+        doc_id = str(uuid.uuid4())[:12]
+
+        # Save raw file
+        safe_filename = re.sub(r'[^a-zA-Z0-9._-]', '_', filename)
+        file_path = UPLOADS_DIR / f"{doc_id}_{safe_filename}"
+        file_path.write_bytes(file_bytes)
+
+        # Extract text
+        text = self._extract_text(file_path, ext)
+        if not text or len(text.strip()) < 50:
+            file_path.unlink(missing_ok=True)
+            raise ValueError("Document has insufficient text content (< 50 chars)")
+
+        # AI analysis (or keyword-based fallback)
+        if self.llm_client:
+            analysis = await self._ai_analyze(text, filename)
+        else:
+            analysis = self._keyword_analyze(text, filename)
+
+        # Build document entry
+        doc_entry = {
+            "id": doc_id,
+            "filename": filename,
+            "title": analysis.get("title", filename),
+            "source_type": ext.lstrip("."),
+            "uploaded_at": datetime.utcnow().isoformat(),
+            "processed": True,
+            "file_size_bytes": len(file_bytes),
+            "summary": analysis.get("summary", ""),
+            "vuln_types": analysis.get("vuln_types", []),
+            "knowledge_entries": analysis.get("knowledge_entries", []),
+        }
+
+        # Add to index
+        self._index_document(doc_entry)
+        self._save_index()
+
+        logger.info(f"Processed knowledge document: {filename} -> {len(doc_entry['knowledge_entries'])} entries")
+        return doc_entry
+
+    def _extract_text(self, file_path: Path, ext: str) -> str:
+        """Extract text from file based on format."""
+        if ext == ".pdf":
+            return self._extract_text_pdf(file_path)
+        elif ext in (".md", ".txt"):
+            return self._extract_text_plaintext(file_path)
+        elif ext in (".html", ".htm"):
+            return self._extract_text_html(file_path)
+        return ""
+
+    def _extract_text_pdf(self, file_path: Path) -> str:
+        """Extract text from PDF."""
+        if not HAS_PYPDKF2:
+            logger.warning("PyPDF2 not installed - PDF extraction unavailable. Install: pip install PyPDF2")
+            # Try reading as text fallback
+            try:
+                return file_path.read_text(errors="ignore")[:20000]
+            except Exception:
+                return ""
+        try:
+            reader = PdfReader(str(file_path))
+            text_parts = []
+            for page in reader.pages[:50]:  # Max 50 pages
+                page_text = page.extract_text()
+                if page_text:
+                    text_parts.append(page_text)
+            return "\n\n".join(text_parts)
+        except Exception as e:
+            logger.warning(f"PDF extraction failed: {e}")
+            return ""
+
+    def _extract_text_plaintext(self, file_path: Path) -> str:
+        """Read markdown or plain text file."""
+        try:
+            return file_path.read_text(errors="ignore")
+        except Exception:
+            return ""
+
+    def _extract_text_html(self, file_path: Path) -> str:
+        """Extract text from HTML by stripping tags."""
+        try:
+            html = file_path.read_text(errors="ignore")
+            # Remove script and style blocks
+            html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
+            html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
+            # Strip all tags
+            text = re.sub(r'<[^>]+>', ' ', html)
+            # Clean whitespace
+            text = re.sub(r'\s+', ' ', text).strip()
+            return text
+        except Exception:
+            return ""
+
+    async def _ai_analyze(self, text: str, filename: str) -> dict:
+        """Use LLM to extract structured knowledge."""
+        truncated = text[:8000]
+        prompt = AI_ANALYSIS_PROMPT.format(filename=filename, text=truncated)
+
+        try:
+            response = await self.llm_client.generate(prompt)
+            # Parse JSON from response
+            match = re.search(r'\{.*\}', response, re.DOTALL)
+            if match:
+                data = json.loads(match.group())
+                # Validate vuln_types
+                valid_types = set(VULN_KEYWORDS.keys())
+                data["vuln_types"] = [vt for vt in data.get("vuln_types", []) if vt in valid_types]
+                for entry in data.get("knowledge_entries", []):
+                    if entry.get("vuln_type") not in valid_types:
+                        entry["vuln_type"] = data["vuln_types"][0] if data["vuln_types"] else "information_disclosure"
+                return data
+        except Exception as e:
+            logger.warning(f"AI analysis failed, falling back to keyword analysis: {e}")
+
+        return self._keyword_analyze(text, filename)
+
+    def _keyword_analyze(self, text: str, filename: str) -> dict:
+        """Fallback keyword-based analysis when no LLM available."""
+        text_lower = text.lower()
+        detected_types = []
+
+        for vuln_type, keywords in VULN_KEYWORDS.items():
+            for keyword in keywords:
+                if keyword in text_lower:
+                    detected_types.append(vuln_type)
+                    break
+
+        if not detected_types:
+            detected_types = ["information_disclosure"]
+
+        # Extract title from first line or filename
+        first_line = text.strip().split("\n")[0][:200]
+        title = first_line if len(first_line) > 10 else filename
+
+        # Build basic entries
+        entries = []
+        for vt in detected_types[:5]:  # Max 5 types
+            entries.append({
+                "vuln_type": vt,
+                "methodology": self._extract_section(text, ["methodology", "steps", "approach", "technique"]),
+                "payloads": self._extract_payloads(text),
+                "key_insights": self._extract_section(text, ["insight", "key finding", "conclusion", "takeaway"]),
+                "bypass_techniques": self._extract_payloads_by_pattern(text, ["bypass", "evasion", "waf", "filter"]),
+            })
+
+        return {
+            "title": title.strip("#").strip(),
+            "summary": text[:300].strip(),
+            "vuln_types": detected_types,
+            "knowledge_entries": entries,
+        }
+
+    def _extract_section(self, text: str, keywords: List[str]) -> str:
+        """Extract text section near keywords."""
+        text_lower = text.lower()
+        for keyword in keywords:
+            idx = text_lower.find(keyword)
+            if idx >= 0:
+                # Get surrounding context (up to 800 chars after keyword)
+                start = max(0, idx - 50)
+                end = min(len(text), idx + 800)
+                return text[start:end].strip()
+        return ""
+
+    def _extract_payloads(self, text: str) -> List[str]:
+        """Extract potential payloads from text."""
+        payloads = []
+        # Look for common payload patterns
+        patterns = [
+            r'`([^`]{5,200})`',  # Backtick-enclosed code
+            r"'([^']{10,200})'",  # Single-quoted strings that look like payloads
+        ]
+        for pattern in patterns:
+            matches = re.findall(pattern, text)
+            for m in matches:
+                if any(indicator in m.lower() for indicator in
+                       ["<script", "alert(", "onerror", "union select", "../", "{{",
+                        "curl ", "wget ", "%00", "127.0.0.1", "169.254", "; cat",
+                        "' or ", '" or ', "1=1", "exec(", "system("]):
+                    payloads.append(m)
+        return payloads[:20]  # Max 20 payloads
+
+    def _extract_payloads_by_pattern(self, text: str, keywords: List[str]) -> List[str]:
+        """Extract text fragments near specific keywords."""
+        results = []
+        text_lower = text.lower()
+        for keyword in keywords:
+            idx = text_lower.find(keyword)
+            if idx >= 0:
+                start = max(0, idx - 20)
+                end = min(len(text), idx + 200)
+                fragment = text[start:end].strip()
+                if fragment:
+                    results.append(fragment[:200])
+        return results[:10]
+
+    def _index_document(self, doc_entry: dict):
+        """Add document to the index."""
+        # Remove existing doc with same ID if re-processing
+        self._index["documents"] = [
+            d for d in self._index["documents"] if d["id"] != doc_entry["id"]
+        ]
+        self._index["documents"].append(doc_entry)
+
+        # Update vuln_type_index
+        for vt in doc_entry.get("vuln_types", []):
+            if vt not in self._index["vuln_type_index"]:
+                self._index["vuln_type_index"][vt] = []
+            if doc_entry["id"] not in self._index["vuln_type_index"][vt]:
+                self._index["vuln_type_index"][vt].append(doc_entry["id"])
+
+    def get_documents(self) -> List[dict]:
+        """Return all indexed documents (without full entries for list view)."""
+        docs = []
+        for d in self._index.get("documents", []):
+            docs.append({
+                "id": d["id"],
+                "filename": d["filename"],
+                "title": d["title"],
+                "source_type": d["source_type"],
+                "uploaded_at": d["uploaded_at"],
+                "processed": d["processed"],
+                "file_size_bytes": d["file_size_bytes"],
+                "summary": d["summary"],
+                "vuln_types": d["vuln_types"],
+                "entries_count": len(d.get("knowledge_entries", [])),
+            })
+        return docs
+
+    def get_document(self, doc_id: str) -> Optional[dict]:
+        """Get a specific document with full entries."""
+        for d in self._index.get("documents", []):
+            if d["id"] == doc_id:
+                return d
+        return None
+
+    def delete_document(self, doc_id: str) -> bool:
+        """Remove document from index and delete uploaded file."""
+        doc = self.get_document(doc_id)
+        if not doc:
+            return False
+
+        # Remove from documents list
+        self._index["documents"] = [
+            d for d in self._index["documents"] if d["id"] != doc_id
+        ]
+
+        # Remove from vuln_type_index
+        for vt, doc_ids in self._index.get("vuln_type_index", {}).items():
+            if doc_id in doc_ids:
+                doc_ids.remove(doc_id)
+
+        # Delete uploaded file
+        for f in UPLOADS_DIR.glob(f"{doc_id}_*"):
+            f.unlink(missing_ok=True)
+
+        self._save_index()
+        return True
+
+    def search_by_vuln_type(self, vuln_type: str, max_entries: int = 5) -> List[dict]:
+        """Search knowledge entries by vulnerability type."""
+        vuln_key = vuln_type.lower().replace(" ", "_").replace("-", "_")
+        doc_ids = self._index.get("vuln_type_index", {}).get(vuln_key, [])
+        if not doc_ids:
+            return []
+
+        entries = []
+        for doc in self._index.get("documents", []):
+            if doc["id"] in doc_ids:
+                for ke in doc.get("knowledge_entries", []):
+                    if ke.get("vuln_type") == vuln_key:
+                        entry = dict(ke)
+                        entry["source_document"] = doc["title"]
+                        entry["source_id"] = doc["id"]
+                        entries.append(entry)
+
+        return entries[:max_entries]
+
+    def get_stats(self) -> dict:
+        """Get knowledge base statistics."""
+        docs = self._index.get("documents", [])
+        total_entries = sum(len(d.get("knowledge_entries", [])) for d in docs)
+        vuln_types = list(self._index.get("vuln_type_index", {}).keys())
+
+        # Calculate storage size
+        storage_bytes = 0
+        if UPLOADS_DIR.exists():
+            for f in UPLOADS_DIR.iterdir():
+                storage_bytes += f.stat().st_size
+
+        return {
+            "total_documents": len(docs),
+            "total_entries": total_entries,
+            "vuln_types_covered": sorted(vuln_types),
+            "storage_bytes": storage_bytes,
+        }
+
+    def get_patterns_for_vuln(self, vuln_type: str, max_entries: int = 3) -> str:
+        """Get formatted knowledge patterns for a vuln type (for LLM context injection)."""
+        entries = self.search_by_vuln_type(vuln_type, max_entries)
+        if not entries:
+            return ""
+
+        result = "\n\n=== CUSTOM KNOWLEDGE (User-Uploaded Research) ===\n"
+        for i, entry in enumerate(entries, 1):
+            result += f"--- Research {i}: {entry.get('source_document', 'Unknown')} ---\n"
+            if entry.get("methodology"):
+                result += f"Methodology: {entry['methodology'][:800]}\n"
+            if entry.get("payloads"):
+                result += f"Payloads: {', '.join(entry['payloads'][:5])}\n"
+            if entry.get("key_insights"):
+                result += f"Key Insights: {entry['key_insights'][:400]}\n"
+            if entry.get("bypass_techniques"):
+                result += f"Bypass Techniques: {', '.join(entry['bypass_techniques'][:3])}\n"
+            result += "\n"
+        result += "=== END CUSTOM KNOWLEDGE ===\n"
+        return result