NeuroSploit v3.2 - Autonomous AI Penetration Testing Platform

116 modules | 100 vuln types | 18 API routes | 18 frontend pages

Major features:
- VulnEngine: 100 vuln types, 526+ payloads, 12 testers, anti-hallucination prompts
- Autonomous Agent: 3-stream auto pentest, multi-session (5 concurrent), pause/resume/stop
- CLI Agent: Claude Code / Gemini CLI / Codex CLI inside Kali containers
- Validation Pipeline: negative controls, proof of execution, confidence scoring, judge
- AI Reasoning: ReACT engine, token budget, endpoint classifier, CVE hunter, deep recon
- Multi-Agent: 5 specialists + orchestrator + researcher AI + vuln type agents
- RAG System: BM25/TF-IDF/ChromaDB vectorstore, few-shot, reasoning templates
- Smart Router: 20 providers (8 CLI OAuth + 12 API), tier failover, token refresh
- Kali Sandbox: container-per-scan, 56 tools, VPN support, on-demand install
- Full IA Testing: methodology-driven comprehensive pentest sessions
- Notifications: Discord, Telegram, WhatsApp/Twilio multi-channel alerts
- Frontend: React/TypeScript with 18 pages, real-time WebSocket updates
This commit is contained in:
CyberSecurityUP
2026-02-22 17:58:12 -03:00
commit e0935793c5
271 changed files with 132462 additions and 0 deletions

View File

@@ -0,0 +1,444 @@
"""
NeuroSploit v3 - Knowledge Processor
Pipeline: Upload → Extract Text → AI Summarize → Index by Vuln Type → Store.
Processes bug bounty papers, CVE documents, writeups, and lab reports
into structured knowledge the agent uses during testing.
"""
import json
import re
import uuid
import shutil
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional, Any
import logging
logger = logging.getLogger(__name__)
# Optional PDF support: PyPDF2 is a soft dependency. When the import fails,
# HAS_PYPDF2 is False and callers are expected to check the flag before
# touching PdfReader.
try:
    from PyPDF2 import PdfReader
    HAS_PYPDF2 = True
except ImportError:
    HAS_PYPDF2 = False
# On-disk layout of the custom knowledge base. All paths are relative to the
# process working directory.
KNOWLEDGE_DIR = Path("data") / "custom-knowledge"
UPLOADS_DIR = KNOWLEDGE_DIR.joinpath("uploads")      # raw uploaded files
INDEX_FILE = KNOWLEDGE_DIR.joinpath("index.json")    # JSON index of processed documents

# File extensions (lower-case, with leading dot) accepted for upload.
SUPPORTED_FORMATS = {
    ".pdf",
    ".md",
    ".txt",
    ".html",
    ".htm",
}
# Standard vuln type keywords for classification
VULN_KEYWORDS = {
"xss": ["xss", "cross-site scripting", "cross site scripting", "script injection", "reflected xss", "stored xss", "dom xss"],
"sqli": ["sql injection", "sqli", "sql inject", "union select", "blind sql", "boolean-based", "time-based"],
"ssrf": ["ssrf", "server-side request forgery", "server side request forgery", "internal request"],
"idor": ["idor", "insecure direct object reference", "direct object reference", "horizontal privilege"],
"rce": ["rce", "remote code execution", "command injection", "os command", "code execution"],
"lfi": ["lfi", "local file inclusion", "file inclusion", "path traversal", "directory traversal"],
"rfi": ["rfi", "remote file inclusion"],
"csrf": ["csrf", "cross-site request forgery", "cross site request forgery"],
"xxe": ["xxe", "xml external entity", "xml injection"],
"ssti": ["ssti", "server-side template injection", "template injection"],
"auth_bypass": ["auth bypass", "authentication bypass", "login bypass", "2fa bypass", "mfa bypass"],
"open_redirect": ["open redirect", "url redirect", "redirect vulnerability"],
"race_condition": ["race condition", "toctou", "time of check"],
"deserialization": ["deserialization", "deserialize", "insecure deserialization", "pickle", "java serialization"],
"graphql": ["graphql", "graphql injection", "introspection"],
"nosql": ["nosql", "nosql injection", "mongodb injection"],
"jwt": ["jwt", "json web token", "jwt attack", "jwt bypass"],
"cors": ["cors", "cross-origin", "access-control-allow-origin"],
"crlf": ["crlf", "crlf injection", "header injection"],
"upload": ["file upload", "upload bypass", "unrestricted upload", "webshell"],
"subdomain_takeover": ["subdomain takeover", "dangling dns"],
"information_disclosure": ["information disclosure", "info leak", "data exposure", "sensitive data"],
"privilege_escalation": ["privilege escalation", "privesc", "vertical privilege"],
"bola": ["bola", "broken object level authorization"],
"bfla": ["bfla", "broken function level authorization"],
"api": ["api security", "api vulnerability", "rest api", "api abuse"],
"websocket": ["websocket", "ws hijacking"],
"cache_poisoning": ["cache poisoning", "web cache"],
"prototype_pollution": ["prototype pollution", "__proto__"],
"clickjacking": ["clickjacking", "ui redressing", "x-frame-options"],
}
# Prompt template sent to the LLM for document analysis. It is rendered with
# str.format(filename=..., text=...), which is why the literal JSON braces are
# doubled. The identifier list in RULES mirrors the keys of VULN_KEYWORDS
# (including "rfi"), because responses are validated against those keys.
AI_ANALYSIS_PROMPT = """You are a security research analyst. Analyze the following security document and extract structured knowledge for a penetration testing AI agent.
Document filename: {filename}
Document content (truncated):
{text}
Extract the following as JSON:
{{
"title": "Short descriptive title for this document",
"summary": "2-3 sentence summary of the key security findings/methodology",
"vuln_types": ["list", "of", "vuln_types"],
"knowledge_entries": [
{{
"vuln_type": "the_vuln_type",
"methodology": "Step-by-step attack methodology described in the document",
"payloads": ["specific payloads or PoC code mentioned"],
"key_insights": "What makes this approach unique or effective",
"bypass_techniques": ["any WAF/filter/defense bypasses described"]
}}
]
}}
RULES:
- vuln_types must use standard identifiers: xss, sqli, ssrf, idor, rce, lfi, rfi, csrf, xxe, ssti, auth_bypass, open_redirect, race_condition, deserialization, graphql, nosql, jwt, cors, crlf, upload, subdomain_takeover, information_disclosure, privilege_escalation, bola, bfla, api, websocket, cache_poisoning, prototype_pollution, clickjacking
- Only extract information EXPLICITLY present in the document
- Do NOT fabricate payloads or methodologies not described in the text
- Each knowledge_entry should map to exactly one vuln_type
- If the document covers multiple vuln types, create separate entries for each
"""
class KnowledgeProcessor:
    """Processes uploaded security documents into indexed knowledge.

    Pipeline for one upload: save the raw file, extract its text, analyze it
    (via the LLM client when one is configured, otherwise by keyword
    matching), then store the result in a JSON index that can be queried by
    vulnerability type.

    The index is a dict with the keys ``documents`` (list of document
    entries), ``vuln_type_index`` (vuln type -> list of document ids) and
    ``version``.
    """

    # Substrings that mark a quoted/backticked snippet as a likely payload.
    _PAYLOAD_INDICATORS = (
        "<script", "alert(", "onerror", "union select", "../", "{{",
        "curl ", "wget ", "%00", "127.0.0.1", "169.254", "; cat",
        "' or ", '" or ', "1=1", "exec(", "system(",
    )

    def __init__(self, llm_client=None):
        """Load the index and make sure the storage directories exist.

        Args:
            llm_client: Optional client used for AI analysis. It must expose
                an awaitable ``generate(prompt)`` that returns the response
                text. When ``None``, keyword analysis is used instead.
        """
        self.llm_client = llm_client
        self._index = self._load_index()
        KNOWLEDGE_DIR.mkdir(parents=True, exist_ok=True)
        UPLOADS_DIR.mkdir(parents=True, exist_ok=True)

    def _load_index(self) -> dict:
        """Load the knowledge index from disk, or initialize an empty one.

        A missing, unreadable, or non-object index file yields a fresh index.
        Missing top-level keys are filled in so the rest of the class can
        index into the result without ``KeyError``.
        """
        index: dict = {}
        if INDEX_FILE.exists():
            try:
                loaded = json.loads(INDEX_FILE.read_text())
            except Exception as e:
                logger.warning(f"Failed to load knowledge index: {e}")
            else:
                if isinstance(loaded, dict):
                    index = loaded
                else:
                    logger.warning("Knowledge index is not a JSON object; starting with an empty index")
        index.setdefault("documents", [])
        index.setdefault("vuln_type_index", {})
        index.setdefault("version", "1.0")
        return index

    def _save_index(self):
        """Persist the index to disk, stamping ``updated_at`` with the current UTC time."""
        self._index["updated_at"] = datetime.utcnow().isoformat()
        INDEX_FILE.write_text(json.dumps(self._index, indent=2))

    async def process_upload(self, file_bytes: bytes, filename: str) -> dict:
        """Run the full pipeline for a single uploaded file.

        Args:
            file_bytes: Raw content of the uploaded file.
            filename: Original file name; its suffix selects the extractor.

        Returns:
            The document entry that was added to the index.

        Raises:
            ValueError: If the extension is not in ``SUPPORTED_FORMATS`` or
                fewer than 50 characters of text could be extracted.
        """
        ext = Path(filename).suffix.lower()
        if ext not in SUPPORTED_FORMATS:
            # Sorted so the message is stable (set iteration order is arbitrary).
            raise ValueError(f"Unsupported format: {ext}. Supported: {', '.join(sorted(SUPPORTED_FORMATS))}")

        # Generate unique ID
        doc_id = str(uuid.uuid4())[:12]

        # Save raw file under a sanitized name (anything outside
        # [a-zA-Z0-9._-], including path separators, becomes "_").
        safe_filename = re.sub(r'[^a-zA-Z0-9._-]', '_', filename)
        file_path = UPLOADS_DIR / f"{doc_id}_{safe_filename}"
        file_path.write_bytes(file_bytes)

        # Extract text; drop the stored file again when there is nothing usable.
        text = self._extract_text(file_path, ext)
        if not text or len(text.strip()) < 50:
            file_path.unlink(missing_ok=True)
            raise ValueError("Document has insufficient text content (< 50 chars)")

        # AI analysis (or keyword-based fallback)
        if self.llm_client:
            analysis = await self._ai_analyze(text, filename)
        else:
            analysis = self._keyword_analyze(text, filename)

        # Build document entry
        doc_entry = {
            "id": doc_id,
            "filename": filename,
            "title": analysis.get("title", filename),
            "source_type": ext.lstrip("."),
            "uploaded_at": datetime.utcnow().isoformat(),
            "processed": True,
            "file_size_bytes": len(file_bytes),
            "summary": analysis.get("summary", ""),
            "vuln_types": analysis.get("vuln_types", []),
            "knowledge_entries": analysis.get("knowledge_entries", []),
        }

        # Add to index
        self._index_document(doc_entry)
        self._save_index()
        logger.info(f"Processed knowledge document: {filename} -> {len(doc_entry['knowledge_entries'])} entries")
        return doc_entry

    def _extract_text(self, file_path: Path, ext: str) -> str:
        """Extract text from ``file_path`` using the extractor for ``ext``.

        Returns an empty string for extensions without an extractor.
        """
        if ext == ".pdf":
            return self._extract_text_pdf(file_path)
        if ext in (".md", ".txt"):
            return self._extract_text_plaintext(file_path)
        if ext in (".html", ".htm"):
            return self._extract_text_html(file_path)
        return ""

    def _extract_text_pdf(self, file_path: Path) -> str:
        """Extract text from a PDF (first 50 pages at most).

        Without PyPDF2 the file is read as text on a best-effort basis and
        capped at 20000 characters. Any failure yields an empty string.
        """
        if not HAS_PYPDF2:
            logger.warning("PyPDF2 not installed - PDF extraction unavailable. Install: pip install PyPDF2")
            # Try reading as text fallback
            try:
                return file_path.read_text(encoding="utf-8", errors="ignore")[:20000]
            except Exception:
                return ""
        try:
            reader = PdfReader(str(file_path))
            text_parts = []
            for page in reader.pages[:50]:  # Max 50 pages
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)
            return "\n\n".join(text_parts)
        except Exception as e:
            logger.warning(f"PDF extraction failed: {e}")
            return ""

    def _extract_text_plaintext(self, file_path: Path) -> str:
        """Read a markdown or plain text file; return "" if it cannot be read."""
        try:
            # Explicit UTF-8 so the result does not depend on the locale;
            # undecodable bytes are dropped.
            return file_path.read_text(encoding="utf-8", errors="ignore")
        except Exception as e:
            logger.warning(f"Text extraction failed: {e}")
            return ""

    def _extract_text_html(self, file_path: Path) -> str:
        """Extract text from HTML by stripping tags; return "" on failure."""
        try:
            html = file_path.read_text(encoding="utf-8", errors="ignore")
            # Remove script and style blocks
            html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
            html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
            # Strip all tags
            text = re.sub(r'<[^>]+>', ' ', html)
            # Clean whitespace
            return re.sub(r'\s+', ' ', text).strip()
        except Exception as e:
            logger.warning(f"HTML extraction failed: {e}")
            return ""

    async def _ai_analyze(self, text: str, filename: str) -> dict:
        """Use the LLM to extract structured knowledge.

        Only the first 8000 characters of ``text`` are sent. Vuln types not
        present in ``VULN_KEYWORDS`` are dropped, and entries with an invalid
        ``vuln_type`` are re-labelled with the first valid type (or
        ``information_disclosure``). Any failure, or a response without a
        JSON object, falls back to keyword analysis.
        """
        truncated = text[:8000]
        prompt = AI_ANALYSIS_PROMPT.format(filename=filename, text=truncated)
        try:
            response = await self.llm_client.generate(prompt)
            # Parse JSON from response: greedy match from first "{" to last "}".
            match = re.search(r'\{.*\}', response, re.DOTALL)
            if match:
                data = json.loads(match.group())
                if not isinstance(data, dict):
                    raise ValueError("LLM response JSON is not an object")
                # Validate vuln_types
                valid_types = set(VULN_KEYWORDS)
                data["vuln_types"] = [vt for vt in data.get("vuln_types", []) if vt in valid_types]
                fallback_type = data["vuln_types"][0] if data["vuln_types"] else "information_disclosure"
                for entry in data.get("knowledge_entries", []):
                    if entry.get("vuln_type") not in valid_types:
                        entry["vuln_type"] = fallback_type
                return data
        except Exception as e:
            logger.warning(f"AI analysis failed, falling back to keyword analysis: {e}")
        return self._keyword_analyze(text, filename)

    def _keyword_analyze(self, text: str, filename: str) -> dict:
        """Fallback keyword-based analysis when no LLM is available.

        Detects vuln types by substring match against ``VULN_KEYWORDS``
        (defaulting to ``information_disclosure``) and builds one knowledge
        entry for each of the first five detected types.
        """
        text_lower = text.lower()
        detected_types = [
            vuln_type
            for vuln_type, keywords in VULN_KEYWORDS.items()
            if any(keyword in text_lower for keyword in keywords)
        ]
        if not detected_types:
            detected_types = ["information_disclosure"]

        # Extract title from first line or filename
        first_line = text.strip().split("\n")[0][:200]
        title = first_line if len(first_line) > 10 else filename

        # These extractions do not depend on the vuln type, so compute them
        # once and give every entry its own list copies.
        methodology = self._extract_section(text, ["methodology", "steps", "approach", "technique"])
        payloads = self._extract_payloads(text)
        key_insights = self._extract_section(text, ["insight", "key finding", "conclusion", "takeaway"])
        bypasses = self._extract_payloads_by_pattern(text, ["bypass", "evasion", "waf", "filter"])

        entries = [
            {
                "vuln_type": vt,
                "methodology": methodology,
                "payloads": list(payloads),
                "key_insights": key_insights,
                "bypass_techniques": list(bypasses),
            }
            for vt in detected_types[:5]  # Max 5 types
        ]
        return {
            "title": title.strip("#").strip(),
            "summary": text[:300].strip(),
            "vuln_types": detected_types,
            "knowledge_entries": entries,
        }

    def _extract_section(self, text: str, keywords: List[str]) -> str:
        """Return the text around the first keyword that occurs in ``text``.

        Keywords are tried in order and matched case-insensitively. The slice
        runs from 50 characters before the match to 800 after its start.
        Returns "" when no keyword occurs.
        """
        text_lower = text.lower()
        for keyword in keywords:
            idx = text_lower.find(keyword)
            if idx >= 0:
                start = max(0, idx - 50)
                end = min(len(text), idx + 800)
                return text[start:end].strip()
        return ""

    def _extract_payloads(self, text: str) -> List[str]:
        """Extract up to 20 potential payloads from ``text``.

        Candidates are backtick-enclosed snippets (5-200 chars) and
        single-quoted strings (10-200 chars); a candidate is kept only when
        it contains one of the known payload indicators.
        """
        patterns = [
            r'`([^`]{5,200})`',   # Backtick-enclosed code
            r"'([^']{10,200})'",  # Single-quoted strings that look like payloads
        ]
        payloads = []
        for pattern in patterns:
            for candidate in re.findall(pattern, text):
                lowered = candidate.lower()
                if any(indicator in lowered for indicator in self._PAYLOAD_INDICATORS):
                    payloads.append(candidate)
        return payloads[:20]  # Max 20 payloads

    def _extract_payloads_by_pattern(self, text: str, keywords: List[str]) -> List[str]:
        """Return up to 10 fragments of ``text`` near the given keywords.

        For each keyword found (case-insensitive, first occurrence only) the
        fragment spans 20 characters before the match to 200 after its start,
        stripped and capped at 200 characters.
        """
        results = []
        text_lower = text.lower()
        for keyword in keywords:
            idx = text_lower.find(keyword)
            if idx >= 0:
                start = max(0, idx - 20)
                end = min(len(text), idx + 200)
                fragment = text[start:end].strip()
                if fragment:
                    results.append(fragment[:200])
        return results[:10]

    def _index_document(self, doc_entry: dict):
        """Add ``doc_entry`` to the in-memory index (does not save to disk).

        An existing document with the same id is replaced, and the document
        id is registered under each of its vuln types.
        """
        documents = self._index.setdefault("documents", [])
        # Remove existing doc with same ID if re-processing
        documents[:] = [d for d in documents if d["id"] != doc_entry["id"]]
        documents.append(doc_entry)

        # Update vuln_type_index
        vuln_index = self._index.setdefault("vuln_type_index", {})
        for vt in doc_entry.get("vuln_types", []):
            doc_ids = vuln_index.setdefault(vt, [])
            if doc_entry["id"] not in doc_ids:
                doc_ids.append(doc_entry["id"])

    def get_documents(self) -> List[dict]:
        """Return all indexed documents as summaries for a list view.

        Each summary carries the document metadata plus ``entries_count``
        instead of the full ``knowledge_entries`` list.
        """
        return [
            {
                "id": d["id"],
                "filename": d["filename"],
                "title": d["title"],
                "source_type": d["source_type"],
                "uploaded_at": d["uploaded_at"],
                "processed": d["processed"],
                "file_size_bytes": d["file_size_bytes"],
                "summary": d["summary"],
                "vuln_types": d["vuln_types"],
                "entries_count": len(d.get("knowledge_entries", [])),
            }
            for d in self._index.get("documents", [])
        ]

    def get_document(self, doc_id: str) -> Optional[dict]:
        """Return the full document entry with id ``doc_id``, or ``None``."""
        for d in self._index.get("documents", []):
            if d["id"] == doc_id:
                return d
        return None

    def delete_document(self, doc_id: str) -> bool:
        """Remove a document from the index and delete its uploaded file.

        Returns:
            ``True`` if the document existed and was removed, else ``False``.
        """
        if not self.get_document(doc_id):
            return False

        # Remove from documents list
        self._index["documents"] = [
            d for d in self._index["documents"] if d["id"] != doc_id
        ]

        # Remove from vuln_type_index. Vuln types left without documents are
        # dropped so get_stats() does not report them as covered. Iterate
        # over a snapshot of the keys because entries are deleted in the loop.
        vuln_index = self._index.get("vuln_type_index", {})
        for vt in list(vuln_index):
            doc_ids = vuln_index[vt]
            if doc_id in doc_ids:
                doc_ids.remove(doc_id)
            if not doc_ids:
                del vuln_index[vt]

        # Delete uploaded file
        for f in UPLOADS_DIR.glob(f"{doc_id}_*"):
            f.unlink(missing_ok=True)

        self._save_index()
        return True

    def search_by_vuln_type(self, vuln_type: str, max_entries: int = 5) -> List[dict]:
        """Return knowledge entries for a vulnerability type.

        ``vuln_type`` is normalized (lower-cased, spaces and hyphens become
        underscores) before lookup. Each returned entry is a copy of the
        stored entry with ``source_document`` and ``source_id`` added.

        Args:
            vuln_type: Vulnerability type to look up.
            max_entries: Maximum number of entries to return.
        """
        vuln_key = vuln_type.lower().replace(" ", "_").replace("-", "_")
        doc_ids = self._index.get("vuln_type_index", {}).get(vuln_key, [])
        if not doc_ids:
            return []

        entries = []
        for doc in self._index.get("documents", []):
            if doc["id"] not in doc_ids:
                continue
            for ke in doc.get("knowledge_entries", []):
                if ke.get("vuln_type") == vuln_key:
                    entry = dict(ke)
                    entry["source_document"] = doc["title"]
                    entry["source_id"] = doc["id"]
                    entries.append(entry)
        return entries[:max_entries]

    def get_stats(self) -> dict:
        """Return document/entry counts, covered vuln types and upload storage size."""
        docs = self._index.get("documents", [])
        total_entries = sum(len(d.get("knowledge_entries", [])) for d in docs)
        vuln_types = list(self._index.get("vuln_type_index", {}))

        # Calculate storage size
        storage_bytes = 0
        if UPLOADS_DIR.exists():
            storage_bytes = sum(f.stat().st_size for f in UPLOADS_DIR.iterdir())

        return {
            "total_documents": len(docs),
            "total_entries": total_entries,
            "vuln_types_covered": sorted(vuln_types),
            "storage_bytes": storage_bytes,
        }

    @staticmethod
    def _as_str_list(value) -> List[str]:
        """Coerce an entry field to a list of strings.

        Entries may come from LLM output, so a field expected to be a list of
        strings can be missing, a bare string, or contain non-string items.
        """
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple)):
            return [str(item) for item in value]
        return [str(value)]

    def get_patterns_for_vuln(self, vuln_type: str, max_entries: int = 3) -> str:
        """Format knowledge for a vuln type as text for LLM context injection.

        Per entry, at most 800 characters of methodology, 5 payloads, 400
        characters of key insights and 3 bypass techniques are included.

        Returns:
            The formatted block, or "" when nothing is indexed for the type.
        """
        entries = self.search_by_vuln_type(vuln_type, max_entries)
        if not entries:
            return ""

        parts = ["\n\n=== CUSTOM KNOWLEDGE (User-Uploaded Research) ===\n"]
        for i, entry in enumerate(entries, 1):
            parts.append(f"--- Research {i}: {entry.get('source_document', 'Unknown')} ---\n")
            if entry.get("methodology"):
                parts.append(f"Methodology: {str(entry['methodology'])[:800]}\n")
            payloads = self._as_str_list(entry.get("payloads"))
            if payloads:
                parts.append(f"Payloads: {', '.join(payloads[:5])}\n")
            if entry.get("key_insights"):
                parts.append(f"Key Insights: {str(entry['key_insights'])[:400]}\n")
            bypasses = self._as_str_list(entry.get("bypass_techniques"))
            if bypasses:
                parts.append(f"Bypass Techniques: {', '.join(bypasses[:3])}\n")
            parts.append("\n")
        parts.append("=== END CUSTOM KNOWLEDGE ===\n")
        return "".join(parts)