"""
NeuroSploit v3 - Knowledge Processor
Pipeline: Upload → Extract Text → AI Summarize → Index by Vuln Type → Store.
Processes bug bounty papers, CVE documents, writeups, and lab reports
into structured knowledge the agent uses during testing.
"""
import json
import re
import uuid
import shutil
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional, Any
import logging
logger = logging.getLogger(__name__)
# Optional PDF support
try:
from PyPDF2 import PdfReader
HAS_PYPDF2 = True
except ImportError:
HAS_PYPDF2 = False
# Root directory for custom knowledge. The path is relative, so it resolves
# against the working directory of the running process.
KNOWLEDGE_DIR = Path("data/custom-knowledge")
# Raw uploaded files are written here, prefixed with their document id.
UPLOADS_DIR = KNOWLEDGE_DIR / "uploads"
# JSON file in which the knowledge index is persisted.
INDEX_FILE = KNOWLEDGE_DIR / "index.json"
# Accepted file extensions: lowercase, including the leading dot.
SUPPORTED_FORMATS = {".pdf", ".md", ".txt", ".html", ".htm"}
# Standard vuln type keywords for classification.
# Each key is a vuln type identifier; its value lists lowercase phrases that
# the keyword fallback searches for in the lowercased document text. The keys
# also serve as the set of valid identifiers when AI output is validated.
VULN_KEYWORDS = {
    "xss": ["xss", "cross-site scripting", "cross site scripting", "script injection", "reflected xss", "stored xss", "dom xss"],
    "sqli": ["sql injection", "sqli", "sql inject", "union select", "blind sql", "boolean-based", "time-based"],
    "ssrf": ["ssrf", "server-side request forgery", "server side request forgery", "internal request"],
    "idor": ["idor", "insecure direct object reference", "direct object reference", "horizontal privilege"],
    "rce": ["rce", "remote code execution", "command injection", "os command", "code execution"],
    "lfi": ["lfi", "local file inclusion", "file inclusion", "path traversal", "directory traversal"],
    "rfi": ["rfi", "remote file inclusion"],
    "csrf": ["csrf", "cross-site request forgery", "cross site request forgery"],
    "xxe": ["xxe", "xml external entity", "xml injection"],
    "ssti": ["ssti", "server-side template injection", "template injection"],
    "auth_bypass": ["auth bypass", "authentication bypass", "login bypass", "2fa bypass", "mfa bypass"],
    "open_redirect": ["open redirect", "url redirect", "redirect vulnerability"],
    "race_condition": ["race condition", "toctou", "time of check"],
    "deserialization": ["deserialization", "deserialize", "insecure deserialization", "pickle", "java serialization"],
    "graphql": ["graphql", "graphql injection", "introspection"],
    "nosql": ["nosql", "nosql injection", "mongodb injection"],
    "jwt": ["jwt", "json web token", "jwt attack", "jwt bypass"],
    "cors": ["cors", "cross-origin", "access-control-allow-origin"],
    "crlf": ["crlf", "crlf injection", "header injection"],
    "upload": ["file upload", "upload bypass", "unrestricted upload", "webshell"],
    "subdomain_takeover": ["subdomain takeover", "dangling dns"],
    "information_disclosure": ["information disclosure", "info leak", "data exposure", "sensitive data"],
    "privilege_escalation": ["privilege escalation", "privesc", "vertical privilege"],
    "bola": ["bola", "broken object level authorization"],
    "bfla": ["bfla", "broken function level authorization"],
    "api": ["api security", "api vulnerability", "rest api", "api abuse"],
    "websocket": ["websocket", "ws hijacking"],
    "cache_poisoning": ["cache poisoning", "web cache"],
    "prototype_pollution": ["prototype pollution", "__proto__"],
    "clickjacking": ["clickjacking", "ui redressing", "x-frame-options"],
}
# Prompt template for LLM-based document analysis. It is rendered with
# str.format(filename=..., text=...), so literal braces in the JSON example
# are doubled. The identifier list under RULES must name every key of
# VULN_KEYWORDS, because AI output is validated against those keys.
AI_ANALYSIS_PROMPT = """You are a security research analyst. Analyze the following security document and extract structured knowledge for a penetration testing AI agent.
Document filename: {filename}
Document content (truncated):
{text}
Extract the following as JSON:
{{
"title": "Short descriptive title for this document",
"summary": "2-3 sentence summary of the key security findings/methodology",
"vuln_types": ["list", "of", "vuln_types"],
"knowledge_entries": [
{{
"vuln_type": "the_vuln_type",
"methodology": "Step-by-step attack methodology described in the document",
"payloads": ["specific payloads or PoC code mentioned"],
"key_insights": "What makes this approach unique or effective",
"bypass_techniques": ["any WAF/filter/defense bypasses described"]
}}
]
}}
RULES:
- vuln_types must use standard identifiers: xss, sqli, ssrf, idor, rce, lfi, rfi, csrf, xxe, ssti, auth_bypass, open_redirect, race_condition, deserialization, graphql, nosql, jwt, cors, crlf, upload, subdomain_takeover, information_disclosure, privilege_escalation, bola, bfla, api, websocket, cache_poisoning, prototype_pollution, clickjacking
- Only extract information EXPLICITLY present in the document
- Do NOT fabricate payloads or methodologies not described in the text
- Each knowledge_entry should map to exactly one vuln_type
- If the document covers multiple vuln types, create separate entries for each
"""
class KnowledgeProcessor:
"""Processes uploaded security documents into indexed knowledge."""
def __init__(self, llm_client=None):
    """Create a processor and make sure its storage directories exist.

    Args:
        llm_client: Optional client used for AI analysis. When it is falsy,
            uploads are analysed with the keyword-based fallback instead.
    """
    self.llm_client = llm_client
    self._index = self._load_index()
    # Both directories are created up front so later writes cannot fail on a
    # missing parent.
    for storage_dir in (KNOWLEDGE_DIR, UPLOADS_DIR):
        storage_dir.mkdir(parents=True, exist_ok=True)
def _load_index(self) -> dict:
    """Load the knowledge index from INDEX_FILE, or return a fresh one.

    Loading is best-effort: a missing, unreadable, or malformed index file
    never raises. A file that parses as JSON but is not an object is also
    rejected, because the index is used as a dict elsewhere in this class.
    Keys absent from an otherwise valid index are filled with their empty
    defaults.

    Returns:
        The index dict, containing at least "documents", "vuln_type_index"
        and "version".
    """
    empty_index = {"documents": [], "vuln_type_index": {}, "version": "1.0"}
    if not INDEX_FILE.exists():
        return empty_index
    try:
        loaded = json.loads(INDEX_FILE.read_text())
    except Exception as e:
        # Deliberately broad: any read or parse failure falls back to empty.
        logger.warning(f"Failed to load knowledge index: {e}")
        return empty_index
    if not isinstance(loaded, dict):
        logger.warning("Failed to load knowledge index: top-level JSON value is not an object")
        return empty_index
    for key, default in empty_index.items():
        loaded.setdefault(key, default)
    return loaded
def _save_index(self):
    """Stamp the index with the current UTC time and persist it to INDEX_FILE.

    The JSON is written to a sibling temporary file first and then moved
    over INDEX_FILE. An interrupted write therefore cannot leave a truncated
    index behind, which the loader would otherwise discard as unreadable.
    """
    self._index["updated_at"] = datetime.utcnow().isoformat()
    serialized = json.dumps(self._index, indent=2)
    tmp_path = INDEX_FILE.with_name(INDEX_FILE.name + ".tmp")
    tmp_path.write_text(serialized)
    # Same-directory rename, so the swap happens within one filesystem.
    tmp_path.replace(INDEX_FILE)
async def process_upload(self, file_bytes: bytes, filename: str) -> dict:
    """Run the full pipeline for a single uploaded file.

    Steps: validate the extension, store the raw bytes under UPLOADS_DIR,
    extract text, analyse it (LLM when a client is configured, keyword
    fallback otherwise), add the resulting entry to the index and save it.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Original file name; its suffix selects the extractor.

    Returns:
        The document entry dict that was added to the index.

    Raises:
        ValueError: If the extension is not in SUPPORTED_FORMATS, or the
            extracted text is shorter than 50 characters after stripping.
    """
    ext = Path(filename).suffix.lower()
    if ext not in SUPPORTED_FORMATS:
        # Sorted so the message is stable; set iteration order for strings
        # can differ from one interpreter run to the next.
        supported = ', '.join(sorted(SUPPORTED_FORMATS))
        raise ValueError(f"Unsupported format: {ext}. Supported: {supported}")

    # Generate unique ID
    doc_id = str(uuid.uuid4())[:12]

    # Save raw file. Anything outside [a-zA-Z0-9._-] becomes "_", which also
    # neutralises path separators in the client-supplied name.
    safe_filename = re.sub(r'[^a-zA-Z0-9._-]', '_', filename)
    file_path = UPLOADS_DIR / f"{doc_id}_{safe_filename}"
    file_path.write_bytes(file_bytes)

    # Extract text
    text = self._extract_text(file_path, ext)
    if not text or len(text.strip()) < 50:
        # Do not keep uploads that yield nothing usable.
        file_path.unlink(missing_ok=True)
        raise ValueError("Document has insufficient text content (< 50 chars)")

    # AI analysis (or keyword-based fallback)
    if self.llm_client:
        analysis = await self._ai_analyze(text, filename)
    else:
        analysis = self._keyword_analyze(text, filename)

    # Build document entry
    doc_entry = {
        "id": doc_id,
        "filename": filename,
        "title": analysis.get("title", filename),
        "source_type": ext.lstrip("."),
        "uploaded_at": datetime.utcnow().isoformat(),
        "processed": True,
        "file_size_bytes": len(file_bytes),
        "summary": analysis.get("summary", ""),
        "vuln_types": analysis.get("vuln_types", []),
        "knowledge_entries": analysis.get("knowledge_entries", []),
    }

    # Add to index
    self._index_document(doc_entry)
    self._save_index()
    logger.info(f"Processed knowledge document: {filename} -> {len(doc_entry['knowledge_entries'])} entries")
    return doc_entry
def _extract_text(self, file_path: Path, ext: str) -> str:
"""Extract text from file based on format."""
if ext == ".pdf":
return self._extract_text_pdf(file_path)
elif ext in (".md", ".txt"):
return self._extract_text_plaintext(file_path)
elif ext in (".html", ".htm"):
return self._extract_text_html(file_path)
return ""
def _extract_text_pdf(self, file_path: Path) -> str:
    """Extract text from a PDF, reading at most the first 50 pages.

    When PyPDF2 is not installed, the file is read as plain text instead
    (undecodable bytes ignored) and capped at 20000 characters.

    Args:
        file_path: Location of the stored PDF.

    Returns:
        Page texts joined by blank lines, the plain-text fallback, or an
        empty string if extraction fails.
    """
    # The flag is HAS_PYPDF2, set by the optional import at module top.
    if not HAS_PYPDF2:
        logger.warning("PyPDF2 not installed - PDF extraction unavailable. Install: pip install PyPDF2")
        # Try reading as text fallback
        try:
            return file_path.read_text(errors="ignore")[:20000]
        except Exception:
            return ""
    try:
        reader = PdfReader(str(file_path))
        # Max 50 pages; pages with no extractable text are skipped.
        page_texts = (page.extract_text() for page in reader.pages[:50])
        return "\n\n".join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        logger.warning(f"PDF extraction failed: {e}")
        return ""
def _extract_text_plaintext(self, file_path: Path) -> str:
"""Read markdown or plain text file."""
try:
return file_path.read_text(errors="ignore")
except Exception:
return ""
def _extract_text_html(self, file_path: Path) -> str:
"""Extract text from HTML by stripping tags."""
try:
html = file_path.read_text(errors="ignore")
# Remove script and style blocks
html = re.sub(r'', '', html, flags=re.DOTALL | re.IGNORECASE)
html = re.sub(r'', '', html, flags=re.DOTALL | re.IGNORECASE)
# Strip all tags
text = re.sub(r'<[^>]+>', ' ', html)
# Clean whitespace
text = re.sub(r'\s+', ' ', text).strip()
return text
except Exception:
return ""
async def _ai_analyze(self, text: str, filename: str) -> dict:
    """Ask the LLM client for structured knowledge about the document.

    Only the first 8000 characters of ``text`` are sent. The first-to-last
    brace span of the reply is parsed as JSON; unknown vuln types are dropped
    from ``vuln_types``, and entries carrying an unknown type are reassigned
    to the first valid type (or "information_disclosure" when none is valid).

    Args:
        text: Full extracted document text.
        filename: Original file name, included in the prompt.

    Returns:
        The parsed and validated analysis, or the keyword-based analysis
        when the reply holds no JSON object or any step fails.
    """
    prompt = AI_ANALYSIS_PROMPT.format(filename=filename, text=text[:8000])
    try:
        reply = await self.llm_client.generate(prompt)
        # Parse JSON from response
        json_span = re.search(r'\{.*\}', reply, re.DOTALL)
        if json_span is not None:
            parsed = json.loads(json_span.group())
            # Validate vuln_types
            known_types = set(VULN_KEYWORDS)
            kept_types = [vt for vt in parsed.get("vuln_types", []) if vt in known_types]
            parsed["vuln_types"] = kept_types
            replacement = kept_types[0] if kept_types else "information_disclosure"
            for entry in parsed.get("knowledge_entries", []):
                if entry.get("vuln_type") not in known_types:
                    entry["vuln_type"] = replacement
            return parsed
    except Exception as exc:
        logger.warning(f"AI analysis failed, falling back to keyword analysis: {exc}")
    return self._keyword_analyze(text, filename)
def _keyword_analyze(self, text: str, filename: str) -> dict:
    """Fallback keyword-based analysis when no LLM is available.

    A vuln type is detected when any of its keywords occurs in the
    lowercased text without a letter or digit directly before it. Without
    that guard, short acronyms match inside ordinary words ("rce" in
    "source", "bola" in "parabola"). Suffixes are still allowed, so
    "sql injections" matches "sql injection".

    Args:
        text: Full extracted document text.
        filename: Original file name, used as the title when the first line
            is too short.

    Returns:
        A dict with "title", "summary" (first 300 characters), "vuln_types"
        (all detected types, or ["information_disclosure"] when none) and
        "knowledge_entries" (one per detected type, at most five).
    """
    text_lower = text.lower()
    detected_types = [
        vuln_type
        for vuln_type, keywords in VULN_KEYWORDS.items()
        if any(
            re.search(r"(?<![a-z0-9])" + re.escape(keyword), text_lower)
            for keyword in keywords
        )
    ]
    if not detected_types:
        detected_types = ["information_disclosure"]

    # Extract title from first line or filename
    first_line = text.strip().split("\n")[0][:200]
    title = first_line if len(first_line) > 10 else filename

    # These sections do not depend on the vuln type, so compute them once.
    # Strings are immutable and safe to share across entries; the list-valued
    # fields are rebuilt per entry so no two entries share one list object.
    methodology = self._extract_section(text, ["methodology", "steps", "approach", "technique"])
    key_insights = self._extract_section(text, ["insight", "key finding", "conclusion", "takeaway"])

    # Build basic entries
    entries = [
        {
            "vuln_type": vt,
            "methodology": methodology,
            "payloads": self._extract_payloads(text),
            "key_insights": key_insights,
            "bypass_techniques": self._extract_payloads_by_pattern(text, ["bypass", "evasion", "waf", "filter"]),
        }
        for vt in detected_types[:5]  # Max 5 types
    ]
    return {
        "title": title.strip("#").strip(),
        "summary": text[:300].strip(),
        "vuln_types": detected_types,
        "knowledge_entries": entries,
    }
def _extract_section(self, text: str, keywords: List[str]) -> str:
"""Extract text section near keywords."""
text_lower = text.lower()
for keyword in keywords:
idx = text_lower.find(keyword)
if idx >= 0:
# Get surrounding context (up to 800 chars after keyword)
start = max(0, idx - 50)
end = min(len(text), idx + 800)
return text[start:end].strip()
return ""
def _extract_payloads(self, text: str) -> List[str]:
"""Extract potential payloads from text."""
payloads = []
# Look for common payload patterns
patterns = [
r'`([^`]{5,200})`', # Backtick-enclosed code
r"'([^']{10,200})'", # Single-quoted strings that look like payloads
]
for pattern in patterns:
matches = re.findall(pattern, text)
for m in matches:
if any(indicator in m.lower() for indicator in
["