Files
NeuroSploit/backend/core/deep_recon.py
CyberSecurityUP e0935793c5 NeuroSploit v3.2 - Autonomous AI Penetration Testing Platform
116 modules | 100 vuln types | 18 API routes | 18 frontend pages

Major features:
- VulnEngine: 100 vuln types, 526+ payloads, 12 testers, anti-hallucination prompts
- Autonomous Agent: 3-stream auto pentest, multi-session (5 concurrent), pause/resume/stop
- CLI Agent: Claude Code / Gemini CLI / Codex CLI inside Kali containers
- Validation Pipeline: negative controls, proof of execution, confidence scoring, judge
- AI Reasoning: ReACT engine, token budget, endpoint classifier, CVE hunter, deep recon
- Multi-Agent: 5 specialists + orchestrator + researcher AI + vuln type agents
- RAG System: BM25/TF-IDF/ChromaDB vectorstore, few-shot, reasoning templates
- Smart Router: 20 providers (8 CLI OAuth + 12 API), tier failover, token refresh
- Kali Sandbox: container-per-scan, 56 tools, VPN support, on-demand install
- Full IA Testing: methodology-driven comprehensive pentest sessions
- Notifications: Discord, Telegram, WhatsApp/Twilio multi-channel alerts
- Frontend: React/TypeScript with 18 pages, real-time WebSocket updates
2026-02-22 17:59:28 -03:00

378 lines
14 KiB
Python

"""
Advanced reconnaissance module for NeuroSploitv2.
Performs deep JS analysis, sitemap/robots parsing, API enumeration,
and technology fingerprinting using async HTTP requests.
"""
import re
import json
import asyncio
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse
try:
import aiohttp
HAS_AIOHTTP = True
except ImportError:
HAS_AIOHTTP = False
try:
from xml.etree import ElementTree as ET
except ImportError:
ET = None
# Timeout applied to the session created by DeepRecon._get_session():
# a 10-second total budget per request. None when aiohttp is not installed.
REQUEST_TIMEOUT = aiohttp.ClientTimeout(total=10) if HAS_AIOHTTP else None
# Maximum number of JS URLs that crawl_js_files() will fetch per call.
MAX_JS_FILES = 10
# Byte limit handed to _fetch() for every JS file.
MAX_JS_SIZE = 500 * 1024 # 500 KB
# Maximum number of URLs parse_sitemap() returns.
MAX_SITEMAP_URLS = 200
# --- Regex patterns for JS analysis ---
# Versioned REST-style paths such as "/api/v1/users". After the version
# segment only lowercase letters, "_" and "/" are accepted.
# NOTE(review): digits, hyphens and uppercase end the match, so
# "/api/v2/user-profile" is captured as "/api/v2/user" - confirm this
# truncation is intended.
RE_API_ENDPOINT = re.compile(r'/api/v[0-9]+/[a-z_/]+')
# First quoted string literal passed to fetch(...); group 1 is the URL.
RE_FETCH_URL = re.compile(r'fetch\(\s*["\']([^"\']+)["\']')
# First quoted argument of axios.get/post/put/patch/delete(...); group 1 is the URL.
RE_AXIOS_URL = re.compile(r'axios\.(?:get|post|put|patch|delete)\(\s*["\']([^"\']+)["\']')
# Quoted "url:" value inside a $.ajax({...}) options object; group 1 is the URL.
# The pattern contains no bare ".", so re.DOTALL has no effect here.
RE_AJAX_URL = re.compile(r'\$\.ajax\(\s*\{[^}]*url\s*:\s*["\']([^"\']+)["\']', re.DOTALL)
# Second argument of .open("METHOD", "url") calls (XMLHttpRequest style); group 1 is the URL.
RE_XHR_URL = re.compile(r'\.open\(\s*["\'][A-Z]+["\']\s*,\s*["\']([^"\']+)["\']')
# Token shapes treated as leaked credentials, one alternative per shape:
# "sk-" keys, "pk_live_"/"pk_test_" keys, "AKIA" + 16 chars, "ghp_" + 36 chars,
# "glpat-" tokens, and two dot-separated base64url segments starting with "eyJ".
# NOTE(review): these look like OpenAI/Stripe/AWS/GitHub/GitLab/JWT formats -
# the vendor mapping is inferred from the prefixes, not stated in this file.
RE_API_KEY = re.compile(
r'(?:sk-[a-zA-Z0-9]{20,}|pk_(?:live|test)_[a-zA-Z0-9]{20,}'
r'|AKIA[0-9A-Z]{16}'
r'|ghp_[a-zA-Z0-9]{36}'
r'|glpat-[a-zA-Z0-9\-]{20,}'
r'|eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,})'
)
# http(s) URLs whose host is localhost, 127.0.0.1, 10.x.x.x, 192.168.x.x or
# 172.16-31.x.x; the match runs until whitespace or a quote character.
# Octets are matched with \d+, so values above 255 are not rejected.
RE_INTERNAL_URL = re.compile(
r'https?://(?:localhost|127\.0\.0\.1|10\.\d+\.\d+\.\d+|192\.168\.\d+\.\d+|172\.(?:1[6-9]|2\d|3[01])\.\d+\.\d+)[^\s"\']*'
)
# Client-side route definitions; group 1 is the route path.
# "path" followed by ":" or "=" and a quoted value that starts with "/".
RE_REACT_ROUTE = re.compile(r'path\s*[:=]\s*["\'](/[^"\']*)["\']')
# "path:" followed by any non-empty quoted value (leading "/" not required).
RE_ANGULAR_ROUTE = re.compile(r'path\s*:\s*["\']([^"\']+)["\']')
# "path:" followed by a quoted value starting with "/"; every match of this
# pattern is also matched by RE_REACT_ROUTE.
RE_VUE_ROUTE = re.compile(r'path\s*:\s*["\'](/[^"\']*)["\']')
@dataclass
class JSAnalysisResult:
    """Findings gathered while scanning JavaScript sources.

    Every field defaults to its own fresh, empty list.
    """

    # Endpoint and route URLs discovered in the scripts.
    endpoints: List[str] = field(default_factory=list)
    # Strings that matched the credential/token patterns.
    api_keys: List[str] = field(default_factory=list)
    # URLs pointing at loopback or private-range hosts.
    internal_urls: List[str] = field(default_factory=list)
    # Sensitive values; crawl_js_files() stores each API key here as well.
    secrets: List[str] = field(default_factory=list)
@dataclass
class APISchema:
    """API description recovered from Swagger/OpenAPI docs or GraphQL introspection."""

    # One dict per operation, built with the keys "url", "method" and "params".
    endpoints: List[Dict] = field(default_factory=list)
    # Version string taken from the document, or "graphql" for introspection results.
    version: str = ""
    # Path the schema was obtained from (e.g. the API-doc path or "/graphql").
    source: str = ""
class DeepRecon:
"""Advanced reconnaissance: JS analysis, sitemap, robots, API enum, fingerprinting."""
def __init__(self, session: Optional["aiohttp.ClientSession"] = None):
    """Store an optional caller-supplied HTTP session.

    Args:
        session: Existing aiohttp session to reuse. When omitted, a session
            is created lazily on first use.
    """
    self._session = session
    # Remember whether the session belongs to the caller, so close() leaves
    # caller-owned sessions untouched.
    self._external_session = session is not None
async def _get_session(self) -> "aiohttp.ClientSession":
    """Return a usable HTTP session, creating one when none is open.

    A session that is missing or already closed is replaced by a new one
    built with REQUEST_TIMEOUT; the replacement is owned by this object.
    """
    current = self._session
    if current is not None and not current.closed:
        return current
    self._session = aiohttp.ClientSession(timeout=REQUEST_TIMEOUT)
    # The fresh session was created here, so close() is allowed to close it.
    self._external_session = False
    return self._session
async def close(self):
    """Close the HTTP session, but only if this object created it."""
    session = self._session
    # Nothing to do for caller-owned, missing, or already-closed sessions.
    if self._external_session or not session or session.closed:
        return
    await session.close()
async def _fetch(self, url: str, max_size: int = 0) -> Optional[str]:
    """Fetch ``url`` with GET and return the body as text.

    Args:
        url: Absolute URL to request. Redirects are followed and TLS
            certificate verification is disabled (``ssl=False``).
        max_size: Maximum number of body bytes to read. A non-positive value
            disables the limit and the whole body is decoded by aiohttp.

    Returns:
        The response text, or ``None`` when the status is not 200 or any
        error occurs while connecting, reading, or decoding.
    """
    try:
        session = await self._get_session()
        async with session.get(url, ssl=False, allow_redirects=True) as resp:
            if resp.status != 200:
                return None
            if max_size <= 0:
                return await resp.text()
            # StreamReader.read(n) returns as soon as *some* data is
            # buffered, which may be far fewer than n bytes. Keep reading
            # until the limit is reached or the stream ends, otherwise the
            # body would be cut to the first network chunk.
            chunks = []
            remaining = max_size
            while remaining > 0:
                chunk = await resp.content.read(remaining)
                if not chunk:
                    break
                chunks.append(chunk)
                remaining -= len(chunk)
            return b"".join(chunks).decode("utf-8", errors="replace")
    except Exception:
        # Deliberately best-effort: recon probes hit many hosts/paths that
        # are expected to fail, and callers treat None as "not available".
        return None
# ------------------------------------------------------------------
# JS file analysis
# ------------------------------------------------------------------
async def crawl_js_files(self, base_url: str, js_urls: List[str]) -> JSAnalysisResult:
    """Download JavaScript files and mine them for endpoints and secrets.

    Args:
        base_url: URL used to resolve relative script URLs and relative
            endpoints found inside the scripts.
        js_urls: Script URLs; only the first MAX_JS_FILES are fetched, each
            limited to MAX_JS_SIZE bytes.

    Returns:
        A JSAnalysisResult with de-duplicated endpoints (sorted before
        resolution), API keys (mirrored into ``secrets``) and internal URLs.
    """
    findings = JSAnalysisResult()
    fetches = (
        self._fetch(urljoin(base_url, js_url), max_size=MAX_JS_SIZE)
        for js_url in js_urls[:MAX_JS_FILES]
    )
    responses = await asyncio.gather(*fetches, return_exceptions=True)

    # Patterns whose first capture group holds the URL or route path:
    # HTTP client calls first, then router definitions.
    capture_patterns = (
        RE_FETCH_URL, RE_AXIOS_URL, RE_AJAX_URL, RE_XHR_URL,
        RE_REACT_ROUTE, RE_ANGULAR_ROUTE, RE_VUE_ROUTE,
    )
    raw_endpoints: set = set()

    for text in responses:
        # Failed downloads come back as None (or an exception object).
        if not isinstance(text, str):
            continue

        raw_endpoints.update(hit.group(0) for hit in RE_API_ENDPOINT.finditer(text))
        for pattern in capture_patterns:
            raw_endpoints.update(hit.group(1) for hit in pattern.finditer(text))

        # API keys / tokens: keep first-seen order, mirror into secrets.
        for hit in RE_API_KEY.finditer(text):
            token = hit.group(0)
            if token in findings.api_keys:
                continue
            findings.api_keys.append(token)
            findings.secrets.append(token)

        # Internal / private URLs, first-seen order.
        for hit in RE_INTERNAL_URL.finditer(text):
            private_url = hit.group(0)
            if private_url in findings.internal_urls:
                continue
            findings.internal_urls.append(private_url)

    # Make every endpoint absolute; anything starting with "http" is kept as-is.
    for candidate in sorted(raw_endpoints):
        if candidate.startswith("http"):
            absolute = candidate
        else:
            absolute = urljoin(base_url, candidate)
        if absolute not in findings.endpoints:
            findings.endpoints.append(absolute)
    return findings
# ------------------------------------------------------------------
# Sitemap parsing
# ------------------------------------------------------------------
async def parse_sitemap(self, target: str) -> List[str]:
    """Collect URLs advertised by the target's sitemap files.

    Probes ``/sitemap.xml``, ``/sitemap_index.xml`` and ``/sitemap1.xml``.
    Every ``<loc>`` value is collected. When a document is a sitemap index,
    its child sitemaps on the same host are also fetched (at most 10 in
    total) so the page URLs behind the index are discovered, not just the
    child sitemap locations.

    Args:
        target: Base URL of the site; a trailing slash is ignored.

    Returns:
        A sorted list of at most MAX_SITEMAP_URLS distinct URLs. Empty when
        no sitemap could be fetched or parsed, or no XML parser is available.
    """
    if ET is None:
        # Without a parser nothing can be extracted; skip the requests.
        return []

    target = target.rstrip("/")
    target_host = urlparse(target).netloc.lower()
    queue = [
        f"{target}/sitemap.xml",
        f"{target}/sitemap_index.xml",
        f"{target}/sitemap1.xml",
    ]
    max_child_sitemaps = 10  # bound on extra requests caused by index files
    children_queued = 0
    visited: set = set()
    urls: set = set()

    while queue:
        sitemap_url = queue.pop(0)
        if sitemap_url in visited:
            continue
        visited.add(sitemap_url)

        body = await self._fetch(sitemap_url)
        if not body:
            continue
        try:
            # A BOM or blank lines before the XML declaration make the
            # parser reject otherwise valid documents.
            root = ET.fromstring(body.lstrip("\ufeff \t\r\n"))
        except ET.ParseError:
            continue

        # Tags carry a "{namespace}" prefix; compare on the local name only.
        is_index = root.tag.rsplit("}", 1)[-1] == "sitemapindex"
        for elem in root.iter():
            if elem.tag.rsplit("}", 1)[-1] != "loc" or not elem.text:
                continue
            loc = elem.text.strip()
            if not loc:
                continue
            urls.add(loc)
            if len(urls) >= MAX_SITEMAP_URLS:
                return sorted(urls)[:MAX_SITEMAP_URLS]
            # Follow child sitemaps of an index, staying on the target host
            # so the scan does not wander to third-party servers.
            if (
                is_index
                and children_queued < max_child_sitemaps
                and loc not in visited
                and urlparse(loc).netloc.lower() == target_host
            ):
                queue.append(loc)
                children_queued += 1
    return sorted(urls)[:MAX_SITEMAP_URLS]
# ------------------------------------------------------------------
# Robots.txt parsing
# ------------------------------------------------------------------
async def parse_robots(self, target: str) -> List[str]:
    """Parse ``robots.txt`` and return the paths it mentions as absolute URLs.

    Both ``Disallow`` and ``Allow`` values are collected, since either one
    reveals paths the site owner considered worth listing.

    Args:
        target: Base URL of the site; a trailing slash is ignored.

    Returns:
        A sorted list of distinct URLs resolved against ``target``. Empty
        when robots.txt is missing or contains no usable rules.
    """
    target = target.rstrip("/")
    body = await self._fetch(f"{target}/robots.txt")
    if not body:
        return []

    base = target + "/"
    paths: set = set()
    # A UTF-8 BOM would otherwise become part of the first directive name.
    for raw_line in body.lstrip("\ufeff").splitlines():
        # "#" starts a comment anywhere on the line, not only at column 0;
        # dropping it keeps trailing remarks out of the resolved URL.
        line = raw_line.split("#", 1)[0].strip()
        directive, separator, value = line.partition(":")
        if not separator:
            continue
        if directive.strip().lower() not in ("disallow", "allow"):
            continue
        value = value.strip()
        if value:
            paths.add(urljoin(base, value))
    return sorted(paths)
# ------------------------------------------------------------------
# API enumeration (Swagger / OpenAPI / GraphQL)
# ------------------------------------------------------------------
# Paths probed, in this order, by enumerate_api() when looking for a
# Swagger/OpenAPI JSON document; each is appended to the target base URL.
_API_DOC_PATHS = [
"/swagger.json",
"/openapi.json",
"/api-docs",
"/v2/api-docs",
"/swagger/v1/swagger.json",
"/.well-known/openapi",
"/api/swagger.json",
]
async def enumerate_api(self, target: str, technologies: List[str]) -> APISchema:
    """Discover and parse API documentation (OpenAPI/Swagger, then GraphQL).

    Each path in ``_API_DOC_PATHS`` is fetched in order. The first response
    that is a JSON object containing ``paths``, ``openapi`` or ``swagger``
    is converted into an APISchema and returned immediately. If none
    matches, a GraphQL introspection query is attempted.

    Args:
        target: Base URL of the site; a trailing slash is ignored.
        technologies: Detected technology names; a case-insensitive
            "graphql" entry requests GraphQL introspection.

    Returns:
        The parsed schema, or an empty APISchema when nothing was found.
    """
    target = target.rstrip("/")
    schema = APISchema()
    http_methods = ("get", "post", "put", "patch", "delete", "options", "head", "trace")

    # Try OpenAPI / Swagger endpoints
    for path in self._API_DOC_PATHS:
        body = await self._fetch(f"{target}{path}")
        if not body:
            continue
        try:
            doc = json.loads(body)
        except (json.JSONDecodeError, ValueError):
            continue
        # Valid JSON is not necessarily an object: a list or string would
        # pass the "in" checks below and then fail on .get().
        if not isinstance(doc, dict):
            continue
        if not ("paths" in doc or "openapi" in doc or "swagger" in doc):
            continue

        info = doc.get("info")
        info_version = info.get("version", "") if isinstance(info, dict) else ""
        schema.version = doc.get("openapi", info_version)
        schema.source = path

        routes = doc.get("paths")
        if not isinstance(routes, dict):
            routes = {}
        for route, operations in routes.items():
            if not isinstance(operations, dict):
                continue
            for method, detail in operations.items():
                # Path items also hold non-operation keys such as
                # "parameters" or "summary"; keep real HTTP methods only.
                if method.lower() not in http_methods:
                    continue
                declared = detail.get("parameters") if isinstance(detail, dict) else None
                if not isinstance(declared, list):
                    declared = []
                params = [p.get("name", "") for p in declared if isinstance(p, dict)]
                schema.endpoints.append({
                    "url": route,
                    "method": method.upper(),
                    "params": params,
                })
        return schema

    # GraphQL introspection. No document was accepted above, so
    # schema.endpoints is still empty and this branch always runs; the
    # technology hint is kept so the intent stays explicit.
    wants_graphql = any(
        isinstance(tech, str) and tech.lower() == "graphql"
        for tech in (technologies or [])
    )
    if wants_graphql or not schema.endpoints:
        introspection = await self._graphql_introspect(target)
        if introspection:
            return introspection
    return schema
async def _graphql_introspect(self, target: str) -> Optional[APISchema]:
    """Send a GraphQL introspection query to ``{target}/graphql``.

    Args:
        target: Base URL of the site, without a trailing slash.

    Returns:
        An APISchema with one entry per field of every non-internal type
        (``url`` is ``/<type>/<field>``, ``method`` is ``"QUERY"``), or
        ``None`` when the request fails, the status is not 200, the reply is
        not an introspection result, or no fields were found.
    """
    query = '{"query":"{ __schema { queryType { name } types { name fields { name args { name } } } } }"}'
    try:
        session = await self._get_session()
        headers = {"Content-Type": "application/json"}
        async with session.post(
            f"{target}/graphql", data=query, headers=headers, ssl=False
        ) as resp:
            if resp.status != 200:
                return None
            data = await resp.json()
    except Exception:
        # Best-effort probe: network errors and non-JSON replies mean
        # "no GraphQL here".
        return None

    # Servers with introspection disabled commonly answer
    # {"data": null, "errors": [...]}; every level may be null or have an
    # unexpected type, so validate before descending.
    payload = data.get("data") if isinstance(data, dict) else None
    introspection = payload.get("__schema") if isinstance(payload, dict) else None
    if not isinstance(introspection, dict):
        return None

    schema = APISchema(version="graphql", source="/graphql")
    for type_info in introspection.get("types") or []:
        if not isinstance(type_info, dict):
            continue
        type_name = type_info.get("name") or ""
        # Names starting with "__" belong to the introspection system itself.
        if type_name.startswith("__"):
            continue
        for fld in type_info.get("fields") or []:
            if not isinstance(fld, dict) or "name" not in fld:
                continue
            params = [
                arg["name"]
                for arg in fld.get("args") or []
                if isinstance(arg, dict) and "name" in arg
            ]
            schema.endpoints.append({
                "url": f"/{type_name}/{fld['name']}",
                "method": "QUERY",
                "params": params,
            })
    return schema if schema.endpoints else None
# ------------------------------------------------------------------
# Deep technology fingerprinting
# ------------------------------------------------------------------
# Generic files fetched by deep_fingerprint(); ".json" entries are parsed as
# JSON, the rest are searched with RE_VERSION.
_FINGERPRINT_FILES = [
"/readme.txt", "/README.md", "/CHANGELOG.md", "/CHANGES.txt",
"/package.json", "/composer.json",
]
# Paths probed by deep_fingerprint() to detect WordPress.
_WP_PROBES = [
"/wp-links-opml.php",
"/wp-includes/js/wp-embed.min.js",
]
# Paths probed by deep_fingerprint() to detect Drupal.
_DRUPAL_PROBES = [
"/CHANGELOG.txt",
"/core/CHANGELOG.txt",
]
# "version" (optionally quoted) followed by ":" or "=" and a dotted number;
# group 1 is the version string.
RE_VERSION = re.compile(r'["\']?version["\']?\s*[:=]\s*["\']?(\d+\.\d+[\w.\-]*)')
# "ver=<dotted number>"; group 1 is the version.
RE_WP_VER = re.compile(r'ver=(\d+\.\d+[\w.\-]*)')
# "Drupal <dotted number>"; group 1 is the version.
RE_DRUPAL_VER = re.compile(r'Drupal\s+(\d+\.\d+[\w.\-]*)')
async def deep_fingerprint(
    self, target: str, headers: Dict, body: str
) -> List[Dict]:
    """Detect software and versions from well-known files and CMS probes.

    Args:
        target: Base URL of the site; a trailing slash is ignored.
        headers: Response headers of the target. Currently unused; kept so
            the signature stays stable for callers.
        body: Response body of the target. Currently unused, same reason.

    Returns:
        A list of ``{"software", "version", "source"}`` dicts, unique per
        (lower-cased software, version) pair, in discovery order: generic
        files first, then WordPress probes, then Drupal probes.
    """
    target = target.rstrip("/")
    results: List[Dict] = []
    seen: set = set()

    def _add(software: str, version: str, source: str):
        key = (software.lower(), version)
        if key not in seen:
            seen.add(key)
            results.append({"software": software, "version": version, "source": source})

    # Generic version files, fetched concurrently.
    paths = list(self._FINGERPRINT_FILES)
    contents = await asyncio.gather(
        *(self._fetch(f"{target}{path}") for path in paths),
        return_exceptions=True,
    )
    for path, content in zip(paths, contents):
        if not isinstance(content, str):
            continue
        if path.endswith(".json"):
            try:
                doc = json.loads(content)
            except (json.JSONDecodeError, ValueError):
                continue
            # Valid JSON may be a list or scalar (e.g. a soft-404 payload);
            # only objects carry name/version keys.
            if not isinstance(doc, dict):
                continue
            name = doc.get("name")
            if not isinstance(name, str) or not name:
                name = "unknown"
            ver = doc.get("version")
            # Accept numeric versions such as 1.0 but report them as text.
            if isinstance(ver, (int, float)) and not isinstance(ver, bool):
                ver = str(ver)
            if isinstance(ver, str) and ver:
                _add(name, ver, path)
        else:
            m = self.RE_VERSION.search(content)
            if m:
                _add("unknown", m.group(1), path)

    # WordPress probes
    for wp_path in self._WP_PROBES:
        content = await self._fetch(f"{target}{wp_path}")
        if not content:
            continue
        m = self.RE_WP_VER.search(content)
        if m:
            _add("WordPress", m.group(1), wp_path)
        elif "WordPress" in content or "wp-" in content:
            # The file looks like WordPress but exposes no version marker.
            _add("WordPress", "unknown", wp_path)

    # Drupal probes
    for dp_path in self._DRUPAL_PROBES:
        content = await self._fetch(f"{target}{dp_path}")
        if not content:
            continue
        m = self.RE_DRUPAL_VER.search(content)
        if m:
            _add("Drupal", m.group(1), dp_path)
    return results