NeuroSploit/backend/core/cli_output_parser.py
CyberSecurityUP e0935793c5 NeuroSploit v3.2 - Autonomous AI Penetration Testing Platform
116 modules | 100 vuln types | 18 API routes | 18 frontend pages

Major features:
- VulnEngine: 100 vuln types, 526+ payloads, 12 testers, anti-hallucination prompts
- Autonomous Agent: 3-stream auto pentest, multi-session (5 concurrent), pause/resume/stop
- CLI Agent: Claude Code / Gemini CLI / Codex CLI inside Kali containers
- Validation Pipeline: negative controls, proof of execution, confidence scoring, judge
- AI Reasoning: ReACT engine, token budget, endpoint classifier, CVE hunter, deep recon
- Multi-Agent: 5 specialists + orchestrator + researcher AI + vuln type agents
- RAG System: BM25/TF-IDF/ChromaDB vectorstore, few-shot, reasoning templates
- Smart Router: 20 providers (8 CLI OAuth + 12 API), tier failover, token refresh
- Kali Sandbox: container-per-scan, 56 tools, VPN support, on-demand install
- Full AI Testing: methodology-driven comprehensive pentest sessions
- Notifications: Discord, Telegram, WhatsApp/Twilio multi-channel alerts
- Frontend: React/TypeScript with 18 pages, real-time WebSocket updates
2026-02-22 17:59:28 -03:00

"""
CLI Output Parser - 3-tier finding extraction from CLI agent output.
Tier 1: JSON marker blocks (===FINDING_START=== / ===FINDING_END===)
Tier 2: Regex patterns for known tool output formats (nuclei, nmap, sqlmap)
Tier 3: AI-assisted extraction via LLM for unstructured text
"""
import json
import re
import logging
from dataclasses import dataclass
from typing import List, Dict, Optional, Set
logger = logging.getLogger(__name__)
# JSON finding markers used in CLI instructions
FINDING_START = "===FINDING_START==="
FINDING_END = "===FINDING_END==="
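# Example of the Tier 1 block format expected in CLI output (illustrative
# only; field names match _json_to_finding below, values are invented):
#
#   ===FINDING_START===
#   {"title": "SQL injection in id parameter", "severity": "high",
#    "endpoint": "https://target.example/item?id=1", "parameter": "id",
#    "evidence": "MySQL error in response body"}
#   ===FINDING_END===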
# Progress markers
PHASE_PATTERN = re.compile(r'\[PHASE\]\s*(.+)', re.IGNORECASE)
COMPLETE_PATTERN = re.compile(r'\[COMPLETE\]', re.IGNORECASE)
PROGRESS_PATTERN = re.compile(r'\[PROGRESS\]\s*(\d+)%?\s*(.*)', re.IGNORECASE)
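# Illustrative lines the progress patterns above match (invented examples):
#   [PHASE] recon
#   [PROGRESS] 40% fuzzing parameters
#   [COMPLETE]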
# Severity keywords for regex extraction
SEVERITY_MAP = {
    "critical": "critical", "crit": "critical",
    "high": "high",
    "medium": "medium", "med": "medium",
    "low": "low",
    "info": "info", "informational": "info",
}
# Nuclei JSONL output pattern
NUCLEI_JSON_PATTERN = re.compile(r'^\{.*"template-id".*"matched-at".*\}$', re.MULTILINE)
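# Illustrative JSONL line this pattern is meant to match (field names taken
# from _extract_nuclei_jsonl below; values are invented):
#   {"template-id": "git-config", "matched-at": "https://target.example/.git/config", "info": {"name": "Git Config Disclosure", "severity": "medium", "description": "..."}}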
# Generic vulnerability patterns in CLI output. Named groups let
# _extract_regex_findings read title/severity/endpoint/parameter by name,
# independent of each pattern's group order.
VULN_PATTERNS = [
    # [VULNERABILITY] Title - Severity
    re.compile(
        r'\[(?:VULNERABILITY|VULN|FINDING|ALERT)\]\s*(?P<title>.+?)'
        r'(?:\s*-\s*(?P<severity>critical|high|medium|low|info))?$',
        re.IGNORECASE | re.MULTILINE
    ),
    # SQLMap style: Parameter 'X' is vulnerable
    re.compile(
        r"(?:Parameter|Param)\s+['\"]?(?P<parameter>\w+)['\"]?\s+"
        r"(?:is|appears)\s+(?:vulnerable|injectable)",
        re.IGNORECASE
    ),
    # Nuclei text: [severity] [template-id] URL
    re.compile(
        r'\[(?P<severity>critical|high|medium|low|info)\]\s*'
        r'\[(?P<title>[^\]]+)\]\s*(?P<endpoint>https?://\S+)',
        re.IGNORECASE
    ),
]
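# Sample lines each pattern above is intended to match (invented examples):
#   [VULNERABILITY] Reflected XSS in search form - high
#   Parameter 'username' appears injectable
#   [critical] [sqli-error-based] https://target.example/item?id=1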
@dataclass
class ParsedFinding:
    """A finding extracted from CLI output."""
    title: str
    severity: str = "medium"
    vulnerability_type: str = ""
    endpoint: str = ""
    parameter: str = ""
    evidence: str = ""
    poc_code: str = ""
    request: str = ""
    response: str = ""
    impact: str = ""
    cvss_score: Optional[float] = None
    source: str = "cli_agent"
    def to_dict(self) -> Dict:
        d = {
            "title": self.title,
            "severity": self.severity,
            "vulnerability_type": self.vulnerability_type or self._infer_vuln_type(),
            "affected_endpoint": self.endpoint,
            "parameter": self.parameter,
            "evidence": self.evidence,
            "poc_code": self.poc_code,
            "request": self.request,
            "response": self.response,
            "impact": self.impact,
            "source": self.source,
            "ai_status": "confirmed",
            "ai_verified": True,
            "confidence_score": 70,
        }
        # Explicit None check so a legitimate 0.0 score is not dropped
        if self.cvss_score is not None:
            d["cvss_score"] = self.cvss_score
        return d
    def _infer_vuln_type(self) -> str:
        """Infer vulnerability type from title keywords. More specific keywords
        come first so e.g. 'stored xss' is not swallowed by the generic 'xss'
        key, and 'nosql injection' is not matched as 'sql injection'."""
        title_lower = self.title.lower()
        type_map = {
            "nosql": "nosql_injection",
            "sql injection": "sqli_error", "sqli": "sqli_error",
            "stored xss": "xss_stored", "dom xss": "xss_dom",
            "xss": "xss_reflected", "cross-site scripting": "xss_reflected",
            "command injection": "command_injection", "rce": "command_injection",
            "ssrf": "ssrf", "server-side request": "ssrf",
            "lfi": "lfi", "local file": "lfi", "path traversal": "path_traversal",
            "rfi": "rfi", "remote file": "rfi",
            "xxe": "xxe", "xml external": "xxe",
            "ssti": "ssti", "template injection": "ssti",
            "csrf": "csrf", "cross-site request": "csrf",
            "idor": "idor", "insecure direct": "idor",
            "open redirect": "open_redirect",
            "file upload": "file_upload",
            "directory listing": "directory_listing",
            "information disclosure": "information_disclosure",
            "sensitive data": "sensitive_data_exposure",
            "security header": "security_headers",
            "ssl": "ssl_issues", "tls": "ssl_issues",
            "cors": "cors_misconfig",
            "crlf": "crlf_injection",
            "ldap": "ldap_injection",
            "jwt": "jwt_manipulation",
            "auth bypass": "auth_bypass",
            "brute force": "brute_force",
            "rate limit": "rate_limit_bypass",
            "clickjacking": "clickjacking",
            "http smuggling": "http_smuggling",
            "cache poison": "cache_poisoning",
            "deserialization": "insecure_deserialization",
            "prototype pollution": "prototype_pollution",
            "graphql": "graphql_injection",
            "host header": "host_header_injection",
            "race condition": "race_condition",
            "business logic": "business_logic",
        }
        for keyword, vtype in type_map.items():
            if keyword in title_lower:
                return vtype
        return "unknown"
class CLIOutputParser:
    """3-tier output parser for CLI agent findings."""
    def __init__(self):
        self._seen_finding_hashes: Set[str] = set()
        self._buffer = ""  # Accumulates partial JSON blocks across chunks
        self._unparsed_chunks: List[str] = []
        self._total_findings = 0
        self._phases_seen: List[str] = []
        self._is_complete = False
    def parse_chunk(self, text: str) -> List[ParsedFinding]:
        """Parse a chunk of CLI output. Returns newly extracted findings."""
        if not text or not text.strip():
            return []
        findings: List[ParsedFinding] = []
        # Track progress markers
        for m in PHASE_PATTERN.finditer(text):
            phase = m.group(1).strip()
            if phase not in self._phases_seen:
                self._phases_seen.append(phase)
                logger.info(f"[CLI-PARSER] Phase: {phase}")
        if COMPLETE_PATTERN.search(text):
            self._is_complete = True
        # Tier 1: JSON marker blocks
        combined = self._buffer + text
        tier1 = self._extract_json_markers(combined)
        findings.extend(tier1)
        # Tier 2: Regex patterns
        tier2 = self._extract_regex_findings(text)
        findings.extend(tier2)
        # Tier 2b: Nuclei JSONL
        tier2b = self._extract_nuclei_jsonl(text)
        findings.extend(tier2b)
        # Track unparsed text for later AI extraction
        if not tier1 and not tier2 and not tier2b:
            if len(text.strip()) > 50:
                self._unparsed_chunks.append(text)
        # Deduplicate
        unique = []
        for f in findings:
            h = f"{f.title}|{f.endpoint}|{f.severity}"
            if h not in self._seen_finding_hashes:
                self._seen_finding_hashes.add(h)
                unique.append(f)
                self._total_findings += 1
        return unique
    def get_unparsed_text(self, clear: bool = True) -> str:
        """Get accumulated unparsed text for AI extraction."""
        text = "\n".join(self._unparsed_chunks)
        if clear:
            self._unparsed_chunks = []
        return text
    @property
    def is_complete(self) -> bool:
        return self._is_complete
    @property
    def phases(self) -> List[str]:
        return self._phases_seen
    @property
    def total_findings(self) -> int:
        return self._total_findings
    def _extract_json_markers(self, text: str) -> List[ParsedFinding]:
        """Tier 1: Extract findings from ===FINDING_START=== / ===FINDING_END=== blocks."""
        findings = []
        remaining_buffer = ""
        # Find all complete blocks
        parts = text.split(FINDING_START)
        for i, part in enumerate(parts):
            if i == 0:
                continue  # Text before first marker
            if FINDING_END in part:
                json_text, _after = part.split(FINDING_END, 1)
                json_text = json_text.strip()
                try:
                    data = json.loads(json_text)
                    f = self._json_to_finding(data)
                    if f:
                        findings.append(f)
                except json.JSONDecodeError:
                    # Try to fix common JSON issues
                    fixed = self._try_fix_json(json_text)
                    if fixed:
                        f = self._json_to_finding(fixed)
                        if f:
                            findings.append(f)
                    else:
                        logger.debug(f"[CLI-PARSER] Invalid JSON in marker block: {json_text[:100]}")
            else:
                # Incomplete block - save to buffer for next chunk
                remaining_buffer = FINDING_START + part
        # Assign after the loop so a fully parsed buffer is cleared and is not
        # re-parsed (or grown without bound) on the next chunk.
        self._buffer = remaining_buffer
        return findings
    def _extract_regex_findings(self, text: str) -> List[ParsedFinding]:
        """Tier 2: Extract findings using regex patterns."""
        findings = []
        for pattern in VULN_PATTERNS:
            for match in pattern.finditer(text):
                # Read fields by name; patterns define different subsets
                groups = match.groupdict()
                title = (groups.get("title") or "").strip()
                parameter = (groups.get("parameter") or "").strip()
                endpoint = (groups.get("endpoint") or "").strip()
                sev = (groups.get("severity") or "").lower().strip()
                severity = SEVERITY_MAP.get(sev, "medium")
                # SQLMap-style matches only capture the parameter name
                if not title and parameter:
                    title = f"Injectable parameter '{parameter}'"
                # Skip very short or generic titles
                if len(title) < 5 or title.lower() in ("n/a", "none", "test"):
                    continue
                findings.append(ParsedFinding(
                    title=title,
                    severity=severity,
                    endpoint=endpoint,
                    parameter=parameter,
                    evidence=match.group(0),
                ))
        return findings
    def _extract_nuclei_jsonl(self, text: str) -> List[ParsedFinding]:
        """Tier 2b: Extract findings from Nuclei JSONL output."""
        findings = []
        for match in NUCLEI_JSON_PATTERN.finditer(text):
            try:
                data = json.loads(match.group(0))
                template_id = data.get("template-id", "")
                matched_at = data.get("matched-at", "")
                info = data.get("info", {})
                severity = info.get("severity", "medium").lower()
                name = info.get("name", template_id)
                description = info.get("description", "")
                findings.append(ParsedFinding(
                    title=f"[Nuclei] {name}",
                    severity=SEVERITY_MAP.get(severity, "medium"),
                    vulnerability_type=self._nuclei_to_vuln_type(template_id),
                    endpoint=matched_at,
                    evidence=f"Template: {template_id}\n{description}",
                    poc_code=f"nuclei -t {template_id} -u {matched_at}",
                ))
            except json.JSONDecodeError:
                continue
        return findings
    def _json_to_finding(self, data: Dict) -> Optional[ParsedFinding]:
        """Convert a JSON dict to ParsedFinding."""
        title = data.get("title", "").strip()
        if not title:
            return None
        severity = data.get("severity", "medium").lower()
        severity = SEVERITY_MAP.get(severity, severity)
        if severity not in ("critical", "high", "medium", "low", "info"):
            severity = "medium"
        return ParsedFinding(
            title=title,
            severity=severity,
            vulnerability_type=data.get("vulnerability_type", ""),
            endpoint=data.get("endpoint", data.get("affected_endpoint", "")),
            parameter=data.get("parameter", ""),
            evidence=data.get("evidence", ""),
            poc_code=data.get("poc_code", data.get("poc", "")),
            request=data.get("request", ""),
            response=data.get("response", ""),
            impact=data.get("impact", ""),
            cvss_score=data.get("cvss_score"),
        )
    @staticmethod
    def _try_fix_json(text: str) -> Optional[Dict]:
        """Try to fix common JSON issues."""
        # Remove trailing commas
        fixed = re.sub(r',\s*}', '}', text)
        fixed = re.sub(r',\s*]', ']', fixed)
        # Try to parse
        try:
            return json.loads(fixed)
        except json.JSONDecodeError:
            pass
        # Try wrapping in braces
        if not fixed.startswith('{'):
            try:
                return json.loads('{' + fixed + '}')
            except json.JSONDecodeError:
                pass
        return None
    @staticmethod
    def _nuclei_to_vuln_type(template_id: str) -> str:
        """Map nuclei template ID to vulnerability type."""
        tid = template_id.lower()
        mappings = {
            "sqli": "sqli_error", "sql-injection": "sqli_error",
            "xss": "xss_reflected", "cross-site-scripting": "xss_reflected",
            "ssrf": "ssrf", "server-side-request": "ssrf",
            "lfi": "lfi", "local-file": "lfi",
            "rfi": "rfi", "remote-file": "rfi",
            "rce": "command_injection", "command-injection": "command_injection",
            "ssti": "ssti", "template-injection": "ssti",
            "xxe": "xxe", "xml-external": "xxe",
            "redirect": "open_redirect",
            "cors": "cors_misconfig",
            "crlf": "crlf_injection",
            "csrf": "csrf",
            "header-injection": "header_injection",
            "directory-listing": "directory_listing",
            "info-disclosure": "information_disclosure",
            "exposure": "sensitive_data_exposure",
            "ssl": "ssl_issues", "tls": "ssl_issues",
            "default-login": "default_credentials",
            "misconfig": "security_headers",
        }
        for key, vtype in mappings.items():
            if key in tid:
                return vtype
        return "unknown"
# AI-assisted extraction prompt template
AI_EXTRACT_PROMPT = """Analyze this penetration testing CLI output and extract any CONFIRMED vulnerability findings.
IMPORTANT: Only extract findings where there is clear evidence of a vulnerability (error messages,
data leakage, successful exploitation). Do NOT extract theoretical or untested issues.
CLI Output:
{output}
For each confirmed finding, provide:
- title: concise vulnerability name
- severity: critical|high|medium|low|info
- vulnerability_type: e.g., sqli_error, xss_reflected, ssrf, command_injection, etc.
- endpoint: the affected URL
- parameter: affected parameter (if applicable)
- evidence: the actual proof (HTTP response, error, data leaked)
- poc_code: the command or request that confirmed it
Respond ONLY with valid JSON:
{{"findings": [{{"title": "...", "severity": "...", "vulnerability_type": "...", "endpoint": "...", "parameter": "...", "evidence": "...", "poc_code": "..."}}]}}
If no confirmed findings, respond: {{"findings": []}}"""
async def ai_extract_findings(text: str, llm, max_chars: int = 8000) -> List[ParsedFinding]:
    """Tier 3: AI-assisted extraction of findings from unstructured CLI output."""
    if not text or len(text.strip()) < 100:
        return []
    # Truncate to max_chars
    if len(text) > max_chars:
        text = text[:max_chars] + "\n... [truncated]"
    prompt = AI_EXTRACT_PROMPT.format(output=text)
    try:
        response = await llm.generate(
            prompt=prompt,
            system="You are a security finding extractor. Extract only confirmed vulnerabilities with real evidence.",
            max_tokens=2000,
        )
        if not response:
            return []
        # Extract JSON from response
        json_match = re.search(r'\{.*"findings".*\}', response, re.DOTALL)
        if not json_match:
            return []
        data = json.loads(json_match.group(0))
        findings_data = data.get("findings", [])
        findings = []
        for fd in findings_data:
            if not fd.get("title"):
                continue
            findings.append(ParsedFinding(
                title=fd["title"],
                severity=fd.get("severity", "medium"),
                vulnerability_type=fd.get("vulnerability_type", ""),
                endpoint=fd.get("endpoint", ""),
                parameter=fd.get("parameter", ""),
                evidence=fd.get("evidence", ""),
                poc_code=fd.get("poc_code", ""),
            ))
        logger.info(f"[CLI-PARSER] AI extracted {len(findings)} findings")
        return findings
    except Exception as e:
        logger.warning(f"[CLI-PARSER] AI extraction failed: {e}")
        return []
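
if __name__ == "__main__":
    # Smoke test with canned output (invented data): exercises Tier 1 marker
    # parsing and the Tier 2 nuclei-text pattern without any LLM call.
    sample = (
        "[PHASE] exploitation\n"
        f"{FINDING_START}\n"
        '{"title": "SQL injection in id parameter", "severity": "high",\n'
        ' "endpoint": "https://target.example/item?id=1", "parameter": "id"}\n'
        f"{FINDING_END}\n"
        "[critical] [sqli-error-based] https://target.example/item?id=1\n"
        "[COMPLETE]\n"
    )
    parser = CLIOutputParser()
    for finding in parser.parse_chunk(sample):
        print(finding.severity, finding.title, finding.endpoint)
    print("phases:", parser.phases, "complete:", parser.is_complete)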