Files
NeuroSploit/core/knowledge_augmentor.py
CyberSecurityUP e0935793c5 NeuroSploit v3.2 - Autonomous AI Penetration Testing Platform
116 modules | 100 vuln types | 18 API routes | 18 frontend pages

Major features:
- VulnEngine: 100 vuln types, 526+ payloads, 12 testers, anti-hallucination prompts
- Autonomous Agent: 3-stream auto pentest, multi-session (5 concurrent), pause/resume/stop
- CLI Agent: Claude Code / Gemini CLI / Codex CLI inside Kali containers
- Validation Pipeline: negative controls, proof of execution, confidence scoring, judge
- AI Reasoning: ReACT engine, token budget, endpoint classifier, CVE hunter, deep recon
- Multi-Agent: 5 specialists + orchestrator + researcher AI + vuln type agents
- RAG System: BM25/TF-IDF/ChromaDB vectorstore, few-shot, reasoning templates
- Smart Router: 20 providers (8 CLI OAuth + 12 API), tier failover, token refresh
- Kali Sandbox: container-per-scan, 56 tools, VPN support, on-demand install
- Full IA Testing: methodology-driven comprehensive pentest sessions
- Notifications: Discord, Telegram, WhatsApp/Twilio multi-channel alerts
- Frontend: React/TypeScript with 18 pages, real-time WebSocket updates
2026-02-22 17:59:28 -03:00

282 lines
12 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Knowledge Augmentor - Adversarial pattern recognition from bug bounty data.
Loads the bug bounty finetuning dataset and provides retrieval-based
context enrichment for agent prompts. This is for PATTERN RECOGNITION
and adversarial intuition -- NOT for replaying exploits.
The augmentor:
- Builds a keyword index by vulnerability type
- Retrieves relevant patterns matching current testing context
- Injects formatted reference material into agent prompts
- Explicitly instructs the model to adapt, not copy
"""
import json
import logging
from typing import Dict, List, Optional
from pathlib import Path
logger = logging.getLogger(__name__)

# Semantic search is an optional upgrade: when the RAG package cannot be
# imported, the flag is cleared and the class name is bound to None so the
# rest of the module can still reference both names safely.
try:
    from backend.core.rag import RAGEngine
except ImportError:
    RAGEngine = None
    HAS_RAG_ENGINE = False
else:
    HAS_RAG_ENGINE = True
class KnowledgeAugmentor:
    """Retrieval-based knowledge augmentation from bug bounty dataset.

    Supports two retrieval modes:
    - RAG mode (when RAGEngine available): Semantic vector search for better relevance
    - Keyword mode (default fallback): Keyword index matching (original behavior)

    The keyword index is built lazily, the first time a keyword lookup is
    needed, from a JSON file holding a list of entries. Each entry is read
    through its 'instruction', 'input' and 'output' fields.
    """

    # Vulnerability type keyword mappings
    VULN_KEYWORDS = {
        'xss': ['xss', 'cross-site scripting', 'reflected xss', 'stored xss', 'dom xss',
                'script injection', 'html injection'],
        'sqli': ['sql injection', 'sqli', 'union select', 'blind sql', 'error-based sql',
                 'time-based sql', 'second-order sql'],
        'ssrf': ['ssrf', 'server-side request forgery', 'internal service'],
        'idor': ['idor', 'insecure direct object', 'broken object level',
                 'bola', 'horizontal privilege'],
        'rce': ['rce', 'remote code execution', 'command injection', 'os command',
                'code execution', 'shell injection'],
        'lfi': ['lfi', 'local file inclusion', 'path traversal', 'directory traversal',
                'file read', 'file disclosure'],
        'auth_bypass': ['authentication bypass', 'broken authentication', 'auth bypass',
                        'session fixation', 'jwt', 'token manipulation'],
        'csrf': ['csrf', 'cross-site request forgery', 'state-changing'],
        'open_redirect': ['open redirect', 'url redirect', 'redirect vulnerability'],
        'xxe': ['xxe', 'xml external entity', 'xml injection'],
        'ssti': ['ssti', 'server-side template injection', 'template injection'],
        'race_condition': ['race condition', 'toctou', 'concurrency'],
        'graphql': ['graphql', 'introspection', 'batching attack'],
        'api': ['api', 'rest api', 'broken api', 'api key', 'rate limiting'],
        'deserialization': ['deserialization', 'insecure deserialization', 'pickle',
                            'object injection'],
        'upload': ['file upload', 'unrestricted upload', 'web shell', 'upload bypass'],
        'cors': ['cors', 'cross-origin', 'origin validation'],
        'subdomain_takeover': ['subdomain takeover', 'dangling dns', 'cname'],
        'information_disclosure': ['information disclosure', 'sensitive data', 'data exposure',
                                   'directory listing', 'source code disclosure'],
    }

    def __init__(self, dataset_path: str = "models/bug-bounty/bugbounty_finetuning_dataset.json",
                 max_patterns: int = 3, rag_engine=None):
        """Configure the augmentor; the dataset itself is not read here.

        Args:
            dataset_path: Location of the JSON dataset used for keyword retrieval.
            max_patterns: Default number of patterns returned per lookup.
            rag_engine: Optional pre-built engine used for semantic retrieval.
                When it is falsy and the RAGEngine import succeeded, an engine
                is created here unless the ENABLE_RAG environment variable is
                set to "false" (case-insensitive).
        """
        self.dataset_path = Path(dataset_path)
        self.max_patterns = max_patterns
        self.entries: List[Dict] = []
        self.index: Dict[str, List[int]] = {}  # vuln_type -> list of entry indices
        self._loaded = False

        # RAG engine for semantic search (optional upgrade)
        self._rag_engine = rag_engine
        if not self._rag_engine and HAS_RAG_ENGINE:
            try:
                import os
                if os.getenv("ENABLE_RAG", "true").lower() != "false":
                    self._rag_engine = RAGEngine(data_dir="data",
                                                 backend=os.getenv("RAG_BACKEND", "auto"))
                    if not self._rag_engine.is_indexed:
                        self._rag_engine.index_all()
            except Exception as e:
                # Best effort: any failure simply leaves keyword mode in charge.
                logger.debug("RAG engine not available for augmentor: %s", e)
                self._rag_engine = None

    @staticmethod
    def _normalize_vuln_type(vulnerability_type) -> str:
        """Lower-case a vulnerability name and turn spaces/hyphens into underscores.

        None (or any other falsy value) normalizes to the empty string.
        """
        return str(vulnerability_type or '').lower().replace(' ', '_').replace('-', '_')

    @staticmethod
    def _field(entry, key: str) -> str:
        """Return entry[key] as text; a missing/None field or non-dict entry gives ''."""
        if not isinstance(entry, dict):
            return ''
        value = entry.get(key)
        if value is None:
            return ''
        return value if isinstance(value, str) else str(value)

    @classmethod
    def _compile_keyword_patterns(cls) -> Dict[str, "re.Pattern[str]"]:
        """Compile one whole-term regex per vulnerability type in VULN_KEYWORDS.

        A keyword only matches when it is not glued to other ASCII letters or
        digits, so 'rce' no longer fires on "source" and 'api' no longer fires
        on "rapid". A single trailing 's' is tolerated so simple plurals
        ("apis", "race conditions") still match. Patterns expect lower-cased
        text.
        """
        import re  # local import: the module header does not import re

        compiled = {}
        for vuln_type, keywords in cls.VULN_KEYWORDS.items():
            alternatives = '|'.join(re.escape(kw) for kw in keywords)
            compiled[vuln_type] = re.compile(
                r'(?<![a-z0-9])(?:' + alternatives + r')s?(?![a-z0-9])'
            )
        return compiled

    def _ensure_loaded(self):
        """Lazy load and index the dataset on first use.

        Failures are logged, never raised: a missing, unreadable or malformed
        dataset leaves the augmentor with no entries. The load is attempted
        only once per instance.
        """
        if self._loaded:
            return
        if not self.dataset_path.exists():
            logger.warning("Bug bounty dataset not found: %s", self.dataset_path)
            self._loaded = True
            return
        try:
            with open(self.dataset_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not isinstance(data, list):
                # The index stores integer positions, so only a list is usable.
                raise ValueError(
                    f"expected a JSON array of entries, got {type(data).__name__}"
                )
            self.entries = data
            logger.info("Loaded %d entries from bug bounty dataset", len(self.entries))
            self._build_index()
        except Exception as e:
            logger.error("Failed to load bug bounty dataset: %s", e)
        self._loaded = True

    def _build_index(self):
        """Build keyword index over the dataset entries.

        Each entry is listed at most once per vulnerability type. Entries that
        are not dicts, or whose fields are not strings, are tolerated instead
        of aborting the whole build.
        """
        patterns = self._compile_keyword_patterns()
        self.index = {}  # start clean so a rebuild never duplicates indices
        for i, entry in enumerate(self.entries):
            text = ' '.join(
                self._field(entry, key) for key in ('instruction', 'input', 'output')
            ).lower()
            for vuln_type, pattern in patterns.items():
                if pattern.search(text):
                    self.index.setdefault(vuln_type, []).append(i)
        indexed_types = {k: len(v) for k, v in self.index.items()}
        logger.info("Knowledge index built: %s", indexed_types)

    def get_relevant_patterns(self, vulnerability_type: str,
                              technologies: Optional[List[str]] = None,
                              max_entries: Optional[int] = None) -> str:
        """Retrieve relevant bug bounty patterns for context enrichment.

        Args:
            vulnerability_type: Type of vulnerability being tested (e.g., 'xss', 'sqli')
            technologies: Optional list of detected technologies for relevance boosting
            max_entries: Override default max patterns count

        Returns:
            Formatted string for injection into LLM prompts as cognitive augmentation.
            Returns empty string if no relevant patterns found.
        """
        # Try RAG-based semantic retrieval first (much better relevance)
        if self._rag_engine:
            try:
                tech_str = ", ".join(technologies[:3]) if technologies else ""
                rag_context = self._rag_engine.get_testing_context(
                    vuln_type=vulnerability_type,
                    technology=tech_str,
                    max_chars=3000
                )
                # Very short answers are treated as "nothing useful found".
                if rag_context and len(rag_context) > 50:
                    return rag_context
            except Exception as e:
                logger.debug("RAG retrieval failed, falling back to keyword: %s", e)

        # Fallback: keyword-based retrieval (original behavior)
        self._ensure_loaded()
        limit = max_entries or self.max_patterns
        vuln_key = self._normalize_vuln_type(vulnerability_type)
        if not vuln_key:
            # An empty key is a substring of every index key; without this
            # guard the partial match below would return unrelated patterns.
            return ""

        # Try exact match first, then partial (first index key that contains
        # the requested key, or is contained in it).
        candidates = self.index.get(vuln_key, [])
        if not candidates:
            for key, indices in self.index.items():
                if vuln_key in key or key in vuln_key:
                    candidates = indices
                    break
        if not candidates:
            return ""

        # Deduplicate while keeping dataset order
        candidates = list(dict.fromkeys(candidates))

        # Score by technology relevance if technologies provided
        if technologies:
            needles = [str(tech).lower() for tech in technologies if tech]

            def tech_score(idx: int) -> int:
                entry = self.entries[idx]
                text = (self._field(entry, 'output') + ' ' +
                        self._field(entry, 'instruction')).lower()
                return sum(1 for needle in needles if needle in text)

            # Stable sort: entries with equal scores keep dataset order.
            candidates = sorted(candidates, key=tech_score, reverse=True)

        selected = candidates[:limit]

        # Build augmentation context
        parts = [
            "\n\n=== ADVERSARIAL PATTERN CONTEXT (Bug Bounty Knowledge) ===\n"
            "These are REFERENCE PATTERNS for understanding attack vectors and methodology.\n"
            "ADAPT the approach to the current target. Do NOT replay exact exploits.\n"
            "Use these as cognitive anchors for creative hypothesis generation.\n\n"
        ]
        for i, idx in enumerate(selected, 1):
            entry = self.entries[idx]
            instruction = self._field(entry, 'instruction')[:300]
            # Extract methodology-relevant sections, truncate for context budget
            methodology = self._extract_methodology(self._field(entry, 'output'),
                                                    max_chars=1500)
            parts.append(f"--- Pattern {i} ---\n")
            parts.append(f"Context: {instruction}\n")
            parts.append(f"Methodology:\n{methodology}\n\n")
        parts.append("=== END ADVERSARIAL PATTERN CONTEXT ===\n")
        return "".join(parts)

    def _extract_methodology(self, text: str, max_chars: int = 1500) -> str:
        """Extract the most methodology-relevant portion of a writeup.

        Markers are tried in the order listed below; the first marker present
        anywhere in the text (case-insensitive) decides where the excerpt
        starts. Without any marker the excerpt is the start of the text.
        """
        if not text:
            return ""
        # Look for methodology/steps/approach sections
        markers = ['### steps', '### methodology', '### approach', '### exploitation',
                   '## steps', '## methodology', '## approach', '## exploitation',
                   'steps to reproduce', 'reproduction steps', 'proof of concept']
        text_lower = text.lower()
        for marker in markers:
            idx = text_lower.find(marker)
            if idx != -1:
                return text[idx:idx + max_chars]
        # Fall back to first max_chars of the output
        return text[:max_chars]

    def get_available_types(self) -> List[str]:
        """Return list of vulnerability types that have indexed entries."""
        self._ensure_loaded()
        return sorted(self.index.keys())

    def get_entry_count(self, vulnerability_type: str) -> int:
        """Return count of indexed entries for a vulnerability type (exact key only)."""
        self._ensure_loaded()
        vuln_key = self._normalize_vuln_type(vulnerability_type)
        return len(self.index.get(vuln_key, []))

    def _get_custom_knowledge_patterns(self, vuln_type: str) -> str:
        """Get patterns from user-uploaded custom knowledge documents.

        Best effort: any failure (including the processor module being
        unavailable) is logged at debug level and yields an empty string.
        """
        try:
            from backend.core.knowledge_processor import KnowledgeProcessor
            processor = KnowledgeProcessor()
            entries = processor.get_patterns_for_vuln(vuln_type, max_entries=3)
            if not entries:
                return ""
            parts = ["\n\n=== CUSTOM KNOWLEDGE (User-Uploaded Research) ===\n"]
            for entry in entries:
                parts.append(f"--- Source: {entry.get('source_doc', 'Unknown')} ---\n")
                if entry.get("methodology"):
                    parts.append(f"Methodology: {entry['methodology']}\n")
                if entry.get("key_insights"):
                    parts.append(f"Key Insight: {entry['key_insights']}\n")
                # Items are stringified so one non-string item cannot discard
                # the whole custom-knowledge section via the except below.
                if entry.get("payloads"):
                    payloads = ', '.join(str(p) for p in entry['payloads'][:5])
                    parts.append(f"Payloads: {payloads}\n")
                if entry.get("bypass_techniques"):
                    bypasses = ', '.join(str(b) for b in entry['bypass_techniques'][:5])
                    parts.append(f"Bypasses: {bypasses}\n")
                parts.append("\n")
            parts.append("=== END CUSTOM KNOWLEDGE ===\n")
            return "".join(parts)
        except Exception as e:
            logger.debug("Custom knowledge lookup failed: %s", e)
            return ""

    def get_relevant_patterns_with_custom(self, vulnerability_type: str,
                                          technologies: Optional[List[str]] = None,
                                          max_entries: Optional[int] = None) -> str:
        """Get patterns from both bug bounty dataset AND custom uploaded knowledge.

        The custom-knowledge section, when there is one, is appended after the
        dataset (or RAG) context.
        """
        # Original dataset patterns
        result = self.get_relevant_patterns(vulnerability_type, technologies, max_entries)
        # Custom knowledge patterns
        custom = self._get_custom_knowledge_patterns(vulnerability_type)
        if custom:
            result += custom
        return result