NeuroSploit v3.2 - Autonomous AI Penetration Testing Platform

116 modules | 100 vuln types | 18 API routes | 18 frontend pages Major features: - VulnEngine: 100 vuln types, 526+ payloads, 12 testers, anti-hallucination prompts - Autonomous Agent: 3-stream auto pentest, multi-session (5 concurrent), pause/resume/stop - CLI Agent: Claude Code / Gemini CLI / Codex CLI inside Kali containers - Validation Pipeline: negative controls, proof of execution, confidence scoring, judge - AI Reasoning: ReACT engine, token budget, endpoint classifier, CVE hunter, deep recon - Multi-Agent: 5 specialists + orchestrator + researcher AI + vuln type agents - RAG System: BM25/TF-IDF/ChromaDB vectorstore, few-shot, reasoning templates - Smart Router: 20 providers (8 CLI OAuth + 12 API), tier failover, token refresh - Kali Sandbox: container-per-scan, 56 tools, VPN support, on-demand install - Full IA Testing: methodology-driven comprehensive pentest sessions - Notifications: Discord, Telegram, WhatsApp/Twilio multi-channel alerts - Frontend: React/TypeScript with 18 pages, real-time WebSocket updates
2026-06-30 16:55:34 +02:00 · 2026-02-22 17:58:12 -03:00
commit e0935793c5
271 changed files with 132462 additions and 0 deletions
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+"""
+Knowledge Augmentor - Adversarial pattern recognition from bug bounty data.
+
+Loads the bug bounty finetuning dataset and provides retrieval-based
+context enrichment for agent prompts. This is for PATTERN RECOGNITION
+and adversarial intuition -- NOT for replaying exploits.
+
+The augmentor:
+- Builds a keyword index by vulnerability type
+- Retrieves relevant patterns matching current testing context
+- Injects formatted reference material into agent prompts
+- Explicitly instructs the model to adapt, not copy
+"""
+
+import json
+import logging
+from typing import Dict, List, Optional
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+# Optional RAG engine for semantic search upgrade
+try:
+    from backend.core.rag import RAGEngine
+    HAS_RAG_ENGINE = True
+except ImportError:
+    HAS_RAG_ENGINE = False
+    RAGEngine = None
+
+
+class KnowledgeAugmentor:
+    """Retrieval-based knowledge augmentation from bug bounty dataset.
+
+    Supports two retrieval modes:
+    - RAG mode (when RAGEngine available): Semantic vector search for better relevance
+    - Keyword mode (default fallback): Keyword index matching (original behavior)
+    """
+
+    # Vulnerability type keyword mappings
+    VULN_KEYWORDS = {
+        'xss': ['xss', 'cross-site scripting', 'reflected xss', 'stored xss', 'dom xss',
+                 'script injection', 'html injection'],
+        'sqli': ['sql injection', 'sqli', 'union select', 'blind sql', 'error-based sql',
+                 'time-based sql', 'second-order sql'],
+        'ssrf': ['ssrf', 'server-side request forgery', 'internal service'],
+        'idor': ['idor', 'insecure direct object', 'broken object level',
+                 'bola', 'horizontal privilege'],
+        'rce': ['rce', 'remote code execution', 'command injection', 'os command',
+                'code execution', 'shell injection'],
+        'lfi': ['lfi', 'local file inclusion', 'path traversal', 'directory traversal',
+                'file read', 'file disclosure'],
+        'auth_bypass': ['authentication bypass', 'broken authentication', 'auth bypass',
+                        'session fixation', 'jwt', 'token manipulation'],
+        'csrf': ['csrf', 'cross-site request forgery', 'state-changing'],
+        'open_redirect': ['open redirect', 'url redirect', 'redirect vulnerability'],
+        'xxe': ['xxe', 'xml external entity', 'xml injection'],
+        'ssti': ['ssti', 'server-side template injection', 'template injection'],
+        'race_condition': ['race condition', 'toctou', 'concurrency'],
+        'graphql': ['graphql', 'introspection', 'batching attack'],
+        'api': ['api', 'rest api', 'broken api', 'api key', 'rate limiting'],
+        'deserialization': ['deserialization', 'insecure deserialization', 'pickle',
+                           'object injection'],
+        'upload': ['file upload', 'unrestricted upload', 'web shell', 'upload bypass'],
+        'cors': ['cors', 'cross-origin', 'origin validation'],
+        'subdomain_takeover': ['subdomain takeover', 'dangling dns', 'cname'],
+        'information_disclosure': ['information disclosure', 'sensitive data', 'data exposure',
+                                   'directory listing', 'source code disclosure'],
+    }
+
+    def __init__(self, dataset_path: str = "models/bug-bounty/bugbounty_finetuning_dataset.json",
+                 max_patterns: int = 3, rag_engine=None):
+        self.dataset_path = Path(dataset_path)
+        self.max_patterns = max_patterns
+        self.entries: List[Dict] = []
+        self.index: Dict[str, List[int]] = {}  # vuln_type -> list of entry indices
+        self._loaded = False
+
+        # RAG engine for semantic search (optional upgrade)
+        self._rag_engine = rag_engine
+        if not self._rag_engine and HAS_RAG_ENGINE:
+            try:
+                import os
+                if os.getenv("ENABLE_RAG", "true").lower() != "false":
+                    self._rag_engine = RAGEngine(data_dir="data", backend=os.getenv("RAG_BACKEND", "auto"))
+                    if not self._rag_engine.is_indexed:
+                        self._rag_engine.index_all()
+            except Exception as e:
+                logger.debug(f"RAG engine not available for augmentor: {e}")
+                self._rag_engine = None
+
+    def _ensure_loaded(self):
+        """Lazy load and index the dataset on first use."""
+        if self._loaded:
+            return
+
+        if not self.dataset_path.exists():
+            logger.warning(f"Bug bounty dataset not found: {self.dataset_path}")
+            self._loaded = True
+            return
+
+        try:
+            with open(self.dataset_path, 'r', encoding='utf-8') as f:
+                self.entries = json.load(f)
+            logger.info(f"Loaded {len(self.entries)} entries from bug bounty dataset")
+            self._build_index()
+        except Exception as e:
+            logger.error(f"Failed to load bug bounty dataset: {e}")
+
+        self._loaded = True
+
+    def _build_index(self):
+        """Build keyword index over the dataset entries."""
+        for i, entry in enumerate(self.entries):
+            text = (
+                entry.get('instruction', '') + ' ' +
+                entry.get('input', '') + ' ' +
+                entry.get('output', '')
+            ).lower()
+
+            for vuln_type, keywords in self.VULN_KEYWORDS.items():
+                for kw in keywords:
+                    if kw in text:
+                        self.index.setdefault(vuln_type, []).append(i)
+                        break  # One match per vuln_type per entry
+
+        indexed_types = {k: len(v) for k, v in self.index.items()}
+        logger.info(f"Knowledge index built: {indexed_types}")
+
+    def get_relevant_patterns(self, vulnerability_type: str,
+                               technologies: Optional[List[str]] = None,
+                               max_entries: Optional[int] = None) -> str:
+        """Retrieve relevant bug bounty patterns for context enrichment.
+
+        Args:
+            vulnerability_type: Type of vulnerability being tested (e.g., 'xss', 'sqli')
+            technologies: Optional list of detected technologies for relevance boosting
+            max_entries: Override default max patterns count
+
+        Returns:
+            Formatted string for injection into LLM prompts as cognitive augmentation.
+            Returns empty string if no relevant patterns found.
+        """
+        # Try RAG-based semantic retrieval first (much better relevance)
+        if self._rag_engine:
+            try:
+                tech_str = ", ".join(technologies[:3]) if technologies else ""
+                rag_context = self._rag_engine.get_testing_context(
+                    vuln_type=vulnerability_type,
+                    technology=tech_str,
+                    max_chars=3000
+                )
+                if rag_context and len(rag_context) > 50:
+                    return rag_context
+            except Exception as e:
+                logger.debug(f"RAG retrieval failed, falling back to keyword: {e}")
+
+        # Fallback: keyword-based retrieval (original behavior)
+        self._ensure_loaded()
+
+        limit = max_entries or self.max_patterns
+        vuln_key = vulnerability_type.lower().replace(' ', '_').replace('-', '_')
+
+        # Try exact match first, then partial
+        candidates = self.index.get(vuln_key, [])
+        if not candidates:
+            # Try partial matching
+            for key, indices in self.index.items():
+                if vuln_key in key or key in vuln_key:
+                    candidates = indices
+                    break
+
+        if not candidates:
+            return ""
+
+        # Deduplicate
+        candidates = list(dict.fromkeys(candidates))
+
+        # Score by technology relevance if technologies provided
+        if technologies:
+            scored = []
+            for idx in candidates:
+                entry = self.entries[idx]
+                text = (entry.get('output', '') + ' ' + entry.get('instruction', '')).lower()
+                tech_score = sum(1 for t in technologies if t.lower() in text)
+                scored.append((tech_score, idx))
+            scored.sort(key=lambda x: x[0], reverse=True)
+            candidates = [idx for _, idx in scored]
+
+        selected = candidates[:limit]
+
+        # Build augmentation context
+        augmentation = (
+            "\n\n=== ADVERSARIAL PATTERN CONTEXT (Bug Bounty Knowledge) ===\n"
+            "These are REFERENCE PATTERNS for understanding attack vectors and methodology.\n"
+            "ADAPT the approach to the current target. Do NOT replay exact exploits.\n"
+            "Use these as cognitive anchors for creative hypothesis generation.\n\n"
+        )
+
+        for i, idx in enumerate(selected, 1):
+            entry = self.entries[idx]
+            instruction = entry.get('instruction', '')[:300]
+            output = entry.get('output', '')
+
+            # Extract methodology-relevant sections, truncate for context budget
+            methodology = self._extract_methodology(output, max_chars=1500)
+
+            augmentation += f"--- Pattern {i} ---\n"
+            augmentation += f"Context: {instruction}\n"
+            augmentation += f"Methodology:\n{methodology}\n\n"
+
+        augmentation += "=== END ADVERSARIAL PATTERN CONTEXT ===\n"
+        return augmentation
+
+    def _extract_methodology(self, text: str, max_chars: int = 1500) -> str:
+        """Extract the most methodology-relevant portion of a writeup."""
+        # Look for methodology/steps/approach sections
+        markers = ['### steps', '### methodology', '### approach', '### exploitation',
+                    '## steps', '## methodology', '## approach', '## exploitation',
+                    'steps to reproduce', 'reproduction steps', 'proof of concept']
+
+        text_lower = text.lower()
+        for marker in markers:
+            idx = text_lower.find(marker)
+            if idx != -1:
+                return text[idx:idx + max_chars]
+
+        # Fall back to first max_chars of the output
+        return text[:max_chars]
+
+    def get_available_types(self) -> List[str]:
+        """Return list of vulnerability types that have indexed entries."""
+        self._ensure_loaded()
+        return sorted(self.index.keys())
+
+    def get_entry_count(self, vulnerability_type: str) -> int:
+        """Return count of indexed entries for a vulnerability type."""
+        self._ensure_loaded()
+        vuln_key = vulnerability_type.lower().replace(' ', '_').replace('-', '_')
+        return len(self.index.get(vuln_key, []))
+
+    def _get_custom_knowledge_patterns(self, vuln_type: str) -> str:
+        """Get patterns from user-uploaded custom knowledge documents."""
+        try:
+            from backend.core.knowledge_processor import KnowledgeProcessor
+            processor = KnowledgeProcessor()
+            entries = processor.get_patterns_for_vuln(vuln_type, max_entries=3)
+            if not entries:
+                return ""
+
+            context = "\n\n=== CUSTOM KNOWLEDGE (User-Uploaded Research) ===\n"
+            for i, entry in enumerate(entries, 1):
+                context += f"--- Source: {entry.get('source_doc', 'Unknown')} ---\n"
+                if entry.get("methodology"):
+                    context += f"Methodology: {entry['methodology']}\n"
+                if entry.get("key_insights"):
+                    context += f"Key Insight: {entry['key_insights']}\n"
+                if entry.get("payloads"):
+                    context += f"Payloads: {', '.join(entry['payloads'][:5])}\n"
+                if entry.get("bypass_techniques"):
+                    context += f"Bypasses: {', '.join(entry['bypass_techniques'][:5])}\n"
+                context += "\n"
+            context += "=== END CUSTOM KNOWLEDGE ===\n"
+            return context
+        except Exception as e:
+            logger.debug(f"Custom knowledge lookup failed: {e}")
+            return ""
+
+    def get_relevant_patterns_with_custom(self, vulnerability_type: str,
+                                          technologies: Optional[List[str]] = None,
+                                          max_entries: Optional[int] = None) -> str:
+        """Get patterns from both bug bounty dataset AND custom uploaded knowledge."""
+        # Original dataset patterns
+        result = self.get_relevant_patterns(vulnerability_type, technologies, max_entries)
+
+        # Custom knowledge patterns
+        custom = self._get_custom_knowledge_patterns(vulnerability_type)
+        if custom:
+            result += custom
+
+        return result