NeuroSploit v3.2 - Autonomous AI Penetration Testing Platform

116 modules | 100 vuln types | 18 API routes | 18 frontend pages Major features: - VulnEngine: 100 vuln types, 526+ payloads, 12 testers, anti-hallucination prompts - Autonomous Agent: 3-stream auto pentest, multi-session (5 concurrent), pause/resume/stop - CLI Agent: Claude Code / Gemini CLI / Codex CLI inside Kali containers - Validation Pipeline: negative controls, proof of execution, confidence scoring, judge - AI Reasoning: ReACT engine, token budget, endpoint classifier, CVE hunter, deep recon - Multi-Agent: 5 specialists + orchestrator + researcher AI + vuln type agents - RAG System: BM25/TF-IDF/ChromaDB vectorstore, few-shot, reasoning templates - Smart Router: 20 providers (8 CLI OAuth + 12 API), tier failover, token refresh - Kali Sandbox: container-per-scan, 56 tools, VPN support, on-demand install - Full IA Testing: methodology-driven comprehensive pentest sessions - Notifications: Discord, Telegram, WhatsApp/Twilio multi-channel alerts - Frontend: React/TypeScript with 18 pages, real-time WebSocket updates
2026-06-03 03:18:01 +02:00 · 2026-02-22 17:58:12 -03:00
commit e0935793c5
271 changed files with 132462 additions and 0 deletions
@@ -0,0 +1,643 @@
+"""
+Multi-backend vector store for RAG knowledge retrieval.
+
+Backends (in priority order):
+1. ChromaDB + sentence-transformers (semantic embeddings, persistent)
+2. TF-IDF via scikit-learn (statistical similarity)
+3. BM25 (zero dependencies, keyword-based ranking)
+
+All backends provide the same interface: add(), query(), delete_collection().
+"""
+
+import json
+import math
+import hashlib
+import logging
+import time
+from abc import ABC, abstractmethod
+from collections import Counter
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from typing import List, Dict, Optional, Any, Tuple
+
+logger = logging.getLogger(__name__)
+
+# Optional dependencies
+try:
+    import chromadb
+    from chromadb.config import Settings as ChromaSettings
+    HAS_CHROMADB = True
+except ImportError:
+    HAS_CHROMADB = False
+
+try:
+    from sentence_transformers import SentenceTransformer
+    HAS_SENTENCE_TRANSFORMERS = True
+except ImportError:
+    HAS_SENTENCE_TRANSFORMERS = False
+
+try:
+    import numpy as np
+    HAS_NUMPY = True
+except ImportError:
+    HAS_NUMPY = False
+
+try:
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from sklearn.metrics.pairwise import cosine_similarity
+    HAS_SKLEARN = True
+except ImportError:
+    HAS_SKLEARN = False
+
+
+@dataclass
+class RetrievedChunk:
+    """A retrieved knowledge chunk with relevance score."""
+    text: str
+    score: float
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    chunk_id: str = ""
+    source: str = ""
+
+    def to_dict(self) -> Dict:
+        return asdict(self)
+
+
+@dataclass
+class Document:
+    """A document to be indexed in the vector store."""
+    text: str
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    doc_id: str = ""
+
+    def __post_init__(self):
+        if not self.doc_id:
+            self.doc_id = hashlib.md5(self.text[:500].encode()).hexdigest()[:12]
+
+
+class BaseVectorStore(ABC):
+    """Abstract vector store interface."""
+
+    @abstractmethod
+    def add(self, collection: str, documents: List[Document]) -> int:
+        """Add documents to a collection. Returns count added."""
+        pass
+
+    @abstractmethod
+    def query(self, collection: str, query_text: str, top_k: int = 5,
+              metadata_filter: Optional[Dict] = None) -> List[RetrievedChunk]:
+        """Query a collection for relevant documents."""
+        pass
+
+    @abstractmethod
+    def collection_exists(self, collection: str) -> bool:
+        """Check if a collection has been indexed."""
+        pass
+
+    @abstractmethod
+    def delete_collection(self, collection: str) -> None:
+        """Delete a collection and all its documents."""
+        pass
+
+    @abstractmethod
+    def collection_count(self, collection: str) -> int:
+        """Return number of documents in a collection."""
+        pass
+
+    @property
+    @abstractmethod
+    def backend_name(self) -> str:
+        pass
+
+
+class BM25VectorStore(BaseVectorStore):
+    """
+    BM25 (Best Matching 25) keyword-based ranking.
+    Zero external dependencies - works with pure Python.
+    Good for exact keyword matching and term-frequency scoring.
+    """
+
+    def __init__(self, persist_dir: str, k1: float = 1.5, b: float = 0.75):
+        self.persist_dir = Path(persist_dir)
+        self.persist_dir.mkdir(parents=True, exist_ok=True)
+        self.k1 = k1
+        self.b = b
+        self._collections: Dict[str, Dict] = {}
+        self._load_persisted()
+
+    @property
+    def backend_name(self) -> str:
+        return "bm25"
+
+    def _tokenize(self, text: str) -> List[str]:
+        """Simple whitespace + punctuation tokenizer."""
+        import re
+        text = text.lower()
+        tokens = re.findall(r'\b[a-z0-9_]{2,}\b', text)
+        return tokens
+
+    def _load_persisted(self):
+        """Load persisted collections from disk."""
+        index_file = self.persist_dir / "bm25_index.json"
+        if index_file.exists():
+            try:
+                with open(index_file, 'r') as f:
+                    data = json.load(f)
+                self._collections = data.get("collections", {})
+                logger.info(f"BM25: Loaded {len(self._collections)} collections from disk")
+            except Exception as e:
+                logger.warning(f"BM25: Failed to load index: {e}")
+                self._collections = {}
+
+    def _persist(self):
+        """Persist collections to disk."""
+        index_file = self.persist_dir / "bm25_index.json"
+        try:
+            with open(index_file, 'w') as f:
+                json.dump({"collections": self._collections, "timestamp": time.time()}, f)
+        except Exception as e:
+            logger.warning(f"BM25: Failed to persist index: {e}")
+
+    def add(self, collection: str, documents: List[Document]) -> int:
+        if not documents:
+            return 0
+
+        if collection not in self._collections:
+            self._collections[collection] = {
+                "documents": [],
+                "doc_freqs": [],
+                "df": {},
+                "doc_lengths": [],
+                "avgdl": 0,
+                "N": 0
+            }
+
+        col = self._collections[collection]
+
+        added = 0
+        existing_ids = {d.get("doc_id", "") for d in col["documents"]}
+
+        for doc in documents:
+            if doc.doc_id in existing_ids:
+                continue
+
+            tokens = self._tokenize(doc.text)
+            token_freq = dict(Counter(tokens))
+            unique_tokens = set(tokens)
+
+            col["documents"].append({
+                "doc_id": doc.doc_id,
+                "text": doc.text[:5000],  # Cap storage
+                "metadata": doc.metadata
+            })
+            col["doc_freqs"].append(token_freq)
+            col["doc_lengths"].append(len(tokens))
+
+            for token in unique_tokens:
+                col["df"][token] = col["df"].get(token, 0) + 1
+
+            added += 1
+
+        col["N"] = len(col["documents"])
+        col["avgdl"] = sum(col["doc_lengths"]) / max(col["N"], 1)
+
+        if added > 0:
+            self._persist()
+
+        return added
+
+    def query(self, collection: str, query_text: str, top_k: int = 5,
+              metadata_filter: Optional[Dict] = None) -> List[RetrievedChunk]:
+        if collection not in self._collections:
+            return []
+
+        col = self._collections[collection]
+        if col["N"] == 0:
+            return []
+
+        query_tokens = self._tokenize(query_text)
+        if not query_tokens:
+            return []
+
+        scores = []
+        N = col["N"]
+        avgdl = col["avgdl"]
+
+        for i in range(N):
+            # Metadata filter
+            if metadata_filter:
+                doc_meta = col["documents"][i].get("metadata", {})
+                skip = False
+                for key, val in metadata_filter.items():
+                    if isinstance(val, list):
+                        if doc_meta.get(key) not in val:
+                            skip = True
+                            break
+                    elif doc_meta.get(key) != val:
+                        skip = True
+                        break
+                if skip:
+                    scores.append(0.0)
+                    continue
+
+            doc_freq = col["doc_freqs"][i]
+            doc_len = col["doc_lengths"][i]
+            score = 0.0
+
+            for token in query_tokens:
+                if token not in doc_freq:
+                    continue
+
+                tf = doc_freq[token]
+                df = col["df"].get(token, 0)
+
+                # BM25 IDF
+                idf = math.log((N - df + 0.5) / (df + 0.5) + 1.0)
+
+                # BM25 TF normalization
+                tf_norm = (tf * (self.k1 + 1)) / (
+                    tf + self.k1 * (1.0 - self.b + self.b * doc_len / avgdl)
+                )
+
+                score += idf * tf_norm
+
+            scores.append(score)
+
+        # Get top-k
+        indexed_scores = [(i, s) for i, s in enumerate(scores) if s > 0]
+        indexed_scores.sort(key=lambda x: x[1], reverse=True)
+
+        results = []
+        for i, score in indexed_scores[:top_k]:
+            doc = col["documents"][i]
+            results.append(RetrievedChunk(
+                text=doc["text"],
+                score=score,
+                metadata=doc.get("metadata", {}),
+                chunk_id=doc.get("doc_id", f"doc_{i}"),
+                source=collection
+            ))
+
+        return results
+
+    def collection_exists(self, collection: str) -> bool:
+        return collection in self._collections and self._collections[collection]["N"] > 0
+
+    def delete_collection(self, collection: str) -> None:
+        if collection in self._collections:
+            del self._collections[collection]
+            self._persist()
+
+    def collection_count(self, collection: str) -> int:
+        if collection not in self._collections:
+            return 0
+        return self._collections[collection]["N"]
+
+
+class TFIDFVectorStore(BaseVectorStore):
+    """
+    TF-IDF based vector store using scikit-learn.
+    Better than BM25 for capturing document-level similarity.
+    Requires: scikit-learn, numpy
+    """
+
+    def __init__(self, persist_dir: str):
+        if not HAS_SKLEARN or not HAS_NUMPY:
+            raise ImportError("TF-IDF backend requires scikit-learn and numpy")
+
+        self.persist_dir = Path(persist_dir)
+        self.persist_dir.mkdir(parents=True, exist_ok=True)
+        self._collections: Dict[str, Dict] = {}
+
+    @property
+    def backend_name(self) -> str:
+        return "tfidf"
+
+    def add(self, collection: str, documents: List[Document]) -> int:
+        if not documents:
+            return 0
+
+        if collection not in self._collections:
+            self._collections[collection] = {
+                "documents": [],
+                "texts": [],
+                "vectorizer": None,
+                "matrix": None
+            }
+
+        col = self._collections[collection]
+        existing_ids = {d.get("doc_id", "") for d in col["documents"]}
+
+        added = 0
+        for doc in documents:
+            if doc.doc_id in existing_ids:
+                continue
+            col["documents"].append({
+                "doc_id": doc.doc_id,
+                "text": doc.text[:5000],
+                "metadata": doc.metadata
+            })
+            col["texts"].append(doc.text[:5000])
+            added += 1
+
+        if added > 0:
+            # Rebuild TF-IDF matrix
+            vectorizer = TfidfVectorizer(
+                max_features=10000,
+                stop_words='english',
+                ngram_range=(1, 2),
+                min_df=1,
+                max_df=0.95
+            )
+            col["matrix"] = vectorizer.fit_transform(col["texts"])
+            col["vectorizer"] = vectorizer
+
+        return added
+
+    def query(self, collection: str, query_text: str, top_k: int = 5,
+              metadata_filter: Optional[Dict] = None) -> List[RetrievedChunk]:
+        if collection not in self._collections:
+            return []
+
+        col = self._collections[collection]
+        if col["vectorizer"] is None or col["matrix"] is None:
+            return []
+
+        query_vec = col["vectorizer"].transform([query_text])
+        similarities = cosine_similarity(query_vec, col["matrix"]).flatten()
+
+        # Apply metadata filter
+        if metadata_filter:
+            for i, doc in enumerate(col["documents"]):
+                meta = doc.get("metadata", {})
+                for key, val in metadata_filter.items():
+                    if isinstance(val, list):
+                        if meta.get(key) not in val:
+                            similarities[i] = 0.0
+                    elif meta.get(key) != val:
+                        similarities[i] = 0.0
+
+        top_indices = np.argsort(similarities)[::-1][:top_k]
+
+        results = []
+        for i in top_indices:
+            if similarities[i] <= 0:
+                continue
+            doc = col["documents"][i]
+            results.append(RetrievedChunk(
+                text=doc["text"],
+                score=float(similarities[i]),
+                metadata=doc.get("metadata", {}),
+                chunk_id=doc.get("doc_id", f"doc_{i}"),
+                source=collection
+            ))
+
+        return results
+
+    def collection_exists(self, collection: str) -> bool:
+        return (collection in self._collections and
+                len(self._collections[collection]["documents"]) > 0)
+
+    def delete_collection(self, collection: str) -> None:
+        if collection in self._collections:
+            del self._collections[collection]
+
+    def collection_count(self, collection: str) -> int:
+        if collection not in self._collections:
+            return 0
+        return len(self._collections[collection]["documents"])
+
+
+class ChromaVectorStore(BaseVectorStore):
+    """
+    ChromaDB + sentence-transformers for true semantic embeddings.
+    Best quality: understands meaning, not just keywords.
+    Requires: chromadb, sentence-transformers
+    """
+
+    DEFAULT_MODEL = "all-MiniLM-L6-v2"  # Fast, 384-dim, good quality
+
+    def __init__(self, persist_dir: str, model_name: str = None):
+        if not HAS_CHROMADB:
+            raise ImportError("ChromaDB backend requires: pip install chromadb")
+
+        self.persist_dir = Path(persist_dir)
+        self.persist_dir.mkdir(parents=True, exist_ok=True)
+
+        self.client = chromadb.PersistentClient(
+            path=str(self.persist_dir / "chromadb")
+        )
+
+        # Embedding model
+        self._embed_model = None
+        self._model_name = model_name or self.DEFAULT_MODEL
+        if HAS_SENTENCE_TRANSFORMERS:
+            try:
+                self._embed_model = SentenceTransformer(self._model_name)
+                logger.info(f"ChromaDB: Loaded embedding model '{self._model_name}'")
+            except Exception as e:
+                logger.warning(f"ChromaDB: Failed to load model: {e}")
+
+    @property
+    def backend_name(self) -> str:
+        return "chromadb"
+
+    def _get_collection(self, name: str):
+        """Get or create a ChromaDB collection."""
+        if self._embed_model:
+            return self.client.get_or_create_collection(
+                name=name,
+                metadata={"hnsw:space": "cosine"}
+            )
+        else:
+            return self.client.get_or_create_collection(name=name)
+
+    def _embed(self, texts: List[str]) -> Optional[List[List[float]]]:
+        """Generate embeddings using sentence-transformers."""
+        if not self._embed_model:
+            return None
+        try:
+            embeddings = self._embed_model.encode(texts, show_progress_bar=False)
+            return embeddings.tolist()
+        except Exception as e:
+            logger.warning(f"ChromaDB: Embedding failed: {e}")
+            return None
+
+    def add(self, collection: str, documents: List[Document]) -> int:
+        if not documents:
+            return 0
+
+        col = self._get_collection(collection)
+
+        # Filter already-indexed docs
+        existing = set()
+        try:
+            result = col.get()
+            if result and result.get("ids"):
+                existing = set(result["ids"])
+        except Exception:
+            pass
+
+        new_docs = [d for d in documents if d.doc_id not in existing]
+        if not new_docs:
+            return 0
+
+        # Batch add (ChromaDB limit: 41666 per batch)
+        batch_size = 500
+        added = 0
+
+        for start in range(0, len(new_docs), batch_size):
+            batch = new_docs[start:start + batch_size]
+
+            ids = [d.doc_id for d in batch]
+            texts = [d.text[:5000] for d in batch]
+            metadatas = []
+            for d in batch:
+                # ChromaDB metadata must be str/int/float/bool
+                meta = {}
+                for k, v in d.metadata.items():
+                    if isinstance(v, (str, int, float, bool)):
+                        meta[k] = v
+                    elif isinstance(v, list):
+                        meta[k] = ",".join(str(x) for x in v)
+                    else:
+                        meta[k] = str(v)
+                metadatas.append(meta)
+
+            embeddings = self._embed(texts)
+
+            try:
+                if embeddings:
+                    col.add(
+                        ids=ids,
+                        documents=texts,
+                        metadatas=metadatas,
+                        embeddings=embeddings
+                    )
+                else:
+                    col.add(
+                        ids=ids,
+                        documents=texts,
+                        metadatas=metadatas
+                    )
+                added += len(batch)
+            except Exception as e:
+                logger.warning(f"ChromaDB: Failed to add batch: {e}")
+
+        return added
+
+    def query(self, collection: str, query_text: str, top_k: int = 5,
+              metadata_filter: Optional[Dict] = None) -> List[RetrievedChunk]:
+        try:
+            col = self._get_collection(collection)
+        except Exception:
+            return []
+
+        if col.count() == 0:
+            return []
+
+        # Build ChromaDB where clause
+        where = None
+        if metadata_filter:
+            conditions = []
+            for key, val in metadata_filter.items():
+                if isinstance(val, list):
+                    conditions.append({key: {"$in": val}})
+                else:
+                    conditions.append({key: {"$eq": val}})
+            if len(conditions) == 1:
+                where = conditions[0]
+            elif len(conditions) > 1:
+                where = {"$and": conditions}
+
+        # Query with embeddings if available
+        query_embedding = self._embed([query_text])
+
+        try:
+            if query_embedding:
+                results = col.query(
+                    query_embeddings=query_embedding,
+                    n_results=min(top_k, col.count()),
+                    where=where
+                )
+            else:
+                results = col.query(
+                    query_texts=[query_text],
+                    n_results=min(top_k, col.count()),
+                    where=where
+                )
+        except Exception as e:
+            logger.warning(f"ChromaDB query failed: {e}")
+            return []
+
+        chunks = []
+        if results and results.get("documents"):
+            docs = results["documents"][0]
+            ids = results["ids"][0] if results.get("ids") else [""] * len(docs)
+            distances = results["distances"][0] if results.get("distances") else [0.0] * len(docs)
+            metadatas = results["metadatas"][0] if results.get("metadatas") else [{}] * len(docs)
+
+            for text, doc_id, distance, meta in zip(docs, ids, distances, metadatas):
+                # ChromaDB returns distance (lower = better), convert to similarity score
+                score = max(0.0, 1.0 - distance)
+                chunks.append(RetrievedChunk(
+                    text=text,
+                    score=score,
+                    metadata=meta or {},
+                    chunk_id=doc_id,
+                    source=collection
+                ))
+
+        return chunks
+
+    def collection_exists(self, collection: str) -> bool:
+        try:
+            col = self.client.get_collection(collection)
+            return col.count() > 0
+        except Exception:
+            return False
+
+    def delete_collection(self, collection: str) -> None:
+        try:
+            self.client.delete_collection(collection)
+        except Exception:
+            pass
+
+    def collection_count(self, collection: str) -> int:
+        try:
+            col = self.client.get_collection(collection)
+            return col.count()
+        except Exception:
+            return 0
+
+
+def create_vectorstore(persist_dir: str, backend: str = "auto") -> BaseVectorStore:
+    """
+    Factory function to create the best available vector store.
+
+    Args:
+        persist_dir: Directory for persistent storage
+        backend: "auto" (best available), "chromadb", "tfidf", or "bm25"
+
+    Returns:
+        Configured vector store instance
+    """
+    if backend == "chromadb" or (backend == "auto" and HAS_CHROMADB):
+        try:
+            store = ChromaVectorStore(persist_dir)
+            logger.info(f"RAG: Using ChromaDB backend (semantic embeddings)")
+            return store
+        except Exception as e:
+            logger.warning(f"RAG: ChromaDB init failed: {e}, falling back")
+
+    if backend == "tfidf" or (backend == "auto" and HAS_SKLEARN):
+        try:
+            store = TFIDFVectorStore(persist_dir)
+            logger.info(f"RAG: Using TF-IDF backend (statistical similarity)")
+            return store
+        except Exception as e:
+            logger.warning(f"RAG: TF-IDF init failed: {e}, falling back")
+
+    store = BM25VectorStore(persist_dir)
+    logger.info(f"RAG: Using BM25 backend (keyword ranking)")
+    return store