From bf145e70b6cd29880a287df73dbdfc513d2f84cc Mon Sep 17 00:00:00 2001 From: Adam Wilson Date: Thu, 15 May 2025 09:30:25 -0600 Subject: [PATCH] LangChain WIP; demo of RAG integration from Claude Sonnet 3.7 --- .github/workflows/llmsecops-cicd.test.yml | 3 +- requirements.txt | 192 ++++++++++++++++ tests/llm/llm.py | 36 ++- tests/llm/llm_rag.py | 263 ++++++++++++++++++++++ 4 files changed, 483 insertions(+), 11 deletions(-) create mode 100644 requirements.txt create mode 100644 tests/llm/llm_rag.py diff --git a/.github/workflows/llmsecops-cicd.test.yml b/.github/workflows/llmsecops-cicd.test.yml index 7c733f159..bc196ffb0 100644 --- a/.github/workflows/llmsecops-cicd.test.yml +++ b/.github/workflows/llmsecops-cicd.test.yml @@ -31,5 +31,6 @@ jobs: pip install langchain_huggingface pip install transformers pip install accelerate - python -m tests.llm.llm --prompt "What is the capital of France?" + pip install optimum[exporters,onnxruntime] + python -m tests.llm.llm_rag --prompt "What is the capital of France?" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..07baf0a56 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,192 @@ +accelerate==1.6.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.18 +aiosignal==1.3.2 +annotated-types==0.7.0 +anyio==4.9.0 +attrs==25.3.0 +avidtools==0.1.2 +backoff==2.2.1 +base2048==0.1.3 +boto3==1.38.2 +botocore==1.38.2 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +chevron==0.14.0 +click==8.1.8 +cmd2==2.4.3 +cohere==4.57 +colorama==0.4.6 +coloredlogs==15.0.1 +dataclasses-json==0.6.7 +datasets==2.16.1 +DateTime==5.5 +deepl==1.17.0 +dill==0.3.7 +distro==1.9.0 +ecoji==0.1.1 +fastapi==0.115.12 +fastavro==1.10.0 +filelock==3.18.0 +flatbuffers==25.2.10 +frozenlist==1.6.0 +fschat==0.2.36 +fsspec==2023.10.0 +garak==0.10.3.1 +google-api-core==2.24.2 +google-api-python-client==2.168.0 +google-auth==2.39.0 +google-auth-httplib2==0.2.0 +googleapis-common-protos==1.70.0 +greenlet==3.2.1 +h11==0.14.0 +httpcore==1.0.8 +httplib2==0.22.0 +httpx==0.28.1 +httpx-sse==0.4.0 +huggingface-hub==0.31.2 +humanfriendly==10.0 +idna==3.10 +importlib-metadata==6.11.0 +inquirerpy==0.3.4 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==1.0.1 +joblib==1.4.2 +jsonpatch==1.33 +jsonpath-ng==1.7.0 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2025.4.1 +langchain==0.3.25 +langchain-community==0.3.24 +langchain-core==0.3.59 +langchain-huggingface==0.2.0 +langchain-text-splitters==0.3.8 +langsmith==0.3.33 +latex2mathml==3.77.0 +litellm==1.67.2 +lorem==0.1.1 +Markdown==3.8 +markdown-it-py==3.0.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +marshmallow==3.26.1 +mdurl==0.1.2 +mpmath==1.3.0 +multidict==6.4.3 +multiprocess==0.70.15 +mypy_extensions==1.1.0 +nemollm==0.3.5 +networkx==3.4.2 +nh3==0.2.21 +nltk==3.9.1 +numpy==1.26.4 +nvdlib==0.8.0 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +octoai-sdk==0.10.1 +ollama==0.4.8 +onnx==1.18.0 +onnxruntime==1.21.0 +onnxruntime-genai==0.7.0 +openai==1.76.0 +optimum==1.25.0 +orjson==3.10.16 +packaging==24.2 +pandas==2.2.3 +pfzy==0.3.4 +pillow==10.4.0 +ply==3.11 +prompt_toolkit==3.0.50 +propcache==0.3.1 +proto-plus==1.26.1 
+protobuf==6.30.2 +psutil==7.0.0 +pyarrow==19.0.1 +pyarrow-hotfix==0.6 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +pycparser==2.22 +pydantic==2.11.3 +pydantic-settings==2.9.1 +pydantic_core==2.33.1 +Pygments==2.19.1 +pyparsing==3.2.3 +pyperclip==1.9.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-magic==0.4.27 +python-multipart==0.0.20 +pytz==2025.2 +PyYAML==6.0.2 +RapidFuzz==3.13.0 +referencing==0.36.2 +regex==2024.11.6 +replicate==1.0.4 +requests==2.32.3 +requests-futures==1.0.2 +requests-toolbelt==1.0.0 +rich==14.0.0 +rpds-py==0.24.0 +rsa==4.9.1 +s3transfer==0.12.0 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.3 +sentence-transformers==4.1.0 +sentencepiece==0.2.0 +setuptools==79.0.1 +shortuuid==1.0.13 +six==1.17.0 +sniffio==1.3.1 +soundfile==0.13.1 +SQLAlchemy==2.0.40 +starlette==0.46.2 +stdlibs==2025.4.4 +svgwrite==1.4.3 +sympy==1.13.3 +tenacity==9.1.2 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==1.0.15 +tokenizers==0.21.1 +tomli==2.2.1 +torch==2.7.0 +torchvision==0.22.0 +tqdm==4.67.1 +transformers==4.51.3 +triton==3.3.0 +types-PyYAML==6.0.12.20250402 +types-requests==2.32.0.20250328 +typing-inspect==0.9.0 +typing-inspection==0.4.0 +typing_extensions==4.13.1 +tzdata==2025.2 +uritemplate==4.1.1 +urllib3==2.3.0 +uvicorn==0.34.2 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +wn==0.9.5 +xdg-base-dirs==6.0.2 +xxhash==3.5.0 +yarl==1.20.0 +zalgolib==0.2.2 +zipp==3.21.0 +zope.interface==7.2 +zstandard==0.23.0 diff --git a/tests/llm/llm.py b/tests/llm/llm.py index cf60716c6..98a9f4b4a 100644 --- a/tests/llm/llm.py +++ b/tests/llm/llm.py @@ -8,36 +8,52 @@ from langchain_huggingface import HuggingFaceEmbeddings from langchain_huggingface import HuggingFacePipeline from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline +import os +from langchain_community.llms import HuggingFacePipeline +from optimum.onnxruntime import ORTModelForCausalLM +from transformers import AutoTokenizer, pipeline class Llm: def __init__(self, model_path=None): + + # Get path to the model directory using your specified structure base_dir = os.path.dirname(os.path.abspath(__file__)) - model_path = os.path.join(base_dir, "phi3") - tokenizer = AutoTokenizer.from_pretrained(model_path) - model = AutoModelForCausalLM.from_pretrained( - pretrained_model_name_or_path=model_path, - device_map="cpu", # Use available GPU - trust_remote_code=True, # If model requires custom code + model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4") + + # Load the tokenizer from local path + tokenizer = AutoTokenizer.from_pretrained( + model_path, + trust_remote_code=True # Important for some models with custom code ) - # Create a pipeline + # Load the ONNX model with optimum from local path + model = ORTModelForCausalLM.from_pretrained( + model_path, + provider="CPUExecutionProvider", + trust_remote_code=True + ) + + # Create a text generation pipeline pipe = pipeline( "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512, temperature=0.7, + top_p=0.9, + repetition_penalty=1.1, + do_sample=True ) - # Create LangChain LLM - self.hf_model = HuggingFacePipeline(pipeline=pipe) + # Create the LangChain LLM + self.llm = HuggingFacePipeline(pipeline=pipe) def get_response(self, input): # Use the model print(f'End user prompt: {input}') - response = self.hf_model.invoke(input) + response = self.llm.invoke(input) print(response) if __name__ == "__main__": diff --git a/tests/llm/llm_rag.py b/tests/llm/llm_rag.py new file mode 100644 index 000000000..8bdbe50b9 --- /dev/null +++ 
b/tests/llm/llm_rag.py @@ -0,0 +1,263 @@ +""" +RAG implementation with local Phi-3-mini-4k-instruct-onnx and embeddings +""" + +import os +from typing import List +import numpy as np + +# LangChain imports +from langchain_community.llms import HuggingFacePipeline +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.vectorstores import FAISS +from langchain.chains import RetrievalQA +from langchain.prompts import PromptTemplate +from langchain.schema import Document + +# HuggingFace and ONNX imports +from optimum.onnxruntime import ORTModelForCausalLM +from transformers import AutoTokenizer, pipeline + +# ------------------------------------------------------ +# 1. LOAD THE LOCAL PHI-3 MODEL +# ------------------------------------------------------ + +# Set up paths to the local model +base_dir = os.path.dirname(os.path.abspath(__file__)) +model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4") +print(f"Loading Phi-3 model from: {model_path}") + +# Load the tokenizer and model +tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) +model = ORTModelForCausalLM.from_pretrained( + model_path, + provider="CPUExecutionProvider", + trust_remote_code=True +) + +# Create the text generation pipeline +pipe = pipeline( + "text-generation", + model=model, + tokenizer=tokenizer, + max_new_tokens=512, + temperature=0.7, + top_p=0.9, + repetition_penalty=1.1, + do_sample=True +) + +# Create the LangChain LLM +llm = HuggingFacePipeline(pipeline=pipe) + +# ------------------------------------------------------ +# 2. LOAD THE EMBEDDING MODEL +# ------------------------------------------------------ + +# Initialize the embedding model - using a small, efficient model +# Options: +# - "BAAI/bge-small-en-v1.5" (385MB, good performance/size ratio) +# - "sentence-transformers/all-MiniLM-L6-v2" (91MB, very small) +# - "intfloat/e5-small-v2" (134MB, good performance) +embeddings = HuggingFaceEmbeddings( + model_name="sentence-transformers/all-MiniLM-L6-v2", + model_kwargs={"device": "cpu"}, + encode_kwargs={"normalize_embeddings": True} +) +print("Embedding model loaded") + +# ------------------------------------------------------ +# 3. CREATE A SAMPLE DOCUMENT COLLECTION +# ------------------------------------------------------ + +# Sample documents about artificial intelligence +docs = [ + Document( + page_content="Artificial intelligence (AI) is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals, which involves consciousness and emotionality.", + metadata={"source": "AI Definition"} + ), + Document( + page_content="Machine learning is a subset of AI focused on building systems that learn from data. Deep learning is a subset of machine learning based on neural networks.", + metadata={"source": "Machine Learning"} + ), + Document( + page_content="Neural networks are computing systems inspired by the biological neural networks that constitute animal brains. 
They form the basis of many modern AI systems.", + metadata={"source": "Neural Networks"} + ), + Document( + page_content="Natural Language Processing (NLP) is a field of AI that focuses on the interaction between computers and humans through natural language.", + metadata={"source": "NLP"} + ), + Document( + page_content="Reinforcement learning is an area of machine learning where an agent learns to make decisions by taking actions in an environment to maximize a reward signal.", + metadata={"source": "Reinforcement Learning"} + ), + Document( + page_content="Computer vision is a field of AI that enables computers to derive meaningful information from digital images, videos and other visual inputs.", + metadata={"source": "Computer Vision"} + ), +] + +# ------------------------------------------------------ +# 4. SPLIT DOCUMENTS AND CREATE VECTOR STORE +# ------------------------------------------------------ + +# Split documents into chunks +text_splitter = RecursiveCharacterTextSplitter( + chunk_size=500, + chunk_overlap=50, + separators=["\n\n", "\n", ".", " ", ""] +) + +# Split the documents +split_docs = text_splitter.split_documents(docs) +print(f"Split {len(docs)} documents into {len(split_docs)} chunks") + +# Create a FAISS vector store from the chunks +vectorstore = FAISS.from_documents(split_docs, embeddings) +print("Vector store created") + +# ------------------------------------------------------ +# 5. CREATE RAG PROMPT TEMPLATE FOR PHI-3 +# ------------------------------------------------------ + +# Phi-3 specific RAG prompt template +rag_prompt_template = """<|user|> +Use the following context to answer the question. If you don't know the answer based on the context, just say you don't know. + +Context: +{context} + +Question: {question} +<|assistant|>""" + +# Create the prompt +prompt = PromptTemplate( + template=rag_prompt_template, + input_variables=["context", "question"] +) + +# ------------------------------------------------------ +# 6. CREATE RAG CHAIN +# ------------------------------------------------------ + +# Create the retrieval QA chain +qa_chain = RetrievalQA.from_chain_type( + llm=llm, + chain_type="stuff", # "stuff" method puts all retrieved docs into one prompt + retriever=vectorstore.as_retriever(search_kwargs={"k": 3}), # Retrieve top 3 results + return_source_documents=True, # Return source docs for transparency + chain_type_kwargs={"prompt": prompt} # Use our custom prompt +) + +# ------------------------------------------------------ +# 7. QUERY FUNCTIONS +# ------------------------------------------------------ + +def ask_rag(question: str): + """Query the RAG system with a question""" + print(f"\nQuestion: {question}") + + # Get response from the chain + response = qa_chain({"query": question}) + + # Print the answer + print("\nAnswer:") + print(response["result"]) + + # Print the source documents + print("\nSources:") + for i, doc in enumerate(response["source_documents"]): + print(f"\nSource {i+1}: {doc.metadata['source']}") + print(f"Content: {doc.page_content}") + + return response + +# ------------------------------------------------------ +# 8. 
FUNCTION TO LOAD CUSTOM DOCUMENTS
+# ------------------------------------------------------
+
+def load_docs_from_files(file_paths: List[str]):
+    """Load documents from files"""
+    from langchain_community.document_loaders import TextLoader, PyPDFLoader
+
+    all_docs = []
+    for file_path in file_paths:
+        try:
+            if file_path.lower().endswith('.pdf'):
+                loader = PyPDFLoader(file_path)
+            else:
+                loader = TextLoader(file_path)
+            docs = loader.load()
+            all_docs.extend(docs)
+            print(f"Loaded {len(docs)} document(s) from {file_path}")
+        except Exception as e:
+            print(f"Error loading {file_path}: {e}")
+
+    return all_docs
+
+def create_rag_from_files(file_paths: List[str]):
+    """Create a new RAG system from the provided files"""
+    # Load documents
+    loaded_docs = load_docs_from_files(file_paths)
+
+    # Split documents
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=500,
+        chunk_overlap=50,
+        separators=["\n\n", "\n", ".", " ", ""]
+    )
+    split_docs = text_splitter.split_documents(loaded_docs)
+
+    # Create vector store
+    new_vectorstore = FAISS.from_documents(split_docs, embeddings)
+
+    # Create QA chain
+    new_qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=new_vectorstore.as_retriever(search_kwargs={"k": 3}),
+        return_source_documents=True,
+        chain_type_kwargs={"prompt": prompt} # Use our custom prompt
+    )
+
+    return new_qa_chain
+
+# ------------------------------------------------------
+# 9. EXAMPLE USAGE
+# ------------------------------------------------------
+
+if __name__ == "__main__":
+    # Test with sample questions
+    print("\n===== RAG System Demo =====")
+
+    # Example 1: Basic retrieval
+    ask_rag("What is the difference between machine learning and deep learning?")
+
+    # Example 2: Testing knowledge boundaries
+    ask_rag("What are the key components of a neural network?")
+
+    # Example 3: Question outside the knowledge base
+    ask_rag("What is the capital of France?")
+
+    print("\n===== Demo Complete =====")
+
+# ------------------------------------------------------
+# 10. SAVE AND LOAD VECTOR STORE FOR FUTURE USE
+# ------------------------------------------------------
+
+def save_vectorstore(vectorstore, directory="faiss_index"):
+    """Save the FAISS vector store to disk"""
+    vectorstore.save_local(directory)
+    print(f"Vector store saved to {directory}")
+
+def load_vectorstore(directory="faiss_index"):
+    """Load a FAISS vector store from disk"""
+    if os.path.exists(directory):
+        # Opting in to pickle deserialization is required by the pinned langchain-community release
+        loaded_vectorstore = FAISS.load_local(directory, embeddings, allow_dangerous_deserialization=True)
+        print(f"Vector store loaded from {directory}")
+        return loaded_vectorstore
+    else:
+        print(f"No vector store found at {directory}")
+        return None
\ No newline at end of file
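
As a quick local check of the new RAG module (beyond the CI invocation in the workflow above), a minimal smoke-test sketch might look like the following. This is only an illustration under a few assumptions: it is run from the repository root with tests/ and tests/llm/ importable as packages, the quantized Phi-3 model files are present under tests/llm/cpu_and_mobile/, and faiss-cpu plus sentence-transformers are installed in the environment (faiss-cpu is not pinned in requirements.txt).

# Hypothetical smoke test; names come from tests/llm/llm_rag.py in this patch.
# Importing the module loads the local Phi-3 ONNX model and builds the
# sample FAISS index, exposing ask_rag(), vectorstore, and embeddings.
from tests.llm import llm_rag

# Query the in-memory sample knowledge base.
llm_rag.ask_rag("How does reinforcement learning differ from supervised learning?")

# Round-trip the FAISS index through disk. Recent langchain-community
# releases require explicitly opting in to pickle deserialization when
# reloading a locally created index, which load_vectorstore handles.
llm_rag.save_vectorstore(llm_rag.vectorstore, "faiss_index")
restored = llm_rag.load_vectorstore("faiss_index")
if restored is not None:
    print(restored.similarity_search("neural networks", k=2))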