LangChain WIP; demo of RAG integration from Claude Sonnet 3.7

Adam Wilson
2025-05-15 09:30:25 -06:00
parent 8fc43066e0
commit bf145e70b6
4 changed files with 483 additions and 11 deletions


@@ -31,5 +31,6 @@ jobs:
 pip install langchain_huggingface
 pip install transformers
 pip install accelerate
-python -m tests.llm.llm --prompt "What is the capital of France?"
+pip install optimum[exporters,onnxruntime]
+python -m tests.llm.llm_rag --prompt "What is the capital of France?"

requirements.txt (new file, 192 additions)

@@ -0,0 +1,192 @@
accelerate==1.6.0
aiohappyeyeballs==2.6.1
aiohttp==3.11.18
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.9.0
attrs==25.3.0
avidtools==0.1.2
backoff==2.2.1
base2048==0.1.3
boto3==1.38.2
botocore==1.38.2
cachetools==5.5.2
certifi==2025.1.31
cffi==1.17.1
charset-normalizer==3.4.1
chevron==0.14.0
click==8.1.8
cmd2==2.4.3
cohere==4.57
colorama==0.4.6
coloredlogs==15.0.1
dataclasses-json==0.6.7
datasets==2.16.1
DateTime==5.5
deepl==1.17.0
dill==0.3.7
distro==1.9.0
ecoji==0.1.1
fastapi==0.115.12
fastavro==1.10.0
filelock==3.18.0
flatbuffers==25.2.10
frozenlist==1.6.0
fschat==0.2.36
fsspec==2023.10.0
garak==0.10.3.1
google-api-core==2.24.2
google-api-python-client==2.168.0
google-auth==2.39.0
google-auth-httplib2==0.2.0
googleapis-common-protos==1.70.0
greenlet==3.2.1
h11==0.14.0
httpcore==1.0.8
httplib2==0.22.0
httpx==0.28.1
httpx-sse==0.4.0
huggingface-hub==0.31.2
humanfriendly==10.0
idna==3.10
importlib-metadata==6.11.0
inquirerpy==0.3.4
Jinja2==3.1.6
jiter==0.9.0
jmespath==1.0.1
joblib==1.4.2
jsonpatch==1.33
jsonpath-ng==1.7.0
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2025.4.1
langchain==0.3.25
langchain-community==0.3.24
langchain-core==0.3.59
langchain-huggingface==0.2.0
langchain-text-splitters==0.3.8
langsmith==0.3.33
latex2mathml==3.77.0
litellm==1.67.2
lorem==0.1.1
Markdown==3.8
markdown-it-py==3.0.0
markdown2==2.5.3
MarkupSafe==3.0.2
marshmallow==3.26.1
mdurl==0.1.2
mpmath==1.3.0
multidict==6.4.3
multiprocess==0.70.15
mypy_extensions==1.1.0
nemollm==0.3.5
networkx==3.4.2
nh3==0.2.21
nltk==3.9.1
numpy==1.26.4
nvdlib==0.8.0
nvidia-cublas-cu12==12.6.4.1
nvidia-cuda-cupti-cu12==12.6.80
nvidia-cuda-nvrtc-cu12==12.6.77
nvidia-cuda-runtime-cu12==12.6.77
nvidia-cudnn-cu12==9.5.1.17
nvidia-cufft-cu12==11.3.0.4
nvidia-cufile-cu12==1.11.1.6
nvidia-curand-cu12==10.3.7.77
nvidia-cusolver-cu12==11.7.1.2
nvidia-cusparse-cu12==12.5.4.2
nvidia-cusparselt-cu12==0.6.3
nvidia-nccl-cu12==2.26.2
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.6.77
octoai-sdk==0.10.1
ollama==0.4.8
onnx==1.18.0
onnxruntime==1.21.0
onnxruntime-genai==0.7.0
openai==1.76.0
optimum==1.25.0
orjson==3.10.16
packaging==24.2
pandas==2.2.3
pfzy==0.3.4
pillow==10.4.0
ply==3.11
prompt_toolkit==3.0.50
propcache==0.3.1
proto-plus==1.26.1
protobuf==6.30.2
psutil==7.0.0
pyarrow==19.0.1
pyarrow-hotfix==0.6
pyasn1==0.6.1
pyasn1_modules==0.4.2
pycparser==2.22
pydantic==2.11.3
pydantic-settings==2.9.1
pydantic_core==2.33.1
Pygments==2.19.1
pyparsing==3.2.3
pyperclip==1.9.0
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
python-magic==0.4.27
python-multipart==0.0.20
pytz==2025.2
PyYAML==6.0.2
RapidFuzz==3.13.0
referencing==0.36.2
regex==2024.11.6
replicate==1.0.4
requests==2.32.3
requests-futures==1.0.2
requests-toolbelt==1.0.0
rich==14.0.0
rpds-py==0.24.0
rsa==4.9.1
s3transfer==0.12.0
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.15.3
sentence-transformers==4.1.0
sentencepiece==0.2.0
setuptools==79.0.1
shortuuid==1.0.13
six==1.17.0
sniffio==1.3.1
soundfile==0.13.1
SQLAlchemy==2.0.40
starlette==0.46.2
stdlibs==2025.4.4
svgwrite==1.4.3
sympy==1.13.3
tenacity==9.1.2
threadpoolctl==3.6.0
tiktoken==0.9.0
timm==1.0.15
tokenizers==0.21.1
tomli==2.2.1
torch==2.7.0
torchvision==0.22.0
tqdm==4.67.1
transformers==4.51.3
triton==3.3.0
types-PyYAML==6.0.12.20250402
types-requests==2.32.0.20250328
typing-inspect==0.9.0
typing-inspection==0.4.0
typing_extensions==4.13.1
tzdata==2025.2
uritemplate==4.1.1
urllib3==2.3.0
uvicorn==0.34.2
wavedrom==2.0.3.post3
wcwidth==0.2.13
wn==0.9.5
xdg-base-dirs==6.0.2
xxhash==3.5.0
yarl==1.20.0
zalgolib==0.2.2
zipp==3.21.0
zope.interface==7.2
zstandard==0.23.0


@@ -8,36 +8,52 @@ from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_huggingface import HuggingFacePipeline
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import os
+from langchain_community.llms import HuggingFacePipeline
+from optimum.onnxruntime import ORTModelForCausalLM
+from transformers import AutoTokenizer, pipeline
 class Llm:
     def __init__(self, model_path=None):
         # Get path to the model directory using your specified structure
         base_dir = os.path.dirname(os.path.abspath(__file__))
-        model_path = os.path.join(base_dir, "phi3")
-        tokenizer = AutoTokenizer.from_pretrained(model_path)
-        model = AutoModelForCausalLM.from_pretrained(
-            pretrained_model_name_or_path=model_path,
-            device_map="cpu",  # Use available GPU
-            trust_remote_code=True,  # If model requires custom code
+        model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
+        # Load the tokenizer from local path
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True  # Important for some models with custom code
         )
-        # Create a pipeline
+        # Load the ONNX model with optimum from local path
+        model = ORTModelForCausalLM.from_pretrained(
+            model_path,
+            provider="CPUExecutionProvider",
+            trust_remote_code=True
+        )
+        # Create a text generation pipeline
         pipe = pipeline(
             "text-generation",
             model=model,
             tokenizer=tokenizer,
             max_new_tokens=512,
             temperature=0.7,
             top_p=0.9,
             repetition_penalty=1.1,
             do_sample=True
         )
-        # Create LangChain LLM
-        self.hf_model = HuggingFacePipeline(pipeline=pipe)
+        # Create the LangChain LLM
+        self.llm = HuggingFacePipeline(pipeline=pipe)
     def get_response(self, input):
         # Use the model
         print(f'End user prompt: {input}')
-        response = self.hf_model.invoke(input)
+        response = self.llm.invoke(input)
         print(response)
 if __name__ == "__main__":
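For reference, a minimal sketch of exercising the updated class outside CI (the --prompt argument is presumably parsed in the __main__ block trimmed from this hunk; the snippet assumes the module is importable as tests.llm.llm, as the workflow step suggests):

from tests.llm.llm import Llm

# Instantiate the wrapper; this loads the local ONNX Phi-3 model on CPU
llm = Llm()
# Generate and print a response for a single prompt
llm.get_response("What is the capital of France?")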

tests/llm/llm_rag.py (new file, 263 additions)

@@ -0,0 +1,263 @@
"""
RAG implementation with local Phi-3-mini-4k-instruct-onnx and embeddings
"""
import os
from typing import List
import numpy as np
# LangChain imports
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document
# HuggingFace and ONNX imports
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer, pipeline
# ------------------------------------------------------
# 1. LOAD THE LOCAL PHI-3 MODEL
# ------------------------------------------------------
# Set up paths to the local model
base_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
print(f"Loading Phi-3 model from: {model_path}")
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = ORTModelForCausalLM.from_pretrained(
    model_path,
    provider="CPUExecutionProvider",
    trust_remote_code=True
)
# Create the text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
    do_sample=True
)
# Create the LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe)
# ------------------------------------------------------
# 2. LOAD THE EMBEDDING MODEL
# ------------------------------------------------------
# Initialize the embedding model - using a small, efficient model
# Options:
# - "BAAI/bge-small-en-v1.5" (385MB, good performance/size ratio)
# - "sentence-transformers/all-MiniLM-L6-v2" (91MB, very small)
# - "intfloat/e5-small-v2" (134MB, good performance)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True}
)
print("Embedding model loaded")
# ------------------------------------------------------
# 3. CREATE A SAMPLE DOCUMENT COLLECTION
# ------------------------------------------------------
# Sample documents about artificial intelligence
docs = [
    Document(
        page_content="Artificial intelligence (AI) is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals, which involves consciousness and emotionality.",
        metadata={"source": "AI Definition"}
    ),
    Document(
        page_content="Machine learning is a subset of AI focused on building systems that learn from data. Deep learning is a subset of machine learning based on neural networks.",
        metadata={"source": "Machine Learning"}
    ),
    Document(
        page_content="Neural networks are computing systems inspired by the biological neural networks that constitute animal brains. They form the basis of many modern AI systems.",
        metadata={"source": "Neural Networks"}
    ),
    Document(
        page_content="Natural Language Processing (NLP) is a field of AI that focuses on the interaction between computers and humans through natural language.",
        metadata={"source": "NLP"}
    ),
    Document(
        page_content="Reinforcement learning is an area of machine learning where an agent learns to make decisions by taking actions in an environment to maximize a reward signal.",
        metadata={"source": "Reinforcement Learning"}
    ),
    Document(
        page_content="Computer vision is a field of AI that enables computers to derive meaningful information from digital images, videos and other visual inputs.",
        metadata={"source": "Computer Vision"}
    ),
]
# ------------------------------------------------------
# 4. SPLIT DOCUMENTS AND CREATE VECTOR STORE
# ------------------------------------------------------
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", " ", ""]
)
# Split the documents
split_docs = text_splitter.split_documents(docs)
print(f"Split {len(docs)} documents into {len(split_docs)} chunks")
# Create a FAISS vector store from the chunks
vectorstore = FAISS.from_documents(split_docs, embeddings)
print("Vector store created")
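Not part of the committed file, but a quick way to sanity-check the index before it is wired into a chain is to query the vector store directly (a sketch using the vectorstore built above; the query string is illustrative):

# Retrieve the 2 chunks nearest to a test query and show their sources
hits = vectorstore.similarity_search("What is deep learning?", k=2)
for hit in hits:
    print(hit.metadata["source"], "->", hit.page_content[:80])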
# ------------------------------------------------------
# 5. CREATE RAG PROMPT TEMPLATE FOR PHI-3
# ------------------------------------------------------
# Phi-3 specific RAG prompt template
rag_prompt_template = """<|user|>
Use the following context to answer the question. If you don't know the answer based on the context, just say you don't know.
Context:
{context}
Question: {question}
<|assistant|>"""
# Create the prompt
prompt = PromptTemplate(
    template=rag_prompt_template,
    input_variables=["context", "question"]
)
# ------------------------------------------------------
# 6. CREATE RAG CHAIN
# ------------------------------------------------------
# Create the retrieval QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # "stuff" method puts all retrieved docs into one prompt
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),  # Retrieve top 3 results
    return_source_documents=True,  # Return source docs for transparency
    chain_type_kwargs={"prompt": prompt}  # Use our custom prompt
)
# ------------------------------------------------------
# 7. QUERY FUNCTIONS
# ------------------------------------------------------
def ask_rag(question: str):
    """Query the RAG system with a question"""
    print(f"\nQuestion: {question}")
    # Get response from the chain
    response = qa_chain({"query": question})
    # Print the answer
    print("\nAnswer:")
    print(response["result"])
    # Print the source documents
    print("\nSources:")
    for i, doc in enumerate(response["source_documents"]):
        print(f"\nSource {i+1}: {doc.metadata['source']}")
        print(f"Content: {doc.page_content}")
    return response
# ------------------------------------------------------
# 8. FUNCTION TO LOAD CUSTOM DOCUMENTS
# ------------------------------------------------------
def load_docs_from_files(file_paths: List[str]):
    """Load documents from files"""
    from langchain_community.document_loaders import TextLoader, PyPDFLoader
    all_docs = []
    for file_path in file_paths:
        try:
            if file_path.lower().endswith('.pdf'):
                loader = PyPDFLoader(file_path)
            else:
                loader = TextLoader(file_path)
            docs = loader.load()
            all_docs.extend(docs)
            print(f"Loaded {len(docs)} document(s) from {file_path}")
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
    return all_docs
def create_rag_from_files(file_paths: List[str]):
    """Create a new RAG system from the provided files"""
    # Load documents
    loaded_docs = load_docs_from_files(file_paths)
    # Split documents
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    split_docs = text_splitter.split_documents(loaded_docs)
    # Create vector store
    new_vectorstore = FAISS.from_documents(split_docs, embeddings)
    # Create QA chain
    new_qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=new_vectorstore.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt}  # Use our custom prompt
    )
    return new_qa_chain
# ------------------------------------------------------
# 9. EXAMPLE USAGE
# ------------------------------------------------------
if __name__ == "__main__":
    # Test with sample questions
    print("\n===== RAG System Demo =====")
    # Example 1: Basic retrieval
    ask_rag("What is the difference between machine learning and deep learning?")
    # Example 2: Testing knowledge boundaries
    ask_rag("What are the key components of a neural network?")
    # Example 3: Question outside the knowledge base
    ask_rag("What is the capital of France?")
    print("\n===== Demo Complete =====")
# ------------------------------------------------------
# 10. SAVE AND LOAD VECTOR STORE FOR FUTURE USE
# ------------------------------------------------------
def save_vectorstore(vectorstore, directory="faiss_index"):
    """Save the FAISS vector store to disk"""
    vectorstore.save_local(directory)
    print(f"Vector store saved to {directory}")
def load_vectorstore(directory="faiss_index"):
    """Load a FAISS vector store from disk"""
    if os.path.exists(directory):
        # Recent langchain-community releases refuse to unpickle a local index
        # unless allow_dangerous_deserialization is set explicitly
        loaded_vectorstore = FAISS.load_local(
            directory,
            embeddings,
            allow_dangerous_deserialization=True
        )
        print(f"Vector store loaded from {directory}")
        return loaded_vectorstore
    else:
        print(f"No vector store found at {directory}")
        return None
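A minimal sketch of how the persistence helpers above could round-trip the index built in section 4 and rebuild a chain from disk on a later run (reusing the module-level llm, embeddings, and prompt; the query string is illustrative):

# Persist the in-memory FAISS index, then restore it and rebuild the QA chain
save_vectorstore(vectorstore, directory="faiss_index")

restored = load_vectorstore("faiss_index")
if restored is not None:
    restored_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=restored.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt}
    )
    print(restored_chain.invoke({"query": "What is machine learning?"})["result"])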