start splitting into classes; use env vars for foundation model file dependencies

This commit is contained in:
Adam Wilson
2025-05-26 18:27:23 -06:00
parent 8bb4a473ca
commit 39c779058d
7 changed files with 236 additions and 91 deletions

2
.gitignore vendored
View File

@@ -175,5 +175,5 @@ cython_debug/
# HuggingFace / Microsoft LLM supporting files
# (these are downloaded for local development via bash script, or inside GH Action workflow context)
src/**/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/**
infrastructure/foundation_model/cpu_and_mobile/**
logs

19
infrastructure/README.md Normal file
View File

@@ -0,0 +1,19 @@
# Infrastructure
This directory is the download location for the foundation model (a pre-trained generative language model).
## Model Choice
The foundation model for this project had to satisfy multiple constraints:
1. __Repo storage limits:__ Even with Git LFS enabled, GitHub restricts repository size to 5GB (at least for the free tier).
1. __Build system storage limits:__ [Standard Linux runners](https://docs.github.com/en/actions/using-github-hosted-runners/using-github-hosted-runners/about-github-hosted-runners?ref=devtron.ai#standard-github-hosted-runners-for-public-repositories) in GitHub Actions have a 14GB SSD.
The CPU-optimized [`microsoft/Phi-3-mini-4k-instruct-onnx`](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) model fits within both of these storage limits.
## Provisioning the Foundation Model
The foundation model dependency is provisioned differently for local development than for the build system:
1. __Local:__ The model is downloaded once by the `./run.sh` shell script at the project root and excluded via `.gitignore`, since it is too large for GitHub's LFS limits.
1. __Build System:__ The model is downloaded on every workflow run with `huggingface-cli` (an equivalent download is sketched below).
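For illustration only, the same download can be expressed with the `huggingface_hub` Python API rather than the CLI. This is a hedged sketch: the repo id, include pattern, and target directory are taken from `run.sh`; the rest is an assumption, not the project's actual provisioning code.

```python
# Sketch: fetch only the CPU-optimized Phi-3 ONNX files via huggingface_hub.
from huggingface_hub import snapshot_download

# These paths mirror the MODEL_BASE_DIR / MODEL_CPU_DIR values exported in run.sh.
MODEL_BASE_DIR = "./infrastructure/foundation_model"
MODEL_CPU_DIR = "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"

# Download only the int4 CPU variant so the files fit within the runner's SSD budget.
snapshot_download(
    repo_id="microsoft/Phi-3-mini-4k-instruct-onnx",
    allow_patterns=f"{MODEL_CPU_DIR}/*",
    local_dir=MODEL_BASE_DIR,
)
```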

View File

@@ -0,0 +1,106 @@
aiohappyeyeballs==2.6.1
aiohttp==3.11.18
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.9.0
attrs==25.3.0
certifi==2025.4.26
charset-normalizer==3.4.2
coloredlogs==15.0.1
dataclasses-json==0.6.7
datasets==3.6.0
dill==0.3.8
faiss-cpu==1.11.0
filelock==3.18.0
flatbuffers==25.2.10
frozenlist==1.6.0
fsspec==2025.3.0
greenlet==3.2.2
h11==0.16.0
hf-xet==1.1.2
httpcore==1.0.9
httpx==0.28.1
httpx-sse==0.4.0
huggingface-hub==0.32.0
humanfriendly==10.0
idna==3.10
inquirerpy==0.3.4
Jinja2==3.1.6
joblib==1.5.1
jsonpatch==1.33
jsonpointer==3.0.0
langchain==0.3.25
langchain-community==0.3.24
langchain-core==0.3.61
langchain-huggingface==0.2.0
langchain-text-splitters==0.3.8
langsmith==0.3.42
MarkupSafe==3.0.2
marshmallow==3.26.1
mpmath==1.3.0
multidict==6.4.4
multiprocess==0.70.16
mypy_extensions==1.1.0
networkx==3.4.2
numpy==2.2.6
nvidia-cublas-cu12==12.6.4.1
nvidia-cuda-cupti-cu12==12.6.80
nvidia-cuda-nvrtc-cu12==12.6.77
nvidia-cuda-runtime-cu12==12.6.77
nvidia-cudnn-cu12==9.5.1.17
nvidia-cufft-cu12==11.3.0.4
nvidia-cufile-cu12==1.11.1.6
nvidia-curand-cu12==10.3.7.77
nvidia-cusolver-cu12==11.7.1.2
nvidia-cusparse-cu12==12.5.4.2
nvidia-cusparselt-cu12==0.6.3
nvidia-nccl-cu12==2.26.2
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.6.77
onnx==1.18.0
onnxruntime==1.22.0
optimum==1.25.3
orjson==3.10.18
packaging==24.2
pandas==2.2.3
pfzy==0.3.4
pillow==11.2.1
prompt_toolkit==3.0.51
propcache==0.3.1
protobuf==6.31.0
pyarrow==20.0.0
pydantic==2.11.5
pydantic-settings==2.9.1
pydantic_core==2.33.2
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
pytz==2025.2
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
requests-toolbelt==1.0.0
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.15.3
sentence-transformers==4.1.0
setuptools==80.8.0
six==1.17.0
sniffio==1.3.1
SQLAlchemy==2.0.41
sympy==1.14.0
tenacity==9.1.2
threadpoolctl==3.6.0
tokenizers==0.21.1
torch==2.7.0
tqdm==4.67.1
transformers==4.51.3
triton==3.3.0
typing-inspect==0.9.0
typing-inspection==0.4.1
typing_extensions==4.13.2
tzdata==2025.2
urllib3==2.4.0
wcwidth==0.2.13
xxhash==3.5.0
yarl==1.20.0
zstandard==0.23.0

27
run.sh
View File

@@ -7,12 +7,29 @@ source .env/bin/activate
# the ONNX model/data files require Git Large File Storage (LFS) support
git lfs install
# pip install huggingface-hub[cli]
# install Python dependencies
# pip install huggingface-hub[cli] langchain langchain_huggingface langchain_community optimum[onnxruntime] faiss-cpu
pip install -r ./requirements.txt
# # get foundation model dependencies from HuggingFace / Microsoft
# huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx \
# --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* \
# --local-dir ./infrastructure/foundation_model
# environment variables
export MODEL_BASE_DIR="./infrastructure/foundation_model"
export MODEL_CPU_DIR="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"
MODEL_DATA_FILENAME="phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx.data"
MODEL_DATA_FILEPATH="$MODEL_BASE_DIR/$MODEL_CPU_DIR/$MODEL_DATA_FILENAME"
echo "==================="
echo "$MODEL_DATA_FILEPATH"
echo "==================="
# get foundation model dependencies from HuggingFace / Microsoft
if [ ! -f "$MODEL_DATA_FILEPATH" ]; then
echo "Downloading foundation model..."
huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx \
--include "$MODEL_CPU_DIR/*" \
--local-dir "$MODEL_BASE_DIR"
else
echo "Foundation model files already exist at: $MODEL_DATA_FILEPATH"
fi
python -m src.text_generation.entrypoints.server

View File

@@ -3,18 +3,13 @@ RAG implementation with local Phi-3-mini-4k-instruct-onnx and embeddings
"""
import logging
import os
import sys
# LangChain imports
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# HuggingFace and ONNX imports
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer, pipeline
from src.text_generation.adapters.llm.text_generation_model import TextGenerationFoundationModel
class Phi3LanguageModel:
@@ -29,40 +24,8 @@ class Phi3LanguageModel:
def configure_model(self):
# Set up paths to the local model
base_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
self.logger.debug(f"Loading Phi-3 model from: {model_path}")
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=model_path,
trust_remote_code=True,
local_files_only=True
)
model = ORTModelForCausalLM.from_pretrained(
model_path,
provider="CPUExecutionProvider",
trust_remote_code=True,
local_files_only=True
)
model.name_or_path = model_path
# Create the text generation pipeline
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
max_new_tokens=256,
temperature=0.7,
top_p=0.9,
repetition_penalty=1.1,
use_fast=True,
do_sample=True
)
# Create the LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe)
llm = TextGenerationFoundationModel().build()
# Phi-3 specific prompt template
template = """<|user|>
@@ -91,10 +54,7 @@ class Phi3LanguageModel:
def invoke(self, user_input: str) -> str:
try:
# Get response from the chain
self.logger.debug(f'===Prompt: {user_input}\n\n')
response = self.chain.invoke(user_input)
# Print the answer
self.logger.debug(f'===Response: {response}\n\n')
return response
except Exception as e:
self.logger.error(f"Failed: {e}")

View File

@@ -2,57 +2,33 @@
RAG implementation with local Phi-3-mini-4k-instruct-onnx and embeddings
"""
import os
import logging
import sys
# LangChain imports
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document
# HuggingFace and ONNX imports
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer, pipeline
from src.text_generation.adapters.llm.text_generation_model import TextGenerationFoundationModel
class Phi3LanguageModelWithRag:
def invoke(self, user_input):
def __init__(self):
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
logger.addHandler(handler)
self.logger = logger
self.configure_model()
# Set up paths to the local model
base_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
print(f"Loading Phi-3 model from: {model_path}")
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=model_path,
trust_remote_code=True
)
model = ORTModelForCausalLM.from_pretrained(
model_id=model_path,
provider="CPUExecutionProvider",
trust_remote_code=True
)
model.name_or_path = model_path
# Create the text generation pipeline
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
max_new_tokens=512,
temperature=0.7,
top_p=0.9,
repetition_penalty=1.1,
do_sample=True
)
def configure_model(self):
# Create the LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe)
llm = TextGenerationFoundationModel().build()
# Initialize the embedding model - using a small, efficient model
# Options:
@@ -64,7 +40,6 @@ class Phi3LanguageModelWithRag:
model_kwargs={"device": "cpu"},
encode_kwargs={"normalize_embeddings": True}
)
print("Embedding model loaded")
# Sample documents about artificial intelligence
docs = [
@@ -141,7 +116,7 @@ class Phi3LanguageModelWithRag:
)
# Create the retrieval QA chain
qa_chain = RetrievalQA.from_chain_type(
self.qa_chain = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff", # "stuff" method puts all retrieved docs into one prompt
retriever=vectorstore.as_retriever(search_kwargs={"k": 3}), # Retrieve top 3 results
@@ -149,10 +124,9 @@ class Phi3LanguageModelWithRag:
chain_type_kwargs={"prompt": prompt} # Use our custom prompt
)
def invoke(self, user_input: str) -> str:
# Get response from the chain
response = qa_chain.invoke({"query": user_input})
# Print the answer
print(response["result"])
response = self.qa_chain.invoke({"query": user_input})
return response["result"]

View File

@@ -0,0 +1,69 @@
"""
Builder for the local Phi-3-mini-4k-instruct-onnx text-generation pipeline (foundation model)
"""
import logging
import os
import sys
# LangChain imports
from langchain_huggingface import HuggingFacePipeline
# HuggingFace and ONNX imports
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer, pipeline
class TextGenerationFoundationModel:
def __init__(self):
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
logger.addHandler(handler)
self.logger = logger
def build(self) -> HuggingFacePipeline:
# Set up paths to the local model
# base_dir = os.path.dirname(os.path.abspath(__file__))
# model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
model_base_dir = os.environ.get('MODEL_BASE_DIR')
model_cpu_dir = os.environ.get('MODEL_CPU_DIR')
model_path = os.path.join(model_base_dir, model_cpu_dir)
self.logger.debug(f'model_base_dir: {model_base_dir}')
self.logger.debug(f'model_cpu_dir: {model_cpu_dir}')
self.logger.debug(f"Loading Phi-3 model from: {model_path}")
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=model_path,
trust_remote_code=True,
local_files_only=True
)
model = ORTModelForCausalLM.from_pretrained(
model_path,
provider="CPUExecutionProvider",
trust_remote_code=True,
local_files_only=True
)
model.name_or_path = model_path
# Create the text generation pipeline
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
max_new_tokens=256,
temperature=0.7,
top_p=0.9,
repetition_penalty=1.1,
use_fast=True,
do_sample=True
)
# Create the LangChain LLM
return HuggingFacePipeline(pipeline=pipe)
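A minimal usage sketch of the builder, assuming the `MODEL_BASE_DIR` / `MODEL_CPU_DIR` environment variables are exported (as `run.sh` does) and the model files are already downloaded; the defaults and the prompt string below are illustrative assumptions, not project code.

```python
import os

# Defaults mirror run.sh; normally run.sh exports these before starting the server.
os.environ.setdefault("MODEL_BASE_DIR", "./infrastructure/foundation_model")
os.environ.setdefault("MODEL_CPU_DIR", "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4")

from src.text_generation.adapters.llm.text_generation_model import TextGenerationFoundationModel

# build() returns a LangChain HuggingFacePipeline, so the result can be invoked
# directly or composed into a chain (as Phi3LanguageModel and Phi3LanguageModelWithRag do).
llm = TextGenerationFoundationModel().build()
print(llm.invoke("<|user|>\nWhat is ONNX Runtime?<|end|>\n<|assistant|>"))
```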