diff --git a/.gitignore b/.gitignore
index 9c6e0eb8c..08ad87c8d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -175,5 +175,5 @@ cython_debug/
 
 # HuggingFace / Microsoft LLM supporting files
 # (these are downloaded for local development via bash script, or inside GH Action workflow context)
-src/**/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/**
+infrastructure/foundation_model/cpu_and_mobile/**
 logs
\ No newline at end of file
diff --git a/infrastructure/README.md b/infrastructure/README.md
new file mode 100644
index 000000000..8e962f758
--- /dev/null
+++ b/infrastructure/README.md
@@ -0,0 +1,19 @@
+# Infrastructure
+
+This directory contains the foundation model (a pre-trained generative language model).
+
+## Model Choice
+
+The foundation model for this project needed to work under multiple constraints:
+
+1. __Repo storage limits:__ Even with Git LFS enabled, GitHub strongly discourages repositories larger than 5GB (at least on the free tier).
+1. __Build system storage limits:__ [Standard Linux runners](https://docs.github.com/en/actions/using-github-hosted-runners/using-github-hosted-runners/about-github-hosted-runners?ref=devtron.ai#standard-github-hosted-runners-for-public-repositories) in GitHub Actions have a 14GB SSD.
+
+The CPU-optimized [`microsoft/Phi-3-mini-4k-instruct-onnx`](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) model fits within both storage constraints.
+
+## Provisioning the Foundation Model
+
+The foundation model dependency is provisioned differently for local development vs. the build system:
+
+1. __Local:__ The model is downloaded once by the `./run.sh` shell script at the project root (the underlying command is shown below), and excluded via `.gitignore` since it exceeds GitHub's LFS limits.
+1. __Build System:__ The model is downloaded on every workflow run with `huggingface-cli`.
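+
+For reference, the download command (mirroring the `huggingface-cli` invocation in `./run.sh`; see that script for the environment variables these paths come from):
+
+```bash
+huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx \
+  --include "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*" \
+  --local-dir ./infrastructure/foundation_model
+```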
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index e69de29bb..93ac15959 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,106 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.18
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.9.0
+attrs==25.3.0
+certifi==2025.4.26
+charset-normalizer==3.4.2
+coloredlogs==15.0.1
+dataclasses-json==0.6.7
+datasets==3.6.0
+dill==0.3.8
+faiss-cpu==1.11.0
+filelock==3.18.0
+flatbuffers==25.2.10
+frozenlist==1.6.0
+fsspec==2025.3.0
+greenlet==3.2.2
+h11==0.16.0
+hf-xet==1.1.2
+httpcore==1.0.9
+httpx==0.28.1
+httpx-sse==0.4.0
+huggingface-hub==0.32.0
+humanfriendly==10.0
+idna==3.10
+inquirerpy==0.3.4
+Jinja2==3.1.6
+joblib==1.5.1
+jsonpatch==1.33
+jsonpointer==3.0.0
+langchain==0.3.25
+langchain-community==0.3.24
+langchain-core==0.3.61
+langchain-huggingface==0.2.0
+langchain-text-splitters==0.3.8
+langsmith==0.3.42
+MarkupSafe==3.0.2
+marshmallow==3.26.1
+mpmath==1.3.0
+multidict==6.4.4
+multiprocess==0.70.16
+mypy_extensions==1.1.0
+networkx==3.4.2
+numpy==2.2.6
+nvidia-cublas-cu12==12.6.4.1
+nvidia-cuda-cupti-cu12==12.6.80
+nvidia-cuda-nvrtc-cu12==12.6.77
+nvidia-cuda-runtime-cu12==12.6.77
+nvidia-cudnn-cu12==9.5.1.17
+nvidia-cufft-cu12==11.3.0.4
+nvidia-cufile-cu12==1.11.1.6
+nvidia-curand-cu12==10.3.7.77
+nvidia-cusolver-cu12==11.7.1.2
+nvidia-cusparse-cu12==12.5.4.2
+nvidia-cusparselt-cu12==0.6.3
+nvidia-nccl-cu12==2.26.2
+nvidia-nvjitlink-cu12==12.6.85
+nvidia-nvtx-cu12==12.6.77
+onnx==1.18.0
+onnxruntime==1.22.0
+optimum==1.25.3
+orjson==3.10.18
+packaging==24.2
+pandas==2.2.3
+pfzy==0.3.4
+pillow==11.2.1
+prompt_toolkit==3.0.51
+propcache==0.3.1
+protobuf==6.31.0
+pyarrow==20.0.0
+pydantic==2.11.5
+pydantic-settings==2.9.1
+pydantic_core==2.33.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.0
+pytz==2025.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+requests-toolbelt==1.0.0
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy==1.15.3
+sentence-transformers==4.1.0
+setuptools==80.8.0
+six==1.17.0
+sniffio==1.3.1
+SQLAlchemy==2.0.41
+sympy==1.14.0
+tenacity==9.1.2
+threadpoolctl==3.6.0
+tokenizers==0.21.1
+torch==2.7.0
+tqdm==4.67.1
+transformers==4.51.3
+triton==3.3.0
+typing-inspect==0.9.0
+typing-inspection==0.4.1
+typing_extensions==4.13.2
+tzdata==2025.2
+urllib3==2.4.0
+wcwidth==0.2.13
+xxhash==3.5.0
+yarl==1.20.0
+zstandard==0.23.0
diff --git a/run.sh b/run.sh
index f016ada9d..6078c8b55 100755
--- a/run.sh
+++ b/run.sh
@@ -7,12 +7,29 @@ source .env/bin/activate
 
 # the ONNX model/data require git Large File System support
 git lfs install
 
-# pip install huggingface-hub[cli]
+# install Python dependencies
+# pip install huggingface-hub[cli] langchain langchain_huggingface langchain_community optimum[onnxruntime] faiss-cpu
+pip install -r ./requirements.txt
 
-# # get foundation model dependencies from HuggingFace / Microsoft
-# huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx \
-#     --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* \
-#     --local-dir ./infrastructure/foundation_model
+# environment variables
+export MODEL_BASE_DIR="./infrastructure/foundation_model"
+export MODEL_CPU_DIR="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"
+MODEL_DATA_FILENAME="phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx.data"
+MODEL_DATA_FILEPATH="$MODEL_BASE_DIR/$MODEL_CPU_DIR/$MODEL_DATA_FILENAME"
+
+echo "==================="
+echo "$MODEL_DATA_FILEPATH"
+echo "==================="
+
+# get foundation model dependencies from HuggingFace / Microsoft
+if [ ! -f "$MODEL_DATA_FILEPATH" ]; then
-f "$MODEL_DATA_FILEPATH" ]; then + echo "Downloading foundation model..." + huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx \ + --include "$MODEL_CPU_DIR/*" \ + --local-dir $MODEL_BASE_DIR +else + echo "Foundation model files already exist at: $MODEL_DATA_FILEPATH" +fi python -m src.text_generation.entrypoints.server diff --git a/src/text_generation/adapters/llm/llm.py b/src/text_generation/adapters/llm/llm.py index d68b53d73..d4bbb386e 100644 --- a/src/text_generation/adapters/llm/llm.py +++ b/src/text_generation/adapters/llm/llm.py @@ -3,18 +3,13 @@ RAG implementation with local Phi-3-mini-4k-instruct-onnx and embeddings """ import logging -import os import sys # LangChain imports -from langchain_huggingface import HuggingFacePipeline from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser from langchain_core.runnables import RunnablePassthrough - -# HuggingFace and ONNX imports -from optimum.onnxruntime import ORTModelForCausalLM -from transformers import AutoTokenizer, pipeline +from src.text_generation.adapters.llm.text_generation_model import TextGenerationFoundationModel class Phi3LanguageModel: @@ -29,40 +24,8 @@ class Phi3LanguageModel: def configure_model(self): - # Set up paths to the local model - base_dir = os.path.dirname(os.path.abspath(__file__)) - model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4") - self.logger.debug(f"Loading Phi-3 model from: {model_path}") - - # Load the tokenizer and model - tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path=model_path, - trust_remote_code=True, - local_files_only=True - ) - model = ORTModelForCausalLM.from_pretrained( - model_path, - provider="CPUExecutionProvider", - trust_remote_code=True, - local_files_only=True - ) - model.name_or_path = model_path - - # Create the text generation pipeline - pipe = pipeline( - "text-generation", - model=model, - tokenizer=tokenizer, - max_new_tokens=256, - temperature=0.7, - top_p=0.9, - repetition_penalty=1.1, - use_fast=True, - do_sample=True - ) - # Create the LangChain LLM - llm = HuggingFacePipeline(pipeline=pipe) + llm = TextGenerationFoundationModel().build() # Phi-3 specific prompt template template = """<|user|> @@ -91,10 +54,7 @@ class Phi3LanguageModel: def invoke(self, user_input: str) -> str: try: # Get response from the chain - self.logger.debug(f'===Prompt: {user_input}\n\n') response = self.chain.invoke(user_input) - # Print the answer - self.logger.debug(f'===Response: {response}\n\n') return response except Exception as e: self.logger.error(f"Failed: {e}") diff --git a/src/text_generation/adapters/llm/llm_rag.py b/src/text_generation/adapters/llm/llm_rag.py index 9188421ff..47d0a457c 100644 --- a/src/text_generation/adapters/llm/llm_rag.py +++ b/src/text_generation/adapters/llm/llm_rag.py @@ -2,57 +2,33 @@ RAG implementation with local Phi-3-mini-4k-instruct-onnx and embeddings """ -import os +import logging +import sys # LangChain imports -from langchain_huggingface import HuggingFacePipeline from langchain_huggingface import HuggingFaceEmbeddings from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.vectorstores import FAISS from langchain.chains import RetrievalQA from langchain.prompts import PromptTemplate from langchain.schema import Document - -# HuggingFace and ONNX imports -from optimum.onnxruntime import ORTModelForCausalLM -from transformers import AutoTokenizer, pipeline +from 
 
 
 class Phi3LanguageModelWithRag:
 
-    def invoke(self, user_input):
+    def __init__(self):
+        logger = logging.getLogger()
+        logger.setLevel(logging.DEBUG)
+        handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(handler)
+        self.logger = logger
+        self.configure_model()
 
-        # Set up paths to the local model
-        base_dir = os.path.dirname(os.path.abspath(__file__))
-        model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
-        print(f"Loading Phi-3 model from: {model_path}")
-
-        # Load the tokenizer and model
-        tokenizer = AutoTokenizer.from_pretrained(
-            pretrained_model_name_or_path=model_path,
-            trust_remote_code=True
-        )
-        model = ORTModelForCausalLM.from_pretrained(
-            model_id=model_path,
-            provider="CPUExecutionProvider",
-            trust_remote_code=True
-        )
-        model.name_or_path = model_path
-
-        # Create the text generation pipeline
-        pipe = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            max_new_tokens=512,
-            temperature=0.7,
-            top_p=0.9,
-            repetition_penalty=1.1,
-            do_sample=True
-        )
+    def configure_model(self):
 
         # Create the LangChain LLM
-        llm = HuggingFacePipeline(pipeline=pipe)
+        llm = TextGenerationFoundationModel().build()
 
         # Initialize the embedding model - using a small, efficient model
         # Options:
@@ -64,7 +40,6 @@ class Phi3LanguageModelWithRag:
             model_kwargs={"device": "cpu"},
             encode_kwargs={"normalize_embeddings": True}
         )
-        print("Embedding model loaded")
 
         # Sample documents about artificial intelligence
         docs = [
@@ -141,7 +116,7 @@ class Phi3LanguageModelWithRag:
         )
 
         # Create the retrieval QA chain
-        qa_chain = RetrievalQA.from_chain_type(
+        self.qa_chain = RetrievalQA.from_chain_type(
             llm=llm,
             chain_type="stuff",  # "stuff" method puts all retrieved docs into one prompt
             retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),  # Retrieve top 3 results
+            chain_type_kwargs={"prompt": prompt}  # Use our custom prompt
         )
 
+    def invoke(self, user_input: str) -> str:
+
         # Get response from the chain
-        response = qa_chain.invoke({"query": user_input})
-
-        # Print the answer
-        print(response["result"])
-
+        response = self.qa_chain.invoke({"query": user_input})
         return response["result"]
+
\ No newline at end of file
diff --git a/src/text_generation/adapters/llm/text_generation_model.py b/src/text_generation/adapters/llm/text_generation_model.py
new file mode 100644
index 000000000..12fce00e9
--- /dev/null
+++ b/src/text_generation/adapters/llm/text_generation_model.py
@@ -0,0 +1,69 @@
+"""
+Builds the local Phi-3-mini-4k-instruct-onnx text generation pipeline shared by the LLM adapters
+"""
+
+import logging
+import os
+import sys
+
+# LangChain imports
+from langchain_huggingface import HuggingFacePipeline
+
+# HuggingFace and ONNX imports
+from optimum.onnxruntime import ORTModelForCausalLM
+from transformers import AutoTokenizer, pipeline
+
+
+class TextGenerationFoundationModel:
+
+    def __init__(self):
+        logger = logging.getLogger()
+        logger.setLevel(logging.DEBUG)
+        handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(handler)
+        self.logger = logger
+
+    def build(self) -> HuggingFacePipeline:
+
+        # Set up paths to the local model
+        # base_dir = os.path.dirname(os.path.abspath(__file__))
+        # model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
+
+        model_base_dir = os.environ.get('MODEL_BASE_DIR')
+        model_cpu_dir = os.environ.get('MODEL_CPU_DIR')
+        model_path = os.path.join(model_base_dir, model_cpu_dir)
+
+        self.logger.debug(f'model_base_dir: {model_base_dir}')
+        self.logger.debug(f'model_cpu_dir: {model_cpu_dir}')
+        self.logger.debug(f"Loading Phi-3 model from: {model_path}")
+
+        # Load the tokenizer and model
+        tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path=model_path,
+            trust_remote_code=True,
+            local_files_only=True
+        )
+        model = ORTModelForCausalLM.from_pretrained(
+            model_path,
+            provider="CPUExecutionProvider",
+            trust_remote_code=True,
+            local_files_only=True
+        )
+        model.name_or_path = model_path
+
+        # Create the text generation pipeline
+        pipe = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=256,
+            temperature=0.7,
+            top_p=0.9,
+            repetition_penalty=1.1,
+            use_fast=True,
+            do_sample=True
+        )
+
+        # Create the LangChain LLM
+        return HuggingFacePipeline(pipeline=pipe)
+
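
For context, a minimal usage sketch of the refactored adapter, exercised outside `src.text_generation.entrypoints.server`. Assumptions: the ONNX files are already downloaded and `MODEL_BASE_DIR`/`MODEL_CPU_DIR` match what `run.sh` exports; the prompt format mirrors the `<|user|>`/`<|assistant|>` template in `llm.py`.

```python
# Sketch only: exercises TextGenerationFoundationModel directly.
import os

from src.text_generation.adapters.llm.text_generation_model import TextGenerationFoundationModel

# Point at the locally downloaded ONNX model (same values that run.sh exports).
os.environ.setdefault("MODEL_BASE_DIR", "./infrastructure/foundation_model")
os.environ.setdefault("MODEL_CPU_DIR", "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4")

# build() returns a LangChain HuggingFacePipeline, which exposes the Runnable interface.
llm = TextGenerationFoundationModel().build()

# Phi-3 chat-style prompt, mirroring the template in llm.py.
print(llm.invoke("<|user|>\nWhat does ONNX Runtime provide?<|end|>\n<|assistant|>"))
```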