start splitting into classes; use env vars for foundation model file dependencies

This commit is contained in:
Adam Wilson
2025-05-26 18:27:23 -06:00
parent 8bb4a473ca
commit 39c779058d
7 changed files with 236 additions and 91 deletions

2
.gitignore vendored
View File

@@ -175,5 +175,5 @@ cython_debug/
# HuggingFace / Microsoft LLM supporting files
# (these are downloaded for local development via bash script, or inside GH Action workflow context)
src/**/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/**
infrastructure/foundation_model/cpu_and_mobile/**
logs

19
infrastructure/README.md Normal file
View File

@@ -0,0 +1,19 @@
# Infrastructure
This directory is the download location for the foundation model (a pre-trained generative language model).
## Model Choice
The foundation model for this project had to satisfy multiple constraints:
1. __Repo storage limits:__ Even with Git LFS enabled, GitHub restricts repository size to 5GB (at least for the free tier).
1. __Build system storage limits:__ [Standard Linux runners](https://docs.github.com/en/actions/using-github-hosted-runners/using-github-hosted-runners/about-github-hosted-runners?ref=devtron.ai#standard-github-hosted-runners-for-public-repositories) in GitHub Actions have a 14GB SSD.
The CPU-optimized [`microsoft/Phi-3-mini-4k-instruct-onnx`](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) model fits within both of these storage limits.
## Provisioning the Foundation Model
The foundation model dependency is provisioned differently for local development than for the build system:
1. __Local:__ The model is downloaded once by the `./run.sh` shell script at the project root and excluded via `.gitignore`, since it is too large for GitHub's LFS limits.
1. __Build System:__ The model is downloaded on every workflow run with `huggingface-cli` (an equivalent download is sketched below).
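For illustration only, the same download can be expressed with the `huggingface_hub` Python API rather than the CLI. This is a hedged sketch: the repo id, include pattern, and target directory are taken from `run.sh`; the rest is an assumption, not the project's actual provisioning code.

```python
# Sketch: fetch only the CPU-optimized Phi-3 ONNX files via huggingface_hub.
from huggingface_hub import snapshot_download

# These paths mirror the MODEL_BASE_DIR / MODEL_CPU_DIR values exported in run.sh.
MODEL_BASE_DIR = "./infrastructure/foundation_model"
MODEL_CPU_DIR = "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"

# Download only the int4 CPU variant so the files fit within the runner's SSD budget.
snapshot_download(
    repo_id="microsoft/Phi-3-mini-4k-instruct-onnx",
    allow_patterns=f"{MODEL_CPU_DIR}/*",
    local_dir=MODEL_BASE_DIR,
)
```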

View File

@@ -0,0 +1,106 @@
aiohappyeyeballs==2.6.1
aiohttp==3.11.18
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.9.0
attrs==25.3.0
certifi==2025.4.26
charset-normalizer==3.4.2
coloredlogs==15.0.1
dataclasses-json==0.6.7
datasets==3.6.0
dill==0.3.8
faiss-cpu==1.11.0
filelock==3.18.0
flatbuffers==25.2.10
frozenlist==1.6.0
fsspec==2025.3.0
greenlet==3.2.2
h11==0.16.0
hf-xet==1.1.2
httpcore==1.0.9
httpx==0.28.1
httpx-sse==0.4.0
huggingface-hub==0.32.0
humanfriendly==10.0
idna==3.10
inquirerpy==0.3.4
Jinja2==3.1.6
joblib==1.5.1
jsonpatch==1.33
jsonpointer==3.0.0
langchain==0.3.25
langchain-community==0.3.24
langchain-core==0.3.61
langchain-huggingface==0.2.0
langchain-text-splitters==0.3.8
langsmith==0.3.42
MarkupSafe==3.0.2
marshmallow==3.26.1
mpmath==1.3.0
multidict==6.4.4
multiprocess==0.70.16
mypy_extensions==1.1.0
networkx==3.4.2
numpy==2.2.6
nvidia-cublas-cu12==12.6.4.1
nvidia-cuda-cupti-cu12==12.6.80
nvidia-cuda-nvrtc-cu12==12.6.77
nvidia-cuda-runtime-cu12==12.6.77
nvidia-cudnn-cu12==9.5.1.17
nvidia-cufft-cu12==11.3.0.4
nvidia-cufile-cu12==1.11.1.6
nvidia-curand-cu12==10.3.7.77
nvidia-cusolver-cu12==11.7.1.2
nvidia-cusparse-cu12==12.5.4.2
nvidia-cusparselt-cu12==0.6.3
nvidia-nccl-cu12==2.26.2
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.6.77
onnx==1.18.0
onnxruntime==1.22.0
optimum==1.25.3
orjson==3.10.18
packaging==24.2
pandas==2.2.3
pfzy==0.3.4
pillow==11.2.1
prompt_toolkit==3.0.51
propcache==0.3.1
protobuf==6.31.0
pyarrow==20.0.0
pydantic==2.11.5
pydantic-settings==2.9.1
pydantic_core==2.33.2
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
pytz==2025.2
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
requests-toolbelt==1.0.0
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.15.3
sentence-transformers==4.1.0
setuptools==80.8.0
six==1.17.0
sniffio==1.3.1
SQLAlchemy==2.0.41
sympy==1.14.0
tenacity==9.1.2
threadpoolctl==3.6.0
tokenizers==0.21.1
torch==2.7.0
tqdm==4.67.1
transformers==4.51.3
triton==3.3.0
typing-inspect==0.9.0
typing-inspection==0.4.1
typing_extensions==4.13.2
tzdata==2025.2
urllib3==2.4.0
wcwidth==0.2.13
xxhash==3.5.0
yarl==1.20.0
zstandard==0.23.0

27
run.sh
View File

@@ -7,12 +7,29 @@ source .env/bin/activate
# the ONNX model/data files require Git Large File Storage (LFS) support
git lfs install
# pip install huggingface-hub[cli]
# install Python dependencies
# pip install huggingface-hub[cli] langchain langchain_huggingface langchain_community optimum[onnxruntime] faiss-cpu
pip install -r ./requirements.txt
# # get foundation model dependencies from HuggingFace / Microsoft
# huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx \
# --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* \
# --local-dir ./infrastructure/foundation_model
# environment variables
export MODEL_BASE_DIR="./infrastructure/foundation_model"
export MODEL_CPU_DIR="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"
MODEL_DATA_FILENAME="phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx.data"
MODEL_DATA_FILEPATH="$MODEL_BASE_DIR/$MODEL_CPU_DIR/$MODEL_DATA_FILENAME"
echo "==================="
echo "$MODEL_DATA_FILEPATH"
echo "==================="
# get foundation model dependencies from HuggingFace / Microsoft
if [ ! -f "$MODEL_DATA_FILEPATH" ]; then
echo "Downloading foundation model..."
huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx \
--include "$MODEL_CPU_DIR/*" \
--local-dir "$MODEL_BASE_DIR"
else
echo "Foundation model files already exist at: $MODEL_DATA_FILEPATH"
fi
python -m src.text_generation.entrypoints.server

View File

@@ -3,18 +3,13 @@ RAG implementation with local Phi-3-mini-4k-instruct-onnx and embeddings
"""
import logging
import os
import sys
# LangChain imports
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# HuggingFace and ONNX imports
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer, pipeline
from src.text_generation.adapters.llm.text_generation_model import TextGenerationFoundationModel
class Phi3LanguageModel:
@@ -29,40 +24,8 @@ class Phi3LanguageModel:
def configure_model(self):
# Set up paths to the local model
base_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
self.logger.debug(f"Loading Phi-3 model from: {model_path}")
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=model_path,
trust_remote_code=True,
local_files_only=True
)
model = ORTModelForCausalLM.from_pretrained(
model_path,
provider="CPUExecutionProvider",
trust_remote_code=True,
local_files_only=True
)
model.name_or_path = model_path
# Create the text generation pipeline
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
max_new_tokens=256,
temperature=0.7,
top_p=0.9,
repetition_penalty=1.1,
use_fast=True,
do_sample=True
)
# Create the LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe)
llm = TextGenerationFoundationModel().build()
# Phi-3 specific prompt template
template = """<|user|>
@@ -91,10 +54,7 @@ class Phi3LanguageModel:
def invoke(self, user_input: str) -> str:
try:
# Get response from the chain
self.logger.debug(f'===Prompt: {user_input}\n\n')
response = self.chain.invoke(user_input)
# Print the answer
self.logger.debug(f'===Response: {response}\n\n')
return response
except Exception as e:
self.logger.error(f"Failed: {e}")

View File

@@ -2,57 +2,33 @@
RAG implementation with local Phi-3-mini-4k-instruct-onnx and embeddings
"""
import os
import logging
import sys
# LangChain imports
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document
# HuggingFace and ONNX imports
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer, pipeline
from src.text_generation.adapters.llm.text_generation_model import TextGenerationFoundationModel
class Phi3LanguageModelWithRag:
def invoke(self, user_input):
def __init__(self):
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
logger.addHandler(handler)
self.logger = logger
self.configure_model()
# Set up paths to the local model
base_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
print(f"Loading Phi-3 model from: {model_path}")
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=model_path,
trust_remote_code=True
)
model = ORTModelForCausalLM.from_pretrained(
model_id=model_path,
provider="CPUExecutionProvider",
trust_remote_code=True
)
model.name_or_path = model_path
# Create the text generation pipeline
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
max_new_tokens=512,
temperature=0.7,
top_p=0.9,
repetition_penalty=1.1,
do_sample=True
)
def configure_model(self):
# Create the LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe)
llm = TextGenerationFoundationModel().build()
# Initialize the embedding model - using a small, efficient model
# Options:
@@ -64,7 +40,6 @@ class Phi3LanguageModelWithRag:
model_kwargs={"device": "cpu"},
encode_kwargs={"normalize_embeddings": True}
)
print("Embedding model loaded")
# Sample documents about artificial intelligence
docs = [
@@ -141,7 +116,7 @@ class Phi3LanguageModelWithRag:
)
# Create the retrieval QA chain
qa_chain = RetrievalQA.from_chain_type(
self.qa_chain = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff", # "stuff" method puts all retrieved docs into one prompt
retriever=vectorstore.as_retriever(search_kwargs={"k": 3}), # Retrieve top 3 results
@@ -149,10 +124,9 @@ class Phi3LanguageModelWithRag:
chain_type_kwargs={"prompt": prompt} # Use our custom prompt
)
def invoke(self, user_input: str) -> str:
# Get response from the chain
response = qa_chain.invoke({"query": user_input})
# Print the answer
print(response["result"])
response = self.qa_chain.invoke({"query": user_input})
return response["result"]

View File

@@ -0,0 +1,69 @@
"""
Builder for the local Phi-3-mini-4k-instruct-onnx text-generation pipeline (foundation model)
"""
import logging
import os
import sys
# LangChain imports
from langchain_huggingface import HuggingFacePipeline
# HuggingFace and ONNX imports
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer, pipeline
class TextGenerationFoundationModel:
def __init__(self):
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
logger.addHandler(handler)
self.logger = logger
def build(self) -> HuggingFacePipeline:
# Set up paths to the local model
# base_dir = os.path.dirname(os.path.abspath(__file__))
# model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
model_base_dir = os.environ.get('MODEL_BASE_DIR')
model_cpu_dir = os.environ.get('MODEL_CPU_DIR')
model_path = os.path.join(model_base_dir, model_cpu_dir)
self.logger.debug(f'model_base_dir: {model_base_dir}')
self.logger.debug(f'model_cpu_dir: {model_cpu_dir}')
self.logger.debug(f"Loading Phi-3 model from: {model_path}")
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=model_path,
trust_remote_code=True,
local_files_only=True
)
model = ORTModelForCausalLM.from_pretrained(
model_path,
provider="CPUExecutionProvider",
trust_remote_code=True,
local_files_only=True
)
model.name_or_path = model_path
# Create the text generation pipeline
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
max_new_tokens=256,
temperature=0.7,
top_p=0.9,
repetition_penalty=1.1,
use_fast=True,
do_sample=True
)
# Create the LangChain LLM
return HuggingFacePipeline(pipeline=pipe)
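A minimal usage sketch of the builder, assuming the `MODEL_BASE_DIR` / `MODEL_CPU_DIR` environment variables are exported (as `run.sh` does) and the model files are already downloaded; the defaults and the prompt string below are illustrative assumptions, not project code.

```python
import os

# Defaults mirror run.sh; normally run.sh exports these before starting the server.
os.environ.setdefault("MODEL_BASE_DIR", "./infrastructure/foundation_model")
os.environ.setdefault("MODEL_CPU_DIR", "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4")

from src.text_generation.adapters.llm.text_generation_model import TextGenerationFoundationModel

# build() returns a LangChain HuggingFacePipeline, so the result can be invoked
# directly or composed into a chain (as Phi3LanguageModel and Phi3LanguageModelWithRag do).
llm = TextGenerationFoundationModel().build()
print(llm.invoke("<|user|>\nWhat is ONNX Runtime?<|end|>\n<|assistant|>"))
```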