mirror of https://github.com/lightbroker/llmsecops-research.git
synced 2026-02-12 14:42:48 +00:00
start splitting into classes; use env vars for foundation model file dependencies
2 .gitignore vendored
@@ -175,5 +175,5 @@ cython_debug/
# HuggingFace / Microsoft LLM supporting files
# (these are downloaded for local development via bash script, or inside GH Action workflow context)
src/**/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/**
infrastructure/foundation_model/cpu_and_mobile/**
logs
19 infrastructure/README.md Normal file
@@ -0,0 +1,19 @@
# Infrastructure

This directory exists to contain the foundation model (a pre-trained generative language model).

## Model Choice

The foundation model for this project had to work within multiple constraints:

1. __Repo storage limits:__ Even with Git LFS enabled, GitHub restricts repository size to 5GB (at least on the free tier).
1. __Build system storage limits:__ [Standard Linux runners](https://docs.github.com/en/actions/using-github-hosted-runners/using-github-hosted-runners/about-github-hosted-runners?ref=devtron.ai#standard-github-hosted-runners-for-public-repositories) in GitHub Actions have a 16GB SSD.

The CPU-optimized [`microsoft/Phi-3-mini-4k-instruct-onnx`](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) model fits within these storage constraints.

## Provisioning the Foundation Model

The foundation model dependency is provisioned differently for local development vs. the build system:

1. __Local:__ The model is downloaded once by the `./run.sh` shell script at the project root, and excluded via `.gitignore` since it exceeds GitHub's LFS limits.
1. __Build System:__ The model is downloaded on every workflow run with `huggingface-cli` (see the sketch below).
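As a rough illustration of this provisioning step, the same selective download can be expressed in Python with `huggingface_hub` (which `huggingface-cli` wraps). This is a sketch only: the repo id, include pattern, and paths are taken from `run.sh`, but using `snapshot_download` instead of the CLI is an assumption, not what the workflow actually runs.

```python
# Sketch only: mirrors the huggingface-cli invocation in run.sh via the
# huggingface_hub Python API. Paths and env var names come from run.sh;
# calling snapshot_download here (instead of the CLI) is an assumption.
import os
from huggingface_hub import snapshot_download

model_base_dir = os.environ.get("MODEL_BASE_DIR", "./infrastructure/foundation_model")
model_cpu_dir = os.environ.get("MODEL_CPU_DIR", "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4")
data_file = os.path.join(
    model_base_dir, model_cpu_dir,
    "phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx.data",
)

# Skip the download if the large .onnx.data file is already present (local dev);
# CI runners start clean, so they download on every workflow run.
if not os.path.isfile(data_file):
    snapshot_download(
        repo_id="microsoft/Phi-3-mini-4k-instruct-onnx",
        allow_patterns=[f"{model_cpu_dir}/*"],
        local_dir=model_base_dir,
    )
```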
106 requirements.txt
@@ -0,0 +1,106 @@
aiohappyeyeballs==2.6.1
aiohttp==3.11.18
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.9.0
attrs==25.3.0
certifi==2025.4.26
charset-normalizer==3.4.2
coloredlogs==15.0.1
dataclasses-json==0.6.7
datasets==3.6.0
dill==0.3.8
faiss-cpu==1.11.0
filelock==3.18.0
flatbuffers==25.2.10
frozenlist==1.6.0
fsspec==2025.3.0
greenlet==3.2.2
h11==0.16.0
hf-xet==1.1.2
httpcore==1.0.9
httpx==0.28.1
httpx-sse==0.4.0
huggingface-hub==0.32.0
humanfriendly==10.0
idna==3.10
inquirerpy==0.3.4
Jinja2==3.1.6
joblib==1.5.1
jsonpatch==1.33
jsonpointer==3.0.0
langchain==0.3.25
langchain-community==0.3.24
langchain-core==0.3.61
langchain-huggingface==0.2.0
langchain-text-splitters==0.3.8
langsmith==0.3.42
MarkupSafe==3.0.2
marshmallow==3.26.1
mpmath==1.3.0
multidict==6.4.4
multiprocess==0.70.16
mypy_extensions==1.1.0
networkx==3.4.2
numpy==2.2.6
nvidia-cublas-cu12==12.6.4.1
nvidia-cuda-cupti-cu12==12.6.80
nvidia-cuda-nvrtc-cu12==12.6.77
nvidia-cuda-runtime-cu12==12.6.77
nvidia-cudnn-cu12==9.5.1.17
nvidia-cufft-cu12==11.3.0.4
nvidia-cufile-cu12==1.11.1.6
nvidia-curand-cu12==10.3.7.77
nvidia-cusolver-cu12==11.7.1.2
nvidia-cusparse-cu12==12.5.4.2
nvidia-cusparselt-cu12==0.6.3
nvidia-nccl-cu12==2.26.2
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.6.77
onnx==1.18.0
onnxruntime==1.22.0
optimum==1.25.3
orjson==3.10.18
packaging==24.2
pandas==2.2.3
pfzy==0.3.4
pillow==11.2.1
prompt_toolkit==3.0.51
propcache==0.3.1
protobuf==6.31.0
pyarrow==20.0.0
pydantic==2.11.5
pydantic-settings==2.9.1
pydantic_core==2.33.2
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
pytz==2025.2
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
requests-toolbelt==1.0.0
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.15.3
sentence-transformers==4.1.0
setuptools==80.8.0
six==1.17.0
sniffio==1.3.1
SQLAlchemy==2.0.41
sympy==1.14.0
tenacity==9.1.2
threadpoolctl==3.6.0
tokenizers==0.21.1
torch==2.7.0
tqdm==4.67.1
transformers==4.51.3
triton==3.3.0
typing-inspect==0.9.0
typing-inspection==0.4.1
typing_extensions==4.13.2
tzdata==2025.2
urllib3==2.4.0
wcwidth==0.2.13
xxhash==3.5.0
yarl==1.20.0
zstandard==0.23.0
27 run.sh
@@ -7,12 +7,29 @@ source .env/bin/activate
# the ONNX model/data require git Large File System support
git lfs install

# pip install huggingface-hub[cli]
# install Python dependencies
# pip install huggingface-hub[cli] langchain langchain_huggingface langchain_community optimum[onnxruntime] faiss-cpu
pip install -r ./requirements.txt

# # get foundation model dependencies from HuggingFace / Microsoft
# huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx \
#     --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* \
#     --local-dir ./infrastructure/foundation_model

# environment variables
export MODEL_BASE_DIR="./infrastructure/foundation_model"
export MODEL_CPU_DIR="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"
MODEL_DATA_FILENAME="phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx.data"
MODEL_DATA_FILEPATH="$MODEL_BASE_DIR/$MODEL_CPU_DIR/$MODEL_DATA_FILENAME"

echo "==================="
echo "$MODEL_DATA_FILEPATH"
echo "==================="

# get foundation model dependencies from HuggingFace / Microsoft
# (idempotent: skip the download if the .onnx.data file already exists)
if [ ! -f "$MODEL_DATA_FILEPATH" ]; then
    echo "Downloading foundation model..."
    huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx \
        --include "$MODEL_CPU_DIR/*" \
        --local-dir "$MODEL_BASE_DIR"
else
    echo "Foundation model files already exist at: $MODEL_DATA_FILEPATH"
fi

python -m src.text_generation.entrypoints.server
@@ -3,18 +3,13 @@ RAG implementation with local Phi-3-mini-4k-instruct-onnx and embeddings
"""

import logging
import os
import sys

# LangChain imports
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# HuggingFace and ONNX imports
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer, pipeline
from src.text_generation.adapters.llm.text_generation_model import TextGenerationFoundationModel


class Phi3LanguageModel:
@@ -29,40 +24,8 @@ class Phi3LanguageModel:

    def configure_model(self):

        # Set up paths to the local model
        base_dir = os.path.dirname(os.path.abspath(__file__))
        model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
        self.logger.debug(f"Loading Phi-3 model from: {model_path}")

        # Load the tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=model_path,
            trust_remote_code=True,
            local_files_only=True
        )
        model = ORTModelForCausalLM.from_pretrained(
            model_path,
            provider="CPUExecutionProvider",
            trust_remote_code=True,
            local_files_only=True
        )
        model.name_or_path = model_path

        # Create the text generation pipeline
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            use_fast=True,
            do_sample=True
        )

        # Create the LangChain LLM
        llm = HuggingFacePipeline(pipeline=pipe)
        llm = TextGenerationFoundationModel().build()

        # Phi-3 specific prompt template
        template = """<|user|>
@@ -91,10 +54,7 @@ class Phi3LanguageModel:

    def invoke(self, user_input: str) -> str:
        try:
            # Get response from the chain
            self.logger.debug(f'===Prompt: {user_input}\n\n')
            response = self.chain.invoke(user_input)
            # Print the answer
            self.logger.debug(f'===Response: {response}\n\n')
            return response
        except Exception as e:
            self.logger.error(f"Failed: {e}")
@@ -2,57 +2,33 @@
RAG implementation with local Phi-3-mini-4k-instruct-onnx and embeddings
"""

import os
import logging
import sys

# LangChain imports
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document

# HuggingFace and ONNX imports
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer, pipeline
from src.text_generation.adapters.llm.text_generation_model import TextGenerationFoundationModel


class Phi3LanguageModelWithRag:

    def invoke(self, user_input):
    def __init__(self):
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)
        handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(handler)
        self.logger = logger
        self.configure_model()

        # Set up paths to the local model
        base_dir = os.path.dirname(os.path.abspath(__file__))
        model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
        print(f"Loading Phi-3 model from: {model_path}")

        # Load the tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=model_path,
            trust_remote_code=True
        )
        model = ORTModelForCausalLM.from_pretrained(
            model_id=model_path,
            provider="CPUExecutionProvider",
            trust_remote_code=True
        )
        model.name_or_path = model_path

        # Create the text generation pipeline
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True
        )
    def configure_model(self):

        # Create the LangChain LLM
        llm = HuggingFacePipeline(pipeline=pipe)
        llm = TextGenerationFoundationModel().build()

        # Initialize the embedding model - using a small, efficient model
        # Options:
@@ -64,7 +40,6 @@ class Phi3LanguageModelWithRag:
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": True}
        )
        print("Embedding model loaded")

        # Sample documents about artificial intelligence
        docs = [
@@ -141,7 +116,7 @@ class Phi3LanguageModelWithRag:
        )

        # Create the retrieval QA chain
        qa_chain = RetrievalQA.from_chain_type(
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",  # "stuff" method puts all retrieved docs into one prompt
            retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),  # Retrieve top 3 results
@@ -149,10 +124,9 @@ class Phi3LanguageModelWithRag:
            chain_type_kwargs={"prompt": prompt}  # Use our custom prompt
        )

    def invoke(self, user_input: str) -> str:

        # Get response from the chain
        response = qa_chain.invoke({"query": user_input})

        # Print the answer
        print(response["result"])

        response = self.qa_chain.invoke({"query": user_input})
        return response["result"]
69 src/text_generation/adapters/llm/text_generation_model.py Normal file
@@ -0,0 +1,69 @@
"""
RAG implementation with local Phi-3-mini-4k-instruct-onnx and embeddings
"""

import logging
import os
import sys

# LangChain imports
from langchain_huggingface import HuggingFacePipeline

# HuggingFace and ONNX imports
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer, pipeline


class TextGenerationFoundationModel:

    def __init__(self):
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)
        handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(handler)
        self.logger = logger

    def build(self) -> HuggingFacePipeline:

        # Set up paths to the local model
        # base_dir = os.path.dirname(os.path.abspath(__file__))
        # model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")

        model_base_dir = os.environ.get('MODEL_BASE_DIR')
        model_cpu_dir = os.environ.get('MODEL_CPU_DIR')
        model_path = os.path.join(model_base_dir, model_cpu_dir)

        self.logger.debug(f'model_base_dir: {model_base_dir}')
        self.logger.debug(f'model_cpu_dir: {model_cpu_dir}')
        self.logger.debug(f"Loading Phi-3 model from: {model_path}")

        # Load the tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=model_path,
            trust_remote_code=True,
            local_files_only=True
        )
        model = ORTModelForCausalLM.from_pretrained(
            model_path,
            provider="CPUExecutionProvider",
            trust_remote_code=True,
            local_files_only=True
        )
        model.name_or_path = model_path

        # Create the text generation pipeline
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            use_fast=True,
            do_sample=True
        )

        # Create the LangChain LLM
        return HuggingFacePipeline(pipeline=pipe)
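For context, a minimal usage sketch of the new class is shown below. It is not part of the commit: it assumes the `MODEL_BASE_DIR` and `MODEL_CPU_DIR` environment variables exported by `run.sh` (or equivalent CI steps) are set and the ONNX files are already downloaded, and the prompt string is purely illustrative.

```python
# Hypothetical usage sketch, not part of the commit. Assumes MODEL_BASE_DIR and
# MODEL_CPU_DIR are exported (as run.sh does) and the model files exist locally.
import os

os.environ.setdefault("MODEL_BASE_DIR", "./infrastructure/foundation_model")
os.environ.setdefault("MODEL_CPU_DIR", "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4")

from src.text_generation.adapters.llm.text_generation_model import TextGenerationFoundationModel

llm = TextGenerationFoundationModel().build()   # returns a HuggingFacePipeline
print(llm.invoke("What is ONNX Runtime?"))      # LangChain runnable interface
```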