Merge pull request #8 from lightbroker/langchain-embedding

API iterations
2026-07-02 03:15:33 +02:00 · 2025-05-20 07:16:24 -06:00
parent a526b03266 d1c87d4cdf
commit bab9709e81
30 changed files with 573 additions and 72 deletions
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+cd $GITHUB_WORKSPACE
+
+echo "Cleaning up processes..."
+
+# Kill the monitoring process if it exists
+if [ -f "$MONITOR_PID_FILE" ]; then
+  MONITOR_PID=$(cat $MONITOR_PID_FILE)
+  echo "Stopping monitoring process with PID: $MONITOR_PID"
+  kill $MONITOR_PID 2>/dev/null || echo "Monitor process already stopped"
+  rm $MONITOR_PID_FILE
+fi
+
+# Kill the API process if it exists
+if [ -f "$API_PID_FILE" ]; then
+  API_PID=$(cat $API_PID_FILE)
+  echo "Stopping API process with PID: $API_PID"
+  kill $API_PID 2>/dev/null || echo "API process already stopped"
+  rm $API_PID_FILE
+fi
+
+echo "Cleanup complete"
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -e  # Exit on error
+
+cd $GITHUB_WORKSPACE
+
+echo "Waiting for API to be ready..."
+max_attempts=10
+attempt=1
+
+while [ $attempt -le $max_attempts ]; do
+  echo "Health check attempt $attempt of $max_attempts..."
+  if curl -s -f -i http://localhost:9999/ > logs/health_check_$attempt.log 2>&1; then
+    echo "Health check succeeded"
+    break
+  else
+    echo "Health check failed, waiting 5 seconds..."
+    sleep 5
+    attempt=$((attempt+1))
+  fi
+done
+
+if [ $attempt -gt $max_attempts ]; then
+  echo "API failed to start after $max_attempts attempts"
+  cat logs/api.log
+  exit 1
+fi
@@ -0,0 +1,134 @@
+#!/bin/bash
+# Don't use set -e here as we want to capture and handle errors ourselves
+
+cd $GITHUB_WORKSPACE
+
+# Make sure garak report directory exists
+GARAK_REPORTS_DIR="/home/runner/.local/share/garak/garak_runs"
+mkdir -p $GARAK_REPORTS_DIR
+mkdir -p logs/garak_reports
+
+# Log system resource information before starting garak
+echo "System resources before starting garak:" > logs/system_before_garak.log
+free -h >> logs/system_before_garak.log
+df -h >> logs/system_before_garak.log
+ulimit -a >> logs/system_before_garak.log
+
+# Generate a time-stamped log file for garak
+GARAK_LOG_FILE="logs/garak_$(date +%Y%m%d_%H%M%S).log"
+echo "GARAK_LOG_FILE=$GARAK_LOG_FILE" >> $GITHUB_ENV
+echo "Running garak vulnerability scan with output to $GARAK_LOG_FILE..."
+
+# Start garak with enhanced error capture and reduced resource usage
+{
+  set -x  # Enable debug mode to print commands
+  
+  # Run with trap to capture signals
+  (
+    trap 'echo "Received termination signal at $(date)" >> $GARAK_LOG_FILE' TERM INT
+    
+    # Run garak with lower parallel attempts to reduce resource usage
+    # and with a timeout to prevent hanging
+    timeout --preserve-status 40m garak -v \
+      --config $WORKSPACE/src/tools/garak.config.yml \
+      --generator_option_file $WORKSPACE/src/tools/garak.rest.llm.json \
+      --model_type=rest \
+      --parallel_attempts 8
+    
+    echo "Garak completed with exit code $?" >> $GARAK_LOG_FILE
+  )
+  
+  set +x  # Disable debug mode
+} > $GARAK_LOG_FILE 2>&1
+
+GARAK_EXIT_CODE=$?
+echo "Garak exit code: $GARAK_EXIT_CODE"
+
+# Log system resource information after garak completes
+echo "System resources after garak:" > logs/system_after_garak.log
+free -h >> logs/system_after_garak.log
+df -h >> logs/system_after_garak.log
+
+# Copy any garak reports to our logs directory for easier access
+echo "Copying garak reports to logs directory..."
+cp -r $GARAK_REPORTS_DIR/* logs/garak_reports/ || echo "No garak reports found to copy"
+
+# List what reports were generated
+echo "Garak reports found:"
+find logs/garak_reports -type f | sort || echo "No garak reports found"
+
+# Capture and report logs regardless of success/failure
+echo "Last 200 lines of garak log:"
+cat $GARAK_LOG_FILE | tail -n 200
+
+# Check for specific error patterns
+echo "Checking for known error patterns..."
+{
+  if grep -q "operation was canceled" $GARAK_LOG_FILE; then
+    echo "FOUND 'operation was canceled' error in logs:"
+    grep -A 10 -B 10 "operation was canceled" $GARAK_LOG_FILE
+  fi
+
+  if grep -q "memory" $GARAK_LOG_FILE; then
+    echo "FOUND memory-related messages in logs:"
+    grep -A 10 -B 10 "memory" $GARAK_LOG_FILE
+  fi
+  
+  if grep -q "timeout" $GARAK_LOG_FILE; then
+    echo "FOUND timeout-related messages in logs:"
+    grep -A 10 -B 10 "timeout" $GARAK_LOG_FILE
+  fi
+  
+  if grep -q "SIGTERM\|signal\|terminated" $GARAK_LOG_FILE; then
+    echo "FOUND termination signals in logs:"
+    grep -A 10 -B 10 -E "SIGTERM|signal|terminated" $GARAK_LOG_FILE
+  fi
+} >> logs/error_analysis.log
+
+# Save the exit code analysis
+echo "Exit code analysis:" > logs/exit_code_analysis.log
+{
+  echo "Garak exit code: $GARAK_EXIT_CODE"
+  case $GARAK_EXIT_CODE in
+    0)
+      echo "Success - completed normally"
+      ;;
+    124)
+      echo "Error - timed out after 40 minutes"
+      ;;
+    130)
+      echo "Error - terminated by SIGINT (Ctrl+C)"
+      ;;
+    137)
+      echo "Error - killed by SIGKILL (likely out of memory)"
+      ;;
+    143)
+      echo "Error - terminated by SIGTERM (possibly by runner timeout or job cancellation)"
+      ;;
+    *)
+      echo "Error - unknown exit code"
+      ;;
+  esac
+} >> logs/exit_code_analysis.log
+
+cat logs/exit_code_analysis.log
+
+# Return proper exit code based on analysis
+if [ $GARAK_EXIT_CODE -eq 143 ]; then
+  echo "Process was terminated by SIGTERM. This may be due to:"
+  echo "1. GitHub Actions workflow timeout"
+  echo "2. Out of memory condition"
+  echo "3. Manual cancellation of the workflow"
+  echo "Treating as a workflow issue rather than a test failure"
+  # We return 0 to avoid failing the workflow on infrastructure issues
+  # You can change this to exit 1 if you prefer the workflow to fail
+  exit 0
+elif [ $GARAK_EXIT_CODE -eq 124 ]; then
+  echo "Garak timed out after 40 minutes"
+  exit 0  # Treat timeout as acceptable
+elif [ $GARAK_EXIT_CODE -ne 0 ]; then
+  echo "Garak failed with exit code $GARAK_EXIT_CODE"
+  exit 1  # Only fail for actual test failures
+else
+  exit 0
+fi
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Launch the API server in the background and record its PID in api_pid.txt.
+set -e  # Exit on error
+
+cd $GITHUB_WORKSPACE
+
+echo "Starting API server with logging..."
+
+# Background the server under nohup; stdout and stderr both go to logs/api.log
+nohup python -m src.api.server > logs/api.log 2>&1 &
+server_pid=$!
+echo "API server started with PID: $server_pid"
+
+# Save PID to file so it can be accessed by other scripts
+printf '%s\n' "$server_pid" > api_pid.txt
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+echo "Starting system monitoring..."
+
+cd $GITHUB_WORKSPACE
+
+# Read API PID from file
+API_PID=$(cat api_pid.txt)
+echo "Monitoring API process with PID: $API_PID"
+
+# Save monitoring PID to file for later cleanup
+echo $$ > $MONITOR_PID_FILE
+
+while true; do
+  date >> logs/system_monitor.log
+  echo "Memory usage:" >> logs/system_monitor.log
+  free -m >> logs/system_monitor.log
+  echo "Process info:" >> logs/system_monitor.log
+  ps aux | grep -E 'python|garak' >> logs/system_monitor.log
+  echo "Network connections:" >> logs/system_monitor.log
+  netstat -tulpn | grep python >> logs/system_monitor.log 2>/dev/null || echo "No network connections found" >> logs/system_monitor.log
+  echo "API process status:" >> logs/system_monitor.log
+  if ps -p $API_PID > /dev/null; then
+    echo "API process is running" >> logs/system_monitor.log
+  else
+    echo "API process is NOT running!" >> logs/system_monitor.log
+  fi
+  echo "-------------------" >> logs/system_monitor.log
+  sleep 10
+done
@@ -0,0 +1,18 @@
+#!/bin/bash
+set -e  # Exit on error
+
+cd $GITHUB_WORKSPACE
+
+echo "Making API request..."
+curl -X POST -i http://localhost:9999/api/conversations \
+  -d '{ "prompt": "describe a random planet in our solar system in 10 words or less" }' \
+  -H "Content-Type: application/json" > logs/test_request.log 2>&1
+
+if [ $? -ne 0 ]; then
+  echo "Test API request failed"
+  cat logs/test_request.log
+  exit 1
+else
+  echo "Test API request succeeded"
+  cat logs/test_request.log
+fi
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+# This script helps troubleshoot the Exit Code 143 issue in GitHub Actions
+# by recording likely resource and timeout related facts in
+# logs/troubleshooting.log: the garak config files found under
+# $WORKSPACE/src/tools, runner CPU/memory/disk, the installed garak package,
+# and a list of suggestions.
+
+echo "Running troubleshooting for Exit Code 143 (SIGTERM)"
+
+# Create logs directory if it doesn't exist
+mkdir -p logs
+
+TOOLS_DIR="$WORKSPACE/src/tools"
+
+# Only stdout of the group goes to the log; commands whose error output should
+# be captured too carry their own 2>&1.
+{
+  # Check for existence of important files and directories
+  echo "## Checking file system status"
+  ls -la "$TOOLS_DIR/" 2>&1
+  echo ""
+
+  # Check garak configuration files
+  echo "## Checking garak configuration files"
+  if [ -f "$TOOLS_DIR/garak.config.yml" ]; then
+    echo "garak.config.yml exists"
+    # Show the config without comment lines and blank lines
+    grep -v "^#" "$TOOLS_DIR/garak.config.yml" | grep -v "^$"
+  else
+    echo "ERROR: garak.config.yml NOT FOUND"
+  fi
+  echo ""
+
+  if [ -f "$TOOLS_DIR/garak.rest.llm.json" ]; then
+    echo "garak.rest.llm.json exists"
+    cat "$TOOLS_DIR/garak.rest.llm.json"
+  else
+    echo "ERROR: garak.rest.llm.json NOT FOUND"
+  fi
+  echo ""
+
+  # Check GitHub Actions runner environment
+  echo "## GitHub Actions runner environment"
+  echo "CPU cores: $(nproc)"
+  echo "Memory:"
+  free -h
+  echo "Disk space:"
+  df -h
+  echo ""
+
+  # Check garak installation
+  echo "## Garak installation"
+  pip show garak
+  echo ""
+
+  # Test garak basic functionality
+  echo "## Testing garak basic functionality"
+  garak --version 2>&1
+
+  # Output troubleshooting suggestions
+  echo "## Troubleshooting suggestions for Exit Code 143"
+  echo "1. Resource limitations:"
+  echo "   - Reduce parallel_attempts from 8 to 4"
+  echo "   - Set MALLOC_ARENA_MAX=2 environment variable"
+  echo "   - Monitor memory usage more closely"
+  echo "2. Timeout issues:"
+  echo "   - Break the garak run into multiple smaller runs"
+  echo "   - Reduce the number of tests being run"
+  echo "3. Consider using a larger GitHub Actions runner"
+  echo "4. Investigate network issues between API and garak"
+} > logs/troubleshooting.log
+
+# No patch file is generated by this script, so none is announced here.
+echo "Troubleshooting complete. See logs/troubleshooting.log for details."
@@ -1,52 +1,135 @@
 name: 'LLM Prompt Testing (LLM, no RAG)'
-
 on:
  workflow_dispatch:
-
 jobs:
  build:
    runs-on: ubuntu-latest
-
+    timeout-minutes: 60  # Add overall job timeout
    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683    
-
-    - name: 'set up git LFS'
-      run: git lfs install
-
-    - name: 'set up Python'
-      uses: actions/setup-python@v3
-      with:
-        python-version: '3.12'
-
-    - name: 'set up Python dependencies'
-      run: |
-        pip install -r ${{ github.workspace }}/requirements.txt  
-
-    - name: 'set up Microsoft Phi-3 Mini 4k LLM from HuggingFace'
-      run: |
-        pip install huggingface-hub[cli]
-        huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* --local-dir ${{ github.workspace }}/tests/llm
-
-    - name: 'set up garak'
-      run: |
-        pip install garak
-
-    - name: 'run HTTP server and call REST API'
-      run: |
-        nohup python -m tests.api.server > server.log 2>&1 &
-        sleep 2
-        curl -X POST -i localhost:9999/api/conversations -d '{ "prompt": "describe a random planet in our solar system in 10 words or less" }' || true
-        echo
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+      
+      - name: 'set up git LFS'
+        run: git lfs install
        
-        garak -v \
-          --config ${{ github.workspace }}/tests/tools/garak.config.yml \
-          --generator_option_file ${{ github.workspace }}/tests/tools/garak.rest.llm.json \
-          --model_type=rest \
-          --parallel_attempts 32
+      - name: 'set up Python'
+        uses: actions/setup-python@v3
+        with:
+          python-version: '3.12'
+          
+      # Restore the pip cache BEFORE anything is installed, otherwise the
+      # installs below can never benefit from it.
+      - name: Cache pip dependencies
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
+          # restore-keys are matched as prefixes of previously saved keys, so
+          # the fallback has to be a prefix of the key above.
+          restore-keys: |
+            ${{ runner.os }}-pip-
+
+      # Single install step: requirements.txt was previously installed twice.
+      - name: 'set up Python dependencies'
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r ${{ github.workspace }}/requirements.txt
+          # Install diagnostic tools
+          pip install psutil
+          
+      - name: 'set up Microsoft Phi-3 Mini 4k LLM from HuggingFace'
+        run: |
+          # Quote the extras spec and the --include pattern so the shell hands
+          # them to pip / huggingface-cli literally instead of globbing them.
+          pip install "huggingface-hub[cli]"
+          huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*" --local-dir ${{ github.workspace }}/src/llm
+          
+      - name: 'set up garak'
+        run: |
+          pip install garak
+      
+      # Split into separate scripts for cleaner workflow
+      - name: 'Prepare test environment'
+        run: |
+          mkdir -p logs
+          chmod +x ${{ github.workspace }}/.github/scripts/*.sh
+          
+      - name: 'Start API server'
+        run: ${{ github.workspace }}/.github/scripts/start_api.sh
+        env:
+          WORKSPACE: ${{ github.workspace }}
+          
+      - name: 'Run health check'
+        run: ${{ github.workspace }}/.github/scripts/health_check.sh
        
-        cat server.log
-
-    - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
-      with:
-        name: 'garak_report'
-        path: /home/runner/.local/share/garak/garak_runs/garak.*.html
+      - name: 'Run test API request'
+        run: ${{ github.workspace }}/.github/scripts/test_api.sh
+        
+      - name: 'Start system monitoring'
+        run: ${{ github.workspace }}/.github/scripts/start_monitoring.sh &
+        env:
+          MONITOR_PID_FILE: ${{ github.workspace }}/monitor_pid.txt
+          
+      - name: 'Run garak vulnerability scan'
+        continue-on-error: true  # Allow job to continue even if this step fails
+        timeout-minutes: 45  # Add step timeout
+        run: ${{ github.workspace }}/.github/scripts/run_garak.sh
+        env:
+          # GITHUB_ENV is deliberately not listed: the runner already provides
+          # it to every step, and a YAML value of "$GITHUB_ENV" would be the
+          # literal string rather than the path of the environment file.
+          WORKSPACE: ${{ github.workspace }}
+          
+      # Add error analysis step
+      - name: 'Analyze errors and create report'
+        if: always()  # Run this step even if previous steps failed
+        run: |
+          # Print "#### <title>" followed by the file inside a code fence;
+          # prints nothing when the file does not exist.
+          fenced_section() {
+            if [ -f "$2" ]; then
+              echo "#### $1"
+              echo '```'
+              cat "$2"
+              echo '```'
+              echo ""
+            fi
+          }
+
+          # The whole summary is written through one truncating redirect
+          {
+            echo "### Garak Execution Summary"
+            echo ""
+
+            fenced_section "Exit Code Analysis" "logs/exit_code_analysis.log"
+            fenced_section "Error Patterns Found" "logs/error_analysis.log"
+
+            # Both resource snapshots share a single code fence
+            echo "#### System Resources"
+            echo '```'
+            if [ -f "logs/system_before_garak.log" ]; then
+              echo "BEFORE GARAK:"
+              cat logs/system_before_garak.log
+              echo ""
+            fi
+
+            if [ -f "logs/system_after_garak.log" ]; then
+              echo "AFTER GARAK:"
+              cat logs/system_after_garak.log
+            fi
+            echo '```'
+          } > $GITHUB_STEP_SUMMARY
+          
+      # Runs cleanup.sh, which kills the processes recorded in the two PID
+      # files passed through the environment and then removes those files.
+      - name: 'Stop monitoring and API processes'
+        if: always()  # Run this step even if previous steps failed
+        run: ${{ github.workspace }}/.github/scripts/cleanup.sh
+        env:
+          MONITOR_PID_FILE: ${{ github.workspace }}/monitor_pid.txt
+          API_PID_FILE: ${{ github.workspace }}/api_pid.txt
+          
+      - name: Upload logs
+        if: always()  # Upload logs even if previous steps failed
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
+        with:
+          name: 'execution_logs'
+          path: logs/
+          
+      # Publishes garak's own run directory together with logs/garak_reports/
+      # as one artifact named 'garak_report'.
+      - name: Upload garak report
+        if: always()  # Upload report even if previous steps failed
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
+        with:
+          name: 'garak_report'
+          path: |
+            /home/runner/.local/share/garak/garak_runs/
+            logs/garak_reports/
@@ -33,9 +33,9 @@ jobs:

    - name: 'run HTTP server and call REST API'
      run: |
-        nohup python -m tests.api.server > server.log 2>&1 &
+        python -m tests.api.server
        sleep 2
-        curl -X POST -i localhost:9999/api/conversations -d '{ "prompt": "describe a random planet in our solar system in 10 words or less" }' || true
+        curl -X POST -i localhost:9999/api/conversations -d '{ "prompt": "describe a random planet in our solar system in 10 words or less" }' || exit 1
        echo
        
        garak -v \
@@ -44,8 +44,6 @@ jobs:
          --model_type=rest \
          --parallel_attempts 32
        
-        cat server.log
-
    - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
      with:
        name: 'garak_report'
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# Start the API (python -m src.api.controller) from the project root, wait
+# until it answers on http://localhost:9998/ and send one sample conversation
+# request. Exits 1 when the server never becomes ready or the request fails.
+
+# Get the directory of the script
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Navigate to the project root (2 levels up from the script's directory)
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+# Move to the project root
+cd "$PROJECT_ROOT" || exit 1
+
+# Start Flask server in the background
+python -m src.api.controller &
+SERVER_PID=$!
+
+# Poll the root endpoint once per second until the server answers.
+# Returns 0 when the server is up, 1 when it died or never became ready
+# (in which case the server process is killed).
+wait_for_server() {
+    echo "Waiting for Flask server to start..."
+    local max_attempts=100
+    local attempt=0
+
+    while [ $attempt -lt $max_attempts ]; do
+        if curl -s http://localhost:9998/ > /dev/null 2>&1; then
+            echo "Server is up!"
+            return 0
+        fi
+
+        # No point polling for the remaining attempts if the process is gone
+        if ! kill -0 "$SERVER_PID" 2>/dev/null; then
+            echo "Server process exited before becoming ready"
+            return 1
+        fi
+
+        attempt=$((attempt + 1))
+        echo "Attempt $attempt/$max_attempts - Server not ready yet, waiting..."
+        sleep 1
+    done
+
+    echo "Server failed to start after $max_attempts attempts"
+    kill "$SERVER_PID" 2>/dev/null
+    return 1
+}
+
+# Wait for server to be ready
+wait_for_server || exit 1
+
+# Make the actual request once server is ready
+echo "Making API request..."
+if ! curl -X POST -i http://localhost:9998/api/conversations \
+    -d '{ "prompt": "describe a random planet in our solar system in 10 words or less" }' \
+    -H "Content-Type: application/json"; then
+    # Same policy as a failed start-up: don't leave the server behind
+    kill "$SERVER_PID" 2>/dev/null
+    exit 1
+fi
+echo
+
+# NOTE(review): on success the server is left running - presumably for later
+# steps; confirm that this is intended.
+exit 0
@@ -175,13 +175,13 @@ cython_debug/

 # HuggingFace / Microsoft LLM supporting files
 # (these are downloaded for local development via bash script, or inside GH Action workflow context)
-tests/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/added_tokens.json
-tests/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/config.json
-tests/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/configuration_phi3.py
-tests/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/genai_config.json
-tests/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx
-tests/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx.data
-tests/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/special_tokens_map.json
-tests/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/tokenizer_config.json
-tests/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/tokenizer.json
-tests/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/tokenizer.model
+src/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/added_tokens.json
+src/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/config.json
+src/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/configuration_phi3.py
+src/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/genai_config.json
+src/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx
+src/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx.data
+src/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/special_tokens_map.json
+src/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/tokenizer_config.json
+src/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/tokenizer.json
+src/llm/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/tokenizer.model
@@ -8,6 +8,7 @@ attrs==25.3.0
 avidtools==0.1.2
 backoff==2.2.1
 base2048==0.1.3
+blinker==1.9.0
 boto3==1.38.2
 botocore==1.38.2
 cachetools==5.5.2
@@ -31,6 +32,7 @@ faiss-cpu==1.11.0
 fastapi==0.115.12
 fastavro==1.10.0
 filelock==3.18.0
+Flask==3.1.1
 flatbuffers==25.2.10
 frozenlist==1.6.0
 fschat==0.2.36
@@ -53,6 +55,7 @@ humanfriendly==10.0
 idna==3.10
 importlib-metadata==6.11.0
 inquirerpy==0.3.4
+itsdangerous==2.2.0
 Jinja2==3.1.6
 jiter==0.9.0
 jmespath==1.0.1
@@ -182,8 +185,10 @@ tzdata==2025.2
 uritemplate==4.1.1
 urllib3==2.3.0
 uvicorn==0.34.2
+waitress==3.0.2
 wavedrom==2.0.3.post3
 wcwidth==0.2.13
+Werkzeug==3.1.3
 wn==0.9.5
 xdg-base-dirs==6.0.2
 xxhash==3.5.0
@@ -0,0 +1,26 @@
+import logging
+from flask import Flask, jsonify, request
+from waitress import serve
+from src.llm.llm import Phi3LanguageModel
+from src.llm.llm_rag import Phi3LanguageModelWithRag
+
+app = Flask(__name__)
+
+@app.route('/', methods=['GET'])
+def health_check():
+    return f"Server is running\n", 200
+
+@app.route('/api/conversations', methods=['POST'])
+def get_llm_response():
+    prompt = request.json['prompt']
+    service = Phi3LanguageModel()
+    response = service.invoke(user_input=prompt)
+    return jsonify({'response': response}), 201
+
+if __name__ == '__main__':
+    logger = logging.Logger(name='Flask API', level=logging.DEBUG)
+    print('test')
+    logger.debug('running...')
+
+    # TODO set up port # as env var
+    serve(app, host='0.0.0.0', port=9999)
@@ -1,8 +1,8 @@
 import json
 import traceback

-from tests.llm.llm import Phi3LanguageModel
-from tests.llm.llm_rag import Phi3LanguageModelWithRag
+from src.llm.llm import Phi3LanguageModel
+from src.llm.llm_rag import Phi3LanguageModelWithRag

 class ApiController:
    def __init__(self):
@@ -1,6 +1,7 @@
 import json
+import logging

-from tests.api.controller import ApiController
+from src.api.controller import ApiController
 from wsgiref.simple_server import make_server


@@ -13,11 +14,14 @@ class RestApiServer:
        yield [json.dumps({'received': 'data'}).encode('utf-8')]

    def listen(self):
-        port = 9999
-        controller = ApiController()
-        with make_server('', port, controller) as wsgi_srv:
-            print(f'listening on port {port}...')
-            wsgi_srv.serve_forever()
+        try:
+            port = 9999
+            controller = ApiController()
+            with make_server('', port, controller) as wsgi_srv:
+                print(f'listening on port {port}...')
+                wsgi_srv.serve_forever()
+        except Exception as e:
+            logging.warning(e)


 if __name__ == '__main__':
@@ -2,7 +2,9 @@
 RAG implementation with local Phi-3-mini-4k-instruct-onnx and embeddings
 """

+import logging
 import os
+import sys
 from typing import List

 # LangChain imports
@@ -26,27 +28,36 @@ from transformers import AutoTokenizer, pipeline

 class Phi3LanguageModel:

+    def __init__(self):
+        logger = logging.getLogger()
+        logger.setLevel(logging.DEBUG)
+        handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(handler)
+        self.logger = logger
+
    def extract_assistant_response(self, text):
        if "<|assistant|>" in text:
            return text.split("<|assistant|>")[-1].strip()
        return text


-    def invoke(self, user_input):
+    def invoke(self, user_input: str) -> str:
        # Set up paths to the local model
        base_dir = os.path.dirname(os.path.abspath(__file__))
        model_path = os.path.join(base_dir, "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4")
-        print(f"Loading Phi-3 model from: {model_path}")
+        self.logger.debug(f"Loading Phi-3 model from: {model_path}")

        # Load the tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=model_path,
-            trust_remote_code=True
+            trust_remote_code=True,
+            local_files_only=True
        )
        model = ORTModelForCausalLM.from_pretrained(
-            model_id=model_path,
+            model_path,  # Change model_id to just model_path
            provider="CPUExecutionProvider",
-            trust_remote_code=True
+            trust_remote_code=True,
+            local_files_only=True
        )
        model.name_or_path = model_path

@@ -85,10 +96,12 @@ class Phi3LanguageModel:
        
        try:
            # Get response from the chain
+            self.logger.debug(f'===Prompt: {user_input}\n\n')
            response = chain.invoke(user_input)
            # Print the answer
-            print(response)
+            self.logger.debug(f'===Response: {response}\n\n')
            return response
        except Exception as e:
-            print(f"Failed: {e}")
+            self.logger.error(f"Failed: {e}")
+            return e