From eecd577eba60948e0d4fb9163c0fbf72d5fe2e75 Mon Sep 17 00:00:00 2001 From: Adam Wilson Date: Tue, 22 Apr 2025 20:57:46 -0600 Subject: [PATCH 1/8] workflow: local API for ONNX interactions --- .github/workflows/llmsecops-cicd.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/llmsecops-cicd.yml b/.github/workflows/llmsecops-cicd.yml index b0492a415..ea9c45717 100644 --- a/.github/workflows/llmsecops-cicd.yml +++ b/.github/workflows/llmsecops-cicd.yml @@ -22,13 +22,16 @@ jobs: with: python-version: '3.12' - - name: Download Huggingface CLI + - name: Set up HuggingFace LLM run: | pip install huggingface-hub[cli] huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* --local-dir . pip install onnxruntime-genai - curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py - python phi3-qa.py -m cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4 -e cpu -v + # curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py + python phi3_language_model.py -v \ + --prompt 'Describe the principle of existence, from the first principles of philosophy.' \ + -m cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4 \ + -e cpu - name: Run REST API server run: | @@ -36,4 +39,4 @@ jobs: - name: Test API call run: | - curl -i localhost:9999/hello + curl -i localhost:9999 From 66371bf208daad95543349dbc990b867443c5e27 Mon Sep 17 00:00:00 2001 From: Adam Wilson Date: Tue, 22 Apr 2025 20:58:16 -0600 Subject: [PATCH 2/8] support POST --- tests/api/PathDispatcher.py | 41 +++++++++++++++++++++++++++++-------- tests/api/server.py | 18 ++++++++-------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/tests/api/PathDispatcher.py b/tests/api/PathDispatcher.py index 0649b8201..7e534677f 100644 --- a/tests/api/PathDispatcher.py +++ b/tests/api/PathDispatcher.py @@ -1,22 +1,45 @@ import cgi +import json class PathDispatcher: def __init__(self): self.routes = {} + self.response_headers = [('Content-Type', 'application/json')] + + + def __http_415_notsupported(self, env, start_response): + start_response('415 Unsupported Media Type', self.response_headers) + return [json.dumps({'error': 'Unsupported Content-Type'}).encode('utf-8')] + + + def __http_200_ok(self, env, start_response): + try: + request_body_size = int(env.get('CONTENT_LENGTH', 0)) + except (ValueError): + request_body_size = 0 + + request_body = env['wsgi.input'].read(request_body_size) + data = json.loads(request_body.decode('utf-8')) + start_response('200 OK', self.response_headers) + return [json.dumps({'received': data}).encode('utf-8')] - def notfound_404(self, env, start_response): - start_response('404 Not Found', [ ('Content-Type', 'text/plain') ]) - return [b'Not Found'] def __call__(self, env, start_response): + method = env.get('REQUEST_METHOD').upper() path = env.get('PATH_INFO') - params = cgi.FieldStorage(env.get('wsgi.output'), environ=env) - method = env.get('REQUEST_METHOD').lower() - env['params'] = { key: params.getvalue(key) for key in params } - handler = self.routes.get((method,path), self.notfound_404) - return handler(env, start_response) - + + if not method == 'POST': + self.__http_415_notsupported(env, start_response) + + try: + handler = self.routes.get((method,path), self.__http_200_ok) + return handler(env, start_response) + except json.JSONDecodeError: + start_response('400 Bad Request', self.response_headers) + return [json.dumps({'error': 'Invalid JSON'}).encode('utf-8')] + + def register(self, method, path, function): self.routes[method.lower(), path] = function return function diff --git a/tests/api/server.py b/tests/api/server.py index 9abc4565d..7a8d0c620 100644 --- a/tests/api/server.py +++ b/tests/api/server.py @@ -1,3 +1,5 @@ +import json + from PathDispatcher import PathDispatcher from wsgiref.simple_server import make_server @@ -6,19 +8,19 @@ class RestApiServer: def __init__(self): pass - def response_function(self, environ, start_response): - start_response('200 OK', [('Content-Type','text/html')]) - yield str(f'testing...\n').encode('utf-8') + def post_response(self, env, start_response): + start_response('200 OK', [('Content-Type', 'application/json')]) + yield [json.dumps({'received': 'data'}).encode('utf-8')] def listen(self): port = 9999 dispatcher = PathDispatcher() - dispatcher.register('GET', '/hello', self.response_function) - wsgi_srv = make_server('', port, dispatcher) - print(f'listening on port {port}...') - wsgi_srv.serve_forever() + dispatcher.register('POST', '/', self.post_response) + with make_server('', port, dispatcher) as wsgi_srv: + print(f'listening on port {port}...') + wsgi_srv.serve_forever() if __name__ == '__main__': srv = RestApiServer() - srv.listen() + srv.listen() \ No newline at end of file From e04235606d557e5394240fefe395cc2140bc6940 Mon Sep 17 00:00:00 2001 From: Adam Wilson Date: Tue, 22 Apr 2025 20:59:02 -0600 Subject: [PATCH 3/8] refactor example Python class to accept a single prompt and exit --- tests/llm/phi3_interface.py | 1 - tests/llm/phi3_language_model.py | 57 ++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) delete mode 100644 tests/llm/phi3_interface.py create mode 100644 tests/llm/phi3_language_model.py diff --git a/tests/llm/phi3_interface.py b/tests/llm/phi3_interface.py deleted file mode 100644 index ec8805df0..000000000 --- a/tests/llm/phi3_interface.py +++ /dev/null @@ -1 +0,0 @@ -# TODO: business logic for REST API interaction w/ LLM via prompt input \ No newline at end of file diff --git a/tests/llm/phi3_language_model.py b/tests/llm/phi3_language_model.py new file mode 100644 index 000000000..542a396c6 --- /dev/null +++ b/tests/llm/phi3_language_model.py @@ -0,0 +1,57 @@ +# TODO: business logic for REST API interaction w/ LLM via prompt input + +import onnxruntime_genai as og +import argparse + + +class Phi3LanguageModel: + + def __init__(self, model_path): + # configure the ONNX runtime + config = og.Config(model_path) + config.clear_providers() + self.model = og.Model(config) + self.tokenizer = og.Tokenizer(model) + self.tokenizer_stream = self.tokenizer.create_stream() + + + def get_response(self, args): + + search_options = { 'max_length': 2048 } + params = og.GeneratorParams(model) + params.set_search_options(**search_options) + generator = og.Generator(model, params) + + # process prompt input and generate tokens + prompt_input = args.prompt + chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>' + prompt = f'{chat_template.format(input=prompt_input)}' + input_tokens = self.tokenizer.encode(prompt) + generator.append_tokens(input_tokens) + + print("Output: ", end='', flush=True) + + try: + while not generator.is_done(): + generator.generate_next_token() + new_token = generator.get_next_tokens()[0] + print(self.tokenizer_stream.decode(new_token), end='', flush=True) + except Exception as e: + print(f'{e}') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") + parser.add_argument('-m', '--model_path', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') + parser.add_argument('-p', '--prompt', type=str, required=True, help='Prompt input') + parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') + parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt') + parser.add_argument('-ds', '--do_sample', action='store_true', default=False, help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false') + parser.add_argument('--top_p', type=float, help='Top p probability to sample with') + parser.add_argument('--top_k', type=int, help='Top k tokens to sample from') + parser.add_argument('--temperature', type=float, help='Temperature to sample with') + parser.add_argument('--repetition_penalty', type=float, help='Repetition penalty to sample with') + args = parser.parse_args() + + model = Phi3LanguageModel(args.model_path) + model.get_response() From 3e27caa991cd42b6ef5b643e0e2fb3ffe6e67e44 Mon Sep 17 00:00:00 2001 From: Adam Wilson Date: Tue, 22 Apr 2025 21:25:26 -0600 Subject: [PATCH 4/8] fix path --- .github/workflows/llmsecops-cicd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llmsecops-cicd.yml b/.github/workflows/llmsecops-cicd.yml index ea9c45717..f9aa8d518 100644 --- a/.github/workflows/llmsecops-cicd.yml +++ b/.github/workflows/llmsecops-cicd.yml @@ -28,7 +28,7 @@ jobs: huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* --local-dir . pip install onnxruntime-genai # curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py - python phi3_language_model.py -v \ + python ${{ github.workspace }}/tests/llm/phi3_language_model.py -v \ --prompt 'Describe the principle of existence, from the first principles of philosophy.' \ -m cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4 \ -e cpu From 3ccb57e40350cf311dab682007df25a48c1756bf Mon Sep 17 00:00:00 2001 From: Adam Wilson Date: Tue, 22 Apr 2025 21:28:05 -0600 Subject: [PATCH 5/8] fix args --- .github/workflows/llmsecops-cicd.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/llmsecops-cicd.yml b/.github/workflows/llmsecops-cicd.yml index f9aa8d518..9be8285a8 100644 --- a/.github/workflows/llmsecops-cicd.yml +++ b/.github/workflows/llmsecops-cicd.yml @@ -28,10 +28,9 @@ jobs: huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* --local-dir . pip install onnxruntime-genai # curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py - python ${{ github.workspace }}/tests/llm/phi3_language_model.py -v \ + python ${{ github.workspace }}/tests/llm/phi3_language_model.py \ --prompt 'Describe the principle of existence, from the first principles of philosophy.' \ -m cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4 \ - -e cpu - name: Run REST API server run: | From 1ba607cd3012166f904a731dc40aea175cb3606b Mon Sep 17 00:00:00 2001 From: Adam Wilson Date: Wed, 23 Apr 2025 06:32:28 -0600 Subject: [PATCH 6/8] fix model ref --- tests/llm/phi3_language_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/llm/phi3_language_model.py b/tests/llm/phi3_language_model.py index 542a396c6..7c6ab73b0 100644 --- a/tests/llm/phi3_language_model.py +++ b/tests/llm/phi3_language_model.py @@ -7,20 +7,20 @@ import argparse class Phi3LanguageModel: def __init__(self, model_path): - # configure the ONNX runtime + # configure ONNX runtime config = og.Config(model_path) config.clear_providers() self.model = og.Model(config) - self.tokenizer = og.Tokenizer(model) + self.tokenizer = og.Tokenizer(self.model) self.tokenizer_stream = self.tokenizer.create_stream() def get_response(self, args): search_options = { 'max_length': 2048 } - params = og.GeneratorParams(model) + params = og.GeneratorParams(self.model) params.set_search_options(**search_options) - generator = og.Generator(model, params) + generator = og.Generator(self.model, params) # process prompt input and generate tokens prompt_input = args.prompt From 5d44d90047e7ae082c49187b09ce6f2a6b707987 Mon Sep 17 00:00:00 2001 From: Adam Wilson Date: Wed, 23 Apr 2025 06:56:24 -0600 Subject: [PATCH 7/8] fix prompt ref --- tests/llm/phi3_language_model.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/llm/phi3_language_model.py b/tests/llm/phi3_language_model.py index 7c6ab73b0..a08357ec4 100644 --- a/tests/llm/phi3_language_model.py +++ b/tests/llm/phi3_language_model.py @@ -15,7 +15,7 @@ class Phi3LanguageModel: self.tokenizer_stream = self.tokenizer.create_stream() - def get_response(self, args): + def get_response(self, prompt_input): search_options = { 'max_length': 2048 } params = og.GeneratorParams(self.model) @@ -23,7 +23,6 @@ class Phi3LanguageModel: generator = og.Generator(self.model, params) # process prompt input and generate tokens - prompt_input = args.prompt chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>' prompt = f'{chat_template.format(input=prompt_input)}' input_tokens = self.tokenizer.encode(prompt) @@ -54,4 +53,4 @@ if __name__ == "__main__": args = parser.parse_args() model = Phi3LanguageModel(args.model_path) - model.get_response() + model.get_response(args.prompt) From 771e2a1668695f07257fcf721a7711530a86df47 Mon Sep 17 00:00:00 2001 From: Adam Wilson Date: Wed, 23 Apr 2025 06:57:23 -0600 Subject: [PATCH 8/8] update workflow name --- .github/workflows/llmsecops-cicd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llmsecops-cicd.yml b/.github/workflows/llmsecops-cicd.yml index 9be8285a8..81806e940 100644 --- a/.github/workflows/llmsecops-cicd.yml +++ b/.github/workflows/llmsecops-cicd.yml @@ -1,4 +1,4 @@ -name: REST Server +name: LLM Prompt Testing on: # push: