diff --git a/.github/workflows/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.01-20.yml b/.github/workflows/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.01-20.yml
new file mode 100644
index 000000000..4ed1be9fb
--- /dev/null
+++ b/.github/workflows/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.01-20.yml
@@ -0,0 +1,25 @@
+name: 'Test #4 | RAG + CoT | microsoft/Phi-3-mini-4k-instruct'
+on:
+  workflow_dispatch:
+
+jobs:
+  test-all-ranges:
+    strategy:
+      matrix:
+        include:
+          - offset: 0
+            range: "1-20"
+          - offset: 20
+            range: "21-40"
+          - offset: 40
+            range: "41-60"
+          - offset: 60
+            range: "61-80"
+          - offset: 80
+            range: "81-100"
+      fail-fast: false
+    uses: ./.github/workflows/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.base.yml
+    with:
+      batch_offset: ${{ matrix.offset }}
+      range_name: ${{ matrix.range }}
+      batch_size: 2
\ No newline at end of file
diff --git a/.github/workflows/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.base.yml b/.github/workflows/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.base.yml
new file mode 100644
index 000000000..b513e57fc
--- /dev/null
+++ b/.github/workflows/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.base.yml
@@ -0,0 +1,59 @@
+name: 'Reusable Test #4 | RAG + CoT | microsoft/Phi-3-mini-4k-instruct'
+on:
+  workflow_call:
+    inputs:
+      batch_offset:
+        description: 'Starting prompt index offset'
+        required: true
+        type: number
+      range_name:
+        description: 'Human readable range name (e.g., "1-20")'
+        required: true
+        type: string
+      batch_size:
+        description: 'Number of prompts per batch'
+        required: false
+        type: number
+        default: 2
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      # Always 10 batches per workflow
+      matrix:
+        batch: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+      fail-fast: false
+    steps:
+      - name: 'checkout'
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+      - name: 'set up Python'
+        uses: actions/setup-python@v3
+        with:
+          python-version: '3.12'
+      - name: 'set up Python dependencies'
+        shell: bash
+        run: pip install -r ${{ github.workspace }}/requirements.txt
+      - name: 'run text generation tests - range ${{ inputs.range_name }} batch ${{ matrix.batch }}'
+        shell: bash
+        env:
+          PROMPT_BATCH: ${{ matrix.batch }}
+          BATCH_SIZE: ${{ inputs.batch_size }}
+          BATCH_OFFSET: ${{ inputs.batch_offset }}
+        run: pytest tests/integration/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.py -s --disable-warnings
+      - name: Check for changes
+        id: verify-changed-files
+        run: |
+          if [ -n "$(git status --porcelain)" ]; then
+            echo "changed=true" >> $GITHUB_OUTPUT
+          else
+            echo "changed=false" >> $GITHUB_OUTPUT
+          fi
+      - name: Commit and push changes
+        if: steps.verify-changed-files.outputs.changed == 'true'
+        run: |
+          git config --local user.email "42450907+lightbroker@users.noreply.github.com"
+          git config --local user.name "Adam Wilson"
+          git add .
+          git commit -m "Auto-generated files from range ${{ inputs.range_name }} batch ${{ matrix.batch }} [skip ci]"
+          git push
\ No newline at end of file
diff --git a/src/text_generation/services/logging/test_run_logging_service.py b/src/text_generation/services/logging/test_run_logging_service.py
index 9f3146130..88c66efb9 100644
--- a/src/text_generation/services/logging/test_run_logging_service.py
+++ b/src/text_generation/services/logging/test_run_logging_service.py
@@ -13,10 +13,16 @@ from src.text_generation.services.logging.abstract_test_run_logging_service impo
 
 class TestRunLoggingService(AbstractTestRunLoggingService):
 
-    def __init__(self, test_id: int, model_id: ModelId):
+    def __init__(
+        self,
+        test_id: int,
+        model_id: ModelId,
+        start: int,
+        end: int
+    ):
         self._lock = threading.Lock()
         timestamp = calendar.timegm(time.gmtime())
-        self.log_file_path = f"./tests/logs/test_{test_id}/{str(model_id.value).replace("/", "_")}/test_{test_id}_logs_{timestamp}.json"
+        self.log_file_path = f"./tests/logs/test_{test_id}/{start}_{end}/{str(model_id.value).replace("/", "_")}/test_{test_id}_logs_{timestamp}.json"
         self._ensure_log_file_exists()
 
     def _ensure_log_file_exists(self):
diff --git a/tests/integration/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.py b/tests/integration/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.py
index 0a9ede299..4fc7a230b 100644
--- a/tests/integration/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.py
+++ b/tests/integration/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.py
@@ -18,12 +18,16 @@ def test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct(
         .with_chain_of_thought_guidelines()
         .with_rag_context_guidelines())
 
+    prompt_batch, start, end = get_prompt_batch(malicious_prompts)
+
     results = run_prompt_analysis_test(
         test_id=4,
         model_id=ModelId.MICROSOFT_PHI_3_MINI4K_INSTRUCT,
         text_generation_completion_service=text_generation_completion_service,
         semantic_similarity_service=semantic_similarity_service,
-        prompts=get_prompt_batch(malicious_prompts),
+        prompts=prompt_batch,
+        start=start,
+        end=end,
         comparison_texts=prompt_injection_all_example_responses,
         service_configurator=configure_service
     )
\ No newline at end of file
diff --git a/tests/integration/test_utils.py b/tests/integration/test_utils.py
index f8bfe36a3..7c8edeaf5 100644
--- a/tests/integration/test_utils.py
+++ b/tests/integration/test_utils.py
@@ -11,7 +11,7 @@ from src.text_generation.services.nlp.text_generation_completion_service import
 
 
 
-def get_prompt_batch(prompts, batch_size=10, env_var='PROMPT_BATCH'):
+def get_prompt_batch(prompts: List[str], batch_size=10, env_var='PROMPT_BATCH'):
     batch_size = int(os.getenv('BATCH_SIZE', '2'))
     batch_num = int(os.getenv('PROMPT_BATCH', '1'))
 
@@ -29,11 +29,11 @@ def get_prompt_batch(prompts, batch_size=10, env_var='PROMPT_BATCH'):
     end_idx = min(start_idx + batch_size, len(prompts))
 
     # Get the subset of prompts for this batch
-    prompt_subset = prompts[start_idx:end_idx]
+    prompt_subset: List[str] = prompts[start_idx:end_idx]
 
     print(f"Running batch {batch_num} (range offset {offset}): prompts {start_idx+1}-{end_idx} ({len(prompt_subset)} prompts)")
 
-    return prompt_subset
+    return prompt_subset, start_idx, end_idx
 
 
 def run_prompt_analysis_test(
@@ -42,6 +42,8 @@ def run_prompt_analysis_test(
     text_generation_completion_service: AbstractTextGenerationCompletionService,
     semantic_similarity_service: AbstractSemanticSimilarityService,
     prompts: List,
+    start: int,
+    end: int,
     comparison_texts: List,
     service_configurator: Callable,
     max_prompts: int = 100
@@ -80,7 +82,12 @@ def run_prompt_analysis_test(
         print(f'{i}/{len(prompts)} Max Score: {result.max}')
         print(f'{i}/{len(prompts)} Avg Score: {result.mean}')
 
-        TestRunLoggingService(test_id=test_id, model_id=model_id).log_results(
+        TestRunLoggingService(
+            test_id=test_id,
+            model_id=model_id,
+            start=start,
+            end=end
+        ).log_results(
             id=inspect.currentframe().f_back.f_code.co_name,
             text_generation_completion_result=completion_result,
             final_completion_text_score=result.max,
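
Reviewer note on the batching scheme (a sketch, not part of the patch): the five matrix entries in the dispatch workflow (offsets 0/20/40/60/80), the 10-batch matrix in the reusable workflow, and the default batch_size of 2 together cover exactly prompts 1-100 (5 x 10 x 2). The hunks above do not show how get_prompt_batch derives offset and start_idx from the environment, so the Python sketch below is an inferred reconstruction from the exported BATCH_OFFSET / PROMPT_BATCH / BATCH_SIZE variables and the printed message; the start_idx formula is an assumption, not confirmed by the diff.

    import os
    from typing import List, Tuple

    def get_prompt_batch_sketch(prompts: List[str]) -> Tuple[List[str], int, int]:
        # Hypothetical reconstruction of the elided arithmetic in
        # tests/integration/test_utils.py::get_prompt_batch.
        batch_size = int(os.getenv('BATCH_SIZE', '2'))    # prompts per batch
        batch_num = int(os.getenv('PROMPT_BATCH', '1'))   # 1..10, from the job matrix
        offset = int(os.getenv('BATCH_OFFSET', '0'))      # 0, 20, 40, 60, or 80

        # Assumed: each workflow range starts at `offset`, and successive
        # batches step through it in batch_size increments.
        start_idx = offset + (batch_num - 1) * batch_size
        end_idx = min(start_idx + batch_size, len(prompts))
        return prompts[start_idx:end_idx], start_idx, end_idx

    # Example: BATCH_OFFSET=40, PROMPT_BATCH=3, BATCH_SIZE=2 gives
    # start_idx = 44, end_idx = 46, i.e. prompts 45-46 of the "41-60" range.

Under that reading, the new {start}_{end} path segment in TestRunLoggingService yields a distinct log directory per batch, e.g. ./tests/logs/test_4/44_46/microsoft_Phi-3-mini-4k-instruct/test_4_logs_<timestamp>.json, so the 50 concurrent matrix jobs do not commit to the same file.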