try smaller batches

Adam Wilson
2025-08-18 18:00:08 -06:00
parent 09eac1f050
commit 8c6230e0dc
5 changed files with 108 additions and 7 deletions
@@ -0,0 +1,25 @@
name: 'Test #4 | RAG + CoT | microsoft/Phi-3-mini-4k-instruct'

on:
  workflow_dispatch:

jobs:
  test-all-ranges:
    strategy:
      matrix:
        include:
          - offset: 0
            range: "1-20"
          - offset: 20
            range: "21-40"
          - offset: 40
            range: "41-60"
          - offset: 60
            range: "61-80"
          - offset: 80
            range: "81-100"
      fail-fast: false
    uses: ./.github/workflows/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.base.yml
    with:
      batch_offset: ${{ matrix.offset }}
      range_name: ${{ matrix.range }}
      batch_size: 2
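Taken together, the five matrix entries hand offsets 0-80 to the reusable workflow below, which runs ten batches of `batch_size: 2` each, so every range covers exactly twenty prompts. A minimal coverage sketch, assuming the start index is computed as `offset + (batch - 1) * batch_size` (inferred from `get_prompt_batch` further down, not stated in this file):

```python
# Sanity check (illustrative): 5 offsets x 10 batches x 2 prompts = prompts 1-100.
# Assumes start = offset + (batch - 1) * batch_size, per get_prompt_batch below.
covered = []
for offset in (0, 20, 40, 60, 80):
    for batch in range(1, 11):  # matrix.batch in the reusable workflow
        start = offset + (batch - 1) * 2
        covered.extend(range(start, start + 2))

assert covered == list(range(100))  # indices 0-99, each hit exactly once
```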
@@ -0,0 +1,59 @@
name: 'Reusable Test #4 | RAG + CoT | microsoft/Phi-3-mini-4k-instruct'

on:
  workflow_call:
    inputs:
      batch_offset:
        description: 'Starting prompt index offset'
        required: true
        type: number
      range_name:
        description: 'Human readable range name (e.g., "1-20")'
        required: true
        type: string
      batch_size:
        description: 'Number of prompts per batch'
        required: false
        type: number
        default: 2

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      # Always 10 batches per workflow
      matrix:
        batch: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
      fail-fast: false
    steps:
      - name: 'checkout'
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      - name: 'set up Python'
        uses: actions/setup-python@v3
        with:
          python-version: '3.12'
      - name: 'set up Python dependencies'
        shell: bash
        run: pip install -r ${{ github.workspace }}/requirements.txt
      - name: 'run text generation tests - range ${{ inputs.range_name }} batch ${{ matrix.batch }}'
        shell: bash
        env:
          PROMPT_BATCH: ${{ matrix.batch }}
          BATCH_SIZE: ${{ inputs.batch_size }}
          BATCH_OFFSET: ${{ inputs.batch_offset }}
        run: pytest tests/integration/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.py -s --disable-warnings
      - name: Check for changes
        id: verify-changed-files
        run: |
          if [ -n "$(git status --porcelain)" ]; then
            echo "changed=true" >> $GITHUB_OUTPUT
          else
            echo "changed=false" >> $GITHUB_OUTPUT
          fi
      - name: Commit and push changes
        if: steps.verify-changed-files.outputs.changed == 'true'
        run: |
          git config --local user.email "42450907+lightbroker@users.noreply.github.com"
          git config --local user.name "Adam Wilson"
          git add .
          git commit -m "Auto-generated files from range ${{ inputs.range_name }} batch ${{ matrix.batch }} [skip ci]"
          git push
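The `Check for changes` step treats any `git status --porcelain` output as a dirty tree before committing. For reproducing that gate outside Actions, a rough local equivalent (the helper name here is mine, not from the repo):

```python
import subprocess

def working_tree_dirty() -> bool:
    # Any porcelain output means uncommitted changes, mirroring the workflow step.
    result = subprocess.run(
        ["git", "status", "--porcelain"],
        capture_output=True, text=True, check=True
    )
    return bool(result.stdout.strip())

print(f"changed={str(working_tree_dirty()).lower()}")  # e.g. changed=true
```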
@@ -13,10 +13,16 @@ from src.text_generation.services.logging.abstract_test_run_logging_service impo
 class TestRunLoggingService(AbstractTestRunLoggingService):

-    def __init__(self, test_id: int, model_id: ModelId):
+    def __init__(
+        self,
+        test_id: int,
+        model_id: ModelId,
+        start: int,
+        end: int
+    ):
         self._lock = threading.Lock()
         timestamp = calendar.timegm(time.gmtime())
-        self.log_file_path = f"./tests/logs/test_{test_id}/{str(model_id.value).replace("/", "_")}/test_{test_id}_logs_{timestamp}.json"
+        self.log_file_path = f"./tests/logs/test_{test_id}/{start}_{end}/{str(model_id.value).replace("/", "_")}/test_{test_id}_logs_{timestamp}.json"
         self._ensure_log_file_exists()

     def _ensure_log_file_exists(self):
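With the new `start`/`end` path segment, each batch writes logs to its own directory instead of sharing one per model. Illustrative values only; the model string is assumed from the workflow name:

```python
# Illustrative path construction; values are hypothetical.
test_id, start, end = 4, 44, 46
model_value = "microsoft/Phi-3-mini-4k-instruct"  # assumed ModelId value
timestamp = 1755561608                            # calendar.timegm(time.gmtime())
path = (f"./tests/logs/test_{test_id}/{start}_{end}/"
        f"{model_value.replace('/', '_')}/test_{test_id}_logs_{timestamp}.json")
# -> ./tests/logs/test_4/44_46/microsoft_Phi-3-mini-4k-instruct/test_4_logs_1755561608.json
```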
@@ -18,12 +18,16 @@ def test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct(
         .with_chain_of_thought_guidelines()
         .with_rag_context_guidelines())
+    prompt_batch, start, end = get_prompt_batch(malicious_prompts)
+
     results = run_prompt_analysis_test(
         test_id=4,
         model_id=ModelId.MICROSOFT_PHI_3_MINI4K_INSTRUCT,
         text_generation_completion_service=text_generation_completion_service,
         semantic_similarity_service=semantic_similarity_service,
-        prompts=get_prompt_batch(malicious_prompts),
+        prompts=prompt_batch,
+        start=start,
+        end=end,
         comparison_texts=prompt_injection_all_example_responses,
         service_configurator=configure_service
     )
@@ -11,7 +11,7 @@ from src.text_generation.services.nlp.text_generation_completion_service import
-def get_prompt_batch(prompts, batch_size=10, env_var='PROMPT_BATCH'):
+def get_prompt_batch(prompts: List[str], batch_size=10, env_var='PROMPT_BATCH'):
     batch_size = int(os.getenv('BATCH_SIZE', '2'))
     batch_num = int(os.getenv('PROMPT_BATCH', '1'))
@@ -29,11 +29,11 @@ def get_prompt_batch(prompts, batch_size=10, env_var='PROMPT_BATCH'):
     end_idx = min(start_idx + batch_size, len(prompts))

     # Get the subset of prompts for this batch
-    prompt_subset = prompts[start_idx:end_idx]
+    prompt_subset: List[str] = prompts[start_idx:end_idx]

     print(f"Running batch {batch_num} (range offset {offset}): prompts {start_idx+1}-{end_idx} ({len(prompt_subset)} prompts)")

-    return prompt_subset
+    return prompt_subset, start_idx, end_idx
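The hunk elides the lines that read `BATCH_OFFSET` and compute `start_idx`, so the sketch below is inferred from the surrounding code and the workflow's env wiring, not copied from the commit:

```python
import os

# Inferred reconstruction: the elided lines presumably read BATCH_OFFSET
# and derive start_idx from it.
batch_size = int(os.getenv('BATCH_SIZE', '2'))
batch_num = int(os.getenv('PROMPT_BATCH', '1'))
offset = int(os.getenv('BATCH_OFFSET', '0'))

start_idx = offset + (batch_num - 1) * batch_size
end_idx = min(start_idx + batch_size, 100)  # 100 stands in for len(prompts)

# Example: BATCH_OFFSET=40, PROMPT_BATCH=3 -> indices 44:46, i.e. prompts 45-46
```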
@@ -42,6 +42,8 @@ def run_prompt_analysis_test(
     text_generation_completion_service: AbstractTextGenerationCompletionService,
     semantic_similarity_service: AbstractSemanticSimilarityService,
     prompts: List,
+    start: int,
+    end: int,
     comparison_texts: List,
     service_configurator: Callable,
     max_prompts: int = 100
@@ -80,7 +82,12 @@ def run_prompt_analysis_test(
         print(f'{i}/{len(prompts)} Max Score: {result.max}')
         print(f'{i}/{len(prompts)} Avg Score: {result.mean}')

-        TestRunLoggingService(test_id=test_id, model_id=model_id).log_results(
+        TestRunLoggingService(
+            test_id=test_id,
+            model_id=model_id,
+            start=start,
+            end=end
+        ).log_results(
             id=inspect.currentframe().f_back.f_code.co_name,
             text_generation_completion_result=completion_result,
             final_completion_text_score=result.max,