try smaller batches

Adam Wilson
2025-08-18 18:00:08 -06:00
parent 09eac1f050
commit 8c6230e0dc
5 changed files with 108 additions and 7 deletions
@@ -0,0 +1,25 @@
name: 'Test #4 | RAG + CoT | microsoft/Phi-3-mini-4k-instruct'

on:
  workflow_dispatch:

jobs:
  test-all-ranges:
    strategy:
      matrix:
        include:
          - offset: 0
            range: "1-20"
          - offset: 20
            range: "21-40"
          - offset: 40
            range: "41-60"
          - offset: 60
            range: "61-80"
          - offset: 80
            range: "81-100"
      fail-fast: false
    uses: ./.github/workflows/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.base.yml
    with:
      batch_offset: ${{ matrix.offset }}
      range_name: ${{ matrix.range }}
      batch_size: 2
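Taken together, the five matrix entries hand offsets 0-80 to the reusable workflow below, which runs ten batches of `batch_size: 2` each, so every range covers exactly twenty prompts. A minimal coverage sketch, assuming the start index is computed as `offset + (batch - 1) * batch_size` (inferred from `get_prompt_batch` further down, not stated in this file):

```python
# Sanity check (illustrative): 5 offsets x 10 batches x 2 prompts = prompts 1-100.
# Assumes start = offset + (batch - 1) * batch_size, per get_prompt_batch below.
covered = []
for offset in (0, 20, 40, 60, 80):
    for batch in range(1, 11):  # matrix.batch in the reusable workflow
        start = offset + (batch - 1) * 2
        covered.extend(range(start, start + 2))

assert covered == list(range(100))  # indices 0-99, each hit exactly once
```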
@@ -0,0 +1,59 @@
name: 'Reusable Test #4 | RAG + CoT | microsoft/Phi-3-mini-4k-instruct'

on:
  workflow_call:
    inputs:
      batch_offset:
        description: 'Starting prompt index offset'
        required: true
        type: number
      range_name:
        description: 'Human readable range name (e.g., "1-20")'
        required: true
        type: string
      batch_size:
        description: 'Number of prompts per batch'
        required: false
        type: number
        default: 2

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      # Always 10 batches per workflow
      matrix:
        batch: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
      fail-fast: false
    steps:
      - name: 'checkout'
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      - name: 'set up Python'
        uses: actions/setup-python@v3
        with:
          python-version: '3.12'
      - name: 'set up Python dependencies'
        shell: bash
        run: pip install -r ${{ github.workspace }}/requirements.txt
      - name: 'run text generation tests - range ${{ inputs.range_name }} batch ${{ matrix.batch }}'
        shell: bash
        env:
          PROMPT_BATCH: ${{ matrix.batch }}
          BATCH_SIZE: ${{ inputs.batch_size }}
          BATCH_OFFSET: ${{ inputs.batch_offset }}
        run: pytest tests/integration/test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct.py -s --disable-warnings
      - name: Check for changes
        id: verify-changed-files
        run: |
          if [ -n "$(git status --porcelain)" ]; then
            echo "changed=true" >> $GITHUB_OUTPUT
          else
            echo "changed=false" >> $GITHUB_OUTPUT
          fi
      - name: Commit and push changes
        if: steps.verify-changed-files.outputs.changed == 'true'
        run: |
          git config --local user.email "42450907+lightbroker@users.noreply.github.com"
          git config --local user.name "Adam Wilson"
          git add .
          git commit -m "Auto-generated files from range ${{ inputs.range_name }} batch ${{ matrix.batch }} [skip ci]"
          git push
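The `Check for changes` step treats any `git status --porcelain` output as a dirty tree before committing. For reproducing that gate outside Actions, a rough local equivalent (the helper name here is mine, not from the repo):

```python
import subprocess

def working_tree_dirty() -> bool:
    # Any porcelain output means uncommitted changes, mirroring the workflow step.
    result = subprocess.run(
        ["git", "status", "--porcelain"],
        capture_output=True, text=True, check=True
    )
    return bool(result.stdout.strip())

print(f"changed={str(working_tree_dirty()).lower()}")  # e.g. changed=true
```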
@@ -13,10 +13,16 @@ from src.text_generation.services.logging.abstract_test_run_logging_service impo
 class TestRunLoggingService(AbstractTestRunLoggingService):

-    def __init__(self, test_id: int, model_id: ModelId):
+    def __init__(
+        self,
+        test_id: int,
+        model_id: ModelId,
+        start: int,
+        end: int
+    ):
         self._lock = threading.Lock()
         timestamp = calendar.timegm(time.gmtime())
-        self.log_file_path = f"./tests/logs/test_{test_id}/{str(model_id.value).replace("/", "_")}/test_{test_id}_logs_{timestamp}.json"
+        self.log_file_path = f"./tests/logs/test_{test_id}/{start}_{end}/{str(model_id.value).replace("/", "_")}/test_{test_id}_logs_{timestamp}.json"
         self._ensure_log_file_exists()

     def _ensure_log_file_exists(self):
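With the new `start`/`end` path segment, each batch writes logs to its own directory instead of sharing one per model. Illustrative values only; the model string is assumed from the workflow name:

```python
# Illustrative path construction; values are hypothetical.
test_id, start, end = 4, 44, 46
model_value = "microsoft/Phi-3-mini-4k-instruct"  # assumed ModelId value
timestamp = 1755561608                            # calendar.timegm(time.gmtime())
path = (f"./tests/logs/test_{test_id}/{start}_{end}/"
        f"{model_value.replace('/', '_')}/test_{test_id}_logs_{timestamp}.json")
# -> ./tests/logs/test_4/44_46/microsoft_Phi-3-mini-4k-instruct/test_4_logs_1755561608.json
```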
@@ -18,12 +18,16 @@ def test_04_malicious_prompts_rag_and_cot_microsoft_phi_3_mini4k_instruct(
         .with_chain_of_thought_guidelines()
         .with_rag_context_guidelines())
+    prompt_batch, start, end = get_prompt_batch(malicious_prompts)
+
     results = run_prompt_analysis_test(
         test_id=4,
         model_id=ModelId.MICROSOFT_PHI_3_MINI4K_INSTRUCT,
         text_generation_completion_service=text_generation_completion_service,
         semantic_similarity_service=semantic_similarity_service,
-        prompts=get_prompt_batch(malicious_prompts),
+        prompts=prompt_batch,
+        start=start,
+        end=end,
         comparison_texts=prompt_injection_all_example_responses,
         service_configurator=configure_service
     )
@@ -11,7 +11,7 @@ from src.text_generation.services.nlp.text_generation_completion_service import
-def get_prompt_batch(prompts, batch_size=10, env_var='PROMPT_BATCH'):
+def get_prompt_batch(prompts: List[str], batch_size=10, env_var='PROMPT_BATCH'):
     batch_size = int(os.getenv('BATCH_SIZE', '2'))
     batch_num = int(os.getenv('PROMPT_BATCH', '1'))
@@ -29,11 +29,11 @@ def get_prompt_batch(prompts, batch_size=10, env_var='PROMPT_BATCH'):
     end_idx = min(start_idx + batch_size, len(prompts))

     # Get the subset of prompts for this batch
-    prompt_subset = prompts[start_idx:end_idx]
+    prompt_subset: List[str] = prompts[start_idx:end_idx]

     print(f"Running batch {batch_num} (range offset {offset}): prompts {start_idx+1}-{end_idx} ({len(prompt_subset)} prompts)")

-    return prompt_subset
+    return prompt_subset, start_idx, end_idx
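The hunk elides the lines that read `BATCH_OFFSET` and compute `start_idx`, so the sketch below is inferred from the surrounding code and the workflow's env wiring, not copied from the commit:

```python
import os

# Inferred reconstruction: the elided lines presumably read BATCH_OFFSET
# and derive start_idx from it.
batch_size = int(os.getenv('BATCH_SIZE', '2'))
batch_num = int(os.getenv('PROMPT_BATCH', '1'))
offset = int(os.getenv('BATCH_OFFSET', '0'))

start_idx = offset + (batch_num - 1) * batch_size
end_idx = min(start_idx + batch_size, 100)  # 100 stands in for len(prompts)

# Example: BATCH_OFFSET=40, PROMPT_BATCH=3 -> indices 44:46, i.e. prompts 45-46
```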
@@ -42,6 +42,8 @@ def run_prompt_analysis_test(
     text_generation_completion_service: AbstractTextGenerationCompletionService,
     semantic_similarity_service: AbstractSemanticSimilarityService,
     prompts: List,
+    start: int,
+    end: int,
     comparison_texts: List,
     service_configurator: Callable,
     max_prompts: int = 100
@@ -80,7 +82,12 @@ def run_prompt_analysis_test(
         print(f'{i}/{len(prompts)} Max Score: {result.max}')
         print(f'{i}/{len(prompts)} Avg Score: {result.mean}')

-        TestRunLoggingService(test_id=test_id, model_id=model_id).log_results(
+        TestRunLoggingService(
+            test_id=test_id,
+            model_id=model_id,
+            start=start,
+            end=end
+        ).log_results(
             id=inspect.currentframe().f_back.f_code.co_name,
             text_generation_completion_result=completion_result,
             final_completion_text_score=result.max,