make mitigation tests consistent

This commit is contained in:
Adam Wilson
2025-07-27 17:44:40 -06:00
parent dcff18a058
commit b83553d767
8 changed files with 102 additions and 68 deletions
@@ -1,17 +1,8 @@
"""
Usage:
$ pytest -k test_00_benign_prompts_no_mitigation -s
Data used:
1. Samples of compromised output (resulting from NVIDIA garak tests)
2. Samples of prompt injection input
Test cases (batches of 100+ runs each):
0. Baseline (no mitigations), with (neutral) mathematical prompt(s)
$ pytest -k test_00_benign_prompts_no_mitigation -s --disable-warnings
"""
import pytest
from test_utils import run_prompt_analysis_test
from src.text_generation.services.nlp.abstract_text_generation_completion_service import AbstractTextGenerationCompletionService
@@ -1,14 +1,6 @@
"""
Usage:
$ pytest -k test_01_malicious_prompts_no_mitigation -s
Data used:
1. Samples of compromised output (resulting from NVIDIA garak tests)
2. Samples of prompt injection input
Test cases (batches of 100+ runs each):
1. Baseline (no mitigations), with malicious prompt(s)
$ pytest -k test_01_malicious_prompts_no_mitigation -s --disable-warnings
"""
import pytest
@@ -33,9 +25,4 @@ def test_01_malicious_prompts_no_mitigation(
prompts=malicious_prompts,
comparison_texts=prompt_injection_all_example_responses,
service_configurator=configure_service
)
# Assertions specific to this test
# for result in results:
# # at least 1 close match means the prompt is likely an injection attempt
# assert result.max > 0.5
)
@@ -1,20 +1,13 @@
"""
Usage:
$ pytest -k test_02_malicious_prompts_cot -s
Data used:
1. Samples of compromised output (resulting from NVIDIA garak tests)
2. Samples of prompt injection input
Test cases (batches of 100+ runs each):
2. Malicious prompt(s), zero-shot chain-of-thought prompt template
$ pytest -k test_02_malicious_prompts_cot -s --disable-warnings
"""
import pytest
from test_utils import run_prompt_analysis_test
from src.text_generation.services.nlp.abstract_text_generation_completion_service import AbstractTextGenerationCompletionService
@pytest.mark.mitigation
def test_02_malicious_prompts_cot(
text_generation_completion_service: AbstractTextGenerationCompletionService,
@@ -33,7 +26,4 @@ def test_02_malicious_prompts_cot(
prompts=malicious_prompts,
comparison_texts=prompt_injection_all_example_responses,
service_configurator=configure_service
)
# assert result.mean < 0.5
# at least 1 close match means the prompt is likely an injection attempt
# assert result.max > 0.5
)
@@ -1,14 +1,6 @@
"""
Usage:
$ pytest -k test_03_malicious_prompts_rag -s
Data used:
1. Samples of compromised output (resulting from NVIDIA garak tests)
2. Samples of prompt injection input
Test cases (batches of 100+ runs each):
3. Malicious prompt(s), RAG with few-shot examples prompt template
$ pytest -k test_03_malicious_prompts_rag -s --disable-warnings
"""
import pytest
@@ -34,8 +26,4 @@ def test_03_malicious_prompts_rag(
prompts=malicious_prompts,
comparison_texts=prompt_injection_all_example_responses,
service_configurator=configure_service
)
# assert result.mean < 0.5
# at least 1 close match means the prompt is likely an injection attempt
# assert result.max > 0.5
)
@@ -1,14 +1,6 @@
"""
Usage:
$ pytest -k test_04_malicious_prompts_rag_and_cot -s
Data used:
1. Samples of compromised output (resulting from NVIDIA garak tests)
2. Samples of prompt injection input
Test cases (batches of 100+ runs each):
3. Malicious prompt(s), RAG with few-shot examples prompt template
$ pytest -k test_04_malicious_prompts_rag_and_cot -s --disable-warnings
"""
import pytest
@@ -28,8 +20,6 @@ def test_04_malicious_prompts_rag_and_cot(
.with_chain_of_thought_guidelines()
.with_rag_context_guidelines())
# TODO: reflexion tests
results = run_prompt_analysis_test(
test_id=4,
text_generation_completion_service=text_generation_completion_service,
@@ -37,8 +27,4 @@ def test_04_malicious_prompts_rag_and_cot(
prompts=malicious_prompts,
comparison_texts=prompt_injection_all_example_responses,
service_configurator=configure_service
)
# assert result.mean < 0.5
# at least 1 close match means the prompt is likely an injection attempt
# assert result.max > 0.5
)
@@ -0,0 +1,31 @@
"""
Usage:
$ pytest -k test_05_malicious_prompts_cot_with_reflexion -s --disable-warnings
"""
import pytest
from test_utils import run_prompt_analysis_test
from src.text_generation.services.nlp.abstract_text_generation_completion_service import AbstractTextGenerationCompletionService
@pytest.mark.mitigation
def test_05_malicious_prompts_cot_with_reflexion(
    text_generation_completion_service: AbstractTextGenerationCompletionService,
    semantic_similarity_service,
    malicious_prompts,
    prompt_injection_all_example_responses
):
    """Run prompt analysis test 5: malicious prompts with CoT + reflexion.

    The completion service is configured with chain-of-thought guidelines
    followed by reflexion guardrails, then handed to
    ``run_prompt_analysis_test`` together with the malicious prompts and the
    example prompt-injection responses used as comparison texts.

    All arguments are supplied by pytest (presumably fixtures defined
    outside this file -- confirm in conftest).
    """

    def configure_service(service: AbstractTextGenerationCompletionService):
        # Each ``with_*`` call returns the object the next call is made on;
        # the result of the last call is what the configurator returns.
        cot_service = service.with_chain_of_thought_guidelines()
        return cot_service.with_reflexion_guardrails()

    # NOTE(review): the returned value is bound but not asserted on here.
    analysis_results = run_prompt_analysis_test(
        service_configurator=configure_service,
        comparison_texts=prompt_injection_all_example_responses,
        prompts=malicious_prompts,
        semantic_similarity_service=semantic_similarity_service,
        text_generation_completion_service=text_generation_completion_service,
        test_id=5,
    )
@@ -0,0 +1,30 @@
"""
Usage:
$ pytest -k test_06_malicious_prompts_rag_with_reflexion -s --disable-warnings
"""
import pytest
from test_utils import run_prompt_analysis_test
from src.text_generation.services.nlp.abstract_text_generation_completion_service import AbstractTextGenerationCompletionService
@pytest.mark.mitigation
def test_06_malicious_prompts_rag_with_reflexion(
    text_generation_completion_service: AbstractTextGenerationCompletionService,
    semantic_similarity_service,
    malicious_prompts,
    prompt_injection_all_example_responses
):
    """Run prompt analysis test 6: malicious prompts with RAG + reflexion.

    The completion service is configured with RAG context guidelines
    followed by reflexion guardrails, then handed to
    ``run_prompt_analysis_test`` together with the malicious prompts and the
    example prompt-injection responses used as comparison texts.

    All arguments are supplied by pytest (presumably fixtures defined
    outside this file -- confirm in conftest).
    """

    def configure_service(service: AbstractTextGenerationCompletionService):
        # Each ``with_*`` call returns the object the next call is made on;
        # the result of the last call is what the configurator returns.
        rag_service = service.with_rag_context_guidelines()
        return rag_service.with_reflexion_guardrails()

    # NOTE(review): the returned value is bound but not asserted on here.
    analysis_results = run_prompt_analysis_test(
        service_configurator=configure_service,
        comparison_texts=prompt_injection_all_example_responses,
        prompts=malicious_prompts,
        semantic_similarity_service=semantic_similarity_service,
        text_generation_completion_service=text_generation_completion_service,
        test_id=6,
    )
@@ -0,0 +1,31 @@
"""
Usage:
$ pytest -k test_07_malicious_prompts_rag_and_cot_reflexion -s --disable-warnings
"""
import pytest
from test_utils import run_prompt_analysis_test
from src.text_generation.services.nlp.abstract_text_generation_completion_service import AbstractTextGenerationCompletionService
@pytest.mark.mitigation
def test_07_malicious_prompts_rag_and_cot_reflexion(
    text_generation_completion_service: AbstractTextGenerationCompletionService,
    semantic_similarity_service,
    malicious_prompts,
    prompt_injection_all_example_responses
):
    """Run prompt analysis test 7: malicious prompts with CoT + RAG + reflexion.

    The completion service is configured with chain-of-thought guidelines,
    then RAG context guidelines, then reflexion guardrails, and is handed to
    ``run_prompt_analysis_test`` together with the malicious prompts and the
    example prompt-injection responses used as comparison texts.

    All arguments are supplied by pytest (presumably fixtures defined
    outside this file -- confirm in conftest).
    """

    def configure_service(service: AbstractTextGenerationCompletionService):
        # Applied in the same order as the original chained expression:
        # chain-of-thought, then RAG context, then reflexion guardrails.
        cot_service = service.with_chain_of_thought_guidelines()
        cot_rag_service = cot_service.with_rag_context_guidelines()
        return cot_rag_service.with_reflexion_guardrails()

    # NOTE(review): the returned value is bound but not asserted on here.
    analysis_results = run_prompt_analysis_test(
        service_configurator=configure_service,
        comparison_texts=prompt_injection_all_example_responses,
        prompts=malicious_prompts,
        semantic_similarity_service=semantic_similarity_service,
        text_generation_completion_service=text_generation_completion_service,
        test_id=7,
    )