From b83553d767220f362673892bd7bcd5be8027db58 Mon Sep 17 00:00:00 2001 From: Adam Wilson Date: Sun, 27 Jul 2025 17:44:40 -0600 Subject: [PATCH] make mitigation tests consistent --- .../test_00_benign_prompts_no_mitigation.py | 11 +------ ...test_01_malicious_prompts_no_mitigation.py | 17 ++-------- .../test_02_malicious_prompts_cot.py | 16 ++-------- .../test_03_malicious_prompts_rag.py | 16 ++-------- .../test_04_malicious_prompts_rag_and_cot.py | 18 ++--------- ...05_malicious_prompts_cot_with_reflexion.py | 31 +++++++++++++++++++ ...06_malicious_prompts_rag_with_reflexion.py | 30 ++++++++++++++++++ ...malicious_prompts_rag_and_cot_reflexion.py | 31 +++++++++++++++++++ 8 files changed, 102 insertions(+), 68 deletions(-) create mode 100644 tests/integration/test_05_malicious_prompts_cot_with_reflexion.py create mode 100644 tests/integration/test_06_malicious_prompts_rag_with_reflexion.py create mode 100644 tests/integration/test_07_malicious_prompts_rag_and_cot_reflexion.py diff --git a/tests/integration/test_00_benign_prompts_no_mitigation.py b/tests/integration/test_00_benign_prompts_no_mitigation.py index 088ba8b99..49c4d6793 100644 --- a/tests/integration/test_00_benign_prompts_no_mitigation.py +++ b/tests/integration/test_00_benign_prompts_no_mitigation.py @@ -1,17 +1,8 @@ """ Usage: - $ pytest -k test_00_benign_prompts_no_mitigation -s - - Data used: - 1. Samples of compromised output (resulting from NVIDIA garak tests) - 2. Samples of prompt injection input - - Test cases (batches of 100+ runs each): - 0. Baseline (no mitigations), with (neutral) mathematical prompt(s) - + $ pytest -k test_00_benign_prompts_no_mitigation -s --disable-warnings """ -import pytest from test_utils import run_prompt_analysis_test from src.text_generation.services.nlp.abstract_text_generation_completion_service import AbstractTextGenerationCompletionService diff --git a/tests/integration/test_01_malicious_prompts_no_mitigation.py b/tests/integration/test_01_malicious_prompts_no_mitigation.py index f1591642a..a4f43e4ae 100644 --- a/tests/integration/test_01_malicious_prompts_no_mitigation.py +++ b/tests/integration/test_01_malicious_prompts_no_mitigation.py @@ -1,14 +1,6 @@ """ Usage: - $ pytest -k test_01_malicious_prompts_no_mitigation -s - - Data used: - 1. Samples of compromised output (resulting from NVIDIA garak tests) - 2. Samples of prompt injection input - - Test cases (batches of 100+ runs each): - 1. Baseline (no mitigations), with malicious prompt(s) - + $ pytest -k test_01_malicious_prompts_no_mitigation -s --disable-warnings """ import pytest @@ -33,9 +25,4 @@ def test_01_malicious_prompts_no_mitigation( prompts=malicious_prompts, comparison_texts=prompt_injection_all_example_responses, service_configurator=configure_service - ) - - # Assertions specific to this test - # for result in results: - # # at least 1 close match means the prompt is likely an injection attempt - # assert result.max > 0.5 \ No newline at end of file + ) \ No newline at end of file diff --git a/tests/integration/test_02_malicious_prompts_cot.py b/tests/integration/test_02_malicious_prompts_cot.py index 417aed4c5..757df3245 100644 --- a/tests/integration/test_02_malicious_prompts_cot.py +++ b/tests/integration/test_02_malicious_prompts_cot.py @@ -1,20 +1,13 @@ """ Usage: - $ pytest -k test_02_malicious_prompts_cot -s - - Data used: - 1. Samples of compromised output (resulting from NVIDIA garak tests) - 2. Samples of prompt injection input - - Test cases (batches of 100+ runs each): - 2. Malicious prompt(s), zero-shot chain-of-thought prompt template - + $ pytest -k test_02_malicious_prompts_cot -s --disable-warnings """ import pytest from test_utils import run_prompt_analysis_test from src.text_generation.services.nlp.abstract_text_generation_completion_service import AbstractTextGenerationCompletionService + @pytest.mark.mitigation def test_02_malicious_prompts_cot( text_generation_completion_service: AbstractTextGenerationCompletionService, @@ -33,7 +26,4 @@ def test_02_malicious_prompts_cot( prompts=malicious_prompts, comparison_texts=prompt_injection_all_example_responses, service_configurator=configure_service - ) - # assert result.mean < 0.5 - # at least 1 close match means the prompt is likely an injection attempt - # assert result.max > 0.5 + ) \ No newline at end of file diff --git a/tests/integration/test_03_malicious_prompts_rag.py b/tests/integration/test_03_malicious_prompts_rag.py index 019fca34d..e530aad2d 100644 --- a/tests/integration/test_03_malicious_prompts_rag.py +++ b/tests/integration/test_03_malicious_prompts_rag.py @@ -1,14 +1,6 @@ """ Usage: - $ pytest -k test_03_malicious_prompts_rag -s - - Data used: - 1. Samples of compromised output (resulting from NVIDIA garak tests) - 2. Samples of prompt injection input - - Test cases (batches of 100+ runs each): - 3. Malicious prompt(s), RAG with few-shot examples prompt template - + $ pytest -k test_03_malicious_prompts_rag -s --disable-warnings """ import pytest @@ -34,8 +26,4 @@ def test_03_malicious_prompts_rag( prompts=malicious_prompts, comparison_texts=prompt_injection_all_example_responses, service_configurator=configure_service - ) - - # assert result.mean < 0.5 - # at least 1 close match means the prompt is likely an injection attempt - # assert result.max > 0.5 \ No newline at end of file + ) \ No newline at end of file diff --git a/tests/integration/test_04_malicious_prompts_rag_and_cot.py b/tests/integration/test_04_malicious_prompts_rag_and_cot.py index 7434fcef5..2f3860ea7 100644 --- a/tests/integration/test_04_malicious_prompts_rag_and_cot.py +++ b/tests/integration/test_04_malicious_prompts_rag_and_cot.py @@ -1,14 +1,6 @@ """ Usage: - $ pytest -k test_04_malicious_prompts_rag_and_cot -s - - Data used: - 1. Samples of compromised output (resulting from NVIDIA garak tests) - 2. Samples of prompt injection input - - Test cases (batches of 100+ runs each): - 3. Malicious prompt(s), RAG with few-shot examples prompt template - + $ pytest -k test_04_malicious_prompts_rag_and_cot -s --disable-warnings """ import pytest @@ -28,8 +20,6 @@ def test_04_malicious_prompts_rag_and_cot( .with_chain_of_thought_guidelines() .with_rag_context_guidelines()) - # TODO: reflexion tests - results = run_prompt_analysis_test( test_id=4, text_generation_completion_service=text_generation_completion_service, @@ -37,8 +27,4 @@ def test_04_malicious_prompts_rag_and_cot( prompts=malicious_prompts, comparison_texts=prompt_injection_all_example_responses, service_configurator=configure_service - ) - - # assert result.mean < 0.5 - # at least 1 close match means the prompt is likely an injection attempt - # assert result.max > 0.5 + ) \ No newline at end of file diff --git a/tests/integration/test_05_malicious_prompts_cot_with_reflexion.py b/tests/integration/test_05_malicious_prompts_cot_with_reflexion.py new file mode 100644 index 000000000..78ac45a03 --- /dev/null +++ b/tests/integration/test_05_malicious_prompts_cot_with_reflexion.py @@ -0,0 +1,31 @@ +""" + Usage: + $ pytest -k test_05_malicious_prompts_cot_with_reflexion -s --disable-warnings +""" + +import pytest +from test_utils import run_prompt_analysis_test +from src.text_generation.services.nlp.abstract_text_generation_completion_service import AbstractTextGenerationCompletionService + + +@pytest.mark.mitigation +def test_05_malicious_prompts_cot_with_reflexion( + text_generation_completion_service: AbstractTextGenerationCompletionService, + semantic_similarity_service, + malicious_prompts, + prompt_injection_all_example_responses +): + def configure_service(service: AbstractTextGenerationCompletionService): + return (service + .with_chain_of_thought_guidelines() + .with_reflexion_guardrails()) + + results = run_prompt_analysis_test( + test_id=5, + text_generation_completion_service=text_generation_completion_service, + semantic_similarity_service=semantic_similarity_service, + prompts=malicious_prompts, + comparison_texts=prompt_injection_all_example_responses, + service_configurator=configure_service + ) + diff --git a/tests/integration/test_06_malicious_prompts_rag_with_reflexion.py b/tests/integration/test_06_malicious_prompts_rag_with_reflexion.py new file mode 100644 index 000000000..74b6848d1 --- /dev/null +++ b/tests/integration/test_06_malicious_prompts_rag_with_reflexion.py @@ -0,0 +1,30 @@ +""" + Usage: + $ pytest -k test_06_malicious_prompts_rag_with_reflexion -s --disable-warnings +""" + +import pytest +from test_utils import run_prompt_analysis_test +from src.text_generation.services.nlp.abstract_text_generation_completion_service import AbstractTextGenerationCompletionService + + +@pytest.mark.mitigation +def test_06_malicious_prompts_rag_with_reflexion( + text_generation_completion_service: AbstractTextGenerationCompletionService, + semantic_similarity_service, + malicious_prompts, + prompt_injection_all_example_responses +): + def configure_service(service: AbstractTextGenerationCompletionService): + return (service + .with_rag_context_guidelines() + .with_reflexion_guardrails()) + + results = run_prompt_analysis_test( + test_id=6, + text_generation_completion_service=text_generation_completion_service, + semantic_similarity_service=semantic_similarity_service, + prompts=malicious_prompts, + comparison_texts=prompt_injection_all_example_responses, + service_configurator=configure_service + ) \ No newline at end of file diff --git a/tests/integration/test_07_malicious_prompts_rag_and_cot_reflexion.py b/tests/integration/test_07_malicious_prompts_rag_and_cot_reflexion.py new file mode 100644 index 000000000..ddf5e8e82 --- /dev/null +++ b/tests/integration/test_07_malicious_prompts_rag_and_cot_reflexion.py @@ -0,0 +1,31 @@ +""" + Usage: + $ pytest -k test_07_malicious_prompts_rag_and_cot_reflexion -s --disable-warnings +""" + +import pytest +from test_utils import run_prompt_analysis_test +from src.text_generation.services.nlp.abstract_text_generation_completion_service import AbstractTextGenerationCompletionService + + +@pytest.mark.mitigation +def test_07_malicious_prompts_rag_and_cot_reflexion( + text_generation_completion_service: AbstractTextGenerationCompletionService, + semantic_similarity_service, + malicious_prompts, + prompt_injection_all_example_responses +): + def configure_service(service: AbstractTextGenerationCompletionService): + return (service + .with_chain_of_thought_guidelines() + .with_rag_context_guidelines() + .with_reflexion_guardrails()) + + results = run_prompt_analysis_test( + test_id=7, + text_generation_completion_service=text_generation_completion_service, + semantic_similarity_service=semantic_similarity_service, + prompts=malicious_prompts, + comparison_texts=prompt_injection_all_example_responses, + service_configurator=configure_service + ) \ No newline at end of file