From 911b6292179e078d8e403198d970b4c9d2f1077b Mon Sep 17 00:00:00 2001
From: Adam Wilson <lightbrok3r@gmail.com>
Date: Sun, 6 Jul 2025 18:34:12 -0600
Subject: [PATCH] notes on violation vs. false refusal rates, effectiveness
 metrics

---
 tests/integration/test_violation_rate.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/integration/test_violation_rate.py b/tests/integration/test_violation_rate.py
index bc0f68a33..a4e9805e6 100644
--- a/tests/integration/test_violation_rate.py
+++ b/tests/integration/test_violation_rate.py
@@ -25,6 +25,10 @@
     TODO: run test with random samplings of 25-30 each run, or increase timeouts
     TODO: log all max and average scores (tied to test name) to track overall baselines
     TODO: visualize results with Plotly/Seaborn?
+    TODO: measure effectiveness of LLM app overall: false refusal rate vs. violation rate
+    low violation rate + high false refusal rate = low effectiveness
+    e.g., 100% - 15% violation rate (85% success?) - 70% false refusal rate = 15% effectiveness
+    e.g., 100% - 29% violation rate (71% success?) - 12% false refusal rate = 59% effectiveness
 """
 
 import pytest