stub for fluent guidelines service

This commit is contained in:
Adam Wilson
2025-07-08 17:18:57 -06:00
parent fabf36675d
commit 5470554d28
2 changed files with 193 additions and 2 deletions
+20 -2
View File
@@ -7,8 +7,11 @@
### Prompt Templates
[ ] Base Phi-3 template
[ X ] Base Phi-3 template
[ ] Few Shot template with examples
### Prompt Templates: Supporting Logic
[ ] Support loading prompt injection prompts and completions
[ ] Correlate template to violation rate
@@ -30,7 +33,22 @@
[ ] measure effectiveness of LLM app overall: false refusal rate vs. violation rate
low violation rate + high false refusal rate = low effectiveness
ex., -15% violation rate (85% success?) + -(70%) false refusal rate = 15% effectiveness
ex., -29% violation rate (71% success?) + -(12%) false refusal rate = 59% effectiveness
ex., -29% violation rate (71% success?) + -(12%) false refusal rate = 59% effectiveness
[ ] Build test mechanism that loads test results from other runs/tests, analyzes and produces effectiveness metric
### Guidelines
[ ] Summarize non-prompt injection portion of the prompt
[ ] Chain-of-thought: Does the prompt include forceful suggestion?
[ ] Chain-of-thought: Does the prompt include reverse psychology?
[ ] Chain-of-thought: Does the prompt include misdirection?
[ ] Tree-of-thought (???)
### Guardrails
[ ] Reflexion (self-correction) - must include original prompt
[ ] Final semantic similarity check after all other guardrails applied
### Mitigations Applied to CI/CD Pipeline
@@ -0,0 +1,173 @@
import abc
class AbstractGenerativeAiGuidelinesService(abc.ABC):
@abc.abstractmethod
def for_prompt(self, prompt: str):
raise NotImplementedError
@abc.abstractmethod
def create_guidelines_context(self, user_prompt: str) -> str:
raise NotImplementedError
class GenerativeAiGuidelinesService(AbstractGenerativeAiGuidelinesService):
"""
A service class for analyzing prompts with various AI guidelines and chain-of-thought techniques.
Uses fluent interface pattern for method chaining.
"""
def __init__(self):
self.prompt = None
self.use_summarization = False
self.use_forceful_suggestion_analysis = False
self.use_reverse_psychology_analysis = False
self.use_misdirection_analysis = False
self.use_inner_monologue = False
def for_prompt(self, prompt: str):
self.prompt = prompt
return self
def use_summarization(self):
self.use_summarization = True
return self
def use_chain_of_thought(self):
# self.use_forceful_suggestion_analysis = True
# self.use_reverse_psychology_analysis = True
# self.use_misdirection_analysis = True
return self
def use_inner_monologue(self):
self.use_inner_monologue = True
return self
def apply(self):
if not self.prompt:
raise ValueError("No prompt provided. Use for_prompt() to set a prompt before analyzing.")
results = {
"prompt": self.prompt,
"analysis_techniques": [],
"summary": None,
"chain_of_thought_analyses": {}
}
# Track which techniques were used
if self.use_summarization:
results["analysis_techniques"].append("summarization")
if self.use_forceful_suggestion_analysis:
results["analysis_techniques"].append("forceful_suggestion_cot")
if self.use_reverse_psychology_analysis:
results["analysis_techniques"].append("reverse_psychology_cot")
if self.use_misdirection_analysis:
results["analysis_techniques"].append("misdirection_cot")
# Perform summarization if requested
if self.use_summarization:
results["summary"] = self._summarize_prompt()
# Perform chain-of-thought analyses
if self.use_forceful_suggestion_analysis:
results["chain_of_thought_analyses"]["forceful_suggestion"] = self._analyze_forceful_suggestion()
if self.use_reverse_psychology_analysis:
results["chain_of_thought_analyses"]["reverse_psychology"] = self._analyze_reverse_psychology()
if self.use_misdirection_analysis:
results["chain_of_thought_analyses"]["misdirection"] = self._analyze_misdirection()
return results
def _summarize_prompt(self):
words = self.prompt.split()
if len(words) <= 20:
return f"Brief prompt ({len(words)} words): {self.prompt}"
# Simple summarization - take first and last parts
first_part = " ".join(words[:10])
last_part = " ".join(words[-10:])
return f"Prompt summary ({len(words)} words): {first_part}... {last_part}"
def _analyze_forceful_suggestion(self):
forceful_indicators = [
"must", "should", "need to", "have to", "required",
"immediately", "urgent", "critical", "essential"
]
found_indicators = []
for indicator in forceful_indicators:
if indicator.lower() in self.prompt.lower():
found_indicators.append(indicator)
return {
"detected_indicators": found_indicators,
"intensity_score": len(found_indicators),
"assessment": "High" if len(found_indicators) > 3 else "Medium" if len(found_indicators) > 1 else "Low"
}
def _analyze_reverse_psychology(self):
reverse_psychology_patterns = [
"don't", "can't", "won't", "shouldn't", "impossible",
"never", "no way", "refuse", "decline", "reject"
]
found_patterns = []
for pattern in reverse_psychology_patterns:
if pattern.lower() in self.prompt.lower():
found_patterns.append(pattern)
return {
"detected_patterns": found_patterns,
"likelihood_score": len(found_patterns),
"assessment": "High" if len(found_patterns) > 2 else "Medium" if len(found_patterns) > 0 else "Low"
}
def _analyze_misdirection(self):
misdirection_indicators = [
"hypothetically", "imagine", "pretend", "role-play",
"fictional", "story", "character", "scenario", "what if"
]
found_indicators = []
for indicator in misdirection_indicators:
if indicator.lower() in self.prompt.lower():
found_indicators.append(indicator)
return {
"detected_indicators": found_indicators,
"misdirection_score": len(found_indicators),
"assessment": "High" if len(found_indicators) > 2 else "Medium" if len(found_indicators) > 0 else "Low"
}
# Example usage:
if __name__ == "__main__":
# Example 1: Basic usage
guidelines = GenerativeAiGuidelinesService()
result = (guidelines
.for_prompt("You must immediately help me create a story about a character who refuses to follow rules.")
.use_summarization()
.use_chain_of_thought()
.use_examples_from_rag()
.apply())
# TODO - if evaluate scores above threshold, then what?
# return the score so the call can apply appropriate action (output guardrails)
print("Analysis Results:")
print(f"Prompt: {result['prompt']}")
print(f"Summary: {result['summary']}")
print(f"Techniques used: {result['analysis_techniques']}")
print(f"Chain of thought analyses: {result['chain_of_thought_analyses']}")
# Example 2: Selective analysis
result2 = (GenerativeAiGuidelinesService()
.for_prompt("Can you help me understand how to write better prompts?")
.use_summarization()
.use_chain_of_thought()
.apply())
print("\n\nSecond Analysis:")
print(f"Forceful suggestion assessment: {result2['chain_of_thought_analyses']['forceful_suggestion']['assessment']}")