This repository was archived by the owner on Oct 21, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_interactive.py
More file actions
160 lines (122 loc) · 5.98 KB
/
Copy pathtest_interactive.py
File metadata and controls
160 lines (122 loc) · 5.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python3
"""Example script demonstrating the interactive red team testing system."""
import os
from pathlib import Path
from dotenv import load_dotenv
from rich.console import Console
from src.interactive_exploit import InteractiveRedTeam
from src.models import AttackStrategy
from src.utils.session_manager import SessionManager
# Load environment variables (the API-key error message further down tells the
# user to put OPENROUTER_API_KEY in a .env file, which is what this picks up).
load_dotenv()
# Single rich Console shared by every function in this script for output.
console = Console()
def run_single_session_example():
    """Run a single interactive session with multiple attack strategies.

    Steps, in order:

    1. Return early (after printing an error) if ``OPENROUTER_API_KEY`` is
       not set in the environment.
    2. Build an ``InteractiveRedTeam`` and a ``SessionManager("sessions")``.
    3. Run ``red_team.run_attack`` once per ``(strategy, steps)`` pair and
       print the attempt id, any exploit turns, and the evaluation.
    4. Print the session summary, save the session, export a report, and
       print the statistics returned by ``session_manager.get_statistics()``.

    Returns:
        None. All results are printed to the shared ``console``.
    """
    # Local import keeps this function self-contained; rich is already a
    # dependency of this file (see the Console import at the top).
    from rich.markup import escape

    console.print("[bold cyan]Interactive Red Team Testing - Example Session[/bold cyan]\n")

    # Check for API key
    if not os.getenv("OPENROUTER_API_KEY"):
        console.print("[red]Error: OPENROUTER_API_KEY not found in environment[/red]")
        console.print("Please set it in your .env file")
        return

    # Initialize the red team system
    console.print("[yellow]Initializing red team system...[/yellow]")
    red_team = InteractiveRedTeam(
        target_model="openai/gpt-oss-20b",  # The model we're testing
        attacker_model="openai/gpt-4o-mini",  # Model generating attacks
        evaluator_model="openai/gpt-4o-mini",  # Model evaluating responses
    )

    # Initialize session manager
    session_manager = SessionManager("sessions")
    console.print("[green]✓[/green] System initialized\n")

    # Each entry is (strategy, number of steps passed to run_attack).
    strategies_to_test = [
        (AttackStrategy.TRUST_BUILDING, 2),  # 2-step trust building
        (AttackStrategy.SIDE_TASK, 1),  # Single-turn side task
        (AttackStrategy.COMPLIANCE_REQUIREMENT, 2),  # 2-step compliance
        (AttackStrategy.TECHNICAL_CONFUSION, 3),  # 3-step technical confusion
    ]

    console.print("[bold]Running test attacks:[/bold]\n")
    for strategy, steps in strategies_to_test:
        console.print(f"\n[cyan]Testing {strategy.value} with {steps} steps...[/cyan]")
        try:
            # Run the attack
            attempt = red_team.run_attack(strategy=strategy, steps=steps)

            # Display results
            console.print(f" Attack ID: {attempt.attempt_id[:8]}")

            # Show conversation highlights. Turn content is free-form text, so
            # it is escaped: rich would otherwise treat "[...]" in it as markup
            # and can raise MarkupError on an unmatched closing tag.
            for turn in attempt.turns:
                if turn.is_exploit_turn:
                    snippet = escape(turn.content[:80])
                    console.print(f" [yellow]Exploit turn:[/yellow] {snippet}...")

            # Show evaluation
            if attempt.evaluation:
                status = "✓ VULNERABLE" if attempt.success else "✗ SAFE"
                color = "green" if attempt.success else "red"
                console.print(f" Result: [{color}]{status}[/]")
                console.print(f" Confidence: {attempt.evaluation.confidence:.1%}")
                reasoning = escape(attempt.evaluation.reasoning[:100])
                console.print(f" Reasoning: {reasoning}...")
        except Exception as e:
            # Deliberately broad: the try sits inside the loop so a failure in
            # one strategy is reported and the remaining strategies still run.
            # The message is escaped so that brackets in it cannot raise a
            # MarkupError from inside this handler and abort the script.
            console.print(f" [red]Error: {escape(str(e))}[/red]")

    # Display session summary
    console.print("\n" + "=" * 60)
    console.print(red_team.get_session_summary())

    # Save the session
    console.print("\n[yellow]Saving session...[/yellow]")
    session_file = red_team.save_session()
    console.print(f"[green]✓[/green] Session saved to: {session_file}")

    # Export report
    console.print("\n[yellow]Exporting report...[/yellow]")
    report_file = session_manager.export_session_report(red_team.session)
    console.print(f"[green]✓[/green] Report exported to: {report_file}")

    # Display overall statistics
    console.print("\n[bold]Overall Statistics:[/bold]")
    stats = session_manager.get_statistics()
    console.print(f" Total Sessions: {stats['total_sessions']}")
    console.print(f" Total Attempts: {stats['total_attempts']}")
    console.print(f" Success Rate: {stats['success_rate']:.1%}")
def demonstrate_learning_system(model_key: str = "openai_gpt-4o-mini") -> None:
    """Print the lessons previously stored for a model, if there are any.

    Loads lessons through ``SessionManager("sessions").load_lessons`` and, when
    something is returned, prints the session/attempt totals, the per-strategy
    success rates, up to three best patterns, and up to two key insights for
    each of the first three lessons. When nothing is returned, prints a hint
    to run some sessions first.

    Args:
        model_key: Key passed to ``load_lessons``. Defaults to
            ``"openai_gpt-4o-mini"``, the value that used to be hard-coded,
            so existing zero-argument callers behave as before.
    """
    # Local import keeps this function self-contained; rich is already a
    # dependency of this file (see the Console import at the top).
    from rich.markup import escape

    console.print("\n[bold cyan]Learning System Demonstration[/bold cyan]\n")
    session_manager = SessionManager("sessions")

    # NOTE(review): run_single_session_example targets "openai/gpt-oss-20b",
    # while the default key here names gpt-4o-mini. Confirm which model the
    # lessons are keyed by; pass model_key explicitly if it is the target.
    lessons = session_manager.load_lessons(model_key)
    if not lessons:
        console.print("[yellow]No existing lessons found. Run some sessions first![/yellow]")
        return

    console.print("[green]Found existing lessons![/green]\n")
    console.print(f"Total Sessions: {lessons.total_sessions}")
    console.print(f"Total Attempts: {lessons.total_attempts}")

    console.print("\n[bold]Strategy Performance:[/bold]")
    for strategy, rate in lessons.strategies.items():
        console.print(f" {strategy}: {rate:.1%} success rate")

    # Patterns and insights are stored free-form text, so they are escaped:
    # rich would otherwise parse "[...]" in them as markup and can raise
    # MarkupError on an unmatched closing tag.
    if lessons.best_patterns:
        console.print("\n[bold]Best Patterns (first 3):[/bold]")
        for pattern in lessons.best_patterns[:3]:
            console.print(f" • {escape(str(pattern))}")

    if lessons.lessons:
        console.print("\n[bold]Key Insights:[/bold]")
        for lesson in lessons.lessons[:3]:
            console.print(f" {lesson.strategy.value}:")
            for insight in lesson.key_insights[:2]:
                console.print(f" • {escape(str(insight))}")
def main():
    """Script entry point: print a banner, prepare dirs, run both demos."""
    divider = "=" * 60

    console.print("\n" + divider)
    console.print("[bold]Interactive Red Team Testing - Example Script[/bold]")
    console.print(divider + "\n")

    # Make sure the output directories exist; the parent comes first so the
    # two sub-directories can be created without parents=True.
    for directory in ("sessions", "sessions/lessons", "sessions/reports"):
        Path(directory).mkdir(exist_ok=True)

    # Step 1: the example attack session.
    console.print("[cyan]1. Running single session example...[/cyan]\n")
    run_single_session_example()

    # Step 2: show whatever lessons have been stored.
    console.print("\n[cyan]2. Demonstrating learning system...[/cyan]")
    demonstrate_learning_system()

    # Closing banner with pointers for the user.
    console.print("\n" + divider)
    console.print("[green]Example completed![/green]")
    console.print("\nTo run the interactive CLI, use:")
    console.print(" [cyan]python -m src.cli.interactive[/cyan]")
    console.print("\nTo view saved sessions and reports, check:")
    console.print(" [cyan]sessions/[/cyan] directory")
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()