-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathattack_utils.py
More file actions
191 lines (146 loc) · 5.64 KB
/
attack_utils.py
File metadata and controls
191 lines (146 loc) · 5.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
"""
Common Utilities for Attack Scripts
This module contains shared utility functions used across different attack scripts
(webshop.py, injecagent.py, agentharm.py) to reduce code duplication.
Common functionality includes:
- Model loading and configuration
- File path operations
- Resume functionality
- Result saving and loading
"""
import os
import re
import pickle
from typing import List, Tuple, Dict, Any, Optional
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
def get_model_name(model_id: str) -> str:
    """Resolve a HuggingFace model ID to the short name used for data paths.

    Args:
        model_id: HuggingFace model identifier

    Returns:
        'llama' or 'ministral' when the ID matches a known family
        (case-insensitive substring match), otherwise the ID with
        '/' replaced by '_'
    """
    lowered = model_id.lower()
    # Known model families map to fixed short names; order matters only
    # in that 'llama' is checked first, mirroring the original chain.
    for family in ('llama', 'ministral'):
        if family in lowered:
            return family
    return model_id.replace('/', '_')
def sanitize_filename(name: str) -> str:
    """Create filesystem-friendly filename from model ID.

    Args:
        name: Original filename or model ID

    Returns:
        Sanitized filename safe for filesystem use
    """
    # Collapse every run of non-alphanumeric characters into one underscore.
    non_alnum = re.compile(r'[^A-Za-z0-9]+')
    return non_alnum.sub('_', name)
def load_model_and_tokenizer(model_id: str) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load and configure model and tokenizer with consistent settings.

    Args:
        model_id: HuggingFace model identifier

    Returns:
        Tuple of (model, tokenizer) ready for attack
    """
    print(f"Loading model: {model_id}")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Some tokenizers ship without a pad token; reuse EOS so padding works.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    return model, tokenizer
def create_output_directory(output_path: str) -> None:
    """Create the parent directory for an output file if it doesn't exist.

    Args:
        output_path: Full path to output file

    Note:
        A bare filename (no directory component) is treated as a no-op:
        os.path.dirname returns '' for it, and os.makedirs('') would
        raise FileNotFoundError.
    """
    directory = os.path.dirname(output_path)
    if directory:
        os.makedirs(directory, exist_ok=True)
def load_existing_results(output_path: str, verbose: bool = True) -> Tuple[List[Any], int, int]:
    """Load existing attack results for resume functionality.

    Args:
        output_path: Path to existing results file
        verbose: Whether to print status messages

    Returns:
        Tuple of (results_list, start_index, success_count); empty state
        ([], 0, 0) when no file exists or loading fails.
    """
    if not os.path.exists(output_path):
        return [], 0, 0
    try:
        # NOTE(review): pickle.load on an attacker-controlled file is unsafe;
        # assumed here that result files are locally produced — confirm.
        with open(output_path, 'rb') as handle:
            loaded = pickle.load(handle)
        # Re-derive the running success count from the saved results.
        successes = sum(int(get_success_status(item)) for item in loaded)
        if verbose:
            print(f"Resuming from index {len(loaded)}. Current ASR: {successes}/{len(loaded)}")
        return loaded, len(loaded), successes
    except Exception as e:
        if verbose:
            print(f"Failed to load existing results: {e}")
        return [], 0, 0
def get_success_status(result) -> bool:
    """Extract success status from UDora result object.

    Args:
        result: UDoraResult object with `best_success` / `last_success`
            attributes, each either a scalar or a list of flags

    Returns:
        True if either the best or the last attempt succeeded
    """
    best, last = result.best_success, result.last_success
    # The type of best_success decides how BOTH fields are read,
    # matching the original paired handling.
    if isinstance(best, list):
        # List-valued results carry per-sample flags; use the first one.
        best_flag = bool(best[0]) if best else False
        last_flag = bool(last[0]) if last else False
    else:
        best_flag = bool(best or False)
        last_flag = bool(last or False)
    return best_flag or last_flag
def save_results_incrementally(results: List[Any], output_path: str) -> None:
    """Persist the full result list to disk, overwriting any previous file.

    Called after each attack so a crash loses at most one result.

    Args:
        results: List of attack results
        output_path: Path to save results
    """
    with open(output_path, 'wb') as handle:
        pickle.dump(results, handle)
def print_attack_progress(
    current_idx: int,
    total_count: int,
    success: bool,
    success_count: int,
    start_index: int = 0,
    extra_info: str = ""
) -> None:
    """Print standardized attack progress information.

    Args:
        current_idx: Current attack index (0-based)
        total_count: Total number of attacks
        success: Whether current attack succeeded
        success_count: Total number of successful attacks
        start_index: Starting index for resumed attacks
        extra_info: Additional information to display
    """
    # ASR denominator counts only attacks attempted in this run.
    attempted = current_idx + 1 - start_index
    parts = [
        f"Goal {current_idx+1}/{total_count}",
        f"Success: {success}",
        f"ASR: {success_count}/{attempted}",
    ]
    if extra_info:
        parts.append(extra_info)
    print(" - ".join(parts))
def print_final_results(success_count: int, total_results: int, output_path: str) -> None:
    """Print final attack results summary.

    Args:
        success_count: Number of successful attacks
        total_results: Total number of attack results
        output_path: Path where results were saved
    """
    # Avoid division by zero when no attacks were run.
    if total_results:
        rate = success_count / total_results
    else:
        rate = 0
    summary = f"Attack completed. Final ASR: {success_count}/{total_results} ({rate:.2%})"
    print(summary)
    print(f"Results saved to: {output_path}")