-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathattack_utils.py
More file actions
191 lines (146 loc) · 5.64 KB
/
attack_utils.py
File metadata and controls
191 lines (146 loc) · 5.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
"""
Common Utilities for Attack Scripts
This module contains shared utility functions used across different attack scripts
(webshop.py, injecagent.py, agentharm.py) to reduce code duplication.
Common functionality includes:
- Model loading and configuration
- File path operations
- Resume functionality
- Result saving and loading
"""
import os
import re
import pickle
from typing import List, Tuple, Dict, Any, Optional
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
def get_model_name(model_id: str) -> str:
    """Resolve a HuggingFace model ID to the short name used for data paths.

    Args:
        model_id: HuggingFace model identifier

    Returns:
        'llama' or 'ministral' when the ID matches a known family
        (case-insensitive substring match), otherwise the ID with
        '/' replaced by '_'
    """
    lowered = model_id.lower()
    # Known model families map to fixed short names; order matters only
    # in that 'llama' is checked first, mirroring the original chain.
    for family in ('llama', 'ministral'):
        if family in lowered:
            return family
    return model_id.replace('/', '_')
def sanitize_filename(name: str) -> str:
    """Create filesystem-friendly filename from model ID.

    Args:
        name: Original filename or model ID

    Returns:
        Sanitized filename safe for filesystem use
    """
    # Collapse every run of non-alphanumeric characters into one underscore.
    non_alnum = re.compile(r'[^A-Za-z0-9]+')
    return non_alnum.sub('_', name)
def load_model_and_tokenizer(model_id: str) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load and configure model and tokenizer with consistent settings.

    Args:
        model_id: HuggingFace model identifier

    Returns:
        Tuple of (model, tokenizer) ready for attack
    """
    print(f"Loading model: {model_id}")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Some tokenizers ship without a pad token; reuse EOS so padding works.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    return model, tokenizer
def create_output_directory(output_path: str) -> None:
    """Create the parent directory for an output file if it doesn't exist.

    Args:
        output_path: Full path to output file

    Note:
        A bare filename (no directory component) is treated as a no-op:
        os.path.dirname returns '' for it, and os.makedirs('') would
        raise FileNotFoundError.
    """
    directory = os.path.dirname(output_path)
    if directory:
        os.makedirs(directory, exist_ok=True)
def load_existing_results(output_path: str, verbose: bool = True) -> Tuple[List[Any], int, int]:
    """Load existing attack results for resume functionality.

    Args:
        output_path: Path to existing results file
        verbose: Whether to print status messages

    Returns:
        Tuple of (results_list, start_index, success_count); empty state
        ([], 0, 0) when no file exists or loading fails.
    """
    if not os.path.exists(output_path):
        return [], 0, 0
    try:
        # NOTE(review): pickle.load on an attacker-controlled file is unsafe;
        # assumed here that result files are locally produced — confirm.
        with open(output_path, 'rb') as handle:
            loaded = pickle.load(handle)
        # Re-derive the running success count from the saved results.
        successes = sum(int(get_success_status(item)) for item in loaded)
        if verbose:
            print(f"Resuming from index {len(loaded)}. Current ASR: {successes}/{len(loaded)}")
        return loaded, len(loaded), successes
    except Exception as e:
        if verbose:
            print(f"Failed to load existing results: {e}")
        return [], 0, 0
def get_success_status(result) -> bool:
    """Extract success status from UDora result object.

    Args:
        result: UDoraResult object with `best_success` / `last_success`
            attributes, each either a scalar or a list of flags

    Returns:
        True if either the best or the last attempt succeeded
    """
    best, last = result.best_success, result.last_success
    # The type of best_success decides how BOTH fields are read,
    # matching the original paired handling.
    if isinstance(best, list):
        # List-valued results carry per-sample flags; use the first one.
        best_flag = bool(best[0]) if best else False
        last_flag = bool(last[0]) if last else False
    else:
        best_flag = bool(best or False)
        last_flag = bool(last or False)
    return best_flag or last_flag
def save_results_incrementally(results: List[Any], output_path: str) -> None:
    """Persist the full result list to disk, overwriting any previous file.

    Called after each attack so a crash loses at most one result.

    Args:
        results: List of attack results
        output_path: Path to save results
    """
    with open(output_path, 'wb') as handle:
        pickle.dump(results, handle)
def print_attack_progress(
    current_idx: int,
    total_count: int,
    success: bool,
    success_count: int,
    start_index: int = 0,
    extra_info: str = ""
) -> None:
    """Print standardized attack progress information.

    Args:
        current_idx: Current attack index (0-based)
        total_count: Total number of attacks
        success: Whether current attack succeeded
        success_count: Total number of successful attacks
        start_index: Starting index for resumed attacks
        extra_info: Additional information to display
    """
    # ASR denominator counts only attacks attempted in this run.
    attempted = current_idx + 1 - start_index
    parts = [
        f"Goal {current_idx+1}/{total_count}",
        f"Success: {success}",
        f"ASR: {success_count}/{attempted}",
    ]
    if extra_info:
        parts.append(extra_info)
    print(" - ".join(parts))
def print_final_results(success_count: int, total_results: int, output_path: str) -> None:
    """Print final attack results summary.

    Args:
        success_count: Number of successful attacks
        total_results: Total number of attack results
        output_path: Path where results were saved
    """
    # Avoid division by zero when no attacks were run.
    if total_results:
        rate = success_count / total_results
    else:
        rate = 0
    summary = f"Attack completed. Final ASR: {success_count}/{total_results} ({rate:.2%})"
    print(summary)
    print(f"Results saved to: {output_path}")