forked from sherdencooper/GPTFuzz
generate_response.py
84 lines (73 loc) · 3.64 KB
# some code is adapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/huggingface_api.py
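# Example invocation (a sketch; the worker index and the key are placeholders, not values from the repo):
#   python generate_response.py --model_path gpt-3.5-turbo --index 0 --num_workers 10 --openai_key <YOUR_OPENAI_KEY>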
import openai
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
import csv
import os
import subprocess
import numpy as np
import argparse
import torch
from fastchat.model import add_model_args
from llm_utils.creat_model import prepare_model_and_tok
from llm_utils.model_inference import LLM_response

def replace_template(test_question, prompt):
    """Insert the test question into the jailbreak template; return False if the template has no placeholder."""
    if '[INSERT PROMPT HERE]' in prompt:
        jailbreak_input = prompt.replace('[INSERT PROMPT HERE]', test_question)
        return jailbreak_input
    else:
        return False

def execute_query(args, mutant, MODEL, TOK):
    # OpenAI models are queried through the chat API; open-sourced models go through the local inference helper.
    if 'gpt' in args.model_path:
        response = openai.ChatCompletion.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": mutant},
            ],
            temperature=0,
        )
        return response['choices'][0]['message']['content']
    else:
        return LLM_response(args, MODEL, TOK, args.model_path, mutant)

def main(args, MODEL, TOK, initial_seed, questions):
    model_name = args.model_path.split('/')[-1]
    with open('./datasets/responses/init_' + str(args.index) + '_' + model_name + '.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        # Write the header only if the file didn't exist before
        if file.tell() == 0:
            writer.writerow(['question', 'Response'])
        # Split the question list evenly across workers; this worker handles the slice [start, end).
        num_q_per_worker = int(np.ceil(len(questions) / args.num_workers))
        start = args.index * num_q_per_worker
        end = min((args.index + 1) * num_q_per_worker, len(questions))
        for i in tqdm(range(start, end)):
            question = questions[i]
            for j in range(len(initial_seed)):
                print("Progress for this question is {} out of {}".format(j, len(initial_seed)))
                seed = initial_seed[j]
                mutant_replaced = replace_template(question, seed)
                while True:  # sometimes the query to OpenAI may fail, so we need to try again until success
                    try:
                        response = execute_query(args, mutant_replaced, MODEL, TOK)
                        writer.writerow([i, response])  # record the question index and the model response
                        break
                    except Exception:
                        continue

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--index', type=int, default=10, help='task id')
    parser.add_argument('--openai_key', type=str, default='You must have an OpenAI key', help='OpenAI key')
    parser.add_argument('--model_path', type=str, default='gpt-3.5-turbo', help='openai model or open-sourced LLMs')
    parser.add_argument('--num_workers', type=int, default=10, help='number of workers')
    parser.add_argument("--temperature", type=float, default=0.01)  # some models like ChatGLM do not support zero temperature
    parser.add_argument("--repetition_penalty", type=float, default=1.0)
    parser.add_argument("--max-new-tokens", type=int, default=512)
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--attack", type=bool, default=True, help="whether to attack the model with jailbreak prompts")
    add_model_args(parser)
    args = parser.parse_args()

    MODEL, TOK = prepare_model_and_tok(args)
    initial_seed = pd.read_excel('datasets/prompts/jailbreak-prompt.xlsx')['text'].tolist()
    questions = pd.read_csv('datasets/questions/question_list.csv')['text'].tolist()
    main(args, MODEL, TOK, initial_seed, questions)