prompt.py
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch


class Prompting(object):
    """
    This class helps us to implement a prompt-based learning model.
    """

    def __init__(self, **kwargs):
        """constructor

        Parameters
        ----------
        model: str
            path or name of a pre-trained masked language model from the HuggingFace Hub
        tokenizer: str, optional
            path to the tokenizer if a different tokenizer is used;
            otherwise leave it empty and the model path is reused
        """
        model_path = kwargs['model']
        tokenizer_path = kwargs['model']
        if "tokenizer" in kwargs.keys():
            tokenizer_path = kwargs['tokenizer']
        self.model = AutoModelForMaskedLM.from_pretrained(model_path)
        # load the tokenizer from tokenizer_path so that a custom tokenizer,
        # when one is given, is actually used
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
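    # Construction sketch (the model name below is an assumption; any masked
    # LM on the HuggingFace Hub should work):
    #   prompting = Prompting(model="bert-base-uncased")
    #   # or, with an explicitly supplied tokenizer path:
    #   prompting = Prompting(model="bert-base-uncased", tokenizer="bert-base-uncased")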
    def prompt_pred(self, text):
        """
        Predict the [MASK] token by ranking every token in the vocabulary,
        with the most likely token first.

        Parameters
        ----------
        text: str
            The text including a [MASK] token. Only a single mask is
            supported; if several [MASK] tokens are given, the first
            one is used.

        Returns
        -------
        list of (token, score)
            All tokens in the LM vocabulary along with their scores,
            sorted by score in descending order.
        """
        indexed_tokens = self.tokenizer(text, return_tensors="pt").input_ids
        tokenized_text = self.tokenizer.convert_ids_to_tokens(indexed_tokens[0])
        # take the first masked token
        mask_pos = tokenized_text.index(self.tokenizer.mask_token)
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(indexed_tokens)
            predictions = outputs[0]
        values, indices = torch.sort(predictions[0, mask_pos], descending=True)
        # values = torch.nn.functional.softmax(values, dim=0)
        result = list(zip(self.tokenizer.convert_ids_to_tokens(indices), values))
        self.scores_dict = {token: score for token, score in result}
        return result
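    # prompt_pred sketch, reusing `prompting` from the constructor example above
    # (the prompt text is an assumption, not fixed by the class):
    #   preds = prompting.prompt_pred("Because it was boring, I fell asleep. It was [MASK].")
    #   preds[:3]   # the three highest-scoring vocabulary tokens with their raw logits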
    def compute_tokens_prob(self, text, token_list1, token_list2):
        """
        Compare the mask scores of two candidate token lists.

        Parameters
        ----------
        text: str
            The text including a [MASK] token.
        token_list1: List[str]
            a list of positive-polarity tokens such as "good", "great"
        token_list2: List[str]
            a list of negative-polarity tokens such as "bad", "terrible"

        Returns
        -------
        torch.Tensor of shape (2,)
            softmax([score1, score2]), where score1 and score2 are the
            summed mask scores of the first and second token list; the
            first entry is the relative probability of the first list.
        """
        _ = self.prompt_pred(text)
        score1 = [self.scores_dict[token1] if token1 in self.scores_dict.keys() else 0
                  for token1 in token_list1]
        score1 = sum(score1)
        score2 = [self.scores_dict[token2] if token2 in self.scores_dict.keys() else 0
                  for token2 in token_list2]
        score2 = sum(score2)
        softmax_rt = torch.nn.functional.softmax(torch.Tensor([score1, score2]), dim=0)
        return softmax_rt
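    # compute_tokens_prob sketch for sentiment-style scoring (the prompt and the
    # candidate token lists are assumptions, not fixed by the class):
    #   probs = prompting.compute_tokens_prob(
    #       "The movie was dull. Overall it was [MASK].",
    #       token_list1=["good", "great"],
    #       token_list2=["bad", "terrible"])
    #   probs[0] > probs[1]   # True when the positive list scores higher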
    def fine_tune(self, sentences, labels, prompt=" Since it was [MASK].",
                  goodToken="good", badToken="bad"):
        """
        Fine-tune the model on (sentence, label) pairs, where label 0
        corresponds to goodToken and label 1 to badToken at the mask position.
        """
        good = self.tokenizer.convert_tokens_to_ids(goodToken)
        bad = self.tokenizer.convert_tokens_to_ids(badToken)
        # transformers.AdamW is deprecated; use the PyTorch implementation instead
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-3)
        for sen, label in zip(sentences, labels):
            tokenized_text = self.tokenizer.tokenize(sen + prompt)
            indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
            tokens_tensor = torch.tensor([indexed_tokens])
            # take the first masked token
            mask_pos = tokenized_text.index(self.tokenizer.mask_token)
            outputs = self.model(tokens_tensor)
            predictions = outputs[0]
            # logits of the good/bad tokens at the mask position
            pred = predictions[0, mask_pos][[good, bad]]
            # CrossEntropyLoss expects unnormalized logits, so pass pred directly
            # rather than its softmax
            lossFunc = torch.nn.CrossEntropyLoss()
            loss = lossFunc(pred.unsqueeze(0), torch.tensor([label]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print("done!")