AI_utils.py
import re
from math import ceil

import matplotlib.pyplot as plt
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


# Clean a raw text string and return it as a list of lowercase word tokens
def stripToTextVector(text):
    # Remove URLs with a regular expression
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub('', text)
    # Remove email addresses
    text = re.sub(r'\S*@\S*\s?', '', text)
    # Collapse newlines and repeated whitespace into single spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove distracting single quotes
    text = re.sub("'", "", text)
    # Remove digits
    text = ''.join(c for c in text if not c.isdigit())
    # Remove punctuation
    punctuations = ["?", ",", "“", ".", ";", ":", "!", "”", '"', "[", "]", "{", "}", "(", ")"]
    text = ''.join(c for c in text if c not in punctuations)
    # Replace slashes with spaces
    text = text.replace("/", " ")
    # Lowercase everything
    text = text.lower()
    return text.split()
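
# Illustrative example (the sample string below is an assumption, not project data):
#   stripToTextVector("Check https://example.com or mail me@test.org, it's totally FREE!")
# returns ['check', 'or', 'mail', 'its', 'totally', 'free'].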


# Create a vocabulary dict mapping each word (or label) to an integer index
def createVocab():
    vocab = {}
    # Add words from the vocabulary file, one word per line
    with open("vocab.txt", "r") as f:
        for i, word in enumerate(f):
            vocab[word.strip()] = i
    # Add special padding tokens after the regular words
    vocab['PAD_BEGIN'] = len(vocab)
    vocab['PAD_END'] = len(vocab)
    return vocab
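
# Illustrative sketch (assumption: the project ships a vocab.txt with one word per line).
# A file containing the three lines "real", "fake" and "news" would yield:
#   {'real': 0, 'fake': 1, 'news': 2, 'PAD_BEGIN': 3, 'PAD_END': 4}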


# Encode samples as tensors of vocabulary indices and labels as a float tensor
def myEncode(sentences, labels, vocab):
    # Pad each sample up to maxLen and truncate longer ones. Padding uses the
    # PAD_END token added in createVocab, so every padded position maps to a
    # valid vocabulary index.
    def pad(sample):
        while len(sample) < maxLen:
            sample.append('PAD_END')
        return sample[:maxLen]

    # maxLen is 80% of the longest sample length, rounded up
    maxLen = ceil(max(len(sample) for sample in sentences) * 0.8)
    sentences = [pad(sentence) for sentence in sentences]
    # Convert words to vocabulary indices and labels to 0 ("Real") / 1 (otherwise)
    sentences = [[vocab[word] for word in sentence] for sentence in sentences]
    labels = [0 if label == "Real" else 1 for label in labels]
    # Convert to tensors on the selected device
    sentences = torch.tensor(sentences, dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float32, device=device)
    return sentences, labels
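
# Illustrative example (toy data, not from the project): with
#   vocab = {'fake': 0, 'news': 1, 'real': 2, 'report': 3, 'PAD_BEGIN': 4, 'PAD_END': 5}
#   sentences = [['fake', 'news', 'report'], ['real', 'report']]
#   labels = ['Fake', 'Real']
# maxLen = ceil(3 * 0.8) = 3, the second sample is padded with PAD_END, and
# myEncode returns tensors [[0, 1, 3], [2, 3, 5]] and [1., 0.].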


# Plot the train and dev values of one measurement over the epochs and save the figure
def plotMeasurement(measurement, trainMeasure, devMeasure, epochs):
    epochsList = list(range(epochs))
    plt.figure()
    plt.title(measurement)
    plt.plot(epochsList, trainMeasure, label="Train")
    plt.plot(epochsList, devMeasure, label="Dev")
    plt.xlabel("Epochs")
    plt.ylabel(measurement)
    plt.locator_params(axis="x", integer=True, tight=True)  # show only whole epoch numbers on the x axis
    plt.legend()
    plt.savefig(measurement)
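

# Minimal usage sketch. All sample strings, labels and loss values below are
# assumptions made up for illustration; they are not part of the original project.
if __name__ == "__main__":
    raw = ["Breaking: a totally real story at https://example.com, it's true!",
           "Fake claim, mail hoax@spam.net for details"]
    cleaned = [stripToTextVector(t) for t in raw]
    print(cleaned)

    # Build a tiny inline vocabulary from the cleaned samples instead of loading vocab.txt
    toy_vocab = {word: i for i, word in enumerate(sorted({w for s in cleaned for w in s}))}
    toy_vocab['PAD_BEGIN'] = len(toy_vocab)
    toy_vocab['PAD_END'] = len(toy_vocab)

    x, y = myEncode(cleaned, ["Real", "Fake"], toy_vocab)
    print(x.shape, y)  # e.g. torch.Size([2, 7]) tensor([0., 1.])

    # Plot two made-up loss curves over five epochs; the figure is saved as "Loss.png"
    plotMeasurement("Loss", [0.9, 0.7, 0.55, 0.45, 0.4], [0.95, 0.8, 0.7, 0.66, 0.64], epochs=5)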