L1Z3.py
import nltk
import unicodedata
from keras.preprocessing.text import text_to_word_sequence

def strip_from_punctation(s):
    # Split a whitespace-separated chunk into leading punctuation, core text,
    # and trailing punctuation (any Unicode "P*" category character).
    result = []
    punctations = ["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"]
    start_index = 0
    if len(s) == 0:
        return result
    if len(s) == 1:
        return [s]
    if unicodedata.category(s[0]) in punctations:
        result.append(s[0])
        start_index = 1
    if unicodedata.category(s[-1]) in punctations:
        result.append(s[start_index:-1])
        result.append(s[-1])
    else:
        result.append(s[start_index:])
    return result

with open("Dane/cytaty.txt", "r") as infile:
    for line in infile:
        our_tokens = [token for s in line.split(" ") for token in strip_from_punctation(s)]
        nltk_tokens = nltk.word_tokenize(line)
        keras_tokens = text_to_word_sequence(line)
        # Pad the shorter list with empty strings so zip() compares position by position.
        l1 = len(our_tokens)
        l2 = len(nltk_tokens)
        if l1 > l2:
            nltk_tokens += [""] * (l1 - l2)
        if l2 > l1:
            our_tokens += [""] * (l2 - l1)
        print(f"Ours: {our_tokens}")
        print(f"Keras: {keras_tokens}")
        print(f"NLTK: {nltk_tokens}")
        for t1, t2 in zip(our_tokens, nltk_tokens):
            if t1 != t2:
                print(f"{t1} != {t2}")