-
Notifications
You must be signed in to change notification settings - Fork 0
/
creating_original_dataset.py
59 lines (49 loc) · 1.47 KB
/
creating_original_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import gzip
import csv
import glob
# Load the initial .csv file (the one without the document text) and keep the
# third comma-separated field of every other line, starting with the first.
# NOTE(review): odd-numbered lines (1, 3, 5, ...) are skipped -- presumably
# they are blank/filler rows in the input; confirm against the file.
# The file is opened in a `with` block so the handle is closed as soon as the
# list is built instead of leaking for the rest of the script.
with open("qrels_data_sort_no_dupl.csv", "r") as f:
    qrels = [
        inpu.split(",")[2]
        for count, inpu in enumerate(f)
        if count % 2 == 0
    ]
# Collect every c4-train shard archive under the corpus directory; the five
# "?" wildcards stand for the five-character shard number in the file name.
path = "/var/scratch3/corpora/trec21_health_misinfo/c4"
pattern = "?????"
files = list(
    glob.iglob(f'{path}/en.noclean.trec.format/c4-train.{pattern}-of-07168.json.gz')
)
# Order the shard paths lexicographically.
files.sort()
# Build a hash table whose keys are gzip shard numbers (five-character
# strings) and whose values are the ascending line numbers of the documents
# wanted from that shard.
#
# Each qrels entry is sliced positionally: characters 9-13 are the shard
# number, and the text from index 24 up to the first space is the line number.
# NOTE(review): this presumably matches ids shaped like
# "c4-train.NNNNN-of-07168.<line>" -- confirm against the input file.
#
# Grouping with setdefault (rather than flushing a running list whenever the
# shard changes) guarantees the last shard's documents are stored too, and
# merges entries for the same shard even if they are not contiguous.
hash_table_gz = {}
for i in qrels:
    hash_table_gz.setdefault(i[9:14], []).append(int(i[24:].split(" ")[0]))

# Line numbers must be ascending so they can be consumed in file order.
for help_list in hash_table_gz.values():
    help_list.sort()
# Parse the gzip shards and collect the wanted documents.
#
# `res` ends up as a list of [qrels entry, document line] pairs.  The n-th
# extracted document is paired with the n-th qrels entry, so this relies on
# `qrels` being ordered by shard and then by line number (the same order in
# which the sorted shard files are read).
res = []
for filepath in files:
    # The shard number is the five characters that follow "c4-train." in the
    # file name; taking it from the name keeps this independent of how long
    # the directory part of the path is.
    shard = filepath.rsplit("c4-train.", 1)[1][:5]
    wanted = hash_table_gz.get(shard)
    if not wanted:
        # Nothing is needed from this shard: do not even open it.
        continue
    with gzip.open(filepath) as o:
        for count, line in enumerate(o):
            if count != wanted[0]:
                continue
            # Decode only the lines that are actually kept.
            text = line.decode('utf-8')
            # A line number may be requested more than once; emit one row per
            # request so the pairing with `qrels` stays aligned.
            while wanted and wanted[0] == count:
                wanted.pop(0)
                print(qrels[len(res)])
                res.append([qrels[len(res)], text])
            if not wanted:
                # Every wanted line of this shard was found; skip the rest.
                break
# Write the collected rows to the output CSV file, one row per entry of `res`.
# The `with` block closes the file even if a write fails, newline="" lets the
# csv module control line endings, and UTF-8 is set explicitly because the
# document text was decoded from UTF-8 and may not fit the locale's default
# encoding.
with open("qrels_data_test_final.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(res)