forked from dhl123/Airtag-2023
-
Notifications
You must be signed in to change notification settings - Fork 1
/
create_vocab_atlas.py
executable file
·71 lines (67 loc) · 1.76 KB
/
create_vocab_atlas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import csv
import re
import time
# Builds a token vocabulary and a tokenized training corpus from a
# comma-separated log file.
#
# For every input line the first comma-separated field is dropped, the
# remaining fields are split into tokens on a fixed set of separators, and
# each run of consecutive empty fields is replaced by a single token holding
# the length of that run. Two files are written next to the working
# directory: 'vocab_<filename>' (one token per line) and 'train_<filename>'
# (one space-joined token sequence per input line).

# Input file to process; set this before running the script. The output
# names are built by prepending 'vocab_' / 'train_' to this exact string, so
# a value containing a directory component gets the prefix in front of the
# directory part.
filename = ''

# Tokens that occur fewer than this many times are left out of the vocabulary.
min_number = 8

# Tokens that are always written to the vocabulary, whatever their frequency.
SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "sim", 'sim_no', 'unlabeled', 'placeholder']

# Separators used to break a field into tokens: backslash, "->", "_", "-",
# ".", "/", ":", "(" and ")". Written as a raw string so the backslashes reach
# the regex engine untouched (the non-raw spelling relies on invalid string
# escapes), and compiled once because it is applied to every field.
# NOTE: add field-wise separators here if needed; n/a in atlas datasets.
SEPARATOR_RE = re.compile(r'\\|->|_|-|\.|/|:|\(|\)')


def tokenize_line(line):
    """Return the list of tokens for one comma-separated input line.

    The first field is discarded. Spaces are removed from every other field.
    A non-empty field contributes its non-empty pieces after splitting on
    SEPARATOR_RE; a run of consecutive empty fields contributes one token,
    the decimal length of the run, placed where the run occurred (including
    a run at the very end of the line).
    """
    tokens = []
    empty_run = 0
    for field in line.split(',')[1:]:
        candidate = field.replace(' ', '')
        if not candidate:
            empty_run += 1
            continue
        if empty_run:
            # Flush the pending run of empty fields before the next real one.
            tokens.append(str(empty_run))
            empty_run = 0
        tokens.extend(piece for piece in SEPARATOR_RE.split(candidate) if piece)
    if empty_run:
        tokens.append(str(empty_run))
    return tokens


def build_corpus(lines):
    """Tokenize *lines* and count token frequencies.

    Returns a ``(sentences, word_counts)`` pair: ``sentences`` holds one
    space-joined token string per input line, and ``word_counts`` maps each
    token to its number of occurrences. A zero-length line is recorded as the
    string "\\n" and contributes nothing to the counts.
    """
    sentences = []
    word_counts = {}
    for line in lines:
        if not line:
            sentences.append("\n")
            continue
        tokens = tokenize_line(line)
        sentences.append(" ".join(tokens))
        for token in tokens:
            word_counts[token] = word_counts.get(token, 0) + 1
    return sentences, word_counts


def select_vocab(word_counts, min_count=8):
    """Return the set of tokens seen at least *min_count* times, plus SPECIAL_TOKENS."""
    vocab = {token for token, seen in word_counts.items() if seen >= min_count}
    vocab.update(SPECIAL_TOKENS)
    return vocab


def main(path, min_count=8):
    """Read *path*, then write 'vocab_<path>' and 'train_<path>'.

    Prints the seconds spent reading and tokenizing the input. Raises the
    usual ``OSError`` subclasses if *path* cannot be read or the outputs
    cannot be written.
    """
    begin = time.time()
    # Context managers close the handles even if reading or writing fails.
    with open(path, 'r', encoding='utf-8') as source:
        lines = source.read().split('\n')
    sentences, word_counts = build_corpus(lines)
    print(time.time() - begin)

    vocab = select_vocab(word_counts, min_count)
    with open('vocab_' + path, 'w', encoding='utf-8') as vocab_file:
        # Sorted so the file is identical from run to run; plain set order
        # varies with string hash randomization.
        vocab_file.write('\n'.join(sorted(vocab)))
    with open('train_' + path, 'w', encoding='utf-8') as train_file:
        train_file.write("\n".join(sentences))


if __name__ == "__main__":
    main(filename, min_number)