-
Notifications
You must be signed in to change notification settings - Fork 0
/
sen2inds.py
92 lines (69 loc) · 2.64 KB
/
sen2inds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#-*- coding: utf_8 -*-
import json
import sys, io
import jieba
import random
# Module-level side effect: replace sys.stdout with a wrapper around its
# underlying byte buffer that encodes text as GB18030.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030') # Change the default encoding of standard output
# JSON-lines training data; read by json2txt().
trainFile = 'baike_qa2019/my_traindata.json'
# Validation data path; not referenced by the code visible in this file.
validFile = 'baike_qa2019/my_validdata.json'
# Stop-word list, one entry per line; read via read_stopword().
stopwordFile = 'stopword.txt'
# "<word> <index>" vocabulary file; read via get_worddict().
wordLabelFile = 'wordLabel.txt'
# Output file that json2txt() writes the encoded training rows to.
trainDataVecFile = 'traindata_vec.txt'
# Output path for encoded validation rows; only used in a commented-out line.
validDataVecFile = 'validdata_vec.txt'
# Number of word slots per encoded title (each row also carries one label slot).
maxLen = 20
# "<label> <index>" label file; read via read_labelFile().
labelFile = 'label.txt'
def read_labelFile(file):
    """Load the label vocabulary from a space-separated text file.

    Every non-blank line is parsed as ``<label> <index>``.

    Args:
        file: Path of the UTF-8 encoded label file.

    Returns:
        A ``(label_w2n, label_n2w)`` tuple: a dict mapping each label name
        to its integer index, and the inverse dict mapping index to name.

    Raises:
        IndexError: If a non-blank line has no second space-separated field.
        ValueError: If the second field cannot be parsed as an integer.
    """
    # A context manager guarantees the handle is closed, even on a parse error.
    with open(file, 'r', encoding='utf_8') as fh:
        rows = fh.read().split('\n')

    label_w2n = {}
    label_n2w = {}
    for row in rows:
        # Skip blank rows -- in particular the empty string that a trailing
        # newline produces -- which would otherwise raise IndexError below.
        if not row.strip():
            continue
        fields = row.split(' ')
        name_w = fields[0]
        name_n = int(fields[1])
        label_w2n[name_w] = name_n
        label_n2w[name_n] = name_w
    return label_w2n, label_n2w
def read_stopword(file):
    """Read the stop-word file and return its lines.

    Args:
        file: Path of the UTF-8 encoded stop-word file, one entry per line.

    Returns:
        A list of strings obtained by splitting the file content on ``'\\n'``.
        If the file ends with a newline, the list ends with an empty string.
    """
    # Close the handle deterministically instead of relying on garbage
    # collection.
    with open(file, 'r', encoding='utf_8') as fh:
        return fh.read().split('\n')
def get_worddict(file):
    """Load the word vocabulary from a space-separated text file.

    Every non-empty line is parsed as ``<word> <index>``.

    Args:
        file: Path of the UTF-8 encoded vocabulary file.

    Returns:
        A ``(word2ind, ind2word)`` tuple: a dict mapping each word to its
        integer index, and the inverse dict mapping index to word.

    Raises:
        IndexError: If a non-empty line has no second space-separated field.
        ValueError: If the second field cannot be parsed as an integer.
    """
    # Close the handle deterministically; empty lines are dropped up front.
    with open(file, 'r', encoding='utf_8') as fh:
        rows = [row for row in fh.read().split('\n') if row]

    word2ind = {}
    for row in rows:
        fields = row.split(' ')
        word2ind[fields[0]] = int(fields[1])
    ind2word = {ind: word for word, ind in word2ind.items()}
    return word2ind, ind2word
def _fit_to_length(seq, size, pad=0):
    """Return *seq* truncated or right-padded with *pad* to exactly *size* items."""
    if len(seq) >= size:
        return seq[:size]
    return seq + [pad] * (size - len(seq))


def json2txt():
    """Encode the training JSON-lines file as rows of comma-separated indices.

    Reads ``trainFile`` (one JSON object per non-empty line), shuffles the
    records, and writes one row per record to ``trainDataVecFile``. Each row
    holds ``maxLen + 1`` integers: first the label index looked up from the
    first two characters of the record's ``'category'``, then the vocabulary
    indices of the jieba-segmented ``'title'`` with stop words removed,
    truncated or padded with ``0`` to ``maxLen`` word slots. Every value,
    including the last one, is followed by a comma.

    Raises:
        KeyError: If a record lacks ``'title'``/``'category'``, its category
            prefix is not in the label file, or a title token is missing
            from the vocabulary file.
    """
    label_dict, _ = read_labelFile(labelFile)  # label name -> label index
    word2ind, _ = get_worddict(wordLabelFile)  # word -> vocabulary index
    # A set gives O(1) membership tests; the list would be scanned per token.
    stopwords = set(read_stopword(stopwordFile))
    # One label slot plus maxLen word slots; derived from maxLen so the
    # truncation and padding lengths cannot drift apart.
    row_len = maxLen + 1

    with open(trainFile, 'r', encoding='utf_8') as src:
        records = [rec for rec in src.read().split('\n') if rec]
    random.shuffle(records)

    # The output is opened only after all inputs were read successfully, and
    # the context manager flushes and closes it when the loop finishes.
    with open(trainDataVecFile, 'w') as out:
        for rec in records:
            item = json.loads(rec)
            cla_ind = label_dict[item['category'][0:2]]
            # The first index of every row is the label index.
            title_ind = [cla_ind]
            for w in jieba.cut(item['title'], cut_all=False):
                if w in stopwords:
                    continue
                # NOTE(review): an out-of-vocabulary token raises KeyError
                # here -- confirm whether such tokens should be skipped.
                title_ind.append(word2ind[w])
            title_ind = _fit_to_length(title_ind, row_len)
            # Keep the trailing comma after the last value of each row.
            out.write(','.join(map(str, title_ind)) + ',\n')
def main():
    """Script entry point: run the conversion performed by json2txt()."""
    json2txt()


# Only run the conversion when executed as a script, not when imported.
if __name__ == "__main__":
    main()