import argparse
import itertools
import os
import random
import sys
import time
import urllib.request

import bs4 as bs

import rule_classifier as paper_classifier


def label_paper(paper_id=None, paper_meta=None, cased_regexes=None, feature=None):
    """Label one paper.

    :param paper_id: The paper ID (e.g. an ACL Anthology ID)
    :param paper_meta: Metadata element for the paper from the Anthology XML (used for the title)
    :param cased_regexes: Concept-matching rules loaded from the concept template
    :param feature: Which part of the paper to use for labeling, i.e. "title" or "fulltext"
    :return: Nothing.
    """
if not os.path.isfile(f'papers/{paper_id}.pdf'):
os.makedirs(f'papers/', exist_ok=True)
try:
urllib.request.urlretrieve(f'https://www.aclweb.org/anthology/{paper_id}.pdf', f'papers/{paper_id}.pdf')
            # time.sleep(2)  # optionally wait a moment for the download to finish before converting
os.system(f'pdftotext papers/{paper_id}.pdf papers/{paper_id}.txt')
        except Exception:
print(f'WARNING: Error while downloading/processing https://www.aclweb.org/anthology/{paper_id}.pdf')
return
with open(f'papers/{paper_id}.txt', 'r') as f:
        paper_text = f.read()
paper_title = ''.join(paper_meta.title.findAll(text=True))
    is_cased = 1  # whether matching should be case-sensitive (1) or case-insensitive (0)
if feature == "title":
feature = paper_title
is_cased = 0
elif feature == "fulltext":
feature = paper_text
is_cased = 1
predicted_tags = paper_classifier.classify(feature, cased_regexes, is_cased)
print(f'Title: {paper_title}\n'
f'Local location: papers/{paper_id}.pdf\n'
f'Online location: https://www.aclweb.org/anthology/{paper_id}.pdf\n'
          f'Annotation file location: auto/{paper_id}.txt')
for i, tag in enumerate(predicted_tags):
print(f'Tag {i}: {tag}')
print("------------------------------------------------\n")
os.makedirs(f'auto/', exist_ok=True)
    with open(f'auto/{paper_id}.txt', 'w') as fout:
        print(f'# Title: {paper_title}\n# Online location: https://www.aclweb.org/anthology/{paper_id}.pdf', file=fout)
        for tag, conf, just in predicted_tags:
            print(f'# CHECK: confidence={conf}, justification={just}\n{tag}', file=fout)
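# The auto/<paper_id>.txt file written above looks roughly like the sketch below
# (the ID, tag, confidence, and justification values here are illustrative only):
#
#   # Title: Some Paper Title
#   # Online location: https://www.aclweb.org/anthology/P19-1001.pdf
#   # CHECK: confidence=0.5, justification=matched regex in fulltext
#   some-concept-tag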
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Get a paper to try to read and annotate")
parser.add_argument("--paper_id", type=str, default=None,
help="The paper ID to get, if you want to specify a single one (e.g. P84-1031)")
parser.add_argument("--years", type=str, default="19",
help="If a paper ID is not specified, a year (e.g. 19) or range of years (e.g. 99-02) from which"+
" to select a random paper.")
parser.add_argument("--confs", type=str, default="P,N,D",
help="A comma-separted list of conference abbreviations from which papers can be selected")
parser.add_argument("--volumes", type=str, default="1,2",
help="A comma-separated list of volumes to include (default is long and short research papers)."+
" 'all' for no filtering.")
parser.add_argument("--n_sample", type=str, default="1",
help="the number of sampled papers if paper_id is not specified (e.g. 1)."
" Write 'all' to select all papers from those years/conferences/volumes.")
parser.add_argument("--template", type=str, default="template.cpt",
help="The file of concept template (e.g. template.cpt)")
parser.add_argument("--feature", type=str, default="fulltext",
help="Which parts of paper is used to classify (e.g. fulltext|title)")
args = parser.parse_args()
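    # Example invocations (illustrative, using only the flags defined above):
    #   python get_paper.py --paper_id P84-1031
    #   python get_paper.py --years 99-02 --confs P,N --volumes 1 --n_sample 5
    #   python get_paper.py --years 19 --n_sample all --feature title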
# init variables
feature = args.feature
paper_id = args.paper_id
template = args.template
n_sample = args.n_sample
volumes = args.volumes.split(',')
paper_map = {}
    # load the concept template
    cased_regexes = paper_classifier.genConceptReg(file_concept=template, formate_col=3)
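    # cased_regexes holds the concept-matching rules parsed from the template file;
    # their exact format is defined in rule_classifier.genConceptReg.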
# if paper_id has not been specified
    if paper_id is None:
years = args.years.split('-')
confs = args.confs.split(',')
        if len(years) == 2:
            start, end = int(years[0]), int(years[1])
            if end < start:
                end += 100  # handle ranges that wrap a century, e.g. 99-02
            years = [y % 100 for y in range(start, end + 1)]
        else:
            assert len(years) == 1, f"invalid format of years: {args.years}"
for pref, year in itertools.product(confs, years):
year = int(year)
            pref = pref.upper()
with open(f'acl-anthology/data/xml/{pref}{year:02d}.xml', 'r') as f:
soup = bs.BeautifulSoup(f, 'xml')
for vol in soup.collection.find_all('volume'):
                if args.volumes == 'all' or vol.attrs['id'] in volumes:
for pap in vol.find_all('paper'):
if pap.url:
paper_map[pap.url.contents[0]] = pap
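        # paper_map now maps Anthology paper IDs (taken from each paper's <url> element)
        # to the corresponding <paper> metadata node from the XML.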
paper_keys = list(paper_map.keys())
if n_sample == 'all':
for paper_id in paper_keys:
paper_meta = paper_map[paper_id]
label_paper(paper_id, paper_meta, cased_regexes, feature)
else:
for _ in range(int(n_sample)):
randid = random.choice(paper_keys)
if not os.path.isfile(f'annotations/{randid}.txt') and not os.path.isfile(f'auto/{randid}.txt'):
paper_id = randid
paper_meta = paper_map[paper_id]
#print(paper_meta)
label_paper(paper_id, paper_meta, cased_regexes, feature)
else:
                    print(f'Warning: {randid} has already been labeled!')
# if paper_id is specified
else:
prefix = paper_id.split("-")[0]
with open(f'acl-anthology/data/xml/{prefix}.xml', 'r') as f:
soup = bs.BeautifulSoup(f, 'xml')
for vol in soup.collection.find_all('volume'):
            if args.volumes == 'all' or vol.attrs['id'] in volumes:
for pap in vol.find_all('paper'):
if pap.url and pap.url.contents[0] == paper_id:
paper_map[pap.url.contents[0]] = pap
#print(paper_map[pap.url.contents[0]])
if not os.path.isfile(f'annotations/{paper_id}.txt') and not os.path.isfile(f'auto/{paper_id}.txt'):
label_paper(paper_id, paper_map[paper_id], cased_regexes, feature)
                                sys.exit(0)
else:
                                print(f'Warning: {paper_id} has already been labeled!')
if len(paper_map) == 0:
            print(f'Warning: {paper_id} could not be found!')
sys.exit(1)