-
Notifications
You must be signed in to change notification settings - Fork 3
/
export-csv.py
109 lines (97 loc) · 3.03 KB
/
export-csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import csv
import re
import sys
from pybtex.database.input import bibtex
# Export the entries of a BibTeX file as CSV on stdout.
#
# Usage: export-csv.py <bibtex-file>
#
# One CSV row is written per BibTeX entry that has at least an author, a
# title and a year.  Entries missing one of those are reported on stderr and
# skipped; a missing URL or affiliation is only reported.

# Unescaped braces (not preceded by a backslash); removed from titles.
clean_pattern = re.compile('(?<!\\\\)[{}]')
# Backslash-escaped '{', '}' or '%'; the backslash is dropped from titles.
unescape_pattern = re.compile('\\\\([{}%])')

column_names = [
    # old website csv
    "cc_project_author",
    "post_title",
    "cc_project_url",
    "cc_project_category",  # always 'papers'
    "post_date",
    # full csv
    "keywords",
    "abstract",
    "cc_author_affiliation",  # removed from cc_project_author
    "cc_class",
    "cc_snippet",
    "cc_dataset_used",
    "cc_derived_dataset_about",
    "cc_derived_dataset_used",
    "cc_derived_dataset_cited",
]


def format_authors(persons):
    """Return the authors as one ``"First von Last, First von Last"`` string.

    *persons* is an iterable of objects exposing ``bibtex_first_names``,
    ``prelast_names`` and ``last_names`` (each a list of strings).  Name
    parts of one person are joined by single spaces, persons by ``", "``.
    Persons without any name part are skipped; an empty input gives ``''``.
    """
    names = []
    for person in persons:
        parts = (list(person.bibtex_first_names)
                 + list(person.prelast_names)
                 + list(person.last_names))
        if parts:
            names.append(' '.join(parts))
    return ', '.join(names)


def pick_url(fields):
    """Return the first of the fields 'URL', 'pdf', 'doi' present, else ''."""
    for name in ('URL', 'pdf', 'doi'):
        if name in fields:
            return fields[name]
    return ''


def clean_title(title):
    """Strip unescaped braces from *title* and unescape ``\\{``, ``\\}``, ``\\%``."""
    title = clean_pattern.sub('', title)
    # The replacement must be a raw string: a plain '\1' is the control
    # character U+0001, not a reference to the first group.
    return unescape_pattern.sub(r'\1', title)


def main(argv=None):
    """Convert the BibTeX file named in ``argv[1]`` to CSV on stdout.

    *argv* defaults to ``sys.argv``.  Returns 2 (after printing a usage
    line on stderr) when no input file is given, otherwise ``None``.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write("Usage: {} <bibtex-file>\n".format(
            argv[0] if argv else 'export-csv.py'))
        return 2

    bibdata = bibtex.Parser().parse_file(argv[1])
    csvwriter = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)
    csvwriter.writerow(column_names)

    for bib_id in bibdata.entries:
        entry = bibdata.entries[bib_id]
        b = entry.fields

        # try to fill all required fields
        # .get(): entries without an author must reach the check below
        # instead of failing with a KeyError.
        authorstr = format_authors(entry.persons.get('author', []))
        url = pick_url(b)
        year = b.get('year')
        title = b.get('title')
        cc_author_affiliation = b.get('cc-author-affiliation')

        if authorstr == '':
            sys.stderr.write("No author: {}\n".format(bib_id))
            continue
        if title is None:
            sys.stderr.write("No title: {}\n".format(bib_id))
            continue
        if year is None:
            sys.stderr.write("No year: {}\n".format(bib_id))
            continue
        if url == '':
            # Reported only; the row is still written with an empty URL.
            sys.stderr.write("No URL: {}\n".format(bib_id))
        if cc_author_affiliation is not None:
            # The author column carries the affiliation as a suffix.
            authorstr += ' – '
            authorstr += cc_author_affiliation
        else:
            sys.stderr.write("No affiliation: {}\n".format(bib_id))

        csvwriter.writerow([
            authorstr,
            clean_title(title),
            url,
            'papers',
            # post_date: the year followed by a fixed suffix.
            '{}0101Z00:00:00'.format(year),
            b.get('keywords'),
            b.get('abstract'),
            cc_author_affiliation,
            b.get('cc-class'),
            b.get('cc-snippet'),
            b.get('cc-dataset-used'),
            b.get('cc-derived-dataset-about'),
            b.get('cc-derived-dataset-used'),
            b.get('cc-derived-dataset-cited'),
        ])


if __name__ == "__main__":
    sys.exit(main())