forked from utlib/dspace-exams-ingest-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
step1.py
146 lines (124 loc) · 4.5 KB
/
step1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/python
import csv
from bs4 import BeautifulSoup
import argparse
from os import listdir, path
def get_depts_mapping(csv_filename):
    """Build a department-code -> department-name lookup from a campus csv.

    Parameters:
        csv_filename (string): filepath of the campus csv. The file is read
            with csv.DictReader, so its first row must be a header naming
            the columns; the Dept_code and Department columns are used, e.g.
                ID,Dept_code,Department
                1,ACT,"Actuarial Science"
                2,ANT,"Anthropology"
    Returns:
        A dictionary with Department code as key and full Department name
        as value. If a code appears more than once, the last row wins.
            {'ACT': 'Actuarial Science', 'ANT': 'Anthropology'}
    """
    with open(csv_filename) as campus_csv:
        return {
            record['Dept_code']: record['Department']
            for record in csv.DictReader(campus_csv)
        }
def get_dc_row(element, qualifier, value):
    """Create a single <dcvalue> row for a dublin core metadata file.

    Parameters:
        element - value for the row's "element" attribute
        qualifier - value for the row's "qualifier" attribute
        value - text content to be written inside the row
    Returns:
        The newly created dcvalue tag built from the supplied information
            <dcvalue element="date" qualifier="issued">2018-04</dcvalue>
    """
    # Parse an empty <dcvalue> and fill it in, rather than formatting the
    # markup by hand.
    dcvalue = BeautifulSoup("<dcvalue></dcvalue>", "xml").dcvalue
    for attribute, setting in (("element", element), ("qualifier", qualifier)):
        dcvalue[attribute] = setting
    dcvalue.string = value
    return dcvalue
def get_filename_dates(filename, campus):
    """Work out the exam period encoded in an exam file name.

    The file name is a specific format with Course Code, Semester, and
    (optionally) a third part, separated by dashes (-). The Semester part
    starts with a month code and ends with a two digit year, e.g. "AP18".

    Parameters:
        filename - exam file name (without extension)
        campus - campus A, B or C. For campus 'C' the month code is the
            first two characters of the Semester part, otherwise the first
            character only.
    Returns:
        A tuple of the Exam Period and the Date in numerical format, which
        are the values used when generating the dublin core metadata file.
            ("April 2018", "2018-04")
        When the file name has exactly three dash-separated parts, the third
        part is appended to the Exam Period ("April 2018 A").
    Raises:
        ExamDateException if the month code is not recognised.
        IndexError if the file name has no dash-separated second part.
    """
    # Month code -> (month name, two digit month number).
    month_table = {
        "J": ("June", "06"),
        "M": ("May", "05"),
        "AP": ("April", "04"),
        "N": ("November", "11"),
        "D": ("December", "12"),
        "AU": ("August", "08"),
    }

    pieces = filename.split('-')
    semester = pieces[1]
    year = "20" + semester[-2:]
    month = semester[:2] if campus == 'C' else semester[0]

    # NOTE(review): for campus "C" the month code is the first TWO characters
    # of the semester part, so it only equals "A" when that part is the single
    # character "A" -- this branch looks unreachable for normal file names.
    # Confirm which campus was meant to use "A" for August.
    if month == "A" and campus == "C":
        month_name, month_number = "August", "08"
    elif month in month_table:
        month_name, month_number = month_table[month]
    else:
        raise ExamDateException("Unknown exam month in filename " + filename)

    date = month_name + " " + year
    numbers = year + "-" + month_number
    if len(pieces) == 3:
        date += " " + pieces[2]
    return (date, numbers)
def make_dc(filename, depts, campus):
    """Build the dublin_core metadata for one exam.

    Parameters:
        filename - exam file name (upper case, without extension). The first
            three characters are looked up in depts as the department code
            and the first seven are used as the course code.
        depts - dictionary with department code and department full name
        campus - the campus, either as the bare letter ("A", "B", "C") or as
            the full name ("Campus A", "Campus B", "Campus C"). The value is
            written to the metadata exactly as given.
    Returns:
        A completed dublin_core xml element for the specified exam.
    Raises:
        ValueError if campus is not one of the accepted values.
        KeyError if the department code is not present in depts.
        ExamDateException / IndexError from get_filename_dates when the
        file name does not follow the expected format.
    """
    # get_filename_dates compares its campus argument against the bare letter
    # (e.g. 'C'), so that is what must be passed on. The full "Campus X" form
    # is accepted as well because that is what the script entry point supplies.
    campus_letters = {
        "A": "A", "Campus A": "A",
        "B": "B", "Campus B": "B",
        "C": "C", "Campus C": "C",
    }
    campus_letter = campus_letters.get(campus)
    if campus_letter is None:
        raise ValueError("Unknown campus: " + repr(campus))

    exam_period, numeric_date = get_filename_dates(filename, campus_letter)
    course_code = filename[:7]

    main_dc = BeautifulSoup("<dublin_core schema=\"dc\"></dublin_core>", "xml").dublin_core
    main_dc.append(get_dc_row("contributor", "other", depts[filename[:3]]))
    main_dc.append(get_dc_row("contributor", "other", campus))
    main_dc.append(get_dc_row("title", "none", course_code + " - " + exam_period))
    main_dc.append(get_dc_row("date", "issued", numeric_date))
    main_dc.append(get_dc_row("language", "iso", "en_ca"))
    main_dc.append(get_dc_row("type", "none", "exam"))
    main_dc.append(get_dc_row("subject", "none", course_code))
    return main_dc
class ExamDateException(Exception):
    """Raised when the exam month cannot be identified from a file name."""
if __name__ == "__main__":
    # Script entry point: for every PDF in the given directory, write a
    # matching <name>.xml dublin core file next to it.
    doc = "Given the root path of PDF exams and campus, produce accompanying XML files for DSpace."
    parser = argparse.ArgumentParser(description=doc)
    parser.add_argument('pdfpath', help='Path to the root path of PDF exams')
    # Restricting the choices makes argparse reject anything other than the
    # documented values instead of silently treating it as campus C.
    parser.add_argument('campus', choices=['A', 'B', 'C'], help='Campus: A, B or C')
    args = parser.parse_args()

    # Campus letter -> (department csv, campus name written to the metadata).
    # Campus B used to be matched against a different string than the one the
    # help text documents, so "B" ended up loading the campus C departments.
    campuses = {
        "A": ("Campus_A.csv", "Campus A"),
        "B": ("Campus_B.csv", "Campus B"),
        "C": ("Campus_C.csv", "Campus C"),
    }
    depts_csv, campus = campuses[args.campus]
    depts = get_depts_mapping(depts_csv)

    for exam_pdf in listdir(args.pdfpath):
        # Only PDFs are exam inputs. Skipping everything else keeps a re-run
        # from re-processing the .xml files this script wrote earlier and
        # from failing on stray entries (e.g. hidden files).
        if not exam_pdf.lower().endswith(".pdf"):
            continue
        filename = exam_pdf.split('.')[0]
        dc = make_dc(filename.upper(), depts, campus)
        with open(path.join(args.pdfpath, filename + ".xml"), "w") as xml:
            xml.write(dc.prettify())