-
Notifications
You must be signed in to change notification settings - Fork 0
/
maketoc.py
executable file
·537 lines (454 loc) · 15.6 KB
/
maketoc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
#!/usr/bin/env python3
"""
routine which tries to create a valid table of contents file for SE projects
"""
import argparse
import os
from typing import TextIO
from enum import Enum
import regex
from bs4 import BeautifulSoup, Tag
# Module-wide verbosity flag. main() sets it from the -v/--verbose option and
# the processing functions print extra progress messages while it is True.
VERBOSE = False
class TocItem:
    """
    Plain record describing one table-of-contents entry found in the project.

    All fields have class-level defaults and are overwritten per instance by
    the code that scans the headings.
    """
    filelink = ''  # file name under text/, optionally followed by '#<id>'
    level = 0  # nesting depth of the entry
    roman = ''  # roman numeral text of the heading, when it has one
    title = ''  # title markup shown in the ToC
    subtitle = ''  # subtitle text, empty when there is none
    id = ''  # id of the enclosing section, empty for the first heading of a file
    epubtype = ''  # epub:type value; not consulted by output()

    def output(self) -> str:
        """
        Build the single ``<a href=...>`` line for this entry.

        A title that consists solely of a roman-numeral span is collapsed so
        that the ``z3998:roman`` type sits on the link itself, unless there is
        a subtitle, in which case "title: subtitle" is emitted. Any other
        title is emitted as-is (the subtitle is not used in that case).
        """
        link_open = tabs(1) + '<a href="text/' + self.filelink
        if not title_is_entirely_roman(self.title):
            return link_open + '">' + self.title + '</a>\n'
        if self.subtitle == '':  # bare roman numeral
            return link_open + '" epub:type="z3998:roman">' + self.roman + '</a>\n'
        return link_open + '">' + self.title + ': ' + self.subtitle + '</a>\n'
class Position(Enum):
    """
    Section of the ebook a content file belongs to, as used when grouping
    landmark entries.
    """
    NONE = 0  # no front/body/backmatter marker was found
    FRONT = 1  # frontmatter
    BODY = 2  # bodymatter
    BACK = 3  # backmatter
class LandmarkItem:
    """
    Plain record describing one entry of the landmarks navigation list.
    """
    title = ''  # link text for front/backmatter entries
    filelink = ''  # file name under text/
    epubtype = ''  # epub:type of the file's main section
    place: Position = Position.FRONT  # which part of the book the file is in

    def output(self, worktype: str = 'fiction', worktitle: str = 'WORKTITLE'):
        """
        Return the ``<li>`` block for this landmark.

        Front- and backmatter entries use the item's own epub:type and title;
        a bodymatter entry uses ``z3998:<worktype>`` and the work title
        instead. An item whose place is none of the three yields ''.
        """
        if self.place == Position.FRONT:
            semantics, label = 'frontmatter ' + self.epubtype, self.title
        elif self.place == Position.BODY:
            semantics, label = 'bodymatter z3998:' + worktype, worktitle
        elif self.place == Position.BACK:
            semantics, label = 'backmatter ' + self.epubtype, self.title
        else:
            return ''
        return (
            tabs(4) + '<li>\n' + tabs(5) + '<a href="text/' + self.filelink
            + '" epub:type="' + semantics + '">' + label + '</a>\n' + tabs(4) + '</li>\n'
        )
def tabs(num_tabs: int) -> str:
    """
    Return a string of ``num_tabs`` tab characters.
    A count of zero or less gives the empty string.
    """
    return '\t' * num_tabs if num_tabs > 0 else ''
def indent(level: int, offset: int = 0) -> str:
    """
    Return the tab indentation for a ToC item at the given nesting level.
    The depth is ``level * 2 + 2`` tabs, adjusted by ``offset`` (which may be
    negative); a resulting depth of zero or less gives the empty string.
    """
    depth = level * 2 + 2 + offset
    return '\t' * max(depth, 0)
def getcontentfiles(opf: BeautifulSoup) -> list:
    """
    Return the ``idref`` of every ``<itemref>`` in the parsed content.opf,
    in document order, which is the order wanted for the ToC.
    """
    return [spine_entry['idref'] for spine_entry in opf.find_all('itemref')]
def get_worktitle(opf: BeautifulSoup) -> str:
    """
    Pull the title of the work out of the parsed content.opf.

    Returns the text of the first ``<dc:title>`` element. The placeholder
    'WORKTITLE' is returned when there is no such element, or when the element
    has no single string content (``.string`` is None for an empty tag or one
    with several children), so callers always receive a str they can
    concatenate.
    """
    dctitle = opf.find('dc:title')
    if dctitle is None:
        return 'WORKTITLE'
    title_text = dctitle.string
    if title_text is None:
        return 'WORKTITLE'
    return str(title_text)
def gethtml(filename: str) -> str:
    """
    Read a UTF-8 encoded (x)html file and return its text.

    If the file cannot be opened a message is printed and '' is returned.
    Errors raised while reading are not caught here, but the file is always
    closed.
    """
    try:
        fileobject = open(filename, 'r', encoding='utf-8')
    except IOError:
        print('Could not open ' + filename)
        return ''
    # the context manager guarantees the handle is closed even if read() raises
    with fileobject:
        return fileobject.read()
def get_epub_type(soup: BeautifulSoup) -> str:
    """
    Retrieve the epub:type of this file's main section, used to decide
    whether the file is a landmark item.

    The first heading (h1-h6) is used as the starting point; if the file has
    no heading, the first <p>, <header> or <img> is used instead. The
    epub:type of the nearest enclosing <section> or <article> is returned.
    Returns '' when there is no such starting element, no enclosing
    section/article, or the enclosing element has no epub:type.
    """
    anchor = soup.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    if anchor is None:  # no heading so go hunting for some content
        anchor = soup.find(['p', 'header', 'img'])
    if anchor is None:  # nothing usable in this file (e.g. it was empty)
        return ''
    parent = anchor.find_parent(['section', 'article'])
    if parent is None:
        return ''
    return parent.get('epub:type', '')
def get_place(soup: BeautifulSoup) -> Position:
    """
    Return the place of a file in the ebook (frontmatter, bodymatter or
    backmatter) based on the epub:type of its <body> element.

    Position.NONE is returned when the document has no <body>, the <body> has
    no epub:type, or the epub:type names none of the three.
    """
    bod = soup.body
    if bod is None:  # empty or unparseable file
        return Position.NONE
    epubtype = bod.get('epub:type', '')
    # checked in this order: backmatter, then frontmatter, then bodymatter
    for keyword, place in (
        ('backmatter', Position.BACK),
        ('frontmatter', Position.FRONT),
        ('bodymatter', Position.BODY),
    ):
        if keyword in epubtype:
            return place
    return Position.NONE
def add_landmark(soup: BeautifulSoup, textf: str, landmarks: list):
    """
    Append a LandmarkItem for this file to ``landmarks``.

    The item carries the page <title>, the file's epub:type, its file name
    and its place in the book. A file whose epub:type is empty is not
    recorded.
    """
    epubtype = get_epub_type(soup)
    page_title = soup.find('title').string
    if epubtype == '':
        return
    entry = LandmarkItem()
    entry.title = page_title
    entry.epubtype = epubtype
    entry.filelink = textf
    entry.place = get_place(soup)
    landmarks.append(entry)
def process_landmarks(landmarks_list: list, tocfile: TextIO, worktype: str, worktitle: str):
    """
    Write the landmark entries to the toc file: every frontmatter item, then
    only the first bodymatter item (labelled with the work type and title),
    then every backmatter item.
    """
    def items_at(place: Position) -> list:
        return [landmark for landmark in landmarks_list if landmark.place == place]

    front, body, back = items_at(Position.FRONT), items_at(Position.BODY), items_at(Position.BACK)
    for landmark in front:
        tocfile.write(landmark.output())
    if body:
        # a single bodymatter landmark stands for the whole work
        tocfile.write(body[0].output(worktype, worktitle))
    for landmark in back:
        tocfile.write(landmark.output())
def process_items(item_list: list, tocfile: TextIO):
    """
    Go through all found toc items and write them to the toc file as nested
    <li>/<ol> markup.

    Each item is compared with the item that follows it to decide whether it
    is a plain entry, the parent of a nested list, or the last child of one.
    The final element of ``item_list`` is only ever used as the look-ahead
    partner and is never written itself.
    """
    unclosed_ol = 0  # keep track of how many ordered lists we open
    # process all but last item so we can look ahead
    for index in range(0, len(item_list) - 1):  # ignore very last item, which is a dummy
        thisitem = item_list[index]
        nextitem = item_list[index + 1]
        toprint = ''
        # check to see if next item is at same, lower or higher level than us
        if nextitem.level == thisitem.level:  # SIMPLE: self-contained <li>
            toprint += indent(thisitem.level) + '<li>\n'
            toprint += indent(thisitem.level) + thisitem.output()
            toprint += indent(thisitem.level) + '</li>\n'
        if nextitem.level > thisitem.level:  # PARENT: leave <li> open and start a nested list
            toprint += indent(thisitem.level) + '<li>\n'
            toprint += indent(thisitem.level) + thisitem.output()
            toprint += indent(thisitem.level) + tabs(1) + '<ol>\n'
            unclosed_ol += 1
            if VERBOSE:
                print(thisitem.filelink + ' unclosed ol = ' + str(unclosed_ol))
        if nextitem.level < thisitem.level:  # LAST CHILD
            toprint += indent(thisitem.level) + '<li>\n'
            toprint += indent(thisitem.level) + thisitem.output()
            toprint += indent(thisitem.level) + '</li>\n'  # end of this item
            torepeat = thisitem.level - nextitem.level  # how many levels we step back
            current_level = thisitem.level
            if torepeat > 0 and unclosed_ol > 0:
                for _ in range(0, torepeat):  # need to repeat a few times as may be jumping back from eg h5 to h2
                    toprint += indent(current_level, -1) + '</ol>\n'  # end of embedded list
                    unclosed_ol -= 1
                    if VERBOSE:
                        print(thisitem.filelink + ' unclosed ol = ' + str(unclosed_ol))
                    toprint += indent(current_level, -2) + '</li>\n'  # end of parent item
                    current_level -= 1
        tocfile.write(toprint)
    while unclosed_ol > 0:
        # shouldn't ever get here, but just to be safe...
        # any list still open is closed at a fixed indentation
        tocfile.write(tabs(3) + '</ol>\n')
        unclosed_ol -= 1
        if VERBOSE:
            print('Closing: unclosed ol = ' + str(unclosed_ol))
        tocfile.write(tabs(2) + '</li>\n')
def output_toc(item_list: list, landmark_list, outtocpath: str, worktype: str, worktitle: str):
    """
    Output the constructed ToC, based on the lists of items and landmarks
    found, to the specified output file.

    Nothing is written (and a message is printed) when ``item_list`` has
    fewer than two entries or when the output file cannot be opened. Any
    existing file at ``outtocpath`` is overwritten.
    """
    if len(item_list) < 2:
        print('Too few ToC items found')
        return
    try:
        # 'w' truncates an existing file, so no separate exists/remove step is needed
        tocfile = open(outtocpath, 'w', encoding='utf-8')
    except IOError:
        print('Unable to open output file! ' + outtocpath)
        return
    # the context manager closes the file even if one of the writers raises
    with tocfile:
        write_toc_start(tocfile)
        process_items(item_list, tocfile)
        write_toc_middle(tocfile)
        process_landmarks(landmark_list, tocfile, worktype, worktitle)
        write_toc_end(tocfile)
def write_toc_start(tocfile):
    """
    Write the opening of the ToC document: XML declaration, <html> and <head>,
    then the start of <body> and of the toc <nav> up to its opening <ol>.
    """
    html_open = (
        '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" '
        'epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/, '
        'se: https://standardebooks.org/vocab/1.0" xml:lang="en-US">\n'
    )
    opening = (
        '<?xml version="1.0" encoding="utf-8"?>\n',
        html_open,
        tabs(1) + '<head>\n',
        tabs(2) + '<title>Table of Contents</title>\n',
        tabs(1) + '</head>\n',
        tabs(1) + '<body epub:type="frontmatter">\n',
        tabs(2) + '<nav epub:type="toc">\n',
        tabs(3) + '<h2 epub:type="title">Table of Contents</h2>\n',
        tabs(3) + '<ol>\n',
    )
    for chunk in opening:
        tocfile.write(chunk)
def write_toc_middle(tocfile):
    """
    Close the toc <nav> and open the landmarks <nav> up to its opening <ol>.
    """
    middle = (
        tabs(3) + '</ol>\n',
        tabs(2) + '</nav>\n',
        tabs(2) + '<nav epub:type="landmarks">\n',
        tabs(3) + '<h2 epub:type="title">Landmarks</h2>\n',
        tabs(3) + '<ol>\n',
    )
    for chunk in middle:
        tocfile.write(chunk)
def write_toc_end(tocfile):
    """
    Close the landmarks list and <nav>, then <body> and <html>.
    No newline is written after the final tag.
    """
    closing = (
        tabs(3) + '</ol>\n',
        tabs(2) + '</nav>\n',
        tabs(1) + '</body>\n',
        '</html>',
    )
    for chunk in closing:
        tocfile.write(chunk)
def get_parent_id(hchild: Tag) -> str:
    """
    Return the id of the nearest enclosing <section> of ``hchild``.
    Gives '' when there is no enclosing <section> or it has no id attribute.
    """
    section = hchild.find_parent("section")
    if section is None:
        return ''
    return section.get('id', '')
def extract_strings(atag: Tag) -> str:
    """
    Return only the string content of a tag, ignoring noterefs and their
    content.

    Direct children are handled as follows:
    * a bare '\\n' string is skipped;
    * any other string is kept as-is;
    * a tag with no epub:type (probably <abbr>) contributes its text;
    * a tag whose epub:type contains 'z3998:roman' is kept whole, markup
      included;
    * a tag with any other epub:type (e.g. a noteref) contributes nothing.
    """
    pieces = []
    for child in atag.contents:
        if child == '\n':
            continue
        if not isinstance(child, Tag):
            pieces.append(child)  # must be NavigableString
            continue
        epubtype = child.get('epub:type')
        if epubtype is None:
            text = child.string
            if text is None:
                # .string is None when the tag has several children; collect
                # their text recursively instead of failing on str + None
                text = extract_strings(child)
            pieces.append(text)
        elif 'z3998:roman' in epubtype:
            pieces.append(str(child))  # want the whole span
    return ''.join(pieces)
def process_headings(soup: BeautifulSoup, textf: str, toclist: list, nest_under_halftitle: bool):
    """
    Turn the headings of one content file into TocItems appended to
    ``toclist``.

    A file without any h1-h6 heading (for example a dedication or an epigraph)
    gets a single item titled with the page <title>, whose level is the number
    of <section> elements in the file. Otherwise one item is made per heading;
    only the first heading counts as top level, and every item is pushed one
    level deeper when ``nest_under_halftitle`` is set.
    """
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    if headings:
        for position, heading in enumerate(headings):
            entry = process_heading(heading, position == 0, textf)
            if nest_under_halftitle:
                entry.level += 1
            toclist.append(entry)
        return
    # no heading tag at all: fall back to the page title
    fallback = TocItem()
    fallback.level = len(soup.find_all('section'))
    fallback.title = soup.find('title').string
    fallback.filelink = textf
    toclist.append(fallback)
def title_is_entirely_roman(title: str) -> bool:
    """
    Test whether a title is nothing but a roman number wrapped in a
    ``z3998:roman`` span. If so, the epub type can be collapsed into the
    surrounding ToC <a> tag.

    Only numerals of 1 to 10 characters drawn from I, V, X, L and C are
    recognised. Returns a real bool rather than a match object.
    """
    # the pattern uses only syntax the standard-library module supports
    import re

    pattern = r'^<span epub:type="z3998:roman">[IVXLC]{1,10}<\/span>$'
    return re.search(pattern, title) is not None
def process_heading(heading, is_toplevel, textf) -> TocItem:
    """
    Build and return the TocItem for a single heading tag.

    The level is the number of enclosing <section>/<article> elements. The
    first heading of a file links to the file itself; later headings link to
    the id of their parent section when it has one.
    """
    tocitem = TocItem()
    tocitem.level = len(heading.find_parents(['section', 'article']))
    # the first heading in a file deliberately gets no anchor id
    tocitem.id = '' if is_toplevel else get_parent_id(heading)
    tocitem.filelink = textf if tocitem.id == '' else textf + '#' + tocitem.id
    try:
        attribs = heading['epub:type']
    except KeyError:
        attribs = ''
        if VERBOSE:
            print(textf + ': warning: heading with no epub:type')
    if 'z3998:roman' not in attribs:
        process_heading_contents(heading, tocitem)
        return tocitem
    # the heading itself is typed as roman,
    # eg <h5 epub:type="title z3998:roman">II</h5>
    tocitem.roman = extract_strings(heading)
    tocitem.title = '<span epub:type="z3998:roman">' + tocitem.roman + '</span>'
    return tocitem
def process_heading_contents(heading, tocitem):
    """
    Go through each direct child of the heading and fill in the roman, title
    and subtitle fields of ``tocitem``.

    Plain strings, <abbr> text and roman-numeral spans are collected in an
    accumulator, which becomes the title only if no child set the title
    explicitly. ``tocitem`` is modified in place; nothing is returned.
    """
    accumulator = ''  # we'll use this to build up the title
    for child in heading.contents:  # was children
        if child != '\n':
            if isinstance(child, Tag):
                try:
                    epubtype = child['epub:type']
                except KeyError:
                    epubtype = 'blank'  # placeholder that matches none of the checks below
                    if child.name == 'abbr':
                        accumulator += extract_strings(child)
                        continue  # skip following and go to next child
                if 'z3998:roman' in epubtype:
                    tocitem.roman = extract_strings(child)
                    accumulator += str(child)  # keep the whole span in the title
                elif 'subtitle' in epubtype:  # tested before 'title', which is a substring of it
                    tocitem.subtitle = extract_strings(child)
                elif 'title' in epubtype:
                    tocitem.title = extract_strings(child)
                elif 'noteref' in epubtype:
                    pass  # note references never appear in the ToC
                else:
                    # any other tag: its text becomes the title
                    tocitem.title = extract_strings(child)
            else:  # should be a simple NavigableString
                accumulator += str(child)
    if tocitem.title == '':
        tocitem.title = accumulator
def process_all_content(filelist, textpath) -> (list, list):
    """
    Analyze the whole content of the project and build the lists of landmarks
    and of toc items, returned in that order.

    Files that follow halftitle.xhtml have their headings nested one level
    deeper, until the first backmatter file is reached. The toc list ends
    with a dummy item at level 1.
    """
    landmarks = []
    toclist = []
    nest_under_halftitle = False
    for textf in filelist:
        if VERBOSE:
            print('Processing: ' + textf)
        soup = BeautifulSoup(gethtml(os.path.join(textpath, textf)), 'html.parser')
        if get_place(soup) == Position.BACK:
            nest_under_halftitle = False
        process_headings(soup, textf, toclist, nest_under_halftitle)
        # nesting starts with the file AFTER the half title page
        nest_under_halftitle = nest_under_halftitle or textf == 'halftitle.xhtml'
        add_landmark(soup, textf, landmarks)
    # the ToC writer always looks ahead to the next item, so the last real
    # item needs a dummy successor
    sentinel = TocItem()
    sentinel.level = 1
    sentinel.title = "dummy"
    toclist.append(sentinel)
    return landmarks, toclist
def main():
    """
    Main routine of the tool: parse the command line, read content.opf and
    the content files of a Standard Ebooks source directory, and write the
    generated table of contents.

    The ToC is written to src/epub/toc.xhtml inside the directory unless
    -o/--output names another file. Exits with status -1 when content.opf
    is missing.
    """
    global VERBOSE
    parser = argparse.ArgumentParser(description="Attempts to build a table of contents for an SE project")
    parser.add_argument("-o", "--output", dest="output", required=False, help="path and filename of output file if existing ToC is to be left alone")
    parser.add_argument("-v", "--verbose", required=False, action="store_const", const=True, help="verbose output")
    parser.add_argument("-n", "--nonfiction", required=False, action="store_true", help="work type is non-fiction")
    parser.add_argument("directory", metavar="DIRECTORY", help="a Standard Ebooks source directory")
    args = parser.parse_args()

    rootpath = args.directory
    tocpath = os.path.join(rootpath, 'src', 'epub', 'toc.xhtml')
    textpath = os.path.join(rootpath, 'src', 'epub', 'text')
    opfpath = os.path.join(rootpath, 'src', 'epub', 'content.opf')

    # validate the directory before trying to read anything from it
    if not os.path.exists(opfpath):
        print("Error: this does not seem to be a Standard Ebooks root directory")
        raise SystemExit(-1)

    opf = BeautifulSoup(gethtml(opfpath), 'html.parser')
    filelist = getcontentfiles(opf)
    worktitle = get_worktitle(opf)

    # store_true defaults to False (never None), so test the flag's truth value
    worktype = 'non-fiction' if args.nonfiction else 'fiction'
    VERBOSE = bool(args.verbose)

    landmarks, toclist = process_all_content(filelist, textpath)
    outpath = tocpath if args.output is None else args.output
    output_toc(toclist, landmarks, outpath, worktype, worktitle)
    print('done!')
# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()