forked from ocropus/hocr-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hocr-merge-dc
executable file
·61 lines (47 loc) · 1.84 KB
/
hocr-merge-dc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
from __future__ import print_function
import argparse
import re
from lxml import etree, html
# Qualified names ("dc:" prefix + element name) of the Dublin Core elements
# this script copies into the hOCR header; any other dc:* element is ignored.
dcknown = [
    "dc:title",
    "dc:creator",
    "dc:subject",
    "dc:description",
    "dc:publisher",
    "dc:contributor",
    "dc:date",
    "dc:type",
    "dc:format",
    "dc:identifier",
    "dc:source",
    "dc:language",
    "dc:relation",
    "dc:coverage",
    "dc:rights",
]
def get_text(node):
    """Return the text content of *node* with whitespace runs collapsed.

    All text nodes found below *node* (via the XPath ``.//text()``) are
    concatenated in the order the query returns them, and every run of
    whitespace is replaced by a single space.  Leading and trailing
    whitespace is collapsed but not removed.

    Args:
        node: an object exposing ``xpath(expr)`` that returns a sequence of
            strings for a text query (e.g. an lxml element).

    Returns:
        The normalised text as a single string ("" when there is no text).
    """
    # Join the XPath result directly; copying it into a list first is redundant.
    joined = "".join(node.xpath(".//text()"))
    return re.sub(r'\s+', ' ', joined)
def main():
    """Merge Dublin Core metadata from an XML file into an hOCR header.

    Command-line arguments:
        dc:   XML file with Dublin Core metadata.
        hocr: hOCR file whose <head> receives the metadata.

    Any <meta> element inside the hOCR <head> whose ``name`` starts with
    ``DC.`` is removed, then one ``<meta name="DC.xxx" content="...">`` is
    appended for every element in the metadata file whose ``dc:`` name is
    listed in ``dcknown``.  The resulting document is printed to stdout.

    Raises:
        SystemExit: if the hOCR document has no <head> element.
    """
    dc_namespace = "http://purl.org/dc/elements/1.1/"
    # lxml spells namespaced tags as "{namespace-uri}localname".
    dc_tag_prefix = "{%s}" % dc_namespace
    # Characters that must not end up inside the content="..." attribute.
    unsafe_chars = re.compile("[\t\r\n'\"]")

    parser = argparse.ArgumentParser(
        description="merge Dublin Core metadata into hOCR header files")
    parser.add_argument(
        "dc",
        help="XML file with Dublin Core metadata",
        type=argparse.FileType('r'))
    parser.add_argument("hocr", help="hOCR file", type=argparse.FileType('r'))
    args = parser.parse_args()

    dc_doc = etree.parse(args.dc, html.XHTMLParser())
    hocr_doc = html.parse(args.hocr)

    # Locate the header that will receive the metadata.  This is an explicit
    # check rather than an assert so it still runs under "python -O".
    heads = hocr_doc.xpath("//HEAD|//head")
    if not heads:
        raise SystemExit("hocr-merge-dc: hOCR file has no <head> element")
    hocr_head = heads[0]

    # Remove all existing META tags representing Dublin Core metadata.
    for stale in hocr_doc.xpath("//head//meta[starts-with(@name,'DC.')]"):
        stale.getparent().remove(stale)

    # Find all the Dublin Core tags in the Dublin Core metadata.
    dc_nodes = dc_doc.xpath("//dc:*", namespaces={"dc": dc_namespace})
    for node in dc_nodes:
        tag = node.tag
        if tag.startswith(dc_tag_prefix):
            tag = "dc:" + tag[len(dc_tag_prefix):]
        if tag not in dcknown:
            continue

        # "dc:title" -> "DC.title"
        name = "DC." + tag[len("dc:"):]
        value = unsafe_chars.sub(" ", get_text(node)).strip()
        value = value[:500]  # cap the attribute length at 500 characters

        # NOTE(review): this namespace URI has no trailing slash, unlike the
        # one used for the lookup above -- kept as-is so the emitted markup
        # does not change; confirm whether that is intended.
        meta = etree.Element(
            "meta", nsmap={'DC': 'http://purl.org/dc/elements/1.1'})
        meta.set('name', name)
        meta.set('content', value)
        hocr_head.append(meta)

    # etree.tostring() returns ASCII-encoded bytes by default (non-ASCII
    # characters become numeric character references).  Decode them so that
    # Python 3 prints the markup itself instead of a b'...' repr.
    serialized = etree.tostring(hocr_doc, pretty_print=True)
    print(serialized.decode("ascii"))


if __name__ == "__main__":
    main()