-
Notifications
You must be signed in to change notification settings - Fork 1
/
html_to_text.py
223 lines (209 loc) · 7.38 KB
/
html_to_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
from io import StringIO
from logging import getLogger
logger = getLogger('html_to_text')
import lxml
import lxml.etree
import lxml.html
try:
from urllib import unquote
except ImportError:
from urllib.parse import unquote
import posixpath
import re
_collect_string_content = lxml.etree.XPath("string()")
HR_TEXT = "\n"+('-'*80)
class LXMLParser(object):
def __init__(self, item):
self.parse_tag(item)
def parse_tag(self, item):
if item.tag != lxml.etree.Comment and item.tag != lxml.etree.PI:
self.handle_starttag(item.tag, item.attrib)
if item.text is not None:
self.handle_data(item.text, item.tag)
for tag in item:
self.parse_tag(tag)
self.handle_endtag(item.tag, item)
if item.tail:
self.handle_data(item.tail, None)
class HTMLParser(LXMLParser):
_heading_tags = "h1 h2 h3 h4 h5 h6".split(' ')
_pre_tags = ('pre', 'code')
_table_tags = ('table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot')
_ignored = ['script', 'style', 'title']
whitespace_re = re.compile(r'\s+')
_block = ('p', 'div', 'center', 'blockquote')
heading_levels = {"h1": 1, "h2": 2, "h3": 3,
"h4": 4, "h5": 5, "h6": 6}
def __init__(self, item, node_parsed_callback=None, startpos=0, file=""):
self.node_parsed_callback = node_parsed_callback
self.startpos = startpos
self.file = file
self.output = StringIO()
self.add = ""
self.initial_space = False
self.ignoring = False
self.in_pre = False
self.last_data = ""
self.out = ['']
self.starting = True #Haven't written anything yet
self.final_space = False
self.heading_stack = []
self.last_page = None
self.table_stack = []
LXMLParser.__init__(self, item)
def handle_starttag(self, tag, attrs):
if self.ignoring:
return
if tag in self._ignored or attrs.get('class', None) == 'pagenum':
self.ignoring = True
return
elif tag in self._block:
self.add = "\n\n"
self.final_space = False
elif tag in self._heading_tags:
self.add = '\n\n'
self.final_space = False
level = self.heading_levels[tag]
start = self.output.tell()+self.startpos+(len(self.add) if not self.starting else 0)+(1 if self.final_space else 0)
if self.node_parsed_callback:
self.heading_stack.append((level, start, None))
if tag in self._pre_tags:
self.add = "\n"
self.in_pre = True
if tag == 'a' and 'href' in attrs:
self.link_start = self.output.tell()+self.startpos+(len(self.add) if not self.starting else 0)+(1 if self.final_space else 0)
if tag in ('dd', 'dt'):
self.add = '\n'
if 'id' in attrs and self.node_parsed_callback:
self.node_parsed_callback(None, 'id', self.file+"#"+attrs['id'], start=self.output.tell()+self.startpos+len(self.add))
if tag in self._table_tags and self.node_parsed_callback:
if self.table_stack:
parent = self.table_stack[-1]['id']
else:
parent = None
node = self.node_parsed_callback(parent, tag, None, start=self.output.tell()+self.startpos+len(self.add), attrs=dict(attrs))
self.table_stack.append(node)
def handle_endtag(self, tag, item):
if 'class' in item.attrib and item.attrib['class'] == 'pagenum':
if self.last_page is not None:
self.last_page['end'] = self.output.tell()+self.startpos
if self.node_parsed_callback:
self.last_page = self.node_parsed_callback(None, 'page', item.attrib['id'], start=self.output.tell()+self.startpos, pagenum=parse_pagenum(item.attrib['id']))
if tag in self._ignored or item.attrib.get('class', None) == 'pagenum':
self.ignoring = False
return
if tag in self._ignored:
self.ignoring = False
return
if tag in self._block:
self.add = "\n\n"
elif tag == 'br':
self.write_data('\n')
elif tag in self._heading_tags:
self.add = '\n\n'
if self.node_parsed_callback:
self.add_heading_node(tag)
elif tag in self._pre_tags:
self.in_pre = False
elif tag == 'a' and 'href' in item.attrib and self.node_parsed_callback:
self.add_link(item)
elif tag == 'hr':
self.output.write(HR_TEXT)
elif tag in self._table_tags and self.node_parsed_callback:
self.table_stack[-1]['end'] = self.output.tell() + self.startpos
self.table_stack.pop()
self.last_start = tag
def handle_data(self, data, start_tag):
if self.ignoring:
return
if self.in_pre:
if self.add:
self.write_data(self.add)
self.add = ""
self.write_data(data)
return
data = self.whitespace_re.sub(' ', data)
#The newline after <br> will turn into space above. Also,
#<span>a</span> <span>b</span> will return a space after a. We want to keep it
if data[0] == ' ':
self.initial_space = True
data = data[1:]
if not data:
return
if not self.add and self.final_space:
self.write_data(' ')
self.final_space = False
if data and data[-1] == ' ':
self.final_space = True
data = data[:-1]
if self.starting:
self.initial_space = False
self.add = ""
if self.add:
self.write_data(self.add)
self.add = ''
if self.initial_space and not self.last_newline:
self.write_data(' ')
self.write_data(data)
self.add = ""
self.initial_space = False
def write_data(self, data):
self.output.write(data)
self.last_newline = data[-1] == '\n'
self.last_data = data
self.starting = False
def add_heading_node(self, item):
"""Adds a heading to the list of nodes.
We can't have an end heading without a start heading."""
(level, start, node_id) = self.heading_stack.pop()
end = self.output.tell()+self.startpos
while self.need_heading_pop(level):
self.heading_stack.pop()
#The last element of the stack is our parent. If it's empty, we have no parent.
parent = None
if len(self.heading_stack):
parent = self.heading_stack[-1][2]
#parent should be set, create the heading. We need to put it back on the stack for the next heading to grab
#its parent if needed.
name = None #self.output.getvalue()[start:end+1]
id = self.node_parsed_callback(parent, 'heading', name, start=start, end=end, tag=item, level=item[-1])['id']
self.heading_stack.append((level, start, id))
def need_heading_pop(self, level):
if len(self.heading_stack) == 0:
return False #nothing to pop
prev_level = self.heading_stack[-1][0]
if level <= prev_level:
return True
def add_link(self, item):
text = _collect_string_content(item)
#Is this an internal link?
href = item.attrib['href']
if '://' not in href:
href = unquote(item.attrib['href'])
href = posixpath.normpath(posixpath.join(posixpath.dirname(self.file), href))
self.node_parsed_callback(None, 'link', text, start=self.link_start, end=self.output.tell()+self.startpos, href=href)
def html_to_text(item, node_parsed_callback=None, startpos=0, file=""):
if isinstance(item, str):
item = tree_from_string(item)
lxml.html.xhtml_to_html(item)
parser = HTMLParser(item, node_parsed_callback, startpos, file)
text = parser.output.getvalue()
if parser.last_page is not None:
parser.last_page['end'] = parser.output.tell()
return text
pagenum_re = re.compile(r'(\d+)$')
def parse_pagenum(num):
r = pagenum_re.search(num)
if r:
return str(int(r.group(1)))
elif num.startswith('p'):
return num[1:].lower()
else:
logger.warn("unable to parse page %r" % num)
return None
def tree_from_string(html):
try:
return lxml.etree.fromstring(html)
except lxml.etree.XMLSyntaxError:
pass
return lxml.html.fromstring(html)