-
Notifications
You must be signed in to change notification settings - Fork 1
/
html2txt.py
executable file
·161 lines (146 loc) · 4.92 KB
/
html2txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python
#
# html2txt.py - text extractor (or HTML tag ripper)
#
# Copyright (c) 2005 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# html2txt.py is a much simpler text extractor (or an HTML tag
# ripper) without using any sort of predefined pattern. It just
# removes all HTML tags from the input files. It also removes
# javascript or stylesheet contents surrounded by
# <script>...</script> or <style>...</style> tag.
#
# Usage:
# $ ./html2txt.py [options] input_filename ... > output_text
#
# Options:
# -C output_text_encoding
# Specifies the encoding of output texts (page titles and main texts). The default is utf-8.
# -c default_character_set
# Specifies the default character set that is used when there is
# no charset declaration (<meta> tag) in an HTML file. A different
# character set is not automatically detected.
#
import sys, re
from webstemmer.htmlutils import rmsp, getencoder
from webstemmer.htmlparser3 import HTMLParser3, HTMLHandler
## HTMLTextHandler
##
class HTMLTextHandler(HTMLHandler):
    """Tag-stripping handler: collects character data and emits plain text.

    Text passed to handle_data() is buffered; the buffer is written to the
    output sink (one paragraph per feed() call, terminated by a newline)
    whenever a tag listed in ``newline_tags`` opens and once more in
    finish().  Text seen while inside any tag listed in ``ignored_tags``
    is discarded.

    NOTE(review): start_unknown/end_unknown/handle_data/finish are
    presumably the callbacks invoked by webstemmer's HTMLParser3 -- confirm
    against webstemmer.htmlparser3.
    """

    # Matches two characters in the range U+3000..U+9FFF (CJK punctuation,
    # kana, ideographs) separated only by newlines; flush() uses it to join
    # such characters so a source line break does not split the text.
    # Written as a u'' literal (with the regex newline escaped) instead of
    # ur'': the pattern text is the same, but ur'' does not parse on
    # Python 3.
    CUTSP = re.compile(u'([\u3000-\u9fff])\\n+([\u3000-\u9fff])')

    # Tags whose contents are dropped entirely.
    IGNORED_TAGS = dict.fromkeys(
        'comment script style select'.split(' ')
    )

    # Tags that start a new paragraph in the output.
    NEWLINE_TAGS = dict.fromkeys(
        'p br div td th li blockquote pre form hr h1 h2 h3 h4 h5 h6 address'.split(' ')
    )

    def __init__(self, out, ignored_tags=IGNORED_TAGS, newline_tags=NEWLINE_TAGS):
        """Create a handler writing to ``out``.

        out          -- sink object providing feed(text) and close().
        ignored_tags -- container of tag names whose contents are skipped
                        (only membership tests are performed on it).
        newline_tags -- container of tag names that trigger a flush.
        """
        self.out = out
        self.ignored_tags = ignored_tags
        self.newline_tags = newline_tags
        self.ignore = 0     # nesting depth of currently open ignored tags
        self.text = []      # text fragments buffered since the last flush
        return

    def flush(self, newline=False):
        """Write the buffered text to the sink as one line and reset the buffer.

        ``newline`` is accepted for interface compatibility but is not
        used: a trailing newline is always appended to non-empty output.
        """
        if self.text:
            joined = ''.join(self.text).strip()
            # rmsp comes from webstemmer.htmlutils -- presumably it
            # normalizes whitespace; confirm against that module.
            s = rmsp(self.CUTSP.sub(r'\1\2', joined))
            if s:
                self.out.feed(s + '\n')
            self.text = []
        return

    def start_unknown(self, tag, attrs):
        """Handle an opening tag: track ignored regions, flush on block tags."""
        if tag in self.ignored_tags:
            self.ignore += 1
        if tag in self.newline_tags:
            self.flush(True)
        return

    def end_unknown(self, tag):
        """Handle a closing tag: leave one level of an ignored region."""
        # Never let the depth go negative: a stray closing tag (e.g. a
        # "</script>" with no opener) would otherwise make ``ignore``
        # truthy and silently drop all following text.
        if tag in self.ignored_tags and self.ignore > 0:
            self.ignore -= 1
        return

    def handle_data(self, data):
        """Buffer character data unless inside an ignored tag."""
        if not self.ignore:
            self.text.append(data)
        return

    def finish(self):
        """Flush remaining text, close the sink and return its collected text.

        Returns the sink's ``output_text`` attribute when it has one, and
        None otherwise -- sinks that only stream (such as the stdout
        writer used in script mode) are not required to define it.
        """
        self.flush()
        self.out.close()
        return getattr(self.out, 'output_text', None)
# Added by Steve Kieu: in-memory output sink and wrapper class, for use from other scripts.
class OutStr:
    """In-memory output sink: encodes each fed chunk and accumulates it.

    The accumulated, encoded text is available as ``output_text``; it is
    None until feed() has been called at least once.
    """

    def __init__(self, charset):
        """Create a sink that encodes text with ``charset``.

        getencoder comes from webstemmer.htmlutils; the returned callable
        is used as ``encoder(text, errors)`` and element 0 of its result
        is taken as the encoded text.
        """
        self.encoder = getencoder(charset)
        self.output_text = None
        return

    def close(self):
        """Nothing to release; present to satisfy the sink interface."""
        pass

    def feed(self, s):
        """Encode ``s`` and append it to ``output_text``.

        Previously every call replaced ``output_text`` with the newest
        chunk, so only the last paragraph of a document survived.
        """
        encoded = self.encoder(s, 'replace')[0]
        if self.output_text is None:
            self.output_text = encoded
        else:
            self.output_text += encoded
class HtmlToText:
    """Run the tag-stripping parser over an HTML string held in memory.

    After construction the extracted text can be read from
    ``self.out.output_text``.
    """

    def __init__(self, text, charset_in='utf-8', charset_out='utf-8'):
        """Parse ``text`` immediately.

        text        -- HTML input handed to HTMLParser3.feed_byte()
                       (presumably an undecoded byte string -- confirm
                       against webstemmer.htmlparser3).
        charset_in  -- charset passed to HTMLParser3; defaults to utf-8.
        charset_out -- charset used to encode the extracted text;
                       defaults to utf-8.
        """
        self.out = OutStr(charset_out)
        p = HTMLParser3(HTMLTextHandler(self.out), charset=charset_in)
        p.feed_byte(text).close()
# Retrieve the extracted text via HtmlToText(...).out.output_text. (End of Steve Kieu's additions.)
# main
if __name__ == "__main__":
    import getopt
    import urllib

    class out:
        """Output sink that encodes each chunk and writes it to stdout."""

        def __init__(self, charset):
            self.encoder = getencoder(charset)
            # HTMLTextHandler.finish() reads ``output_text`` from its sink;
            # define it so this streaming sink does not raise
            # AttributeError there.  Nothing is accumulated here.
            self.output_text = None
            return

        def close(self):
            pass

        def feed(self, s):
            sys.stdout.write(self.encoder(s, 'replace')[0])
            sys.stdout.flush()
            return

    def usage():
        """Print the command-line synopsis and exit with status 2."""
        print('usage: html2txt.py [-c charset_in] [-C charset_out] files ...')
        sys.exit(2)

    try:
        (opts, args) = getopt.getopt(sys.argv[1:], 'c:C:')
    except getopt.GetoptError:
        usage()

    (charset_in, charset_out) = ('utf-8', 'utf-8')
    for (k, v) in opts:
        if k == '-c':
            charset_in = v
        elif k == '-C':
            charset_out = v

    # With no file arguments, read the document from standard input.
    if not args:
        args = ['-']

    for url in args:
        if url == '-':
            fp = sys.stdin
        elif url.startswith(('http:', 'https:', 'ftp:')):
            fp = urllib.urlopen(url)
        else:
            fp = open(url)
        try:
            p = HTMLParser3(HTMLTextHandler(out(charset_out)), charset=charset_in)
            p.feed_file(fp).close()
        finally:
            # Release the handle even if parsing fails, but leave stdin
            # open: it is not ours to close and may be named again.
            if fp is not sys.stdin:
                fp.close()