forked from DidierStevens/DidierStevensSuite
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractscripts.py
77 lines (59 loc) · 1.89 KB
/
extractscripts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/python
# V1.1 15/01/2007 - 10/07/2007
# Source code put in public domain by Didier Stevens, no Copyright
# https://DidierStevens.com
# Use at your own risk
#
# History:
# 10/07/2007: Handle comments inside script tags
import sgmllib
import sys
class MyParser(sgmllib.SGMLParser):
"A simple parser class."
def parse(self, s):
"Parse the given string 's'."
self.feed(s)
self.close()
def __init__(self, verbose=0):
"Initialise an object, passing 'verbose' to the superclass."
sgmllib.SGMLParser.__init__(self, verbose)
self.hyperlinks = []
self.inSCRIPT = 0
self.SCRIPTdata = ""
self.countScripts = 0
self.scriptingLanguage = ""
def start_script(self, attributes):
"Process a <script> tag."
self.scriptingLanguage = ""
for name, value in attributes:
if name == "language":
self.scriptingLanguage = value
self.inSCRIPT = 1
self.SCRIPTdata = ""
self.countScripts += 1
def end_script(self):
"Process a </script> tag."
self.inSCRIPT = 0
fScript = open("script.%d.%s" % (self.countScripts, self.scriptingLanguage), "w")
fScript.write(self.SCRIPTdata)
fScript.close()
def handle_data(self, s):
"Process data between <script> tags"
if self.inSCRIPT == 1:
self.SCRIPTdata = self.SCRIPTdata + s
def handle_comment(self, s):
"Process data between comment tags"
if self.inSCRIPT == 1:
self.SCRIPTdata = self.SCRIPTdata + s
def get_SCRIPTdata(self):
"Return the text between <script> tags."
return self.SCRIPTdata
# Command-line entry point: extract the scripts of the HTML file named by the
# single command-line argument, or print a usage line when the argument count
# is wrong.
if len(sys.argv) != 2:
    # Parenthesised form prints the same text and also parses on Python 3.
    print("Usage: extractscripts html-file")
else:
    fHTML = open(sys.argv[1], "r")
    try:
        # try/finally (rather than 'with') closes the file even when read()
        # fails while staying runnable on the old Python 2 releases this
        # script was written for.
        s = fHTML.read()
    finally:
        fHTML.close()
    del fHTML
    myparser = MyParser()
    myparser.parse(s)