-
Notifications
You must be signed in to change notification settings - Fork 0
/
WWScrap.py
68 lines (63 loc) · 2.2 KB
/
WWScrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import urllib.request as urlRequest
import urllib.parse as urlParse
import sys
import codecs as cs
import os
def MakeAChapter(page):
    """Return the line that follows the ``class="fr-view">`` marker in *page*.

    The text located after the first marker (and before the next marker, if
    there is one) is split into lines; the second of those lines, i.e. the
    line right after the one that holds the marker, is returned.

    Args:
        page: Decoded HTML source of a chapter page.

    Returns:
        The second line of the text that follows the marker.

    Raises:
        IndexError: If the marker is absent from *page*, or if fewer than two
            lines follow it.
    """
    marker = 'class="fr-view">'
    pieces = page.split(marker)
    if len(pieces) < 2:
        # Same exception type as the bare index lookup would raise, but with a
        # message that says what was actually missing from the page.
        raise IndexError("chapter marker {!r} not found in page".format(marker))
    lines = pieces[1].splitlines()
    if len(lines) < 2:
        raise IndexError(
            "no chapter line found after marker {!r}".format(marker)
        )
    return lines[1]
def MakeAWebpage(pathname, meme=True, default=""):
    """Download *pathname* and store it inside the ``htmlCache`` directory.

    Args:
        pathname: URL to fetch. When *meme* is true, its last "/"-separated
            segment names the cached file (``htmlCache/<segment>.html``).
        meme: When true, only the chapter line extracted by ``MakeAChapter``
            is stored. When false, the whole decoded page is stored.
        default: File name inside ``htmlCache`` used when *meme* is false.

    Returns:
        None. The result is the file written to disk.

    Raises:
        Nothing is caught here: errors from the request, from decoding the
        response as UTF-8, from ``MakeAChapter`` and from the file system all
        propagate to the caller.
    """
    chapterName = pathname.split("/")[-1]
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"}
    req = urlRequest.Request(pathname, headers=headers)
    # The context manager releases the connection even if read() fails.
    with urlRequest.urlopen(req) as response:
        sourceCode = response.read()

    os.makedirs("htmlCache", exist_ok=True)

    # Build the full content before touching the output file, so a decoding or
    # extraction failure does not leave an empty file in the cache.
    decoded = sourceCode.decode("utf-8")
    if meme:
        target = "htmlCache/" + chapterName + ".html"
        content = MakeAChapter(decoded)
    else:
        target = "htmlCache/" + default
        content = decoded

    # newline="" writes line endings exactly as they appear in the page.
    with open(target, "w", encoding="utf-8", newline="") as html:
        html.write(content)
def GatherCompleteSummaryPages():
    """Download the index page named on the command line and list its links.

    The last two "/"-separated segments of ``sys.argv[1]`` are appended to
    ``https://www.wuxiaworld.com/``. That page is saved as
    ``htmlCache/index.html`` through ``MakeAWebpage`` and then read back from
    disk.

    Returns:
        A list of strings. Only the text after the first ``panel-body`` (and
        before the next one, if any), cut off at the first ``id="sidebar"``,
        is scanned. For each line there that contains ``a href``, the text
        between the first two double quotes of the line is collected.
        NOTE(review): this presumably yields the href value, which holds only
        if href is the first quoted attribute on the line - confirm against
        the live markup.

    Raises:
        IndexError: If no command-line argument was given, if ``panel-body``
            does not occur in the page, or if a matching line has no double
            quote.
    """
    index_url = "https://www.wuxiaworld.com/" + "/".join(sys.argv[1].split("/")[-2:])
    MakeAWebpage(index_url, False, "index.html")

    # The context manager closes the file even if reading or decoding fails.
    with open("htmlCache/index.html", "r", encoding="utf-8") as memeFile:
        wholeIndex = memeFile.read()

    chapterSection = wholeIndex.split("panel-body")[1].split('id="sidebar"')[0]
    return [
        line.split('"')[1]
        for line in chapterSection.splitlines()
        if "a href" in line
    ]
def WriteSummary(listeLiens, outPath="htmlCache/toc.html"):
    """Download every linked chapter and write an HTML table of contents.

    Args:
        listeLiens: Iterable of link strings. Each one is appended to
            ``https://www.wuxiaworld.com`` and fetched with ``MakeAWebpage``.
        outPath: Path of the table-of-contents file to write. Its parent
            directory is created when it does not exist yet.

    Returns:
        None. The result is the UTF-8 file written at *outPath*, containing
        one ``<a href="<name>.html">`` entry per link, where ``<name>`` is the
        last "/"-separated segment of the link.

    Raises:
        Nothing is caught here: errors from ``MakeAWebpage`` and from the file
        system propagate, and in the former case no file is written.
    """
    header = (
        "\n"
        "<html>\n"
        "<body>\n"
        "<h1>Table of Contents</h1>\n"
        '<p style="text-indent:0pt">\n'
    )
    footer = (
        "\n"
        "</p>\n"
        "</body>\n"
        "</html>\n"
    )

    # Collect the pieces and join once instead of growing a string in a loop.
    parts = [header]
    for lien in listeLiens:
        MakeAWebpage("https://www.wuxiaworld.com" + lien)
        chapterName = lien.split("/")[-1]
        parts.append('<a href="{0}.html">{0}</a><br/>\r\n'.format(chapterName))
    parts.append(footer)

    # With an empty link list nothing else has created the cache directory.
    outDir = os.path.dirname(outPath)
    if outDir:
        os.makedirs(outDir, exist_ok=True)

    # newline="" keeps the mixed "\n" / "\r\n" endings exactly as built above.
    with open(outPath, "w", encoding="utf-8", newline="") as tocHTML:
        tocHTML.write("".join(parts))
# Script body: collect the links found on the index page named by the first
# command-line argument, then download each linked page and write the table
# of contents to the default location (htmlCache/toc.html).
# NOTE(review): there is no `if __name__ == "__main__":` guard, so these two
# statements also run when the module is imported - confirm nothing imports it.
maListe=GatherCompleteSummaryPages()
WriteSummary(maListe)