scrapping.py
'''
Imports required for scraping, crawling and analyzing the data from the web pages
'''
import requests
import cloudscraper
from bs4 import BeautifulSoup
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import string
import time
import numpy as np
from nltk.corpus import stopwords
from collections import Counter
from nltk import word_tokenize
from nltk.util import ngrams
from concurrent.futures import ThreadPoolExecutor
import asyncio
# List of parent tags to blacklist; users can change it as they wish
blacklist = ['[document]', 'noscript', 'header',
             'html', 'meta', 'head', 'script', 'style']
'''
This function is used for getting all links from the soup.
It contains a nested helper: getLink(e) takes a link and turns it into a valid URL if it is not one already.
getLink handles all the kinds of hrefs that can appear in the soup object and makes them traversable.
'''
def getLinksFromLinkAndSession(baseURL, session):
    def getLink(e):
        link = e["href"]
        if len(link) < 1:
            return ''
        if link.startswith('//'):  # protocol-relative URL
            return 'http:' + link
        if link.startswith('?'):  # query string relative to the base URL
            if baseURL.endswith('/'):
                return baseURL[:-1] + link
            else:
                return baseURL + link
        if link[0] == '/':  # path relative to the site root
            if baseURL.endswith(link):
                return ''
            if baseURL[-1] != '/':
                return baseURL + link
            else:
                return baseURL + link[1:]
        elif link[0] == '#':  # in-page anchor, not a new page
            return ''
        elif len(link) > 7:  # assume anything longer than 'http://' is already an absolute URL
            return link
        else:
            return ''
    '''
    Each link from the soup is mapped through getLink, which returns the valid URLs one by one
    '''
    try:
        response = session.get(baseURL)  # getting the page
    except Exception:
        return []
    if response.ok:
        html_page = response.text  # extracting text from the response
        soup = BeautifulSoup(html_page, 'lxml')  # creating a soup using the lxml parser
        allLinks = list(map(getLink, soup.find_all('a', href=True)))
        allLinks = [link for link in allLinks if link]  # removing empty links gotten from the page
        return list(set(allLinks))  # remove duplicates and return a list of links
    else:
        return []
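'''
Example usage (a minimal sketch; "https://example.com" is just a placeholder URL, not part of the project):
    with requests.Session() as session:
        links = getLinksFromLinkAndSession("https://example.com", session)
        print(len(links), "links found")
'''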
'''
This function is used for getting all the words and sentences from the soup
'''
def getWordsAndSentencesFromLinkAndSession(link, session):
    try:
        response = session.get(link)  # getting the page
    except Exception:
        return [], []
    if response.ok:
        html_page = response.text  # extracting text from the response
        soup = BeautifulSoup(html_page, 'lxml')  # creating a soup using the lxml parser
        text = soup.find_all(text=True)  # finding all the text nodes in the soup
        output = ''
        outputSentences = []
        '''
        Cleaning the text received from the soup: first drop every element whose parent tag is in the blacklist
        '''
        for t in text:
            if t.parent.name not in blacklist:
                output += '{} '.format(t)
                outputSentences.append('{} '.format(t))
        outputSentences = [i.strip() for i in outputSentences]
        # removing special characters
        outputSentences = [re.sub('[^a-zA-Z0-9]+', ' ', _)
                           for _ in outputSentences]
        # removing tokens that contain digits from the phrases
        outputSentences = [' '.join(s for s in i.split() if not any(
            c.isdigit() for c in s)) for i in outputSentences]
        # removing empty sentences
        outputSentences = [i for i in outputSentences if i]
        # tokenising the phrases into sentences using nltk
        outputSentences = [nltk.tokenize.sent_tokenize(i) for i in outputSentences]
        words = list(output.split(' '))
        words = [re.sub('[^a-zA-Z0-9]+', ' ', _) for _ in words]
        allWords = []
        # filtering out words shorter than three characters and purely numeric words
        for word in words:
            if len(word) > 2:
                wordsInCurrent = word.split(' ')
                for w in wordsInCurrent:
                    if len(w) > 2 and not w.isdecimal():
                        allWords.append(w.lower())  # appending lowercase words to the final list
        return allWords, outputSentences
    else:
        return [], []
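'''
Example usage (a minimal sketch; "https://example.com" is just a placeholder URL):
    with requests.Session() as session:
        words, sentences = getWordsAndSentencesFromLinkAndSession("https://example.com", session)
        # words is a flat list of lowercase words, sentences is a list of sent_tokenize outputs
'''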
'''
This function gets all bigrams (pairs of consecutive words), level by level, from all the sentences and phrases gathered from the site
'''
def getBigrams(allSentences):
    allBigrams = []
    stopWords = set(stopwords.words('english'))  # English stopwords only
    for sentencesPerLevel in allSentences:
        for sentence in sentencesPerLevel:
            token = nltk.word_tokenize(sentence)  # tokenising the phrase
            token = [word.lower() for word in token if word.lower() not in stopWords]  # removing stopwords
            for bi in ngrams(token, 2):  # creating bigrams using ngrams
                allBigrams.append(bi[0] + " " + bi[1])
    # building a bigram frequency counter for plotting graphs
    b = Counter(allBigrams)
    # return all the bigrams sorted in descending order of frequency
    return b.most_common()
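'''
Example usage (a minimal sketch; the nested list below is made-up sample data):
    sample = [["machine learning is fun", "machine learning scales"]]
    print(getBigrams(sample))  # e.g. [('machine learning', 2), ...]
'''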
'''
This function is used to analyze the words level by level for plotting graphs.
The statistics computed are:
1. Count of Words Per Level including stopwords
2. Word Cloud with frequencies without stopwords
3. Average Length of Words Per Level including stopwords
'''
def analyzeWords(allWords):
    level = 1
    words = {}
    countOfWordsPerLevel = []
    averageLengthOfWordsPerLevel = []
    stopWords = set(stopwords.words('english'))  # English stopwords only
    for wordsPerLevel in allWords:
        countOfWordsPerLevel.append(["Level " + str(level), len(wordsPerLevel)])
        # max(1, ...) guards against division by zero on an empty level
        averageLengthOfWordsPerLevel.append(["Level " + str(level), sum(len(s) for s in wordsPerLevel) / max(1, len(wordsPerLevel))])
        level += 1
        filtered_words = [word for word in wordsPerLevel if word not in stopWords]  # filtering out the stopwords
        count = {}
        for i in filtered_words:
            count[i] = count.get(i, 0) + 1  # per-level frequencies, useful for analysing words individually per level
            words[i] = words.get(i, 0) + 1
    w = Counter(words)  # making a counter of the words across all levels
    return w.most_common(), countOfWordsPerLevel, averageLengthOfWordsPerLevel
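'''
Example usage (a minimal sketch; the nested lists below are made-up sample data):
    wordCloud, counts, avgLengths = analyzeWords([["web", "crawler", "web"], ["data"]])
    # counts -> [['Level 1', 3], ['Level 2', 1]]
'''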
'''
The next two functions use a thread pool to crawl a list of URLs concurrently and gather the extracted data.
Running the requests concurrently speeds up the crawling considerably.
'''
async def threadPoolForGettingWordsAndLinks(URLs):
    # plain lists are used here; appending to them is cheap and avoids repeated array copies
    wordsInCurrentLevel = []
    sentenceInCurrentLevel = []
    linksInNextLevel = []
    with ThreadPoolExecutor(max_workers=max(1, len(URLs))) as executor:
        with requests.Session() as session:
            loop = asyncio.get_event_loop()
            # the tasks to perform; here the two groups of tasks run concurrently
            tasks = [
                [loop.run_in_executor(executor, getWordsAndSentencesFromLinkAndSession, *(link, session)) for link in URLs],
                [loop.run_in_executor(executor, getLinksFromLinkAndSession, *(link, session)) for link in URLs]
            ]
            # appending the data gathered from crawling to the lists
            for words, sentences in await asyncio.gather(*tasks[0]):
                wordsInCurrentLevel.extend(words)
                for sentenceGroup in sentences:
                    sentenceInCurrentLevel.extend(sentenceGroup)  # flatten the per-phrase sentence lists
            for links in await asyncio.gather(*tasks[1]):
                linksInNextLevel.extend(links)
    return wordsInCurrentLevel, sentenceInCurrentLevel, linksInNextLevel
async def threadPoolForGettingWords(URLs):
    # plain lists are used here; appending to them is cheap and avoids repeated array copies
    wordsInCurrentLevel = []
    sentenceInCurrentLevel = []
    # the ThreadPoolExecutor service
    with ThreadPoolExecutor(max_workers=max(1, len(URLs))) as executor:
        with requests.Session() as session:
            loop = asyncio.get_event_loop()
            # the tasks to perform
            tasks = [
                [loop.run_in_executor(executor, getWordsAndSentencesFromLinkAndSession, *(link, session)) for link in URLs]
            ]
            # appending the data gathered from crawling to the lists
            for words, sentences in await asyncio.gather(*tasks[0]):
                wordsInCurrentLevel.extend(words)
                for sentenceGroup in sentences:
                    sentenceInCurrentLevel.extend(sentenceGroup)  # flatten the per-phrase sentence lists
    return wordsInCurrentLevel, sentenceInCurrentLevel
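'''
Example of driving one of the coroutines from synchronous code (a minimal sketch; the URL is a placeholder):
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    words, sentences = loop.run_until_complete(threadPoolForGettingWords(["https://example.com"]))
'''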
'''
This is the main function that starts the scraping and crawling. Its parameters are the baseURL and the maximum depth to crawl to.
It uses asyncio together with a ThreadPoolExecutor so that information is gathered faster during crawling.
'''
def startScraping(baseURL, maxLevels):
    visited = {}  # tracks visited links and counts how many times each URL occurs across pages
    allURLs = []  # a list to maintain the URLs per level
    allWords = []  # a list to maintain all the words per level
    allSentences = []  # a list to maintain all the sentences and phrases per level
    for level in range(0, maxLevels + 1):
        if level == 0:
            visited[baseURL] = 1  # marking the URL visited so that it will not be visited again
            l = [baseURL]
            loop = asyncio.new_event_loop()  # starting a new event loop
            asyncio.set_event_loop(loop)  # setting the loop
            allURLs.append(l)
            '''
            If the current level (0) is already the maxLevel we do not need to search for URLs on the page,
            otherwise we also collect the URLs found on it
            '''
            if level == maxLevels:
                future = asyncio.ensure_future(threadPoolForGettingWords(l))
                # running the loop until all the data is gathered
                wordsInCurrentLevel, sentences = loop.run_until_complete(future)
                # adding to the respective lists
                allWords.append(wordsInCurrentLevel)
                allSentences.append(sentences)
            else:
                future = asyncio.ensure_future(threadPoolForGettingWordsAndLinks(l))
                # running the loop until all the data is gathered
                wordsInCurrentLevel, sentencesInCurrentLevel, LinksInNextLevel = loop.run_until_complete(future)
                # adding to the respective lists
                allWords.append(wordsInCurrentLevel)
                allSentences.append(sentencesInCurrentLevel)
                allURLs.append(LinksInNextLevel)
        elif level == maxLevels:
            # at maxLevel we do not collect the links on these pages, only the words and sentences
            URLs = []  # list to hold the valid URLs in the current level
            for link in allURLs[-1]:
                if link not in visited and (not link.startswith("mailto:")) and ("javascript:" not in link) and (not link.endswith((".png", ".jpg", ".jpeg"))):
                    URLs.append(link)
                    visited[link] = 1
                elif link in visited:  # if the link was already visited, increase the counter to track duplicate URLs
                    visited[link] += 1
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            future = asyncio.ensure_future(threadPoolForGettingWords(URLs))
            # running the loop until all the data is gathered
            wordsInCurrentLevel, sentencesInCurrentLevel = loop.run_until_complete(future)
            # adding to the respective lists
            allWords.append(wordsInCurrentLevel)
            allSentences.append(sentencesInCurrentLevel)
        else:
            URLs = []  # list to hold the valid URLs in the current level
            for link in allURLs[-1]:
                if link not in visited and (not link.startswith("mailto:")) and ("javascript:" not in link) and (not link.endswith((".png", ".jpg", ".jpeg"))):
                    visited[link] = 1  # marking the URL visited so that it will not be visited again
                    URLs.append(link)
                elif link in visited:
                    visited[link] += 1
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            future = asyncio.ensure_future(threadPoolForGettingWordsAndLinks(URLs))
            wordsInCurrentLevel, sentencesInCurrentLevel, URLsInNextLevel = loop.run_until_complete(future)
            # adding to the respective lists
            allURLs.append(URLsInNextLevel)
            allWords.append(wordsInCurrentLevel)
            allSentences.append(sentencesInCurrentLevel)
    # analysing the data gathered from crawling
    wordCloudWords, countOfWordsPerLevel, averageLengthOfWordsPerLevel = analyzeWords(allWords)
    allBigrams = getBigrams(allSentences)
    # print(allBigrams)
    wordCloud = []
    bigramCloud = []
    # formatting wordCloudWords and allBigrams for the plotting of graphs
    for key, val in wordCloudWords:
        wordCloud.append({"x": key, "value": val, "category": key})
    for key, val in allBigrams:
        bigramCloud.append({"x": key, "value": val, "category": key})
    return (wordCloud, countOfWordsPerLevel, averageLengthOfWordsPerLevel, bigramCloud)
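'''
A minimal sketch of how the module could be run directly; "https://example.com" and the depth of 1 are placeholder values
'''
if __name__ == "__main__":
    wordCloud, countOfWordsPerLevel, averageLengthOfWordsPerLevel, bigramCloud = startScraping("https://example.com", 1)
    print(countOfWordsPerLevel)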