# Scrape the Seeed Studio wiki for Grove sensor pages and write one
# "title;view count;URL" row per page to grove.csv
import re

import requests
from bs4 import BeautifulSoup

# Fetch the main page
main = 'http://www.seeedstudio.com/wiki/Main_Page'
print('Fetching Main Page')
r = requests.get(main)
soup = BeautifulSoup(r.text, 'html.parser')

# Collect every link on the page
links = []
for anchor in soup.find_all('a', href=True):
    links.append(anchor['href'])

# Keep only relative wiki links that point at Grove sensor pages
# (e.g. '/wiki/Grove_-_...') and turn them into absolute URLs
temp = []
for link in links:
    if link.startswith('/') and len(link) > 6 and link[6] == 'G':
        temp.append('http://www.seeedstudio.com' + link)
links = temp
print(len(links), 'links found')

op_file = open('grove.csv', 'w')
for i, link in enumerate(links):
    # Fetch each Grove sensor page
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'html.parser')
    print(i, 'of', len(links), ':')

    # Page title: drop non-ASCII characters (e.g. a degree sign) so the
    # CSV stays plain ASCII
    heading = soup('h1', {'id': 'firstHeading'})[0]
    p_head = heading('span', {'dir': 'auto'})[0].string
    p_h = str(p_head).encode('ascii', errors='ignore').decode('ascii')
    print(p_h)
    op_file.write(p_h + ';')

    # Page view count: pull the digits out of the footer text
    str1 = soup('li', {'id': 'footer-info-viewcount'})[0].string
    matches = re.findall(r'\d+', str1)
    p_vc = ','.join(matches)
    print(p_vc)
    op_file.write(p_vc + ';')

    # Page URL
    print(link, '\n')
    op_file.write(link + '\n')
op_file.close()