# parser.py -- forked from nava45/flipkart-scraper
from lxml import etree
from itertools import izip_longest
from config import MONGO_STORAGE_INPUT_DICT
from models import insert
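# Parsers for scraped search-result pages: FParser walks a Flipkart results
# page (grid or list layout, hence the xpaths unioned with "|" below) and
# Amazon swaps in selectors for Amazon search results. Parsed items are
# pushed to storage via models.insert using the MONGO_STORAGE_INPUT_DICT
# template from config.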
# TODO: make sure price and rating stay aligned with their item
class FParser(object):

    def __init__(self, page, pformat='html'):
        # drop <b> markup before building the DOM
        self.data = page.replace('<b>', '').replace('</b>', '')
        self.data_format = pformat
        self.dom = etree.HTML(self.data)
        # container holding all search results
        self.products_xpath = '//div[@id="products"]'
        # one column per product in the grid layout
        self.product_col_xpath = '//div[@class="gd-col gu3"]'
        # each field xpath unions the grid-view and list-view markup
        self.title_path = '//a[@data-tracking-id="prd_title"]/text() | //a[@class="lu-title"]/text()'
        # ratings xpath is approximate, not exact
        self.ratings_path = '//div[@class="pu-rating"]/text() | //div[@class="rating-wrapper"]/text()'
        self.price_path = '//span[@class="fk-font-17 fk-bold"]/text() | //div[@class="pu-final fk-font-17 fk-bold"]/text()'
        self.product_url = '//a[@data-tracking-id="prd_title"]/@href | //a[@class="lu-title"]/@href'
    def __parse(self):
        pass

    def get_all_cols(self, xp):
        # keep the products container around so items() can fall back to it;
        # return an empty list instead of crashing when it is missing
        matches = self.dom.xpath(self.products_xpath)
        self.prod = matches[0] if matches else None
        if self.prod is None:
            return []
        return self.prod.xpath(xp)

    def __get_matches_list(self, dom, _xpath):
        return [_.strip() for _ in dom.xpath(_xpath)]
    def items(self):
        pitems = self.get_all_cols(self.product_col_xpath)
        if pitems:
            # grid view
            titles = self.__get_matches_list(pitems[0], self.title_path)
            ratings = self.__get_matches_list(pitems[0], self.ratings_path)
            prices = self.__get_matches_list(pitems[0], self.price_path)
            landing_page_url = self.__get_matches_list(pitems[0], self.product_url)
        elif self.prod is not None:
            # list view
            titles = self.__get_matches_list(self.prod, self.title_path)
            ratings = self.__get_matches_list(self.prod, self.ratings_path)
            prices = self.__get_matches_list(self.prod, self.price_path)
            landing_page_url = self.__get_matches_list(self.prod, self.product_url)
        else:
            return
        # ratings xpath is approximate, so it goes last: izip_longest pads the
        # shorter lists with None instead of dropping items
        for i in izip_longest(titles, prices, landing_page_url, ratings):
            yield i
    def store(self):
        for i in self.items():
            if i:
                # copy the template so stored documents don't share one dict
                data_layer = dict(MONGO_STORAGE_INPUT_DICT)
                # order matches what items() yields
                data_layer['name'], data_layer['price'], data_layer['url'], data_layer['rating'] = i
                insert(data_layer)
class Amazon(FParser):

    def __init__(self, page, pformat='html'):
        # reuse FParser's setup, then swap in Amazon search-result selectors
        super(Amazon, self).__init__(page, pformat)
        self.products_xpath = '//ul[@id="s-results-list-atf"]'
        self.product_col_xpath = '//li[@class="s-result-item"]'
        self.title_path = '//h2[@class="a-size-medium a-color-null s-inline s-access-title a-text-normal"]/text()'
        # ratings xpath is approximate, not exact
        self.ratings_path = '//i[@class="a-icon a-icon-star a-star-5"]/text()'
        self.price_path = '//span[@class="a-size-base a-color-price s-price a-text-bold"]/text()'
        self.product_url = '//a[@class="a-link-normal s-access-detail-page a-text-normal"]/@href'
"""
data = open('out.html','rb').read()
fp = Amazon(data)
for i in fp.items():
if i:
data_layer = MONGO_STORAGE_INPUT_DICT
data_layer['name'],data_layer['price'],data_layer['rating'],data_layer['url'] = i
print data_layer
#insert(data_layer)
"""