-
Notifications
You must be signed in to change notification settings - Fork 0
/
new.py
254 lines (215 loc) · 9.31 KB
/
new.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import os,time,re,math
from selenium.common.exceptions import StaleElementReferenceException
# Pre-processed lookup data, kept in step with the current backend system.
# Maps the district label (used verbatim to pick the dropdown option on the
# page) to its numeric district id.
using_district_list = {
    "ALIPURDUAR": 719,
    "BANKURA": 720,
    "BIRBHUM": 721,
    "COOCHBEHAR": 722,
    "DAKSHIN 24 PARGANA": 740,
    "DAKSHIN DINAJPUR": 723,
    "DARJEELING": 724,
    "HOOGHLY": 725,
    "HOWRAH": 726,
    "JALPAIGURI": 727,
    "JHARGRAM": 728,
    "KALIMPONG": 729,
    "KOLKATA METROPOLITAN AREA": 730,
    "MALDA": 731,
    "MURSHIDABAD": 732,
    "NADIA": 733,
    "PASCHIM BARDHAMAN": 735,
    "PASCHIM MEDINIPUR": 736,
    "PURBA BARDHHAMAN": 737,
    "PURBA MEDINIPUR": 738,
    "PURULIA": 739,
    "UTTAR 24 PARGANA": 734,
    "UTTAR DINAJPUR": 741,
}
# Hospital categories offered by the page.
#   dbkey  - short identifier stored under "dbkey" for this category
#   webkey - 1-based position of the category's radio label on the page
using_hospital_type = {
    "Government Hospital": {"dbkey": "govt", "webkey": 1},
    "Govt. Requisitioned Pvt. Hospital": {"dbkey": "pvtundergovt", "webkey": 2},
    "Private Hospital": {"dbkey": "private", "webkey": 3},
}
# Config
# Page that lists hospital bed availability.
endpoint = 'https://excise.wb.gov.in/CHMS/Public/Page/CHMS_Public_Hospital_Bed_Availability.aspx'
# Location of the chromedriver executable.
PATH_TO_DRIVER = r'/usr/bin/chromedriver'
# Chrome launch options; the two arguments below are optional switches that
# are currently left disabled.
chrome_options = Options()
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--window-size=1920x1080")
# Functions
# Init
def init_scrape(chrome_options,endpoint,PATH_TO_DRIVER=r'/usr/bin/chromedriver'):
    """Start Chrome, open ``endpoint`` and wait for the district dropdown.

    Args:
        chrome_options: Options object passed to ``webdriver.Chrome``.
        endpoint: URL of the page to open.
        PATH_TO_DRIVER: Path of the chromedriver executable.

    Returns:
        A ``(driver, wait)`` tuple: the Chrome driver and a ``WebDriverWait``
        bound to it with a 30 second timeout.

    Raises:
        Any exception raised while opening the page or waiting for the
        dropdown is re-raised after the browser has been closed.
    """
    # NOTE(review): newer Selenium releases expect a Service object instead of
    # ``executable_path`` - confirm against the installed version.
    driver = webdriver.Chrome(executable_path=PATH_TO_DRIVER, options=chrome_options)
    try:
        driver.get(endpoint)
        wait = WebDriverWait(driver, 30)
        wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ContentPlaceHolder1_ddl_District")))
    except BaseException:
        # Do not leave an orphaned browser behind when start-up fails.
        driver.quit()
        raise
    return driver, wait
# Track the loading spinner
# NOTE: a second ``still_loading`` defined later in this file replaces this
# one once the module has been loaded.
def still_loading(driver,wait):
    """Return the CSS ``display`` value of the page's progress element.

    Args:
        driver: Selenium driver with the page loaded.
        wait: Not used by this function; accepted for call-site symmetry.

    Returns:
        The ``display`` CSS property of the element with id
        ``ctl00_ContentPlaceHolder1_upPgo``. Callers in this file treat
        ``"none"`` as "loading finished". If the element cannot be read the
        string ``"done"`` is returned, which is not ``"none"``, so callers
        polling for ``"none"`` keep waiting.
    """
    try:
        spinner = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_upPgo"]')
        return spinner.value_of_css_property("display")
    except Exception:
        # ``Exception`` rather than a bare except so that Ctrl-C still
        # interrupts a caller's polling loop.
        return "done"
# Select Radio Buttons
def select_hospital_type(driver,label):
    """Click the hospital-type radio label at 1-based position ``label``.

    Args:
        driver: Selenium driver with the page loaded.
        label: Position of the ``label`` element inside the
            ``ctl00_ContentPlaceHolder1_rdo_Govt_Flag`` container.

    Raises:
        Exception: If the label cannot be found or clicked; the underlying
            error is attached as ``__cause__``.
    """
    radio_xpath = "//*[@id='ctl00_ContentPlaceHolder1_rdo_Govt_Flag']/label[{}]".format(label)
    try:
        driver.find_element(By.XPATH, radio_xpath).click()
        # Sets the driver's implicit element-lookup timeout to 10 seconds for
        # later lookups; this call does not pause execution.
        driver.implicitly_wait(10)
    except Exception as err:
        raise Exception("ERROR : Can't Select Hospital Type") from err
# Click on the given district in the dropdown and wait for the results to load
def select_district_and_wait_until_load(driver,wait,district_name):
    """Select ``district_name`` in the district dropdown and wait for the grid.

    Args:
        driver: Selenium driver with the page loaded.
        wait: ``WebDriverWait`` used both to poll the loading indicator and
            to wait for the result grid.
        district_name: Exact text of the dropdown option to click.

    Raises:
        Exception: If the option cannot be clicked, or the loading indicator
            or result grid is not ready within the wait's timeout; the
            underlying error is attached as ``__cause__``.
    """
    option_xpath = "//select[@name='ctl00$ContentPlaceHolder1$ddl_District']/option[text()='{}']".format(district_name)
    try:
        driver.find_element(By.XPATH, option_xpath).click()
        # Poll through the explicit wait: it sleeps between checks and gives
        # up after its timeout instead of spinning forever.
        wait.until(lambda drv: still_loading(drv, wait) == "none")
        # Sets the implicit element-lookup timeout to 5 seconds (not a pause).
        driver.implicitly_wait(5)
        wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ContentPlaceHolder1_GridView2")))
    except Exception as err:
        raise Exception("ERROR : Can't select district") from err
# Get no of pages (Pagination) for selected district
def no_of_pages_for_selected_district_and_type(driver):
    """Return how many result pages the current selection has.

    The count is the number of cells in the pager row of the
    ``ctl00_ContentPlaceHolder1_GridView2`` table. When no pager cells are
    present the result is reported as a single page.

    Args:
        driver: Selenium driver with the results loaded.

    Returns:
        The page count, never less than 1.

    Raises:
        Exception: If the pager lookup fails; the underlying error is
            attached as ``__cause__``.
    """
    pager_xpath = "//*[@id='ctl00_ContentPlaceHolder1_GridView2']/tbody/tr[1]/td/table/tbody/tr/td"
    try:
        pager_cells = driver.find_elements(By.XPATH, pager_xpath)
    except Exception as err:
        raise Exception("ERROR : Can't get number of pages for selected district and type") from err
    return max(len(pager_cells), 1)
# Select page
def select_page_pagination_section(driver,page_no,wait=None):
    """Click pager link ``page_no`` and wait for the grid to reload.

    Args:
        driver: Selenium driver with the results loaded.
        page_no: 1-based index of the pager cell whose link is clicked.
        wait: Optional ``WebDriverWait`` to use. When omitted, a 30 second
            wait is created for ``driver`` so the function no longer depends
            on a module-level ``wait`` variable existing.

    Raises:
        Exception: If the link cannot be clicked, or the loading indicator or
            grid is not ready within the wait's timeout; the underlying error
            is attached as ``__cause__``.
    """
    if wait is None:
        wait = WebDriverWait(driver, 30)
    link_xpath = "//*[@id='ctl00_ContentPlaceHolder1_GridView2']/tbody/tr[1]/td/table/tbody/tr/td[{}]/a".format(page_no)
    try:
        driver.find_element(By.XPATH, link_xpath).click()
        # Poll through the explicit wait: it sleeps between checks and gives
        # up after its timeout instead of spinning forever.
        wait.until(lambda drv: still_loading(drv, wait) == "none")
        # Sets the implicit element-lookup timeout to 2 seconds (not a pause).
        driver.implicitly_wait(2)
        wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ContentPlaceHolder1_GridView2")))
    except Exception as err:
        raise Exception("ERROR : Switching page for pagination") from err
def try_to_switch_to_first_page(driver,wait=None):
    """Best-effort click on the first pager link, then wait for the grid.

    Any failure is reported with a printed message and otherwise ignored.

    Args:
        driver: Selenium driver with the results loaded.
        wait: Optional ``WebDriverWait`` to use. When omitted, a 30 second
            wait is created for ``driver`` so the function no longer depends
            on a module-level ``wait`` variable existing.
    """
    if wait is None:
        wait = WebDriverWait(driver, 30)
    first_link_xpath = "//*[@id='ctl00_ContentPlaceHolder1_GridView2']/tbody/tr[1]/td/table/tbody/tr/td[{}]/a".format(1)
    try:
        driver.find_element(By.XPATH, first_link_xpath).click()
        # Poll through the explicit wait: it sleeps between checks and gives
        # up after its timeout instead of spinning forever.
        wait.until(lambda drv: still_loading(drv, wait) == "none")
        # Sets the implicit element-lookup timeout to 2 seconds (not a pause).
        driver.implicitly_wait(2)
        wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ContentPlaceHolder1_GridView2")))
    except Exception:
        # ``Exception`` rather than a bare except so Ctrl-C is not swallowed.
        print("ERROR : Maybe first page is already selected !")
# Click on all the "View Detailed break up option"
def toggle_detailed_break_up_section(driver):
    """Click the detailed break-up link of every hospital card on the page.

    Every link matching the card XPath is clicked, not just the first match,
    so that each card's bed-category section is toggled.

    Args:
        driver: Selenium driver with the hospital cards loaded.

    Raises:
        Exception: If no link is found or a click fails; the underlying error
            is attached as ``__cause__``.
    """
    link_xpath = "//*[contains(@id,'_div_card')]/div[2]/div[2]/div[1]/a"
    try:
        links = driver.find_elements(By.XPATH, link_xpath)
        if not links:
            # Keep failing when nothing matches, as a single-element lookup would.
            raise LookupError("no detailed break up link found")
        for link in links:
            link.click()
        # Sets the implicit element-lookup timeout to 5 seconds (not a pause).
        driver.implicitly_wait(5)
    except Exception as err:
        raise Exception("ERROR : Can't open detailed break up section") from err
def MobileCleanData(data):
    """Parse one or more phone numbers out of ``data`` as integers.

    The database only has a big-integer field, so every number is reduced to
    its digits and converted with ``int``. Several numbers may be separated by
    "/" or "," (both separators may appear in the same value). The characters
    "+", "(", ")", "-" and spaces are removed from each piece before
    conversion; empty pieces are skipped.

    Args:
        data: Raw phone text. Non-string values are converted with ``str``;
            ``None`` is treated as empty text.

    Returns:
        A tuple ``(count, numbers)`` where ``numbers`` is the list of parsed
        integers and ``count`` is its length. If any piece cannot be converted
        the whole parse is abandoned and ``(0, [0])`` is returned.
    """
    text = "" if data is None else str(data)
    # One pass removes every decoration character.
    strip_table = str.maketrans("", "", "+()- ")
    result = []
    for item in re.split(r"[/,]", text):
        tmp = item.translate(strip_table)
        # Skip blank pieces before converting, to avoid a needless failure.
        if tmp.strip() == "":
            continue
        try:
            result.append(int(tmp))
        except ValueError:
            print(f"{item} failed")
            return 0,[0]
    return len(result),result
# True only while scrape_data() is running.
scraping = False
# Main Driver function for scraping
def scrape_data(driver):
    """Print the details of every hospital card on the current page.

    For each card the name, address, first phone number, total/available
    beds and verification text are printed, followed by the per-category
    bed figures. The module-level ``scraping`` flag is set for the duration
    of the call and always cleared again, even when an error occurs.

    Args:
        driver: Selenium driver with the hospital cards loaded.
    """
    # Without this declaration the assignments below would only bind a local
    # name and the module-level flag would never change.
    global scraping
    scraping = True
    try:
        # Get the list of hospital entries
        entries = driver.find_elements(By.XPATH, "//*[contains(@id,'_div_card')]")
        if not entries:
            print("No items found")
            return
        toggle_detailed_break_up_section(driver)
        # Number of hospitals under the selected category and district
        print(str(len(entries)) + " items fetched.")
        for card in entries:
            print("Name : {}".format(card.find_element(By.XPATH, ".//h5").text))
            print("Address : {}".format(card.find_element(By.XPATH, "(.//div/div/div)[1]").text))
            phone_numbers = MobileCleanData(card.find_element(By.XPATH, "(.//div/div/div)[2]//a").text)[1]
            # An empty phone field parses to an empty list; print 0 in that
            # case, the same value used when parsing fails.
            print("Phoneno : {}".format(phone_numbers[0] if phone_numbers else 0))
            print("Total Beds : {}".format(card.find_element(By.XPATH, ".//div[2]/div[1]/div[4]/div/ul/li[1]/h3").text))
            print("Available Beds : {}".format(card.find_element(By.XPATH, ".//div[2]/div[1]/div[4]/div/ul/li[2]/h3").text))
            print("Verified On : {}".format(card.find_element(By.XPATH, ".//div[3]/small").text))
            print("------------------------")
            bed_categories = card.find_elements(By.XPATH, ".//div[2]/div[2]/div[2]/div/div")
            print("Categories of beds : {}".format(len(bed_categories)))
            for category in bed_categories:
                # The detailed break-up section must have been opened first,
                # otherwise these fields come back blank.
                try:
                    print("Category : {}".format(category.find_element(By.XPATH, ".//div/div[1]").text))
                    print("Available : {}".format(category.find_element(By.XPATH, ".//div/div[2]/div/div[4]/div/ul/li[2]/h3").text))
                    print("Total : {}".format(category.find_element(By.XPATH, ".//div/div[2]/div/div[4]/div/ul/li[1]/h3").text))
                except Exception:
                    print("Due to error skipping the category section")
                print("----category_ end-------\n")
            print("----------END------------")
    finally:
        scraping = False
# NOTE: this definition follows an earlier ``still_loading`` in the same file
# and is therefore the one in effect at runtime.
def still_loading(driver,wait):
    """Return the CSS ``display`` value of the page's progress element.

    Args:
        driver: Selenium driver with the page loaded.
        wait: Not used by this function; accepted for call-site symmetry.

    Returns:
        The ``display`` CSS property of the element with id
        ``ctl00_ContentPlaceHolder1_upPgo``. Callers in this file treat
        ``"none"`` as "loading finished". If the element cannot be read the
        string ``"error"`` is returned, which is not ``"none"``, so callers
        polling for ``"none"`` keep waiting.
    """
    try:
        spinner = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_upPgo"]')
        return spinner.value_of_css_property("display")
    except Exception:
        # ``Exception`` rather than a bare except so that Ctrl-C still
        # interrupts a caller's polling loop.
        return "error"
# Script body: walk every district and hospital type, page through the
# results and print each page's hospital details.
driver,wait = init_scrape(chrome_options,endpoint)
try:
    for district_lable in using_district_list:
        for hospital_type, type_keys in using_hospital_type.items():
            select_hospital_type(driver, type_keys["webkey"])
            select_district_and_wait_until_load(driver, wait, district_lable)
            time.sleep(5)
            no_of_pages = no_of_pages_for_selected_district_and_type(driver)
            print(f"\n\n{no_of_pages}\n\n")
            for pg_no in range(no_of_pages):
                if pg_no == 0:
                    # Best effort: the first page is usually already showing.
                    try_to_switch_to_first_page(driver)
                else:
                    # Pager links are 1-based.
                    select_page_pagination_section(driver, pg_no + 1)
                time.sleep(10)
                # Wait (with pauses and a timeout) until the loading
                # indicator is hidden before reading the page.
                wait.until(lambda drv: still_loading(drv, wait) == "none")
                scrape_data(driver)
                print(f"Exiting from District : {district_lable} Type : {hospital_type} Page No : {pg_no}")
finally:
    # Always close the browser, including when a step above raises.
    driver.quit()