-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
68 lines (61 loc) · 2.23 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import re
import os
import time
import ssl
import requests
import lxml
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
from pathlib import Path
import selenium
from selenium import webdriver
def scroll_down(driver):
    """Scroll the current page down in growing, then shrinking, steps.

    The first loop issues ``window.scrollBy`` calls with offsets 0, 1, 2, ...
    until the cumulative offset reaches half of ``document.body.scrollHeight``.
    The second loop then replays those offsets in reverse order (down to 1),
    so the page ends up scrolled by roughly its full height.
    Presumably this gradual scroll is meant to trigger lazily loaded
    content -- confirm against the target site.

    Args:
        driver: Object exposing ``execute_script(script)``; the script
            ``return document.body.scrollHeight`` must yield the page height
            in pixels (a missing/``None`` result is treated as 0).
    """
    page_height = int(
        driver.execute_script("return document.body.scrollHeight") or 0
    )
    half_height = page_height / 2
    total_scrolled = 0
    # Default to 0 so that a page too short to ever reach the halfway mark
    # (height 0 or 1) simply skips the ramp-down loop instead of failing on
    # an unassigned variable.
    last_no = 0

    # Ramp up: each step scrolls one pixel further than the previous one.
    for step in range(page_height):
        driver.execute_script(f'window.scrollBy(0,{step});')
        total_scrolled += step
        if total_scrolled >= half_height:
            last_no = step
            break

    # Ramp down: same offsets in reverse, still scrolling downwards.
    for step in range(last_no, 0, -1):
        driver.execute_script(f'window.scrollBy(0,{step});')
def imagescrape(
    *,
    driver_path: str = './chromedriver.exe',
    output_dir: str = './output',
    base_url: str = 'https://stock.adobe.com/fr/search?gallery_id=Pnb3vT0akesPgEDqaqSlBRifOFBa3LoJ',
    page_max: int = 4,
    page_start: int = 1,
) -> None:
    """Download the ``.jpg`` images found on each page of a paginated gallery.

    For every page number from ``page_start`` to ``page_max`` (inclusive) the
    page ``base_url + '&search_page=<n>'`` is opened in Chrome, scrolled with
    ``scroll_down``, parsed with BeautifulSoup, and each ``<img>`` whose
    ``src`` contains ``.jpg`` is saved into ``output_dir`` under the last
    path component of its URL. A progress line
    ``Page <n> <images on page> <running total>`` is printed per page.

    Errors are reported with ``print`` rather than raised: a failed download
    skips that image only, any other failure ends the run.

    Args:
        driver_path: Path to the chromedriver executable.
        output_dir: Directory that receives the images; created if missing.
        base_url: Gallery URL to which the ``search_page`` parameter is
            appended.
        page_max: Last page number to scrape (inclusive).
        page_start: First page number to scrape; raise it to resume a run.
    """
    try:
        # exist_ok avoids the check-then-create race and also creates any
        # missing parent directories.
        os.makedirs(output_dir, exist_ok=True)

        # Escaped dot: match a literal ".jpg", not "any character + jpg".
        jpg_src = re.compile(r'\.jpg')

        # NOTE(review): newer Selenium releases replace the executable_path
        # argument with a Service object -- confirm against the installed
        # version before changing this call.
        driver = webdriver.Chrome(executable_path=driver_path)
        try:
            total_img = 0
            for page in range(page_start, page_max + 1):
                driver.get(f'{base_url}&search_page={page}')
                scroll_down(driver)
                data = driver.execute_script('return document.documentElement.outerHTML')
                scraper = BeautifulSoup(data, 'lxml')
                img_container = scraper.find_all('img', src=jpg_src)

                # The last matching <img> is deliberately left out, as in the
                # original count of len(...) - 1. Slicing keeps the count at 0
                # (not -1) when a page has no matches.
                # NOTE(review): confirm the last match really is not a
                # gallery image.
                images = img_container[:-1]
                nb_img = len(images)
                total_img += nb_img
                print(f'Page {page} {nb_img} {total_img}')

                for img in images:
                    img_src = img.get('src')
                    try:
                        urlretrieve(img_src, os.path.join(output_dir, os.path.basename(img_src)))
                    except Exception as e:
                        # Best effort: report the failed download and carry
                        # on with the remaining images.
                        print(e)
        finally:
            # quit() ends the whole browser session (close() only closes the
            # current window) and now runs even when a page fails.
            driver.quit()
    except Exception as e:
        # Top-level boundary of the script: report instead of crashing.
        print(e)
def main() -> None:
    """Script entry point: run the image scraper."""
    imagescrape()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()