-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape_products.py
43 lines (32 loc) · 1.27 KB
/
scrape_products.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import requests
from fake_useragent import UserAgent
import json
from tqdm import tqdm
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
# One HTTP session is shared by every request made in this module; it sends a
# Chrome user-agent string generated by fake_useragent.
session = requests.Session()
session.headers["User-Agent"] = UserAgent().chrome

# Table of products to scrape; the functions below read its "code" column.
codes = pd.read_csv("data/codes.csv")
def scrape_single_product(code, timeout=30):
    """Fetch the full JSON record for one product from the Vinmonopolet API.

    Args:
        code: Product code interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, a stalled connection would block the caller (or a pool
            worker) indefinitely.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    url = f"https://www.vinmonopolet.no/api/products/{code}?fields=FULL"
    response = session.get(url, timeout=timeout)
    # The HTTP status is deliberately not checked: whatever body the server
    # sends is parsed as-is, exactly as before.
    return json.loads(response.content)
def scrape_products():
    """Scrape every product in ``codes["code"]`` one after another.

    A tqdm progress bar tracks the iteration.

    Returns:
        A list holding the result of ``scrape_single_product`` for each code,
        in the order the codes appear in ``codes["code"]``.
    """
    return [
        scrape_single_product(product_code)
        for product_code in tqdm(codes["code"])
    ]
def scrape_products_mulithreaded(max_workers=10):
    """Scrape every product in ``codes["code"]`` using a thread pool.

    Each code is submitted to ``scrape_single_product`` on a worker thread.
    Results are stored at the position of their code, so the returned list
    has the same order as ``scrape_products`` regardless of which request
    finishes first.

    Args:
        max_workers: Maximum number of concurrent worker threads.

    Returns:
        A list with one scraped record per code, in the order the codes
        appear in ``codes["code"]``.

    Raises:
        Exception: Whatever ``scrape_single_product`` raised in a worker is
            re-raised here by ``future.result()``.
    """
    product_codes = list(codes["code"])
    # Pre-sized so each result can be written to its input position.
    product_data = [None] * len(product_codes)

    with tqdm(total=len(product_codes)) as progressbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Map each future back to the index of the code it is fetching.
            positions = {
                executor.submit(scrape_single_product, code): index
                for index, code in enumerate(product_codes)
            }
            # as_completed keeps the progress bar moving as soon as any
            # request finishes, independent of submission order.
            for future in as_completed(positions):
                product_data[positions[future]] = future.result()
                progressbar.update(1)

    return product_data
# Run the threaded scrape, tabulate the records, and persist them.
scraped_product_data = scrape_products_mulithreaded()
products_df = pd.DataFrame(scraped_product_data)
products_df.to_csv("data/product_data.csv")

# Print pandas' summary statistics for the scraped table.
print(products_df.describe())