-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape_codes.py
39 lines (30 loc) · 1.07 KB
/
scrape_codes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import json
import math
from pathlib import Path

import pandas as pd
import requests
from fake_useragent import UserAgent
from tqdm import tqdm
# Pagination settings for the product search.
# NOTE(review): TOTAL_PRODUCTS is hard-coded; presumably the catalogue size
# at the time of writing — confirm before relying on full coverage.
PAGE_SIZE = 100
TOTAL_PRODUCTS = 29677

# Zero-based page indices; ceiling division so a final partial page is kept.
search_pages = range(-(-TOTAL_PRODUCTS // PAGE_SIZE))

# One shared HTTP session for all requests, sent with a Chrome User-Agent.
session = requests.Session()
session.headers["User-Agent"] = UserAgent().chrome
def scrape_all_codes():
    """Collect the code of every product returned by the search API.

    Requests each page index in ``search_pages`` from the Vinmonopolet
    search endpoint (``PAGE_SIZE`` products per page) through the shared
    ``session`` and gathers the ``code`` field of every product listed
    under ``productSearchResult.products`` in the JSON response.

    Returns:
        list: Product codes, in the order the pages returned them.

    Raises:
        requests.HTTPError: If a page request comes back with an error
            status.
        KeyError: If a response lacks the expected JSON keys.
    """
    all_codes = []
    for page in tqdm(search_pages):
        # The page index needs its own "&currentPage=" parameter; without
        # the "&" separator it would be appended to the pageSize value.
        url = (
            "https://www.vinmonopolet.no/api/search?"
            "q=:relevance:visibleInSearch:true"
            "&searchType=product"
            "&fields=FULL"
            f"&pageSize={PAGE_SIZE}"
            f"&currentPage={page}"
        )
        response = session.get(url)
        # Fail on HTTP errors here instead of later while parsing the body.
        response.raise_for_status()
        json_content = json.loads(response.content)
        products = json_content["productSearchResult"]["products"]
        all_codes.extend(product["code"] for product in products)
    return all_codes
# Scrape every product code and persist the result as a one-column CSV.
product_codes = scrape_all_codes()
codes_df = pd.DataFrame(product_codes, columns=["code"])
# to_csv does not create missing directories, so make sure "data/" exists
# first; otherwise the finished scrape would be lost at the write step.
Path("data").mkdir(parents=True, exist_ok=True)
codes_df.to_csv("data/codes.csv")
# Print summary statistics as a quick check of what was collected.
print(codes_df.describe())