-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
69 lines (64 loc) · 2.6 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Root URL of the Letterboxd site; scrape_user appends "/<username>/films/..." to it.
DOMAIN = "https://letterboxd.com"
# Lookup table from the star glyphs shown on a poster to their numeric value.
# Built once at import time instead of on every call.
_STAR_RATINGS = {
    "★": 1,
    "★★": 2,
    "★★★": 3,
    "★★★★": 4,
    "★★★★★": 5,
    "½": 0.5,
    "★½": 1.5,
    "★★½": 2.5,
    "★★★½": 3.5,
    "★★★★½": 4.5,
}


def transform_ratings(some_str):
    """
    transforms raw star rating into its numeric value
    :param: some_str: actual star rating, e.g. "★★★½"
    :rtype: returns the numeric value of the given star(s) (int for whole
        stars, float for half stars), or -1 when the input is not a
        recognised star rating
    """
    try:
        return _STAR_RATINGS[some_str]
    except (KeyError, TypeError):
        # KeyError: unknown rating text (including the empty string of an
        # unrated film). TypeError: unhashable input such as a list.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return -1
def _fetch_soup(url, timeout=30):
    """
    downloads a page and parses it with BeautifulSoup's html.parser
    :param: url: absolute URL to request
    :param: timeout: seconds requests waits on the server before giving up
    :rtype: returns the parsed BeautifulSoup document
    """
    # Without an explicit timeout requests can block forever on a stalled server.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')


def _collect_movies(soup, movies_dict):
    """
    appends every film of a page's poster list to the columns of movies_dict
    :param: soup: parsed films page
    :param: movies_dict: dict of lists with the keys 'id', 'title', 'rating',
        'liked' and 'link'; it is extended in place
    """
    poster_list = soup.find("ul", {"class": "poster-list"})
    if poster_list is None:
        # Page without a poster list: nothing to collect.
        return
    for movie in poster_list.find_all("li"):
        poster = movie.find('div')
        viewing_data = movie.find('p', {"class": "poster-viewingdata"})
        # An entry without a viewing-data paragraph has no rating text to
        # read; pass an empty string so it gets the same -1 that
        # transform_ratings returns for any unrecognised rating.
        rating_text = "" if viewing_data is None else viewing_data.get_text().strip()
        movies_dict['id'].append(poster['data-film-id'])
        movies_dict['title'].append(movie.find('img')['alt'])
        movies_dict['rating'].append(transform_ratings(rating_text))
        movies_dict['liked'].append(movie.find('span', {'class': 'like'}) is not None)
        movies_dict['link'].append(poster['data-target-link'])


def scrape_user(username):
    """
    scrapes the films listed on a user's Letterboxd films pages
    :param: username: Letterboxd username whose "/films/" pages are read
    :rtype: returns a pandas DataFrame with the columns 'id', 'title',
        'rating' (see transform_ratings, -1 when unrated/unknown),
        'liked' (bool) and 'link'; it has no rows when no poster list is found
    Network errors raised by requests (including timeouts) propagate to the caller.
    """
    movies_dict = {'id': [], 'title': [], 'rating': [], 'liked': [], 'link': []}
    films_url = DOMAIN + "/" + username + "/films/"
    first_page = _fetch_soup(films_url)

    # check number of pages
    li_pagination = first_page.find_all("li", {"class": "paginate-page"})
    if not li_pagination:
        # No pagination links: everything is on the page already downloaded.
        _collect_movies(first_page, movies_dict)
    else:
        # The text of the last pagination link is the total number of pages.
        page_count = int(li_pagination[-1].find('a').get_text().strip())
        # NOTE(review): page 1 is requested again here; the document above
        # could be reused if "/films/" and "/films/page/1" are confirmed to
        # return the same listing.
        for page_number in range(1, page_count + 1):
            page = _fetch_soup(films_url + "page/" + str(page_number))
            _collect_movies(page, movies_dict)

    return pd.DataFrame(movies_dict)