Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed parsing problems and made all methods return a JSON object instead of a string #16

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 102 additions & 117 deletions PyMovieDb/imdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import json
import requests
from PyMovieDb import ImdbParser
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
Expand Down Expand Up @@ -40,16 +40,17 @@ class IMDB:
#8. popular_tv(genre=None, start_id=1, sort_by=None)
-- to get IMDB popular Tv-Series
"""
def __init__(self):
self.session = HTMLSession()
self.headers = {
def __init__(self, lang="en"):
self.session = requests.session()
self.session.headers.update({
"Accept": "application/json, text/plain, */*",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36",
"Referer": "https://www.imdb.com/"
}
"Referer": "https://www.imdb.com/",
"Accept-Language": lang
})
self.baseURL = "https://www.imdb.com"
self.search_results = {'result_count': 0, 'results': []}
self.NA = json.dumps({"status": 404, "message": "No Result Found!", 'result_count': 0, 'results': []})
self.NA = {"status": 404, "message": "No Result Found!", 'result_count': 0, 'results': []}

# ..................................method to search on IMDB...........................................
def search(self, name, year=None, tv=False, person=False):
Expand All @@ -59,7 +60,7 @@ def search(self, name, year=None, tv=False, person=False):
@parameter-2:- <int:year> OPTIONAL, release year of query/movie/tv/file to search.
@parameter-3:- <bool:tv> OPTIONAL, to filter/limit/bound search results only for 'TV Series'.
@parameter-4:- <bool:person> OPTIONAL, to filter search results only for person.
@returns:- A JSON string:
@returns:- A JSON object:
- {'result_count': <int:total_search_results>, 'results': <list:list_of_files/movie_info_dict>}
"""
assert isinstance(name, str)
Expand All @@ -79,73 +80,76 @@ def search(self, name, year=None, tv=False, person=False):
except requests.exceptions.ConnectionError as e:
response = self.session.get(url, verify=False)

# results = response.html.xpath("//table[@class='findList']/tr")
results = response.html.xpath("//section[@data-testid='find-results-section-title']/div/ul/li")
#print(len(results))
if tv is True:
results = [result for result in results if "TV" in result.text]

if person is True:
results = [result for result in results if 'name' in result.find('a')[0].attrs['href']]
#print(results)
soup = BeautifulSoup(response.content,'html.parser')
results = soup.find('script', type='application/json').text
results = json.loads(results)
results = ImdbParser(results).unescape_json_values
output = []
for result in results:
name = result.text.replace('\n', ' ')
url = result.find('a')[0].attrs['href']
if ('Podcast' not in name) and ('Music Video' not in name):
try:
image = result.xpath("//img")[0].attrs['src']
file_id = url.split('/')[2]

if tv:
for item in results["props"]["pageProps"]["titleResults"]["results"]:
if item["imageType"] in ["tvMiniSeries", "tvSeries"]:
output.append({
'id': file_id,
"name": name,
"url": f"https://www.imdb.com{url}",
"poster": image
})
except IndexError:
pass
self.search_results = {'result_count': len(output), 'results': output}
return json.dumps(self.search_results, indent=2)
"id": item['id'],
"name": item['titleNameText'],
"url": f"{self.baseURL}/title/{item['id']}",
"poster": item.get('titlePosterImageModel', {}).get('url', ''),
"type": item["imageType"]
})

elif person:
for item in results["props"]["pageProps"]["nameResults"]["results"]:
output.append({
"id": item['id'],
"name": item['displayNameText'],
"url": f"{self.baseURL}/name/{item['id']}",
"poster": item.get('avatarImageModel', {}).get('url', ''),
"type": item["knownForJobCategory"]
})
else:
for item in results["props"]["pageProps"]["titleResults"]["results"]:
if item["imageType"] not in ["podcastSeries", "tvSpecial"]:
output.append({
"id": item['id'],
"name": item['titleNameText'],
"url": f"{self.baseURL}/title/{item['id']}",
"poster": item.get('titlePosterImageModel', {}).get('url', ''),
"type": item["imageType"]
})
for item in results["props"]["pageProps"]["nameResults"]["results"]:
output.append({
"id": item['id'],
"name": item['displayNameText'],
"url": f"{self.baseURL}/name/{item['id']}",
"poster": item.get('avatarImageModel', {}).get('url', ''),
"type": item["knownForJobCategory"]
})

self.search_results = {'result_count': len(output), 'results': output}
return self.search_results

# ..............................methods to get a movie/web-series/tv info..............................
def get(self, url):
"""
@description:- helps to get a file's complete info (used by get_by_name() & get_by_id() )
@parameter:- <str:url>, url of the file/movie/tv-series.
@returns:- File/movie/TV info as JSON string.
@returns:- File/movie/TV info as JSON.
"""
try:
response = self.session.get(url)
result = response.html.xpath("//script[@type='application/ld+json']")[0].text
result = ''.join(result.splitlines()) # removing newlines
result = f"""{result}"""
soup = BeautifulSoup(response.content,'html.parser')
result = soup.find('script', type='application/ld+json').text
# print(result)
except IndexError:
return self.NA
try:
# converting json string into dict
# converting JSON object into dict
result = json.loads(result)
except json.decoder.JSONDecodeError as e:
# sometimes json is invalid as 'description' contains inverted commas or other html escape chars
try:
to_parse = ImdbParser(result)
# removing trailer & description schema from json string
parsed = to_parse.remove_trailer
parsed = to_parse.remove_description
# print(parsed)
result = json.loads(parsed)
except json.decoder.JSONDecodeError as e:
try:
# removing reviewBody from json string
parsed = to_parse.remove_review_body
result = json.loads(parsed)
except json.decoder.JSONDecodeError as e:
# invalid char(s) is/are not in description/trailer/reviewBody schema
return self.NA
result = ImdbParser(result).unescape_json_values
except (json.decoder.JSONDecodeError, IndexError) as e:
return self.NA

output = {
"type": result.get('@type'),
"name": result.get('name'),
"alternateName": result.get('alternateName', None),
"url": self.baseURL + result.get('url'),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may not always be the case, but I found that result.get('url') already contains the imdb.com domain.

Suggested change
"url": self.baseURL + result.get('url'),
"url": result.get('url'),

"poster": result.get('image'),
"description": result.get('description'),
Expand Down Expand Up @@ -186,40 +190,25 @@ def get(self, url):
if creator.get('@type') == 'Person'
]
}
return json.dumps(output, indent=2)
return output

def get_by_name(self, name, year=None, tv=False):
"""
@description:- Helps to search a file/movie/tv by name.
@parameter-1:- <str:name>, query/name to search.
@parameter-2:- <int:year> OPTIONAL, release year of query/movie/tv/file to search.
@parameter-3:- <bool:tv> OPTIONAL, to filter/limit/bound search result only for 'TV Series'.
@returns:- File/movie/TV info as JSON string.
@returns:- File/movie/TV info as JSON.
"""
results = json.loads(self.search(name, year=year))
results = self.search(name, year=year, tv=tv)
all_results = [i for i in self.search_results['results'] if 'title' in i['url']]
# print(all_results)

# filtering TV and movies
if tv is True: # for tv/Web-Series only
tv_only = [result for result in all_results if "TV" in result['name']]
if year is not None:
tv_only = [result for result in tv_only if str(year) in result['name']]
# double checking by file name
if bool(tv_only):
tv_only_checked = [result for result in tv_only if result['name'].lower().startswith(name.split(" ")[0].lower())]
tv_only = tv_only_checked if bool(tv_only_checked) else tv_only
results['results'] = tv_only if bool(tv_only) else all_results

if tv: # for tv/Web-Series only
results['results'] = all_results
else: # for movies only
movie_only = [result for result in all_results if "TV" not in result['name']]
if year is not None:
movie_only = [result for result in movie_only if str(year) in result['name']]
# double checking by file name
if bool(movie_only):
movie_only_checked = [result for result in movie_only if result['name'].lower().startswith(name.split(" ")[0].lower())]
movie_only = movie_only_checked if bool(movie_only_checked) else movie_only
results['results'] = movie_only if bool(movie_only) else all_results
results['results'] = [result for result in all_results if "movie" == result['type']]
# print(results['results'])

if len(results['results']) > 0:
Expand All @@ -231,7 +220,7 @@ def get_by_id(self, file_id):
"""
@description:- Helps to search a file/movie/tv by its imdb ID.
@parameter-1:- <str:file_id>, imdb ID of the file/movie/tv.
@returns:- File/movie/TV info as JSON string.
@returns:- File/movie/TV info as JSON object.
"""
assert isinstance(file_id, str)
url = f"{self.baseURL}/title/{file_id}"
Expand All @@ -242,36 +231,37 @@ def get_person(self, url):
"""
@description:- Helps to search a person info by its url, (used by person_by_name() & person_by_id() ).
@parameter-1:- <str:url>, url of the person's profile page.
@returns:- Person's info as JSON string.
@returns:- Person's info as JSON object.
"""
try:
response = self.session.get(url)
result = response.html.xpath("//script[@type='application/ld+json']")[0].text
result = f"""{result}"""
soup = BeautifulSoup(response.content,'html.parser')
result = soup.find('script', type='application/ld+json').text
result = json.loads(result)
except json.decoder.JSONDecodeError as e:
result = ImdbParser(result).unescape_json_values
except (json.decoder.JSONDecodeError, IndexError) as e:
return self.NA

del result["@context"]
result['type'] = result.get('@type')
del result["@type"]
return json.dumps(result, indent=2)
return result

def person_by_name(self, name):
"""
@description:- Helps to search a person info by its name.
@parameter-1:- <str:name>, name of the person.
@returns:- Person's info as JSON string.
@returns:- Person's info as JSON object.
"""
results = json.loads(self.search(name, person=True))
results = self.search(name, person=True)
url = results['results'][0].get('url')
return self.get_person(url)

def person_by_id(self, p_id):
"""
@description:- Helps to search a person info by its imdb ID.
@parameter-1:- <str:p_id>, imdb ID of the person's profile.
@returns:- Person's info as JSON string.
@returns:- Person's info as JSON object.
"""
assert isinstance(p_id, str)
url = f"{self.baseURL}/name/{p_id}"
Expand All @@ -282,7 +272,7 @@ def upcoming(self, region=None):
"""
@description:- Helps to get upcoming movies/tv-series.
@parameter-1:- <str:region> OPTIONAL, country code (like US, IN etc.) to filter results by region/country.
@returns:- upcoming movies/TV-Series info as JSON string.
@returns:- upcoming movies/TV-Series info as JSON object.
"""
if region is not None:
assert isinstance(region, str)
Expand All @@ -294,25 +284,21 @@ def upcoming(self, region=None):
response = self.session.get(url)
except requests.exceptions.ConnectionError as e:
response = self.session.get(url, verify=False)

div = response.html.xpath("//div[@id='main']")[0]
h4 = div.find('h4')
ul = div.find('ul')
data = zip(h4, ul)
soup = BeautifulSoup(response.content,'html.parser')
script_tag = soup.find('script', type='application/json').text
script = json.loads(script_tag)
output = []
for zip_el in data:
rel_date = zip_el[0].text
ulist = zip_el[1].find('a')
for movie in ulist:
for group in script["props"]["pageProps"]["groups"]:
for entry in group["entries"]:
output.append({
'id': movie.attrs['href'].split('/')[2],
'name': movie.text,
'url': self.baseURL + movie.attrs['href'],
'release_data': rel_date
'id': entry["id"],
'name': entry["titleText"],
'url': self.baseURL + entry["id"],
'release_data': entry["releaseDate"]
})
results = {'result_count': len(output), 'results': output}
if results['result_count'] > 0:
return json.dumps(results, indent=2)
return results
else:
return self.NA

Expand All @@ -321,16 +307,17 @@ def get_popular(self, url):
"""
@description:- Helps to search popular movies/TV-Series by url, (used by popular_movies() & popular_tv() ).
@parameter-1:- <str:url>, url to search.
@returns:- Files/Movies/TV-Series info as JSON string.
@returns:- Files/Movies/TV-Series info as JSON object.
"""
assert isinstance(url, str)
try:
response = self.session.get(url)
except requests.exceptions.ConnectionError as e:
response = self.session.get(url, verify=False)

links = response.html.xpath('//h3/a')
years = response.html.xpath("//h3")
soup = BeautifulSoup(response.content,'html.parser')
links = soup.select('h3 > a')
years = soup.select('h3 > span:last-child')

if not bool(links) and bool(years):
return self.NA
Expand All @@ -340,17 +327,15 @@ def get_popular(self, url):
href = link.attrs.get('href', "#")
if 'title' in href:
# getting year
year = year.find('span', containing='(')[0] if bool(year.find('span', containing='(')) else ""
if bool(year):
year = "".join(re.findall(r"\d+", year.text))
year = year[:4] + "-" + year[4:] if len(year) == 8 else year # for TV
year = year if len(year) == 4 else year # for movies
if year != "":
year = year.text.split('(')[-1].replace(')', '')
year = year[:-2] if year[-2:] == "– " else year
else:
year = "N/A"
year = "unknown"
# getting poster
file_id = href.split('/')[2]
poster = response.html.xpath(f"//img[@data-tconst='{file_id}']")
poster = poster[0].attrs.get('loadlate', 'image_not_found') if bool(poster) else 'image_not_found'
poster = soup.find('img', {'data-tconst': file_id})
poster = poster.attrs.get('loadlate', 'image_not_found') if bool(poster) else 'image_not_found'
# creating file object
output.append({
'id': file_id,
Expand All @@ -361,7 +346,7 @@ def get_popular(self, url):
})

self.search_results = {'result_count': len(output), 'results': output}
return json.dumps(self.search_results, indent=2)
return self.search_results

def popular_movies(self, genre=None, start_id=1, sort_by=None):
"""
Expand All @@ -370,7 +355,7 @@ def popular_movies(self, genre=None, start_id=1, sort_by=None):
@parameter-2:- <int:start_id> DEFAULT=1, start id to show results (shows results from start_id to start_id+50).
@parameter-3:- <bool:sort_by> OPTIONAL, to sort results (eg. sort=user_rating,desc OR sort=user_rating,asc).
- (visit 'https://www.imdb.com/search/title/?title_type=movie' for more info)
@returns:- Popular Movies (by genre) info as JSON string.
@returns:- Popular Movies (by genre) info as JSON object.
"""
assert isinstance(start_id, int)
if genre is not None:
Expand All @@ -387,7 +372,7 @@ def popular_tv(self, genre=None, start_id=1, sort_by=None):
@parameter-2:- <int:start_id> DEFAULT=1, start id to show results (shows results from start_id to start_id+50).
@parameter-3:- <bool:sort_by> OPTIONAL, to sort results (eg. sort=user_rating,desc OR sort=user_rating,asc).
- (visit 'https://www.imdb.com/search/title/?title_type=movie' for more info)
@returns:- Popular TV-Series info as JSON string.
@returns:- Popular TV-Series info as JSON object.
"""
assert isinstance(start_id, int)
if genre is not None:
Expand Down
Loading