From 82bf12fd4b5381e814ea5a1ecfd02ce53f87d03d Mon Sep 17 00:00:00 2001 From: Loukious Date: Sun, 4 Jun 2023 19:17:38 +0100 Subject: [PATCH 1/2] fixed parsing problems + made all methods return json object instead of json string because it makes more sense + added language support --- PyMovieDb/imdb.py | 219 +++++++++++++++++++++----------------------- PyMovieDb/parser.py | 71 ++++++-------- setup.py | 2 +- 3 files changed, 129 insertions(+), 163 deletions(-) diff --git a/PyMovieDb/imdb.py b/PyMovieDb/imdb.py index 1b9742d..3717c13 100644 --- a/PyMovieDb/imdb.py +++ b/PyMovieDb/imdb.py @@ -2,7 +2,7 @@ import json import requests from PyMovieDb import ImdbParser -from requests_html import HTMLSession +from bs4 import BeautifulSoup from requests.packages.urllib3.exceptions import InsecureRequestWarning requests.packages.urllib3.disable_warnings(InsecureRequestWarning) @@ -40,16 +40,17 @@ class IMDB: #8. popular_tv(genre=None, start_id=1, sort_by=None) -- to get IMDB popular Tv-Series """ - def __init__(self): - self.session = HTMLSession() - self.headers = { + def __init__(self, lang="en"): + self.session = requests.session() + self.session.headers.update({ "Accept": "application/json, text/plain, */*", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36", - "Referer": "https://www.imdb.com/" - } + "Referer": "https://www.imdb.com/", + "Accept-Language": lang + }) self.baseURL = "https://www.imdb.com" self.search_results = {'result_count': 0, 'results': []} - self.NA = json.dumps({"status": 404, "message": "No Result Found!", 'result_count': 0, 'results': []}) + self.NA = {"status": 404, "message": "No Result Found!", 'result_count': 0, 'results': []} # ..................................method to search on IMDB........................................... def search(self, name, year=None, tv=False, person=False): @@ -59,7 +60,7 @@ def search(self, name, year=None, tv=False, person=False): @parameter-2:- OPTIONAL, release year of query/movie/tv/file to search. @parameter-3:- OPTIONAL, to filter/limit/bound search results only for 'TV Series'. @parameter-4:- OPTIONAL, to filter search results only for person. - @returns:- A JSON string: + @returns:- A JSON object: - {'result_count': , 'results': } """ assert isinstance(name, str) @@ -79,73 +80,76 @@ def search(self, name, year=None, tv=False, person=False): except requests.exceptions.ConnectionError as e: response = self.session.get(url, verify=False) - # results = response.html.xpath("//table[@class='findList']/tr") - results = response.html.xpath("//section[@data-testid='find-results-section-title']/div/ul/li") - #print(len(results)) - if tv is True: - results = [result for result in results if "TV" in result.text] - - if person is True: - results = [result for result in results if 'name' in result.find('a')[0].attrs['href']] - #print(results) + soup = BeautifulSoup(response.content,'html.parser') + results = soup.find('script', type='application/json').text + results = json.loads(results) + results = ImdbParser(results).unescape_json_values output = [] - for result in results: - name = result.text.replace('\n', ' ') - url = result.find('a')[0].attrs['href'] - if ('Podcast' not in name) and ('Music Video' not in name): - try: - image = result.xpath("//img")[0].attrs['src'] - file_id = url.split('/')[2] + + if tv: + for item in results["props"]["pageProps"]["titleResults"]["results"]: + if item["imageType"] in ["tvMiniSeries", "tvSeries"]: output.append({ - 'id': file_id, - "name": name, - "url": f"https://www.imdb.com{url}", - "poster": image - }) - except IndexError: - pass - self.search_results = {'result_count': len(output), 'results': output} - return json.dumps(self.search_results, indent=2) + "id": item['id'], + "name": item['titleNameText'], + "url": f"{self.baseURL}/title/{item['id']}", + "poster": item.get('titlePosterImageModel', {}).get('url', ''), + "type": item["imageType"] + }) + + elif person: + for item in results["props"]["pageProps"]["nameResults"]["results"]: + output.append({ + "id": item['id'], + "name": item['displayNameText'], + "url": f"{self.baseURL}/name/{item['id']}", + "poster": item.get('avatarImageModel', {}).get('url', ''), + "type": item["knownForJobCategory"] + }) + else: + for item in results["props"]["pageProps"]["titleResults"]["results"]: + if item["imageType"] not in ["podcastSeries", "tvSpecial"]: + output.append({ + "id": item['id'], + "name": item['titleNameText'], + "url": f"{self.baseURL}/title/{item['id']}", + "poster": item.get('titlePosterImageModel', {}).get('url', ''), + "type": item["imageType"] + }) + for item in results["props"]["pageProps"]["nameResults"]["results"]: + output.append({ + "id": item['id'], + "name": item['displayNameText'], + "url": f"{self.baseURL}/name/{item['id']}", + "poster": item.get('avatarImageModel', {}).get('url', ''), + "type": item["knownForJobCategory"] + }) + + self.search_results = {'result_count': len(output), 'results': output} + return self.search_results # ..............................methods to get a movie/web-series/tv info.............................. def get(self, url): """ @description:- helps to get a file's complete info (used by get_by_name() & get_by_id() ) @parameter:- , url of the file/movie/tv-series. - @returns:- File/movie/TV info as JSON string. + @returns:- File/movie/TV info as JSON. """ try: response = self.session.get(url) - result = response.html.xpath("//script[@type='application/ld+json']")[0].text - result = ''.join(result.splitlines()) # removing newlines - result = f"""{result}""" + soup = BeautifulSoup(response.content,'html.parser') + result = soup.find('script', type='application/ld+json').text # print(result) - except IndexError: - return self.NA - try: - # converting json string into dict + # converting JSON object into dict result = json.loads(result) - except json.decoder.JSONDecodeError as e: - # sometimes json is invalid as 'description' contains inverted commas or other html escape chars - try: - to_parse = ImdbParser(result) - # removing trailer & description schema from json string - parsed = to_parse.remove_trailer - parsed = to_parse.remove_description - # print(parsed) - result = json.loads(parsed) - except json.decoder.JSONDecodeError as e: - try: - # removing reviewBody from json string - parsed = to_parse.remove_review_body - result = json.loads(parsed) - except json.decoder.JSONDecodeError as e: - # invalid char(s) is/are not in description/trailer/reviewBody schema - return self.NA + result = ImdbParser(result).unescape_json_values + except (json.decoder.JSONDecodeError, IndexError) as e: + return self.NA output = { "type": result.get('@type'), "name": result.get('name'), + "alternateName": result.get('alternateName', None), "url": self.baseURL + result.get('url'), "poster": result.get('image'), "description": result.get('description'), @@ -186,7 +190,7 @@ def get(self, url): if creator.get('@type') == 'Person' ] } - return json.dumps(output, indent=2) + return output def get_by_name(self, name, year=None, tv=False): """ @@ -194,32 +198,17 @@ def get_by_name(self, name, year=None, tv=False): @parameter-1:- , query/name to search. @parameter-2:- OPTIONAL, release year of query/movie/tv/file to search. @parameter-3:- OPTIONAL, to filter/limit/bound search result only for 'TV Series'. - @returns:- File/movie/TV info as JSON string. + @returns:- File/movie/TV info as JSON. """ - results = json.loads(self.search(name, year=year)) + results = self.search(name, year=year, tv=tv) all_results = [i for i in self.search_results['results'] if 'title' in i['url']] # print(all_results) # filtering TV and movies - if tv is True: # for tv/Web-Series only - tv_only = [result for result in all_results if "TV" in result['name']] - if year is not None: - tv_only = [result for result in tv_only if str(year) in result['name']] - # double checking by file name - if bool(tv_only): - tv_only_checked = [result for result in tv_only if result['name'].lower().startswith(name.split(" ")[0].lower())] - tv_only = tv_only_checked if bool(tv_only_checked) else tv_only - results['results'] = tv_only if bool(tv_only) else all_results - + if tv: # for tv/Web-Series only + results['results'] = all_results else: # for movies only - movie_only = [result for result in all_results if "TV" not in result['name']] - if year is not None: - movie_only = [result for result in movie_only if str(year) in result['name']] - # double checking by file name - if bool(movie_only): - movie_only_checked = [result for result in movie_only if result['name'].lower().startswith(name.split(" ")[0].lower())] - movie_only = movie_only_checked if bool(movie_only_checked) else movie_only - results['results'] = movie_only if bool(movie_only) else all_results + results['results'] = [result for result in all_results if "movie" == result['type']] # print(results['results']) if len(results['results']) > 0: @@ -231,7 +220,7 @@ def get_by_id(self, file_id): """ @description:- Helps to search a file/movie/tv by its imdb ID. @parameter-1:- , imdb ID of the file/movie/tv. - @returns:- File/movie/TV info as JSON string. + @returns:- File/movie/TV info as JSON object. """ assert isinstance(file_id, str) url = f"{self.baseURL}/title/{file_id}" @@ -242,28 +231,29 @@ def get_person(self, url): """ @description:- Helps to search a person info by its url, (used by person_by_name() & person_by_id() ). @parameter-1:- , url of the person's profile page. - @returns:- Person's info as JSON string. + @returns:- Person's info as JSON object. """ try: response = self.session.get(url) - result = response.html.xpath("//script[@type='application/ld+json']")[0].text - result = f"""{result}""" + soup = BeautifulSoup(response.content,'html.parser') + result = soup.find('script', type='application/ld+json').text result = json.loads(result) - except json.decoder.JSONDecodeError as e: + result = ImdbParser(result).unescape_json_values + except (json.decoder.JSONDecodeError, IndexError) as e: return self.NA del result["@context"] result['type'] = result.get('@type') del result["@type"] - return json.dumps(result, indent=2) + return result def person_by_name(self, name): """ @description:- Helps to search a person info by its name. @parameter-1:- , name of the person. - @returns:- Person's info as JSON string. + @returns:- Person's info as JSON object. """ - results = json.loads(self.search(name, person=True)) + results = self.search(name, person=True) url = results['results'][0].get('url') return self.get_person(url) @@ -271,7 +261,7 @@ def person_by_id(self, p_id): """ @description:- Helps to search a person info by its imdb ID. @parameter-1:- , imdb ID of the person's profile. - @returns:- Person's info as JSON string. + @returns:- Person's info as JSON object. """ assert isinstance(p_id, str) url = f"{self.baseURL}/name/{p_id}" @@ -282,7 +272,7 @@ def upcoming(self, region=None): """ @description:- Helps to get upcoming movies/tv-series. @parameter-1:- OPTIONAL, country code (like US, IN etc.) to filter results by region/country. - @returns:- upcoming movies/TV-Series info as JSON string. + @returns:- upcoming movies/TV-Series info as JSON object. """ if region is not None: assert isinstance(region, str) @@ -294,25 +284,21 @@ def upcoming(self, region=None): response = self.session.get(url) except requests.exceptions.ConnectionError as e: response = self.session.get(url, verify=False) - - div = response.html.xpath("//div[@id='main']")[0] - h4 = div.find('h4') - ul = div.find('ul') - data = zip(h4, ul) + soup = BeautifulSoup(response.content,'html.parser') + script_tag = soup.find('script', type='application/json').text + script = json.loads(script_tag) output = [] - for zip_el in data: - rel_date = zip_el[0].text - ulist = zip_el[1].find('a') - for movie in ulist: + for group in script["props"]["pageProps"]["groups"]: + for entry in group["entries"]: output.append({ - 'id': movie.attrs['href'].split('/')[2], - 'name': movie.text, - 'url': self.baseURL + movie.attrs['href'], - 'release_data': rel_date + 'id': entry["id"], + 'name': entry["titleText"], + 'url': self.baseURL + entry["id"], + 'release_data': entry["releaseDate"] }) results = {'result_count': len(output), 'results': output} if results['result_count'] > 0: - return json.dumps(results, indent=2) + return results else: return self.NA @@ -321,7 +307,7 @@ def get_popular(self, url): """ @description:- Helps to search popular movies/TV-Series by url, (used by popular_movies() & popular_tv() ). @parameter-1:- , url to search. - @returns:- Files/Movies/TV-Series info as JSON string. + @returns:- Files/Movies/TV-Series info as JSON object. """ assert isinstance(url, str) try: @@ -329,8 +315,9 @@ def get_popular(self, url): except requests.exceptions.ConnectionError as e: response = self.session.get(url, verify=False) - links = response.html.xpath('//h3/a') - years = response.html.xpath("//h3") + soup = BeautifulSoup(response.content,'html.parser') + links = soup.select('h3 > a') + years = soup.select('h3 > span:last-child') if not bool(links) and bool(years): return self.NA @@ -340,17 +327,15 @@ def get_popular(self, url): href = link.attrs.get('href', "#") if 'title' in href: # getting year - year = year.find('span', containing='(')[0] if bool(year.find('span', containing='(')) else "" - if bool(year): - year = "".join(re.findall(r"\d+", year.text)) - year = year[:4] + "-" + year[4:] if len(year) == 8 else year # for TV - year = year if len(year) == 4 else year # for movies + if year != "": + year = year.text.split('(')[-1].replace(')', '') + year = year[:-2] if year[-2:] == "– " else year else: - year = "N/A" + year = "unknown" # getting poster file_id = href.split('/')[2] - poster = response.html.xpath(f"//img[@data-tconst='{file_id}']") - poster = poster[0].attrs.get('loadlate', 'image_not_found') if bool(poster) else 'image_not_found' + poster = soup.find('img', {'data-tconst': file_id}) + poster = poster.attrs.get('loadlate', 'image_not_found') if bool(poster) else 'image_not_found' # creating file object output.append({ 'id': file_id, @@ -361,7 +346,7 @@ def get_popular(self, url): }) self.search_results = {'result_count': len(output), 'results': output} - return json.dumps(self.search_results, indent=2) + return self.search_results def popular_movies(self, genre=None, start_id=1, sort_by=None): """ @@ -370,7 +355,7 @@ def popular_movies(self, genre=None, start_id=1, sort_by=None): @parameter-2:- DEFAULT=1, start id to show results (shows results from start_id to start_id+50). @parameter-3:- OPTIONAL, to sort results (eg. sort=user_rating,desc OR sort=user_rating,asc). - (visit 'https://www.imdb.com/search/title/?title_type=movie' for more info) - @returns:- Popular Movies (by genre) info as JSON string. + @returns:- Popular Movies (by genre) info as JSON object. """ assert isinstance(start_id, int) if genre is not None: @@ -387,7 +372,7 @@ def popular_tv(self, genre=None, start_id=1, sort_by=None): @parameter-2:- DEFAULT=1, start id to show results (shows results from start_id to start_id+50). @parameter-3:- OPTIONAL, to sort results (eg. sort=user_rating,desc OR sort=user_rating,asc). - (visit 'https://www.imdb.com/search/title/?title_type=movie' for more info) - @returns:- Popular TV-Series info as JSON string. + @returns:- Popular TV-Series info as JSON object. """ assert isinstance(start_id, int) if genre is not None: diff --git a/PyMovieDb/parser.py b/PyMovieDb/parser.py index 5dd9c5d..26e95a9 100644 --- a/PyMovieDb/parser.py +++ b/PyMovieDb/parser.py @@ -1,55 +1,36 @@ +import html # for manipulate incoming data/json from IMDB (for invalid json string) class ImdbParser: """ - - A class to manipulate incoming json string data of a movie/TV from IMDB. - - Changes are required as sometimes the json contains invalid chars in description/reviewBody/trailer schema + - A class to manipulate incoming json object data of a movie/TV from IMDB. + - Changes are required as sometimes the json contains escaped quotes that should be unescaped """ - def __init__(self, json_string): - self.json_string = json_string - + def __init__(self, json_obj): + self.json_obj = json_obj + self.visited = set() + @property - def remove_trailer(self): + def unescape_json_values(self): """ - @description:- Helps to remove 'trailer' schema from IMDB data json string. - @returns:- New updated JSON string. + Unescape all json values in a json object """ - try: - self.json_string = ''.join(self.json_string.splitlines()) - trailer_i = self.json_string.index('"trailer"') - actor_i = self.json_string.index('"actor"') - to_remove = self.json_string[trailer_i:actor_i:1] - self.json_string = self.json_string.replace(to_remove, "") - except ValueError: - self.json_string = self.json_string - return self.json_string + if id(self.json_obj) in self.visited: + return self.json_obj - @property - def remove_description(self): - """ - @description:- Helps to remove 'description' schema from IMDB file json string. - @returns:- New updated JSON string. - """ - try: - review_i = self.json_string.index('"review"') - des_i = self.json_string.index('"description"', 0, review_i) - to_remove = self.json_string[des_i:review_i:1] - self.json_string = self.json_string.replace(to_remove, "") - except ValueError: - self.json_string = self.json_string - return self.json_string + self.visited.add(id(self.json_obj)) - @property - def remove_review_body(self): - """ - @description:- Helps to remove 'reviewBody' schema from IMDB file json string. - @returns:- New updated JSON string. - """ - try: - reviewrating_i = self.json_string.index('"reviewRating"') - reviewbody_i = self.json_string.index('"reviewBody"', 0, reviewrating_i) - to_remove = self.json_string[reviewbody_i:reviewrating_i:1] - self.json_string = self.json_string.replace(to_remove, "") - except ValueError: - self.json_string = self.json_string - return self.json_string + if isinstance(self.json_obj, dict): + for key, value in self.json_obj.items(): + if isinstance(value, str): + self.json_obj[key] = html.unescape(value).replace("\n", " ").replace(" ", " ") + elif isinstance(value, (dict, list)): + self.json_obj[key] = ImdbParser(value).unescape_json_values + + elif isinstance(self.json_obj, list): + for i, value in enumerate(self.json_obj): + if isinstance(value, str): + self.json_obj[i] = html.unescape(value).replace("\n", " ").replace(" ", " ") + elif isinstance(value, (dict, list)): + self.json_obj[i] = ImdbParser(value).unescape_json_values + return self.json_obj diff --git a/setup.py b/setup.py index 734d945..8550493 100644 --- a/setup.py +++ b/setup.py @@ -15,5 +15,5 @@ url="https://github.com/itsmehemant7/PyMovieDb", packages=setuptools.find_packages(), zip_safe=False, - install_requires=["requests-html"] + install_requires=["beautifulsoup4"] ) From 9d904ebead84e0c10fba3b82f8950542648766bb Mon Sep 17 00:00:00 2001 From: Loukios <24696530+Loukious@users.noreply.github.com> Date: Sun, 4 Jun 2023 19:26:06 +0100 Subject: [PATCH 2/2] Bumping version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8550493..299191b 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name="PyMovieDb", - version="0.0.8", + version="0.0.9", author="Hemant Malik", author_email="itsmehemant7@gmail.com", description="A Python Module that represents IMDB API",