From 82bf12fd4b5381e814ea5a1ecfd02ce53f87d03d Mon Sep 17 00:00:00 2001
From: Loukious <Loukious@users.noreply.github.com>
Date: Sun, 4 Jun 2023 19:17:38 +0100
Subject: [PATCH 1/2] Fix parsing problems, make all methods return a JSON
 object instead of a JSON string (which makes more sense), and add language
 support

---
 PyMovieDb/imdb.py   | 219 +++++++++++++++++++++-----------------------
 PyMovieDb/parser.py |  71 ++++++--------
 setup.py            |   2 +-
 3 files changed, 129 insertions(+), 163 deletions(-)

diff --git a/PyMovieDb/imdb.py b/PyMovieDb/imdb.py
index 1b9742d..3717c13 100644
--- a/PyMovieDb/imdb.py
+++ b/PyMovieDb/imdb.py
@@ -2,7 +2,7 @@
 import json
 import requests
 from PyMovieDb import ImdbParser
-from requests_html import HTMLSession
+from bs4 import BeautifulSoup
 from requests.packages.urllib3.exceptions import InsecureRequestWarning
 
 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
@@ -40,16 +40,17 @@ class IMDB:
             #8. popular_tv(genre=None, start_id=1, sort_by=None)
                 -- to get IMDB popular Tv-Series
     """
-    def __init__(self):
-        self.session = HTMLSession()
-        self.headers = {
+    def __init__(self, lang="en"):
+        self.session = requests.session()
+        self.session.headers.update({
            "Accept": "application/json, text/plain, */*",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36",
-           "Referer": "https://www.imdb.com/"
-           }
+           "Referer": "https://www.imdb.com/",
+           "Accept-Language": lang
+           })
         self.baseURL = "https://www.imdb.com"
         self.search_results = {'result_count': 0, 'results': []}
-        self.NA = json.dumps({"status": 404, "message": "No Result Found!", 'result_count': 0, 'results': []})
+        self.NA = {"status": 404, "message": "No Result Found!", 'result_count': 0, 'results': []}
 
     # ..................................method to search on IMDB...........................................
     def search(self, name, year=None, tv=False, person=False):
@@ -59,7 +60,7 @@ def search(self, name, year=None, tv=False, person=False):
          @parameter-2:- <int:year> OPTIONAL, release year of query/movie/tv/file to search.
          @parameter-3:- <bool:tv> OPTIONAL, to filter/limit/bound search results only for 'TV Series'.
          @parameter-4:- <bool:person> OPTIONAL, to filter search results only for person.
-         @returns:- A JSON string:
+         @returns:- A JSON object:
                     - {'result_count': <int:total_search_results>, 'results': <list:list_of_files/movie_info_dict>}
         """
         assert isinstance(name, str)
@@ -79,73 +80,76 @@ def search(self, name, year=None, tv=False, person=False):
         except requests.exceptions.ConnectionError as e:
             response = self.session.get(url, verify=False)
 
-        # results = response.html.xpath("//table[@class='findList']/tr")
-        results = response.html.xpath("//section[@data-testid='find-results-section-title']/div/ul/li")
-        #print(len(results))
-        if tv is True:
-            results = [result for result in results if "TV" in result.text]
-
-        if person is True:
-            results = [result for result in results if 'name' in result.find('a')[0].attrs['href']]
-        #print(results)
+        soup = BeautifulSoup(response.content,'html.parser')
+        results = soup.find('script', type='application/json').text
+        results = json.loads(results)
+        results = ImdbParser(results).unescape_json_values
         output = []
-        for result in results:
-            name = result.text.replace('\n', ' ')
-            url = result.find('a')[0].attrs['href']
-            if ('Podcast' not in name) and ('Music Video' not in name):
-                try:
-                    image = result.xpath("//img")[0].attrs['src']
-                    file_id = url.split('/')[2]
+
+        if tv:
+            for item in results["props"]["pageProps"]["titleResults"]["results"]:
+                if item["imageType"] in ["tvMiniSeries", "tvSeries"]:
                     output.append({
-                        'id': file_id,
-                        "name": name,
-                        "url": f"https://www.imdb.com{url}",
-                        "poster": image
-                       })
-                except IndexError:
-                    pass
-                self.search_results = {'result_count': len(output), 'results': output}
-        return json.dumps(self.search_results, indent=2)
+                            "id": item['id'],
+                            "name": item['titleNameText'],
+                            "url": f"{self.baseURL}/title/{item['id']}",
+                            "poster": item.get('titlePosterImageModel', {}).get('url', ''),
+                            "type": item["imageType"]
+                            })
+        
+        elif person:
+            for item in results["props"]["pageProps"]["nameResults"]["results"]:
+                output.append({
+                        "id": item['id'],
+                        "name": item['displayNameText'],
+                        "url": f"{self.baseURL}/name/{item['id']}",
+                        "poster": item.get('avatarImageModel', {}).get('url', ''),
+                        "type": item["knownForJobCategory"]
+                        })
+        else:
+            for item in results["props"]["pageProps"]["titleResults"]["results"]:
+                if item["imageType"] not in ["podcastSeries", "tvSpecial"]:
+                    output.append({
+                            "id": item['id'],
+                            "name": item['titleNameText'],
+                            "url": f"{self.baseURL}/title/{item['id']}",
+                            "poster": item.get('titlePosterImageModel', {}).get('url', ''),
+                            "type": item["imageType"]
+                            })
+            for item in results["props"]["pageProps"]["nameResults"]["results"]:
+                output.append({
+                        "id": item['id'],
+                        "name": item['displayNameText'],
+                        "url": f"{self.baseURL}/name/{item['id']}",
+                        "poster": item.get('avatarImageModel', {}).get('url', ''),
+                        "type": item["knownForJobCategory"]
+                        })
+
+        self.search_results = {'result_count': len(output), 'results': output}
+        return self.search_results
 
     # ..............................methods to get a movie/web-series/tv info..............................
     def get(self, url):
         """
          @description:- helps to get a file's complete info (used by get_by_name() & get_by_id() )
          @parameter:- <str:url>, url of the file/movie/tv-series.
-         @returns:- File/movie/TV info as JSON string.
+         @returns:- File/movie/TV info as JSON object.
         """
         try:
             response = self.session.get(url)
-            result = response.html.xpath("//script[@type='application/ld+json']")[0].text
-            result = ''.join(result.splitlines())  # removing newlines
-            result = f"""{result}"""
+            soup = BeautifulSoup(response.content,'html.parser')
+            result = soup.find('script', type='application/ld+json').text
             # print(result)
-        except IndexError:
-            return self.NA
-        try:
-            # converting json string into dict
+            # converting JSON string into dict
             result = json.loads(result)
-        except json.decoder.JSONDecodeError as e:
-            # sometimes json is invalid as 'description' contains inverted commas or other html escape chars
-            try:
-                to_parse = ImdbParser(result)
-                # removing trailer & description schema from json string
-                parsed = to_parse.remove_trailer
-                parsed = to_parse.remove_description
-                # print(parsed)
-                result = json.loads(parsed)
-            except json.decoder.JSONDecodeError as e:
-                try:
-                    # removing reviewBody from json string
-                    parsed = to_parse.remove_review_body
-                    result = json.loads(parsed)
-                except json.decoder.JSONDecodeError as e:
-                    # invalid char(s) is/are not in description/trailer/reviewBody schema
-                    return self.NA
+            result = ImdbParser(result).unescape_json_values
+        except (json.decoder.JSONDecodeError, IndexError) as e:
+            return self.NA
 
         output = {
             "type": result.get('@type'),
             "name": result.get('name'),
+            "alternateName": result.get('alternateName', None),
             "url": self.baseURL + result.get('url'),
             "poster": result.get('image'),
             "description": result.get('description'),
@@ -186,7 +190,7 @@ def get(self, url):
                 if creator.get('@type') == 'Person'
             ]
         }
-        return json.dumps(output, indent=2)
+        return output
 
     def get_by_name(self, name, year=None, tv=False):
         """
@@ -194,32 +198,17 @@ def get_by_name(self, name, year=None, tv=False):
          @parameter-1:- <str:name>, query/name to search.
          @parameter-2:- <int:year> OPTIONAL, release year of query/movie/tv/file to search.
          @parameter-3:- <bool:tv> OPTIONAL, to filter/limit/bound search result only for 'TV Series'.
-         @returns:- File/movie/TV info as JSON string.
+         @returns:- File/movie/TV info as JSON object.
         """
-        results = json.loads(self.search(name, year=year))
+        results = self.search(name, year=year, tv=tv)
         all_results = [i for i in self.search_results['results'] if 'title' in i['url']]
         # print(all_results)
 
         # filtering TV and movies
-        if tv is True:  # for tv/Web-Series only
-            tv_only = [result for result in all_results if "TV" in result['name']]
-            if year is not None:
-                tv_only = [result for result in tv_only if str(year) in result['name']]
-            # double checking by file name
-            if bool(tv_only):
-                tv_only_checked = [result for result in tv_only if result['name'].lower().startswith(name.split(" ")[0].lower())]
-                tv_only = tv_only_checked if bool(tv_only_checked) else tv_only
-            results['results'] = tv_only if bool(tv_only) else all_results
-
+        if tv:  # for tv/Web-Series only
+            results['results'] = all_results
         else:  # for movies only
-            movie_only = [result for result in all_results if "TV" not in result['name']]
-            if year is not None:
-                movie_only = [result for result in movie_only if str(year) in result['name']]
-            # double checking by file name
-            if bool(movie_only):
-                movie_only_checked = [result for result in movie_only if result['name'].lower().startswith(name.split(" ")[0].lower())]
-                movie_only = movie_only_checked if bool(movie_only_checked) else movie_only
-            results['results'] = movie_only if bool(movie_only) else all_results
+            results['results'] = [result for result in all_results if "movie" == result['type']]
         # print(results['results'])
 
         if len(results['results']) > 0:
@@ -231,7 +220,7 @@ def get_by_id(self, file_id):
         """
          @description:- Helps to search a file/movie/tv by its imdb ID.
          @parameter-1:- <str:file_id>, imdb ID of the file/movie/tv.
-         @returns:- File/movie/TV info as JSON string.
+         @returns:- File/movie/TV info as JSON object.
         """
         assert isinstance(file_id, str)
         url = f"{self.baseURL}/title/{file_id}"
@@ -242,28 +231,29 @@ def get_person(self, url):
         """
          @description:- Helps to search a person info by its url, (used by person_by_name() & person_by_id() ).
          @parameter-1:- <str:url>, url of the person's profile page.
-         @returns:- Person's info as JSON string.
+         @returns:- Person's info as JSON object.
         """
         try:
             response = self.session.get(url)
-            result = response.html.xpath("//script[@type='application/ld+json']")[0].text
-            result = f"""{result}"""
+            soup = BeautifulSoup(response.content,'html.parser')
+            result = soup.find('script', type='application/ld+json').text
             result = json.loads(result)
-        except json.decoder.JSONDecodeError as e:
+            result = ImdbParser(result).unescape_json_values
+        except (json.decoder.JSONDecodeError, IndexError) as e:
             return self.NA
 
         del result["@context"]
         result['type'] = result.get('@type')
         del result["@type"]
-        return json.dumps(result, indent=2)
+        return result
 
     def person_by_name(self, name):
         """
          @description:- Helps to search a person info by its name.
          @parameter-1:- <str:name>, name of the person.
-         @returns:- Person's info as JSON string.
+         @returns:- Person's info as JSON object.
         """
-        results = json.loads(self.search(name, person=True))
+        results = self.search(name, person=True)
         url = results['results'][0].get('url')
         return self.get_person(url)
 
@@ -271,7 +261,7 @@ def person_by_id(self, p_id):
         """
          @description:- Helps to search a person info by its imdb ID.
          @parameter-1:- <str:p_id>, imdb ID of the person's profile.
-         @returns:- Person's info as JSON string.
+         @returns:- Person's info as JSON object.
         """
         assert isinstance(p_id, str)
         url = f"{self.baseURL}/name/{p_id}"
@@ -282,7 +272,7 @@ def upcoming(self, region=None):
         """
          @description:- Helps to get upcoming movies/tv-series.
          @parameter-1:- <str:region> OPTIONAL, country code (like US, IN etc.) to filter results by region/country.
-         @returns:- upcoming movies/TV-Series info as JSON string.
+         @returns:- upcoming movies/TV-Series info as JSON object.
         """
         if region is not None:
             assert isinstance(region, str)
@@ -294,25 +284,21 @@ def upcoming(self, region=None):
             response = self.session.get(url)
         except requests.exceptions.ConnectionError as e:
             response = self.session.get(url, verify=False)
-
-        div = response.html.xpath("//div[@id='main']")[0]
-        h4 = div.find('h4')
-        ul = div.find('ul')
-        data = zip(h4, ul)
+        soup = BeautifulSoup(response.content,'html.parser')
+        script_tag = soup.find('script', type='application/json').text
+        script = json.loads(script_tag)
         output = []
-        for zip_el in data:
-            rel_date = zip_el[0].text
-            ulist = zip_el[1].find('a')
-            for movie in ulist:
+        for group in script["props"]["pageProps"]["groups"]:
+            for entry in group["entries"]:
                 output.append({
-                    'id': movie.attrs['href'].split('/')[2],
-                    'name': movie.text,
-                    'url': self.baseURL + movie.attrs['href'],
-                    'release_data': rel_date
+                    'id': entry["id"],
+                    'name': entry["titleText"],
+                    'url': self.baseURL + entry["id"],
+                    'release_data': entry["releaseDate"]
                 })
         results = {'result_count': len(output), 'results': output}
         if results['result_count'] > 0:
-            return json.dumps(results, indent=2)
+            return results
         else:
             return self.NA
 
@@ -321,7 +307,7 @@ def get_popular(self, url):
         """
          @description:- Helps to search popular movies/TV-Series by url, (used by popular_movies() & popular_tv() ).
          @parameter-1:- <str:url>, url to search.
-         @returns:- Files/Movies/TV-Series info as JSON string.
+         @returns:- Files/Movies/TV-Series info as JSON object.
         """
         assert isinstance(url, str)
         try:
@@ -329,8 +315,9 @@ def get_popular(self, url):
         except requests.exceptions.ConnectionError as e:
             response = self.session.get(url, verify=False)
 
-        links = response.html.xpath('//h3/a')
-        years = response.html.xpath("//h3")
+        soup = BeautifulSoup(response.content,'html.parser')
+        links = soup.select('h3 > a')
+        years = soup.select('h3 > span:last-child')
 
         if not bool(links) and bool(years):
             return self.NA
@@ -340,17 +327,15 @@ def get_popular(self, url):
             href = link.attrs.get('href', "#")
             if 'title' in href:
                 # getting year
-                year = year.find('span', containing='(')[0] if bool(year.find('span', containing='(')) else ""
-                if bool(year):
-                    year = "".join(re.findall(r"\d+", year.text))
-                    year = year[:4] + "-" + year[4:] if len(year) == 8 else year   # for TV
-                    year = year if len(year) == 4 else year  # for movies
+                if year != "":
+                    year = year.text.split('(')[-1].replace(')', '')
+                    year = year[:-2] if year[-2:] == "– " else year
                 else:
-                    year = "N/A"
+                    year = "unknown"
                 # getting poster
                 file_id = href.split('/')[2]
-                poster = response.html.xpath(f"//img[@data-tconst='{file_id}']")
-                poster = poster[0].attrs.get('loadlate', 'image_not_found') if bool(poster) else 'image_not_found'
+                poster = soup.find('img', {'data-tconst': file_id})
+                poster = poster.attrs.get('loadlate', 'image_not_found') if bool(poster) else 'image_not_found'
                 # creating file object
                 output.append({
                     'id': file_id,
@@ -361,7 +346,7 @@ def get_popular(self, url):
                 })
 
         self.search_results = {'result_count': len(output), 'results': output}
-        return json.dumps(self.search_results, indent=2)
+        return self.search_results
 
     def popular_movies(self, genre=None, start_id=1, sort_by=None):
         """
@@ -370,7 +355,7 @@ def popular_movies(self, genre=None, start_id=1, sort_by=None):
          @parameter-2:- <int:start_id> DEFAULT=1, start id to show results (shows results from start_id to start_id+50).
          @parameter-3:- <bool:sort_by> OPTIONAL, to sort results (eg. sort=user_rating,desc OR sort=user_rating,asc).
                         - (visit 'https://www.imdb.com/search/title/?title_type=movie' for more info)
-         @returns:- Popular Movies (by genre) info as JSON string.
+         @returns:- Popular Movies (by genre) info as JSON object.
         """
         assert isinstance(start_id, int)
         if genre is not None:
@@ -387,7 +372,7 @@ def popular_tv(self, genre=None, start_id=1, sort_by=None):
          @parameter-2:- <int:start_id> DEFAULT=1, start id to show results (shows results from start_id to start_id+50).
          @parameter-3:- <bool:sort_by> OPTIONAL, to sort results (eg. sort=user_rating,desc OR sort=user_rating,asc).
                         - (visit 'https://www.imdb.com/search/title/?title_type=movie' for more info)
-         @returns:- Popular TV-Series info as JSON string.
+         @returns:- Popular TV-Series info as JSON object.
         """
         assert isinstance(start_id, int)
         if genre is not None:
diff --git a/PyMovieDb/parser.py b/PyMovieDb/parser.py
index 5dd9c5d..26e95a9 100644
--- a/PyMovieDb/parser.py
+++ b/PyMovieDb/parser.py
@@ -1,55 +1,36 @@
+import html
 # for manipulate incoming data/json from IMDB (for invalid json string)
 class ImdbParser:
     """
-      - A class to manipulate incoming json string data of a movie/TV from IMDB.
-      - Changes are required as sometimes the json contains invalid chars in description/reviewBody/trailer schema
+      - A class to clean up already-parsed JSON data (dicts/lists) of a movie/TV from IMDB.
+      - Changes are required as the JSON values sometimes contain HTML-escaped characters (e.g. &quot;) that should be unescaped.
     """
-    def __init__(self, json_string):
-        self.json_string = json_string
-
+    def __init__(self, json_obj):
+        self.json_obj = json_obj
+        self.visited = set()
+        
     @property
-    def remove_trailer(self):
+    def unescape_json_values(self):
         """
-         @description:- Helps to remove 'trailer' schema from IMDB data json string.
-         @returns:- New updated JSON string.
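+        Recursively HTML-unescape every string value inside the JSON object (dicts/lists), turning newlines into spaces; mutates and returns the same object.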
         """
-        try:
-            self.json_string = ''.join(self.json_string.splitlines())
-            trailer_i = self.json_string.index('"trailer"')
-            actor_i = self.json_string.index('"actor"')
-            to_remove = self.json_string[trailer_i:actor_i:1]
-            self.json_string = self.json_string.replace(to_remove, "")
-        except ValueError:
-            self.json_string = self.json_string
-        return self.json_string
+        if id(self.json_obj) in self.visited:
+            return self.json_obj
 
-    @property
-    def remove_description(self):
-        """
-         @description:- Helps to remove 'description' schema from IMDB file json string.
-         @returns:- New updated JSON string.
-        """
-        try:
-            review_i = self.json_string.index('"review"')
-            des_i = self.json_string.index('"description"', 0, review_i)
-            to_remove = self.json_string[des_i:review_i:1]
-            self.json_string = self.json_string.replace(to_remove, "")
-        except ValueError:
-            self.json_string = self.json_string
-        return self.json_string
+        self.visited.add(id(self.json_obj))
 
-    @property
-    def remove_review_body(self):
-        """
-         @description:- Helps to remove 'reviewBody' schema from IMDB file json string.
-         @returns:- New updated JSON string.
-        """
-        try:
-            reviewrating_i = self.json_string.index('"reviewRating"')
-            reviewbody_i = self.json_string.index('"reviewBody"', 0, reviewrating_i)
-            to_remove = self.json_string[reviewbody_i:reviewrating_i:1]
-            self.json_string = self.json_string.replace(to_remove, "")
-        except ValueError:
-            self.json_string = self.json_string
-        return self.json_string
+        if isinstance(self.json_obj, dict):
+            for key, value in self.json_obj.items():
+                if isinstance(value, str):
+                    self.json_obj[key] = html.unescape(value).replace("\n", " ").replace("  ", " ")
+                elif isinstance(value, (dict, list)):
+                    self.json_obj[key] = ImdbParser(value).unescape_json_values
+
+        elif isinstance(self.json_obj, list):
+            for i, value in enumerate(self.json_obj):
+                if isinstance(value, str):
+                    self.json_obj[i] = html.unescape(value).replace("\n", " ").replace("  ", " ")
+                elif isinstance(value, (dict, list)):
+                    self.json_obj[i] = ImdbParser(value).unescape_json_values
 
+        return self.json_obj
diff --git a/setup.py b/setup.py
index 734d945..8550493 100644
--- a/setup.py
+++ b/setup.py
@@ -15,5 +15,5 @@
     url="https://github.com/itsmehemant7/PyMovieDb",
     packages=setuptools.find_packages(),
     zip_safe=False,
-    install_requires=["requests-html"]
+    install_requires=["beautifulsoup4"]
 )

From 9d904ebead84e0c10fba3b82f8950542648766bb Mon Sep 17 00:00:00 2001
From: Loukios <24696530+Loukious@users.noreply.github.com>
Date: Sun, 4 Jun 2023 19:26:06 +0100
Subject: [PATCH 2/2] Bumping version number

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 8550493..299191b 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 
 setuptools.setup(
     name="PyMovieDb",
-    version="0.0.8",
+    version="0.0.9",
     author="Hemant Malik",
     author_email="itsmehemant7@gmail.com",
     description="A Python Module that represents IMDB API",