From 8cc5c82c260a9e359afa1d326289f5a0e17acc3b Mon Sep 17 00:00:00 2001
From: Guilherme Garcia <gui.garcia67@gmail.com>
Date: Fri, 1 Nov 2024 08:17:52 -0300
Subject: [PATCH] hotfix: adapt the scraper to the new feature table format on
 Paulo Amorim

---
 scrapers/paulo_amorim.py | 120 +++++++++++++++++++++++++++++++++------
 1 file changed, 103 insertions(+), 17 deletions(-)
diff --git a/scrapers/paulo_amorim.py b/scrapers/paulo_amorim.py
index a6da123..c62098f 100644
--- a/scrapers/paulo_amorim.py
+++ b/scrapers/paulo_amorim.py
@@ -143,6 +143,25 @@ def _get_movie_excerpt(self):
                 .text
             )
 
+    def _get_movies_from_table(self, feature_timetable):
+        for feature_tr in feature_timetable.find_all("tr"):
+            feature_tds = feature_tr.find_all("td")
+            for movie in self.movies:
+                if movie["title"].lower() == feature_tds[2].text.lower():
+                    # Movie will be featured today
+                    time_str = (
+                        unicodedata.normalize("NFKC", feature_tds[0].text)
+                        .strip("\n")
+                        .strip()
+                        .split(" ")[0]
+                    )
+                    hour_str, min_str = time_str.split("h")
+                    if min_str:
+                        parsed_time = dt_time(int(hour_str), int(min_str))
+                    else:
+                        parsed_time = dt_time(int(hour_str))
+                    movie["time"].append(parsed_time)
+
     def _get_today_str(self):
         """returns de current day in
         {XX de mês} format, with and without a leading zero
@@ -190,23 +209,7 @@ def _get_todays_features(self):
 
             feature_timetable = p_tag.find_next_sibling("table")
             if feature_timetable:
-                for feature_tr in feature_timetable.find_all("tr"):
-                    feature_tds = feature_tr.find_all("td")
-                    for movie in self.movies:
-                        if movie["title"].lower() == feature_tds[2].text.lower():
-                            # Movie will be featured today
-                            time_str = (
-                                unicodedata.normalize("NFKC", feature_tds[0].text)
-                                .strip("\n")
-                                .strip()
-                                .split(" ")[0]
-                            )
-                            hour_str, min_str = time_str.split("h")
-                            if min_str:
-                                parsed_time = dt_time(int(hour_str), int(min_str))
-                            else:
-                                parsed_time = dt_time(int(hour_str))
-                            movie["time"].append(parsed_time)
+                self._get_movies_from_table(feature_timetable)
             else:
                 for strong in p_tag.find_all("strong"):
                     for movie in self.movies:
@@ -231,6 +234,89 @@ def _get_todays_features(self):
 
                         movie["time"].append(parsed_time)
         features = [movie for movie in self.movies if len(movie["time"]) > 0]
+        if len(features) == 0:
+            # they are probably all in one big unformatted table
+            # <table border="0" cellpadding="0" cellspacing="0" style="width:461px">
+            #     <tbody>
+            #         <tr>
+            #             <td colspan="3"><strong>31 de outubro&nbsp;| quinta</strong></td>
+            #         </tr>
+            #         <tr>
+            #             <td colspan="3">&nbsp;</td>
+            #         </tr>
+            #         <tr>
+            #             <td>14h15</td>
+            #             <td>PA</td>
+            #             <td><a href="https://www.cinematecapauloamorim.com.br/programacao/2111/megalopolis">Megal&oacute;polis</a></td>
+            #         </tr>
+            #         <tr>
+            #            ...
+            #         </tr>
+            #         <tr>
+            #            ...
+            #         </tr>
+            #           ...
+            #         <tr>
+            #             <td colspan="3" rowspan="2">&nbsp;</td>
+            #         </tr>
+            #         <tr>
+            #         </tr>
+            #         <tr>
+            #             <td colspan="3"><strong>1 de novembro&nbsp;| sexta</strong></td>
+            #         </tr>
+            #         <tr>
+            #             <td colspan="3">&nbsp;</td>
+            #         </tr>
+            #         <tr>
+            #             <td>14h15</td>
+            #             <td>PA</td>
+            #             <td><a href="...">...</a></td>
+            #         </tr>
+            for strong_tag in grade_soup.find_all("strong"):
+                strong_text = unicodedata.normalize("NFKC", strong_tag.text)
+                movie_matches_today = strong_text.lower().startswith(
+                    today_str
+                ) or strong_text.lower().startswith(today_str_no_leading_zero)
+                if not movie_matches_today:
+                    continue
+
+                strong_tag_tr = strong_tag.parent.parent
+                # get all trs after the current one
+                rows_after = strong_tag_tr.find_next_siblings("tr")
+                for feature_tr in rows_after:
+                    feature_tds = feature_tr.find_all("td")
+                    # needs to be in the following format
+                    # <tr>
+                    #    <td>19h</td>
+                    #    <td>PA</td>
+                    #    <td><a href="...">Movie name</a></td>
+                    # </tr>
+                    if len(feature_tds) != 3:
+                        # not in the format we expect
+                        continue
+                    for movie in self.movies:
+                        # make sure we only get the first occurrence of that movie
+                        if movie.get("scrapped", False) is True:
+                            continue
+                        if movie["title"].lower() == feature_tds[2].text.lower():
+                            # Movie will be featured today
+                            time_str = (
+                                unicodedata.normalize("NFKC", feature_tds[0].text)
+                                .strip("\n")
+                                .strip()
+                                .split(" ")[0]
+                            )
+                            hour_str, min_str = time_str.split("h")
+                            if min_str:
+                                parsed_time = dt_time(int(hour_str), int(min_str))
+                            else:
+                                parsed_time = dt_time(int(hour_str))
+                            movie["time"].append(parsed_time)
+                            movie["scrapped"] = True
+                    features = [
+                        movie for movie in self.movies if len(movie["time"]) > 0
+                    ]
+
         sorted_features = sorted(features, key=lambda feature: feature["time"][0])
         for feature in sorted_features:
             feature["time"] = "/ ".join(

31 de outubro \| quinta

14h15	PA	Megalópolis


1 de novembro \| sexta

14h15	PA	...
19h	PA	Movie name