
Commit

Fix syosetu volume parsing and crawler changes
dipu-bd committed Sep 19, 2023
1 parent acb54c4 commit 991c60c
Showing 2 changed files with 26 additions and 22 deletions.
5 changes: 3 additions & 2 deletions lncrawl/core/crawler.py
@@ -117,18 +117,19 @@ def extract_chapter_images(self, chapter: Chapter) -> None:
         if not chapter.body:
             return
 
+        has_changes = False
         chapter.setdefault("images", {})
         soup = self.make_soup(chapter.body)
         for img in soup.select("img[src]"):
             full_url = self.absolute_url(img["src"], page_url=chapter["url"])
             if not full_url.startswith("http"):
                 continue
 
             filename = hashlib.md5(full_url.encode()).hexdigest() + ".jpg"
             img.attrs = {"src": "images/" + filename, "alt": filename}
             chapter.images[filename] = full_url
+            has_changes = True
 
-        if soup.find("body") is not None:
+        if has_changes:
             chapter.body = soup.find("body").decode_contents()
 
     def download_chapters(
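Note: the crawler change swaps a structural check for a dirty flag. The old code re-serialized chapter.body whenever the parsed soup contained a body tag, which lenient parsers such as lxml produce even for bare fragments; the new code re-serializes only when at least one remote img was actually rewritten, so chapters without downloadable images keep their original markup byte-for-byte. Below is a minimal sketch of the same pattern outside lncrawl, assuming bs4 and lxml are installed; rewrite_images is an illustrative name, not part of the crawler API.

import hashlib

from bs4 import BeautifulSoup


def rewrite_images(body: str) -> str:
    # Repoint remote <img> tags at local files; hand back `body` untouched
    # when nothing was rewritten -- the has_changes guard from the commit.
    has_changes = False
    soup = BeautifulSoup(body, "lxml")  # lxml wraps fragments in <html><body>
    for img in soup.select("img[src]"):
        if not img["src"].startswith("http"):
            continue
        # Derive a stable local filename from the remote URL.
        filename = hashlib.md5(img["src"].encode()).hexdigest() + ".jpg"
        img.attrs = {"src": "images/" + filename, "alt": filename}
        has_changes = True
    if not has_changes:
        return body  # no re-serialization: markup stays byte-identical
    return soup.find("body").decode_contents()


print(rewrite_images("<p>text only</p>"))  # printed unchanged
print(rewrite_images('<p><img src="https://example.com/x.png"/></p>'))  # rewritten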
43 changes: 23 additions & 20 deletions sources/jp/s/syosetu.py
@@ -4,9 +4,7 @@
 from lncrawl.core.crawler import Crawler
 
 logger = logging.getLogger(__name__)
-search_url = (
-    "https://yomou.syosetu.com/search.php?word=%s"
-)
+search_url = "https://yomou.syosetu.com/search.php?word=%s"
 
 
 class SyosetuCrawler(Crawler):
@@ -18,8 +16,12 @@ def search_novel(self, query):
         results = []
         for tab in soup.select(".searchkekka_box"):
             a = tab.select_one(".novel_h a")
-            latest = tab.select_one(".left").get_text(separator=" ").strip()  # e.g.: 連載中 (全604部分)
-            votes = tab.select_one(".attention").text.strip()  # e.g.: "総合ポイント: 625,717 pt"
+            latest = (
+                tab.select_one(".left").get_text(separator=" ").strip()
+            )  # e.g.: 連載中 (全604部分)
+            votes = tab.select_one(
+                ".attention"
+            ).text.strip()  # e.g.: "総合ポイント: 625,717 pt"
             results.append(
                 {
                     "title": a.text.strip(),
@@ -30,9 +32,11 @@
         return results
 
     def read_novel_info(self):
+        self.init_parser('xml')
         soup = self.get_soup(self.novel_url)
 
         self.novel_title = soup.select_one(".novel_title").text.strip()
+        logger.debug('Novel title: %s', self.novel_title)
 
         # No novel cover.
 
@@ -41,29 +45,28 @@ def read_novel_info(self):
             self.novel_author = author_tag.text.strip()
 
         # Syosetu calls parts "chapters"
-        volume_id = 0 if len(soup.select(".chapter_title")) != 0 else 1
-        chapter_id = 1
-        for tag in soup.select(".chapter_title, .subtitle a"):
-            if tag.name == "a":
+        chapter_id = 0
+        volume = {"id": 0}
+        self.volumes.append(volume)
+        for tag in soup.select(".index_box .chapter_title, .index_box .subtitle a"):
+            if 'chapter_title' in tag.attrs.get('class', ''):
+                # Part/volume (there might be none)
+                volume = {
+                    "id": volume['id'] + 1,
+                    "title": tag.text.strip(),
+                }
+                self.volumes.append(volume)
+            elif tag.name == "a":
                 # Chapter
+                chapter_id += 1
                 self.chapters.append(
                     {
                         "id": chapter_id,
-                        "volume": volume_id,
+                        "volume": volume['id'],
                         "title": tag.text.strip() or ("Chapter %d" % chapter_id),
                         "url": self.absolute_url(tag["href"]),
                     }
                 )
-                chapter_id += 1
-            elif tag.name == "div":
-                # Part/volume (there might be none)
-                self.volumes.append(
-                    {
-                        "id": volume_id,
-                        "title": tag.text.strip(),
-                    }
-                )
-                volume_id += 1
 
     def download_chapter_body(self, chapter):
         soup = self.get_soup(chapter["url"])
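Note: this hunk is the volume-parsing fix named in the commit title. The old loop precomputed a starting volume_id from whether any .chapter_title existed and appended each volume only after its chapters had already been tagged, so chapter-to-volume assignment could drift; its selector was also unscoped and could match elements outside the index. The new loop scopes everything to .index_box, seeds an implicit volume 0 for chapters that precede any part heading, and advances the current volume in document order, tagging each chapter with the id of the heading above it. A toy demonstration of that assignment, assuming bs4 is installed; the HTML snippet only mimics Syosetu's index structure and is not the real page markup.

from bs4 import BeautifulSoup

# Toy table of contents mimicking Syosetu's .index_box layout
# (assumed, simplified markup -- not the real page).
html = """
<div class="index_box">
  <div class="chapter_title">第一章</div>
  <dd class="subtitle"><a href="/n0000aa/1/">Prologue</a></dd>
  <dd class="subtitle"><a href="/n0000aa/2/">Episode 1</a></dd>
  <div class="chapter_title">第二章</div>
  <dd class="subtitle"><a href="/n0000aa/3/">Episode 2</a></dd>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
volumes, chapters = [], []
chapter_id = 0
volume = {"id": 0}  # implicit volume for chapters before any part heading
volumes.append(volume)
for tag in soup.select(".index_box .chapter_title, .index_box .subtitle a"):
    if "chapter_title" in tag.attrs.get("class", ""):
        # A part heading starts the next volume.
        volume = {"id": volume["id"] + 1, "title": tag.text.strip()}
        volumes.append(volume)
    elif tag.name == "a":
        # A chapter link belongs to whichever volume is current.
        chapter_id += 1
        chapters.append({"id": chapter_id, "volume": volume["id"], "title": tag.text})

for ch in chapters:
    print(ch["volume"], ch["id"], ch["title"])
# -> 1 1 Prologue
#    1 2 Episode 1
#    2 3 Episode 2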
