
Commit

Fix syosetu volume parsing and crawler changes
dipu-bd committed Sep 19, 2023
1 parent acb54c4 commit 991c60c
Showing 2 changed files with 26 additions and 22 deletions.
5 changes: 3 additions & 2 deletions lncrawl/core/crawler.py
@@ -117,18 +117,19 @@ def extract_chapter_images(self, chapter: Chapter) -> None:
         if not chapter.body:
             return
 
+        has_changes = False
         chapter.setdefault("images", {})
         soup = self.make_soup(chapter.body)
         for img in soup.select("img[src]"):
             full_url = self.absolute_url(img["src"], page_url=chapter["url"])
             if not full_url.startswith("http"):
                 continue
 
             filename = hashlib.md5(full_url.encode()).hexdigest() + ".jpg"
             img.attrs = {"src": "images/" + filename, "alt": filename}
             chapter.images[filename] = full_url
+            has_changes = True
 
-        if soup.find("body") is not None:
+        if has_changes:
             chapter.body = soup.find("body").decode_contents()
 
     def download_chapters(
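Note: the crawler change swaps a structural check for a dirty flag. The old code re-serialized chapter.body whenever the parsed soup contained a body tag, which lenient parsers such as lxml produce even for bare fragments; the new code re-serializes only when at least one remote img was actually rewritten, so chapters without downloadable images keep their original markup byte-for-byte. Below is a minimal sketch of the same pattern outside lncrawl, assuming bs4 and lxml are installed; rewrite_images is an illustrative name, not part of the crawler API.

import hashlib

from bs4 import BeautifulSoup


def rewrite_images(body: str) -> str:
    # Repoint remote <img> tags at local files; hand back `body` untouched
    # when nothing was rewritten -- the has_changes guard from the commit.
    has_changes = False
    soup = BeautifulSoup(body, "lxml")  # lxml wraps fragments in <html><body>
    for img in soup.select("img[src]"):
        if not img["src"].startswith("http"):
            continue
        # Derive a stable local filename from the remote URL.
        filename = hashlib.md5(img["src"].encode()).hexdigest() + ".jpg"
        img.attrs = {"src": "images/" + filename, "alt": filename}
        has_changes = True
    if not has_changes:
        return body  # no re-serialization: markup stays byte-identical
    return soup.find("body").decode_contents()


print(rewrite_images("<p>text only</p>"))  # printed unchanged
print(rewrite_images('<p><img src="https://example.com/x.png"/></p>'))  # rewritten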
43 changes: 23 additions & 20 deletions sources/jp/s/syosetu.py
@@ -4,9 +4,7 @@
 from lncrawl.core.crawler import Crawler
 
 logger = logging.getLogger(__name__)
-search_url = (
-    "https://yomou.syosetu.com/search.php?word=%s"
-)
+search_url = "https://yomou.syosetu.com/search.php?word=%s"
 
 
 class SyosetuCrawler(Crawler):
@@ -18,8 +16,12 @@ def search_novel(self, query):
         results = []
         for tab in soup.select(".searchkekka_box"):
             a = tab.select_one(".novel_h a")
-            latest = tab.select_one(".left").get_text(separator=" ").strip()  # e.g.: 連載中 (全604部分)
-            votes = tab.select_one(".attention").text.strip()  # e.g.: "総合ポイント: 625,717 pt"
+            latest = (
+                tab.select_one(".left").get_text(separator=" ").strip()
+            )  # e.g.: 連載中 (全604部分)
+            votes = tab.select_one(
+                ".attention"
+            ).text.strip()  # e.g.: "総合ポイント: 625,717 pt"
             results.append(
                 {
                     "title": a.text.strip(),
@@ -30,9 +32,11 @@
         return results
 
     def read_novel_info(self):
+        self.init_parser('xml')
         soup = self.get_soup(self.novel_url)
 
         self.novel_title = soup.select_one(".novel_title").text.strip()
+        logger.debug('Novel title: %s', self.novel_title)
 
         # No novel cover.
 
@@ -41,29 +45,28 @@ def read_novel_info(self):
             self.novel_author = author_tag.text.strip()
 
         # Syosetu calls parts "chapters"
-        volume_id = 0 if len(soup.select(".chapter_title")) != 0 else 1
-        chapter_id = 1
-        for tag in soup.select(".chapter_title, .subtitle a"):
-            if tag.name == "a":
+        chapter_id = 0
+        volume = {"id": 0}
+        self.volumes.append(volume)
+        for tag in soup.select(".index_box .chapter_title, .index_box .subtitle a"):
+            if 'chapter_title' in tag.attrs.get('class', ''):
+                # Part/volume (there might be none)
+                volume = {
+                    "id": volume['id'] + 1,
+                    "title": tag.text.strip(),
+                }
+                self.volumes.append(volume)
+            elif tag.name == "a":
                 # Chapter
+                chapter_id += 1
                 self.chapters.append(
                     {
                         "id": chapter_id,
-                        "volume": volume_id,
+                        "volume": volume['id'],
                         "title": tag.text.strip() or ("Chapter %d" % chapter_id),
                         "url": self.absolute_url(tag["href"]),
                     }
                 )
-                chapter_id += 1
-            elif tag.name == "div":
-                # Part/volume (there might be none)
-                self.volumes.append(
-                    {
-                        "id": volume_id,
-                        "title": tag.text.strip(),
-                    }
-                )
-                volume_id += 1
 
     def download_chapter_body(self, chapter):
         soup = self.get_soup(chapter["url"])
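Note: this hunk is the volume-parsing fix named in the commit title. The old loop precomputed a starting volume_id from whether any .chapter_title existed and appended each volume only after its chapters had already been tagged, so chapter-to-volume assignment could drift; its selector was also unscoped and could match elements outside the index. The new loop scopes everything to .index_box, seeds an implicit volume 0 for chapters that precede any part heading, and advances the current volume in document order, tagging each chapter with the id of the heading above it. A toy demonstration of that assignment, assuming bs4 is installed; the HTML snippet only mimics Syosetu's index structure and is not the real page markup.

from bs4 import BeautifulSoup

# Toy table of contents mimicking Syosetu's .index_box layout
# (assumed, simplified markup -- not the real page).
html = """
<div class="index_box">
  <div class="chapter_title">第一章</div>
  <dd class="subtitle"><a href="/n0000aa/1/">Prologue</a></dd>
  <dd class="subtitle"><a href="/n0000aa/2/">Episode 1</a></dd>
  <div class="chapter_title">第二章</div>
  <dd class="subtitle"><a href="/n0000aa/3/">Episode 2</a></dd>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
volumes, chapters = [], []
chapter_id = 0
volume = {"id": 0}  # implicit volume for chapters before any part heading
volumes.append(volume)
for tag in soup.select(".index_box .chapter_title, .index_box .subtitle a"):
    if "chapter_title" in tag.attrs.get("class", ""):
        # A part heading starts the next volume.
        volume = {"id": volume["id"] + 1, "title": tag.text.strip()}
        volumes.append(volume)
    elif tag.name == "a":
        # A chapter link belongs to whichever volume is current.
        chapter_id += 1
        chapters.append({"id": chapter_id, "volume": volume["id"], "title": tag.text})

for ch in chapters:
    print(ch["volume"], ch["id"], ch["title"])
# -> 1 1 Prologue
#    1 2 Episode 1
#    2 3 Episode 2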
