Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add xnunu.com #2421

Merged
merged 1 commit into from
Jul 24, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions sources/zh/xnunu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import logging
from bs4 import Tag
from lncrawl.core.crawler import Crawler

from lncrawl.models import Volume, Chapter

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class Xnunu(Crawler):
    """Crawler for https://www.xnunu.com.

    Search is behind a captcha, so it is not implemented; novels have to be
    opened through a direct URL such as ``https://www.xnunu.com/book/9223``.
    """

    base_url = ["https://www.xnunu.com/"]

    def initialize(self) -> None:
        """Register the per-page bookmark reminder text with the cleaner."""
        # Every page starts with a "please bookmark this site" reminder inside
        # a <font> tag; these are the text fragments of that reminder.
        self.cleaner.bad_tag_text_pairs["font"] = [
            "提示您:看后求收藏(",
            "搞事马甲不能掉,新努努书坊,www.xnunu.com",
            "),接着再看更方便。",
        ]

    @staticmethod
    def _is_continuation_link(next_url_tag) -> bool:
        """Tell whether the "next" link points to another page of the same chapter.

        Continuation pages look like ``123_2.html``, ``123_3.html``: the last
        path segment contains an underscore. A missing tag or a tag without
        ``href`` is treated as "no further page".
        """
        if next_url_tag is None:
            return False
        href = next_url_tag.get("href")
        if not href:
            return False
        return "_" in href.split("/")[-1]

    def _strip_author_line(self, page: Tag) -> Tag:
        """Drop the author name that is repeated at the start of every page.

        The first ``<p>`` is removed only when it starts with a non-empty
        ``self.novel_author``. Without that check an empty author would match
        every paragraph (``str.startswith("")`` is always true) and real
        content would be deleted.
        """
        first_paragraph = page.select_one("p")
        author = self.novel_author
        if (
            first_paragraph is not None
            and author
            and first_paragraph.text.startswith(author)
        ):
            first_paragraph.decompose()
        return page

    def download_chapter_body(self, chapter: Chapter) -> str:
        """Download one chapter, including all of its continuation pages.

        Also replaces ``chapter.title`` with the heading found on the chapter
        page when one is available.

        Returns:
            The cleaned contents of every page, joined with newlines.
        """
        main_data = self.get_soup(chapter.url)

        # Titles in the chapter list are not always complete, so prefer the
        # page heading. The heading carries a "page x/y" marker in a <small>
        # element which we do not want after each chapter's name.
        header = main_data.select_one("div#content>.page-header")
        chap_title = header.select_one("h1.h4") if header is not None else None
        if chap_title is not None:
            title = chap_title.text
            chap_paging = header.select_one("h1.h4>small")
            if chap_paging is not None:
                title = title.replace(chap_paging.text, "")
            title = title.strip()
            if title:
                chapter.title = title

        pages = [self._strip_author_line(main_data.select_one("#chaptercontent"))]

        # Some chapters are split over multiple pages; follow the "next" link
        # for as long as it stays inside the current chapter.
        visited = {chapter.url}
        next_link = main_data.select_one("#next_url")
        while self._is_continuation_link(next_link):
            next_url = self.absolute_url(next_link["href"])
            if next_url in visited:
                # Guard against a link cycle that would never terminate.
                break
            visited.add(next_url)
            page = self.get_soup(next_url)
            pages.append(self._strip_author_line(page.select_one("#chaptercontent")))
            next_link = page.select_one("#next_url")

        return "\n".join(self.cleaner.extract_contents(page) for page in pages)

    def read_novel_info(self) -> None:
        """Read title, cover, author, tag, synopsis and the chapter list.

        The numeric novel id is taken from the last path segment of
        ``self.novel_url``. When it cannot be parsed, an error is logged and
        the method returns without fetching anything.
        """
        # Validate before doing any network request; tolerate a trailing "/".
        novel_id = self.novel_url.rstrip("/").split("/")[-1]
        try:
            int(novel_id)
        except ValueError:
            logger.error("Couldn't get novel_id from URL, "
                         "URL should look like https://www.xnunu.com/book/9223")
            return

        logger.debug("Visiting %s", self.novel_url)
        soup = self.get_soup(self.novel_url)

        container = soup.select_one(".book-bookinfo")
        assert isinstance(container, Tag), "No book info container"

        possible_title = container.select_one("h1.name")
        assert possible_title, "No novel title"
        self.novel_title = possible_title.text.strip()
        logger.info("Novel title: %s", self.novel_title)

        possible_image = container.select_one("img.thumbnail")
        if isinstance(possible_image, Tag):
            self.novel_cover = self.absolute_url(possible_image["src"])
        logger.info("Novel cover: %s", self.novel_cover)

        possible_author = container.select_one("p>a.btn-info")
        if isinstance(possible_author, Tag):
            self.novel_author = possible_author.text.strip()
            assert self.novel_author, "No novel author, required for cleanup"
        logger.info("Novel Author: %s", self.novel_author)

        possible_tag = soup.select_one("ol.breadcrumb > li:nth-child(2) > a")
        if isinstance(possible_tag, Tag):
            self.novel_tags = [possible_tag.text.strip()]
        logger.info("Novel Tag: %s", self.novel_tags)

        possible_synopsis = container.select_one("#bookIntro")
        if isinstance(possible_synopsis, Tag):
            self.novel_synopsis = possible_synopsis.text
        logger.info("Novel Synopsis: %s", self.novel_synopsis)

        # The chapter index is paginated (needed for novels with > 100
        # chapters); a dropdown on the index page lists every index page.
        # NOTE(review): the "9" path segment is hard-coded; it may need to be
        # derived from novel_id for other novels - confirm against the site.
        index_url = f"https://www.xnunu.com/index/9/{novel_id}/1.html"
        chap_overview = self.get_soup(index_url)
        # There are two of these dropdowns on the page, with the same content.
        dropdown = chap_overview.select_one("#indexselect")
        if dropdown is None:
            # No pager found: fall back to the single index page we have.
            chap_links = [index_url]
        else:
            chap_links = [
                self.absolute_url(opt["value"])
                for opt in dropdown.select("option")
                if opt.get("value")
            ]

        for link in chap_links:
            # Reuse the already downloaded page instead of fetching it twice.
            chap_data = chap_overview if link == index_url else self.get_soup(link)

            for a in chap_data.select("dl.panel-chapterlist>dd>a"):
                href = a.get("href")
                if not href:
                    continue
                # Group chapters into volumes of 100.
                vol_id = len(self.chapters) // 100 + 1
                if len(self.chapters) % 100 == 0:
                    self.volumes.append(Volume(vol_id))
                self.chapters.append(
                    Chapter(
                        len(self.chapters) + 1,
                        url=self.absolute_url(href),
                        title=a.text.strip(),
                        volume=vol_id,
                    )
                )