diff --git a/lncrawl/templates/novelupdates.py b/lncrawl/templates/novelupdates.py
index 8474860ef..1a7a90fa5 100644
--- a/lncrawl/templates/novelupdates.py
+++ b/lncrawl/templates/novelupdates.py
@@ -28,6 +28,12 @@ class NovelupdatesTemplate(SearchableBrowserTemplate, ChapterOnlyBrowserTemplate
     _cached_crawlers: Mapping[str, Crawler] = {}
     _title_matcher = re.compile(r"^(c|ch|chap|chapter)?[^\w\d]*(\d+)$", flags=re.I)
 
+    def initialize(self):
+        """Set up the executor with four workers."""
+        self.init_executor(
+            workers=4,
+        )
+
     def wait_for_cloudflare(self):
         if "cf_clearance" in self.cookies:
             return
@@ -156,6 +162,21 @@ def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
         return super().select_chapter_body(soup)
 
     def parse_chapter_body(self, chapter: Chapter, text: str) -> str:
+        if "re-library" in chapter.url and "translations" not in chapter.url:
+            # Follow the centered link in the post body: it is either the
+            # chapter page itself (page_id) or names the novel whose index lists it.
+            soup = self.get_soup(chapter.url)
+            post_url = soup.select_one(".entry-content > p[style*='center'] a")['href']
+            if "page_id" in post_url:
+                chapter.url = post_url
+            else:
+                time.sleep(2.5)
+                novel_url = f"https://re-library.com/translations/{post_url.split('/')[4]}"
+                response = self.get_soup(novel_url)
+                chapters = response.select(".page_item > a")
+                chapter.url = chapters[chapter.id - 1]["href"]
+                time.sleep(2.5)
+
         crawler = self._find_original_crawler(chapter)
         if hasattr(crawler, "download_chapter_body_in_scraper"):
             return crawler.download_chapter_body_in_scraper(chapter)
diff --git a/sources/en/d/daotranslate.py b/sources/en/d/daotranslate.py
index 4b42680f4..8561596c4 100644
--- a/sources/en/d/daotranslate.py
+++ b/sources/en/d/daotranslate.py
@@ -11,7 +11,7 @@
 
 class DaoTranslateCrawler(Crawler):
     base_url = "https://daotranslate.com/"
-    has_mtl= True
+    has_mtl = True
 
     def initialize(self):
         self.init_executor(ratelimit=1.1)
@@ -57,7 +57,6 @@ def read_novel_info(self):
             self.novel_cover = possible_image["data-src"]
         logger.info("Novel cover: %s", self.novel_cover)
 
-
         possible_author = soup.select_one(
             ".info-content .spe span:nth-child(3) a"
         )
diff --git a/sources/en/m/mtlnation.py b/sources/en/m/mtlnation.py
index 73e4692d8..1808efbed 100644
--- a/sources/en/m/mtlnation.py
+++ b/sources/en/m/mtlnation.py
@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-import json
 import logging
 from urllib.parse import urlencode, urlparse
 
@@ -19,18 +18,11 @@ def initialize(self):
         self.init_executor(1)
 
     def login(self, email: str, password: str) -> None:
-        self.post_json(
-            "https://api.mtlnation.com/api/v2/accounts/login",
-            data=json.dumps(
-                {
-                    "identity": email,
-                    "password": password,
-                }
-            ),
-        )
-        jwt = self.cookies.get("jwt")
-        self.set_header("authorization", f"JWT {jwt}")
-        logger.info("Logged in with jwt %s", jwt)
+        # NOTE(review): the header value is "<email> <password>"; presumably
+        # callers pass an auth scheme and token in these fields - confirm.
+        self.set_header("Authorization", f"{email} {password}")
+        response = self.get_json("https://api.mtlnation.com/api/v2/users/me")
+        logger.info("Logged in as %s", response["data"]["name"])
 
     def search_novel(self, query):
         data = self.get_json(
diff --git a/sources/en/r/relibrary.py b/sources/en/r/relibrary.py
index ad6f174b2..30ef1bd9b 100644
--- a/sources/en/r/relibrary.py
+++ b/sources/en/r/relibrary.py
@@ -18,6 +18,7 @@ def initialize(self) -> None:
         self.init_executor(1)
         self.cleaner.bad_css.update(
             [
+                "tr",
                 ".nextPageLink",
                 ".prevPageLink",
                 ".su-button",
diff --git a/sources/zh/shw5.py b/sources/zh/shw5.py
new file mode 100644
index 000000000..160d2f579
--- /dev/null
+++ b/sources/zh/shw5.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+import logging
+
+from lncrawl.core.crawler import Crawler
+
+logger = logging.getLogger(__name__)
+
+
+class Shw5Crawler(Crawler):
+    """Crawler for the sites listed in ``base_url``."""
+
+    base_url = [
+        "https://www.shw5.cc/",
+        "https://www.bq99.cc/",
+        "https://www.p2wt.com/",
+    ]
+
+    def read_novel_info(self):
+        """Read title, cover, synopsis, author, chapters and volumes from the novel page."""
+        logger.debug("Visiting %s", self.novel_url)
+        soup = self.get_soup(self.novel_url)
+
+        possible_title = soup.select_one(".book h1")
+        assert possible_title, "No novel title"
+        self.novel_title = possible_title.text
+        logger.info("Novel title: %s", self.novel_title)
+
+        possible_novel_cover = soup.select_one('.book img')
+        if possible_novel_cover:
+            self.novel_cover = self.absolute_url(possible_novel_cover["src"])
+        logger.info("Novel cover: %s", self.novel_cover)
+
+        possible_synopsis = soup.select_one('.intro dd')
+        if possible_synopsis:
+            self.novel_synopsis = possible_synopsis.text
+        logger.info("Novel synopsis %s", self.novel_synopsis)
+
+        possible_novel_author = soup.select_one('.book .small span')
+        if possible_novel_author:
+            self.novel_author = possible_novel_author.text
+        logger.info("Novel author: %s", self.novel_author)
+
+        chapter_list = soup.select_one('.listmain')
+        assert chapter_list, "No chapter list"
+
+        volumes = set()
+        for a in chapter_list.find_all("a", rel=False):
+            ch_id = len(self.chapters) + 1
+            vol_id = 1 + len(self.chapters) // 100
+            volumes.add(vol_id)
+            self.chapters.append(
+                {
+                    "id": ch_id,
+                    "volume": vol_id,
+                    "title": a.text,
+                    "url": self.absolute_url(a["href"]),
+                }
+            )
+
+        # Set iteration order is not guaranteed; sort so volume ids ascend.
+        self.volumes = [{"id": x, "title": ""} for x in sorted(volumes)]
+
+    def download_chapter_body(self, chapter):
+        """Fetch the chapter page and hand its ``#chaptercontent`` element to the cleaner."""
+        soup = self.get_soup(chapter["url"])
+        contents = soup.select_one("#chaptercontent")
+        return self.cleaner.extract_contents(contents)
diff --git a/sources/zh/trxs.py b/sources/zh/trxs.py
new file mode 100644
index 000000000..40a82c7c4
--- /dev/null
+++ b/sources/zh/trxs.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+import logging
+
+from lncrawl.core.crawler import Crawler
+
+logger = logging.getLogger(__name__)
+
+
+class TrxsCrawler(Crawler):
+    """Crawler for trxs.cc; pages are requested with ``encoding='gb2312'``."""
+
+    base_url = "https://trxs.cc/"
+
+    def read_novel_info(self):
+        """Read title, cover, synopsis, author, chapters and volumes from the novel page."""
+        logger.debug("Visiting %s", self.novel_url)
+        soup = self.get_soup(self.novel_url, encoding='gb2312')
+
+        possible_title = soup.select_one(".book_info h1")
+        assert possible_title, "No novel title"
+        self.novel_title = possible_title.text
+        logger.info("Novel title: %s", self.novel_title)
+
+        possible_novel_cover = soup.select_one('.book_info img')
+        if possible_novel_cover:
+            self.novel_cover = self.absolute_url(possible_novel_cover["src"])
+        logger.info("Novel cover: %s", self.novel_cover)
+
+        possible_synopsis = soup.select_one('.book_info p')
+        if possible_synopsis:
+            self.novel_synopsis = possible_synopsis.text
+        logger.info("Novel synopsis %s", self.novel_synopsis)
+
+        possible_novel_author = soup.select_one('.book_info a')
+        if possible_novel_author:
+            self.novel_author = possible_novel_author.text
+        logger.info("Novel author: %s", self.novel_author)
+
+        volumes = set()
+        for a in soup.select(".book_list a"):
+            ch_id = len(self.chapters) + 1
+            vol_id = 1 + len(self.chapters) // 100
+            volumes.add(vol_id)
+            self.chapters.append(
+                {
+                    "id": ch_id,
+                    "volume": vol_id,
+                    "title": a.text,
+                    "url": self.absolute_url(a["href"]),
+                }
+            )
+
+        # Set iteration order is not guaranteed; sort so volume ids ascend.
+        self.volumes = [{"id": x, "title": ""} for x in sorted(volumes)]
+
+    def download_chapter_body(self, chapter):
+        """Fetch the chapter page and hand its ``.read_chapterDetail`` element to the cleaner."""
+        soup = self.get_soup(chapter["url"], encoding='gb2312')
+        contents = soup.select_one(".read_chapterDetail")
+        return self.cleaner.extract_contents(contents)