From d794048612096877d5e40a6976435577b1bd06b1 Mon Sep 17 00:00:00 2001 From: zGadli Date: Fri, 1 Mar 2024 12:02:42 +0530 Subject: [PATCH 01/10] Update 69shuba.py --- sources/zh/69shuba.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sources/zh/69shuba.py b/sources/zh/69shuba.py index e2fe25b22..95261c158 100644 --- a/sources/zh/69shuba.py +++ b/sources/zh/69shuba.py @@ -14,9 +14,9 @@ "Accept-Language": "en-US,en;q=0.9,de-CH;q=0.8,de;q=0.7", "Cache-Control": "no-cache", "Content-Type": "application/x-www-form-urlencoded", - "Origin": "https://www.69xinshu.com", + "Origin": "https://www.69shu.pro", "DNT": "1", - "Referer": "https://www.69xinshu.com/modules/article/search.php", + "Referer": "https://www.69shu.pro/modules/article/search.php", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Opera GX";v="106"', @@ -28,7 +28,7 @@ } logger = logging.getLogger(__name__) -search_url = "https://www.69xinshu.com/modules/article/search.php" # Updated to the new domain +search_url = "https://www.69shu.pro/modules/article/search.php" # Updated to the new domain class sixnineshu(Crawler): @@ -36,6 +36,7 @@ class sixnineshu(Crawler): "https://www.69shuba.com/", "https://www.69shu.com/", "https://www.69xinshu.com/", + "https://www.69shu.pro/" ] def initialize(self): From 0f7cd6b8236791697516295abe18fda6648eb75c Mon Sep 17 00:00:00 2001 From: zGadli Date: Fri, 15 Mar 2024 14:32:49 +0530 Subject: [PATCH 02/10] Update daotranslate.py --- sources/en/d/daotranslate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sources/en/d/daotranslate.py b/sources/en/d/daotranslate.py index 8561596c4..646456f27 100644 --- a/sources/en/d/daotranslate.py +++ b/sources/en/d/daotranslate.py @@ -6,11 +6,11 @@ from lncrawl.core.crawler import Crawler logger = logging.getLogger(__name__) -search_url = "https://daotranslate.com/?s=%s" +search_url = "https://daotranslate.us/?s=%s" class DaoTranslateCrawler(Crawler): - base_url = "https://daotranslate.com/" + base_url = ["https://daotranslate.com/","https://daotranslate.us/"] has_mtl = True def initialize(self): From efa97352c8e7f85a8b5bbc90a897ae226395507b Mon Sep 17 00:00:00 2001 From: zGadli Date: Fri, 15 Mar 2024 14:35:08 +0530 Subject: [PATCH 03/10] Update daotranslate.py --- sources/en/d/daotranslate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/en/d/daotranslate.py b/sources/en/d/daotranslate.py index 646456f27..05f6a08cf 100644 --- a/sources/en/d/daotranslate.py +++ b/sources/en/d/daotranslate.py @@ -10,7 +10,7 @@ class DaoTranslateCrawler(Crawler): - base_url = ["https://daotranslate.com/","https://daotranslate.us/"] + base_url = ["https://daotranslate.com/", "https://daotranslate.us/"] has_mtl = True def initialize(self): From 5f021d96ed562ed1ff0d7d00ae34f7c45a2f7922 Mon Sep 17 00:00:00 2001 From: zGadli Date: Wed, 20 Mar 2024 12:04:44 +0530 Subject: [PATCH 04/10] Update aquamanga.py --- sources/en/a/aquamanga.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/en/a/aquamanga.py b/sources/en/a/aquamanga.py index b991e4678..8ccd26b53 100644 --- a/sources/en/a/aquamanga.py +++ b/sources/en/a/aquamanga.py @@ -8,7 +8,7 @@ class AquaMangaCrawler(Crawler): has_manga = True - base_url = ["https://aquamanga.com/"] + base_url = ["https://aquamanga.com/", "https://aquamanga.org/"] search_url = "%s?s=%s&post_type=wp-manga&author=&artist=&release=" From e7b7ed3b618b32be1d827bc44287aea2f5efcd91 Mon Sep 17 00:00:00 2001 From: zGadli Date: Wed, 20 Mar 2024 15:57:50 +0530 Subject: [PATCH 05/10] Add source webtoons --- sources/en/w/webtoons.py | 127 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 sources/en/w/webtoons.py diff --git a/sources/en/w/webtoons.py b/sources/en/w/webtoons.py new file mode 100644 index 000000000..4f3927446 --- /dev/null +++ b/sources/en/w/webtoons.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- + +import logging +from urllib.parse import urlparse, parse_qs +from lncrawl.core.crawler import Crawler + +logger = logging.getLogger(__name__) + + +class WebToonsCrawler(Crawler): + has_manga = True + base_url = ["https://www.webtoons.com/"] + + search_url = "%ssearch?keyword=%s" + + def initialize(self) -> None: + self.cleaner.bad_tags.update(["h3"]) + + def search_novel(self, query): + query = query.lower().replace(" ", "+") + + search_url1 = self.search_url % (self.home_url, query) + search_url2 = search_url1 + "&searchType=CHALLENGE" + + soup = self.get_soup(search_url1) + soup1 = self.get_soup(search_url2) + + results = [] + for tab in soup.select("ul.card_lst li"): + a = tab.select_one("a") + title = tab.select_one("p.subj") + results.append( + { + "title": title, + "url": self.absolute_url(a["href"]) + } + ) + + for tab in soup1.select("div.challenge_lst.search ul"): + a = tab.select_one("a.challenge_item") + title = tab.select_one("p.subj") + results.append( + { + "title": title, + "url": self.absolute_url(a["href"]) + } + ) + + return results + + def read_novel_info(self): # need to check if there is only 1 pagination + logger.debug("Visiting %s", self.novel_url) + soup = self.get_soup(self.novel_url) + + possible_title = soup.select_one("h1.subj") + self.novel_title = possible_title.text.strip() + logger.info("Novel title: %s", self.novel_title) + + self.novel_author = soup.select_one("a.author").text + logger.info("%s", self.novel_author) + + last_link = soup.select_one("div.paginate a:nth-last-child(1)") + + url = str(last_link["href"]) + + parsed_url = urlparse(url) + query_params = parse_qs(parsed_url.query) + page_number = query_params.get('page', [])[0] if 'page' in query_params else None + page_number = int(page_number) + + futures = [ + self.executor.submit(self.get_soup, f"{self.novel_url}&page={i}") + for i in range(1, page_number + 1) + ] + page_soups = [f.result() for f in futures] + # url_selector : element["href"] , chap_title : element.select_one("span.subj").text + + num = 1 + numbers = [] + chap_links = [] + chap_titles = [] + + for element in reversed( + [a for soup in page_soups for a in soup.select("#_listUl a")] + ): + numbers.append(num) + chap_links.append(element["href"]) + chap_titles.append(element.select_one("span.subj").text) + num += 1 + + data = {} + sets_of_data = [] + + for number, link, title in zip(numbers, chap_links, chap_titles): + sets_of_data.append((number, link, title)) + + for number, link, title in sets_of_data: + data[number] = (link, title) + + for chap_num, (link, title) in data.items(): + chap_id = len(self.chapters) + 1 + vol_id = 1 + len(self.chapters) // 100 + if chap_id % 100 == 1: + self.volumes.append({"id": vol_id}) + self.chapters.append( + { + "id": chap_id, + "volume": vol_id, + "title": title, + "url": self.absolute_url(link), + } + ) + + def download_chapter_body(self, chapter): + logger.info("Visiting %s", chapter["url"]) + soup = self.get_soup(chapter["url"]) + contents = soup.select_one("#_imageList") + + for img in contents.findAll("img"): + if img.has_attr("data-src"): + src_url = img["data-src"] + parent = img.parent + img.extract() + new_tag = soup.new_tag("img", src=src_url) + parent.append(new_tag) + + return self.cleaner.extract_contents(contents) From 2f6f0bda6d048e5f1484b6080a31a8042569d9a9 Mon Sep 17 00:00:00 2001 From: zGadli Date: Wed, 20 Mar 2024 17:35:44 +0530 Subject: [PATCH 06/10] fix source webtoons --- lncrawl/core/scraper.py | 1 + sources/en/w/webtoons.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lncrawl/core/scraper.py b/lncrawl/core/scraper.py index 689a2fae9..464b2bda6 100644 --- a/lncrawl/core/scraper.py +++ b/lncrawl/core/scraper.py @@ -303,6 +303,7 @@ def get_soup(self, url, headers={}, encoding=None, **kwargs) -> BeautifulSoup: "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9", ) + kwargs["headers"] = headers response = self.get_response(url, **kwargs) self.last_soup_url = url return self.make_soup(response, encoding) diff --git a/sources/en/w/webtoons.py b/sources/en/w/webtoons.py index 4f3927446..941ec3856 100644 --- a/sources/en/w/webtoons.py +++ b/sources/en/w/webtoons.py @@ -113,12 +113,12 @@ def read_novel_info(self): # need to check if there is only 1 pagination def download_chapter_body(self, chapter): logger.info("Visiting %s", chapter["url"]) - soup = self.get_soup(chapter["url"]) + soup = self.get_soup(chapter["url"], headers={'Referer': f'{self.novel_url}'}) contents = soup.select_one("#_imageList") for img in contents.findAll("img"): - if img.has_attr("data-src"): - src_url = img["data-src"] + if img.has_attr("data-url"): + src_url = img["data-url"] parent = img.parent img.extract() new_tag = soup.new_tag("img", src=src_url) From aed724ea8276caca19623d62ea81623c45c2c8da Mon Sep 17 00:00:00 2001 From: zGadli Date: Wed, 20 Mar 2024 17:45:40 +0530 Subject: [PATCH 07/10] revert changes and create new PR for new source --- lncrawl/core/scraper.py | 1 - sources/en/w/webtoons.py | 127 --------------------------------------- 2 files changed, 128 deletions(-) delete mode 100644 sources/en/w/webtoons.py diff --git a/lncrawl/core/scraper.py b/lncrawl/core/scraper.py index 464b2bda6..689a2fae9 100644 --- a/lncrawl/core/scraper.py +++ b/lncrawl/core/scraper.py @@ -303,7 +303,6 @@ def get_soup(self, url, headers={}, encoding=None, **kwargs) -> BeautifulSoup: "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9", ) - kwargs["headers"] = headers response = self.get_response(url, **kwargs) self.last_soup_url = url return self.make_soup(response, encoding) diff --git a/sources/en/w/webtoons.py b/sources/en/w/webtoons.py deleted file mode 100644 index 941ec3856..000000000 --- a/sources/en/w/webtoons.py +++ /dev/null @@ -1,127 +0,0 @@ -# -*- coding: utf-8 -*- - -import logging -from urllib.parse import urlparse, parse_qs -from lncrawl.core.crawler import Crawler - -logger = logging.getLogger(__name__) - - -class WebToonsCrawler(Crawler): - has_manga = True - base_url = ["https://www.webtoons.com/"] - - search_url = "%ssearch?keyword=%s" - - def initialize(self) -> None: - self.cleaner.bad_tags.update(["h3"]) - - def search_novel(self, query): - query = query.lower().replace(" ", "+") - - search_url1 = self.search_url % (self.home_url, query) - search_url2 = search_url1 + "&searchType=CHALLENGE" - - soup = self.get_soup(search_url1) - soup1 = self.get_soup(search_url2) - - results = [] - for tab in soup.select("ul.card_lst li"): - a = tab.select_one("a") - title = tab.select_one("p.subj") - results.append( - { - "title": title, - "url": self.absolute_url(a["href"]) - } - ) - - for tab in soup1.select("div.challenge_lst.search ul"): - a = tab.select_one("a.challenge_item") - title = tab.select_one("p.subj") - results.append( - { - "title": title, - "url": self.absolute_url(a["href"]) - } - ) - - return results - - def read_novel_info(self): # need to check if there is only 1 pagination - logger.debug("Visiting %s", self.novel_url) - soup = self.get_soup(self.novel_url) - - possible_title = soup.select_one("h1.subj") - self.novel_title = possible_title.text.strip() - logger.info("Novel title: %s", self.novel_title) - - self.novel_author = soup.select_one("a.author").text - logger.info("%s", self.novel_author) - - last_link = soup.select_one("div.paginate a:nth-last-child(1)") - - url = str(last_link["href"]) - - parsed_url = urlparse(url) - query_params = parse_qs(parsed_url.query) - page_number = query_params.get('page', [])[0] if 'page' in query_params else None - page_number = int(page_number) - - futures = [ - self.executor.submit(self.get_soup, f"{self.novel_url}&page={i}") - for i in range(1, page_number + 1) - ] - page_soups = [f.result() for f in futures] - # url_selector : element["href"] , chap_title : element.select_one("span.subj").text - - num = 1 - numbers = [] - chap_links = [] - chap_titles = [] - - for element in reversed( - [a for soup in page_soups for a in soup.select("#_listUl a")] - ): - numbers.append(num) - chap_links.append(element["href"]) - chap_titles.append(element.select_one("span.subj").text) - num += 1 - - data = {} - sets_of_data = [] - - for number, link, title in zip(numbers, chap_links, chap_titles): - sets_of_data.append((number, link, title)) - - for number, link, title in sets_of_data: - data[number] = (link, title) - - for chap_num, (link, title) in data.items(): - chap_id = len(self.chapters) + 1 - vol_id = 1 + len(self.chapters) // 100 - if chap_id % 100 == 1: - self.volumes.append({"id": vol_id}) - self.chapters.append( - { - "id": chap_id, - "volume": vol_id, - "title": title, - "url": self.absolute_url(link), - } - ) - - def download_chapter_body(self, chapter): - logger.info("Visiting %s", chapter["url"]) - soup = self.get_soup(chapter["url"], headers={'Referer': f'{self.novel_url}'}) - contents = soup.select_one("#_imageList") - - for img in contents.findAll("img"): - if img.has_attr("data-url"): - src_url = img["data-url"] - parent = img.parent - img.extract() - new_tag = soup.new_tag("img", src=src_url) - parent.append(new_tag) - - return self.cleaner.extract_contents(contents) From 4c0071b7f5785d9cc7ae0aa732fc7126a628e711 Mon Sep 17 00:00:00 2001 From: zGadli Date: Wed, 20 Mar 2024 20:04:52 +0530 Subject: [PATCH 08/10] Revert "Update 69shuba.py" This reverts commit d794048612096877d5e40a6976435577b1bd06b1. --- sources/zh/69shuba.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sources/zh/69shuba.py b/sources/zh/69shuba.py index 95261c158..e2fe25b22 100644 --- a/sources/zh/69shuba.py +++ b/sources/zh/69shuba.py @@ -14,9 +14,9 @@ "Accept-Language": "en-US,en;q=0.9,de-CH;q=0.8,de;q=0.7", "Cache-Control": "no-cache", "Content-Type": "application/x-www-form-urlencoded", - "Origin": "https://www.69shu.pro", + "Origin": "https://www.69xinshu.com", "DNT": "1", - "Referer": "https://www.69shu.pro/modules/article/search.php", + "Referer": "https://www.69xinshu.com/modules/article/search.php", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Opera GX";v="106"', @@ -28,7 +28,7 @@ } logger = logging.getLogger(__name__) -search_url = "https://www.69shu.pro/modules/article/search.php" # Updated to the new domain +search_url = "https://www.69xinshu.com/modules/article/search.php" # Updated to the new domain class sixnineshu(Crawler): @@ -36,7 +36,6 @@ class sixnineshu(Crawler): "https://www.69shuba.com/", "https://www.69shu.com/", "https://www.69xinshu.com/", - "https://www.69shu.pro/" ] def initialize(self): From e1c192586b0b21fb17d58a5a101366e56bfb3cc7 Mon Sep 17 00:00:00 2001 From: zGadli Date: Wed, 20 Mar 2024 20:08:36 +0530 Subject: [PATCH 09/10] revert changes --- sources/en/a/aquamanga.py | 2 +- sources/en/d/daotranslate.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sources/en/a/aquamanga.py b/sources/en/a/aquamanga.py index 8ccd26b53..b991e4678 100644 --- a/sources/en/a/aquamanga.py +++ b/sources/en/a/aquamanga.py @@ -8,7 +8,7 @@ class AquaMangaCrawler(Crawler): has_manga = True - base_url = ["https://aquamanga.com/", "https://aquamanga.org/"] + base_url = ["https://aquamanga.com/"] search_url = "%s?s=%s&post_type=wp-manga&author=&artist=&release=" diff --git a/sources/en/d/daotranslate.py b/sources/en/d/daotranslate.py index 05f6a08cf..8561596c4 100644 --- a/sources/en/d/daotranslate.py +++ b/sources/en/d/daotranslate.py @@ -6,11 +6,11 @@ from lncrawl.core.crawler import Crawler logger = logging.getLogger(__name__) -search_url = "https://daotranslate.us/?s=%s" +search_url = "https://daotranslate.com/?s=%s" class DaoTranslateCrawler(Crawler): - base_url = ["https://daotranslate.com/", "https://daotranslate.us/"] + base_url = "https://daotranslate.com/" has_mtl = True def initialize(self): From b9ff32b54a04f0ecb43a29716bfcb38804268304 Mon Sep 17 00:00:00 2001 From: zGadli Date: Wed, 20 Mar 2024 20:21:59 +0530 Subject: [PATCH 10/10] Update scraper.py --- lncrawl/core/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lncrawl/core/scraper.py b/lncrawl/core/scraper.py index 689a2fae9..fe6b25539 100644 --- a/lncrawl/core/scraper.py +++ b/lncrawl/core/scraper.py @@ -105,7 +105,7 @@ def __process_request(self, method: str, url, **kwargs): headers.setdefault("Referer", self.last_soup_url or self.home_url) headers.setdefault("User-Agent", self.user_agent) kwargs["headers"] = { - str(k).encode("ascii"): str(v).encode("ascii") + str(k).encode("utf-8"): str(v).encode("utf-8") for k, v in headers.items() if v }