
Commit

Merge branch 'dev' into add-source
zGadli authored Jun 17, 2024
2 parents 7d9ac4f + d80d6ce commit e5a2694
Showing 6 changed files with 689 additions and 405 deletions.
8 changes: 7 additions & 1 deletion .github/contribs.json
@@ -101,5 +101,11 @@
    "Nilan Ekanayake": null,
    "[email protected]": null,
    "Vuizur": "Vuizur",
-   "[email protected]": "Vuizur"
+   "[email protected]": "Vuizur",
+   "Serhii Kapchynskyi": null,
+   "[email protected]": null,
+   "CxRxExO": "CxRxExO",
+   "[email protected]": "CxRxExO",
+   "Zokhoi": "Zokhoi",
+   "[email protected]": "Zokhoi"
}
839 changes: 437 additions & 402 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion sources/_index.json

Large diffs are not rendered by default.

90 changes: 90 additions & 0 deletions sources/en/g/genesistls.py
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
import logging
from lncrawl.core.crawler import Crawler

logger = logging.getLogger(__name__)


class GenesisTlsCrawler(Crawler):
base_url = "https://genesistls.com"
search_url = base_url + "/?s=%s"

def search_novel(self, query):
soup = self.get_soup(self.search_url % query)

results = []
for novel_article in soup.select(".listupd article"):
novel_url = novel_article.select_one("a")["href"]
novel_title = novel_article.select_one("span.ntitle").text
novel_image = novel_article.select_one("img")["src"].split("?")[0]

results.append(
{
"url": novel_url,
"title": novel_title,
"img": novel_image
}
)

return results

def read_novel_info(self):
soup = self.get_soup(self.novel_url)

potential_novel_title = soup.select_one("h1.entry-title")
assert potential_novel_title, "No novel title"
self.novel_title = potential_novel_title.text
logger.info("Novel title: %s", self.novel_title)

        potential_author = soup.select_one('a[href^="https://genesistls.com/writer/"]')
assert potential_author, "No author"
self.novel_author = potential_author.text
logger.info("Novel author: %s", self.novel_author)

        cover_tag = soup.select_one(".bigcontent img[itemprop=image]")
        assert cover_tag, "No cover"
        self.novel_cover = self.absolute_url(cover_tag["src"]).split("?")[0]
        logger.info("Novel cover: %s", self.novel_cover)

        for ep_list_item in soup.select("article.hentry .eplister ul li"):
            # Skip paid chapters; free ones are labelled "Free" in the list.
            price = ep_list_item.select_one("div.epl-price")
            if price and price.text.strip() != "Free":
                continue

            chapter_id = len(self.chapters) + 1
            # Bucket chapters into synthetic volumes of roughly 100 each.
            vol_id = chapter_id // 100 + 1

            title_tag = ep_list_item.select_one("div.epl-title")
            chapter_title = (title_tag.text.strip() if title_tag else "") or f"Chapter {chapter_id}"

chapter_url = ep_list_item.select_one("a")["href"]

if len(self.volumes) < vol_id:
self.volumes.append({"id": vol_id})

self.chapters.append(
{
"id": chapter_id,
"volume": vol_id,
"title": chapter_title,
"url": self.absolute_url(chapter_url)
}
)

logger.debug(
"%d chapters and %d volumes found", len(self.chapters), len(self.volumes)
)

def download_chapter_body(self, chapter):
soup = self.get_soup(chapter["url"])

        contents = soup.select_one("div.epcontent")
        assert contents, "No chapter contents"
        contents = self.cleaner.extract_contents(contents)
        return str(contents)
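
A quick way to sanity-check the new source is to drive it directly, as in the sketch below. This assumes the usual lncrawl Crawler lifecycle (bare constructor plus the initialize() hook) also works outside the CLI, and the search query is an invented example.

# Hypothetical smoke test for GenesisTlsCrawler (illustrative only;
# the normal entry point is the lncrawl CLI, which discovers sources).
from sources.en.g.genesistls import GenesisTlsCrawler

crawler = GenesisTlsCrawler()
crawler.initialize()

# Search, then load metadata and the free-chapter list for the first hit.
results = crawler.search_novel("damn reincarnation")  # example query
crawler.novel_url = results[0]["url"]
crawler.read_novel_info()
print(crawler.novel_title, "-", len(crawler.chapters), "free chapters")

# Fetch one chapter body as HTML.
if crawler.chapters:
    print(crawler.download_chapter_body(crawler.chapters[0])[:200])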
152 changes: 152 additions & 0 deletions sources/es/novelasligeras.py
@@ -0,0 +1,152 @@
# -*- coding: utf-8 -*-
import logging
import re
from typing import List
from urllib.parse import urlparse

from lncrawl.core.crawler import Crawler
from lncrawl.models import Chapter, SearchResult, Volume

logger = logging.getLogger(__name__)
search_url = (
"https://novelasligeras.net/?post_type=product&title=1&excerpt=1&content=0&categories=1&attributes=1"
"&tags=1&sku=0&orderby=title-DESC&ixwps=1&s=%s"
)


class NovelasLigerasCrawler(Crawler):
base_url = ["https://novelasligeras.net/"]
has_manga = False
has_mtl = False

def initialize(self) -> None:
self.cleaner.bad_text_regex.update(["Publicidad"])
self.cleaner.bad_css.update(["div[style]"])

def login(self, email: str, password: str) -> None:
# TODO optimize login headers
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0",
"Content-Type": "application/x-www-form-urlencoded",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"TE": "Trailers",
"Referer": "https://novelasligeras.net/index.php/suscripcion-ingresar/",
}
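        # These fields mirror the site's WordPress/MemberPress login form
        # (the "mepr_" prefix is MemberPress); a successful POST stores the
        # session cookie that later page requests reuse.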
data = {
"log": email,
"pwd": password,
"wp-submit": "Acceder",
"redirect_to": "https://novelasligeras.net/index.php/suscripcion-cuenta-v2/",
"mepr_process_login_form": "true",
"mepr_is_login_page": "true",
"testcookie": "1",
}
self.post_response(self.base_url[0], data=data, headers=header)

def search_novel(self, query) -> List[SearchResult]:
query = query.lower().replace(" ", "+")
soup = self.get_soup(search_url % query)

results = []
for tab in soup.select(".wf-cell[data-post-id]"):
title = tab.attrs["data-name"]
rating_element = tab.select_one(".star-rating")
rating = "N/A"
if rating_element:
rating = rating_element.attrs["aria-label"]
            url_element = tab.select_one(".alignnone")
            if not url_element:
                continue
            url = url_element.attrs["href"]
results.append(
SearchResult(
title=title.strip(),
url=self.absolute_url(url),
info="Clasificación: %s" % rating,
)
)

return results

def read_novel_info(self) -> None:
logger.debug("Visiting %s", self.novel_url)
soup = self.get_soup(self.novel_url)

possible_title = soup.select_one("h1.product_title")
assert possible_title, "Sin título"
self.novel_title = possible_title.text.strip()

possible_author = soup.select_one(
'tr.woocommerce-product-attributes-item--attribute_pa_escritor a[rel="tag"]'
)
if possible_author:
self.novel_author = possible_author.text.strip()

possible_cover = soup.select_one('meta[property="og:image"]')
if possible_cover:
self.novel_cover = self.absolute_url(possible_cover["content"])

synopsis = soup.select_one(".woocommerce-product-details__short-description")
if synopsis:
self.novel_synopsis = synopsis.text

        hostname = urlparse(self.novel_url).hostname or ""
        pattern = re.escape(hostname + "/index.php") + r"/\d{4}/\d{2}/\d{2}/"

volume_pattern = r"-volumen-(\d+)-"

logger.debug("pattern = %s", pattern)

last_vol_id = 0
chapters_count = 0

for a in soup.select(
".wpb_wrapper a:not([id],[title],[href$='suscripciones/'],[href*='patreon'],[href*='paypal'])"
):
if not re.search(pattern, a["href"]):
continue
chapters_count += 1
chap_id = chapters_count

match = re.search(volume_pattern, a["href"])
if match:
vol_id = int(match.group(1))
last_vol_id = vol_id
else:
vol_id = last_vol_id

vol_present = any(vol["id"] == vol_id for vol in self.volumes)
vol_title = f"Volumen {vol_id}"
if not vol_present:
self.volumes.append(Volume(id=vol_id, title=vol_title))

temp_title = a.text.strip()
temp_title = re.sub(r"\bCapitulo\b", "Capítulo", temp_title)

if "Parte" in temp_title and "Capítulo" in temp_title:
partes = temp_title.split(" – ")
title = " – ".join(partes[::-1])
else:
title = temp_title

self.chapters.append(
Chapter(
id=chap_id,
title=title,
url=self.absolute_url(a["href"]),
volume=vol_id,
volume_title=vol_title,
)
)

def download_chapter_body(self, chapter):
soup = self.get_soup(chapter["url"])
        text = soup.select_one(".wpb_text_column > div:nth-child(1)")
        if text:
            return self.cleaner.extract_contents(text)
        return "--Error al cargar el capítulo--"
3 changes: 2 additions & 1 deletion sources/zh/piaotian.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
import logging

from lncrawl.core.crawler import Crawler
@@ -67,4 +68,4 @@ def download_chapter_body(self, chapter):

text = self.cleaner.extract_contents(body)

-        return text
+        return text
