Skip to content

Commit

Permalink
add nyxtranslation as a new en source
Browse files Browse the repository at this point in the history
  • Loading branch information
ACA authored and dipu-bd committed Feb 18, 2024
1 parent 9ba5d83 commit c307213
Showing 1 changed file with 140 additions and 0 deletions.
140 changes: 140 additions & 0 deletions sources/en/n/nyxtranslation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# -*- coding: utf-8 -*-
import logging
import re

from bs4.element import Tag
from lncrawl.core.crawler import Crawler
from lncrawl.models import Volume, Chapter

logger = logging.getLogger(__name__)

volume_aliases = {"volume", "arc", "series", "saga", "chronicle", "tome", "storyline"}


class NYXTranslation(Crawler):
base_url = ["https://nyx-translation.com/", "https://nyxtranslation.home.blog/"]
has_manga = False
has_mtl = False

def initialize(self):
self.cleaner.bad_tags.add("script")
self.cleaner.bad_tags.add("a")

def read_novel_info(self):
soup = self.get_soup(self.novel_url)

content = soup.select_one("main#main > article")

entry_title = content.select_one("h1.entry-title")
assert isinstance(entry_title, Tag)
self.novel_title = entry_title.text.strip()
pre_tags = content.find("strong", text=re.compile(r"Genre.*:.*"))
if pre_tags:
tags = pre_tags.find_next_sibling(None, text=re.compile(r"\w+,"))
for tag in tags.split(", "):
self.novel_tags.append(tag)

pre_author = content.find("strong", text=re.compile(r"Author.*:?.*"))
if pre_author:
maybe_author = pre_author.next_sibling
author = maybe_author
if ": " in maybe_author.text:
author = maybe_author.next_sibling
self.novel_author = author.text

cover = content.select_one("img") # first image is the novel cover
if cover:
src = str(cover['src'])
# may be replaced with JS after load, in such case try and get the real img hidden in data-values
if src.startswith("data:"):
try:
src = cover["data-orig-file"]
except KeyError:
pass
self.novel_cover = self.absolute_url(src)

description = ""
description_start = content.find("p", text="Description")
d_next = description_start.next_sibling
while True:
if not isinstance(d_next, Tag):
d_next = d_next.next_sibling
continue
if "Alternative Name(s)" in d_next.next_sibling or d_next.name != "p":
break
description += d_next.text + "\n"
d_next = d_next.next_sibling
self.novel_synopsis = description

# "inconsistency is key" - the site author, probably... (s is optional)
chapters_start = content.find("p", text=re.compile(r"Table of Contents?", re.IGNORECASE))
c_next = chapters_start.next_sibling
chap = ""
while c_next:
if not isinstance(c_next, Tag):
c_next = c_next.next_sibling
continue

# there are some aria-hidden spacing divs within the chapter list
# also skip text-emtpy elements
if (c_next.name == "div" and c_next.has_attr("aria-hidden")) or not c_next.text:
c_next = c_next.next_sibling
continue
links = c_next.find_all("a")
if not links:
if self.is_volume(c_next.text):
logger.info("Found a volume: %s", c_next.text)
self.volumes.append(
Volume(
id=len(self.volumes) + 1,
title=c_next.text.strip().replace(":", ""),
)
)
else:
# these are all elements (except the spacer div) that shouldn't appear -> it should be done
if c_next.name in ["div", "script", "footer"]:
break
chap = c_next.text # would be a chapter title
else:
for link in links:
href = str(link["href"])
if not self.on_site(href):
logger.info("Found external link, assuming lazy structure, link: %s", href)
c_next = chapters_start.parent.next_sibling
break # break out of for loop in this case.
if not re.match(re.compile(r".+-part-\d+.*"), href.lower()):
chap = ""
self.chapters.append(
Chapter(
id=len(self.chapters) + 1,
title=f"{chap} {link.text.lower()}",
url=self.absolute_url(href),
# guarantee chapters like prologues listed outside vol1, are in vol1
volume=max(len(self.volumes), 1),
)
)
c_next = c_next.next_sibling

# in rare cases the volume names don't have any indicators, so we end up without any, this "fixes" that.
if not self.volumes:
self.volumes.append(
Volume(
id=1,
title="All content"
)
)

def download_chapter_body(self, chapter):
soup = self.get_soup(chapter.url)

contents_html = soup.select_one("div.entry-content")
return self.cleaner.extract_contents(contents_html)

def on_site(self, href: str) -> bool:
if "http" in href.lower():
return max([href.startswith(url) for url in self.base_url])
return False

@classmethod
def is_volume(cls, text: str) -> bool:
return bool(max([x in text.lower() for x in volume_aliases]))

0 comments on commit c307213

Please sign in to comment.