Skip to content

Commit

Permalink
freewebnovel remove random self-promo in most cases
Browse files Browse the repository at this point in the history
  • Loading branch information
ACA committed Feb 24, 2024
1 parent 0aaf170 commit bbf31f7
Showing 1 changed file with 9 additions and 0 deletions.
9 changes: 9 additions & 0 deletions sources/en/f/freewebnovel.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import unicodedata
import re

from bs4 import BeautifulSoup, Tag

Expand Down Expand Up @@ -100,8 +101,16 @@ def normalize_text(self, text: str) -> str:

def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
    """Return the normalized chapter body without the hidden self-promo paragraph.

    The page may carry a ``<style>`` element with a ``p:nth-last-child(N)``
    rule that hides a paragraph (usually the last one) containing randomised
    self-promo text. When such a rule is found, the N-th ``<p>`` counted from
    the end of the normalized body is removed.

    :param soup: parsed chapter page.
    :return: a new ``BeautifulSoup`` built from the normalized body markup,
        or the (falsy) result of the selector when the body is not found.
    """
    body_tag = soup.select_one(".m-read .txt")
    if not body_tag:
        # Nothing to normalize; hand back whatever the selector produced.
        return body_tag

    normalized_body = self.normalize_text(str(body_tag))
    normalized_soup = BeautifulSoup(normalized_body, "html.parser")

    # One pattern is used both to locate the style element and to read the
    # index. ``search`` is used (not ``match`` with ``.+`` padding) so the
    # rule is found in multi-line CSS and at the very start/end of the text.
    promo_rule = re.compile(r"p:nth-last-child\((\d+)\)")
    promo_style = soup.find("style", string=promo_rule)
    if promo_style is None:
        return normalized_soup

    found = promo_rule.search(str(promo_style.string or ""))
    if found is None:
        return normalized_soup

    idx = int(found.group(1))
    paragraphs = normalized_soup.find_all("p")
    # Guard the negative index: 0 would address the *first* paragraph
    # ([-0] == [0]) and an index beyond the list would raise IndexError.
    # NOTE(review): this approximates the CSS rule by counting only <p>
    # tags; confirm the body never mixes other children in between.
    if 1 <= idx <= len(paragraphs):
        paragraphs[-idx].decompose()

    return normalized_soup

0 comments on commit bbf31f7

Please sign in to comment.