Skip to content

Commit

Permalink
Merge pull request #360 from multiflexi/rss_tweaks
Browse files: browse the repository at this point in the history
improve RSS collector
  • Loading branch information
Progress1 authored Aug 30, 2024
2 parents 83e3f49 + f9ebc82 commit d00c55d
Showing 1 changed file with 20 additions and 17 deletions.
37 changes: 20 additions & 17 deletions src/collectors/collectors/rss_collector.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -138,22 +138,25 @@ def strip_html_tags(html_string):
"rss", source.name, f"Visiting article {count}/{len(feed['entries'])}: {link_for_article}"
)
html_article = ""
request = urllib.request.Request(link_for_article)
request.add_header("User-Agent", user_agent)

with opener(request) as response:
html_article = response.read()

soup = BeautifulSoup(html_article, features="html.parser")

if html_article:
article_text = [p.text.strip() for p in soup.findAll("p")]
replaced_str = "\xa0"
article_sanit = [w.replace(replaced_str, " ") for w in article_text]
article_sanit = " ".join(article_sanit)
# use HTML article if it is longer than summary
if len(article_sanit) > len(summary):
article = article_sanit
try:
request = urllib.request.Request(link_for_article)
request.add_header("User-Agent", user_agent)

with opener(request) as response:
html_article = response.read()

soup = BeautifulSoup(html_article, features="html.parser")

if html_article:
article_text = [p.text.strip() for p in soup.findAll("p")]
replaced_str = "\xa0"
article_sanit = [w.replace(replaced_str, " ") for w in article_text]
article_sanit = " ".join(article_sanit)
# use HTML article if it is longer than summary
if len(article_sanit) > len(summary):
article = article_sanit
except Exception as error:
log_manager.log_collector_activity("rss", source.name, f"Failed to fetch article - {error}")

# use summary if article is empty
if summary and not article:
Expand Down Expand Up @@ -202,4 +205,4 @@ def strip_html_tags(html_string):
BaseCollector.print_exception(source, error)
log_manager.log_debug(traceback.format_exc())

log_manager.log_debug("{} collection finished.".format(self.type))
log_manager.log_debug(f"{self.type} collection finished.")

0 comments on commit d00c55d

Please sign in to comment.