-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Logging improvement part 2. (Collectors)
Fixed: AttributeError: 'WebDriver' object has no attribute 'dispose'. Tidied up log types. Logs that are the same for each collector (start, finish) were moved to one place.
- Loading branch information
Jan Polonsky
committed
Oct 23, 2024
1 parent
d573455
commit 5bb45ab
Showing
9 changed files
with
2,216 additions
and
2,253 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,140 +1,134 @@ | ||
"""Module for Atom collector.""" | ||
|
||
import datetime | ||
import hashlib | ||
import uuid | ||
import traceback | ||
import feedparser | ||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
from .base_collector import BaseCollector | ||
from managers.log_manager import logger | ||
from shared.schema.news_item import NewsItemData | ||
from shared.schema.parameter import Parameter, ParameterType | ||
|
||
|
||
class AtomCollector(BaseCollector):
    """Collector for gathering data from Atom feeds.

    Attributes:
        type (str): Type of the collector.
        name (str): Name of the collector.
        description (str): Description of the collector.
        parameters (list): Parameters of this collector, followed by those of BaseCollector.
        news_items (list): Class-level list; collect() builds its own local list and does not use it.
    Methods:
        collect(source): Collect data from an Atom feed.
    """

    type = "ATOM_COLLECTOR"
    name = "Atom Collector"
    description = "Collector for gathering data from Atom feeds"

    parameters = [
        Parameter(0, "ATOM_FEED_URL", "Atom feed URL", "Full url for Atom feed", ParameterType.STRING),
        Parameter(0, "USER_AGENT", "User agent", "Type of user agent", ParameterType.STRING),
        Parameter(
            0,
            "LINKS_LIMIT",
            "Limit for article links",
            "OPTIONAL: Maximum number of article links to process. Default: all",
            ParameterType.NUMBER,
        ),
    ]

    parameters.extend(BaseCollector.parameters)

    news_items = []

    @BaseCollector.ignore_exceptions
    def collect(self, source):
        """Collect data from Atom feed.

        Downloads the feed, visits the link of every entry, converts each page to plain text and
        publishes the resulting news items. Errors raised while fetching or parsing are caught and
        logged here; they are not propagated to the caller.

        Parameters:
            source -- Source object (its parameter_values, name and id are read).
        """
        self.collector_source = f"{self.name} '{source.name}':"
        BaseCollector.update_last_attempt(source)
        feed_url = source.parameter_values["ATOM_FEED_URL"]
        user_agent = source.parameter_values["USER_AGENT"]
        interval = source.parameter_values["REFRESH_INTERVAL"]  # noqa: F841
        links_limit = BaseCollector.read_int_parameter("LINKS_LIMIT", 0, source)

        logger.info(f"{self.collector_source} Starting collector for {feed_url}")

        # Map the configured proxy to the scheme it is declared for; a value without scheme is used as HTTP proxy
        proxies = {}
        if "PROXY_SERVER" in source.parameter_values:
            proxy_server = source.parameter_values["PROXY_SERVER"]
            if proxy_server.startswith("https://"):
                proxies["https"] = proxy_server
            elif proxy_server.startswith("http://"):
                proxies["http"] = proxy_server
            else:
                proxies["http"] = "http://" + proxy_server

        headers = {"User-Agent": user_agent}

        try:
            if proxies:
                # feedparser is handed the downloaded text so that the feed itself goes through the proxy
                atom_xml = requests.get(feed_url, headers=headers, proxies=proxies)
                feed = feedparser.parse(atom_xml.text)
            else:
                feed = feedparser.parse(feed_url)

            entries = feed["entries"]
            logger.info(f"{self.collector_source} Atom returned feed with {len(entries)} entries")

            news_items = []

            for count, feed_entry in enumerate(entries, start=1):
                link_for_article = feed_entry["link"]
                logger.info(f"{self.collector_source} Visiting article {count}/{len(entries)}: {link_for_article}")
                # requests treats an empty proxies dict the same as no proxies argument
                page = requests.get(link_for_article, headers=headers, proxies=proxies)

                html_content = page.text
                content = BeautifulSoup(html_content, features="html.parser").text if html_content else ""

                # summary is optional in a feed entry; a missing one must not abort the whole collection
                description = feed_entry.get("summary", "")[:500].replace("<p>", " ")

                # author can exist/miss in header/entry
                author = feed_entry["author"] if "author" in feed_entry else ""
                for_hash = author + feed_entry["title"] + feed_entry["link"]

                news_item = NewsItemData(
                    uuid.uuid4(),
                    hashlib.sha256(for_hash.encode()).hexdigest(),
                    feed_entry["title"],
                    description,
                    feed_url,
                    feed_entry["link"],
                    feed_entry["updated"],
                    author,
                    datetime.datetime.now(),
                    content,
                    source.id,
                    [],
                )

                news_items.append(news_item)

                # a limit of 0 (the default) means "no limit"
                if links_limit > 0 and count >= links_limit:
                    logger.info(f"{self.collector_source} Limit for article links reached ({links_limit})")
                    break

            BaseCollector.publish(news_items, source)

        except Exception as error:
            logger.info(f"{self.collector_source} Atom collection exceptionally failed")
            BaseCollector.print_exception(source, error)
            logger.debug(traceback.format_exc())

        logger.debug(f"{self.type} collection finished.")
"""Module for Atom collector.""" | ||
|
||
import datetime | ||
import hashlib | ||
import uuid | ||
import traceback | ||
import feedparser | ||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
from .base_collector import BaseCollector | ||
from managers.log_manager import logger | ||
from shared.schema.news_item import NewsItemData | ||
from shared.schema.parameter import Parameter, ParameterType | ||
|
||
|
||
class AtomCollector(BaseCollector):
    """Collector for gathering data from Atom feeds.

    Attributes:
        type (str): Type of the collector.
        name (str): Name of the collector.
        description (str): Description of the collector.
        parameters (list): Parameters of this collector, followed by those of BaseCollector.
        news_items (list): Class-level list; collect() builds its own local list and does not use it.
    Methods:
        collect(source): Collect data from an Atom feed.
    """

    type = "ATOM_COLLECTOR"
    name = "Atom Collector"
    description = "Collector for gathering data from Atom feeds"

    parameters = [
        Parameter(0, "ATOM_FEED_URL", "Atom feed URL", "Full url for Atom feed", ParameterType.STRING),
        Parameter(0, "USER_AGENT", "User agent", "Type of user agent", ParameterType.STRING),
        Parameter(
            0,
            "LINKS_LIMIT",
            "Limit for article links",
            "OPTIONAL: Maximum number of article links to process. Default: all",
            ParameterType.NUMBER,
        ),
    ]

    parameters.extend(BaseCollector.parameters)

    news_items = []

    @BaseCollector.ignore_exceptions
    def collect(self, source):
        """Collect data from Atom feed.

        Downloads the feed, visits the link of every entry, converts each page to plain text and
        publishes the resulting news items. Errors raised while fetching or parsing are caught and
        logged here; they are not propagated to the caller.

        NOTE(review): self.collector_source is read but never assigned in this method; presumably
        it is set by BaseCollector before collect() runs -- confirm.

        Parameters:
            source -- Source object (its parameter_values and id are read).
        """
        feed_url = source.parameter_values["ATOM_FEED_URL"]
        user_agent = source.parameter_values["USER_AGENT"]
        interval = source.parameter_values["REFRESH_INTERVAL"]  # noqa: F841
        links_limit = BaseCollector.read_int_parameter("LINKS_LIMIT", 0, source)

        logger.info(f"{self.collector_source} Requesting feed URL {feed_url}")

        # Map the configured proxy to the scheme it is declared for; a value without scheme is used as HTTP proxy
        proxies = {}
        if "PROXY_SERVER" in source.parameter_values:
            proxy_server = source.parameter_values["PROXY_SERVER"]
            if proxy_server.startswith("https://"):
                proxies["https"] = proxy_server
            elif proxy_server.startswith("http://"):
                proxies["http"] = proxy_server
            else:
                proxies["http"] = "http://" + proxy_server

        headers = {"User-Agent": user_agent}

        try:
            if proxies:
                # feedparser is handed the downloaded text so that the feed itself goes through the proxy
                atom_xml = requests.get(feed_url, headers=headers, proxies=proxies)
                feed = feedparser.parse(atom_xml.text)
            else:
                feed = feedparser.parse(feed_url)

            entries = feed["entries"]
            logger.debug(f"{self.collector_source} Atom returned feed with {len(entries)} entries")

            news_items = []

            for count, feed_entry in enumerate(entries, start=1):
                link_for_article = feed_entry["link"]
                logger.info(f"{self.collector_source} Visiting article {count}/{len(entries)}: {link_for_article}")
                # requests treats an empty proxies dict the same as no proxies argument
                page = requests.get(link_for_article, headers=headers, proxies=proxies)

                html_content = page.text
                content = BeautifulSoup(html_content, features="html.parser").text if html_content else ""

                # summary is optional in a feed entry; a missing one must not abort the whole collection
                description = feed_entry.get("summary", "")[:500].replace("<p>", " ")

                # author can exist/miss in header/entry
                author = feed_entry["author"] if "author" in feed_entry else ""
                for_hash = author + feed_entry["title"] + feed_entry["link"]

                news_item = NewsItemData(
                    uuid.uuid4(),
                    hashlib.sha256(for_hash.encode()).hexdigest(),
                    feed_entry["title"],
                    description,
                    feed_url,
                    feed_entry["link"],
                    feed_entry["updated"],
                    author,
                    datetime.datetime.now(),
                    content,
                    source.id,
                    [],
                )

                news_items.append(news_item)

                # a limit of 0 (the default) means "no limit"
                if links_limit > 0 and count >= links_limit:
                    logger.debug(f"{self.collector_source} Limit for article links reached ({links_limit})")
                    break

            BaseCollector.publish(news_items, source, self.collector_source)

        except Exception as error:
            logger.exception(f"{self.collector_source} Collection failed: {error}")
Oops, something went wrong.