Merge pull request #185 from multiflexi/update_collectors
Update collectors
milankowww authored Nov 29, 2023
2 parents 23df5fa + 5773134 commit 430e19c
Showing 4 changed files with 106 additions and 81 deletions.
4 changes: 2 additions & 2 deletions docker/Dockerfile.collectors
@@ -1,4 +1,4 @@
FROM python:3.7-alpine3.14 AS build_shared
FROM python:3.12-alpine3.18 AS build_shared

WORKDIR /build_shared/

@@ -8,7 +8,7 @@ RUN python -m build



FROM python:3.7-alpine3.14 AS production
FROM python:3.12-alpine3.18 AS production

WORKDIR /app/

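The base image bump from python:3.7-alpine3.14 to python:3.12-alpine3.18 means the collectors now run on a Python-3.12-only interpreter, where Python 2 leftovers such as dict.has_key() no longer exist. That is why the Python changes below replace has_key() with plain membership tests. A minimal sketch of the difference (feed_entry here is an illustrative dict, not the collector's real feed object):

# dict.has_key() was removed in Python 3; use the "in" operator instead.
feed_entry = {"title": "Example item", "link": "https://example.com/item"}

# Old, Python 2-only style -- raises AttributeError on Python 3.12:
# if not feed_entry.has_key("author"):
#     feed_entry["author"] = ""

# Style used in the updated collectors:
for key in ["author", "published", "title", "description", "link"]:
    if key not in feed_entry:
        feed_entry[key] = ""
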
115 changes: 69 additions & 46 deletions src/collectors/collectors/rss_collector.py
@@ -1,3 +1,4 @@
"""RSS collector module."""
import datetime
import hashlib
import uuid
@@ -17,59 +18,72 @@


class RSSCollector(BaseCollector):
"""RSS collector class.
Arguments:
BaseCollector -- Base collector class.
"""

type = "RSS_COLLECTOR"
name = "RSS Collector"
description = "Collector for gathering data from RSS feeds"

parameters = [
Parameter(0, "FEED_URL", "Feed URL", "Full url for RSS feed", ParameterType.STRING),
Parameter(0, "USER_AGENT", "User agent", "Type of user agent", ParameterType.STRING)
Parameter(0, "USER_AGENT", "User agent", "Type of user agent", ParameterType.STRING),
]

parameters.extend(BaseCollector.parameters)

news_items = []

def collect(self, source):
"""Collect data from RSS feed.
feed_url = source.parameter_values['FEED_URL']
interval = source.parameter_values['REFRESH_INTERVAL']
Arguments:
source -- Source object.
"""
feed_url = source.parameter_values["FEED_URL"]
# interval = source.parameter_values["REFRESH_INTERVAL"]

log_manager.log_collector_activity('rss', source.name, 'Starting collector for url: {}'.format(feed_url))
log_manager.log_collector_activity("rss", source.name, "Starting collector for url: {}".format(feed_url))

user_agent = source.parameter_values['USER_AGENT']
user_agent = source.parameter_values["USER_AGENT"]
if user_agent:
feedparser.USER_AGENT = user_agent
user_agent_headers = {'User-Agent': user_agent}
# user_agent_headers = {"User-Agent": user_agent}
else:
user_agent_headers = { }
# user_agent_headers = {}
pass

# use system proxy
proxy_handler = None
opener = urllib.request.urlopen

if 'PROXY_SERVER' in source.parameter_values:
proxy_server = source.parameter_values['PROXY_SERVER']
if "PROXY_SERVER" in source.parameter_values:
proxy_server = source.parameter_values["PROXY_SERVER"]

# disable proxy - do not use system proxy
if proxy_server == 'none':
if proxy_server == "none":
proxy_handler = urllib.request.ProxyHandler({})
else:
proxy = re.search(r"^(http|https|socks4|socks5)://([a-zA-Z0-9\-\.\_]+):(\d+)/?$", proxy_server)
if proxy:
scheme, host, port = proxy.groups()
# classic HTTP/HTTPS proxy
if scheme in ['http', 'https']:
proxy_handler = urllib.request.ProxyHandler({
'http': '{}://{}:{}'.format(scheme, host, port),
'https': '{}://{}:{}'.format(scheme, host, port),
'ftp': '{}://{}:{}'.format(scheme, host, port)
})
if scheme in ["http", "https"]:
proxy_handler = urllib.request.ProxyHandler(
{
"http": "{}://{}:{}".format(scheme, host, port),
"https": "{}://{}:{}".format(scheme, host, port),
"ftp": "{}://{}:{}".format(scheme, host, port),
}
)
# socks4 proxy
elif scheme == 'socks4':
elif scheme == "socks4":
proxy_handler = SocksiPyHandler(socks.SOCKS4, host, int(port))
# socks5 proxy
elif scheme == 'socks5':
elif scheme == "socks5":
proxy_handler = SocksiPyHandler(socks.SOCKS5, host, int(port))

# use proxy in urllib
Expand All @@ -78,63 +92,72 @@ def collect(self, source):

try:
if proxy_handler:
feed = feedparser.parse(feed_url, handlers = [proxy_handler])
feed = feedparser.parse(feed_url, handlers=[proxy_handler])
else:
feed = feedparser.parse(feed_url)

log_manager.log_collector_activity('rss', source.name, 'RSS returned feed with {} entries'.format(len(feed['entries'])))
log_manager.log_collector_activity("rss", source.name, "RSS returned feed with {} entries".format(len(feed["entries"])))

news_items = []

for feed_entry in feed['entries']:

for key in ['author', 'published', 'title', 'description', 'link']:
if not feed_entry.has_key(key):
feed_entry[key] = ''
for feed_entry in feed["entries"]:
for key in ["author", "published", "title", "description", "link"]:
if key not in feed_entry.keys():
feed_entry[key] = ""

limit = BaseCollector.history(interval)
published = feed_entry['published']
published = dateparser.parse(published, settings={'DATE_ORDER': 'DMY'})
# limit = BaseCollector.history(interval)
published = feed_entry["published"]
published = dateparser.parse(published, settings={"DATE_ORDER": "DMY"})

# if published > limit: TODO: uncomment after testing, we need some initial data now
link_for_article = feed_entry['link']
link_for_article = feed_entry["link"]
if not link_for_article:
log_manager.log_collector_activity("rss", source.name, "Skipping (empty link)")
continue

log_manager.log_collector_activity('rss', source.name, 'Processing entry [{}]'.format(link_for_article))
log_manager.log_collector_activity("rss", source.name, "Processing entry [{}]".format(link_for_article))

html_content = ''
html_content = ""
request = urllib.request.Request(link_for_article)
request.add_header('User-Agent', user_agent)
request.add_header("User-Agent", user_agent)

with opener(request) as response:
html_content = response.read()

soup = BeautifulSoup(html_content, features='html.parser')
soup = BeautifulSoup(html_content, features="html.parser")

content = ''
content = ""

if html_content:
content_text = [p.text.strip() for p in soup.findAll('p')]
replaced_str = '\xa0'
content_text = [p.text.strip() for p in soup.findAll("p")]
replaced_str = "\xa0"
if replaced_str:
content = [w.replace(replaced_str, ' ') for w in content_text]
content = ' '.join(content)

for_hash = feed_entry['author'] + feed_entry['title'] + feed_entry['link']

news_item = NewsItemData(uuid.uuid4(), hashlib.sha256(for_hash.encode()).hexdigest(),
feed_entry['title'], feed_entry['description'], feed_url, feed_entry['link'],
feed_entry['published'], feed_entry['author'], datetime.datetime.now(),
content, source.id, [])
content = [w.replace(replaced_str, " ") for w in content_text]
content = " ".join(content)

for_hash = feed_entry["author"] + feed_entry["title"] + feed_entry["link"]

news_item = NewsItemData(
uuid.uuid4(),
hashlib.sha256(for_hash.encode()).hexdigest(),
feed_entry["title"],
feed_entry["description"],
feed_url,
feed_entry["link"],
feed_entry["published"],
feed_entry["author"],
datetime.datetime.now(),
content,
source.id,
[],
)

news_items.append(news_item)

BaseCollector.publish(news_items, source)

except Exception as error:
log_manager.log_collector_activity('rss', source.name, 'RSS collection exceptionally failed')
log_manager.log_collector_activity("rss", source.name, "RSS collection exceptionally failed")
BaseCollector.print_exception(source, error)
log_manager.log_debug(traceback.format_exc())

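For context on the proxy branch in the hunk above: depending on PROXY_SERVER, the collector builds either a urllib ProxyHandler (http/https) or a SOCKS handler, and feedparser.parse() accepts such handlers through its handlers=[...] argument. A self-contained sketch of that flow, with placeholder feed and proxy addresses; the SocksiPyHandler import assumes PySocks's sockshandler module, which may differ from the collector's actual import:

import re
import urllib.request

import feedparser
import socks
from sockshandler import SocksiPyHandler  # ships with PySocks (assumed import path)

feed_url = "https://example.com/feed.xml"   # placeholder FEED_URL
proxy_server = "socks5://127.0.0.1:9050"    # placeholder PROXY_SERVER

proxy_handler = None
if proxy_server == "none":
    # Explicitly disable any system-wide proxy.
    proxy_handler = urllib.request.ProxyHandler({})
else:
    proxy = re.search(r"^(http|https|socks4|socks5)://([a-zA-Z0-9\-\._]+):(\d+)/?$", proxy_server)
    if proxy:
        scheme, host, port = proxy.groups()
        if scheme in ("http", "https"):
            # Classic HTTP/HTTPS proxy, used for both schemes.
            url = "{}://{}:{}".format(scheme, host, port)
            proxy_handler = urllib.request.ProxyHandler({"http": url, "https": url})
        elif scheme == "socks4":
            proxy_handler = SocksiPyHandler(socks.SOCKS4, host, int(port))
        elif scheme == "socks5":
            proxy_handler = SocksiPyHandler(socks.SOCKS5, host, int(port))

# feedparser builds its own urllib opener from the handlers it is given.
feed = feedparser.parse(feed_url, handlers=[proxy_handler] if proxy_handler else [])
print("entries:", len(feed["entries"]))
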
23 changes: 18 additions & 5 deletions src/collectors/managers/auth_manager.py
@@ -1,11 +1,16 @@
"""Authorization manager for the API.
Returns:
wrapper: Wrapper function for the API endpoints.
"""
from functools import wraps
from flask import request
import os
import ssl

api_key = os.getenv('API_KEY')
api_key = os.getenv("API_KEY")

if os.getenv('SSL_VERIFICATION') == "False":
if os.getenv("SSL_VERIFICATION") == "False":
try:
_create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
Expand All @@ -15,11 +20,19 @@


def api_key_required(fn):
"""Check for API key in the request header.
Arguments:
fn -- The function to be decorated.
Returns:
wrapper: Wrapper function for the API endpoints.
"""

@wraps(fn)
def wrapper(*args, **kwargs):

if not request.headers.has_key('Authorization') or request.headers['Authorization'] != ('Bearer ' + api_key):
return {'error': 'not authorized'}, 401
if "Authorization" not in request.headers.keys() or request.headers["Authorization"] != ("Bearer " + api_key):
return {"error": "not authorized"}, 401
else:
return fn(*args, **kwargs)

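The api_key_required decorator reads API_KEY from the environment at import time and rejects any request whose Authorization header is not exactly "Bearer <API_KEY>". A minimal usage sketch with Flask-RESTful (the web stack pinned in requirements.txt below); the Ping resource and the auth_manager import path are illustrative assumptions, not part of this PR:

from flask import Flask
from flask_restful import Api, Resource

from managers.auth_manager import api_key_required  # assumed import path


class Ping(Resource):
    # Each verb that should be protected gets the decorator.
    @api_key_required
    def get(self):
        return {"status": "ok"}


app = Flask(__name__)
api = Api(app)
api.add_resource(Ping, "/api/v1/ping")

# API_KEY must be set in the environment before auth_manager is imported,
# and clients authenticate with the header:  Authorization: Bearer <API_KEY>
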
45 changes: 17 additions & 28 deletions src/collectors/requirements.txt
@@ -1,30 +1,19 @@
beautifulsoup4==4.8.1
bleach==4.1.0
certifi==2021.10.8
feedparser==5.2.1
Flask==1.1.4
Flask-Cors==3.0.10
Flask-RESTful==0.3.7
gevent==21.8.0
greenlet==1.1.1
gunicorn==20.0.4
lxml==4.6.5
marshmallow==3.18.0
beautifulsoup4==4.12.2
bleach==6.1.0
dateparser==1.2.0
feedparser==6.0.10
Flask==3.0.0
Flask-Cors==4.0.0
Flask-RESTful==0.3.10
gevent==23.9.1
gunicorn==21.2.0
marshmallow==3.20.1
marshmallow-enum==1.5.1
Jinja2==2.11.3
MarkupSafe==1.1.0
pyslack==0.5.0
PySocks==1.7.1
python-dateutil==2.8.1
python-dotenv==0.10.5
pytz==2019.3
requests==2.26.0
schedule==0.6.0
selenium==4.0.0
six==1.14.0
slackclient==1.0.7
soupsieve==1.9.5
tweepy==3.8.0
Werkzeug==0.16.0
zipp==3.1.0
dateparser==1.1.1
python-dateutil==2.8.2
python-dotenv==1.0.0
requests==2.31.0
schedule==1.2.1
selenium==4.15.2
slackclient==1.3.2
tweepy==4.14.0
