From 58929c12ab990d34f2afb4029ed7e0f2ee5e3b03 Mon Sep 17 00:00:00 2001 From: Jan Polonsky Date: Wed, 22 Nov 2023 18:22:14 +0100 Subject: [PATCH] Support latest Selenium version (changed methods, deprecated functions, new syntax) Based on Update collectors #185. !!! NEED this commit to get working !!! --- src/collectors/collectors/atom_collector.py | 13 ++-- src/collectors/collectors/web_collector.py | 76 ++++++++++----------- 2 files changed, 46 insertions(+), 43 deletions(-) diff --git a/src/collectors/collectors/atom_collector.py b/src/collectors/collectors/atom_collector.py index d2b72910b..3ed9df57a 100644 --- a/src/collectors/collectors/atom_collector.py +++ b/src/collectors/collectors/atom_collector.py @@ -1,12 +1,14 @@ import datetime import hashlib import uuid +import traceback import feedparser import requests from bs4 import BeautifulSoup from dateutil.parser import parse from .base_collector import BaseCollector +from managers import log_manager from shared.schema.news_item import NewsItemData from shared.schema.parameter import Parameter, ParameterType @@ -29,6 +31,7 @@ def collect(self, source): feed_url = source.parameter_values['ATOM_FEED_URL'] user_agent = source.parameter_values['USER_AGENT'] interval = source.parameter_values['REFRESH_INTERVAL'] + log_manager.log_collector_activity("atom", source.name, "Starting collector for url: {}".format(feed_url)) proxies = {} if 'PROXY_SERVER' in source.parameter_values: @@ -49,16 +52,14 @@ def collect(self, source): news_items = [] + limit = BaseCollector.history(interval) for feed_entry in feed['entries']: - - limit = BaseCollector.history(interval) published = feed_entry['updated'] published = parse(published, tzinfos=BaseCollector.timezone_info()) - if str(published) > str(limit): link_for_article = feed_entry['link'] - + log_manager.log_collector_activity("atom", source.name, "Processing entry [{}]".format(link_for_article)) if proxies: page = requests.get(link_for_article, headers={'User-Agent': 
user_agent}, proxies=proxies) else: @@ -84,4 +85,8 @@ def collect(self, source): BaseCollector.publish(news_items, source) except Exception as error: + log_manager.log_collector_activity("atom", source.name, "ATOM collection exceptionally failed") BaseCollector.print_exception(source, error) + log_manager.log_debug(traceback.format_exc()) + + log_manager.log_debug("{} collection finished.".format(self.type)) diff --git a/src/collectors/collectors/web_collector.py b/src/collectors/collectors/web_collector.py index fc9c9f172..7c4a7a33e 100644 --- a/src/collectors/collectors/web_collector.py +++ b/src/collectors/collectors/web_collector.py @@ -12,7 +12,9 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options as ChromeOptions +from selenium.webdriver.chrome.service import Service as ChromeService from selenium.webdriver.firefox.options import Options as FirefoxOptions +from selenium.webdriver.firefox.service import Service as FirefoxService from urllib.parse import urlparse import os import dateparser @@ -137,17 +139,17 @@ def __find_element_by(driver, element_selector): element = None if prefix == 'id': - element = driver.find_element_by_id(selector) + element = driver.find_element(By.ID, selector) if prefix == 'name': - element = driver.find_element_by_name(selector) + element = driver.find_element(By.NAME, selector) elif prefix == 'xpath': - element = driver.find_element_by_xpath(selector) + element = driver.find_element(By.XPATH, selector) elif prefix in [ 'tag_name', 'tag' ]: - element = driver.find_element_by_tag_name(selector) + element = driver.find_element(By.TAG_NAME, selector) elif prefix in [ 'class_name', 'class' ]: - element = driver.find_element_by_class_name(selector) + element = driver.find_element(By.CLASS_NAME, selector) elif prefix in [ 'css_selector', 'css' ]: - element = driver.find_element_by_css_selector(selector) + element = 
driver.find_element(By.CSS_SELECTOR, selector) return element @@ -177,17 +179,17 @@ def __find_elements_by(driver, element_selector): elements = None if prefix == 'id': - elements = [ driver.find_element_by_id(selector) ] + elements = [ driver.find_element(By.ID, selector) ] if prefix == 'name': - elements = driver.find_elements_by_name(selector) + elements = driver.find_elements(By.NAME, selector) elif prefix == 'xpath': - elements = driver.find_elements_by_xpath(selector) + elements = driver.find_elements(By.XPATH, selector) elif prefix in [ 'tag_name', 'tag' ]: - elements = driver.find_elements_by_tag_name(selector) + elements = driver.find_elements(By.TAG_NAME, selector) elif prefix in [ 'class_name', 'class' ]: - elements = driver.find_elements_by_class_name(selector) + elements = driver.find_elements(By.CLASS_NAME, selector) elif prefix in [ 'css_selector', 'css' ]: - elements = driver.find_elements_by_css_selector(selector) + elements = driver.find_elements(By.CSS_SELECTOR, selector) return elements @staticmethod @@ -218,7 +220,6 @@ def __wait_for_new_tab(browser, timeout, current_tab): browser.switch_to.window(tab) return - def __close_other_tabs(self, browser, handle_to_keep, fallback_url): try: handles_to_close = copy.copy(browser.window_handles) @@ -369,12 +370,12 @@ def __get_headless_driver_chrome(self): chrome_options.add_argument("--headless") chrome_options.add_argument('--ignore-certificate-errors') chrome_options.add_argument('--incognito') + chrome_service = ChromeService(executable_path=chrome_driver_executable) if self.user_agent: chrome_options.add_argument('user-agent=' + self.user_agent) if self.tor_service.lower() == 'yes': socks_proxy = "socks5://127.0.0.1:9050" chrome_options.add_argument('--proxy-server={}'.format(socks_proxy)) - driver = webdriver.Chrome(executable_path=chrome_driver_executable, options=chrome_options) elif self.proxy: webdriver.DesiredCapabilities.CHROME['proxy'] = { "proxyType": "MANUAL", @@ -382,10 +383,8 @@ def 
__get_headless_driver_chrome(self): "ftpProxy": self.proxy, "sslProxy": self.proxy } - driver = webdriver.Chrome(executable_path=chrome_driver_executable, options=chrome_options) - else: - driver = webdriver.Chrome(executable_path=chrome_driver_executable, options=chrome_options) - + + driver = webdriver.Chrome(service=chrome_service, options=chrome_options) log_manager.log_debug('Chrome driver initialized.') return driver @@ -407,30 +406,26 @@ def __get_headless_driver_firefox(self): if self.user_agent: firefox_options.add_argument('user-agent=' + self.user_agent) - profile = webdriver.FirefoxProfile() - firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX - firefox_capabilities['marionette'] = True - if self.tor_service.lower() == 'yes': - profile.set_preference('network.proxy.type', 1) # manual proxy config - profile.set_preference('network.proxy.socks', '127.0.0.1') - profile.set_preference('network.proxy.socks_port', 9050) - profile.set_preference('network.proxy.no_proxies_on', f'localhost, ::1, 127.0.0.1, {core_url_host}, 127.0.0.0/8'); + firefox_options.set_preference('network.proxy.type', 1) # manual proxy config + firefox_options.set_preference('network.proxy.socks', '127.0.0.1') + firefox_options.set_preference('network.proxy.socks_port', 9050) + firefox_options.set_preference('network.proxy.no_proxies_on', f'localhost, ::1, 127.0.0.1, {core_url_host}, 127.0.0.0/8'); elif self.proxy: - profile.set_preference('network.proxy.type', 1) # manual proxy config - profile.set_preference('network.proxy.http', self.proxy_host) - profile.set_preference('network.proxy.http_port', int(self.proxy_port)) - profile.set_preference('network.proxy.ssl', self.proxy_host) - profile.set_preference('network.proxy.ssl_port', int(self.proxy_port)) - profile.set_preference('network.proxy.ftp', self.proxy) - profile.set_preference('network.proxy.ftp_port', int(self.proxy_port)) - profile.set_preference('network.proxy.no_proxies_on', f'localhost, ::1, 127.0.0.1, 
{core_url_host}, 127.0.0.0/8'); + firefox_options.set_preference('network.proxy.type', 1) # manual proxy config + firefox_options.set_preference('network.proxy.http', self.proxy_host) + firefox_options.set_preference('network.proxy.http_port', int(self.proxy_port)) + firefox_options.set_preference('network.proxy.ssl', self.proxy_host) + firefox_options.set_preference('network.proxy.ssl_port', int(self.proxy_port)) + firefox_options.set_preference('network.proxy.ftp', self.proxy) + firefox_options.set_preference('network.proxy.ftp_port', int(self.proxy_port)) + firefox_options.set_preference('network.proxy.no_proxies_on', f'localhost, ::1, 127.0.0.1, {core_url_host}, 127.0.0.0/8'); else: - profile.set_preference('network.proxy.type', 0) # no proxy + firefox_options.set_preference('network.proxy.type', 0) # no proxy - profile.update_preferences() - driver = webdriver.Firefox(profile, executable_path=firefox_driver_executable, options=firefox_options, capabilities=firefox_capabilities) + firefox_service = FirefoxService(executable_path=firefox_driver_executable) + driver = webdriver.Firefox(service=firefox_service, options=firefox_options) log_manager.log_debug('Firefox driver initialized.') return driver @@ -518,8 +513,11 @@ def __browse_title_page(self, index_url): popup = WebDriverWait(browser, 10).until(EC.presence_of_element_located(self.__get_element_locator(self.selectors['popup_close']))) except Exception as ex: log_manager.log_collector_activity('web', self.source.name, 'Popup find error: ' + traceback.format_exc()) - if popup: - popup.click() + try: + if popup: + popup.click() + except Exception as ex: + log_manager.log_collector_activity('web', self.source.name, 'Popup click error: ' + traceback.format_exc()) # if there is a "load more" selector, click on it! page = 1