diff --git a/.gitignore b/.gitignore index 14d49f48..0e957105 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ *.diff .*.sw* /brozzler.egg-info/ +venv diff --git a/brozzler/browser.py b/brozzler/browser.py index 0cda56ee..ad4b7ec1 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -288,20 +288,23 @@ class Browser: ''' logger = logging.getLogger(__module__ + '.' + __qualname__) - def __init__(self, **kwargs): + def __init__(self, chrome_exe, browserless_port=3000, **kwargs): ''' Initializes the Browser. Args: **kwargs: arguments for Chrome(...) ''' - self.chrome = Chrome(**kwargs) self.websock_url = None self.websock = None self.websock_thread = None self.is_browsing = False - self._command_id = Counter() self._wait_interval = 0.5 + self.browse_port = browserless_port # Browserless service port; default 3000 mirrors the CLI option + self.is_browserless = chrome_exe == 'browserless' # sentinel chrome_exe value selects the Browserless backend + self.chrome = Chrome(chrome_exe=chrome_exe, browserless_port=browserless_port, + is_browserless=self.is_browserless, **kwargs) + self._command_id = Counter() def __enter__(self): self.start() @@ -343,6 +346,14 @@ def start(self, **kwargs): **kwargs: arguments for self.chrome.start(...) ''' if not self.is_running(): + + if self.is_browserless: + # Open a ws to create a browser on demand + args = self.chrome._browserless_args() + self.browserless_ws = websocket.create_connection( + "ws://localhost:" + str(self.browse_port) + "?" + args + ) + self.websock_url = self.chrome.start(**kwargs) self.websock = websocket.WebSocketApp(self.websock_url) self.websock_thread = WebsockReceiverThread( diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 18c82c93..93a898e9 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -30,6 +30,7 @@ import json import tempfile import sys +import functools def check_version(chrome_exe): ''' @@ -62,7 +63,8 @@ def check_version(chrome_exe): class Chrome: logger = logging.getLogger(__module__ + '.' 
+ __qualname__) - def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False): + def __init__(self, chrome_exe, browserless_port=None, is_browserless=False, # defaults keep Chrome(chrome_exe) callers working + port=9222, ignore_cert_errors=False): ''' Initializes instance of this class. @@ -74,17 +76,27 @@ def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False): ignore_cert_errors: configure chrome to accept all certs (default False) ''' - if chrome_exe == 'browserless': - # init browserless here maybe - pass + + self.is_browserless = is_browserless + self.browserless_port = browserless_port + + if self.is_browserless: + # browserless isn't attached to a PID + self.chrome_exe = None + self.port = None + else: # use a local browser self.port = port self.chrome_exe = chrome_exe - self.ignore_cert_errors = ignore_cert_errors - self._shutdown = threading.Event() - self.chrome_process = None - + + self.ignore_cert_errors = ignore_cert_errors + self._shutdown = threading.Event() + self._home_tmpdir = tempfile.TemporaryDirectory() + self._chrome_user_data_dir = os.path.join( + self._home_tmpdir.name, 'chrome-user-data') + self.chrome_process = None + def __enter__(self): ''' Returns websocket url to chrome window with about:blank loaded. 
@@ -139,8 +151,41 @@ def persist_and_read_cookie_db(self): cookie_location, exc_info=True) return cookie_db - def start(self, proxy=None, cookie_db=None, disk_cache_dir=None, - disk_cache_size=None, websocket_timeout=60): + def _chrome_args(self, disk_cache_dir=None, disk_cache_size=None, + proxy=None): + chrome_args = [ + self.chrome_exe, + '--remote-debugging-port=%s' % (self.port or self.browserless_port), # parenthesized: % binds tighter than 'or', so the fallback was dead code + '--use-mock-keychain', # mac thing + '--user-data-dir=%s' % self._chrome_user_data_dir, + '--disable-background-networking', '--disable-breakpad', + '--disable-renderer-backgrounding', '--disable-hang-monitor', + '--disable-background-timer-throttling', '--mute-audio', + '--disable-web-sockets', + '--window-size=1100,900', '--no-default-browser-check', + '--disable-first-run-ui', '--no-first-run', + '--homepage=about:blank', '--disable-direct-npapi-requests', + '--disable-web-security', '--disable-notifications', + '--disable-extensions', '--disable-save-password-bubble', + '--disable-sync'] + + extra_chrome_args = os.environ.get('BROZZLER_EXTRA_CHROME_ARGS') + if extra_chrome_args: + chrome_args.extend(extra_chrome_args.split()) + if disk_cache_dir: + chrome_args.append('--disk-cache-dir=%s' % disk_cache_dir) + if disk_cache_size: + chrome_args.append('--disk-cache-size=%s' % disk_cache_size) + if self.ignore_cert_errors: + chrome_args.append('--ignore-certificate-errors') + if proxy: + chrome_args.append('--proxy-server=%s' % proxy) + chrome_args.append('about:blank') + + return chrome_args + + def start(self, proxy=None, cookie_db=None, + disk_cache_dir=None, disk_cache_size=None, websocket_timeout=60): ''' Starts chrome/chromium process. 
@@ -158,44 +203,20 @@ def start(self, proxy=None, cookie_db=None, disk_cache_dir=None, Returns: websocket url to chrome window with about:blank loaded ''' + # these can raise exceptions - self._home_tmpdir = tempfile.TemporaryDirectory() - self._chrome_user_data_dir = os.path.join( - self._home_tmpdir.name, 'chrome-user-data') if cookie_db: self._init_cookie_db(cookie_db) self._shutdown.clear() new_env = os.environ.copy() new_env['HOME'] = self._home_tmpdir.name - chrome_args = [ - self.chrome_exe, - '--remote-debugging-port=%s' % self.port, - '--use-mock-keychain', # mac thing - '--user-data-dir=%s' % self._chrome_user_data_dir, - '--disable-background-networking', '--disable-breakpad', - '--disable-renderer-backgrounding', '--disable-hang-monitor', - '--disable-background-timer-throttling', '--mute-audio', - '--disable-web-sockets', - '--window-size=1100,900', '--no-default-browser-check', - '--disable-first-run-ui', '--no-first-run', - '--homepage=about:blank', '--disable-direct-npapi-requests', - '--disable-web-security', '--disable-notifications', - '--disable-extensions', '--disable-save-password-bubble', - '--disable-sync'] + chrome_args = self._chrome_args(disk_cache_dir=disk_cache_dir, disk_cache_size=disk_cache_size, + proxy=proxy) + + if self.is_browserless: + return self.start_browserless() - extra_chrome_args = os.environ.get('BROZZLER_EXTRA_CHROME_ARGS') - if extra_chrome_args: - chrome_args.extend(extra_chrome_args.split()) - if disk_cache_dir: - chrome_args.append('--disk-cache-dir=%s' % disk_cache_dir) - if disk_cache_size: - chrome_args.append('--disk-cache-size=%s' % disk_cache_size) - if self.ignore_cert_errors: - chrome_args.append('--ignore-certificate-errors') - if proxy: - chrome_args.append('--proxy-server=%s' % proxy) - chrome_args.append('about:blank') self.logger.info('running: %r', subprocess.list2cmdline(chrome_args)) # start_new_session - new process group so we can kill the whole group self.chrome_process = subprocess.Popen( @@ 
-209,7 +230,25 @@ def start(self, proxy=None, cookie_db=None, disk_cache_dir=None, return self._websocket_url(timeout_sec=websocket_timeout) - def _websocket_url(self, timeout_sec = 60): + def _browserless_args(self): + chrome_args = self._chrome_args() + chrome_args.pop(0) # drop the chrome_exe entry + chrome_args.pop(0) # drop the --remote-debugging-port entry + return "&".join(chrome_args) # remaining flags become the ws query string + + def start_browserless(self): + json_url = "http://localhost:" + str(self.browserless_port) + "/sessions" + + brwlss_json_raw = urllib.request.urlopen(json_url, timeout=30).read() + brwlss_json = json.loads(brwlss_json_raw) + wsURL = brwlss_json[0]['webSocketDebuggerUrl'] + + self.logger.info('got chrome websocket debug url %s from Browserless at %s', wsURL, json_url) + self.port = brwlss_json[0]['port'] + + return wsURL + + def _websocket_url(self, timeout_sec=60): json_url = 'http://localhost:%s/json' % self.port # make this a member variable so that kill -QUIT reports it self._start = time.time() @@ -261,7 +300,7 @@ def readline_nonblock(f): buf = b'' try: while not self._shutdown.is_set() and ( - len(buf) == 0 or buf[-1] != 0xa) and select.select( + len(buf) == 0 or buf[-1] != 0xa) and select.select( [f],[],[],0.5)[0]: buf += f.read(1) except (ValueError, OSError): @@ -282,8 +321,8 @@ def readline_nonblock(f): buf = readline_nonblock(self.chrome_process.stderr) if buf: self.logger.trace( - 'chrome pid %s STDERR %s', - self.chrome_process.pid, buf) + 'chrome pid %s STDERR %s', + self.chrome_process.pid, buf) except: self.logger.error('unexpected exception', exc_info=True) diff --git a/brozzler/cli.py b/brozzler/cli.py index 98ba0b91..231553cc 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -174,6 +174,9 @@ def brozzle_page(argv=None): '--skip-browserless', dest='skip_browserless', action='store_true') arg_parser.add_argument( '--simpler404', dest='simpler404', action='store_true') + arg_parser.add_argument( + '--browserless-port', dest='browserless_port', default='3000', + help='port on which the 
browserless instance is') add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -210,7 +213,8 @@ def on_screenshot(screenshot_jpeg): f.write(screenshot_jpeg) logging.info('wrote screenshot to %s', filename) - browser = brozzler.Browser(chrome_exe=args.chrome_exe) + browser = brozzler.Browser(chrome_exe=args.chrome_exe, + browserless_port=args.browserless_port) try: browser.start(proxy=args.proxy) outlinks = worker.brozzle_page(