def brozzler_new_page(argv=None):
    """
    Command line utility entry point for queuing a new brozzler page.

    Takes a url, site_id, and parent_page_id, looks up the site and the
    parent page in rethinkdb, and hands the url to the frontier as an outlink
    of the parent page so that brozzler-workers can pick it up and crawl it.

    Args:
        argv: full argument vector, including the program name at index 0;
            falls back to ``sys.argv`` when None or empty.

    Raises:
        Exception: if no site exists with the given SITE_ID, or no page
            exists with the given PARENT_PAGE_ID.
    """
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description="brozzler-new-page - queue url to brozzler site",
        formatter_class=BetterArgumentDefaultsHelpFormatter,
    )
    arg_parser.add_argument("url", metavar="URL", help="URL to add to site")
    arg_parser.add_argument(
        "site_id", metavar="SITE_ID", help="UUID of site object to add the page to"
    )
    arg_parser.add_argument(
        "parent_page_id",
        metavar="PARENT_PAGE_ID",
        help="ID of Page object to add the page as an outlink to",
    )
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)
    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)

    # A missing primary key yields None; fail with a message that names the
    # id so the operator can tell which of the two lookups went wrong.
    site_result = rr.table("sites").get(args.site_id).run()
    if site_result is None:
        raise Exception("no site found with id %r" % args.site_id)
    site = brozzler.Site(rr, site_result)

    parent_page_result = rr.table("pages").get(args.parent_page_id).run()
    if parent_page_result is None:
        raise Exception("no page found with id %r" % args.parent_page_id)
    parent_page = brozzler.Page(rr, parent_page_result)

    # NOTE(review): judging by its name, scope_and_schedule_outlinks applies
    # the site's scope rules, so an out-of-scope url may not be queued --
    # confirm against RethinkDbFrontier.
    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.scope_and_schedule_outlinks(
        site=site, parent_page=parent_page, outlinks=[args.url]
    )
description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", @@ -51,6 +51,7 @@ def find_package_data(package): "brozzle-page=brozzler.cli:brozzle_page", "brozzler-new-job=brozzler.cli:brozzler_new_job", "brozzler-new-site=brozzler.cli:brozzler_new_site", + "brozzler-new-page=brozzler.cli:brozzler_new_page", "brozzler-worker=brozzler.cli:brozzler_worker", "brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables", "brozzler-list-captures=brozzler.cli:brozzler_list_captures",