From 16d2b17772e03c22e6a6dcd2328f4939622c22d3 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Tue, 24 Sep 2024 15:21:36 -0700 Subject: [PATCH 1/3] feat: Add new cli option to insert url into existing crawl. --- brozzler/cli.py | 40 +++++++++++++++++++++++++++++++++++++++- setup.py | 3 ++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index bea5153f..6b5c4536 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -437,6 +437,44 @@ def brozzler_new_site(argv=None): brozzler.new_site(frontier, site) +def brozzler_new_page(argv=None): + """ + Command line utility entry point for queuing a new brozzler page. + Takes a url and site_id and adds a page object in rethinkdb, which + brozzler-workers will look at and start crawling. + """ + argv = argv or sys.argv + arg_parser = argparse.ArgumentParser( + prog=os.path.basename(argv[0]), + description="brozzler-new-page - queue url to brozzler site", + formatter_class=BetterArgumentDefaultsHelpFormatter, + ) + arg_parser.add_argument("url", metavar="URL", help="URL to add to site") + arg_parser.add_argument("site_id", metavar="SITE_ID", + help="UUID of site object to add the page to") + arg_parser.add_argument("parent_page_id", metavar="PARENT_PAGE_ID", + help="ID of Page object to add the page as an outlink to") + add_rethinkdb_options(arg_parser) + add_common_options(arg_parser, argv) + args = arg_parser.parse_args(args=argv[1:]) + configure_logging(args) + + rr = rethinker(args) + + site_result = rr.table("sites").get(args.site_id).run() + if not site_result: + raise Exception("site not found: %s" % args.site_id) + site = brozzler.Site(rr, site_result) + + parent_page_result = rr.table("pages").get(args.parent_page_id).run() + if not parent_page_result: + raise Exception("parent page not found: %s" % args.parent_page_id) + parent_page = brozzler.Page(rr, parent_page_result) + + frontier = brozzler.RethinkDbFrontier(rr) + frontier.scope_and_schedule_outlinks(site=site, parent_page=parent_page, outlinks=[args.url]) + + def brozzler_worker(argv=None): """ 
Main entry point for brozzler, gets sites and pages to brozzle from @@ -1065,7 +1103,7 @@ def brozzler_list_captures(argv=None): reql = reql.order_by(index="abbr_canon_surt_timestamp") reql = reql.filter( lambda capture: (capture["canon_surt"] >= key) - & (capture["canon_surt"] <= end_key) + & (capture["canon_surt"] <= end_key) ) logging.debug("querying rethinkdb: %s", reql) results = reql.run() diff --git a/setup.py b/setup.py index c275b2b9..21e4087a 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.5.54", + version="1.5.55", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", @@ -51,6 +51,7 @@ def find_package_data(package): "brozzle-page=brozzler.cli:brozzle_page", "brozzler-new-job=brozzler.cli:brozzler_new_job", "brozzler-new-site=brozzler.cli:brozzler_new_site", + "brozzler-new-page=brozzler.cli:brozzler_new_page", "brozzler-worker=brozzler.cli:brozzler_worker", "brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables", "brozzler-list-captures=brozzler.cli:brozzler_list_captures", From 0eb3957335402345a108a4ae3be61627111f875c Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Tue, 24 Sep 2024 15:26:17 -0700 Subject: [PATCH 2/3] chore: adjust brozzler_new_page comment --- brozzler/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 6b5c4536..9f7b3a95 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -440,7 +440,7 @@ def brozzler_new_site(argv=None): def brozzler_new_page(argv=None): """ Command line utility entry point for queuing a new brozzler page. - Takes a url and site_id and adds a page object in rethinkdb, which + Takes a url, site_id, and parent_page_id, and adds a page object in rethinkdb, which brozzler-workers will look at and start crawling. 
""" argv = argv or sys.argv From 5138fe95846b0982c8d100a8afbd0a0a18916b1f Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Tue, 24 Sep 2024 16:25:46 -0700 Subject: [PATCH 3/3] chore: formatting --- brozzler/cli.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 9f7b3a95..ddf26e6f 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -450,10 +450,14 @@ def brozzler_new_page(argv=None): formatter_class=BetterArgumentDefaultsHelpFormatter, ) arg_parser.add_argument("url", metavar="URL", help="URL to add to site") - arg_parser.add_argument("site_id", metavar="SITE_ID", - help="UUID of site object to add the page to") - arg_parser.add_argument("parent_page_id", metavar="PARENT_PAGE_ID", - help="ID of Page object to add the page as an outlink to") + arg_parser.add_argument( + "site_id", metavar="SITE_ID", help="UUID of site object to add the page to" + ) + arg_parser.add_argument( + "parent_page_id", + metavar="PARENT_PAGE_ID", + help="ID of Page object to add the page as an outlink to", + ) add_rethinkdb_options(arg_parser) add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -472,7 +476,9 @@ def brozzler_new_page(argv=None): parent_page = brozzler.Page(rr, parent_page_result) frontier = brozzler.RethinkDbFrontier(rr) - frontier.scope_and_schedule_outlinks(site=site, parent_page=parent_page, outlinks=[args.url]) + frontier.scope_and_schedule_outlinks( + site=site, parent_page=parent_page, outlinks=[args.url] + ) def brozzler_worker(argv=None): @@ -1103,7 +1109,7 @@ def brozzler_list_captures(argv=None): reql = reql.order_by(index="abbr_canon_surt_timestamp") reql = reql.filter( lambda capture: (capture["canon_surt"] >= key) - & (capture["canon_surt"] <= end_key) + & (capture["canon_surt"] <= end_key) ) logging.debug("querying rethinkdb: %s", reql) results = reql.run()