From 119c6859d532b4bcff7899e2421e83193f788665 Mon Sep 17 00:00:00 2001 From: DallanQ Date: Wed, 11 Oct 2023 10:10:54 -0600 Subject: [PATCH 1/5] all magazines working except new era --- .gitignore | 1 + notebooks/05_magazine_crawler.ipynb | 437 ++++++++++++++++++++++++++++ 2 files changed, 438 insertions(+) create mode 100644 notebooks/05_magazine_crawler.ipynb diff --git a/.gitignore b/.gitignore index 79a2b31..046a699 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ /.env.* __pycache__/ /notebooks/.ipynb_checkpoints/ +/models/.ipynb_checkpoints/ /notebooks/wandb/ .idea/ .venv/ diff --git a/notebooks/05_magazine_crawler.ipynb b/notebooks/05_magazine_crawler.ipynb new file mode 100644 index 0000000..1448573 --- /dev/null +++ b/notebooks/05_magazine_crawler.ipynb @@ -0,0 +1,437 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "936d02dd", + "metadata": {}, + "source": [ + "# Crawl Magazines from the Church of Jesus Christ of Latter-day Saints" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7fe5bf12", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "4bca89a2", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "from urllib.parse import urljoin, urlparse\n", + "\n", + "from bs4 import BeautifulSoup\n", + "from tqdm.auto import tqdm\n", + "\n", + "from models.crawl_utils import get_page, save_page" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "e753397e", + "metadata": {}, + "outputs": [], + "source": [ + "# config\n", + "magazine_urls = [\n", + " # 'https://www.churchofjesuschrist.org/study/magazines/liahona?lang=eng',\n", + " # 'https://www.churchofjesuschrist.org/study/magazines/ya-weekly?lang=eng',\n", + " # 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth?lang=eng',\n", + " 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth/new-era-19712020?lang=eng',\n", + " # 'https://www.churchofjesuschrist.org/study/magazines/friend?lang=eng',\n", + " #'https://www.churchofjesuschrist.org/study/magazines/ensign-19712020?lang=eng'\n", + "]\n", + "base_dir = '../data/load/raw'\n", + "bs_parser = 'html.parser'\n", + "seconds_delay = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "af2c51a9", + "metadata": {}, + "outputs": [], + "source": [ + "def _is_issue_link(url: str) -> bool:\n", + " path_components = urlparse(url).path.split('/')\n", + " print('is_issue_link', url, path_components)\n", + " # must be 5 components (first component is empty)\n", + " # new-era issue links have 6 path components\n", + " return len(path_components) == 5 or (len(path_components) == 6 and path_components[4] == 'new-era-19712020')\n", + "\n", + "\n", + "def get_issue_links(base_url, html):\n", + " soup = BeautifulSoup(html, bs_parser)\n", + " return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \\\n", + " if _is_issue_link(urljoin(base_url, a['href']))]\n", + "\n", + "def get_year_month_links(url, html):\n", + " links = get_issue_links(url, html)\n", + " year_month_links = []\n", + " for link in tqdm(links):\n", + " path_components = urlparse(link).path.split('/')\n", + " print('link and components', link, path_components)\n", + " if len(path_components[-1]) == 2 or path_components[-1].endswith('-se'):\n", + " # year-month link\n", + " print('year-month link', link)\n", + " year_month_links.append(link)\n", + " 
elif len(path_components[-1]) == 4:\n", + " # year_only_link\n", + " print('year-only link', link)\n", + " time.sleep(seconds_delay) \n", + " status_code, html = get_page(link)\n", + " if status_code != 200:\n", + " print(f\"Status code={status_code} url={link}\")\n", + " continue\n", + " new_links = get_issue_links(link, html)\n", + " for new_link in new_links:\n", + " print('issue link', new_link)\n", + " year_month_links.append(new_link)\n", + " else:\n", + " print('unexpected link', link, path_components[-1]) \n", + " # TODO remove break\n", + " break\n", + " return year_month_links\n", + "\n", + "def _is_article_link(url: str) -> bool:\n", + " path_components = urlparse(url).path.split('/')\n", + " # # must be 6 or 7 components (first component is empty)\n", + " return len(path_components) == 6 or len(path_components) == 7\n", + "\n", + "\n", + "def get_article_links(base_url, html):\n", + " soup = BeautifulSoup(html, bs_parser)\n", + " return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \\\n", + " if _is_article_link(urljoin(base_url, a['href']))]\n", + "\n", + "\n", + "def get_article_path(url):\n", + " path_components = urlparse(url).path.split('/')\n", + " path = '_'.join(path_components[2:])\n", + " return os.path.join(base_dir, f\"{path}.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "2f5ebfa4", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a7c8554d8a074ab6b07bbcee01d85e65", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00 22\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(seconds_delay)\n\u001b[1;32m 23\u001b[0m status_code, html \u001b[38;5;241m=\u001b[39m get_page(article_link)\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "for url in tqdm(magazine_urls):\n", + " time.sleep(seconds_delay)\n", + " status_code, html = get_page(url)\n", + " if status_code != 200:\n", + " print(f\"Status code={status_code} url={url}\")\n", + " continue\n", + " year_month_links = get_year_month_links(url, html)\n", + " print('year-month-links', len(year_month_links))\n", + " for link in tqdm(year_month_links):\n", + " time.sleep(seconds_delay)\n", + " status_code, html = get_page(link)\n", + " if status_code != 200:\n", + " print(f\"Status code={status_code} url={url}\")\n", + " continue \n", + " article_links = get_article_links(link, html)\n", + " for article_link in tqdm(article_links):\n", + " path = get_article_path(article_link)\n", + " print('path', path, article_link)\n", + " if os.path.exists(path):\n", + " continue\n", + " print(\" \", path)\n", + " time.sleep(seconds_delay)\n", + " status_code, html = get_page(article_link)\n", + " if status_code != 200:\n", + " print(f\"Status code={status_code} url={article_link}\")\n", + " continue\n", + " save_page(path, article_link, html)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6f58e52", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "models", + "language": "python", + "name": "models" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" 
+ } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From aa6acf6531ef0fef35339277685dae17285f9241 Mon Sep 17 00:00:00 2001 From: DallanQ Date: Wed, 11 Oct 2023 10:38:18 -0600 Subject: [PATCH 2/5] finalize magazine crawler --- notebooks/05_magazine_crawler.ipynb | 283 ++++++++++------------------ 1 file changed, 103 insertions(+), 180 deletions(-) diff --git a/notebooks/05_magazine_crawler.ipynb b/notebooks/05_magazine_crawler.ipynb index 1448573..64b6c80 100644 --- a/notebooks/05_magazine_crawler.ipynb +++ b/notebooks/05_magazine_crawler.ipynb @@ -59,17 +59,22 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 70, "id": "af2c51a9", "metadata": {}, "outputs": [], "source": [ "def _is_issue_link(url: str) -> bool:\n", " path_components = urlparse(url).path.split('/')\n", - " print('is_issue_link', url, path_components)\n", - " # must be 5 components (first component is empty)\n", - " # new-era issue links have 6 path components\n", - " return len(path_components) == 5 or (len(path_components) == 6 and path_components[4] == 'new-era-19712020')\n", + " # print('is_issue_link', url, path_components)\n", + " if len(path_components) < 5:\n", + " return False\n", + " elif path_components[4] == 'new-era-19712020':\n", + " # new-era issue links must have 6 path components\n", + " return len(path_components) == 6\n", + " else:\n", + " # all other issue links must have 5 components (first component is empty)\n", + " return len(path_components) == 5\n", "\n", "\n", "def get_issue_links(base_url, html):\n", @@ -82,14 +87,14 @@ " year_month_links = []\n", " for link in tqdm(links):\n", " path_components = urlparse(link).path.split('/')\n", - " print('link and components', link, path_components)\n", + " # print('link and components', link, path_components)\n", " if len(path_components[-1]) == 2 or path_components[-1].endswith('-se'):\n", " # year-month link\n", - " print('year-month link', link)\n", + " # print('year-month link', link)\n", " year_month_links.append(link)\n", " elif len(path_components[-1]) == 4:\n", " # year_only_link\n", - " print('year-only link', link)\n", + " # print('year-only link', link)\n", " time.sleep(seconds_delay) \n", " status_code, html = get_page(link)\n", " if status_code != 200:\n", @@ -97,7 +102,7 @@ " continue\n", " new_links = get_issue_links(link, html)\n", " for new_link in new_links:\n", - " print('issue link', new_link)\n", + " # print('issue link', new_link)\n", " year_month_links.append(new_link)\n", " else:\n", " print('unexpected link', link, path_components[-1]) \n", @@ -108,7 +113,8 @@ "def _is_article_link(url: str) -> bool:\n", " path_components = urlparse(url).path.split('/')\n", " # # must be 6 or 7 components (first component is empty)\n", - " return len(path_components) == 6 or len(path_components) == 7\n", + " return (len(path_components) == 6 or len(path_components) == 7) and \\\n", + " path_components[-2] != 'new-era-19712020' and path_components[-1] != 'contents'\n", "\n", "\n", "def get_article_links(base_url, html):\n", @@ -125,14 +131,14 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 72, "id": "2f5ebfa4", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a7c8554d8a074ab6b07bbcee01d85e65", + "model_id": "0798b0665b13420a9dd5e93e83dc71bd", "version_major": 2, "version_minor": 0 }, @@ -143,75 +149,36 @@ "metadata": {}, "output_type": "display_data" }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": 
"9631b6adafed4a80869af96df6933da7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00 22\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(seconds_delay)\n\u001b[1;32m 23\u001b[0m status_code, html \u001b[38;5;241m=\u001b[39m get_page(article_link)\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n", + "Cell \u001b[0;32mIn[72], line 23\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m, path)\n\u001b[0;32m---> 23\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(seconds_delay)\n\u001b[1;32m 24\u001b[0m status_code, html \u001b[38;5;241m=\u001b[39m get_page(article_link)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } @@ -382,8 +304,9 @@ " print(f\"Status code={status_code} url={url}\")\n", " continue\n", " year_month_links = get_year_month_links(url, html)\n", - " print('year-month-links', len(year_month_links))\n", + " print('year-month-links', url, len(year_month_links))\n", " for link in tqdm(year_month_links):\n", + " print('year-month link', link)\n", " time.sleep(seconds_delay)\n", " status_code, html = get_page(link)\n", " if status_code != 200:\n", @@ -392,7 +315,7 @@ " article_links = get_article_links(link, html)\n", " for article_link in tqdm(article_links):\n", " path = get_article_path(article_link)\n", - " print('path', path, article_link)\n", + " # print('path', path, article_link)\n", " if os.path.exists(path):\n", " continue\n", " print(\" \", path)\n", From b2471a2e6f5151e0dd353c5a87a2d572e60be7b6 Mon Sep 17 00:00:00 2001 From: DallanQ Date: Wed, 11 Oct 2023 10:52:52 -0600 Subject: [PATCH 3/5] uncomment magazines --- notebooks/05_magazine_crawler.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/notebooks/05_magazine_crawler.ipynb b/notebooks/05_magazine_crawler.ipynb index 64b6c80..1803180 100644 --- a/notebooks/05_magazine_crawler.ipynb +++ b/notebooks/05_magazine_crawler.ipynb @@ -45,12 +45,12 @@ "source": [ "# config\n", "magazine_urls = [\n", - " # 'https://www.churchofjesuschrist.org/study/magazines/liahona?lang=eng',\n", - " # 'https://www.churchofjesuschrist.org/study/magazines/ya-weekly?lang=eng',\n", - " # 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth?lang=eng',\n", + " 'https://www.churchofjesuschrist.org/study/magazines/liahona?lang=eng',\n", + " 'https://www.churchofjesuschrist.org/study/magazines/ya-weekly?lang=eng',\n", + " 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth?lang=eng',\n", " 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth/new-era-19712020?lang=eng',\n", - " # 'https://www.churchofjesuschrist.org/study/magazines/friend?lang=eng',\n", - " #'https://www.churchofjesuschrist.org/study/magazines/ensign-19712020?lang=eng'\n", + " 'https://www.churchofjesuschrist.org/study/magazines/friend?lang=eng',\n", + " 'https://www.churchofjesuschrist.org/study/magazines/ensign-19712020?lang=eng',\n", "]\n", "base_dir = '../data/load/raw'\n", "bs_parser = 'html.parser'\n", From 6b1aa4d5822e12d96898c01552bf27e9496b175b Mon Sep 17 00:00:00 2001 From: DallanQ Date: Wed, 11 Oct 2023 11:07:59 -0600 Subject: [PATCH 4/5] add instructions 
to README --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e6d9a75..ab95cac 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ PINECONE_ENV=your_pinecone_environment_name (found on API keys page) `mkdir data` -`aws s3 sync s3://iloveconference.data data` +`aws s3 sync s3://scripturecentralqa.data data` ## Developing @@ -56,6 +56,13 @@ Run `nox` before creating a pull request to ensure that all checks pass. ### Running notebooks +After running `poetry shell`, you need to install the poetry virtual environment as a jupyter kernel. +Let's name it "models": `python -m ipykernel install --user --name models` +You only need to do this once. + +You can run notebooks either in VS Code, or in your browser. +To run notebooks in the browser, you run + `` env PYTHONPATH=`pwd` jupyter notebook `` or (if you have fish shell) From 6389379e25f078905d5ca86f1f2fd0d902360b06 Mon Sep 17 00:00:00 2001 From: DallanQ Date: Wed, 11 Oct 2023 12:11:23 -0600 Subject: [PATCH 5/5] move delay into get_page function --- models/crawl_utils.py | 8 +- notebooks/05_conference_crawler.ipynb | 19 ++- notebooks/05_magazine_crawler.ipynb | 190 ++------------------------ 3 files changed, 27 insertions(+), 190 deletions(-) diff --git a/models/crawl_utils.py b/models/crawl_utils.py index 9d8c567..df1ec0e 100644 --- a/models/crawl_utils.py +++ b/models/crawl_utils.py @@ -1,5 +1,6 @@ """Crawl utils.""" import json +import time from typing import Optional from typing import Tuple @@ -7,7 +8,11 @@ def get_page( - url: str, headers: Optional[dict[str, str]] = None, encoding: str = "utf-8", timeout: int = 30 + url: str, + delay_seconds: int = 30, + headers: Optional[dict[str, str]] = None, + encoding: str = "utf-8", + timeout: int = 30 ) -> Tuple[int, str]: """Get page from url.""" if headers is None: @@ -26,6 +31,7 @@ def get_page( "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", # noqa: B950 } response = requests.get(url, headers=headers, timeout=timeout) + time.sleep(delay_seconds) if encoding: response.encoding = encoding return response.status_code, response.text diff --git a/notebooks/05_conference_crawler.ipynb b/notebooks/05_conference_crawler.ipynb index 8b77ade..38762af 100644 --- a/notebooks/05_conference_crawler.ipynb +++ b/notebooks/05_conference_crawler.ipynb @@ -27,7 +27,6 @@ "outputs": [], "source": [ "import os\n", - "import time\n", "from urllib.parse import urljoin, urlparse\n", "\n", "from bs4 import BeautifulSoup\n", @@ -43,12 +42,12 @@ "outputs": [], "source": [ "# config\n", - "years = range(2023, 2024)\n", - "months = [10]\n", + "years = range(1971, 2024)\n", + "months = [4, 10]\n", "host = 'https://www.churchofjesuschrist.org'\n", - "base_dir = '../data/load/raw'\n", + "base_dir = '../data/load/raw/conference'\n", "bs_parser = 'html.parser'\n", - "seconds_delay = 30" + "delay_seconds = 30" ] }, { @@ -86,24 +85,22 @@ "for year in years:\n", " for month in months:\n", " dir_url = f\"{host}/study/general-conference/{year}/{month}?lang=eng\"\n", - " status_code, dir_html = get_page(dir_url)\n", + " status_code, dir_html = get_page(dir_url, delay_seconds)\n", " if status_code != 200:\n", " print(f\"Status code={status_code} url={dir_url}\")\n", " continue\n", " talk_urls = get_talk_urls(dir_url, dir_html)\n", " print(dir_url, len(talk_urls))\n", - " time.sleep(seconds_delay)\n", " for talk_url in talk_urls:\n", " path = get_talk_path(talk_url)\n", " if 
os.path.exists(path):\n", " continue\n", " print(\" \", path)\n", - " status_code, talk_html = get_page(talk_url)\n", + " status_code, talk_html = get_page(talk_url, delay_seconds)\n", " if status_code != 200:\n", " print(f\"Status code={status_code} url={talk_url}\")\n", " continue\n", - " save_page(path, talk_url, talk_html)\n", - " time.sleep(seconds_delay)" + " save_page(path, talk_url, talk_html)" ] }, { @@ -131,7 +128,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.11.6" } }, "nbformat": 4, diff --git a/notebooks/05_magazine_crawler.ipynb b/notebooks/05_magazine_crawler.ipynb index 1803180..a83b095 100644 --- a/notebooks/05_magazine_crawler.ipynb +++ b/notebooks/05_magazine_crawler.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "7fe5bf12", "metadata": {}, "outputs": [], @@ -21,13 +21,12 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "4bca89a2", "metadata": {}, "outputs": [], "source": [ "import os\n", - "import time\n", "from urllib.parse import urljoin, urlparse\n", "\n", "from bs4 import BeautifulSoup\n", @@ -38,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "id": "e753397e", "metadata": {}, "outputs": [], @@ -52,14 +51,14 @@ " 'https://www.churchofjesuschrist.org/study/magazines/friend?lang=eng',\n", " 'https://www.churchofjesuschrist.org/study/magazines/ensign-19712020?lang=eng',\n", "]\n", - "base_dir = '../data/load/raw'\n", + "base_dir = '../data/load/raw/magazines'\n", "bs_parser = 'html.parser'\n", - "seconds_delay = 3" + "delay_seconds = 30" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": null, "id": "af2c51a9", "metadata": {}, "outputs": [], @@ -95,8 +94,7 @@ " elif len(path_components[-1]) == 4:\n", " # year_only_link\n", " # print('year-only link', link)\n", - " time.sleep(seconds_delay) \n", - " status_code, html = get_page(link)\n", + " status_code, html = get_page(link, delay_seconds)\n", " if status_code != 200:\n", " print(f\"Status code={status_code} url={link}\")\n", " continue\n", @@ -131,175 +129,13 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": null, "id": "2f5ebfa4", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0798b0665b13420a9dd5e93e83dc71bd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00 23\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(seconds_delay)\n\u001b[1;32m 24\u001b[0m status_code, html \u001b[38;5;241m=\u001b[39m get_page(article_link)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], + "outputs": [], "source": [ "for url in tqdm(magazine_urls):\n", - " time.sleep(seconds_delay)\n", - " status_code, html = get_page(url)\n", + " status_code, html = get_page(url, delay_seconds)\n", " if status_code != 200:\n", " print(f\"Status code={status_code} url={url}\")\n", " continue\n", @@ -307,8 +143,7 @@ " print('year-month-links', url, len(year_month_links))\n", " for link in tqdm(year_month_links):\n", " print('year-month link', link)\n", - " time.sleep(seconds_delay)\n", - " status_code, html = get_page(link)\n", + " status_code, html = get_page(link, delay_seconds)\n", " if status_code != 200:\n", " print(f\"Status code={status_code} 
url={url}\")\n", " continue \n", @@ -319,8 +154,7 @@ " if os.path.exists(path):\n", " continue\n", " print(\" \", path)\n", - " time.sleep(seconds_delay)\n", - " status_code, html = get_page(article_link)\n", + " status_code, html = get_page(article_link, delay_seconds)\n", " if status_code != 200:\n", " print(f\"Status code={status_code} url={article_link}\")\n", " continue\n",
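
Note for the series: reassembled from the hunks above, `get_page` in `models/crawl_utils.py` should read as follows after patch 5. This is a sketch — the `requests` import and the remainder of the default-headers dict are outside the diff context and assumed unchanged:

```python
"""Crawl utils."""
import json  # used elsewhere in the module (save_page)
import time
from typing import Optional
from typing import Tuple

import requests  # assumed import; implied by the diff context


def get_page(
    url: str,
    delay_seconds: int = 30,
    headers: Optional[dict[str, str]] = None,
    encoding: str = "utf-8",
    timeout: int = 30,
) -> Tuple[int, str]:
    """Get page from url, then sleep delay_seconds to stay polite."""
    if headers is None:
        headers = {
            # ... additional browser-like headers elided in the diff ...
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",  # noqa: B950
        }
    response = requests.get(url, headers=headers, timeout=timeout)
    time.sleep(delay_seconds)  # delay now lives here, not at every call site
    if encoding:
        response.encoding = encoding
    return response.status_code, response.text
```

With the delay inside `get_page`, the per-call `time.sleep(seconds_delay)` lines disappear from both notebooks, and every call site passes the delay explicitly, e.g. `status_code, html = get_page(article_link, delay_seconds)`. One trade-off worth noting: the sleep now runs even when the request returns a non-200 status.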
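The issue-link heuristic introduced in patch 2 keys entirely off path-component counts, with a special case for the New Era archive. A quick sanity check against hypothetical URLs (illustrative paths only — the real site's URL layout may differ in detail, and query strings are stripped by `urlparse().path`):

```python
from urllib.parse import urlparse


def _is_issue_link(url: str) -> bool:
    """As defined in patch 2: classify a link by path-component count."""
    path_components = urlparse(url).path.split("/")
    if len(path_components) < 5:
        return False
    elif path_components[4] == "new-era-19712020":
        # new-era issue links must have 6 path components
        return len(path_components) == 6
    else:
        # all other issue links must have 5 components (first component is empty)
        return len(path_components) == 5


# Hypothetical examples:
assert _is_issue_link("https://example.org/study/magazines/liahona/2023")  # 5 components
assert _is_issue_link(
    "https://example.org/study/magazines/for-the-strength-of-youth/new-era-19712020/2019"
)  # 6 components, new-era archive
assert not _is_issue_link("https://example.org/study/magazines/liahona")  # only 4 components
```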
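Resumability of the crawl loop rests on `get_article_path` producing a stable filename per article URL, so the `os.path.exists` check can skip anything already saved and an interrupted run can be restarted safely. A worked example under the patch-5 `base_dir`, with a made-up article slug:

```python
import os
from urllib.parse import urlparse

base_dir = "../data/load/raw/magazines"  # per patch 5


def get_article_path(url):
    """As in the notebook: flatten the URL path (minus '/study') into one filename."""
    path_components = urlparse(url).path.split("/")
    path = "_".join(path_components[2:])
    return os.path.join(base_dir, f"{path}.json")


# Hypothetical URL; the ?lang=eng query string is dropped by urlparse().path:
print(get_article_path("https://example.org/study/magazines/liahona/2023/10/example-article?lang=eng"))
# -> ../data/load/raw/magazines/magazines_liahona_2023_10_example-article.json
```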