From 119c6859d532b4bcff7899e2421e83193f788665 Mon Sep 17 00:00:00 2001 From: DallanQ Date: Wed, 11 Oct 2023 10:10:54 -0600 Subject: [PATCH 1/5] all magazines working except new era --- .gitignore | 1 + notebooks/05_magazine_crawler.ipynb | 437 ++++++++++++++++++++++++++++ 2 files changed, 438 insertions(+) create mode 100644 notebooks/05_magazine_crawler.ipynb diff --git a/.gitignore b/.gitignore index 79a2b31..046a699 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ /.env.* __pycache__/ /notebooks/.ipynb_checkpoints/ +/models/.ipynb_checkpoints/ /notebooks/wandb/ .idea/ .venv/ diff --git a/notebooks/05_magazine_crawler.ipynb b/notebooks/05_magazine_crawler.ipynb new file mode 100644 index 0000000..1448573 --- /dev/null +++ b/notebooks/05_magazine_crawler.ipynb @@ -0,0 +1,437 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "936d02dd", + "metadata": {}, + "source": [ + "# Crawl Magazines from the Church of Jesus Christ of Latter-day Saints" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7fe5bf12", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "4bca89a2", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "from urllib.parse import urljoin, urlparse\n", + "\n", + "from bs4 import BeautifulSoup\n", + "from tqdm.auto import tqdm\n", + "\n", + "from models.crawl_utils import get_page, save_page" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "e753397e", + "metadata": {}, + "outputs": [], + "source": [ + "# config\n", + "magazine_urls = [\n", + " # 'https://www.churchofjesuschrist.org/study/magazines/liahona?lang=eng',\n", + " # 'https://www.churchofjesuschrist.org/study/magazines/ya-weekly?lang=eng',\n", + " # 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth?lang=eng',\n", + " 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth/new-era-19712020?lang=eng',\n", + " # 'https://www.churchofjesuschrist.org/study/magazines/friend?lang=eng',\n", + " #'https://www.churchofjesuschrist.org/study/magazines/ensign-19712020?lang=eng'\n", + "]\n", + "base_dir = '../data/load/raw'\n", + "bs_parser = 'html.parser'\n", + "seconds_delay = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "af2c51a9", + "metadata": {}, + "outputs": [], + "source": [ + "def _is_issue_link(url: str) -> bool:\n", + " path_components = urlparse(url).path.split('/')\n", + " print('is_issue_link', url, path_components)\n", + " # must be 5 components (first component is empty)\n", + " # new-era issue links have 6 path components\n", + " return len(path_components) == 5 or (len(path_components) == 6 and path_components[4] == 'new-era-19712020')\n", + "\n", + "\n", + "def get_issue_links(base_url, html):\n", + " soup = BeautifulSoup(html, bs_parser)\n", + " return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \\\n", + " if _is_issue_link(urljoin(base_url, a['href']))]\n", + "\n", + "def get_year_month_links(url, html):\n", + " links = get_issue_links(url, html)\n", + " year_month_links = []\n", + " for link in tqdm(links):\n", + " path_components = urlparse(link).path.split('/')\n", + " print('link and components', link, path_components)\n", + " if len(path_components[-1]) == 2 or path_components[-1].endswith('-se'):\n", + " # year-month link\n", + " print('year-month link', link)\n", + " year_month_links.append(link)\n", + " 
elif len(path_components[-1]) == 4:\n", + " # year_only_link\n", + " print('year-only link', link)\n", + " time.sleep(seconds_delay) \n", + " status_code, html = get_page(link)\n", + " if status_code != 200:\n", + " print(f\"Status code={status_code} url={link}\")\n", + " continue\n", + " new_links = get_issue_links(link, html)\n", + " for new_link in new_links:\n", + " print('issue link', new_link)\n", + " year_month_links.append(new_link)\n", + " else:\n", + " print('unexpected link', link, path_components[-1]) \n", + " # TODO remove break\n", + " break\n", + " return year_month_links\n", + "\n", + "def _is_article_link(url: str) -> bool:\n", + " path_components = urlparse(url).path.split('/')\n", + " # # must be 6 or 7 components (first component is empty)\n", + " return len(path_components) == 6 or len(path_components) == 7\n", + "\n", + "\n", + "def get_article_links(base_url, html):\n", + " soup = BeautifulSoup(html, bs_parser)\n", + " return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \\\n", + " if _is_article_link(urljoin(base_url, a['href']))]\n", + "\n", + "\n", + "def get_article_path(url):\n", + " path_components = urlparse(url).path.split('/')\n", + " path = '_'.join(path_components[2:])\n", + " return os.path.join(base_dir, f\"{path}.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "2f5ebfa4", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a7c8554d8a074ab6b07bbcee01d85e65", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00 22\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(seconds_delay)\n\u001b[1;32m 23\u001b[0m status_code, html \u001b[38;5;241m=\u001b[39m get_page(article_link)\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "for url in tqdm(magazine_urls):\n", + " time.sleep(seconds_delay)\n", + " status_code, html = get_page(url)\n", + " if status_code != 200:\n", + " print(f\"Status code={status_code} url={url}\")\n", + " continue\n", + " year_month_links = get_year_month_links(url, html)\n", + " print('year-month-links', len(year_month_links))\n", + " for link in tqdm(year_month_links):\n", + " time.sleep(seconds_delay)\n", + " status_code, html = get_page(link)\n", + " if status_code != 200:\n", + " print(f\"Status code={status_code} url={url}\")\n", + " continue \n", + " article_links = get_article_links(link, html)\n", + " for article_link in tqdm(article_links):\n", + " path = get_article_path(article_link)\n", + " print('path', path, article_link)\n", + " if os.path.exists(path):\n", + " continue\n", + " print(\" \", path)\n", + " time.sleep(seconds_delay)\n", + " status_code, html = get_page(article_link)\n", + " if status_code != 200:\n", + " print(f\"Status code={status_code} url={article_link}\")\n", + " continue\n", + " save_page(path, article_link, html)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6f58e52", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "models", + "language": "python", + "name": "models" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" 
+ } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From aa6acf6531ef0fef35339277685dae17285f9241 Mon Sep 17 00:00:00 2001 From: DallanQ Date: Wed, 11 Oct 2023 10:38:18 -0600 Subject: [PATCH 2/5] finalize magazine crawler --- notebooks/05_magazine_crawler.ipynb | 283 ++++++++++------------------ 1 file changed, 103 insertions(+), 180 deletions(-) diff --git a/notebooks/05_magazine_crawler.ipynb b/notebooks/05_magazine_crawler.ipynb index 1448573..64b6c80 100644 --- a/notebooks/05_magazine_crawler.ipynb +++ b/notebooks/05_magazine_crawler.ipynb @@ -59,17 +59,22 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 70, "id": "af2c51a9", "metadata": {}, "outputs": [], "source": [ "def _is_issue_link(url: str) -> bool:\n", " path_components = urlparse(url).path.split('/')\n", - " print('is_issue_link', url, path_components)\n", - " # must be 5 components (first component is empty)\n", - " # new-era issue links have 6 path components\n", - " return len(path_components) == 5 or (len(path_components) == 6 and path_components[4] == 'new-era-19712020')\n", + " # print('is_issue_link', url, path_components)\n", + " if len(path_components) < 5:\n", + " return False\n", + " elif path_components[4] == 'new-era-19712020':\n", + " # new-era issue links must have 6 path components\n", + " return len(path_components) == 6\n", + " else:\n", + " # all other issue links must have 5 components (first component is empty)\n", + " return len(path_components) == 5\n", "\n", "\n", "def get_issue_links(base_url, html):\n", @@ -82,14 +87,14 @@ " year_month_links = []\n", " for link in tqdm(links):\n", " path_components = urlparse(link).path.split('/')\n", - " print('link and components', link, path_components)\n", + " # print('link and components', link, path_components)\n", " if len(path_components[-1]) == 2 or path_components[-1].endswith('-se'):\n", " # year-month link\n", - " print('year-month link', link)\n", + " # print('year-month link', link)\n", " year_month_links.append(link)\n", " elif len(path_components[-1]) == 4:\n", " # year_only_link\n", - " print('year-only link', link)\n", + " # print('year-only link', link)\n", " time.sleep(seconds_delay) \n", " status_code, html = get_page(link)\n", " if status_code != 200:\n", @@ -97,7 +102,7 @@ " continue\n", " new_links = get_issue_links(link, html)\n", " for new_link in new_links:\n", - " print('issue link', new_link)\n", + " # print('issue link', new_link)\n", " year_month_links.append(new_link)\n", " else:\n", " print('unexpected link', link, path_components[-1]) \n", @@ -108,7 +113,8 @@ "def _is_article_link(url: str) -> bool:\n", " path_components = urlparse(url).path.split('/')\n", " # # must be 6 or 7 components (first component is empty)\n", - " return len(path_components) == 6 or len(path_components) == 7\n", + " return (len(path_components) == 6 or len(path_components) == 7) and \\\n", + " path_components[-2] != 'new-era-19712020' and path_components[-1] != 'contents'\n", "\n", "\n", "def get_article_links(base_url, html):\n", @@ -125,14 +131,14 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 72, "id": "2f5ebfa4", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a7c8554d8a074ab6b07bbcee01d85e65", + "model_id": "0798b0665b13420a9dd5e93e83dc71bd", "version_major": 2, "version_minor": 0 }, @@ -143,75 +149,36 @@ "metadata": {}, "output_type": "display_data" }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": 
"9631b6adafed4a80869af96df6933da7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00 22\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(seconds_delay)\n\u001b[1;32m 23\u001b[0m status_code, html \u001b[38;5;241m=\u001b[39m get_page(article_link)\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n", + "Cell \u001b[0;32mIn[72], line 23\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m, path)\n\u001b[0;32m---> 23\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(seconds_delay)\n\u001b[1;32m 24\u001b[0m status_code, html \u001b[38;5;241m=\u001b[39m get_page(article_link)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } @@ -382,8 +304,9 @@ " print(f\"Status code={status_code} url={url}\")\n", " continue\n", " year_month_links = get_year_month_links(url, html)\n", - " print('year-month-links', len(year_month_links))\n", + " print('year-month-links', url, len(year_month_links))\n", " for link in tqdm(year_month_links):\n", + " print('year-month link', link)\n", " time.sleep(seconds_delay)\n", " status_code, html = get_page(link)\n", " if status_code != 200:\n", @@ -392,7 +315,7 @@ " article_links = get_article_links(link, html)\n", " for article_link in tqdm(article_links):\n", " path = get_article_path(article_link)\n", - " print('path', path, article_link)\n", + " # print('path', path, article_link)\n", " if os.path.exists(path):\n", " continue\n", " print(\" \", path)\n", From b2471a2e6f5151e0dd353c5a87a2d572e60be7b6 Mon Sep 17 00:00:00 2001 From: DallanQ Date: Wed, 11 Oct 2023 10:52:52 -0600 Subject: [PATCH 3/5] uncomment magazines --- notebooks/05_magazine_crawler.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/notebooks/05_magazine_crawler.ipynb b/notebooks/05_magazine_crawler.ipynb index 64b6c80..1803180 100644 --- a/notebooks/05_magazine_crawler.ipynb +++ b/notebooks/05_magazine_crawler.ipynb @@ -45,12 +45,12 @@ "source": [ "# config\n", "magazine_urls = [\n", - " # 'https://www.churchofjesuschrist.org/study/magazines/liahona?lang=eng',\n", - " # 'https://www.churchofjesuschrist.org/study/magazines/ya-weekly?lang=eng',\n", - " # 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth?lang=eng',\n", + " 'https://www.churchofjesuschrist.org/study/magazines/liahona?lang=eng',\n", + " 'https://www.churchofjesuschrist.org/study/magazines/ya-weekly?lang=eng',\n", + " 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth?lang=eng',\n", " 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth/new-era-19712020?lang=eng',\n", - " # 'https://www.churchofjesuschrist.org/study/magazines/friend?lang=eng',\n", - " #'https://www.churchofjesuschrist.org/study/magazines/ensign-19712020?lang=eng'\n", + " 'https://www.churchofjesuschrist.org/study/magazines/friend?lang=eng',\n", + " 'https://www.churchofjesuschrist.org/study/magazines/ensign-19712020?lang=eng',\n", "]\n", "base_dir = '../data/load/raw'\n", "bs_parser = 'html.parser'\n", From 6b1aa4d5822e12d96898c01552bf27e9496b175b Mon Sep 17 00:00:00 2001 From: DallanQ Date: Wed, 11 Oct 2023 11:07:59 -0600 Subject: [PATCH 4/5] add instructions 
to README --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e6d9a75..ab95cac 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ PINECONE_ENV=your_pinecone_environment_name (found on API keys page) `mkdir data` -`aws s3 sync s3://iloveconference.data data` +`aws s3 sync s3://scripturecentralqa.data data` ## Developing @@ -56,6 +56,13 @@ Run `nox` before creating a pull request to ensure that all checks pass. ### Running notebooks +After running `poetry shell`, you need to install the poetry virtual environment as a jupyter kernel. +Let's name it "models": `python -m ipykernel install --user --name models` +You only need to do this once. + +You can run notebooks either in VS Code, or in your browser. +To run notebooks in the browser, you run + `` env PYTHONPATH=`pwd` jupyter notebook `` or (if you have fish shell) From 6389379e25f078905d5ca86f1f2fd0d902360b06 Mon Sep 17 00:00:00 2001 From: DallanQ Date: Wed, 11 Oct 2023 12:11:23 -0600 Subject: [PATCH 5/5] move delay into get_page function --- models/crawl_utils.py | 8 +- notebooks/05_conference_crawler.ipynb | 19 ++- notebooks/05_magazine_crawler.ipynb | 190 ++------------------------ 3 files changed, 27 insertions(+), 190 deletions(-) diff --git a/models/crawl_utils.py b/models/crawl_utils.py index 9d8c567..df1ec0e 100644 --- a/models/crawl_utils.py +++ b/models/crawl_utils.py @@ -1,5 +1,6 @@ """Crawl utils.""" import json +import time from typing import Optional from typing import Tuple @@ -7,7 +8,11 @@ def get_page( - url: str, headers: Optional[dict[str, str]] = None, encoding: str = "utf-8", timeout: int = 30 + url: str, + delay_seconds: int = 30, + headers: Optional[dict[str, str]] = None, + encoding: str = "utf-8", + timeout: int = 30 ) -> Tuple[int, str]: """Get page from url.""" if headers is None: @@ -26,6 +31,7 @@ def get_page( "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", # noqa: B950 } response = requests.get(url, headers=headers, timeout=timeout) + time.sleep(delay_seconds) if encoding: response.encoding = encoding return response.status_code, response.text diff --git a/notebooks/05_conference_crawler.ipynb b/notebooks/05_conference_crawler.ipynb index 8b77ade..38762af 100644 --- a/notebooks/05_conference_crawler.ipynb +++ b/notebooks/05_conference_crawler.ipynb @@ -27,7 +27,6 @@ "outputs": [], "source": [ "import os\n", - "import time\n", "from urllib.parse import urljoin, urlparse\n", "\n", "from bs4 import BeautifulSoup\n", @@ -43,12 +42,12 @@ "outputs": [], "source": [ "# config\n", - "years = range(2023, 2024)\n", - "months = [10]\n", + "years = range(1971, 2024)\n", + "months = [4, 10]\n", "host = 'https://www.churchofjesuschrist.org'\n", - "base_dir = '../data/load/raw'\n", + "base_dir = '../data/load/raw/conference'\n", "bs_parser = 'html.parser'\n", - "seconds_delay = 30" + "delay_seconds = 30" ] }, { @@ -86,24 +85,22 @@ "for year in years:\n", " for month in months:\n", " dir_url = f\"{host}/study/general-conference/{year}/{month}?lang=eng\"\n", - " status_code, dir_html = get_page(dir_url)\n", + " status_code, dir_html = get_page(dir_url, delay_seconds)\n", " if status_code != 200:\n", " print(f\"Status code={status_code} url={dir_url}\")\n", " continue\n", " talk_urls = get_talk_urls(dir_url, dir_html)\n", " print(dir_url, len(talk_urls))\n", - " time.sleep(seconds_delay)\n", " for talk_url in talk_urls:\n", " path = get_talk_path(talk_url)\n", " if 
os.path.exists(path):\n", " continue\n", " print(\" \", path)\n", - " status_code, talk_html = get_page(talk_url)\n", + " status_code, talk_html = get_page(talk_url, delay_seconds)\n", " if status_code != 200:\n", " print(f\"Status code={status_code} url={talk_url}\")\n", " continue\n", - " save_page(path, talk_url, talk_html)\n", - " time.sleep(seconds_delay)" + " save_page(path, talk_url, talk_html)" ] }, { @@ -131,7 +128,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.11.6" } }, "nbformat": 4, diff --git a/notebooks/05_magazine_crawler.ipynb b/notebooks/05_magazine_crawler.ipynb index 1803180..a83b095 100644 --- a/notebooks/05_magazine_crawler.ipynb +++ b/notebooks/05_magazine_crawler.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "7fe5bf12", "metadata": {}, "outputs": [], @@ -21,13 +21,12 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "4bca89a2", "metadata": {}, "outputs": [], "source": [ "import os\n", - "import time\n", "from urllib.parse import urljoin, urlparse\n", "\n", "from bs4 import BeautifulSoup\n", @@ -38,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "id": "e753397e", "metadata": {}, "outputs": [], @@ -52,14 +51,14 @@ " 'https://www.churchofjesuschrist.org/study/magazines/friend?lang=eng',\n", " 'https://www.churchofjesuschrist.org/study/magazines/ensign-19712020?lang=eng',\n", "]\n", - "base_dir = '../data/load/raw'\n", + "base_dir = '../data/load/raw/magazines'\n", "bs_parser = 'html.parser'\n", - "seconds_delay = 3" + "delay_seconds = 30" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": null, "id": "af2c51a9", "metadata": {}, "outputs": [], @@ -95,8 +94,7 @@ " elif len(path_components[-1]) == 4:\n", " # year_only_link\n", " # print('year-only link', link)\n", - " time.sleep(seconds_delay) \n", - " status_code, html = get_page(link)\n", + " status_code, html = get_page(link, delay_seconds)\n", " if status_code != 200:\n", " print(f\"Status code={status_code} url={link}\")\n", " continue\n", @@ -131,175 +129,13 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": null, "id": "2f5ebfa4", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0798b0665b13420a9dd5e93e83dc71bd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00 23\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(seconds_delay)\n\u001b[1;32m 24\u001b[0m status_code, html \u001b[38;5;241m=\u001b[39m get_page(article_link)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], + "outputs": [], "source": [ "for url in tqdm(magazine_urls):\n", - " time.sleep(seconds_delay)\n", - " status_code, html = get_page(url)\n", + " status_code, html = get_page(url, delay_seconds)\n", " if status_code != 200:\n", " print(f\"Status code={status_code} url={url}\")\n", " continue\n", @@ -307,8 +143,7 @@ " print('year-month-links', url, len(year_month_links))\n", " for link in tqdm(year_month_links):\n", " print('year-month link', link)\n", - " time.sleep(seconds_delay)\n", - " status_code, html = get_page(link)\n", + " status_code, html = get_page(link, delay_seconds)\n", " if status_code != 200:\n", " print(f\"Status code={status_code} 
url={url}\")\n", " continue \n", @@ -319,8 +154,7 @@ " if os.path.exists(path):\n", " continue\n", " print(\" \", path)\n", - " time.sleep(seconds_delay)\n", - " status_code, html = get_page(article_link)\n", + " status_code, html = get_page(article_link, delay_seconds)\n", " if status_code != 200:\n", " print(f\"Status code={status_code} url={article_link}\")\n", " continue\n",
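
Note for the series: reassembled from the hunks above, `get_page` in `models/crawl_utils.py` should read as follows after patch 5. This is a sketch — the `requests` import and the remainder of the default-headers dict are outside the diff context and assumed unchanged:

```python
"""Crawl utils."""
import json  # used elsewhere in the module (save_page)
import time
from typing import Optional
from typing import Tuple

import requests  # assumed import; implied by the diff context


def get_page(
    url: str,
    delay_seconds: int = 30,
    headers: Optional[dict[str, str]] = None,
    encoding: str = "utf-8",
    timeout: int = 30,
) -> Tuple[int, str]:
    """Get page from url, then sleep delay_seconds to stay polite."""
    if headers is None:
        headers = {
            # ... additional browser-like headers elided in the diff ...
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",  # noqa: B950
        }
    response = requests.get(url, headers=headers, timeout=timeout)
    time.sleep(delay_seconds)  # delay now lives here, not at every call site
    if encoding:
        response.encoding = encoding
    return response.status_code, response.text
```

With the delay inside `get_page`, the per-call `time.sleep(seconds_delay)` lines disappear from both notebooks, and every call site passes the delay explicitly, e.g. `status_code, html = get_page(article_link, delay_seconds)`. One trade-off worth noting: the sleep now runs even when the request returns a non-200 status.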
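The issue-link heuristic introduced in patch 2 keys entirely off path-component counts, with a special case for the New Era archive. A quick sanity check against hypothetical URLs (illustrative paths only — the real site's URL layout may differ in detail, and query strings are stripped by `urlparse().path`):

```python
from urllib.parse import urlparse


def _is_issue_link(url: str) -> bool:
    """As defined in patch 2: classify a link by path-component count."""
    path_components = urlparse(url).path.split("/")
    if len(path_components) < 5:
        return False
    elif path_components[4] == "new-era-19712020":
        # new-era issue links must have 6 path components
        return len(path_components) == 6
    else:
        # all other issue links must have 5 components (first component is empty)
        return len(path_components) == 5


# Hypothetical examples:
assert _is_issue_link("https://example.org/study/magazines/liahona/2023")  # 5 components
assert _is_issue_link(
    "https://example.org/study/magazines/for-the-strength-of-youth/new-era-19712020/2019"
)  # 6 components, new-era archive
assert not _is_issue_link("https://example.org/study/magazines/liahona")  # only 4 components
```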
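Resumability of the crawl loop rests on `get_article_path` producing a stable filename per article URL, so the `os.path.exists` check can skip anything already saved and an interrupted run can be restarted safely. A worked example under the patch-5 `base_dir`, with a made-up article slug:

```python
import os
from urllib.parse import urlparse

base_dir = "../data/load/raw/magazines"  # per patch 5


def get_article_path(url):
    """As in the notebook: flatten the URL path (minus '/study') into one filename."""
    path_components = urlparse(url).path.split("/")
    path = "_".join(path_components[2:])
    return os.path.join(base_dir, f"{path}.json")


# Hypothetical URL; the ?lang=eng query string is dropped by urlparse().path:
print(get_article_path("https://example.org/study/magazines/liahona/2023/10/example-article?lang=eng"))
# -> ../data/load/raw/magazines/magazines_liahona_2023_10_example-article.json
```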