
Commit

DallanQ committed Oct 20, 2023
2 parents 1d7263d + 8955567 commit 8a32715
Showing 4 changed files with 201 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -12,6 +12,7 @@
/.env.*
__pycache__/
/notebooks/.ipynb_checkpoints/
+/models/.ipynb_checkpoints/
/notebooks/wandb/
.idea/
.venv/
3 changes: 2 additions & 1 deletion README.md
@@ -44,7 +44,7 @@ PINECONE_ENV=your_pinecone_environment_name (found on API keys page)

`mkdir data`

-`aws s3 sync s3://iloveconference.data data`
+`aws s3 sync s3://scripturecentralqa.data data`

## Developing

@@ -59,6 +59,7 @@ Run `nox` before creating a pull request to ensure that all checks pass.
### Running notebooks

After running `poetry shell`, you need to install the poetry virtual environment as a jupyter kernel.

Let's name it "models": `python -m ipykernel install --user --name models`
You only need to do this once.
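
An aside, not part of this commit: a minimal check that the kernel registration worked, using the `jupyter_client` API (installed alongside `ipykernel`). It is equivalent to running `jupyter kernelspec list` and looking for a "models" entry.

```python
# Sanity check (not from the repo): confirm the "models" kernel is visible to Jupyter.
from jupyter_client.kernelspec import KernelSpecManager

print("models" in KernelSpecManager().find_kernel_specs())  # expect True
```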

8 changes: 4 additions & 4 deletions notebooks/05_conference_crawler.ipynb
@@ -42,10 +42,10 @@
"outputs": [],
"source": [
"# config\n",
"years = range(2023, 2024)\n",
"months = [10]\n",
"years = range(1971, 2024)\n",
"months = [4, 10]\n",
"host = 'https://www.churchofjesuschrist.org'\n",
"base_dir = '../data/load/raw'\n",
"base_dir = '../data/load/raw/conference'\n",
"bs_parser = 'html.parser'\n",
"delay_seconds = 30"
]
@@ -128,7 +128,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.11.6"
}
},
"nbformat": 4,
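The hunks above only touch the conference notebook's config cell and kernel metadata; the crawl loop itself sits in cells that are not shown. A rough sketch of how the widened `years`/`months` ranges presumably feed that loop (the general-conference URL pattern and loop structure here are assumptions, not part of the diff):

```python
# Illustrative sketch only -- the actual loop lives in unchanged cells of
# 05_conference_crawler.ipynb. `years`, `months`, `host`, `base_dir`, and
# `delay_seconds` come from the config cell above; the URL pattern is assumed.
from models.crawl_utils import get_page  # same helper the magazine notebook imports

for year in years:        # now 1971..2023 rather than only 2023
    for month in months:  # April and October sessions rather than October only
        index_url = f"{host}/study/general-conference/{year}/{month:02d}?lang=eng"
        status_code, html = get_page(index_url, delay_seconds)
        if status_code != 200:
            print(f"Status code={status_code} url={index_url}")
            continue
        # ...parse talk links out of `html` and save each talk under base_dir
```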
194 changes: 194 additions & 0 deletions notebooks/05_magazine_crawler.ipynb
@@ -0,0 +1,194 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "936d02dd",
"metadata": {},
"source": [
"# Crawl Magazines from the Church of Jesus Christ of Latter-day Saints"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fe5bf12",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4bca89a2",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from urllib.parse import urljoin, urlparse\n",
"\n",
"from bs4 import BeautifulSoup\n",
"from tqdm.auto import tqdm\n",
"\n",
"from models.crawl_utils import get_page, save_page"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e753397e",
"metadata": {},
"outputs": [],
"source": [
"# config\n",
"magazine_urls = [\n",
" 'https://www.churchofjesuschrist.org/study/magazines/liahona?lang=eng',\n",
" 'https://www.churchofjesuschrist.org/study/magazines/ya-weekly?lang=eng',\n",
" 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth?lang=eng',\n",
" 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth/new-era-19712020?lang=eng',\n",
" 'https://www.churchofjesuschrist.org/study/magazines/friend?lang=eng',\n",
" 'https://www.churchofjesuschrist.org/study/magazines/ensign-19712020?lang=eng',\n",
"]\n",
"base_dir = '../data/load/raw/magazines'\n",
"bs_parser = 'html.parser'\n",
"delay_seconds = 30"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af2c51a9",
"metadata": {},
"outputs": [],
"source": [
"def _is_issue_link(url: str) -> bool:\n",
" path_components = urlparse(url).path.split('/')\n",
" # print('is_issue_link', url, path_components)\n",
" if len(path_components) < 5:\n",
" return False\n",
" elif path_components[4] == 'new-era-19712020':\n",
" # new-era issue links must have 6 path components\n",
" return len(path_components) == 6\n",
" else:\n",
" # all other issue links must have 5 components (first component is empty)\n",
" return len(path_components) == 5\n",
"\n",
"\n",
"def get_issue_links(base_url, html):\n",
" soup = BeautifulSoup(html, bs_parser)\n",
" return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \\\n",
" if _is_issue_link(urljoin(base_url, a['href']))]\n",
"\n",
"def get_year_month_links(url, html):\n",
" links = get_issue_links(url, html)\n",
" year_month_links = []\n",
" for link in tqdm(links):\n",
" path_components = urlparse(link).path.split('/')\n",
" # print('link and components', link, path_components)\n",
" if len(path_components[-1]) == 2 or path_components[-1].endswith('-se'):\n",
" # year-month link\n",
" # print('year-month link', link)\n",
" year_month_links.append(link)\n",
" elif len(path_components[-1]) == 4:\n",
" # year_only_link\n",
" # print('year-only link', link)\n",
" status_code, html = get_page(link, delay_seconds)\n",
" if status_code != 200:\n",
" print(f\"Status code={status_code} url={link}\")\n",
" continue\n",
" new_links = get_issue_links(link, html)\n",
" for new_link in new_links:\n",
" # print('issue link', new_link)\n",
" year_month_links.append(new_link)\n",
" else:\n",
" print('unexpected link', link, path_components[-1]) \n",
" # TODO remove break\n",
" break\n",
" return year_month_links\n",
"\n",
"def _is_article_link(url: str) -> bool:\n",
" path_components = urlparse(url).path.split('/')\n",
" # # must be 6 or 7 components (first component is empty)\n",
" return (len(path_components) == 6 or len(path_components) == 7) and \\\n",
" path_components[-2] != 'new-era-19712020' and path_components[-1] != 'contents'\n",
"\n",
"\n",
"def get_article_links(base_url, html):\n",
" soup = BeautifulSoup(html, bs_parser)\n",
" return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \\\n",
" if _is_article_link(urljoin(base_url, a['href']))]\n",
"\n",
"\n",
"def get_article_path(url):\n",
" path_components = urlparse(url).path.split('/')\n",
" path = '_'.join(path_components[2:])\n",
" return os.path.join(base_dir, f\"{path}.json\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f5ebfa4",
"metadata": {},
"outputs": [],
"source": [
"for url in tqdm(magazine_urls):\n",
" status_code, html = get_page(url, delay_seconds)\n",
" if status_code != 200:\n",
" print(f\"Status code={status_code} url={url}\")\n",
" continue\n",
" year_month_links = get_year_month_links(url, html)\n",
" print('year-month-links', url, len(year_month_links))\n",
" for link in tqdm(year_month_links):\n",
" print('year-month link', link)\n",
" status_code, html = get_page(link, delay_seconds)\n",
" if status_code != 200:\n",
" print(f\"Status code={status_code} url={url}\")\n",
" continue \n",
" article_links = get_article_links(link, html)\n",
" for article_link in tqdm(article_links):\n",
" path = get_article_path(article_link)\n",
" # print('path', path, article_link)\n",
" if os.path.exists(path):\n",
" continue\n",
" print(\" \", path)\n",
" status_code, html = get_page(article_link, delay_seconds)\n",
" if status_code != 200:\n",
" print(f\"Status code={status_code} url={article_link}\")\n",
" continue\n",
" save_page(path, article_link, html)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6f58e52",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "models",
"language": "python",
"name": "models"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
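
A worked example of the path-component checks in the new notebook (the URLs are hypothetical; the real site's URL layout is an assumption here, chosen only to show how `urlparse(...).path.split('/')` drives `_is_issue_link`, `get_year_month_links`, `_is_article_link`, and `get_article_path`):

```python
from urllib.parse import urlparse

# Hypothetical issue URL: 5 path components (the first is the empty string),
# so _is_issue_link returns True; the 2-character last component '10' makes
# get_year_month_links treat it as a year-month link.
issue_url = 'https://www.churchofjesuschrist.org/study/liahona/2023/10?lang=eng'
print(urlparse(issue_url).path.split('/'))
# ['', 'study', 'liahona', '2023', '10']

# Hypothetical article URL: 6 components, the last is not 'contents' and the
# one before it is not 'new-era-19712020', so _is_article_link returns True.
article_url = 'https://www.churchofjesuschrist.org/study/liahona/2023/10/welcome?lang=eng'
print(urlparse(article_url).path.split('/'))
# ['', 'study', 'liahona', '2023', '10', 'welcome']

# get_article_path joins components[2:] with '_' under base_dir, giving
# '../data/load/raw/magazines/liahona_2023_10_welcome.json'
```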

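The notebook imports `get_page` and `save_page` from `models.crawl_utils`, a module this commit does not touch. A minimal sketch of what those helpers plausibly look like, inferred only from how the notebook calls them (the real implementation may differ):

```python
# Hypothetical sketch of models/crawl_utils.py -- not part of this commit.
import json
import os
import time

import requests


def get_page(url: str, delay_seconds: int = 0) -> tuple[int, str]:
    """Politely fetch a page: wait delay_seconds, then return (status_code, html)."""
    time.sleep(delay_seconds)
    response = requests.get(url, timeout=30)
    return response.status_code, response.text


def save_page(path: str, url: str, html: str) -> None:
    """Write the crawled page and its source URL to a JSON file at path."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump({"url": url, "html": html}, f)
```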