diff --git a/.gitignore b/.gitignore
index 79a2b31..046a699 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@
 /.env.*
 __pycache__/
 /notebooks/.ipynb_checkpoints/
+/models/.ipynb_checkpoints/
 /notebooks/wandb/
 .idea/
 .venv/
diff --git a/README.md b/README.md
index 23dfca1..8987c08 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@
 PINECONE_ENV=your_pinecone_environment_name (found on API keys page)
 
 `mkdir data`
 
-`aws s3 sync s3://iloveconference.data data`
+`aws s3 sync s3://scripturecentralqa.data data`
 
 ## Developing
@@ -59,6 +59,7 @@
 Run `nox` before creating a pull request to ensure that all checks pass.
 
 ### Running notebooks
 After running `poetry shell`, you need to install the poetry virtual environment as a jupyter kernel.
+Let's name it "models":
 `python -m ipykernel install --user --name models`
 You only need to do this once.
diff --git a/notebooks/05_conference_crawler.ipynb b/notebooks/05_conference_crawler.ipynb
index 26a895f..38762af 100644
--- a/notebooks/05_conference_crawler.ipynb
+++ b/notebooks/05_conference_crawler.ipynb
@@ -42,10 +42,10 @@
    "outputs": [],
    "source": [
     "# config\n",
-    "years = range(2023, 2024)\n",
-    "months = [10]\n",
+    "years = range(1971, 2024)\n",
+    "months = [4, 10]\n",
     "host = 'https://www.churchofjesuschrist.org'\n",
-    "base_dir = '../data/load/raw'\n",
+    "base_dir = '../data/load/raw/conference'\n",
     "bs_parser = 'html.parser'\n",
     "delay_seconds = 30"
    ]
@@ -128,7 +128,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.11.6"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/05_magazine_crawler.ipynb b/notebooks/05_magazine_crawler.ipynb
new file mode 100644
index 0000000..a83b095
--- /dev/null
+++ b/notebooks/05_magazine_crawler.ipynb
@@ -0,0 +1,194 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "936d02dd",
+   "metadata": {},
+   "source": [
+    "# Crawl Magazines from the Church of Jesus Christ of Latter-day Saints"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7fe5bf12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bca89a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from urllib.parse import urljoin, urlparse\n",
+    "\n",
+    "from bs4 import BeautifulSoup\n",
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "from models.crawl_utils import get_page, save_page"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e753397e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# config\n",
+    "magazine_urls = [\n",
+    "    'https://www.churchofjesuschrist.org/study/magazines/liahona?lang=eng',\n",
+    "    'https://www.churchofjesuschrist.org/study/magazines/ya-weekly?lang=eng',\n",
+    "    'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth?lang=eng',\n",
+    "    'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth/new-era-19712020?lang=eng',\n",
+    "    'https://www.churchofjesuschrist.org/study/magazines/friend?lang=eng',\n",
+    "    'https://www.churchofjesuschrist.org/study/magazines/ensign-19712020?lang=eng',\n",
+    "]\n",
+    "base_dir = '../data/load/raw/magazines'\n",
+    "bs_parser = 'html.parser'\n",
+    "delay_seconds = 30"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "af2c51a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _is_issue_link(url: str) -> bool:\n",
+    "    path_components = urlparse(url).path.split('/')\n",
" # print('is_issue_link', url, path_components)\n", + " if len(path_components) < 5:\n", + " return False\n", + " elif path_components[4] == 'new-era-19712020':\n", + " # new-era issue links must have 6 path components\n", + " return len(path_components) == 6\n", + " else:\n", + " # all other issue links must have 5 components (first component is empty)\n", + " return len(path_components) == 5\n", + "\n", + "\n", + "def get_issue_links(base_url, html):\n", + " soup = BeautifulSoup(html, bs_parser)\n", + " return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \\\n", + " if _is_issue_link(urljoin(base_url, a['href']))]\n", + "\n", + "def get_year_month_links(url, html):\n", + " links = get_issue_links(url, html)\n", + " year_month_links = []\n", + " for link in tqdm(links):\n", + " path_components = urlparse(link).path.split('/')\n", + " # print('link and components', link, path_components)\n", + " if len(path_components[-1]) == 2 or path_components[-1].endswith('-se'):\n", + " # year-month link\n", + " # print('year-month link', link)\n", + " year_month_links.append(link)\n", + " elif len(path_components[-1]) == 4:\n", + " # year_only_link\n", + " # print('year-only link', link)\n", + " status_code, html = get_page(link, delay_seconds)\n", + " if status_code != 200:\n", + " print(f\"Status code={status_code} url={link}\")\n", + " continue\n", + " new_links = get_issue_links(link, html)\n", + " for new_link in new_links:\n", + " # print('issue link', new_link)\n", + " year_month_links.append(new_link)\n", + " else:\n", + " print('unexpected link', link, path_components[-1]) \n", + " # TODO remove break\n", + " break\n", + " return year_month_links\n", + "\n", + "def _is_article_link(url: str) -> bool:\n", + " path_components = urlparse(url).path.split('/')\n", + " # # must be 6 or 7 components (first component is empty)\n", + " return (len(path_components) == 6 or len(path_components) == 7) and \\\n", + " path_components[-2] != 'new-era-19712020' and path_components[-1] != 'contents'\n", + "\n", + "\n", + "def get_article_links(base_url, html):\n", + " soup = BeautifulSoup(html, bs_parser)\n", + " return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \\\n", + " if _is_article_link(urljoin(base_url, a['href']))]\n", + "\n", + "\n", + "def get_article_path(url):\n", + " path_components = urlparse(url).path.split('/')\n", + " path = '_'.join(path_components[2:])\n", + " return os.path.join(base_dir, f\"{path}.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f5ebfa4", + "metadata": {}, + "outputs": [], + "source": [ + "for url in tqdm(magazine_urls):\n", + " status_code, html = get_page(url, delay_seconds)\n", + " if status_code != 200:\n", + " print(f\"Status code={status_code} url={url}\")\n", + " continue\n", + " year_month_links = get_year_month_links(url, html)\n", + " print('year-month-links', url, len(year_month_links))\n", + " for link in tqdm(year_month_links):\n", + " print('year-month link', link)\n", + " status_code, html = get_page(link, delay_seconds)\n", + " if status_code != 200:\n", + " print(f\"Status code={status_code} url={url}\")\n", + " continue \n", + " article_links = get_article_links(link, html)\n", + " for article_link in tqdm(article_links):\n", + " path = get_article_path(article_link)\n", + " # print('path', path, article_link)\n", + " if os.path.exists(path):\n", + " continue\n", + " print(\" \", path)\n", + " status_code, html = get_page(article_link, delay_seconds)\n", + " if 
+    "            if status_code != 200:\n",
+    "                print(f\"Status code={status_code} url={article_link}\")\n",
+    "                continue\n",
+    "            save_page(path, article_link, html)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6f58e52",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "models",
+   "language": "python",
+   "name": "models"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
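
Note: the snippet below is not part of the patch. It is a minimal standalone sketch of the path-component heuristics that 05_magazine_crawler.ipynb applies to anchor hrefs when deciding whether a link points to an issue or an article; the example URLs are hypothetical and only illustrate the component counts the notebook checks for.

# Sketch of the heuristics behind _is_issue_link / _is_article_link.
from urllib.parse import urlparse


def is_issue_link(url: str) -> bool:
    # An issue page has 5 path components (the leading '/' yields an empty first
    # component); issues under the combined new-era-19712020 archive have 6.
    parts = urlparse(url).path.split('/')
    if len(parts) < 5:
        return False
    if parts[4] == 'new-era-19712020':
        return len(parts) == 6
    return len(parts) == 5


def is_article_link(url: str) -> bool:
    # An article page has 6 or 7 path components and is not a table of contents.
    parts = urlparse(url).path.split('/')
    return len(parts) in (6, 7) and parts[-2] != 'new-era-19712020' and parts[-1] != 'contents'


# Hypothetical URL shapes, assumed for illustration only:
issue = 'https://www.churchofjesuschrist.org/study/liahona/2021/01?lang=eng'
article = 'https://www.churchofjesuschrist.org/study/liahona/2021/01/a-sample-article?lang=eng'
print(is_issue_link(issue), is_article_link(issue))      # True False
print(is_issue_link(article), is_article_link(article))  # False True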