
Commit

DallanQ committed Oct 20, 2023
2 parents 1d7263d + 8955567 commit 8a32715
Showing 4 changed files with 201 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -12,6 +12,7 @@
/.env.*
__pycache__/
/notebooks/.ipynb_checkpoints/
+/models/.ipynb_checkpoints/
/notebooks/wandb/
.idea/
.venv/
3 changes: 2 additions & 1 deletion README.md
@@ -44,7 +44,7 @@ PINECONE_ENV=your_pinecone_environment_name (found on API keys page)

`mkdir data`

-`aws s3 sync s3://iloveconference.data data`
+`aws s3 sync s3://scripturecentralqa.data data`

## Developing

@@ -59,6 +59,7 @@ Run `nox` before creating a pull request to ensure that all checks pass.
### Running notebooks

After running `poetry shell`, you need to install the poetry virtual environment as a jupyter kernel.

Let's name it "models": `python -m ipykernel install --user --name models`
You only need to do this once.
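
An aside, not part of this commit: a minimal check that the kernel registration worked, using the `jupyter_client` API (installed alongside `ipykernel`). It is equivalent to running `jupyter kernelspec list` and looking for a "models" entry.

```python
# Sanity check (not from the repo): confirm the "models" kernel is visible to Jupyter.
from jupyter_client.kernelspec import KernelSpecManager

print("models" in KernelSpecManager().find_kernel_specs())  # expect True
```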

8 changes: 4 additions & 4 deletions notebooks/05_conference_crawler.ipynb
@@ -42,10 +42,10 @@
"outputs": [],
"source": [
"# config\n",
"years = range(2023, 2024)\n",
"months = [10]\n",
"years = range(1971, 2024)\n",
"months = [4, 10]\n",
"host = 'https://www.churchofjesuschrist.org'\n",
"base_dir = '../data/load/raw'\n",
"base_dir = '../data/load/raw/conference'\n",
"bs_parser = 'html.parser'\n",
"delay_seconds = 30"
]
@@ -128,7 +128,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.11.6"
}
},
"nbformat": 4,
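The hunks above only touch the conference notebook's config cell and kernel metadata; the crawl loop itself sits in cells that are not shown. A rough sketch of how the widened `years`/`months` ranges presumably feed that loop (the general-conference URL pattern and loop structure here are assumptions, not part of the diff):

```python
# Illustrative sketch only -- the actual loop lives in unchanged cells of
# 05_conference_crawler.ipynb. `years`, `months`, `host`, `base_dir`, and
# `delay_seconds` come from the config cell above; the URL pattern is assumed.
from models.crawl_utils import get_page  # same helper the magazine notebook imports

for year in years:        # now 1971..2023 rather than only 2023
    for month in months:  # April and October sessions rather than October only
        index_url = f"{host}/study/general-conference/{year}/{month:02d}?lang=eng"
        status_code, html = get_page(index_url, delay_seconds)
        if status_code != 200:
            print(f"Status code={status_code} url={index_url}")
            continue
        # ...parse talk links out of `html` and save each talk under base_dir
```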
194 changes: 194 additions & 0 deletions notebooks/05_magazine_crawler.ipynb
@@ -0,0 +1,194 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "936d02dd",
"metadata": {},
"source": [
"# Crawl Magazines from the Church of Jesus Christ of Latter-day Saints"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fe5bf12",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4bca89a2",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from urllib.parse import urljoin, urlparse\n",
"\n",
"from bs4 import BeautifulSoup\n",
"from tqdm.auto import tqdm\n",
"\n",
"from models.crawl_utils import get_page, save_page"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e753397e",
"metadata": {},
"outputs": [],
"source": [
"# config\n",
"magazine_urls = [\n",
" 'https://www.churchofjesuschrist.org/study/magazines/liahona?lang=eng',\n",
" 'https://www.churchofjesuschrist.org/study/magazines/ya-weekly?lang=eng',\n",
" 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth?lang=eng',\n",
" 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth/new-era-19712020?lang=eng',\n",
" 'https://www.churchofjesuschrist.org/study/magazines/friend?lang=eng',\n",
" 'https://www.churchofjesuschrist.org/study/magazines/ensign-19712020?lang=eng',\n",
"]\n",
"base_dir = '../data/load/raw/magazines'\n",
"bs_parser = 'html.parser'\n",
"delay_seconds = 30"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af2c51a9",
"metadata": {},
"outputs": [],
"source": [
"def _is_issue_link(url: str) -> bool:\n",
" path_components = urlparse(url).path.split('/')\n",
" # print('is_issue_link', url, path_components)\n",
" if len(path_components) < 5:\n",
" return False\n",
" elif path_components[4] == 'new-era-19712020':\n",
" # new-era issue links must have 6 path components\n",
" return len(path_components) == 6\n",
" else:\n",
" # all other issue links must have 5 components (first component is empty)\n",
" return len(path_components) == 5\n",
"\n",
"\n",
"def get_issue_links(base_url, html):\n",
" soup = BeautifulSoup(html, bs_parser)\n",
" return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \\\n",
" if _is_issue_link(urljoin(base_url, a['href']))]\n",
"\n",
"def get_year_month_links(url, html):\n",
" links = get_issue_links(url, html)\n",
" year_month_links = []\n",
" for link in tqdm(links):\n",
" path_components = urlparse(link).path.split('/')\n",
" # print('link and components', link, path_components)\n",
" if len(path_components[-1]) == 2 or path_components[-1].endswith('-se'):\n",
" # year-month link\n",
" # print('year-month link', link)\n",
" year_month_links.append(link)\n",
" elif len(path_components[-1]) == 4:\n",
" # year_only_link\n",
" # print('year-only link', link)\n",
" status_code, html = get_page(link, delay_seconds)\n",
" if status_code != 200:\n",
" print(f\"Status code={status_code} url={link}\")\n",
" continue\n",
" new_links = get_issue_links(link, html)\n",
" for new_link in new_links:\n",
" # print('issue link', new_link)\n",
" year_month_links.append(new_link)\n",
" else:\n",
" print('unexpected link', link, path_components[-1]) \n",
" # TODO remove break\n",
" break\n",
" return year_month_links\n",
"\n",
"def _is_article_link(url: str) -> bool:\n",
" path_components = urlparse(url).path.split('/')\n",
" # # must be 6 or 7 components (first component is empty)\n",
" return (len(path_components) == 6 or len(path_components) == 7) and \\\n",
" path_components[-2] != 'new-era-19712020' and path_components[-1] != 'contents'\n",
"\n",
"\n",
"def get_article_links(base_url, html):\n",
" soup = BeautifulSoup(html, bs_parser)\n",
" return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \\\n",
" if _is_article_link(urljoin(base_url, a['href']))]\n",
"\n",
"\n",
"def get_article_path(url):\n",
" path_components = urlparse(url).path.split('/')\n",
" path = '_'.join(path_components[2:])\n",
" return os.path.join(base_dir, f\"{path}.json\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f5ebfa4",
"metadata": {},
"outputs": [],
"source": [
"for url in tqdm(magazine_urls):\n",
" status_code, html = get_page(url, delay_seconds)\n",
" if status_code != 200:\n",
" print(f\"Status code={status_code} url={url}\")\n",
" continue\n",
" year_month_links = get_year_month_links(url, html)\n",
" print('year-month-links', url, len(year_month_links))\n",
" for link in tqdm(year_month_links):\n",
" print('year-month link', link)\n",
" status_code, html = get_page(link, delay_seconds)\n",
" if status_code != 200:\n",
" print(f\"Status code={status_code} url={url}\")\n",
" continue \n",
" article_links = get_article_links(link, html)\n",
" for article_link in tqdm(article_links):\n",
" path = get_article_path(article_link)\n",
" # print('path', path, article_link)\n",
" if os.path.exists(path):\n",
" continue\n",
" print(\" \", path)\n",
" status_code, html = get_page(article_link, delay_seconds)\n",
" if status_code != 200:\n",
" print(f\"Status code={status_code} url={article_link}\")\n",
" continue\n",
" save_page(path, article_link, html)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6f58e52",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "models",
"language": "python",
"name": "models"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
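
A worked example of the path-component checks in the new notebook (the URLs are hypothetical; the real site's URL layout is an assumption here, chosen only to show how `urlparse(...).path.split('/')` drives `_is_issue_link`, `get_year_month_links`, `_is_article_link`, and `get_article_path`):

```python
from urllib.parse import urlparse

# Hypothetical issue URL: 5 path components (the first is the empty string),
# so _is_issue_link returns True; the 2-character last component '10' makes
# get_year_month_links treat it as a year-month link.
issue_url = 'https://www.churchofjesuschrist.org/study/liahona/2023/10?lang=eng'
print(urlparse(issue_url).path.split('/'))
# ['', 'study', 'liahona', '2023', '10']

# Hypothetical article URL: 6 components, the last is not 'contents' and the
# one before it is not 'new-era-19712020', so _is_article_link returns True.
article_url = 'https://www.churchofjesuschrist.org/study/liahona/2023/10/welcome?lang=eng'
print(urlparse(article_url).path.split('/'))
# ['', 'study', 'liahona', '2023', '10', 'welcome']

# get_article_path joins components[2:] with '_' under base_dir, giving
# '../data/load/raw/magazines/liahona_2023_10_welcome.json'
```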

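The notebook imports `get_page` and `save_page` from `models.crawl_utils`, a module this commit does not touch. A minimal sketch of what those helpers plausibly look like, inferred only from how the notebook calls them (the real implementation may differ):

```python
# Hypothetical sketch of models/crawl_utils.py -- not part of this commit.
import json
import os
import time

import requests


def get_page(url: str, delay_seconds: int = 0) -> tuple[int, str]:
    """Politely fetch a page: wait delay_seconds, then return (status_code, html)."""
    time.sleep(delay_seconds)
    response = requests.get(url, timeout=30)
    return response.status_code, response.text


def save_page(path: str, url: str, html: str) -> None:
    """Write the crawled page and its source URL to a JSON file at path."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump({"url": url, "html": html}, f)
```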