Skip to content

Commit

Permalink
Merge pull request #99 from scripturecentralqa/encyclopedia_review
Browse files Browse the repository at this point in the history
reviewed encyclopedia of mormonism document
  • Loading branch information
DallanQ authored Nov 21, 2023
2 parents 0fce62d + b474381 commit 923d6c9
Show file tree
Hide file tree
Showing 2 changed files with 334 additions and 1 deletion.
22 changes: 21 additions & 1 deletion models/load_encyclopedia.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,40 @@
"""Load encyclopedia."""
import re

from bs4 import BeautifulSoup
from langchain.schema.document import Document

from models.load_utils import clean
from models.load_utils import to_markdown


# Start of a markdown link labelled "A" that points at /index.php?...;
# load_encyclopedia passes it to remove_stuff_words, and page content from
# this marker onward is discarded.
# NOTE(review): presumably the first entry of an A-Z index footer - confirm.
line = "[A](/index.php?"


def remove_stuff_words(line: str, text: str) -> str:
    """Return ``text`` truncated just before the first occurrence of ``line``.

    Args:
        line: Literal marker text (not a regular expression) at which the
            page content is cut off.
        text: Page content to trim.

    Returns:
        The part of ``text`` that precedes the first occurrence of ``line``.
        ``text`` is returned unchanged when the marker is empty or does not
        occur.
    """
    if not line:
        # An empty marker would match at position 0 and wipe the whole page
        # (and str.partition rejects an empty separator), so treat it as
        # "nothing to cut".
        return text

    # partition() returns the whole string as its first element when the
    # marker is absent, which covers the "no match" case without a branch.
    return text.partition(line)[0]


def load_encyclopedia(url: str, html: str, bs_parser: str = "html.parser") -> Document:
    """Load encyclopedia from a url and html.

    Args:
        url: Address of the page; stored in the metadata and passed to
            ``to_markdown`` as ``base_url``.
        html: Raw HTML of the page.
        bs_parser: Name of the parser handed to BeautifulSoup.

    Returns:
        A ``Document`` whose ``page_content`` is the markdown rendering of the
        ``div.mw-parser-output`` element, cut off at the module-level ``line``
        marker, and whose metadata holds the url and the page title. Content
        and title fall back to empty strings when their elements are missing.
    """
    soup = BeautifulSoup(html, bs_parser)
    title = soup.find("span", class_="mw-page-title-main")
    body = soup.find("div", class_="mw-parser-output")
    content = clean(to_markdown(str(body), base_url=url)) if body else ""
    # Drop everything from the marker onward before storing the text.
    clean_content = remove_stuff_words(line, content)

    metadata = {
        "url": url,
        "title": clean(title) if title else "",
    }
    # Return the trimmed text; an earlier stale `return` of the untrimmed
    # `content` made this statement unreachable.
    return Document(page_content=clean_content, metadata=metadata)
Loading

0 comments on commit 923d6c9

Please sign in to comment.