Skip to content

Commit

Permalink
Merge pull request #102 from scripturecentralqa/review_dc_places
Browse files Browse the repository at this point in the history
Reviewed D&C Places Documents
  • Loading branch information
DallanQ authored Nov 21, 2023
2 parents 923d6c9 + 50b4a93 commit ccc315a
Show file tree
Hide file tree
Showing 2 changed files with 1,095 additions and 0 deletions.
11 changes: 11 additions & 0 deletions models/load_dc_places.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@
from models.load_utils import to_markdown


# Level-2 markdown heading consisting of a four-digit year or year range
# ("## 1830", "## 1830-1831"), optionally followed by whitespace and a rule
# of three or more dashes.  The negative lookbehind keeps the pattern from
# matching the tail of a deeper heading such as "### 1830", which would
# otherwise be left behind as a stray "#".
_YEAR_HEADER_RE = re.compile(r"(?<!#)## \d{4}(?:-\d{4})?(?:\s+-{3,})?")


def remove_year_headers(text: str) -> str:
    """Strip level-2 year headings from markdown text.

    Every occurrence of ``## YYYY`` or ``## YYYY-YYYY`` is removed, together
    with an immediately following run of whitespace plus three or more
    dashes when present.  Only the matched span is deleted; any other text
    on the same line and the surrounding newlines are left untouched.
    Headings of level 3 or deeper (``### 1830``) are not modified.

    Args:
        text: Markdown text to clean.

    Returns:
        The text with all matching year headings replaced by an empty string.
    """
    return _YEAR_HEADER_RE.sub("", text)


def places_clean(text: str) -> str:
"""Make key points a level 3 heading."""
text = clean(text)
Expand Down Expand Up @@ -52,6 +62,7 @@ def load_dc_places(url: str, html: str, bs_parser: str = "html.parser") -> Docum
):
text = places_clean(to_markdown(str(section), base_url=url)) if section else ""
text = replace_header_with_year(text)
text = remove_year_headers(text)
# print('text:',text)
if text == "## ":
continue
Expand Down
Loading

0 comments on commit ccc315a

Please sign in to comment.