From 542b952fd4bfca901454656700753f2fd3ddd9d0 Mon Sep 17 00:00:00 2001 From: Uwa Joseph Date: Tue, 21 Nov 2023 08:53:03 -0600 Subject: [PATCH 1/4] first commit for review_dc_places --- models/load_dc_places.py | 7 + tests/test_load_dc_places_extra.py | 1080 ++++++++++++++++++++++++++++ 2 files changed, 1087 insertions(+) create mode 100644 tests/test_load_dc_places_extra.py diff --git a/models/load_dc_places.py b/models/load_dc_places.py index 81a22f4..d4547b6 100644 --- a/models/load_dc_places.py +++ b/models/load_dc_places.py @@ -9,6 +9,13 @@ from models.load_utils import to_markdown +def remove_h2_content(html_content): + """Remove date in h2 tag.""" + pattern = re.compile(r'

(.*?)

', re.DOTALL) + modified_html = re.sub(pattern, '

', html_content) + return modified_html + + def places_clean(text: str) -> str: """Make key points a level 3 heading.""" text = clean(text) diff --git a/tests/test_load_dc_places_extra.py b/tests/test_load_dc_places_extra.py new file mode 100644 index 0000000..0278835 --- /dev/null +++ b/tests/test_load_dc_places_extra.py @@ -0,0 +1,1080 @@ +"""Test cases for the load_dc_places module.""" +# flake8: noqa + +from models.load_dc_places import load_dc_places + + +html = """ + + + + + + + + + + Adam-ondi-Ahman, Missouri | Doctrine and Covenants Central + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+
+
+
+
+
+ + +
+
+
+ +
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + +
+
+
+
+
+
+
+
+
+
+
+
+

/ Places of the D&C / Adam-ondi-Ahman, Missouri

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

Adam-ondi-Ahman, Missouri

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

1838

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

Photo Credit: Kenneth Mays, 2008

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

Significant Events At a Glance

+
+ +
+
+
+
+
+
+
+
+

D&C Sections Received Here

+
+
+
+

(Click for Section Study Helps)

+
+
+
+

(Tap for Section Study Helps)

+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+ +
+
+
+
+
+
+ +
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

Adam-ondi-Ahman

+
+
+
+

Key Points of Interest

+
+ +
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+

Directions to Adam-ondi-Ahman, Missouri

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +""" + + +def test_load_dc_places_extra() -> None: + """It returns a valid Document for a conference talk.""" + url = "https://doctrineandcovenantscentral.org/adam-ondi-ahman-missouri/" + result = load_dc_places(url, html) + assert len(result.page_content) > 0 + assert result.metadata["url"] == url + assert result.metadata["title"] == "Places of the D&C / Adam-ondi-Ahman, Missouri" + assert result.page_content.startswith("## 1838") + + +# assert result.page_content.endswith("abandoned") From 58d0733c7704da0650b531fad1d8c095e3156fd9 Mon Sep 17 00:00:00 2001 From: Gennecis Date: Tue, 21 Nov 2023 18:51:44 +0100 Subject: [PATCH 2/4] reviewed dc_places docs --- models/load_dc_places.py | 14 +++++++++----- tests/test_load_dc_places_extra.py | 8 ++++++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/models/load_dc_places.py b/models/load_dc_places.py index d4547b6..8f573a0 100644 --- a/models/load_dc_places.py +++ b/models/load_dc_places.py @@ -9,11 +9,14 @@ from models.load_utils import to_markdown -def remove_h2_content(html_content): - """Remove date in h2 tag.""" - pattern = re.compile(r'

(.*?)

', re.DOTALL) - modified_html = re.sub(pattern, '

', html_content) - return modified_html +def remove_year_headers(text): + """Define the regular expression pattern.""" + pattern = r"## \d{4}(-\d{4})?(\s+-{3,})?" + + # Use re.sub to replace matches with an empty string + cleaned_text = re.sub(pattern, "", text) + + return cleaned_text def places_clean(text: str) -> str: @@ -59,6 +62,7 @@ def load_dc_places(url: str, html: str, bs_parser: str = "html.parser") -> Docum ): text = places_clean(to_markdown(str(section), base_url=url)) if section else "" text = replace_header_with_year(text) + text = remove_year_headers(text) # print('text:',text) if text == "## ": continue diff --git a/tests/test_load_dc_places_extra.py b/tests/test_load_dc_places_extra.py index 0278835..6bb15b8 100644 --- a/tests/test_load_dc_places_extra.py +++ b/tests/test_load_dc_places_extra.py @@ -2,6 +2,7 @@ # flake8: noqa from models.load_dc_places import load_dc_places +from models.load_dc_places import remove_year_headers html = """ @@ -1074,7 +1075,10 @@ def test_load_dc_places_extra() -> None: assert len(result.page_content) > 0 assert result.metadata["url"] == url assert result.metadata["title"] == "Places of the D&C / Adam-ondi-Ahman, Missouri" - assert result.page_content.startswith("## 1838") + assert not result.page_content.startswith("## 1838") -# assert result.page_content.endswith("abandoned") +def test_remove_year_headers(): + text = "## 1883" + cleaned_text = remove_year_headers(text) + assert cleaned_text == "" From da292a032ba23be7b48c125402a61bd68b552eb3 Mon Sep 17 00:00:00 2001 From: Gennecis Date: Tue, 21 Nov 2023 18:59:57 +0100 Subject: [PATCH 3/4] fixed mypy errors --- models/load_dc_places.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/load_dc_places.py b/models/load_dc_places.py index 8f573a0..482a600 100644 --- a/models/load_dc_places.py +++ b/models/load_dc_places.py @@ -9,7 +9,7 @@ from models.load_utils import to_markdown -def remove_year_headers(text): +def remove_year_headers(text: str) -> str: """Define the regular expression pattern.""" pattern = r"## \d{4}(-\d{4})?(\s+-{3,})?" From 50b4a935d6a041e2ffd35c889ac962e2998780c7 Mon Sep 17 00:00:00 2001 From: Gennecis Date: Tue, 21 Nov 2023 19:45:42 +0100 Subject: [PATCH 4/4] fixed return type error --- tests/test_load_dc_places_extra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_load_dc_places_extra.py b/tests/test_load_dc_places_extra.py index 6bb15b8..fdd58de 100644 --- a/tests/test_load_dc_places_extra.py +++ b/tests/test_load_dc_places_extra.py @@ -1078,7 +1078,7 @@ def test_load_dc_places_extra() -> None: assert not result.page_content.startswith("## 1838") -def test_remove_year_headers(): +def test_remove_year_headers() -> None: text = "## 1883" cleaned_text = remove_year_headers(text) assert cleaned_text == ""