Link lists must start at the beginning of a line

Blake-Madden · Jun 12, 2024 · 40bc19d · 40bc19d
1 parent 1b9d66b
commit 40bc19d
Show file tree

Hide file tree

Showing 2 changed files with 145 additions and 104 deletions.
diff --git a/src/import/html_extract_text.cpp b/src/import/html_extract_text.cpp
@@ -915,6 +915,18 @@ namespace lily_of_the_valley
         constexpr int MAX_CONTENT_BETWEEN_LINK_LIST_LINKS{ 3 };
         std::vector<size_t> linkListPositions;
         std::vector<size_t> linkListPositionsEnds;
+        const auto isAtStartOfLine = [this]() // reports whether the filtered buffer currently ends at the start of a line
+        {
+            const size_t lastNotHSpace = get_filtered_buffer().find_last_not_of(L" \t"); // index of the last character that is not a space or tab
+            if (lastNotHSpace != std::wstring::npos)
+                {
+                return is_either(get_filtered_buffer()[lastNotHSpace], L'\n', L'\r'); // presumably true when that character is a line feed or carriage return (is_either is defined elsewhere)
+                }
+            else
+                {
+                return true; // nothing but spaces/tabs (or nothing at all) in the buffer, so treat it as the start of a line
+                }
+        };
 
         while (start && (start < endSentinel))
             {
@@ -925,36 +937,40 @@ namespace lily_of_the_valley
             // (which isn't valid HTML, but you never know)
             if (currentElement == L"a" && previousElement != L"a")
                 {
-                ++consecutiveAHrefs;
-                linkListPositions.push_back(get_filtered_buffer().length());
-                // Review what is between the previous anchor end and this new one.
-                if (linkListPositions.size() > 0 && linkListPositionsEnds.size() > 0)
+                // the first link in the list must start at the beginning of a new line
+                if (!(consecutiveAHrefs == 0 && !isAtStartOfLine()))
                     {
-                    const std::wstring_view previousRead =
-                        string_util::trim_view(std::wstring_view{ get_filtered_buffer() }.substr(
-                            linkListPositionsEnds.back(),
-                            linkListPositions.back() - linkListPositionsEnds.back()));
-                    if (!previousRead.empty())
+                    ++consecutiveAHrefs;
+                    linkListPositions.push_back(get_filtered_buffer().length());
+                    // Review what is between the previous anchor end and this new one.
+                    if (linkListPositions.size() > 0 && linkListPositionsEnds.size() > 0)
                         {
-                        // too much content between end of link and start of next one...
-                        if (previousRead.length() > MAX_CONTENT_BETWEEN_LINK_LIST_LINKS)
+                        const std::wstring_view previousRead = string_util::trim_view(
+                            std::wstring_view{ get_filtered_buffer() }.substr(
+                                linkListPositionsEnds.back(),
+                                linkListPositions.back() - linkListPositionsEnds.back()));
+                        if (!previousRead.empty())
                             {
-                            consecutiveAHrefs = 0;
-                            linkListPositions.clear();
-                            linkListPositionsEnds.clear();
-                            }
-                        else
-                            {
-                            // ...or if anything other than spaces or punctuation between the links,
-                            // then this isn't a link list
-                            for (const auto chr : previousRead)
+                            // too much content between end of link and start of next one...
+                            if (previousRead.length() > MAX_CONTENT_BETWEEN_LINK_LIST_LINKS)
+                                {
+                                consecutiveAHrefs = 0;
+                                linkListPositions.clear();
+                                linkListPositionsEnds.clear();
+                                }
+                            else
                                 {
-                                if (!(std::iswpunct(chr) || std::iswspace(chr)))
+                                // ...or if anything other than spaces or punctuation between the
+                                // links, then this isn't a link list
+                                for (const auto chr : previousRead)
                                     {
-                                    consecutiveAHrefs = 0;
-                                    linkListPositions.clear();
-                                    linkListPositionsEnds.clear();
-                                    break;
+                                    if (!(std::iswpunct(chr) || std::iswspace(chr)))
+                                        {
+                                        consecutiveAHrefs = 0;
+                                        linkListPositions.clear();
+                                        linkListPositionsEnds.clear();
+                                        break;
+                                        }
                                     }
                                 }
                             }

diff --git a/tests/htmlimporttests.cpp b/tests/htmlimporttests.cpp
@@ -264,6 +264,110 @@ TEST_CASE("HTML parser tags", "[html import]")
         }
     }
 
+TEST_CASE("HTML Parser Link Lists", "[html import]") // checks how html_extract_text renders runs of consecutive anchors
+    {
+    SECTION("Link list with break")
+        {
+        html_extract_text filter_html;
+        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><br ><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\n\tEmail, \n\tMail, \n\tCall 555-5555" });
+        }
+    SECTION("Link list with image")
+        {
+        html_extract_text filter_html;
+        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><img src='flower.png'><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" });
+        }
+    SECTION("Link list at beginning") // anchors are the very first content, so the buffer is empty when the list starts
+        {
+        html_extract_text filter_html;
+        const wchar_t* text = L"<a href=''>Prayer Card</a><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" });
+        }
+    SECTION("Link list at beginning with spaces") // only spaces and a tab precede the first anchor
+        {
+        html_extract_text filter_html;
+        const wchar_t* text = L"   \t<a href=''>Prayer Card</a><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"   \t\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" });
+        }
+    SECTION("Link list")
+        {
+        html_extract_text filter_html;
+        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" });
+        }
+    SECTION("Link list breaks not at start of line") // first anchor follows text on the same line; expected output has no newline+tab formatting
+        {
+        html_extract_text filter_html;
+        const wchar_t* text = L"<p>Contact us by <a href=''>Prayer Card</a><a href=''> Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555</p>";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\nContact us by Prayer Card Email, Mail, Call 555-5555\n\n" });
+        }
+    SECTION("Link list lots of spaces")
+        {
+        html_extract_text filter_html;
+        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><a href=''>Email</a>      ,        <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail      ,        \n\tMail, \n\tCall 555-5555" });
+        }
+    SECTION("Link list with trailing content")
+        {
+        html_extract_text filter_html;
+        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a> <a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555<p>Some more content</p>";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card \n\tEmail, \n\tMail, \n\tCall 555-5555\n\nSome more content\n\n" });
+        }
+    SECTION("Link list empty")
+        {
+        html_extract_text filter_html;
+        const wchar_t* text = L"<p>Contact:</p><a href=''></a><a href=''></a><a href=''></a><a href=''></a>";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t" });
+        }
+    SECTION("Link list empty trailing content")
+        {
+        html_extract_text filter_html;
+        const wchar_t* text = L"<p>Contact:</p><a href=''></a><a href=''></a><a href=''></a><a href=''></a><p>Some more content</p>";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t\n\nSome more content\n\n" });
+        }
+    SECTION("Link list breaks, overlapping anchors")
+        {
+        html_extract_text filter_html;
+        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card<a href=''> Email</a></a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555<p>Some more content</p>";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card Email, Mail, Call 555-5555\n\nSome more content\n\n" });
+        }
+    SECTION("Link list breaks, not enough links")
+        {
+        // needs 4 links, only has 2
+        html_extract_text filter_html;
+        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a>, <a href=''>Call</a> 555-5555";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card, Call 555-5555" });
+        }
+    SECTION("Link list breaks from extra text content")
+        {
+        // text content between links causes them to not be a link list
+        html_extract_text filter_html;
+        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a> (extras available!) <a href=''>Email</a> <a href=''>Mail</a> <a href=''>Call</a> 555-5555";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card (extras available!) Email Mail Call 555-5555" });
+        }
+    SECTION("Link list breaks from too wide extra content") // ", ||" between two anchors; expected output has no link-list formatting
+        {
+        html_extract_text filter_html;
+        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a> <a href=''>Email</a>, ||<a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
+        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
+        CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card Email, ||Mail, Call 555-5555" });
+        }
+    }
+
 TEST_CASE("HTML Parser", "[html import]")
     {
     SECTION("Find Bookmark")
@@ -812,85 +916,6 @@ TEST_CASE("HTML Parser", "[html import]")
         p = filter_html(text, std::wcslen(text), true, false);
         CHECK(std::wcscmp(p, L"Contact 555-5555 for details.") == 0);
         }
-    SECTION("Link list with break")
-        {
-        html_extract_text filter_html;
-        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><br ><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
-        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
-        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\n\tEmail, \n\tMail, \n\tCall 555-5555" });
-        }
-    SECTION("Link list with image")
-        {
-        html_extract_text filter_html;
-        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><img src='flower.png'><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
-        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
-        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" });
-        }
-    SECTION("Link list")
-        {
-        html_extract_text filter_html;
-        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
-        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
-        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" });
-        }
-    SECTION("Link list lots of spaces")
-        {
-        html_extract_text filter_html;
-        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><a href=''>Email</a>      ,        <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
-        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
-        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail      ,        \n\tMail, \n\tCall 555-5555" });
-        }
-    SECTION("Link list with trailing content")
-        {
-        html_extract_text filter_html;
-        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a> <a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555<p>Some more content</p>";
-        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
-        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card \n\tEmail, \n\tMail, \n\tCall 555-5555\n\nSome more content\n\n" });
-        }
-    SECTION("Link list empty")
-        {
-        html_extract_text filter_html;
-        const wchar_t* text = L"<p>Contact:</p><a href=''></a><a href=''></a><a href=''></a><a href=''></a>";
-        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
-        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t" });
-        }
-    SECTION("Link list empty trailing content")
-        {
-        html_extract_text filter_html;
-        const wchar_t* text = L"<p>Contact:</p><a href=''></a><a href=''></a><a href=''></a><a href=''></a><p>Some more content</p>";
-        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
-        CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t\n\nSome more content\n\n" });
-        }
-    SECTION("Link list breaks, overlapping anchors")
-        {
-        html_extract_text filter_html;
-        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card<a href=''> Email</a></a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555<p>Some more content</p>";
-        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
-        CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card Email, Mail, Call 555-5555\n\nSome more content\n\n" });
-        }
-    SECTION("Link list breaks, not enough links")
-        {
-        // needs 4 links, only has 3
-        html_extract_text filter_html;
-        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a>, <a href=''>Call</a> 555-5555";
-        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
-        CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card, Call 555-5555" });
-        }
-    SECTION("Link list breaks from extra text content")
-        {
-        // text content between links causes them to not be a link list
-        html_extract_text filter_html;
-        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a> (extras available!) <a href=''>Email</a> <a href=''>Mail</a> <a href=''>Call</a> 555-5555";
-        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
-        CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card (extras available!) Email Mail Call 555-5555" });
-        }
-    SECTION("Link list breaks from too wide extra content")
-        {
-        html_extract_text filter_html;
-        const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a> <a href=''>Email</a>, ||<a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
-        const std::wstring res = filter_html(text, std::wcslen(text), true, false);
-        CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card Email, ||Mail, Call 555-5555" });
-        }
     SECTION("Template placeHolders")
         {
         html_extract_text filter_html;