diff --git a/src/import/html_extract_text.cpp b/src/import/html_extract_text.cpp index d4e14c47..6f507d88 100644 --- a/src/import/html_extract_text.cpp +++ b/src/import/html_extract_text.cpp @@ -915,6 +915,18 @@ namespace lily_of_the_valley constexpr int MAX_CONTENT_BETWEEN_LINK_LIST_LINKS{ 3 }; std::vector linkListPositions; std::vector linkListPositionsEnds; + const auto isAtStartOfLine = [this]() + { + const size_t lastNotHSpace = get_filtered_buffer().find_last_not_of(L" \t"); + if (lastNotHSpace != std::wstring::npos) + { + return is_either(get_filtered_buffer()[lastNotHSpace], L'\n', L'\r'); + } + else + { + return true; + } + }; while (start && (start < endSentinel)) { @@ -925,36 +937,40 @@ namespace lily_of_the_valley // (which isn't valid HTML, but you never know) if (currentElement == L"a" && previousElement != L"a") { - ++consecutiveAHrefs; - linkListPositions.push_back(get_filtered_buffer().length()); - // Review what is between the previous anchor end and this new one. - if (linkListPositions.size() > 0 && linkListPositionsEnds.size() > 0) + // the first link in the list must start at the beginning of a new line + if (!(consecutiveAHrefs == 0 && !isAtStartOfLine())) { - const std::wstring_view previousRead = - string_util::trim_view(std::wstring_view{ get_filtered_buffer() }.substr( - linkListPositionsEnds.back(), - linkListPositions.back() - linkListPositionsEnds.back())); - if (!previousRead.empty()) + ++consecutiveAHrefs; + linkListPositions.push_back(get_filtered_buffer().length()); + // Review what is between the previous anchor end and this new one. + if (linkListPositions.size() > 0 && linkListPositionsEnds.size() > 0) { - // too much content between end of link and start of next one... - if (previousRead.length() > MAX_CONTENT_BETWEEN_LINK_LIST_LINKS) + const std::wstring_view previousRead = string_util::trim_view( + std::wstring_view{ get_filtered_buffer() }.substr( + linkListPositionsEnds.back(), + linkListPositions.back() - linkListPositionsEnds.back())); + if (!previousRead.empty()) { - consecutiveAHrefs = 0; - linkListPositions.clear(); - linkListPositionsEnds.clear(); - } - else - { - // ...or if anything other than spaces or punctuation between the links, - // then this isn't a link list - for (const auto chr : previousRead) + // too much content between end of link and start of next one... + if (previousRead.length() > MAX_CONTENT_BETWEEN_LINK_LIST_LINKS) + { + consecutiveAHrefs = 0; + linkListPositions.clear(); + linkListPositionsEnds.clear(); + } + else { - if (!(std::iswpunct(chr) || std::iswspace(chr))) + // ...or if anything other than spaces or punctuation between the + // links, then this isn't a link list + for (const auto chr : previousRead) { - consecutiveAHrefs = 0; - linkListPositions.clear(); - linkListPositionsEnds.clear(); - break; + if (!(std::iswpunct(chr) || std::iswspace(chr))) + { + consecutiveAHrefs = 0; + linkListPositions.clear(); + linkListPositionsEnds.clear(); + break; + } } } } diff --git a/tests/htmlimporttests.cpp b/tests/htmlimporttests.cpp index b27dc4c4..15a85983 100644 --- a/tests/htmlimporttests.cpp +++ b/tests/htmlimporttests.cpp @@ -264,6 +264,110 @@ TEST_CASE("HTML parser tags", "[html import]") } } +TEST_CASE("HTML Parser Link Lists", "[html import]") + { + SECTION("Link list with break") + { + html_extract_text filter_html; + const wchar_t* text = L"

Contact:

Prayer Card
Email, Mail, Call 555-5555"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\n\tEmail, \n\tMail, \n\tCall 555-5555" }); + } + SECTION("Link list with image") + { + html_extract_text filter_html; + const wchar_t* text = L"

Contact:

Prayer CardEmail, Mail, Call 555-5555"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" }); + } + SECTION("Link list at beginning") + { + html_extract_text filter_html; + const wchar_t* text = L"Prayer CardEmail, Mail, Call 555-5555"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" }); + } + SECTION("Link list at beginning with spaces") + { + html_extract_text filter_html; + const wchar_t* text = L" \tPrayer CardEmail, Mail, Call 555-5555"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L" \t\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" }); + } + SECTION("Link list") + { + html_extract_text filter_html; + const wchar_t* text = L"

Contact:

Prayer CardEmail, Mail, Call 555-5555"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" }); + } + SECTION("Link list breaks not at start of line") + { + html_extract_text filter_html; + const wchar_t* text = L"

Contact us by Prayer Card Email, Mail, Call 555-5555

"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\nContact us by Prayer Card Email, Mail, Call 555-5555\n\n" }); + } + SECTION("Link list lots of spaces") + { + html_extract_text filter_html; + const wchar_t* text = L"

Contact:

Prayer CardEmail , Mail, Call 555-5555"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail , \n\tMail, \n\tCall 555-5555" }); + } + SECTION("Link list with trailing content") + { + html_extract_text filter_html; + const wchar_t* text = L"

Contact:

Prayer Card Email, Mail, Call 555-5555

Some more content

"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card \n\tEmail, \n\tMail, \n\tCall 555-5555\n\nSome more content\n\n" }); + } + SECTION("Link list empty") + { + html_extract_text filter_html; + const wchar_t* text = L"

Contact:

"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t" }); + } + SECTION("Link list empty trailing content") + { + html_extract_text filter_html; + const wchar_t* text = L"

Contact:

Some more content

"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t\n\nSome more content\n\n" }); + } + SECTION("Link list breaks, overlapping anchors") + { + html_extract_text filter_html; + const wchar_t* text = L"

Contact:

Prayer Card Email, Mail, Call 555-5555

Some more content

"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card Email, Mail, Call 555-5555\n\nSome more content\n\n" }); + } + SECTION("Link list breaks, not enough links") + { + // needs 4 links, only has 3 + html_extract_text filter_html; + const wchar_t* text = L"

Contact:

Prayer Card, Call 555-5555"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card, Call 555-5555" }); + } + SECTION("Link list breaks from extra text content") + { + // text content between links causes them to not be a link list + html_extract_text filter_html; + const wchar_t* text = L"

Contact:

Prayer Card (extras available!) Email Mail Call 555-5555"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card (extras available!) Email Mail Call 555-5555" }); + } + SECTION("Link list breaks from too wide extra content") + { + html_extract_text filter_html; + const wchar_t* text = L"

Contact:

Prayer Card Email, ||Mail, Call 555-5555"; + const std::wstring res = filter_html(text, std::wcslen(text), true, false); + CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card Email, ||Mail, Call 555-5555" }); + } + } + TEST_CASE("HTML Parser", "[html import]") { SECTION("Find Bookmark") @@ -812,85 +916,6 @@ TEST_CASE("HTML Parser", "[html import]") p = filter_html(text, std::wcslen(text), true, false); CHECK(std::wcscmp(p, L"Contact 555-5555 for details.") == 0); } - SECTION("Link list with break") - { - html_extract_text filter_html; - const wchar_t* text = L"

Contact:

Prayer Card
Email, Mail, Call 555-5555"; - const std::wstring res = filter_html(text, std::wcslen(text), true, false); - CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\n\tEmail, \n\tMail, \n\tCall 555-5555" }); - } - SECTION("Link list with image") - { - html_extract_text filter_html; - const wchar_t* text = L"

Contact:

Prayer CardEmail, Mail, Call 555-5555"; - const std::wstring res = filter_html(text, std::wcslen(text), true, false); - CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" }); - } - SECTION("Link list") - { - html_extract_text filter_html; - const wchar_t* text = L"

Contact:

Prayer CardEmail, Mail, Call 555-5555"; - const std::wstring res = filter_html(text, std::wcslen(text), true, false); - CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" }); - } - SECTION("Link list lots of spaces") - { - html_extract_text filter_html; - const wchar_t* text = L"

Contact:

Prayer CardEmail , Mail, Call 555-5555"; - const std::wstring res = filter_html(text, std::wcslen(text), true, false); - CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail , \n\tMail, \n\tCall 555-5555" }); - } - SECTION("Link list with trailing content") - { - html_extract_text filter_html; - const wchar_t* text = L"

Contact:

Prayer Card Email, Mail, Call 555-5555

Some more content

"; - const std::wstring res = filter_html(text, std::wcslen(text), true, false); - CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card \n\tEmail, \n\tMail, \n\tCall 555-5555\n\nSome more content\n\n" }); - } - SECTION("Link list empty") - { - html_extract_text filter_html; - const wchar_t* text = L"

Contact:

"; - const std::wstring res = filter_html(text, std::wcslen(text), true, false); - CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t" }); - } - SECTION("Link list empty trailing content") - { - html_extract_text filter_html; - const wchar_t* text = L"

Contact:

Some more content

"; - const std::wstring res = filter_html(text, std::wcslen(text), true, false); - CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t\n\nSome more content\n\n" }); - } - SECTION("Link list breaks, overlapping anchors") - { - html_extract_text filter_html; - const wchar_t* text = L"

Contact:

Prayer Card Email, Mail, Call 555-5555

Some more content

"; - const std::wstring res = filter_html(text, std::wcslen(text), true, false); - CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card Email, Mail, Call 555-5555\n\nSome more content\n\n" }); - } - SECTION("Link list breaks, not enough links") - { - // needs 4 links, only has 3 - html_extract_text filter_html; - const wchar_t* text = L"

Contact:

Prayer Card, Call 555-5555"; - const std::wstring res = filter_html(text, std::wcslen(text), true, false); - CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card, Call 555-5555" }); - } - SECTION("Link list breaks from extra text content") - { - // text content between links causes them to not be a link list - html_extract_text filter_html; - const wchar_t* text = L"

Contact:

Prayer Card (extras available!) Email Mail Call 555-5555"; - const std::wstring res = filter_html(text, std::wcslen(text), true, false); - CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card (extras available!) Email Mail Call 555-5555" }); - } - SECTION("Link list breaks from too wide extra content") - { - html_extract_text filter_html; - const wchar_t* text = L"

Contact:

Prayer Card Email, ||Mail, Call 555-5555"; - const std::wstring res = filter_html(text, std::wcslen(text), true, false); - CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card Email, ||Mail, Call 555-5555" }); - } SECTION("Template placeHolders") { html_extract_text filter_html;