Skip to content

Commit

Permalink
Link lists must start at the beginning of a line
Browse files Browse the repository at this point in the history
  • Loading branch information
Blake-Madden committed Jun 12, 2024
1 parent 1b9d66b commit 40bc19d
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 104 deletions.
66 changes: 41 additions & 25 deletions src/import/html_extract_text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -915,6 +915,18 @@ namespace lily_of_the_valley
constexpr int MAX_CONTENT_BETWEEN_LINK_LIST_LINKS{ 3 };
std::vector<size_t> linkListPositions;
std::vector<size_t> linkListPositionsEnds;
// Reports whether the end of the filtered (output) buffer is at the start of a line.
// Trailing spaces and tabs are skipped; if nothing else remains, the position counts
// as the start of a line. Otherwise, the last remaining character is passed to
// is_either() along with L'\n' and L'\r' (presumably an equality test against
// either value -- confirm against is_either's definition).
const auto isAtStartOfLine = [this]()
{
    const auto& filteredText = get_filtered_buffer();
    const size_t lastVisiblePos = filteredText.find_last_not_of(L" \t");
    // buffer is empty or holds only horizontal whitespace
    if (lastVisiblePos == std::wstring::npos)
    {
        return true;
    }
    return is_either(filteredText[lastVisiblePos], L'\n', L'\r');
};

while (start && (start < endSentinel))
{
Expand All @@ -925,36 +937,40 @@ namespace lily_of_the_valley
// (which isn't valid HTML, but you never know)
if (currentElement == L"a" && previousElement != L"a")
{
++consecutiveAHrefs;
linkListPositions.push_back(get_filtered_buffer().length());
// Review what is between the previous anchor end and this new one.
if (linkListPositions.size() > 0 && linkListPositionsEnds.size() > 0)
// the first link in the list must start at the beginning of a new line
if (!(consecutiveAHrefs == 0 && !isAtStartOfLine()))
{
const std::wstring_view previousRead =
string_util::trim_view(std::wstring_view{ get_filtered_buffer() }.substr(
linkListPositionsEnds.back(),
linkListPositions.back() - linkListPositionsEnds.back()));
if (!previousRead.empty())
++consecutiveAHrefs;
linkListPositions.push_back(get_filtered_buffer().length());
// Review what is between the previous anchor end and this new one.
if (linkListPositions.size() > 0 && linkListPositionsEnds.size() > 0)
{
// too much content between end of link and start of next one...
if (previousRead.length() > MAX_CONTENT_BETWEEN_LINK_LIST_LINKS)
const std::wstring_view previousRead = string_util::trim_view(
std::wstring_view{ get_filtered_buffer() }.substr(
linkListPositionsEnds.back(),
linkListPositions.back() - linkListPositionsEnds.back()));
if (!previousRead.empty())
{
consecutiveAHrefs = 0;
linkListPositions.clear();
linkListPositionsEnds.clear();
}
else
{
// ...or if anything other than spaces or punctuation between the links,
// then this isn't a link list
for (const auto chr : previousRead)
// too much content between end of link and start of next one...
if (previousRead.length() > MAX_CONTENT_BETWEEN_LINK_LIST_LINKS)
{
consecutiveAHrefs = 0;
linkListPositions.clear();
linkListPositionsEnds.clear();
}
else
{
if (!(std::iswpunct(chr) || std::iswspace(chr)))
// ...or if anything other than spaces or punctuation between the
// links, then this isn't a link list
for (const auto chr : previousRead)
{
consecutiveAHrefs = 0;
linkListPositions.clear();
linkListPositionsEnds.clear();
break;
if (!(std::iswpunct(chr) || std::iswspace(chr)))
{
consecutiveAHrefs = 0;
linkListPositions.clear();
linkListPositionsEnds.clear();
break;
}
}
}
}
Expand Down
183 changes: 104 additions & 79 deletions tests/htmlimporttests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,110 @@ TEST_CASE("HTML parser tags", "[html import]")
}
}

// Exercises how html_extract_text formats runs of consecutive anchors ("link lists").
// Each section feeds one HTML snippet through a fresh extractor and compares the
// extracted text against the exact expected output.
TEST_CASE("HTML Parser Link Lists", "[html import]")
{
    // a <br> between the first two links adds an extra newline to the output
    SECTION("Link list with break")
    {
        html_extract_text extractor;
        const wchar_t* html = L"<p>Contact:</p><a href=''>Prayer Card</a><br ><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\nContact:\n\n\n\tPrayer Card\n\n\tEmail, \n\tMail, \n\tCall 555-5555" };
        CHECK(extracted == expected);
    }
    // an <img> between the first two links contributes nothing to the output
    SECTION("Link list with image")
    {
        html_extract_text extractor;
        const wchar_t* html = L"<p>Contact:</p><a href=''>Prayer Card</a><img src='flower.png'><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" };
        CHECK(extracted == expected);
    }
    // links are the very first content in the input
    SECTION("Link list at beginning")
    {
        html_extract_text extractor;
        const wchar_t* html = L"<a href=''>Prayer Card</a><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" };
        CHECK(extracted == expected);
    }
    // only a space and a tab precede the first link
    SECTION("Link list at beginning with spaces")
    {
        html_extract_text extractor;
        const wchar_t* html = L" \t<a href=''>Prayer Card</a><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L" \t\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" };
        CHECK(extracted == expected);
    }
    // baseline: four links following a paragraph
    SECTION("Link list")
    {
        html_extract_text extractor;
        const wchar_t* html = L"<p>Contact:</p><a href=''>Prayer Card</a><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" };
        CHECK(extracted == expected);
    }
    // the first link follows text on the same line, so no list formatting is expected
    SECTION("Link list breaks not at start of line")
    {
        html_extract_text extractor;
        const wchar_t* html = L"<p>Contact us by <a href=''>Prayer Card</a><a href=''> Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555</p>";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\nContact us by Prayer Card Email, Mail, Call 555-5555\n\n" };
        CHECK(extracted == expected);
    }
    // whitespace around the separator between links still yields a list
    SECTION("Link list lots of spaces")
    {
        html_extract_text extractor;
        const wchar_t* html = L"<p>Contact:</p><a href=''>Prayer Card</a><a href=''>Email</a> , <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail , \n\tMail, \n\tCall 555-5555" };
        CHECK(extracted == expected);
    }
    // a paragraph follows the last link
    SECTION("Link list with trailing content")
    {
        html_extract_text extractor;
        const wchar_t* html = L"<p>Contact:</p><a href=''>Prayer Card</a> <a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555<p>Some more content</p>";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\nContact:\n\n\n\tPrayer Card \n\tEmail, \n\tMail, \n\tCall 555-5555\n\nSome more content\n\n" };
        CHECK(extracted == expected);
    }
    // anchors without any text content
    SECTION("Link list empty")
    {
        html_extract_text extractor;
        const wchar_t* html = L"<p>Contact:</p><a href=''></a><a href=''></a><a href=''></a><a href=''></a>";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t" };
        CHECK(extracted == expected);
    }
    // anchors without any text content, followed by a paragraph
    SECTION("Link list empty trailing content")
    {
        html_extract_text extractor;
        const wchar_t* html = L"<p>Contact:</p><a href=''></a><a href=''></a><a href=''></a><a href=''></a><p>Some more content</p>";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t\n\nSome more content\n\n" };
        CHECK(extracted == expected);
    }
    // an anchor nested inside another anchor; no list formatting is expected
    SECTION("Link list breaks, overlapping anchors")
    {
        html_extract_text extractor;
        const wchar_t* html = L"<p>Contact:</p><a href=''>Prayer Card<a href=''> Email</a></a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555<p>Some more content</p>";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\nContact:\n\nPrayer Card Email, Mail, Call 555-5555\n\nSome more content\n\n" };
        CHECK(extracted == expected);
    }
    SECTION("Link list breaks, not enough links")
    {
        // too few links to be formatted as a link list (this input only has 2)
        html_extract_text extractor;
        const wchar_t* html = L"<p>Contact:</p><a href=''>Prayer Card</a>, <a href=''>Call</a> 555-5555";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\nContact:\n\nPrayer Card, Call 555-5555" };
        CHECK(extracted == expected);
    }
    SECTION("Link list breaks from extra text content")
    {
        // regular text between the links prevents them from being a link list
        html_extract_text extractor;
        const wchar_t* html = L"<p>Contact:</p><a href=''>Prayer Card</a> (extras available!) <a href=''>Email</a> <a href=''>Mail</a> <a href=''>Call</a> 555-5555";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\nContact:\n\nPrayer Card (extras available!) Email Mail Call 555-5555" };
        CHECK(extracted == expected);
    }
    // the ", ||" separator is only punctuation and space, but the output has no list formatting
    SECTION("Link list breaks from too wide extra content")
    {
        html_extract_text extractor;
        const wchar_t* html = L"<p>Contact:</p><a href=''>Prayer Card</a> <a href=''>Email</a>, ||<a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
        const std::wstring extracted = extractor(html, std::wcslen(html), true, false);
        const std::wstring expected{ L"\n\nContact:\n\nPrayer Card Email, ||Mail, Call 555-5555" };
        CHECK(extracted == expected);
    }
}

TEST_CASE("HTML Parser", "[html import]")
{
SECTION("Find Bookmark")
Expand Down Expand Up @@ -812,85 +916,6 @@ TEST_CASE("HTML Parser", "[html import]")
p = filter_html(text, std::wcslen(text), true, false);
CHECK(std::wcscmp(p, L"Contact 555-5555 for details.") == 0);
}
SECTION("Link list with break")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><br ><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\n\tEmail, \n\tMail, \n\tCall 555-5555" });
}
SECTION("Link list with image")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><img src='flower.png'><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" });
}
SECTION("Link list")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" });
}
SECTION("Link list lots of spaces")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><a href=''>Email</a> , <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail , \n\tMail, \n\tCall 555-5555" });
}
SECTION("Link list with trailing content")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a> <a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555<p>Some more content</p>";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card \n\tEmail, \n\tMail, \n\tCall 555-5555\n\nSome more content\n\n" });
}
SECTION("Link list empty")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''></a><a href=''></a><a href=''></a><a href=''></a>";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t" });
}
SECTION("Link list empty trailing content")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''></a><a href=''></a><a href=''></a><a href=''></a><p>Some more content</p>";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t\n\nSome more content\n\n" });
}
SECTION("Link list breaks, overlapping anchors")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card<a href=''> Email</a></a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555<p>Some more content</p>";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card Email, Mail, Call 555-5555\n\nSome more content\n\n" });
}
SECTION("Link list breaks, not enough links")
{
// needs 4 links, only has 3
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a>, <a href=''>Call</a> 555-5555";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card, Call 555-5555" });
}
SECTION("Link list breaks from extra text content")
{
// text content between links causes them to not be a link list
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a> (extras available!) <a href=''>Email</a> <a href=''>Mail</a> <a href=''>Call</a> 555-5555";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card (extras available!) Email Mail Call 555-5555" });
}
SECTION("Link list breaks from too wide extra content")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a> <a href=''>Email</a>, ||<a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card Email, ||Mail, Call 555-5555" });
}
SECTION("Template placeHolders")
{
html_extract_text filter_html;
Expand Down

0 comments on commit 40bc19d

Please sign in to comment.