Skip to content

Commit

Permalink
Treat consecutive anchors in HTML as a list
Browse files Browse the repository at this point in the history
  • Loading branch information
Blake-Madden committed Jun 11, 2024
1 parent 12a00ec commit dac9e0f
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 9 deletions.
78 changes: 72 additions & 6 deletions src/import/html_extract_text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -905,11 +905,73 @@ namespace lily_of_the_valley
const wchar_t* const endSentinel = html_text + text_length;
case_insensitive_wstring currentElement;
case_insensitive_wstring previousElement;

// Three or more consecutive A HREFs will result in them being formatted
// into a list that is tabbed over. Even for links that are next to each other
// as a paragraph, we will want to show them as a link list rather than
// a false paragraph of text.
size_t consecutiveAHrefs{ 0 };
constexpr int LINK_LIST_LINK_MIN{ 3 };
bool linkListEnded{ false };
std::vector<size_t> linkListPositions;
std::vector<size_t> linkListPositionsEnds;

while (start && (start < endSentinel))
{
const size_t remainingTextLength = (endSentinel - start);
previousElement = currentElement;
currentElement.assign(get_element_name(start + 1, false));

if (currentElement == L"a")
{
++consecutiveAHrefs;
linkListPositions.push_back(get_filtered_buffer().length());
// Review what is between the previous anchor end and this new one.
if (linkListPositions.size() > 0 && linkListPositionsEnds.size() > 0)
{
std::wstring_view previousRead =
std::wstring_view{ get_filtered_buffer() }.substr(
linkListPositionsEnds.back(),
linkListPositions.back() - linkListPositionsEnds.back());
if (!previousRead.empty())
{
// if there is anything other than spaces or punctuation between the links,
// then this isn't a link list
for (const auto chr : previousRead)
{
if (!(std::iswpunct(chr) || std::iswspace(chr)))
{
consecutiveAHrefs = 0;
linkListPositions.clear();
linkListPositionsEnds.clear();
break;
}
}
}
}
}
else if (currentElement == L"/a")
{
linkListPositionsEnds.push_back(get_filtered_buffer().length());
}
// Ignore any breaks and images between links.
// (Images can be small icons next to a bullet point hyperlink.)
// Anything else will break the current series of A HREFs.
else if (currentElement != L"br" && currentElement != L"img")
{
if (consecutiveAHrefs >= LINK_LIST_LINK_MIN)
{
// insert a newline and tab in front of each link in the link list
// (each earlier insertion adds two characters, so later positions
// are shifted by i * 2)
for (size_t i = 0; i < linkListPositions.size(); ++i)
{
get_filtered_buffer().insert(linkListPositions[i] + (i * 2), L"\n\t");
}
}
consecutiveAHrefs = 0;
linkListPositions.clear();
linkListPositionsEnds.clear();
}

bool isSymbolFontSection = false;
// if it's a comment, then look for matching comment ending sequence
if (remainingTextLength >= 4 && start[0] == L'<' && start[1] == L'!' &&
Expand Down Expand Up @@ -1316,12 +1378,6 @@ namespace lily_of_the_valley
add_character(L'\n');
add_character(L'\n');
}
// break in front of link indicates that this is a link list,
// so add an extra newline in front of it
else if (previousElement == L"br")
{
add_character(L'\n');
}
}
}
else if (currentElement == L"span")
Expand Down Expand Up @@ -1446,6 +1502,16 @@ namespace lily_of_the_valley
}
}

// flush out a trailing link list (if present)
if (consecutiveAHrefs >= LINK_LIST_LINK_MIN)
{
// insert a newline and tab in front of each link in the link list
// (each earlier insertion adds two characters, so later positions
// are shifted by i * 2)
for (size_t i = 0; i < linkListPositions.size(); ++i)
{
get_filtered_buffer().insert(linkListPositions[i] + (i * 2), L"\n\t");
}
}

// get any text lingering after the last >
if (end && end < endSentinel && include_outer_text)
{
Expand Down
57 changes: 54 additions & 3 deletions tests/htmlimporttests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -812,14 +812,65 @@ TEST_CASE("HTML Parser", "[html import]")
p = filter_html(text, std::wcslen(text), true, false);
CHECK(std::wcscmp(p, L"Contact 555-5555 for details.") == 0);
}
SECTION("Link list with break")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><br ><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\n\tEmail, \n\tMail, \n\tCall 555-5555" });
}
SECTION("Link list with image")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><img src='flower.png'><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" });
}
SECTION("Link list")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><br ><a href=''>Email</a>";
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a><a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card\n\tEmail, \n\tMail, \n\tCall 555-5555" });
}
SECTION("Link list with trailing content")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a> <a href=''>Email</a>, <a href=''>Mail</a>, <a href=''>Call</a> 555-5555<p>Some more content</p>";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\tPrayer Card \n\tEmail, \n\tMail, \n\tCall 555-5555\n\nSome more content\n\n" });
}
SECTION("Link list empty")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''></a><a href=''></a><a href=''></a><a href=''></a>";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t" });
}
SECTION("Link list empty trailing content")
{
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''></a><a href=''></a><a href=''></a><a href=''></a><p>Some more content</p>";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\n\n\t\n\t\n\t\n\t\n\nSome more content\n\n" });
}
SECTION("Link list not enough links")
{
// needs at least 3 links, only has 2
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a>, <a href=''>Call</a> 555-5555";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card, Call 555-5555" });
}
SECTION("Link list with extra content")
{
// text content between links causes them to not be a link list
html_extract_text filter_html;
const wchar_t* text = L"<p>Contact:</p><a href=''>Prayer Card</a> (extras available!) <a href=''>Email</a> <a href=''>Mail</a> <a href=''>Call</a> 555-5555";
const std::wstring res = filter_html(text, std::wcslen(text), true, false);
CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card\n\nEmail" });
CHECK(res == std::wstring{ L"\n\nContact:\n\nPrayer Card (extras available!) Email Mail Call 555-5555" });
}
SECTION("Template PlaceHolders")
SECTION("Template placeHolders")
{
html_extract_text filter_html;
const wchar_t* text = LR"(<a class = "breadcrumbs__link" href = "index.php">Mr. ${_EscapeTool.xml($level.title)}</a>)";
Expand Down

0 comments on commit dac9e0f

Please sign in to comment.