From 3dfcdd9fd0eebbd9f34cfda69336f763f08ba236 Mon Sep 17 00:00:00 2001 From: Blake-Madden Date: Fri, 5 Apr 2024 11:12:23 -0400 Subject: [PATCH] Don't harvest base64-encoded images as image paths --- src/import/html_extract_text.cpp | 15 +++++++++++---- tests/htmlimporttests.cpp | 16 +++++++++++++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/import/html_extract_text.cpp b/src/import/html_extract_text.cpp index dcc75b7c..818c12cb 100644 --- a/src/import/html_extract_text.cpp +++ b/src/import/html_extract_text.cpp @@ -2025,6 +2025,7 @@ namespace html_utilities const wchar_t* html_image_parse::operator()() { static const std::wstring_view HTML_IMAGE(L"img"); + static const std::wstring_view DATA_IMAGE(L"data:image"); // reset m_current_hyperlink_length = 0; @@ -2040,7 +2041,9 @@ namespace html_utilities { const auto [imageSrc, imageLength] = html_extract_text::read_attribute(m_html_text, L"src", false, true); - if (imageSrc) + if (imageSrc != nullptr && + // skip over base64-encoded image data, we just want file paths + string_util::strnicmp(imageSrc, DATA_IMAGE.data(), DATA_IMAGE.length()) != 0) { m_html_text = imageSrc; m_current_hyperlink_length = imageLength; @@ -2112,6 +2115,7 @@ namespace html_utilities static const std::wstring_view HTML_SCRIPT(L"script"); static const std::wstring_view HTML_SCRIPT_END(L""); static const std::wstring_view HTML_IMAGE(L"img"); + static const std::wstring_view DATA_IMAGE(L"data:image"); // if we are in an embedded script block, then continue parsing the // links out of that instead of using the regular parser if (m_inside_of_script_section) @@ -2164,15 +2168,18 @@ namespace html_utilities // see if it is an IMG, Frame (sometimes they have a SRC to another HTML page), or JS link if ((m_include_image_links && m_current_link_is_image) || m_current_link_is_javascript || - html_extract_text::compare_element(m_html_text+1, + html_extract_text::compare_element(m_html_text + 1, HTML_FRAME, false) || - html_extract_text::compare_element(m_html_text+1, + html_extract_text::compare_element(m_html_text + 1, HTML_IFRAME, false)) { m_html_text += 4; const auto [imageSrc, imageLengh] = html_extract_text::read_attribute(m_html_text, L"src", false, true); - if (imageSrc) + if (imageSrc != nullptr && + // skip over base64-encoded image data, we just want file paths + string_util::strnicmp(imageSrc, DATA_IMAGE.data(), DATA_IMAGE.length()) != + 0) { m_html_text = imageSrc; m_current_hyperlink_length = imageLengh; diff --git a/tests/htmlimporttests.cpp b/tests/htmlimporttests.cpp index bb16dfd5..daf5f7f9 100644 --- a/tests/htmlimporttests.cpp +++ b/tests/htmlimporttests.cpp @@ -1613,7 +1613,7 @@ TEST_CASE("Html Image Parse", "[html import]") CHECK(parse.get_current_hyperlink_length() == 16); CHECK(parse() == nullptr); - + CHECK(parse.get_current_hyperlink_length() == 0); CHECK(parse() == nullptr); } @@ -1626,8 +1626,18 @@ TEST_CASE("Html Image Parse", "[html import]") CHECK(parse.get_current_hyperlink_length() == 16); CHECK(parse() == nullptr); + CHECK(parse.get_current_hyperlink_length() == 0); + CHECK(parse() == nullptr); + } + SECTION("Image Base-64 Encoded") + { + const wchar_t* text = L" Hello page some text 404 "; + html_image_parse parse(text, std::wcslen(text) ); + + // don't pick up the encoded image data CHECK(parse() == nullptr); + CHECK(parse.get_current_hyperlink_length() == 0); } SECTION("Images") @@ -1642,7 +1652,7 @@ TEST_CASE("Html Image Parse", "[html import]") CHECK(parse.get_current_hyperlink_length() == 9); CHECK(parse() == nullptr); - + CHECK(parse.get_current_hyperlink_length() == 0); CHECK(parse() == nullptr); } @@ -1658,7 +1668,7 @@ TEST_CASE("Html Image Parse", "[html import]") CHECK(parse.get_current_hyperlink_length() == 9); CHECK(parse() == nullptr); - + CHECK(parse.get_current_hyperlink_length() == 0); CHECK(parse() == nullptr); }