From 3dfcdd9fd0eebbd9f34cfda69336f763f08ba236 Mon Sep 17 00:00:00 2001
From: Blake-Madden <madindayton@outlook.com>
Date: Fri, 5 Apr 2024 11:12:23 -0400
Subject: [PATCH] Don't harvest base64-encoded images as image paths

---
 src/import/html_extract_text.cpp | 15 +++++++++++----
 tests/htmlimporttests.cpp        | 16 +++++++++++++---
 2 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/src/import/html_extract_text.cpp b/src/import/html_extract_text.cpp
index dcc75b7c..818c12cb 100644
--- a/src/import/html_extract_text.cpp
+++ b/src/import/html_extract_text.cpp
@@ -2025,6 +2025,7 @@ namespace html_utilities
     const wchar_t* html_image_parse::operator()()
         {
         static const std::wstring_view HTML_IMAGE(L"img");
+        static const std::wstring_view DATA_IMAGE(L"data:image");
         // reset
         m_current_hyperlink_length = 0;
 
@@ -2040,7 +2041,9 @@ namespace html_utilities
                 {
                 const auto [imageSrc, imageLength] =
                     html_extract_text::read_attribute(m_html_text, L"src", false, true);
-                if (imageSrc)
+                if (imageSrc != nullptr &&
+                    // skip embedded image data ("data:image" URIs); we only want file paths
+                    string_util::strnicmp(imageSrc, DATA_IMAGE.data(), DATA_IMAGE.length()) != 0)
                     {
                     m_html_text = imageSrc;
                     m_current_hyperlink_length = imageLength;
@@ -2112,6 +2115,7 @@ namespace html_utilities
         static const std::wstring_view HTML_SCRIPT(L"script");
         static const std::wstring_view HTML_SCRIPT_END(L"</script>");
         static const std::wstring_view HTML_IMAGE(L"img");
+        static const std::wstring_view DATA_IMAGE(L"data:image");
         // if we are in an embedded script block, then continue parsing the
         // links out of that instead of using the regular parser
         if (m_inside_of_script_section)
@@ -2164,15 +2168,18 @@ namespace html_utilities
                 // see if it is an IMG, Frame (sometimes they have a SRC to another HTML page), or JS link
                 if ((m_include_image_links && m_current_link_is_image) ||
                     m_current_link_is_javascript  ||
-                    html_extract_text::compare_element(m_html_text+1,
+                    html_extract_text::compare_element(m_html_text + 1,
                         HTML_FRAME, false) ||
-                    html_extract_text::compare_element(m_html_text+1,
+                    html_extract_text::compare_element(m_html_text + 1,
                         HTML_IFRAME, false))
                     {
                     m_html_text += 4;
                     const auto [imageSrc, imageLengh] =
                         html_extract_text::read_attribute(m_html_text, L"src", false, true);
-                    if (imageSrc)
+                    if (imageSrc != nullptr &&
+                        // skip embedded image data ("data:image" URIs); we only want file paths
+                        string_util::strnicmp(imageSrc, DATA_IMAGE.data(), DATA_IMAGE.length()) !=
+                            0)
                         {
                         m_html_text = imageSrc;
                         m_current_hyperlink_length = imageLengh;
diff --git a/tests/htmlimporttests.cpp b/tests/htmlimporttests.cpp
index bb16dfd5..daf5f7f9 100644
--- a/tests/htmlimporttests.cpp
+++ b/tests/htmlimporttests.cpp
@@ -1613,7 +1613,7 @@ TEST_CASE("Html Image Parse", "[html import]")
         CHECK(parse.get_current_hyperlink_length() == 16);
 
         CHECK(parse() == nullptr);
-
+        CHECK(parse.get_current_hyperlink_length() == 0);
         CHECK(parse() == nullptr);
         }
 
@@ -1626,8 +1626,18 @@ TEST_CASE("Html Image Parse", "[html import]")
         CHECK(parse.get_current_hyperlink_length() == 16);
 
         CHECK(parse() == nullptr);
+        CHECK(parse.get_current_hyperlink_length() == 0);
+        CHECK(parse() == nullptr);
+        }
 
+    SECTION("Image Base-64 Encoded")
+        {
+        const wchar_t* text = L"<heAD><baSE hrEf=\"www.mysite\"></base></HEAD> Hello <A hRef=\"www.page.com\">page</a> some text <img SRc=\"data:image/gif;base64,R0lGODlhAQABAIAAAP///wAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw==\"/><a href=\'404\'>404</A> <img></img><a href=\"\"></a> <a href=></a><scripT type=\"text/javascript\" sRC=\"/scripts/statmenu4.js\"></Script>";
+        html_image_parse parse(text, std::wcslen(text) );
+
+        // the only <img> with a src holds a "data:image" URI, so no image path should be found
         CHECK(parse() == nullptr);
+        CHECK(parse.get_current_hyperlink_length() == 0);
         }
 
     SECTION("Images")
@@ -1642,7 +1652,7 @@ TEST_CASE("Html Image Parse", "[html import]")
         CHECK(parse.get_current_hyperlink_length() == 9);
 
         CHECK(parse() == nullptr);
-
+        CHECK(parse.get_current_hyperlink_length() == 0);
         CHECK(parse() == nullptr);
         }
 
@@ -1658,7 +1668,7 @@ TEST_CASE("Html Image Parse", "[html import]")
         CHECK(parse.get_current_hyperlink_length() == 9);
 
         CHECK(parse() == nullptr);
-
+        CHECK(parse.get_current_hyperlink_length() == 0);
         CHECK(parse() == nullptr);
         }