Skip to content

Commit

Permalink
Don't harvest base64-encoded images as image paths
Browse files Browse the repository at this point in the history
  • Loading branch information
Blake-Madden committed Apr 5, 2024
1 parent fd80ec5 commit 3dfcdd9
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 7 deletions.
15 changes: 11 additions & 4 deletions src/import/html_extract_text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2025,6 +2025,7 @@ namespace html_utilities
const wchar_t* html_image_parse::operator()()
{
static const std::wstring_view HTML_IMAGE(L"img");
static const std::wstring_view DATA_IMAGE(L"data:image");
// reset
m_current_hyperlink_length = 0;

Expand All @@ -2040,7 +2041,9 @@ namespace html_utilities
{
const auto [imageSrc, imageLength] =
html_extract_text::read_attribute(m_html_text, L"src", false, true);
if (imageSrc)
if (imageSrc != nullptr &&
// skip over base64-encoded image data, we just want file paths
string_util::strnicmp(imageSrc, DATA_IMAGE.data(), DATA_IMAGE.length()) != 0)
{
m_html_text = imageSrc;
m_current_hyperlink_length = imageLength;
Expand Down Expand Up @@ -2112,6 +2115,7 @@ namespace html_utilities
static const std::wstring_view HTML_SCRIPT(L"script");
static const std::wstring_view HTML_SCRIPT_END(L"</script>");
static const std::wstring_view HTML_IMAGE(L"img");
static const std::wstring_view DATA_IMAGE(L"data:image");
// if we are in an embedded script block, then continue parsing the
// links out of that instead of using the regular parser
if (m_inside_of_script_section)
Expand Down Expand Up @@ -2164,15 +2168,18 @@ namespace html_utilities
// see if it is an IMG, Frame (sometimes they have a SRC to another HTML page), or JS link
if ((m_include_image_links && m_current_link_is_image) ||
m_current_link_is_javascript ||
html_extract_text::compare_element(m_html_text+1,
html_extract_text::compare_element(m_html_text + 1,
HTML_FRAME, false) ||
html_extract_text::compare_element(m_html_text+1,
html_extract_text::compare_element(m_html_text + 1,
HTML_IFRAME, false))
{
m_html_text += 4;
const auto [imageSrc, imageLengh] =
html_extract_text::read_attribute(m_html_text, L"src", false, true);
if (imageSrc)
if (imageSrc != nullptr &&
// skip over base64-encoded image data, we just want file paths
string_util::strnicmp(imageSrc, DATA_IMAGE.data(), DATA_IMAGE.length()) !=
0)
{
m_html_text = imageSrc;
m_current_hyperlink_length = imageLengh;
Expand Down
16 changes: 13 additions & 3 deletions tests/htmlimporttests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1613,7 +1613,7 @@ TEST_CASE("Html Image Parse", "[html import]")
CHECK(parse.get_current_hyperlink_length() == 16);

CHECK(parse() == nullptr);

CHECK(parse.get_current_hyperlink_length() == 0);
CHECK(parse() == nullptr);
}

Expand All @@ -1626,8 +1626,18 @@ TEST_CASE("Html Image Parse", "[html import]")
CHECK(parse.get_current_hyperlink_length() == 16);

CHECK(parse() == nullptr);
CHECK(parse.get_current_hyperlink_length() == 0);
CHECK(parse() == nullptr);
}

// Verifies that an <img> whose src is inline base64 data (a "data:image..."
// URI) is not reported as an image path by html_image_parse.
SECTION("Image Base-64 Encoded")
{
    // The first <img> carries an inline data URI; the payload is only a short
    // placeholder, since the test cares about the "data:image" prefix and not
    // about the encoded bytes. The second <img> has no src attribute at all,
    // so the document contains no image file path to harvest.
    const wchar_t* text = L"<heAD><baSE hrEf=\"www.mysite\"></base></HEAD> Hello <A hRef=\"www.page.com\">page</a> some text <img SRc=\"data:image/png;base64,iVBORw0KGgo=\"/><a href=\'404\'>404</A> <img></img><a href=\"\"></a> <a href=></a><scripT type=\"text/javascript\" sRC=\"/scripts/statmenu4.js\"></Script>";
    html_image_parse parse(text, std::wcslen(text) );

    // don't pick up the encoded image data
    CHECK(parse() == nullptr);
    // nothing was found, so no link length should be reported either
    CHECK(parse.get_current_hyperlink_length() == 0);
}

SECTION("Images")
Expand All @@ -1642,7 +1652,7 @@ TEST_CASE("Html Image Parse", "[html import]")
CHECK(parse.get_current_hyperlink_length() == 9);

CHECK(parse() == nullptr);

CHECK(parse.get_current_hyperlink_length() == 0);
CHECK(parse() == nullptr);
}

Expand All @@ -1658,7 +1668,7 @@ TEST_CASE("Html Image Parse", "[html import]")
CHECK(parse.get_current_hyperlink_length() == 9);

CHECK(parse() == nullptr);

CHECK(parse.get_current_hyperlink_length() == 0);
CHECK(parse() == nullptr);
}

Expand Down

0 comments on commit 3dfcdd9

Please sign in to comment.