From 305dd5e25d702ca002dc8c41b0941e628c185db3 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Sat, 14 Dec 2024 11:33:14 -0500 Subject: [PATCH] chore(abs): add safety check joining --- Cargo.lock | 12 +++--- spider/Cargo.toml | 2 +- spider/src/utils/abs.rs | 65 +++++++++++++++++++++++++------ spider_chrome/Cargo.toml | 2 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 8 files changed, 65 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b36047949..93298da68 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5029,7 +5029,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.21.28" +version = "2.21.29" dependencies = [ "ahash", "aho-corasick", @@ -5092,7 +5092,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.21.28" +version = "2.21.29" dependencies = [ "adblock", "aho-corasick", @@ -5182,7 +5182,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.21.28" +version = "2.21.29" dependencies = [ "clap", "env_logger", @@ -5207,7 +5207,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.21.28" +version = "2.21.29" dependencies = [ "aho-corasick", "fast_html2md", @@ -5229,7 +5229,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.21.28" +version = "2.21.29" dependencies = [ "indexmap 1.9.3", "serde", @@ -5241,7 +5241,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.21.28" +version = "2.21.29" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 8a0520c5c..fefb5b030 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.21.28" +version = "2.21.29" authors = [ "j-mendez " ] diff --git a/spider/src/utils/abs.rs b/spider/src/utils/abs.rs index 192315e24..2a12bcba5 100644 --- a/spider/src/utils/abs.rs +++ b/spider/src/utils/abs.rs @@ -51,13 +51,23 @@ pub(crate) fn parse_absolute_url(url: &str) -> Option> { } } -/// Convert to absolute path. The base url must be the root path to avoid infinite appending. +/// Return handling for the links +enum LinkReturn { + /// Early return + EarlyReturn, + /// Empty ignore + Empty, + /// Absolute url + Absolute(Url), +} + #[inline] -pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url { +/// Handle the base url return to determine 3rd party urls. +fn handle_base(href: &str) -> LinkReturn { let href = href.trim(); if href.is_empty() || href == "#" || href == "javascript:void(0);" { - return base.clone(); + return LinkReturn::EarlyReturn; } // handle absolute urls. @@ -81,7 +91,7 @@ pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url { // Ignore protocols that are in the IGNORED_PROTOCOLS set if IGNORED_PROTOCOLS.contains(protocol_slice_section) { - return base.clone(); + return LinkReturn::EarlyReturn; } // valid protocol to take absolute @@ -91,27 +101,58 @@ pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url { if PROTOCOLS.contains(protocol_slice) { if let Ok(mut next_url) = Url::parse(href) { next_url.set_fragment(None); - return next_url; + return LinkReturn::Absolute(next_url); } } } } } - // we can swap the domains if they do not match incase of crawler redirect anti-bot - match base.join(href) { - Ok(mut joined) => { - joined.set_fragment(None); - joined + LinkReturn::Empty +} + +/// Convert to absolute path. The base url must be the root path to avoid infinite appending. +/// We always handle the urls from the base path. +#[inline] +pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url { + if base.path() != "/" { + let mut base = base.clone(); + convert_abs_url(&mut base); + + match handle_base(href) { + LinkReturn::Absolute(u) => return u, + LinkReturn::EarlyReturn => return base.clone(), + _ => (), + } + + match base.join(href) { + Ok(mut joined) => { + joined.set_fragment(None); + joined + } + Err(_) => base.to_owned(), + } + } else { + match handle_base(href) { + LinkReturn::Absolute(u) => return u, + LinkReturn::EarlyReturn => return base.clone(), + _ => (), + } + // we can swap the domains if they do not match incase of crawler redirect anti-bot + match base.join(href) { + Ok(mut joined) => { + joined.set_fragment(None); + joined + } + Err(_) => base.to_owned(), } - Err(_) => base.clone(), } } #[cfg(test)] mod tests { - use crate::utils::parse_absolute_url; use super::convert_abs_path; + use crate::utils::parse_absolute_url; #[test] fn test_basic_join() { diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index c8af0fce7..f8034edf1 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.21.28" +version = "2.21.29" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index bf870ad03..4b9433dfe 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.21.28" +version = "2.21.29" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 245429f12..5527b8c2a 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.21.28" +version = "2.21.29" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 2465e98ce..bee35c685 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.21.28" +version = "2.21.29" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 41dca2381..e5340dc90 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.21.28" +version = "2.21.29" authors = [ "j-mendez " ]