Skip to content

Commit

Permalink
chore(abs): add safety check joining
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 14, 2024
1 parent 5a0f96a commit 10162a2
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 24 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.21.28"
version = "2.21.29"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
65 changes: 53 additions & 12 deletions spider/src/utils/abs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,23 @@ pub(crate) fn parse_absolute_url(url: &str) -> Option<Box<Url>> {
}
}

/// Convert to absolute path. The base url must be the root path to avoid infinite appending.
/// Outcome of classifying an `href` in `handle_base`, telling the caller how to resolve the link.
enum LinkReturn {
/// The href is empty, `#`, `javascript:void(0);`, or uses an ignored protocol; the caller returns the base url.
EarlyReturn,
/// No decision was made here; the caller falls through to joining the href onto the base url.
Empty,
/// The href parsed as an absolute url (fragment already removed) and is returned as is.
Absolute(Url),
}

#[inline]
pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url {
/// Handle the base url return to determine 3rd party urls.
fn handle_base(href: &str) -> LinkReturn {
let href = href.trim();

if href.is_empty() || href == "#" || href == "javascript:void(0);" {
return base.clone();
return LinkReturn::EarlyReturn;
}

// handle absolute urls.
Expand All @@ -81,7 +91,7 @@ pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url {

// Ignore protocols that are in the IGNORED_PROTOCOLS set
if IGNORED_PROTOCOLS.contains(protocol_slice_section) {
return base.clone();
return LinkReturn::EarlyReturn;
}

// valid protocol to take absolute
Expand All @@ -91,27 +101,58 @@ pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url {
if PROTOCOLS.contains(protocol_slice) {
if let Ok(mut next_url) = Url::parse(href) {
next_url.set_fragment(None);
return next_url;
return LinkReturn::Absolute(next_url);
}
}
}
}
}

// we can swap the domains if they do not match, in case of a crawler redirect (anti-bot)
match base.join(href) {
Ok(mut joined) => {
joined.set_fragment(None);
joined
LinkReturn::Empty
}

/// Resolve `href` against `base` and return the resulting url.
///
/// The base url must be the root path to avoid infinite appending, so when `base.path()` is not
/// `/` a clone of it is first passed through `convert_abs_url` and that clone is used as the base.
/// `handle_base` then classifies the href: an absolute url is returned as is, an early-return
/// href yields the base url (the adjusted clone in the non-root case), and anything else is
/// joined onto the base with its fragment removed. If the join fails the base url is returned.
#[inline]
pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url {
if base.path() != "/" {
// Work on a local copy so the caller's url is left untouched.
let mut base = base.clone();
// NOTE(review): presumably reduces the url to its root path — confirm against `convert_abs_url`.
convert_abs_url(&mut base);

// Absolute and ignorable hrefs are settled without joining.
match handle_base(href) {
LinkReturn::Absolute(u) => return u,
LinkReturn::EarlyReturn => return base.to_owned(),
_ => (),
}

// Relative href: join onto the adjusted base and drop any fragment.
match base.join(href) {
Ok(mut joined) => {
joined.set_fragment(None);
joined
}
Err(_) => base.to_owned(),
}
} else {
// Base is already the root path; classify the href directly.
match handle_base(href) {
LinkReturn::Absolute(u) => return u,
LinkReturn::EarlyReturn => return base.to_owned(),
_ => (),
}
// we can swap the domains if they do not match, in case of a crawler redirect (anti-bot)
match base.join(href) {
Ok(mut joined) => {
joined.set_fragment(None);
joined
}
Err(_) => base.to_owned(),
}
// NOTE(review): the arm below sits outside any `match`; it looks like the pre-change line
// kept by the diff rendering rather than part of the new function — confirm.
Err(_) => base.clone(),
}
}

#[cfg(test)]
mod tests {
use crate::utils::parse_absolute_url;
use super::convert_abs_path;
use crate::utils::parse_absolute_url;

#[test]
fn test_basic_join() {
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.21.28"
version = "2.21.29"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.21.28"
version = "2.21.29"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.21.28"
version = "2.21.29"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.21.28"
version = "2.21.29"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.21.28"
version = "2.21.29"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 10162a2

Please sign in to comment.