diff --git a/Cargo.lock b/Cargo.lock index 4cfc702dd..8e765c88f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4273,7 +4273,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.21.13" +version = "2.21.15" dependencies = [ "ahash", "aho-corasick", @@ -4336,7 +4336,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.21.13" +version = "2.21.15" dependencies = [ "adblock", "base64 0.22.1", @@ -4373,7 +4373,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.21.13" +version = "2.21.15" dependencies = [ "clap", "env_logger", @@ -4398,7 +4398,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.21.13" +version = "2.21.15" dependencies = [ "aho-corasick", "fast_html2md", @@ -4420,7 +4420,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.21.13" +version = "2.21.15" dependencies = [ "indexmap 1.9.3", "serde", @@ -4432,7 +4432,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.21.13" +version = "2.21.15" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 921306fac..91732c266 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.21.13" +version = "2.21.15" authors = [ "j-mendez " ] diff --git a/spider/src/features/chrome_common.rs b/spider/src/features/chrome_common.rs index c73f179b2..cdd28bbb7 100644 --- a/spider/src/features/chrome_common.rs +++ b/spider/src/features/chrome_common.rs @@ -1,7 +1,7 @@ use crate::utils::trie::Trie; #[cfg(feature = "chrome")] -use chromiumoxide::handler::network::NetworkInterceptManager; +use chromiumoxide::handler::blockers::intercept_manager::NetworkInterceptManager; /// wrapper for non chrome interception. does nothing. 
#[derive(Debug, Default, Clone, Copy, PartialEq)] diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 6caf2c285..a12811efe 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.21.13" +version = "2.21.15" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_chrome/src/browser.rs b/spider_chrome/src/browser.rs index 5398e2d98..f1dc2fc8d 100644 --- a/spider_chrome/src/browser.rs +++ b/spider_chrome/src/browser.rs @@ -11,24 +11,13 @@ use futures::channel::oneshot::channel as oneshot_channel; use futures::select; use futures::SinkExt; -use chromiumoxide_cdp::cdp::browser_protocol::network::{Cookie, CookieParam}; -use chromiumoxide_cdp::cdp::browser_protocol::storage::{ - ClearCookiesParams, GetCookiesParams, SetCookiesParams, -}; -use chromiumoxide_cdp::cdp::browser_protocol::target::{ - CreateBrowserContextParams, CreateTargetParams, DisposeBrowserContextParams, TargetId, - TargetInfo, -}; -use chromiumoxide_cdp::cdp::{CdpEventMessage, IntoEventKind}; -use chromiumoxide_types::*; - use crate::async_process::{self, Child, ExitStatus, Stdio}; use crate::cmd::{to_command_response, CommandMessage}; use crate::conn::Connection; use crate::detection::{self, DetectionOptions}; use crate::error::{BrowserStderr, CdpError, Result}; +use crate::handler::blockers::intercept_manager::NetworkInterceptManager; use crate::handler::browser::BrowserContext; -use crate::handler::network::NetworkInterceptManager; use crate::handler::viewport::Viewport; use crate::handler::{Handler, HandlerConfig, HandlerMessage, REQUEST_TIMEOUT}; use crate::listeners::{EventListenerRequest, EventStream}; @@ -37,6 +26,16 @@ use crate::utils; use chromiumoxide_cdp::cdp::browser_protocol::browser::{ BrowserContextId, CloseReturns, GetVersionParams, GetVersionReturns, }; +use chromiumoxide_cdp::cdp::browser_protocol::network::{Cookie, CookieParam}; +use 
chromiumoxide_cdp::cdp::browser_protocol::storage::{ + ClearCookiesParams, GetCookiesParams, SetCookiesParams, +}; +use chromiumoxide_cdp::cdp::browser_protocol::target::{ + CreateBrowserContextParams, CreateTargetParams, DisposeBrowserContextParams, TargetId, + TargetInfo, +}; +use chromiumoxide_cdp::cdp::{CdpEventMessage, IntoEventKind}; +use chromiumoxide_types::*; /// Default `Browser::launch` timeout in MS pub const LAUNCH_TIMEOUT: u64 = 20_000; diff --git a/spider_chrome/src/handler/blockers/intercept_manager.rs b/spider_chrome/src/handler/blockers/intercept_manager.rs new file mode 100644 index 000000000..fdc8f239e --- /dev/null +++ b/spider_chrome/src/handler/blockers/intercept_manager.rs @@ -0,0 +1,100 @@ +/// Custom network intercept types to expect on a domain +#[derive(Debug, Default, Clone, Copy, serde::Serialize, serde::Deserialize, PartialEq)] +pub enum NetworkInterceptManager { + /// tiktok.com + TikTok, + /// facebook.com + Facebook, + /// amazon.com + Amazon, + /// x.com + X, + /// linkedin.com + LinkedIn, + /// netflix.com + Netflix, + /// medium.com + Medium, + /// upwork.com + Upwork, + /// glassdoor.com + Glassdoor, + /// ebay.com + Ebay, + /// nytimes.com + Nytimes, + /// wikipedia.org + Wikipedia, + #[default] + /// Unknown + Unknown, +} + +lazy_static::lazy_static! { + /// Top tier list of the most common websites visited. 
+ pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 21] = [ + ("https://www.tiktok.com", NetworkInterceptManager::TikTok), + ("https://tiktok.com", NetworkInterceptManager::TikTok), + ("https://www.amazon.", NetworkInterceptManager::Amazon), + ("https://amazon.", NetworkInterceptManager::Amazon), + ("https://www.x.com", NetworkInterceptManager::X), + ("https://x.com", NetworkInterceptManager::X), + ("https://www.netflix.com", NetworkInterceptManager::Netflix), + ("https://netflix.com", NetworkInterceptManager::Netflix), + ( + "https://www.linkedin.com", + NetworkInterceptManager::LinkedIn + ), + ("https://linkedin.com", NetworkInterceptManager::LinkedIn), + ("https://www.upwork.com", NetworkInterceptManager::Upwork), + ("https://upwork.com", NetworkInterceptManager::Upwork), + ("https://www.glassdoor.", NetworkInterceptManager::Glassdoor), + ("https://glassdoor.", NetworkInterceptManager::Glassdoor), + ("https://www.medium.com", NetworkInterceptManager::Medium), + ("https://medium.com", NetworkInterceptManager::Medium), + ("https://www.ebay.", NetworkInterceptManager::Ebay), + ("https://ebay.", NetworkInterceptManager::Ebay), + ("https://www.nytimes.com", NetworkInterceptManager::Nytimes), + ("https://nytimes.com", NetworkInterceptManager::Nytimes), + ("wikipedia.org", NetworkInterceptManager::Wikipedia), + ]; +} + +/// The matching strategy used to compare a URL against a pattern. +#[derive(Default, Debug, Clone, Hash, PartialEq, Eq)] +enum FindType { + #[default] + /// Starts with. + StartsWith, + /// Contains. + Contains, +} + +impl NetworkInterceptManager { + /// Determine the custom intercept handler for the given URL. 
+ pub fn new(url: &str) -> NetworkInterceptManager { + TOP_TIER_LIST + .iter() + .find(|&(pattern, nm)| { + if nm.get_pattern() == FindType::StartsWith { + url.starts_with(pattern) + } else { + url.contains(pattern) + } + }) + .map(|&(_, manager_type)| manager_type) + .unwrap_or(NetworkInterceptManager::Unknown) + } + /// Setup the intercept handle + pub fn setup(&mut self, url: &str) -> Self { + NetworkInterceptManager::new(url) + } + + /// determine the pattern to use. + fn get_pattern(&self) -> FindType { + match self { + NetworkInterceptManager::Wikipedia => FindType::Contains, + _ => FindType::StartsWith, + } + } +} diff --git a/spider_chrome/src/handler/blockers/mod.rs b/spider_chrome/src/handler/blockers/mod.rs index 44c8df052..c923c7d8f 100644 --- a/spider_chrome/src/handler/blockers/mod.rs +++ b/spider_chrome/src/handler/blockers/mod.rs @@ -6,6 +6,8 @@ pub mod amazon_blockers; pub mod ebay_blockers; /// glassdoor blockers pub mod glassdoor_blockers; +/// interception manager +pub mod intercept_manager; /// linkedin blockers pub mod linkedin_blockers; /// medium blockers @@ -24,7 +26,6 @@ pub mod upwork_blockers; pub mod wikipedia_blockers; /// x blockers pub mod x_blockers; - /// xhr blockers pub mod xhr; @@ -75,3 +76,18 @@ impl Trie { false } } + +/// Url matches analytics that we want to ignore or trackers. +pub(crate) fn ignore_script_embedded(url: &str) -> bool { + crate::handler::blockers::scripts::URL_IGNORE_EMBEDED_TRIE.contains_prefix(url) +} + +/// Url matches analytics that we want to ignore or trackers. +pub(crate) fn ignore_script_xhr(url: &str) -> bool { + crate::handler::blockers::xhr::URL_IGNORE_XHR_TRIE.contains_prefix(url) +} + +/// Url matches media that we want to ignore. 
+pub(crate) fn ignore_script_xhr_media(url: &str) -> bool { + crate::handler::blockers::xhr::URL_IGNORE_XHR_MEDIA_TRIE.contains_prefix(url) +} diff --git a/spider_chrome/src/handler/blockers/scripts.rs b/spider_chrome/src/handler/blockers/scripts.rs index c4143a92a..bfcfcdd3b 100644 --- a/spider_chrome/src/handler/blockers/scripts.rs +++ b/spider_chrome/src/handler/blockers/scripts.rs @@ -17,6 +17,7 @@ lazy_static::lazy_static! { "https://www.gstatic.com/cv/js/sender/", "https://googleads.g.doubleclick.net", "https://www.google-analytics.com", + "https://www.googleanalytics.com", "https://iabusprivacy.pmc.com/geo-info.js", "https://cookie-cdn.cookiepro.com/consent", "https://load.sumome.com/", @@ -88,6 +89,11 @@ lazy_static::lazy_static! { "https://mab.chartbeat.com/mab_strategy/", "https://c.amazon-adsystem.com/", "https://rumcdn.geoedge.be/", + "https://assets.adobedtm.com/extensions/", + "https://macro.adnami.io/macro/spec/adsm.macro.", + "https://log.medietall.no/analytics.js", + "https://lwadm.com/lw/pbjs?", + "https://cl.k5a.io/", ".sharethis.com", ".newrelic.com", ".googlesyndication.com", @@ -242,4 +248,177 @@ lazy_static::lazy_static! { trie }; + /// Ignore list of scripts paths. 
+ pub (crate) static ref URL_IGNORE_TRIE_PATHS: Trie = { + let mut trie = Trie::new(); + let patterns = [ + // explicit ignore tracking.js and ad files + "privacy-notice.js", + "tracking.js", + "track.js", + "ads.js", + "analytics.js", + "otSDKStub.js", + "otBannerSdk.js", + "_vercel/insights/script.js", + "analytics.", + ]; + for pattern in &patterns { + trie.insert(pattern); + } + trie + }; + +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_url_ignore_trie_contains() { + // Positive tests - these URLs should be contained in the trie + let positive_cases = vec![ + "https://www.googletagservices.com/tag/", + "https://www.google-analytics.com", + "https://www.googleanalytics.com", + ".newrelic.com", + "privacy-notice.js", + ]; + + // Negative tests - these URLs should not be contained in the trie + let negative_cases = vec![ + "https://not-a-tracked-url.com/script.js", + "https://google.com", + ]; + + for case in positive_cases { + assert!( + URL_IGNORE_TRIE.contains_prefix(case), + "Trie should contain: {}", + case + ); + } + + for case in negative_cases { + assert!( + !URL_IGNORE_TRIE.contains_prefix(case), + "Trie should not contain: {}", + case + ); + } + } + + #[test] + fn test_url_ignore_embedded_trie_contains() { + // Positive tests - these URLs should be contained in the trie + let positive_cases = vec![ + "https://www.youtube.com/embed/", + "https://www.google.com/maps/embed?", + ".amplitude.com", + ]; + + // Negative tests - these URLs should not be contained in the trie + let negative_cases = vec![ + "https://secure-site.com/resource.js", + "https://example.com/embed.js", + ]; + + for case in positive_cases { + assert!( + URL_IGNORE_EMBEDED_TRIE.contains_prefix(case), + "Trie should contain: {}", + case + ); + } + + for case in negative_cases { + assert!( + !URL_IGNORE_EMBEDED_TRIE.contains_prefix(case), + "Trie should not contain: {}", + case + ); + } + } + + #[test] + fn test_url_ignore_script_base_paths_contains() { + // Positive 
tests - these paths should be contained in the trie + let positive_cases = vec!["wp-content/plugins/cookie-law-info", "analytics/"]; + + // Negative tests - these paths should not be contained in the trie + let negative_cases = vec![ + "wp-content/some-untracked-plugin/", + "random/path/analytics.js", + ]; + + for case in positive_cases { + assert!( + URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(case), + "Trie should contain: {}", + case + ); + } + + for case in negative_cases { + assert!( + !URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(case), + "Trie should not contain: {}", + case + ); + } + } + + #[test] + fn test_url_ignore_script_style_paths_contains() { + // Positive tests - these paths should be contained in the trie + let positive_cases = vec!["wp-content/themes/", "npm/bootstrap@"]; + + // Negative tests - these paths should not be contained in the trie + let negative_cases = vec![ + "wp-content/some-other-theme/", + "wp-content/plugins/untracked-plugin/", + ]; + + for case in positive_cases { + assert!( + URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(case), + "Trie should contain: {}", + case + ); + } + + for case in negative_cases { + assert!( + !URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(case), + "Trie should not contain: {}", + case + ); + } + } + + #[test] + fn test_url_ignore_trie_paths_contains() { + // Positive tests - these paths should be contained in the trie + let positive_cases = vec!["privacy-notice.js", "tracking.js"]; + + // Negative tests - these paths should not be contained in the trie + let negative_cases = vec!["non-ignored.js", "non-related/tracking.js"]; + + for case in positive_cases { + assert!( + URL_IGNORE_TRIE_PATHS.contains_prefix(case), + "Trie should contain: {}", + case + ); + } + + for case in negative_cases { + assert!( + !URL_IGNORE_TRIE_PATHS.contains_prefix(case), + "Trie should not contain: {}", + case + ); + } + } } diff --git a/spider_chrome/src/handler/blockers/xhr.rs b/spider_chrome/src/handler/blockers/xhr.rs 
index 6ead6cd7d..f7c96b530 100644 --- a/spider_chrome/src/handler/blockers/xhr.rs +++ b/spider_chrome/src/handler/blockers/xhr.rs @@ -12,7 +12,8 @@ lazy_static::lazy_static! { "https://soundcloud.com/player/", "https://open.spotify.com/", "https://api.spotify.com/v1/", - "https://music.apple.com/" + "https://music.apple.com/", + "https://maps.googleapis.com/" ]; for pattern in &patterns { trie.insert(pattern); @@ -87,6 +88,7 @@ lazy_static::lazy_static! { ".wixapps.net/api/v1/bulklog", "https://error-analytics-sessions-production.shopifysvc.com/", "https://static-forms.", + "https://nhst.tt.omtrdc.net/rest/v1/delivery", // video embeddings "https://video.squarespace-cdn.com/content/", "https://bes.gcp.data.bigcommerce.com/nobot", @@ -96,6 +98,7 @@ lazy_static::lazy_static! { "https://mab.chartbeat.com/", "https://c.go-mpulse.net/", "https://prodregistryv2.org/v1/", + "https://dpm.demdex.net/", "googlesyndication.com", ".doubleclick.net", ".doofinder.com", @@ -117,3 +120,96 @@ lazy_static::lazy_static! 
{ }; } + +#[cfg(test)] +mod tests { + use super::*; + use case_insensitive_string::CaseInsensitiveString; + + #[test] + fn test_url_ignore_xhr_media_trie_contains() { + // Positive tests - these URLs should be contained in the trie + let positive_cases = vec![ + "https://www.youtube.com/s/player/", + "https://api.spotify.com/v1/", + ]; + + // Negative tests - these URLs should not be contained in the trie + let negative_cases = vec!["https://www.google.com/", "https://api.example.com/v1/"]; + + for case in positive_cases { + assert!( + URL_IGNORE_XHR_MEDIA_TRIE.contains_prefix(case), + "Trie should contain: {}", + case + ); + } + + for case in negative_cases { + assert!( + !URL_IGNORE_XHR_MEDIA_TRIE.contains_prefix(case), + "Trie should not contain: {}", + case + ); + } + } + + #[test] + fn test_ignore_xhr_assets_contains() { + // Positive tests - these file types (considering case insensitivity) should be contained in the set + let positive_cases = vec!["jpg", "mp3", "WOFF", ".svg"]; + + // Negative tests - these file types should not be contained in the set + let negative_cases = vec!["randomfiletype", "xyz"]; + + for case in positive_cases { + let case_ci: CaseInsensitiveString = case.into(); + assert!( + IGNORE_XHR_ASSETS.contains(&case_ci), + "HashSet should contain: {}", + case + ); + } + + for case in negative_cases { + let case_ci: CaseInsensitiveString = case.into(); + assert!( + !IGNORE_XHR_ASSETS.contains(&case_ci), + "HashSet should not contain: {}", + case + ); + } + } + + #[test] + fn test_url_ignore_xhr_trie_contains() { + // Positive tests - these URLs should be contained in the trie + let positive_cases = vec![ + "https://play.google.com/log?", + "https://googleads.g.doubleclick.net/pagead/id", + ".doubleclick.net", + ]; + + // Negative tests - these URLs should not be contained in the trie + let negative_cases = vec![ + "https://example.com/track", + "https://anotherdomain.com/api/", + ]; + + for case in positive_cases { + assert!( + 
URL_IGNORE_XHR_TRIE.contains_prefix(case), + "Trie should contain: {}", + case + ); + } + + for case in negative_cases { + assert!( + !URL_IGNORE_XHR_TRIE.contains_prefix(case), + "Trie should not contain: {}", + case + ); + } + } +} diff --git a/spider_chrome/src/handler/mod.rs b/spider_chrome/src/handler/mod.rs index ea763fbed..116efd56f 100644 --- a/spider_chrome/src/handler/mod.rs +++ b/spider_chrome/src/handler/mod.rs @@ -1,5 +1,5 @@ +use crate::handler::blockers::intercept_manager::NetworkInterceptManager; use hashbrown::{HashMap, HashSet}; -use network::NetworkInterceptManager; use std::pin::Pin; use std::time::{Duration, Instant}; diff --git a/spider_chrome/src/handler/network.rs b/spider_chrome/src/handler/network.rs index b03434953..a278ba009 100644 --- a/spider_chrome/src/handler/network.rs +++ b/spider_chrome/src/handler/network.rs @@ -1,10 +1,11 @@ use super::blockers::{ + ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media, + intercept_manager::NetworkInterceptManager, scripts::{ - URL_IGNORE_EMBEDED_TRIE, URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, - URL_IGNORE_TRIE, + URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE, + URL_IGNORE_TRIE_PATHS, }, - xhr::{IGNORE_XHR_ASSETS, URL_IGNORE_XHR_MEDIA_TRIE, URL_IGNORE_XHR_TRIE}, - Trie, + xhr::IGNORE_XHR_ASSETS, }; use crate::auth::Credentials; use crate::cmd::CommandChain; @@ -26,8 +27,9 @@ use chromiumoxide_cdp::cdp::browser_protocol::{ use chromiumoxide_types::{Command, Method, MethodId}; use hashbrown::{HashMap, HashSet}; use lazy_static::lazy_static; +use reqwest::header::PROXY_AUTHORIZATION; +use std::collections::VecDeque; use std::time::Duration; -use std::{collections::VecDeque, default}; lazy_static! { /// allowed js frameworks and libs excluding some and adding additional URLs @@ -96,148 +98,11 @@ lazy_static! { "Ping", }; - /// Ignore list of scripts paths. 
- static ref URL_IGNORE_TRIE_PATHS: Trie = { - let mut trie = Trie::new(); - let patterns = [ - // explicit ignore tracking.js and ad files - "privacy-notice.js", - "tracking.js", - "track.js", - "ads.js", - "analytics.js", - "otSDKStub.js", - "otBannerSdk.js", - "_vercel/insights/script.js", - "analytics.", - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; - /// Case insenstive css matching pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css"); } -/// Url matches analytics that we want to ignore or trackers. -pub(crate) fn ignore_script_embedded(url: &str) -> bool { - URL_IGNORE_EMBEDED_TRIE.contains_prefix(url) -} - -/// Url matches analytics that we want to ignore or trackers. -pub(crate) fn ignore_script_xhr(url: &str) -> bool { - URL_IGNORE_XHR_TRIE.contains_prefix(url) -} - -/// Url matches media that we want to ignore. -pub(crate) fn ignore_script_xhr_media(url: &str) -> bool { - URL_IGNORE_XHR_MEDIA_TRIE.contains_prefix(url) -} - -/// Custom network intercept types to expect on a domain -#[derive(Debug, Default, Clone, Copy, serde::Serialize, serde::Deserialize, PartialEq)] -pub enum NetworkInterceptManager { - /// tiktok.com - TikTok, - /// facebook.com - Facebook, - /// amazon.com - Amazon, - /// x.com - X, - /// LinkedIn, - LinkedIn, - /// netflix.com - Netflix, - /// medium.com - Medium, - /// upwork.com, - Upwork, - /// glassdoor.com - Glassdoor, - /// ebay.com - Ebay, - /// nytimes.com - Nytimes, - /// wikipedia.com - Wikipedia, - #[default] - /// Unknown - Unknown, -} - -lazy_static! { - /// Top tier list of the most common websites visited. 
- pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 21] = [ - ("https://www.tiktok.com", NetworkInterceptManager::TikTok), - ("https://tiktok.com", NetworkInterceptManager::TikTok), - ("https://www.amazon.", NetworkInterceptManager::Amazon), - ("https://amazon.", NetworkInterceptManager::Amazon), - ("https://www.x.com", NetworkInterceptManager::X), - ("https://x.com", NetworkInterceptManager::X), - ("https://www.netflix.com", NetworkInterceptManager::Netflix), - ("https://netflix.com", NetworkInterceptManager::Netflix), - ( - "https://www.linkedin.com", - NetworkInterceptManager::LinkedIn - ), - ("https://linkedin.com", NetworkInterceptManager::LinkedIn), - ("https://www.upwork.com", NetworkInterceptManager::Upwork), - ("https://upwork.com", NetworkInterceptManager::Upwork), - ("https://www.glassdoor.", NetworkInterceptManager::Glassdoor), - ("https://glassdoor.", NetworkInterceptManager::Glassdoor), - ("https://www.medium.com", NetworkInterceptManager::Medium), - ("https://medium.com", NetworkInterceptManager::Medium), - ("https://www.ebay.", NetworkInterceptManager::Ebay), - ("https://ebay.", NetworkInterceptManager::Ebay), - ("https://www.nytimes.com", NetworkInterceptManager::Nytimes), - ("https://nytimes.com", NetworkInterceptManager::Nytimes), - ("wikipedia.org", NetworkInterceptManager::Wikipedia), - ]; -} - -/// The find type is own. -#[derive(Default, Debug, Clone, Hash, PartialEq, Eq)] -enum FindType { - #[default] - /// Starts with. - StartsWith, - /// Contains. - Contains, -} - -impl NetworkInterceptManager { - /// a custom intercept handle. 
- pub fn new(url: &str) -> NetworkInterceptManager { - TOP_TIER_LIST - .iter() - .find(|&(pattern, nm)| { - if nm.get_pattern() == FindType::StartsWith { - url.starts_with(pattern) - } else { - url.contains(pattern) - } - }) - .map(|&(_, manager_type)| manager_type) - .unwrap_or(NetworkInterceptManager::Unknown) - } - /// Setup the intercept handle - pub fn setup(&mut self, url: &str) -> Self { - NetworkInterceptManager::new(url) - } - - /// determine the pattern to use. - fn get_pattern(&self) -> FindType { - match self { - NetworkInterceptManager::Wikipedia => FindType::Contains, - _ => FindType::StartsWith, - } - } -} - #[derive(Debug)] pub struct NetworkManager { queued_events: VecDeque, @@ -336,7 +201,7 @@ impl NetworkManager { pub fn set_extra_headers(&mut self, headers: std::collections::HashMap) { self.extra_headers = headers; - self.extra_headers.remove("proxy-authorization"); + self.extra_headers.remove(PROXY_AUTHORIZATION.as_str()); if let Ok(headers) = serde_json::to_value(&self.extra_headers) { self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers))); } diff --git a/spider_chrome/src/handler/target.rs b/spider_chrome/src/handler/target.rs index ae4b52aaa..d4e669cda 100644 --- a/spider_chrome/src/handler/target.rs +++ b/spider_chrome/src/handler/target.rs @@ -18,6 +18,7 @@ use chromiumoxide_cdp::cdp::events::CdpEvent; use chromiumoxide_cdp::cdp::CdpEventMessage; use chromiumoxide_types::{Command, Method, Request, Response}; +use super::blockers::intercept_manager::NetworkInterceptManager; use crate::auth::Credentials; use crate::cdp::browser_protocol::target::CloseTargetParams; use crate::cmd::CommandChain; @@ -41,8 +42,6 @@ use chromiumoxide_cdp::cdp::js_protocol::runtime::{ }; use std::time::Duration; -use super::network::NetworkInterceptManager; - macro_rules! 
advance_state { ($s:ident, $cx:ident, $now:ident, $cmds: ident, $next_state:expr ) => {{ if let Poll::Ready(poll) = $cmds.poll($now) { diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 1a77160dc..02feaf7a9 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.21.13" +version = "2.21.15" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 8ecde0a32..02a43a835 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.21.13" +version = "2.21.15" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 228a200ab..cfcaa88f7 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.21.13" +version = "2.21.15" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 6a3315491..09155a57b 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.21.13" +version = "2.21.15" authors = [ "j-mendez " ]