From ca712b516a128ce6456252eba9f5616aa66956b1 Mon Sep 17 00:00:00 2001
From: j-mendez <jeff@spider.cloud>
Date: Thu, 12 Dec 2024 13:35:30 -0500
Subject: [PATCH] chore(chrome): add to block list

---
 Cargo.lock                                         |  12 +-
 examples/real_world.rs                             |  65 ++++---
 spider/Cargo.toml                                  |   2 +-
 spider/src/configuration.rs                        |   4 +-
 spider/src/features/chrome_common.rs               |  13 +-
 spider/src/page.rs                                 |   1 +
 spider/src/website.rs                              |   2 +-
 spider_chrome/Cargo.toml                           |   2 +-
 .../src/handler/blockers/intercept_manager.rs      | 162 +++++++++++-------
 spider_chrome/src/handler/blockers/mod.rs          |   3 +
 spider_chrome/src/handler/blockers/scripts.rs      |   3 +
 .../handler/blockers/tcgplayer_blockers.rs         |  27 +++
 spider_chrome/src/handler/blockers/xhr.rs          |   1 +
 spider_chrome/src/handler/network.rs               |  13 +-
 spider_cli/Cargo.toml                              |   2 +-
 spider_transformations/Cargo.toml                  |   2 +-
 spider_utils/Cargo.toml                            |   2 +-
 spider_worker/Cargo.toml                           |   2 +-
 18 files changed, 208 insertions(+), 110 deletions(-)
 create mode 100644 spider_chrome/src/handler/blockers/tcgplayer_blockers.rs

diff --git a/Cargo.lock b/Cargo.lock
index d8345eb93..0c310f86e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4273,7 +4273,7 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "2.21.16"
+version = "2.21.17"
 dependencies = [
  "ahash",
  "aho-corasick",
@@ -4336,7 +4336,7 @@ dependencies = [
 
 [[package]]
 name = "spider_chrome"
-version = "2.21.16"
+version = "2.21.17"
 dependencies = [
  "adblock",
  "base64 0.22.1",
@@ -4373,7 +4373,7 @@ dependencies = [
 
 [[package]]
 name = "spider_cli"
-version = "2.21.16"
+version = "2.21.17"
 dependencies = [
  "clap",
  "env_logger",
@@ -4398,7 +4398,7 @@ dependencies = [
 
 [[package]]
 name = "spider_transformations"
-version = "2.21.16"
+version = "2.21.17"
 dependencies = [
  "aho-corasick",
  "fast_html2md",
@@ -4420,7 +4420,7 @@ dependencies = [
 
 [[package]]
 name = "spider_utils"
-version = "2.21.16"
+version = "2.21.17"
 dependencies = [
  "indexmap 1.9.3",
  "serde",
@@ -4432,7 +4432,7 @@ dependencies = [
 
 [[package]]
 name = "spider_worker"
-version = "2.21.16"
+version = "2.21.17"
 dependencies = [
  "env_logger",
  "lazy_static",
diff --git a/examples/real_world.rs b/examples/real_world.rs
index faff6bacc..488875f32 100644
--- a/examples/real_world.rs
+++ b/examples/real_world.rs
@@ -1,7 +1,8 @@
-//! cargo run --example real_world --features="chrome chrome_intercept spider_utils/transformations"
+//! cargo run --example real_world --features="chrome chrome_intercept real_browser spider_utils/transformations"
 extern crate spider;
 
 use crate::spider::tokio::io::AsyncWriteExt;
+use spider::configuration::{WaitForDelay, WaitForSelector};
 use spider::tokio;
 use spider::website::Website;
 use spider::{
@@ -14,12 +15,18 @@ async fn crawl_website(url: &str) -> Result<()> {
     let mut stdout = tokio::io::stdout();
 
     let mut website: Website = Website::new(url)
-        .with_limit(1)
+        .with_limit(5)
         .with_chrome_intercept(RequestInterceptConfiguration::new(true))
         .with_wait_for_idle_network(Some(WaitForIdleNetwork::new(Some(Duration::from_millis(
-            200,
+            500,
         )))))
+        .with_subdomains(true)
+        .with_wait_for_idle_dom(Some(WaitForSelector::new(
+            Some(Duration::from_millis(100)),
+            "body".into(),
+        )))
         .with_block_assets(true)
+        // .with_wait_for_delay(Some(WaitForDelay::new(Some(Duration::from_millis(10000)))))
         .with_stealth(true)
         .with_return_page_links(true)
         .with_fingerprint(true)
@@ -30,36 +37,40 @@ async fn crawl_website(url: &str) -> Result<()> {
 
     let mut rx2 = website.subscribe(16).unwrap();
 
-    tokio::spawn(async move {
-        while let Ok(page) = rx2.recv().await {
-            let _ = stdout
-                .write_all(
-                    format!(
-                        "- {} -- Bytes transferred {:?} -- HTML Size {:?} -- Links: {:?}\n",
-                        page.get_url(),
-                        page.bytes_transferred.unwrap_or_default(),
-                        page.get_html_bytes_u8().len(),
-                        match page.page_links {
-                            Some(ref l) => l.len(),
-                            _ => 0,
-                        }
-                    )
-                    .as_bytes(),
-                )
-                .await;
-        }
-    });
-
-    let start = crate::tokio::time::Instant::now();
-    website.crawl().await;
+    let start = crate::tokio::time::Instant::now();
+
+    let (links, _) = tokio::join!(
+        async move {
+            website.crawl().await;
+            website.unsubscribe();
+            website.get_links()
+        },
+        async move {
+            while let Ok(page) = rx2.recv().await {
+                let _ = stdout
+                    .write_all(
+                        format!(
+                            "- {} -- Bytes transferred {:?} -- HTML Size {:?} -- Links: {:?}\n",
+                            page.get_url(),
+                            page.bytes_transferred.unwrap_or_default(),
+                            page.get_html_bytes_u8().len(),
+                            match page.page_links {
+                                Some(ref l) => l.len(),
+                                _ => 0,
+                            }
+                        )
+                        .as_bytes(),
+                    )
+                    .await;
+            }
+        }
+    );
 
     let duration = start.elapsed();
 
-    let links = website.get_links();
-
     println!(
         "Time elapsed in website.crawl({}) is: {:?} for total pages: {:?}",
-        website.get_url(),
+        url,
         duration,
         links.len()
     );
@@ -71,7 +82,7 @@ async fn crawl_website(url: &str) -> Result<()> {
 async fn main() -> Result<()> {
     env_logger::init();
     let _ = tokio::join!(
-        crawl_website("https://choosealicense.com"),
+        crawl_website("https://www.choosealicense.com"),
         crawl_website("https://jeffmendez.com"),
         crawl_website("https://example.com"),
     );
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 992df9431..087e50fa2 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "2.21.16"
+version = "2.21.17"
 authors = [
     "j-mendez <jeff@spider.cloud>"
 ]
diff --git a/spider/src/configuration.rs b/spider/src/configuration.rs
index 126b03ade..ff90236e3 100644
--- a/spider/src/configuration.rs
+++ b/spider/src/configuration.rs
@@ -853,7 +853,7 @@ impl Configuration {
     pub fn with_chrome_intercept(
         &mut self,
         chrome_intercept: RequestInterceptConfiguration,
-        url: &str,
+        url: &Option<Box<url::Url>>,
     ) -> &mut Self {
         self.chrome_intercept = chrome_intercept;
         self.chrome_intercept.setup_intercept_manager(url);
@@ -865,7 +865,7 @@ impl Configuration {
     pub fn with_chrome_intercept(
         &mut self,
         _chrome_intercept: RequestInterceptConfiguration,
-        _url: &str,
+        _url: &Option<Box<url::Url>>,
     ) -> &mut Self {
         self
     }
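[Both `with_chrome_intercept` variants above now receive the crawl's parsed URL as `&Option<Box<url::Url>>`, so the intercept manager is resolved from the parsed domain rather than by `&str` prefix matching. A minimal caller-side sketch, assuming the `spider` crate with the `chrome` and `chrome_intercept` features enabled; the target URL is illustrative:

    use spider::features::chrome_common::RequestInterceptConfiguration;
    use spider::website::Website;

    fn main() {
        // Website::with_chrome_intercept forwards self.domain_parsed internally,
        // so the blocker profile (e.g. Tcgplayer) is chosen by base domain.
        let mut website: Website = Website::new("https://market.tcgplayer.com");
        website.with_chrome_intercept(RequestInterceptConfiguration::new(true));
    }
]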
diff --git a/spider/src/features/chrome_common.rs b/spider/src/features/chrome_common.rs
index cdd28bbb7..ec8da2eef 100644
--- a/spider/src/features/chrome_common.rs
+++ b/spider/src/features/chrome_common.rs
@@ -16,13 +16,9 @@ pub enum NetworkInterceptManager {
 #[cfg(not(feature = "chrome"))]
 impl NetworkInterceptManager {
     /// a custom intercept handle.
-    pub fn new(_url: &str) -> NetworkInterceptManager {
+    pub fn new(_url: &Option<Box<url::Url>>) -> NetworkInterceptManager {
         NetworkInterceptManager::Unknown
     }
-    /// Setup the intercept handle
-    pub fn setup(&mut self, url: &str) -> Self {
-        NetworkInterceptManager::new(url)
-    }
 }
 
 #[derive(Debug, Default, Clone, PartialEq)]
@@ -682,7 +678,10 @@ impl RequestInterceptConfiguration {
         }
     }
 
     /// Setup a new intercept config with a custom intercept manager.
-    pub fn new_manager(enabled: bool, url: &str) -> RequestInterceptConfiguration {
+    pub fn new_manager(
+        enabled: bool,
+        url: &Option<Box<url::Url>>,
+    ) -> RequestInterceptConfiguration {
         RequestInterceptConfiguration {
             enabled,
             block_javascript: false,
@@ -695,7 +694,7 @@ impl RequestInterceptConfiguration {
     }
 
     /// Setup the network request manager type.
-    pub fn setup_intercept_manager(&mut self, url: &str) {
+    pub fn setup_intercept_manager(&mut self, url: &Option<Box<url::Url>>) {
         self.intercept_manager = NetworkInterceptManager::new(url);
     }
 
diff --git a/spider/src/page.rs b/spider/src/page.rs
index 978ab62ca..bfde640ad 100644
--- a/spider/src/page.rs
+++ b/spider/src/page.rs
@@ -229,6 +229,7 @@ pub fn push_link<A: PartialEq + Eq + std::hash::Hash + From<String>>(
 ) {
     if let Some(b) = base {
         let mut abs = convert_abs_path(b, href);
+        let new_page = abs != **b;
 
         if let Some(link_map) = links_pages {
diff --git a/spider/src/website.rs b/spider/src/website.rs
index f168d16eb..7a19898e5 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -4046,7 +4046,7 @@ impl Website {
         chrome_intercept: RequestInterceptConfiguration,
     ) -> &mut Self {
         self.configuration
-            .with_chrome_intercept(chrome_intercept, &self.url);
+            .with_chrome_intercept(chrome_intercept, &self.domain_parsed);
         self
     }
 
diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml
index 53287fdf1..ead6166eb 100644
--- a/spider_chrome/Cargo.toml
+++ b/spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_chrome"
-version = "2.21.16"
+version = "2.21.17"
 rust-version = "1.70"
 authors = [
     "j-mendez <jeff@spider.cloud>"
 ]
diff --git a/spider_chrome/src/handler/blockers/intercept_manager.rs b/spider_chrome/src/handler/blockers/intercept_manager.rs
index fdc8f239e..6f238f663 100644
--- a/spider_chrome/src/handler/blockers/intercept_manager.rs
+++ b/spider_chrome/src/handler/blockers/intercept_manager.rs
@@ -1,3 +1,5 @@
+use phf::phf_map;
+
 /// Custom network intercept types to expect on a domain
 #[derive(Debug, Default, Clone, Copy, serde::Serialize, serde::Deserialize, PartialEq)]
 pub enum NetworkInterceptManager {
@@ -25,76 +27,116 @@ pub enum NetworkInterceptManager {
     Nytimes,
     /// wikipedia.com
     Wikipedia,
+    /// tcgplayer.com
+    Tcgplayer,
     #[default]
     /// Unknown
     Unknown,
 }
 
-lazy_static::lazy_static! {
-    /// Top tier list of the most common websites visited.
-    pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 21] = [
-        ("https://www.tiktok.com", NetworkInterceptManager::TikTok),
-        ("https://tiktok.com", NetworkInterceptManager::TikTok),
-        ("https://www.amazon.", NetworkInterceptManager::Amazon),
-        ("https://amazon.", NetworkInterceptManager::Amazon),
-        ("https://www.x.com", NetworkInterceptManager::X),
-        ("https://x.com", NetworkInterceptManager::X),
-        ("https://www.netflix.com", NetworkInterceptManager::Netflix),
-        ("https://netflix.com", NetworkInterceptManager::Netflix),
-        (
-            "https://www.linkedin.com",
-            NetworkInterceptManager::LinkedIn
-        ),
-        ("https://linkedin.com", NetworkInterceptManager::LinkedIn),
-        ("https://www.upwork.com", NetworkInterceptManager::Upwork),
-        ("https://upwork.com", NetworkInterceptManager::Upwork),
-        ("https://www.glassdoor.", NetworkInterceptManager::Glassdoor),
-        ("https://glassdoor.", NetworkInterceptManager::Glassdoor),
-        ("https://www.medium.com", NetworkInterceptManager::Medium),
-        ("https://medium.com", NetworkInterceptManager::Medium),
-        ("https://www.ebay.", NetworkInterceptManager::Ebay),
-        ("https://ebay.", NetworkInterceptManager::Ebay),
-        ("https://www.nytimes.com", NetworkInterceptManager::Nytimes),
-        ("https://nytimes.com", NetworkInterceptManager::Nytimes),
-        ("wikipedia.org", NetworkInterceptManager::Wikipedia),
-    ];
-}
-
-/// The find type is own.
-#[derive(Default, Debug, Clone, Hash, PartialEq, Eq)]
-enum FindType {
-    #[default]
-    /// Starts with.
-    StartsWith,
-    /// Contains.
-    Contains,
-}
+// Top tier domain list of the most commonly visited websites.
+static DOMAIN_MAP: phf::Map<&'static str, NetworkInterceptManager> = phf_map! {
+    "tiktok.com" => NetworkInterceptManager::TikTok,
+    "facebook.com" => NetworkInterceptManager::Facebook,
+    "amazon.com" => NetworkInterceptManager::Amazon,
+    "x.com" => NetworkInterceptManager::X,
+    "linkedin.com" => NetworkInterceptManager::LinkedIn,
+    "netflix.com" => NetworkInterceptManager::Netflix,
+    "medium.com" => NetworkInterceptManager::Medium,
+    "upwork.com" => NetworkInterceptManager::Upwork,
+    "glassdoor.com" => NetworkInterceptManager::Glassdoor,
+    "ebay.com" => NetworkInterceptManager::Ebay,
+    "nytimes.com" => NetworkInterceptManager::Nytimes,
+    "wikipedia.org" => NetworkInterceptManager::Wikipedia,
+    "tcgplayer.com" => NetworkInterceptManager::Tcgplayer,
+};
 
 impl NetworkInterceptManager {
-    /// a custom intercept handle.
-    pub fn new(url: &str) -> NetworkInterceptManager {
-        TOP_TIER_LIST
-            .iter()
-            .find(|&(pattern, nm)| {
-                if nm.get_pattern() == FindType::StartsWith {
-                    url.starts_with(pattern)
+    pub fn new(url: &Option<Box<url::Url>>) -> NetworkInterceptManager {
+        if let Some(parsed_url) = url {
+            if let Some(domain) = parsed_url.domain() {
+                // Map entries are base domains of at most two labels, so reduce
+                // the parsed domain before the lookup; a second pass can handle
+                // deeper matches if the list ever needs them.
+                let domain_parts: Vec<&str> = domain.split('.').collect();
+
+                let base_domain = if domain_parts.len() > 2 {
+                    format!(
+                        "{}.{}",
+                        domain_parts[domain_parts.len() - 2],
+                        domain_parts[domain_parts.len() - 1]
+                    )
                 } else {
-                    url.contains(pattern)
-                }
-            })
-            .map(|&(_, manager_type)| manager_type)
-            .unwrap_or(NetworkInterceptManager::Unknown)
+                    domain.to_string()
+                };
+
+                return *DOMAIN_MAP
+                    .get(&base_domain)
+                    .unwrap_or(&NetworkInterceptManager::Unknown);
+            }
+        }
+        NetworkInterceptManager::Unknown
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use url::Url;
+
+    // Helper function to create an Option<Box<Url>> from a string.
+    fn create_url(url: &str) -> Option<Box<Url>> {
+        Url::parse(url).ok().map(Box::new)
     }
-    /// Setup the intercept handle
-    pub fn setup(&mut self, url: &str) -> Self {
-        NetworkInterceptManager::new(url)
+
+    #[test]
+    fn test_known_domains() {
+        let cases = vec![
+            ("http://www.tiktok.com", NetworkInterceptManager::TikTok),
+            ("https://facebook.com", NetworkInterceptManager::Facebook),
+            ("https://www.amazon.com", NetworkInterceptManager::Amazon),
+            ("https://subdomain.x.com", NetworkInterceptManager::X),
+            (
+                "https://linkedin.com/in/someone",
+                NetworkInterceptManager::LinkedIn,
+            ),
+            (
+                "https://www.netflix.com/browse",
+                NetworkInterceptManager::Netflix,
+            ),
+            ("https://medium.com", NetworkInterceptManager::Medium),
+            ("https://sub.upwork.com", NetworkInterceptManager::Upwork),
+            ("https://glassdoor.com", NetworkInterceptManager::Glassdoor),
+            ("https://ebay.com", NetworkInterceptManager::Ebay),
+            (
+                "https://nytimes.com/section/world",
+                NetworkInterceptManager::Nytimes,
+            ),
+            (
+                "https://en.wikipedia.org/wiki/Rust",
+                NetworkInterceptManager::Wikipedia,
+            ),
+            (
+                "https://market.tcgplayer.com",
+                NetworkInterceptManager::Tcgplayer,
+            ),
+        ];
+
+        for (url, expected) in cases {
+            assert_eq!(NetworkInterceptManager::new(&create_url(url)), expected);
+        }
     }
-    /// determine the pattern to use.
-    fn get_pattern(&self) -> FindType {
-        match self {
-            NetworkInterceptManager::Wikipedia => FindType::Contains,
-            _ => FindType::StartsWith,
+
+    #[test]
+    fn test_unknown_domains() {
+        let cases = vec![
+            "https://www.unknown.com",
+            "http://subdomain.randomstuff.org",
+            "https://notindatabase.co.uk",
+            "https://another.unknown.site",
+        ];
+
+        for url in cases {
+            assert_eq!(
+                NetworkInterceptManager::new(&create_url(url)),
+                NetworkInterceptManager::Unknown
+            );
         }
     }
-}
+
+    #[test]
+    fn test_invalid_urls() {
+        let cases = vec![
+            "not-a-url",
+            "ftp://invalid.protocol.com",
+            "http://",
+            "",
+        ];
+
+        for url in cases {
+            // These either fail to parse (create_url yields None) or carry no
+            // mapped base domain, so the lookup falls back to Unknown.
+            assert_eq!(
+                NetworkInterceptManager::new(&create_url(url)),
+                NetworkInterceptManager::Unknown
+            );
+        }
+    }
+}
\ No newline at end of file
diff --git a/spider_chrome/src/handler/blockers/mod.rs b/spider_chrome/src/handler/blockers/mod.rs
index c923c7d8f..694e65fc9 100644
--- a/spider_chrome/src/handler/blockers/mod.rs
+++ b/spider_chrome/src/handler/blockers/mod.rs
@@ -18,6 +18,8 @@ pub mod netflix_blockers;
 pub mod nytimes_blockers;
 /// script blockers
 pub mod scripts;
+/// block tcgplayer.com
+pub mod tcgplayer_blockers;
 /// tiktok blockers
 pub mod tiktok_blockers;
 /// upwork blockers
@@ -26,6 +28,7 @@ pub mod upwork_blockers;
 pub mod wikipedia_blockers;
 /// x blockers
 pub mod x_blockers;
+
 /// xhr blockers
 pub mod xhr;
 
diff --git a/spider_chrome/src/handler/blockers/scripts.rs b/spider_chrome/src/handler/blockers/scripts.rs
index 080d4bff7..67bbe337d 100644
--- a/spider_chrome/src/handler/blockers/scripts.rs
+++ b/spider_chrome/src/handler/blockers/scripts.rs
@@ -92,6 +92,7 @@ lazy_static::lazy_static! {
         "https://assets.adobedtm.com/extensions/",
         "https://macro.adnami.io/macro/spec/adsm.macro.",
         "https://log.medietall.no/analytics.js",
+        "https://cdn.siftscience.com/s.js",
         "https://lwadm.com/lw/pbjs?",
         "https://cl.k5a.io/",
         "https://cdn-cookieyes.com/",
@@ -139,6 +140,7 @@ lazy_static::lazy_static! {
     let patterns = [
         "https://www.youtube.com/embed/",          // YouTube video embeds
         "https://www.google.com/maps/embed?",      // Google Maps embeds
+        "https://maps.google.com",                 // Google maps iframe.
         "https://player.vimeo.com/video/",         // Vimeo video embeds
         "https://player.vimeo.com/api/player.js",  // Vimeo video embeds
         "https://open.spotify.com/embed/",         // Spotify music embeds
@@ -193,6 +195,7 @@ lazy_static::lazy_static! {
         ".googlesyndication.com/safeframe/",
         // repeat consent js
         "/ccpa/user-consent.min.js",
+        "consent-manager",
         "/cookiebanner/js/",
         "cookielaw.org",
         // privacy
diff --git a/spider_chrome/src/handler/blockers/tcgplayer_blockers.rs b/spider_chrome/src/handler/blockers/tcgplayer_blockers.rs
new file mode 100644
index 000000000..bdf79b05d
--- /dev/null
+++ b/spider_chrome/src/handler/blockers/tcgplayer_blockers.rs
@@ -0,0 +1,27 @@
+use crate::handler::blockers::Trie;
+
+lazy_static::lazy_static! {
+    /// Ignore list of urls.
+    static ref URL_IGNORE_TRIE: Trie = {
+        let mut trie = Trie::new();
+        let patterns = [
+            "https://data.tcgplayer.com/suggestions/trending",
+            "https://mpapi.tcgplayer.com/v2/kickbacks?active=true",
+            "https://homepage.marketplace.tcgplayer.com/sitealert.json",
+            "https://infinite-api.tcgplayer.com/signup/?",
+            "https://features.tcgplayer.com/v1/optimizely/Variation/",
+            "https://mpapi.tcgplayer.com/v2/address/countryCodes?mpfev=3031"
+        ];
+        for pattern in &patterns {
+            trie.insert(pattern);
+        }
+        trie
+    };
+}
+
+/// Block tcgplayer events that are not required.
+pub fn block_tcgplayer(
+    event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused,
+) -> bool {
+    URL_IGNORE_TRIE.contains_prefix(&event.request.url)
+}
diff --git a/spider_chrome/src/handler/blockers/xhr.rs b/spider_chrome/src/handler/blockers/xhr.rs
index 28e15e351..90acf9f49 100644
--- a/spider_chrome/src/handler/blockers/xhr.rs
+++ b/spider_chrome/src/handler/blockers/xhr.rs
@@ -87,6 +87,7 @@ lazy_static::lazy_static! {
         "https://events.api.secureserver.net/",
         "https://csp.secureserver.net/eventbus",
         "https://doh.cq0.co/resolve",
+        "https://cdn.segment.",
         ".wixapps.net/api/v1/bulklog",
         "https://error-analytics-sessions-production.shopifysvc.com/",
         "https://static-forms.",
diff --git a/spider_chrome/src/handler/network.rs b/spider_chrome/src/handler/network.rs
index a278ba009..6d56f4d46 100644
--- a/spider_chrome/src/handler/network.rs
+++ b/spider_chrome/src/handler/network.rs
@@ -459,6 +459,9 @@ impl NetworkManager {
                     NetworkInterceptManager::Wikipedia => {
                         super::blockers::wikipedia_blockers::block_wikipedia(event)
                     }
+                    NetworkInterceptManager::Tcgplayer => {
+                        super::blockers::tcgplayer_blockers::block_tcgplayer(event)
+                    }
                     NetworkInterceptManager::Nytimes => {
                         super::blockers::nytimes_blockers::block_nytimes(
                             event,
@@ -484,6 +487,11 @@ impl NetworkManager {
                 };
 
                 if skip_networking {
+                    tracing::debug!(
+                        "Blocked: {:?} - {}",
+                        event.resource_type,
+                        event.request.url
+                    );
                     let fullfill_params =
                         crate::handler::network::fetch::FulfillRequestParams::new(
                             event.request_id.clone(),
@@ -492,7 +500,7 @@ impl NetworkManager {
                     self.push_cdp_request(fullfill_params);
                 } else {
                     tracing::debug!(
-                        "Network Allowed: {:?} - {}",
+                        "Allowed: {:?} - {}",
                         event.resource_type,
                         event.request.url
                     );
@@ -593,6 +601,9 @@ impl NetworkManager {
                     NetworkInterceptManager::LinkedIn => {
                         super::blockers::linkedin_blockers::block_linkedin(event)
                     }
+                    NetworkInterceptManager::Tcgplayer => {
+                        super::blockers::tcgplayer_blockers::block_tcgplayer(event)
+                    }
                     NetworkInterceptManager::Medium => {
                         super::blockers::medium_blockers::block_medium(event)
                     }
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index e2f22f734..f08e892e6 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "2.21.16"
+version = "2.21.17"
 authors = [
     "j-mendez <jeff@spider.cloud>"
 ]
diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml
index d0e8b1055..86c0e5442 100644
--- a/spider_transformations/Cargo.toml
+++ b/spider_transformations/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_transformations"
-version = "2.21.16"
+version = "2.21.17"
 authors = [
     "j-mendez <jeff@spider.cloud>"
 ]
diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml
index 172fce90e..e4a51132c 100644
--- a/spider_utils/Cargo.toml
+++ b/spider_utils/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_utils"
-version = "2.21.16"
+version = "2.21.17"
 authors = [
     "j-mendez <jeff@spider.cloud>"
 ]
diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml
index 97f14725a..4d9e993d2 100644
--- a/spider_worker/Cargo.toml
+++ b/spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "2.21.16"
+version = "2.21.17"
 authors = [
     "j-mendez <jeff@spider.cloud>"
 ]
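[`NetworkInterceptManager::new` in this patch reduces a host to its last two labels before the phf lookup, so any subdomain of a listed site selects the same blocker profile. A standalone sketch of that reduction, assuming only the `url` crate; `base_domain` is a hypothetical helper mirroring the patch's logic, not part of the patch itself:

    use url::Url;

    // Reduce a host to its base domain (last two labels), i.e. the key
    // checked against the DOMAIN_MAP phf table.
    fn base_domain(url: &Url) -> Option<String> {
        let domain = url.domain()?;
        let parts: Vec<&str> = domain.split('.').collect();
        Some(if parts.len() > 2 {
            format!("{}.{}", parts[parts.len() - 2], parts[parts.len() - 1])
        } else {
            domain.to_string()
        })
    }

    fn main() {
        let url = Url::parse("https://market.tcgplayer.com/product/1").unwrap();
        // "market.tcgplayer.com" reduces to "tcgplayer.com".
        assert_eq!(base_domain(&url).as_deref(), Some("tcgplayer.com"));
    }

Note that the two-label heuristic treats multi-part suffixes such as .co.uk as base domains (e.g. notindatabase.co.uk reduces to co.uk); that is harmless here because the map only contains two-label entries, which is exactly what test_unknown_domains exercises.]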