diff --git a/Cargo.lock b/Cargo.lock index 3e6cc1fdf..5b591289c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5029,7 +5029,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.21.21" +version = "2.21.23" dependencies = [ "ahash", "aho-corasick", @@ -5092,9 +5092,10 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.21.21" +version = "2.21.23" dependencies = [ "adblock", + "aho-corasick", "base64 0.22.1", "bytes", "case_insensitive_string", @@ -5181,7 +5182,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.21.21" +version = "2.21.23" dependencies = [ "clap", "env_logger", @@ -5206,7 +5207,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.21.21" +version = "2.21.23" dependencies = [ "aho-corasick", "fast_html2md", @@ -5228,7 +5229,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.21.21" +version = "2.21.23" dependencies = [ "indexmap 1.9.3", "serde", @@ -5240,7 +5241,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.21.21" +version = "2.21.23" dependencies = [ "env_logger", "lazy_static", diff --git a/examples/real_world.rs b/examples/real_world.rs index 488875f32..5c69b41cb 100644 --- a/examples/real_world.rs +++ b/examples/real_world.rs @@ -1,4 +1,4 @@ -//! cargo run --example real_world --features="chrome chrome_intercept real_browser spider_utils/transformations" +//! 
cargo run --example real_world --features="chrome chrome_intercept real_browser" extern crate spider; use crate::spider::tokio::io::AsyncWriteExt; @@ -13,10 +13,13 @@ use std::time::Duration; async fn crawl_website(url: &str) -> Result<()> { let mut stdout = tokio::io::stdout(); + let mut interception = RequestInterceptConfiguration::new(true); + + interception.block_javascript = true; let mut website: Website = Website::new(url) .with_limit(5) - .with_chrome_intercept(RequestInterceptConfiguration::new(true)) + .with_chrome_intercept(interception) .with_wait_for_idle_network(Some(WaitForIdleNetwork::new(Some(Duration::from_millis( 500, ))))) @@ -82,7 +85,7 @@ async fn crawl_website(url: &str) -> Result<()> { async fn main() -> Result<()> { env_logger::init(); let _ = tokio::join!( - crawl_website("https://www.choosealicense.com"), + crawl_website("https://choosealicense.com"), crawl_website("https://jeffmendez.com"), crawl_website("https://example.com"), ); diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 0cf413d44..7b8d2929e 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.21.21" +version = "2.21.23" authors = [ "j-mendez " ] diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index b78437afc..8d0bee436 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.21.21" +version = "2.21.23" rust-version = "1.70" authors = [ "j-mendez " @@ -53,6 +53,7 @@ adblock = { version = "0.9", optional = true, default-features = false, features rand = "0.8" case_insensitive_string = { version = "0.2", features = ["compact", "serde"] } hashbrown = { version = "0.15", default-features = true } +aho-corasick = "1" [target.'cfg(windows)'.dependencies] winreg = "0.52" diff --git a/spider_chrome/src/handler/blockers/xhr.rs b/spider_chrome/src/handler/blockers/xhr.rs index 90acf9f49..e5a1b8f1f 100644 --- 
a/spider_chrome/src/handler/blockers/xhr.rs +++ b/spider_chrome/src/handler/blockers/xhr.rs @@ -95,6 +95,7 @@ lazy_static::lazy_static! { // video embeddings "https://video.squarespace-cdn.com/content/", "https://bes.gcp.data.bigcommerce.com/nobot", + "https://www.youtube.com/youtubei/", "http://ec.editmysite.com", "https://dcinfos-cache.abtasty.com/", "https://featureassets.org/", diff --git a/spider_chrome/src/handler/network.rs b/spider_chrome/src/handler/network.rs index 8f53b4470..f91165bcb 100644 --- a/spider_chrome/src/handler/network.rs +++ b/spider_chrome/src/handler/network.rs @@ -10,6 +10,7 @@ use super::blockers::{ use crate::auth::Credentials; use crate::cmd::CommandChain; use crate::handler::http::HttpRequest; +use aho_corasick::AhoCorasick; use case_insensitive_string::CaseInsensitiveString; use chromiumoxide_cdp::cdp::browser_protocol::fetch::{ self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams, @@ -32,23 +33,32 @@ use std::collections::VecDeque; use std::time::Duration; lazy_static! { - /// allowed js frameworks and libs excluding some and adding additional URLs - pub static ref JS_FRAMEWORK_ALLOW: phf::Set<&'static str> = { - phf::phf_set! 
{ - // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones - "jquery.min.js", "jquery.qtip.min.js", "jquery.js", "angular.js", "jquery.slim.js", - "react.development.js", "react-dom.development.js", "react.production.min.js", - "react-dom.production.min.js", "vue.global.js", "vue.esm-browser.js", "vue.js", - "bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js", - "d3.js", "lodash.min.js", "lodash.js", - "app.js", "main.js", "index.js", "bundle.js", "vendor.js", - // Verified 3rd parties for request - "https://m.stripe.network/inner.html", - "https://m.stripe.network/out-4.5.43.js", - "https://challenges.cloudflare.com/turnstile", - "https://js.stripe.com/v3/" - } - }; + /// General patterns for popular libraries and resources + static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![ + "jquery", // Covers jquery.min.js, jquery.js, etc. + "angular", + "react", // Covers all React-related patterns + "vue", // Covers all Vue-related patterns + "bootstrap", + "d3", + "lodash", + "ajax", + "app", // Covers general app scripts like app.js + "main", + "index", + "bundle", + "vendor", + "/wp-content/js/", // Covers word press content + // Verified 3rd parties for request + "https://m.stripe.network/", + "https://challenges.cloudflare.com/", + "https://js.stripe.com/", + "https://cdn.prod.website-files.com/", // webflow cdn scripts + "https://cdnjs.cloudflare.com/" // cloudflare cdn scripts + ]; + + /// Create a static AhoCorasick matcher based on the allowed list + static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).unwrap(); /// path of a js framework pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = { @@ -400,7 +410,7 @@ impl NetworkManager { && ResourceType::Stylesheet == event.resource_type || self.block_javascript && javascript_resource - && !JS_FRAMEWORK_ALLOW.contains(current_url) + && !ALLOWED_MATCHER.is_match(current_url) } else { skip_networking }; @@ -502,7 +512,7 @@ impl 
NetworkManager { && ResourceType::Stylesheet == event.resource_type || self.block_javascript && javascript_resource - && !JS_FRAMEWORK_ALLOW.contains(current_url) + && !ALLOWED_MATCHER.is_match(current_url) } else { skip_networking }; diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 29d1b2d87..91d71a1c7 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.21.21" +version = "2.21.23" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index fd3205ef6..b3235994a 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.21.21" +version = "2.21.23" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index d18addeec..8499c00df 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.21.21" +version = "2.21.23" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 4babe05aa..467fbeadb 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.21.21" +version = "2.21.23" authors = [ "j-mendez " ]