From 6e14f1c8b3c3f75e000073d7a94836bba2b2cba5 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Sat, 14 Dec 2024 06:40:21 -0500 Subject: [PATCH] chore(page): fix block asset handling --- Cargo.lock | 12 +- examples/real_world.rs | 4 +- spider/Cargo.toml | 2 +- spider/src/page.rs | 119 +++++++++--------- spider/src/utils/abs.rs | 7 +- spider_chrome/Cargo.toml | 2 +- spider_chrome/src/handler/blockers/scripts.rs | 2 +- spider_chrome/src/handler/blockers/xhr.rs | 1 + spider_chrome/src/handler/network.rs | 4 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 13 files changed, 86 insertions(+), 75 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c9c9f9612..4fa0598e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5029,7 +5029,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.21.26" +version = "2.21.27" dependencies = [ "ahash", "aho-corasick", @@ -5092,7 +5092,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.21.26" +version = "2.21.27" dependencies = [ "adblock", "aho-corasick", @@ -5182,7 +5182,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.21.26" +version = "2.21.27" dependencies = [ "clap", "env_logger", @@ -5207,7 +5207,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.21.26" +version = "2.21.27" dependencies = [ "aho-corasick", "fast_html2md", @@ -5229,7 +5229,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.21.26" +version = "2.21.27" dependencies = [ "indexmap 1.9.3", "serde", @@ -5241,7 +5241,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.21.26" +version = "2.21.27" dependencies = [ "env_logger", "lazy_static", diff --git a/examples/real_world.rs b/examples/real_world.rs index 5c69b41cb..5b8acdb8e 100644 --- a/examples/real_world.rs +++ b/examples/real_world.rs @@ -1,4 +1,4 @@ -//! 
cargo run --example real_world --features="chrome chrome_intercept real_browser" +//! cargo run --example real_world --features="chrome chrome_intercept real_browser spider_utils/transformations" extern crate spider; use crate::spider::tokio::io::AsyncWriteExt; @@ -8,6 +8,7 @@ use spider::website::Website; use spider::{ configuration::WaitForIdleNetwork, features::chrome_common::RequestInterceptConfiguration, }; + use std::io::Result; use std::time::Duration; @@ -23,7 +24,6 @@ async fn crawl_website(url: &str) -> Result<()> { .with_wait_for_idle_network(Some(WaitForIdleNetwork::new(Some(Duration::from_millis( 500, ))))) - .with_subdomains(true) .with_wait_for_idle_dom(Some(WaitForSelector::new( Some(Duration::from_millis(100)), "body".into(), diff --git a/spider/Cargo.toml b/spider/Cargo.toml index e0c8e3a6c..38c853dc3 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.21.26" +version = "2.21.27" authors = [ "j-mendez " ] diff --git a/spider/src/page.rs b/spider/src/page.rs index bfde640ad..a28402400 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -31,20 +31,6 @@ lazy_static! { static ref GATSBY: Option = Some("gatsby-chunk-mapping".into()); } -#[cfg(any(feature = "smart", feature = "chrome_intercept"))] -lazy_static! { - /// popular js frameworks and libs - pub static ref JS_FRAMEWORK_ASSETS: phf::Set<&'static str> = { - phf::phf_set! 
{ - "jquery.min.js", "jquery.qtip.min.js", "jquery.js", "angular.js", "jquery.slim.js", "react.development.js", "react-dom.development.js", "react.production.min.js", "react-dom.production.min.js", - "vue.global.js", "vue.global.prod.js", "vue.runtime.", "vue.esm-browser.js", "vue.js", "bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js", "d3.js", "material-components-web.min.js", - "otSDKStub.js", "clipboard.min.js", "moment.js", "moment.min.js", "dexie.js", "layui.js", ".js?meteor_js_resource=true", "lodash.min.js", "lodash.js", - // possible js that could be critical. - "app.js", "main.js", "index.js", "bundle.js", "vendor.js", - } - }; -} - #[cfg(all( not(feature = "decentralized"), not(feature = "full_resources"), @@ -70,40 +56,56 @@ lazy_static! { }; } -#[cfg(any(feature = "chrome_intercept"))] -lazy_static! { - /// allowed js frameworks and libs excluding some and adding additional URLs. - pub static ref JS_FRAMEWORK_ALLOW: phf::Set<&'static str> = { - phf::phf_set! { - // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones - "jquery.min.js", "jquery.qtip.min.js", "jquery.js", "angular.js", "jquery.slim.js", - "react.development.js", "react-dom.development.js", "react.production.min.js", - "react-dom.production.min.js", "vue.global.js", "vue.global.prod.js", "vue.esm-browser.js", "vue.js", - "bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js", ".js?meteor_js_resource=true", - "d3.js", "layui.js", "lodash.min.js", "lodash.js", - "app.js", "main.js", "index.js", "bundle.js", "vendor.js", - // Verified 3rd parties for request - "https://m.stripe.network/inner.html", - "https://m.stripe.network/out-4.5.43.js", - "https://challenges.cloudflare.com/turnstile", - "https://js.stripe.com/v3/" - } - }; -} - lazy_static! { - /// include only list of resources - pub(crate) static ref ONLY_RESOURCES: HashSet = { - let mut m: HashSet = HashSet::with_capacity(28); + /// Visual assets to ignore. 
+ pub(crate) static ref IGNORE_ASSETS: HashSet = { + let mut m: HashSet = HashSet::with_capacity(62); m.extend([ - "html", "htm", "shtml", "asp", "aspx", "php", "jps", "jpsx", "jsp", "cfm", "xhtml", "rhtml", "phtml", "erb", - // handle .. prefix for urls ending with an extra ending - ".html", ".htm", ".shtml", ".asp", ".aspx", ".php", ".jps", ".jpsx", ".jsp", ".cfm", ".xhtml", ".rhtml", ".phtml", ".erb", + "jpg", "jpeg", "png", "gif", "svg", "webp", // Image files + "mp4", "avi", "mov", "wmv", "flv", // Video files + "mp3", "wav", "ogg", // Audio files + "woff", "woff2", "ttf", "otf", // Font files + "swf", "xap", // Flash/Silverlight files + "ico", "eot", // Other resource files + "bmp", "tiff", "tif", "heic", "heif", // Additional Image files + "mkv", "webm", "m4v", // Additional Video files + "aac", "flac", "m4a", "aiff", // Additional Audio files + "pdf", "eps", // Other additional files + + // Including extensions with extra dot + ".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp", + ".mp4", ".avi", ".mov", ".wmv", ".flv", + ".mp3", ".wav", ".ogg", + ".woff", ".woff2", ".ttf", ".otf", + ".swf", ".xap", + ".ico", ".eot", + ".bmp", ".tiff", ".tif", ".heic", ".heif", + ".mkv", ".webm", ".m4v", + ".aac", ".flac", ".m4a", ".aiff", + ".pdf", ".eps" ].map(|s| s.into())); m }; + + /// The chunk size for the rewriter. Can be adjusted using the env var "SPIDER_STREAMING_CHUNK_SIZE". + pub(crate) static ref STREAMING_CHUNK_SIZE: usize = { + const DEFAULT_STREAMING_CHUNK_SIZE: usize = 8192; + const MIN_STREAMING_CHUNK_SIZE: usize = DEFAULT_STREAMING_CHUNK_SIZE / 4; + + std::env::var("SPIDER_STREAMING_CHUNK_SIZE") + .ok() + .and_then(|val| val.parse::().ok()) + .map(|val| { + if val < MIN_STREAMING_CHUNK_SIZE { + MIN_STREAMING_CHUNK_SIZE + } else { + val + } + }) + .unwrap_or(DEFAULT_STREAMING_CHUNK_SIZE) + }; } /// The AI data returned from a GPT. 
@@ -240,8 +242,10 @@ pub fn push_link>( if new_page { let scheme = abs.scheme(); + if scheme == "https" || scheme == "http" { let host_name = abs.host_str(); + let mut can_process = parent_host_match( host_name, base_domain, @@ -264,6 +268,7 @@ pub fn push_link>( let hchars = abs.path(); + // check if the file is a resource and block if it is if let Some(position) = hchars.rfind('.') { let hlen = hchars.len(); let has_asset = hlen - position; @@ -272,7 +277,7 @@ pub fn push_link>( let next_position = position + 1; if !full_resources - && !ONLY_RESOURCES.contains::( + && IGNORE_ASSETS.contains::( &hchars[next_position..].into(), ) { @@ -1342,8 +1347,7 @@ impl Page { lol_html::send::HtmlRewriter::new(rewriter_settings, |_c: &[u8]| {}); let html_bytes = html.as_bytes(); - let chunk_size = 8192; - let chunks = html_bytes.chunks(chunk_size); + let chunks = html_bytes.chunks(*STREAMING_CHUNK_SIZE); let mut stream = tokio_stream::iter(chunks).map(Ok::<&[u8], A>); @@ -1394,6 +1398,7 @@ impl Page { } else { None }; + if !html.is_empty() { if html.starts_with("); @@ -1677,9 +1681,7 @@ impl Page { abs.path_segments().ok_or_else(|| "cannot be base") { while let Some(p) = paths.next() { - // todo: get the path last before None instead of checking for ends_with - if p.ends_with(".js") - && JS_FRAMEWORK_ASSETS.contains(&p) + if chromiumoxide::handler::network::ALLOWED_MATCHER.is_match(&p) { rerender.swap(true, Ordering::Relaxed); } @@ -1731,8 +1733,7 @@ impl Page { }); let html_bytes = html_resource.as_bytes(); - let chunk_size = 8192; - let chunks = html_bytes.chunks(chunk_size); + let chunks = html_bytes.chunks(*STREAMING_CHUNK_SIZE); let mut wrote_error = false; let mut stream = tokio_stream::iter(chunks).map(Ok::<&[u8], A>); @@ -1804,14 +1805,14 @@ impl Page { true, &Some(crate::configuration::WaitFor::new( Some( - core::time::Duration::from_secs(120), // default a duration for smart handling. (maybe expose later on.) 
core::time::Duration::from_secs(60), // default a duration for smart handling. (maybe expose later on.) ), None, true, true, None, Some(crate::configuration::WaitForSelector::new( - Some(core::time::Duration::from_millis(500)), + Some(core::time::Duration::from_millis(250)), "body".into(), )), )), @@ -1827,7 +1828,14 @@ impl Page { .await; if let Some(h) = intercept_handle { - let _ = h.await; + let abort_handle = h.abort_handle(); + if let Err(elapsed) = + tokio::time::timeout(tokio::time::Duration::from_secs(10), h) + .await + { + log::warn!("Handler timeout exceeded {elapsed}"); + abort_handle.abort(); + } } if let Ok(resource) = page_resource { @@ -1946,8 +1954,7 @@ impl Page { let mut rewriter = lol_html::send::HtmlRewriter::new(settings, |_c: &[u8]| {}); let html_bytes = html.as_bytes(); - let chunk_size = 8192; - let chunks = html_bytes.chunks(chunk_size); + let chunks = html_bytes.chunks(*STREAMING_CHUNK_SIZE); let mut wrote_error = false; let mut stream = tokio_stream::iter(chunks).map(Ok::<&[u8], A>); diff --git a/spider/src/utils/abs.rs b/spider/src/utils/abs.rs index c7bbc7313..50267ab68 100644 --- a/spider/src/utils/abs.rs +++ b/spider/src/utils/abs.rs @@ -1,4 +1,4 @@ -use crate::page::ONLY_RESOURCES; +use crate::page::IGNORE_ASSETS; use phf::phf_set; use url::Url; @@ -101,12 +101,15 @@ pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url { if let Some(position) = href.rfind('.') { let hlen = href.len(); let has_asset = hlen - position; + if has_asset >= 3 { let next_position = position + 1; - if !ONLY_RESOURCES.contains::( + + if IGNORE_ASSETS.contains::( &href[next_position..].into(), ) { let full_url = format!("{}://{}", base.scheme(), href); + if let Ok(mut next_url) = Url::parse(&full_url) { next_url.set_fragment(None); return next_url; diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 3200fcef6..8b0f9a26e 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = 
"spider_chrome" -version = "2.21.26" +version = "2.21.27" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_chrome/src/handler/blockers/scripts.rs b/spider_chrome/src/handler/blockers/scripts.rs index 5c7a5453e..289f94d0b 100644 --- a/spider_chrome/src/handler/blockers/scripts.rs +++ b/spider_chrome/src/handler/blockers/scripts.rs @@ -22,6 +22,7 @@ lazy_static::lazy_static! { "https://www.googleanalytics.com", "https://iabusprivacy.pmc.com/geo-info.js", "https://cookie-cdn.cookiepro.com/consent", + "https://static.hotjar.com/", "https://load.sumome.com/", "https://www.mongoosemetrics.com/", "https://geolocation-recommendations.shopifyapps.com/", @@ -57,7 +58,6 @@ lazy_static::lazy_static! { "https://featureassets.org", "https://cdn.rudderlabs.com", "https://script.hotjar.com/", - "https://static.hotjar.com/", "https://cdn.insurads.com/", "https://cdn-ukwest.onetrust.com", "https://cdn.onetrust.com", diff --git a/spider_chrome/src/handler/blockers/xhr.rs b/spider_chrome/src/handler/blockers/xhr.rs index 49f7ac5fd..eb7afbe77 100644 --- a/spider_chrome/src/handler/blockers/xhr.rs +++ b/spider_chrome/src/handler/blockers/xhr.rs @@ -65,6 +65,7 @@ lazy_static::lazy_static! { "https://s.yimg.com/wi", "https://disney.my.sentry.io/api/", "https://www.redditstatic.com/ads", + "https://static.hotjar.com/", "https://sentry.io/api/", "https://buy.tinypass.com/", "https://idx.liadm.com", diff --git a/spider_chrome/src/handler/network.rs b/spider_chrome/src/handler/network.rs index 4ea929f9d..3c1954593 100644 --- a/spider_chrome/src/handler/network.rs +++ b/spider_chrome/src/handler/network.rs @@ -57,8 +57,8 @@ lazy_static! { "https://cdnjs.cloudflare.com/" // cloudflare cdn scripts ]; - /// Create a static AhoCorasick matcher based on the allowed list - static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).unwrap(); + /// Determine if a script should be rendered in the browser by name. 
+ pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).unwrap(); /// path of a js framework pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = { diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 0c32f55a6..2c364c28e 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.21.26" +version = "2.21.27" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 436a136ad..c61088587 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.21.26" +version = "2.21.27" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 75c0170da..a080d1597 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.21.26" +version = "2.21.27" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 4aafafef9..2a515e548 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.21.26" +version = "2.21.27" authors = [ "j-mendez " ]