Skip to content

Commit

Permalink
chore(page): fix block asset handling
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 14, 2024
1 parent 5f2da26 commit 6e14f1c
Show file tree
Hide file tree
Showing 13 changed files with 86 additions and 75 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions examples/real_world.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//! cargo run --example real_world --features="chrome chrome_intercept real_browser"
//! cargo run --example real_world --features="chrome chrome_intercept real_browser spider_utils/transformations"
extern crate spider;
use crate::spider::tokio::io::AsyncWriteExt;
Expand All @@ -8,6 +8,7 @@ use spider::website::Website;
use spider::{
configuration::WaitForIdleNetwork, features::chrome_common::RequestInterceptConfiguration,
};

use std::io::Result;
use std::time::Duration;

Expand All @@ -23,7 +24,6 @@ async fn crawl_website(url: &str) -> Result<()> {
.with_wait_for_idle_network(Some(WaitForIdleNetwork::new(Some(Duration::from_millis(
500,
)))))
.with_subdomains(true)
.with_wait_for_idle_dom(Some(WaitForSelector::new(
Some(Duration::from_millis(100)),
"body".into(),
Expand Down
2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.21.26"
version = "2.21.27"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
119 changes: 63 additions & 56 deletions spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,6 @@ lazy_static! {
static ref GATSBY: Option<String> = Some("gatsby-chunk-mapping".into());
}

// Compiled only when the `smart` or the `chrome_intercept` feature is enabled.
// NOTE(review): "vue.runtime." and ".js?meteor_js_resource=true" look like name
// fragments rather than complete file names; an exact-membership lookup on a URL
// path segment would presumably never match them — confirm whether substring
// matching was intended for these two entries.
#[cfg(any(feature = "smart", feature = "chrome_intercept"))]
lazy_static! {
/// Static set of script file names belonging to popular JS frameworks and
/// libraries (jQuery, Angular, React, Vue, Bootstrap, D3, Lodash, ...), plus a
/// handful of generic bundle names such as `app.js` and `main.js`.
pub static ref JS_FRAMEWORK_ASSETS: phf::Set<&'static str> = {
phf::phf_set! {
"jquery.min.js", "jquery.qtip.min.js", "jquery.js", "angular.js", "jquery.slim.js", "react.development.js", "react-dom.development.js", "react.production.min.js", "react-dom.production.min.js",
"vue.global.js", "vue.global.prod.js", "vue.runtime.", "vue.esm-browser.js", "vue.js", "bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js", "d3.js", "material-components-web.min.js",
"otSDKStub.js", "clipboard.min.js", "moment.js", "moment.min.js", "dexie.js", "layui.js", ".js?meteor_js_resource=true", "lodash.min.js", "lodash.js",
// Generic entry-point / bundle names that may be critical for the page to work.
"app.js", "main.js", "index.js", "bundle.js", "vendor.js",
}
};
}

#[cfg(all(
not(feature = "decentralized"),
not(feature = "full_resources"),
Expand All @@ -70,40 +56,56 @@ lazy_static! {
};
}

// Compiled only when the `chrome_intercept` feature is enabled. A single
// predicate does not need an `any(...)` wrapper, so the plain form is used.
#[cfg(feature = "chrome_intercept")]
lazy_static! {
    /// Allow-list of JS framework and library script names, extended with a few
    /// verified third-party URLs.
    ///
    /// The script names are a subset of the popular framework/library asset
    /// names (some of those are intentionally left out here); the trailing
    /// entries are full URLs or URL prefixes of trusted third parties.
    pub static ref JS_FRAMEWORK_ALLOW: phf::Set<&'static str> = {
        phf::phf_set! {
            // Framework and library scripts that are allowed.
            "jquery.min.js", "jquery.qtip.min.js", "jquery.js", "angular.js", "jquery.slim.js",
            "react.development.js", "react-dom.development.js", "react.production.min.js",
            "react-dom.production.min.js", "vue.global.js", "vue.global.prod.js", "vue.esm-browser.js", "vue.js",
            "bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js", ".js?meteor_js_resource=true",
            "d3.js", "layui.js", "lodash.min.js", "lodash.js",
            // Generic entry-point / bundle names.
            "app.js", "main.js", "index.js", "bundle.js", "vendor.js",
            // Verified third parties allowed for requests.
            "https://m.stripe.network/inner.html",
            "https://m.stripe.network/out-4.5.43.js",
            "https://challenges.cloudflare.com/turnstile",
            "https://js.stripe.com/v3/"
        }
    };
}

lazy_static! {
/// include only list of resources
pub(crate) static ref ONLY_RESOURCES: HashSet<CaseInsensitiveString> = {
let mut m: HashSet<CaseInsensitiveString> = HashSet::with_capacity(28);
/// Visual assets to ignore.
pub(crate) static ref IGNORE_ASSETS: HashSet<CaseInsensitiveString> = {
let mut m: HashSet<CaseInsensitiveString> = HashSet::with_capacity(62);

m.extend([
"html", "htm", "shtml", "asp", "aspx", "php", "jps", "jpsx", "jsp", "cfm", "xhtml", "rhtml", "phtml", "erb",
// handle .. prefix for urls ending with an extra ending
".html", ".htm", ".shtml", ".asp", ".aspx", ".php", ".jps", ".jpsx", ".jsp", ".cfm", ".xhtml", ".rhtml", ".phtml", ".erb",
"jpg", "jpeg", "png", "gif", "svg", "webp", // Image files
"mp4", "avi", "mov", "wmv", "flv", // Video files
"mp3", "wav", "ogg", // Audio files
"woff", "woff2", "ttf", "otf", // Font files
"swf", "xap", // Flash/Silverlight files
"ico", "eot", // Other resource files
"bmp", "tiff", "tif", "heic", "heif", // Additional Image files
"mkv", "webm", "m4v", // Additional Video files
"aac", "flac", "m4a", "aiff", // Additional Audio files
"pdf", "eps", // Other additional files

// Including extensions with extra dot
".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp",
".mp4", ".avi", ".mov", ".wmv", ".flv",
".mp3", ".wav", ".ogg",
".woff", ".woff2", ".ttf", ".otf",
".swf", ".xap",
".ico", ".eot",
".bmp", ".tiff", ".tif", ".heic", ".heif",
".mkv", ".webm", ".m4v",
".aac", ".flac", ".m4a", ".aiff",
".pdf", ".eps"
].map(|s| s.into()));

m
};

/// The chunk size for the rewriter. Can be adjusted using the env var "SPIDER_STREAMING_CHUNK_SIZE".
pub(crate) static ref STREAMING_CHUNK_SIZE: usize = {
const DEFAULT_STREAMING_CHUNK_SIZE: usize = 8192;
const MIN_STREAMING_CHUNK_SIZE: usize = DEFAULT_STREAMING_CHUNK_SIZE / 4;

std::env::var("SPIDER_STREAMING_CHUNK_SIZE")
.ok()
.and_then(|val| val.parse::<usize>().ok())
.map(|val| {
if val < MIN_STREAMING_CHUNK_SIZE {
MIN_STREAMING_CHUNK_SIZE
} else {
val
}
})
.unwrap_or(DEFAULT_STREAMING_CHUNK_SIZE)
};
}

/// The AI data returned from a GPT.
Expand Down Expand Up @@ -240,8 +242,10 @@ pub fn push_link<A: PartialEq + Eq + std::hash::Hash + From<String>>(

if new_page {
let scheme = abs.scheme();

if scheme == "https" || scheme == "http" {
let host_name = abs.host_str();

let mut can_process = parent_host_match(
host_name,
base_domain,
Expand All @@ -264,6 +268,7 @@ pub fn push_link<A: PartialEq + Eq + std::hash::Hash + From<String>>(

let hchars = abs.path();

// check if the file is a resource and block if it is
if let Some(position) = hchars.rfind('.') {
let hlen = hchars.len();
let has_asset = hlen - position;
Expand All @@ -272,7 +277,7 @@ pub fn push_link<A: PartialEq + Eq + std::hash::Hash + From<String>>(
let next_position = position + 1;

if !full_resources
&& !ONLY_RESOURCES.contains::<CaseInsensitiveString>(
&& IGNORE_ASSETS.contains::<CaseInsensitiveString>(
&hchars[next_position..].into(),
)
{
Expand Down Expand Up @@ -1342,8 +1347,7 @@ impl Page {
lol_html::send::HtmlRewriter::new(rewriter_settings, |_c: &[u8]| {});

let html_bytes = html.as_bytes();
let chunk_size = 8192;
let chunks = html_bytes.chunks(chunk_size);
let chunks = html_bytes.chunks(*STREAMING_CHUNK_SIZE);

let mut stream = tokio_stream::iter(chunks).map(Ok::<&[u8], A>);

Expand Down Expand Up @@ -1394,6 +1398,7 @@ impl Page {
} else {
None
};

if !html.is_empty() {
if html.starts_with("<?xml") {
self.links_stream_xml_links_stream_base(selectors, html, &mut map)
Expand Down Expand Up @@ -1449,8 +1454,7 @@ impl Page {
lol_html::send::HtmlRewriter::new(rewriter_settings, |_c: &[u8]| {});

let html_bytes = html.as_bytes();
let chunk_size = 8192;
let chunks = html_bytes.chunks(chunk_size);
let chunks = html_bytes.chunks(*STREAMING_CHUNK_SIZE);
let mut wrote_error = false;

let mut stream = tokio_stream::iter(chunks).map(Ok::<&[u8], A>);
Expand Down Expand Up @@ -1677,9 +1681,7 @@ impl Page {
abs.path_segments().ok_or_else(|| "cannot be base")
{
while let Some(p) = paths.next() {
// todo: get the path last before None instead of checking for ends_with
if p.ends_with(".js")
&& JS_FRAMEWORK_ASSETS.contains(&p)
if chromiumoxide::handler::network::ALLOWED_MATCHER.is_match(&p)
{
rerender.swap(true, Ordering::Relaxed);
}
Expand Down Expand Up @@ -1731,8 +1733,7 @@ impl Page {
});

let html_bytes = html_resource.as_bytes();
let chunk_size = 8192;
let chunks = html_bytes.chunks(chunk_size);
let chunks = html_bytes.chunks(*STREAMING_CHUNK_SIZE);
let mut wrote_error = false;

let mut stream = tokio_stream::iter(chunks).map(Ok::<&[u8], A>);
Expand Down Expand Up @@ -1804,14 +1805,14 @@ impl Page {
true,
&Some(crate::configuration::WaitFor::new(
Some(
core::time::Duration::from_secs(120), // default a duration for smart handling. (maybe expose later on.)
core::time::Duration::from_secs(60), // default a duration for smart handling. (maybe expose later on.)
),
None,
true,
true,
None,
Some(crate::configuration::WaitForSelector::new(
Some(core::time::Duration::from_millis(500)),
Some(core::time::Duration::from_millis(250)),
"body".into(),
)),
)),
Expand All @@ -1827,7 +1828,14 @@ impl Page {
.await;

if let Some(h) = intercept_handle {
let _ = h.await;
let abort_handle = h.abort_handle();
if let Err(elasped) =
tokio::time::timeout(tokio::time::Duration::from_secs(10), h)
.await
{
log::warn!("Handler timeout exceeded {elasped}");
abort_handle.abort();
}
}

if let Ok(resource) = page_resource {
Expand Down Expand Up @@ -1946,8 +1954,7 @@ impl Page {
let mut rewriter = lol_html::send::HtmlRewriter::new(settings, |_c: &[u8]| {});

let html_bytes = html.as_bytes();
let chunk_size = 8192;
let chunks = html_bytes.chunks(chunk_size);
let chunks = html_bytes.chunks(*STREAMING_CHUNK_SIZE);
let mut wrote_error = false;

let mut stream = tokio_stream::iter(chunks).map(Ok::<&[u8], A>);
Expand Down
7 changes: 5 additions & 2 deletions spider/src/utils/abs.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::page::ONLY_RESOURCES;
use crate::page::IGNORE_ASSETS;
use phf::phf_set;
use url::Url;

Expand Down Expand Up @@ -101,12 +101,15 @@ pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url {
if let Some(position) = href.rfind('.') {
let hlen = href.len();
let has_asset = hlen - position;

if has_asset >= 3 {
let next_position = position + 1;
if !ONLY_RESOURCES.contains::<case_insensitive_string::CaseInsensitiveString>(

if IGNORE_ASSETS.contains::<case_insensitive_string::CaseInsensitiveString>(
&href[next_position..].into(),
) {
let full_url = format!("{}://{}", base.scheme(), href);

if let Ok(mut next_url) = Url::parse(&full_url) {
next_url.set_fragment(None);
return next_url;
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.21.26"
version = "2.21.27"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/src/handler/blockers/scripts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ lazy_static::lazy_static! {
"https://www.googleanalytics.com",
"https://iabusprivacy.pmc.com/geo-info.js",
"https://cookie-cdn.cookiepro.com/consent",
"https://static.hotjar.com/",
"https://load.sumome.com/",
"https://www.mongoosemetrics.com/",
"https://geolocation-recommendations.shopifyapps.com/",
Expand Down Expand Up @@ -57,7 +58,6 @@ lazy_static::lazy_static! {
"https://featureassets.org",
"https://cdn.rudderlabs.com",
"https://script.hotjar.com/",
"https://static.hotjar.com/",
"https://cdn.insurads.com/",
"https://cdn-ukwest.onetrust.com",
"https://cdn.onetrust.com",
Expand Down
1 change: 1 addition & 0 deletions spider_chrome/src/handler/blockers/xhr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ lazy_static::lazy_static! {
"https://s.yimg.com/wi",
"https://disney.my.sentry.io/api/",
"https://www.redditstatic.com/ads",
"https://static.hotjar.com/",
"https://sentry.io/api/",
"https://buy.tinypass.com/",
"https://idx.liadm.com",
Expand Down
4 changes: 2 additions & 2 deletions spider_chrome/src/handler/network.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ lazy_static! {
"https://cdnjs.cloudflare.com/" // cloudflare cdn scripts
];

/// Create a static AhoCorasick matcher based on the allowed list
static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).unwrap();
/// Determine if a script should be rendered in the browser by name.
pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).unwrap();

/// path of a js framework
pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.21.26"
version = "2.21.27"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.21.26"
version = "2.21.27"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.21.26"
version = "2.21.27"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
Loading

0 comments on commit 6e14f1c

Please sign in to comment.