Skip to content

Commit

Permalink
chore(chrome): fix allowed critical rendering script interception
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 14, 2024
1 parent 3cfacdb commit 36c15d1
Show file tree
Hide file tree
Showing 10 changed files with 50 additions and 34 deletions.
13 changes: 7 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 6 additions & 3 deletions examples/real_world.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//! cargo run --example real_world --features="chrome chrome_intercept real_browser spider_utils/transformations"
//! cargo run --example real_world --features="chrome chrome_intercept real_browser"
extern crate spider;
use crate::spider::tokio::io::AsyncWriteExt;
Expand All @@ -13,10 +13,13 @@ use std::time::Duration;

async fn crawl_website(url: &str) -> Result<()> {
let mut stdout = tokio::io::stdout();
let mut interception = RequestInterceptConfiguration::new(true);

interception.block_javascript = true;

let mut website: Website = Website::new(url)
.with_limit(5)
.with_chrome_intercept(RequestInterceptConfiguration::new(true))
.with_chrome_intercept(interception)
.with_wait_for_idle_network(Some(WaitForIdleNetwork::new(Some(Duration::from_millis(
500,
)))))
Expand Down Expand Up @@ -82,7 +85,7 @@ async fn crawl_website(url: &str) -> Result<()> {
async fn main() -> Result<()> {
env_logger::init();
let _ = tokio::join!(
crawl_website("https://www.choosealicense.com"),
crawl_website("https://choosealicense.com"),
crawl_website("https://jeffmendez.com"),
crawl_website("https://example.com"),
);
Expand Down
2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.21.21"
version = "2.21.23"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
3 changes: 2 additions & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.21.21"
version = "2.21.23"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down Expand Up @@ -53,6 +53,7 @@ adblock = { version = "0.9", optional = true, default-features = false, features
rand = "0.8"
case_insensitive_string = { version = "0.2", features = ["compact", "serde"] }
hashbrown = { version = "0.15", default-features = true }
aho-corasick = "1"

[target.'cfg(windows)'.dependencies]
winreg = "0.52"
Expand Down
1 change: 1 addition & 0 deletions spider_chrome/src/handler/blockers/xhr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ lazy_static::lazy_static! {
// video embeddings
"https://video.squarespace-cdn.com/content/",
"https://bes.gcp.data.bigcommerce.com/nobot",
"https://www.youtube.com/youtubei/",
"http://ec.editmysite.com",
"https://dcinfos-cache.abtasty.com/",
"https://featureassets.org/",
Expand Down
48 changes: 29 additions & 19 deletions spider_chrome/src/handler/network.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use super::blockers::{
use crate::auth::Credentials;
use crate::cmd::CommandChain;
use crate::handler::http::HttpRequest;
use aho_corasick::AhoCorasick;
use case_insensitive_string::CaseInsensitiveString;
use chromiumoxide_cdp::cdp::browser_protocol::fetch::{
self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
Expand All @@ -32,23 +33,32 @@ use std::collections::VecDeque;
use std::time::Duration;

lazy_static! {
/// allowed js frameworks and libs excluding some and adding additional URLs
pub static ref JS_FRAMEWORK_ALLOW: phf::Set<&'static str> = {
phf::phf_set! {
// Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
"jquery.min.js", "jquery.qtip.min.js", "jquery.js", "angular.js", "jquery.slim.js",
"react.development.js", "react-dom.development.js", "react.production.min.js",
"react-dom.production.min.js", "vue.global.js", "vue.esm-browser.js", "vue.js",
"bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js",
"d3.js", "lodash.min.js", "lodash.js",
"app.js", "main.js", "index.js", "bundle.js", "vendor.js",
// Verified 3rd parties for request
"https://m.stripe.network/inner.html",
"https://m.stripe.network/out-4.5.43.js",
"https://challenges.cloudflare.com/turnstile",
"https://js.stripe.com/v3/"
}
};
/// General patterns for popular libraries and resources
static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
"jquery", // Covers jquery.min.js, jquery.js, etc.
"angular",
"react", // Covers all React-related patterns
"vue", // Covers all Vue-related patterns
"bootstrap",
"d3",
"lodash",
"ajax",
"app", // Covers general app scripts like app.js
"main",
"index",
"bundle",
"vendor",
"/wp-content/js/", // Covers WordPress content scripts
// Verified 3rd parties for request
"https://m.stripe.network/",
"https://challenges.cloudflare.com/",
"https://js.stripe.com/",
"https://cdn.prod.website-files.com/", // webflow cdn scripts
"https://cdnjs.cloudflare.com/" // cloudflare cdn scripts
];

/// Create a static AhoCorasick matcher based on the allowed list
static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).unwrap();

/// path of a js framework
pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
Expand Down Expand Up @@ -400,7 +410,7 @@ impl NetworkManager {
&& ResourceType::Stylesheet == event.resource_type
|| self.block_javascript
&& javascript_resource
&& !JS_FRAMEWORK_ALLOW.contains(current_url)
&& !ALLOWED_MATCHER.is_match(current_url)
} else {
skip_networking
};
Expand Down Expand Up @@ -502,7 +512,7 @@ impl NetworkManager {
&& ResourceType::Stylesheet == event.resource_type
|| self.block_javascript
&& javascript_resource
&& !JS_FRAMEWORK_ALLOW.contains(current_url)
&& !ALLOWED_MATCHER.is_match(current_url)
} else {
skip_networking
};
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.21.21"
version = "2.21.23"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.21.21"
version = "2.21.23"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.21.21"
version = "2.21.23"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.21.21"
version = "2.21.23"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 36c15d1

Please sign in to comment.