Skip to content

Commit

Permalink
chore(chrome): fix xhr interception
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 8, 2024
1 parent ee098f2 commit bd58155
Show file tree
Hide file tree
Showing 9 changed files with 48 additions and 20 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion examples/real_world.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//! `cargo run --example real_world --features="chrome chrome_intercept spider_utils/transformations"`
//! cargo run --example real_world --features="chrome chrome_intercept spider_utils/transformations"
extern crate spider;
use crate::spider::tokio::io::AsyncWriteExt;
Expand All @@ -20,6 +20,7 @@ async fn crawl_website(url: &str) -> Result<()> {
.with_wait_for_idle_network(Some(WaitForIdleNetwork::new(Some(Duration::from_millis(
200,
)))))
.with_block_assets(true)
.with_stealth(true)
.with_return_page_links(true)
.with_fingerprint(true)
Expand Down
2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.20.6"
version = "2.20.7"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.20.6"
version = "2.20.7"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
41 changes: 34 additions & 7 deletions spider_chrome/src/handler/network.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ lazy_static! {
"https://acdn.adnxs.com/ast/ast.js",
"https://schibsted-cdn.relevant-digital.com/static/tags/",
"https://bat.bing.net",
"https://static.addtoany.com/menu/",
"https://www.b2i.us/b2i/",
"https://acsbapp.com/apps/app/dist/js/app.js",
".sharethis.com",
".newrelic.com",
".googlesyndication.com",
Expand Down Expand Up @@ -191,7 +194,8 @@ lazy_static! {
"otSDKStub.js",
"otBannerSdk.js",
"_vercel/insights/script.js",
"analytics."
"analytics.",
"cookie-law-info-ccpa.js"
];
for pattern in &patterns {
trie.insert(pattern);
Expand All @@ -212,8 +216,11 @@ lazy_static! {
// amazon product feedback
"https://www.amazon.com/af/feedback-link?",
"https://www.google.com/ads/ga-audiences",
"https://player.vimeo.com/video/",
"https://www.youtube.com/iframe_api",
"https://tr.snapchat.com/config/",
"https://collect.tealiumiq.com/",
"https://cdn.acsbapp.com/config/",
"https://s.yimg.com/wi",
"https://disney.my.sentry.io/api/",
"https://www.redditstatic.com/ads",
Expand Down Expand Up @@ -241,7 +248,9 @@ lazy_static! {
"https://logs.",
"/track.php",
"/api/v1/bulklog",
"cookieconsentpub"
"cookieconsentpub",
"cookie-law-info",
"mediaelement-and-player.min.j"
];
for pattern in &patterns {
trie.insert(pattern);
Expand All @@ -268,6 +277,8 @@ lazy_static! {
"https://www.googletagmanager.com/ns.html", // Google tag manager.
"https://consentcdn.cookiebot.com", // Cookie bot
"https://www.youtube.com/iframe_api", // Youtube iframes.
"https://f.vimeocdn.com", // Vimeo EMBEDDINGS
"https://i.vimeocdn.com/",
// "https://www.youtube.com/s/player/", // Youtube player not needed usually since iframe_api is used mainly
// vercel live
"https://vercel.live/api/",
Expand All @@ -281,6 +292,7 @@ lazy_static! {
"https://tr.snapchat.com/",
"https://buy.tinypass.com",
"https://nimbleplot.com/",
"https://my.actiondata.co/js/tracker.php",
// ignore font extras
"https://kit.fontawesome.com/",
"https://use.typekit.net",
Expand All @@ -291,6 +303,7 @@ lazy_static! {
// ignore extra ads
".sharethis.com",
"amazon-adsystem.com",
".vimeocdn.com",
"g.doubleclick.net",
"https://securepubads.g.doubleclick.net",
"googlesyndication.com",
Expand Down Expand Up @@ -324,7 +337,6 @@ lazy_static! {
"https://open.spotify.com/",
"https://api.spotify.com/v1/",
"https://music.apple.com/"

];
for pattern in &patterns {
trie.insert(pattern);
Expand Down Expand Up @@ -638,7 +650,11 @@ impl NetworkManager {
/// Determine if the request should be skipped.
fn skip_xhr(&self, skip_networking: bool, event: &EventRequestPaused) -> bool {
// XHR check
if !skip_networking && event.resource_type == ResourceType::Xhr {
if !skip_networking
&& (event.resource_type == ResourceType::Xhr
|| event.resource_type == ResourceType::WebSocket
|| event.resource_type == ResourceType::Fetch)
{
let request_url = event.request.url.as_str();

// check if part of ignore scripts.
Expand All @@ -648,7 +664,7 @@ impl NetworkManager {
true
} else if self.block_stylesheets || self.ignore_visuals {
let block_css = self.block_stylesheets;
let block_media = self.ignore_visuals && self.only_html;
let block_media = self.ignore_visuals;

let mut block_request = false;

Expand Down Expand Up @@ -704,6 +720,9 @@ impl NetworkManager {
|| event.resource_type == ResourceType::CspViolationReport
|| event.resource_type == ResourceType::Ping
|| event.resource_type == ResourceType::Prefetch;
let network_resource = event.resource_type == ResourceType::Xhr
|| event.resource_type == ResourceType::Fetch
|| event.resource_type == ResourceType::WebSocket;

// main initial check
let skip_networking = if !skip_networking {
Expand Down Expand Up @@ -746,7 +765,7 @@ impl NetworkManager {
// custom interception layer.
let skip_networking = if !skip_networking
&& (javascript_resource
|| event.resource_type == ResourceType::Xhr
|| network_resource
|| event.resource_type == ResourceType::Document)
{
match self.intercept_manager {
Expand Down Expand Up @@ -791,6 +810,11 @@ impl NetworkManager {
);
self.push_cdp_request(fullfill_params);
} else {
tracing::debug!(
"Network Allowed: {:?} - {}",
event.resource_type,
event.request.url
);
self.push_cdp_request(ContinueRequestParams::new(event.request_id.clone()))
}
}
Expand Down Expand Up @@ -818,6 +842,9 @@ impl NetworkManager {
|| event.resource_type == ResourceType::CspViolationReport
|| event.resource_type == ResourceType::Ping
|| event.resource_type == ResourceType::Prefetch;
let network_resource = event.resource_type == ResourceType::Xhr
|| event.resource_type == ResourceType::Fetch
|| event.resource_type == ResourceType::WebSocket;

// main initial check
let skip_networking = if !skip_networking {
Expand Down Expand Up @@ -866,7 +893,7 @@ impl NetworkManager {
// custom interception layer.
let skip_networking = if !skip_networking
&& (javascript_resource
|| event.resource_type == ResourceType::Xhr
|| network_resource
|| event.resource_type == ResourceType::Document)
{
match self.intercept_manager {
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.20.6"
version = "2.20.7"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.20.6"
version = "2.20.7"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.20.6"
version = "2.20.7"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.20.6"
version = "2.20.7"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit bd58155

Please sign in to comment.