Skip to content

Commit

Permalink
chore(chrome): add to block list
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 12, 2024
1 parent da3f060 commit ca8d5e0
Show file tree
Hide file tree
Showing 16 changed files with 427 additions and 173 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.21.13"
version = "2.21.15"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider/src/features/chrome_common.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::utils::trie::Trie;

#[cfg(feature = "chrome")]
use chromiumoxide::handler::network::NetworkInterceptManager;
use chromiumoxide::handler::blockers::intercept_manager::NetworkInterceptManager;

/// wrapper for non chrome interception. does nothing.
#[derive(Debug, Default, Clone, Copy, PartialEq)]
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.21.13"
version = "2.21.15"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
23 changes: 11 additions & 12 deletions spider_chrome/src/browser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,13 @@ use futures::channel::oneshot::channel as oneshot_channel;
use futures::select;
use futures::SinkExt;

use chromiumoxide_cdp::cdp::browser_protocol::network::{Cookie, CookieParam};
use chromiumoxide_cdp::cdp::browser_protocol::storage::{
ClearCookiesParams, GetCookiesParams, SetCookiesParams,
};
use chromiumoxide_cdp::cdp::browser_protocol::target::{
CreateBrowserContextParams, CreateTargetParams, DisposeBrowserContextParams, TargetId,
TargetInfo,
};
use chromiumoxide_cdp::cdp::{CdpEventMessage, IntoEventKind};
use chromiumoxide_types::*;

use crate::async_process::{self, Child, ExitStatus, Stdio};
use crate::cmd::{to_command_response, CommandMessage};
use crate::conn::Connection;
use crate::detection::{self, DetectionOptions};
use crate::error::{BrowserStderr, CdpError, Result};
use crate::handler::blockers::intercept_manager::NetworkInterceptManager;
use crate::handler::browser::BrowserContext;
use crate::handler::network::NetworkInterceptManager;
use crate::handler::viewport::Viewport;
use crate::handler::{Handler, HandlerConfig, HandlerMessage, REQUEST_TIMEOUT};
use crate::listeners::{EventListenerRequest, EventStream};
Expand All @@ -37,6 +26,16 @@ use crate::utils;
use chromiumoxide_cdp::cdp::browser_protocol::browser::{
BrowserContextId, CloseReturns, GetVersionParams, GetVersionReturns,
};
use chromiumoxide_cdp::cdp::browser_protocol::network::{Cookie, CookieParam};
use chromiumoxide_cdp::cdp::browser_protocol::storage::{
ClearCookiesParams, GetCookiesParams, SetCookiesParams,
};
use chromiumoxide_cdp::cdp::browser_protocol::target::{
CreateBrowserContextParams, CreateTargetParams, DisposeBrowserContextParams, TargetId,
TargetInfo,
};
use chromiumoxide_cdp::cdp::{CdpEventMessage, IntoEventKind};
use chromiumoxide_types::*;

/// Default `Browser::launch` timeout, in milliseconds (20 seconds).
pub const LAUNCH_TIMEOUT: u64 = 20_000;
Expand Down
100 changes: 100 additions & 0 deletions spider_chrome/src/handler/blockers/intercept_manager.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/// Custom network intercept types to expect on a domain.
///
/// Each variant names a site with its own interception profile. The variant
/// for a URL is resolved by `NetworkInterceptManager::new`, which falls back
/// to `Unknown` (also the `Default`) when no known pattern matches.
#[derive(Debug, Default, Clone, Copy, serde::Serialize, serde::Deserialize, PartialEq)]
pub enum NetworkInterceptManager {
/// tiktok.com
TikTok,
/// facebook.com
///
/// NOTE(review): no `TOP_TIER_LIST` entry visible in this file maps to this
/// variant, so `new` does not currently return it — confirm whether that is
/// intentional.
Facebook,
/// amazon.* (matched by prefix, so any domain suffix after `amazon.`)
Amazon,
/// x.com
X,
/// linkedin.com
LinkedIn,
/// netflix.com
Netflix,
/// medium.com
Medium,
/// upwork.com
Upwork,
/// glassdoor.* (matched by prefix, so any domain suffix after `glassdoor.`)
Glassdoor,
/// ebay.* (matched by prefix, so any domain suffix after `ebay.`)
Ebay,
/// nytimes.com
Nytimes,
/// wikipedia.org (matched anywhere in the URL, not only as a prefix)
Wikipedia,
#[default]
/// Unknown: the URL did not match any known site.
Unknown,
}

lazy_static::lazy_static! {
/// Top tier list of the most common websites visited.
///
/// Each entry pairs a URL pattern with the intercept manager to use for it.
/// `NetworkInterceptManager::new` scans the entries in order and returns the
/// first match. Patterns are compared with `starts_with`, except those mapped
/// to `Wikipedia`, which are compared with `contains`.
///
/// Patterns that end in a dot (for example `https://www.amazon.`) are
/// prefixes, so they match whatever follows the dot.
pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 21] = [
("https://www.tiktok.com", NetworkInterceptManager::TikTok),
("https://tiktok.com", NetworkInterceptManager::TikTok),
("https://www.amazon.", NetworkInterceptManager::Amazon),
("https://amazon.", NetworkInterceptManager::Amazon),
("https://www.x.com", NetworkInterceptManager::X),
("https://x.com", NetworkInterceptManager::X),
("https://www.netflix.com", NetworkInterceptManager::Netflix),
("https://netflix.com", NetworkInterceptManager::Netflix),
(
"https://www.linkedin.com",
NetworkInterceptManager::LinkedIn
),
("https://linkedin.com", NetworkInterceptManager::LinkedIn),
("https://www.upwork.com", NetworkInterceptManager::Upwork),
("https://upwork.com", NetworkInterceptManager::Upwork),
("https://www.glassdoor.", NetworkInterceptManager::Glassdoor),
("https://glassdoor.", NetworkInterceptManager::Glassdoor),
("https://www.medium.com", NetworkInterceptManager::Medium),
("https://medium.com", NetworkInterceptManager::Medium),
("https://www.ebay.", NetworkInterceptManager::Ebay),
("https://ebay.", NetworkInterceptManager::Ebay),
("https://www.nytimes.com", NetworkInterceptManager::Nytimes),
("https://nytimes.com", NetworkInterceptManager::Nytimes),
// No scheme or `www.` prefix: this entry is matched with `contains`.
("wikipedia.org", NetworkInterceptManager::Wikipedia),
];
}

/// How a `TOP_TIER_LIST` pattern is compared against a URL.
#[derive(Default, Debug, Clone, Hash, PartialEq, Eq)]
enum FindType {
#[default]
/// The URL must start with the pattern.
StartsWith,
/// The pattern may appear anywhere in the URL.
Contains,
}

impl NetworkInterceptManager {
    /// Resolve the intercept manager for `url`.
    ///
    /// Walks `TOP_TIER_LIST` in order and returns the manager of the first
    /// entry whose pattern matches; the comparison (prefix or substring) is
    /// chosen per entry by `get_pattern`. Returns `Unknown` when nothing
    /// matches.
    pub fn new(url: &str) -> NetworkInterceptManager {
        for (needle, kind) in TOP_TIER_LIST.iter() {
            let matched = match kind.get_pattern() {
                FindType::StartsWith => url.starts_with(*needle),
                FindType::Contains => url.contains(*needle),
            };

            if matched {
                return *kind;
            }
        }

        NetworkInterceptManager::Unknown
    }

    /// Compute the intercept manager for `url`.
    ///
    /// This returns the same value as `new(url)`; the receiver itself is not
    /// modified, so callers must use the returned value.
    pub fn setup(&mut self, url: &str) -> Self {
        Self::new(url)
    }

    /// Select how this manager's patterns are compared against a URL:
    /// substring search for `Wikipedia`, prefix match for everything else.
    fn get_pattern(&self) -> FindType {
        if matches!(self, NetworkInterceptManager::Wikipedia) {
            FindType::Contains
        } else {
            FindType::StartsWith
        }
    }
}
18 changes: 17 additions & 1 deletion spider_chrome/src/handler/blockers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ pub mod amazon_blockers;
pub mod ebay_blockers;
/// glassdoor blockers
pub mod glassdoor_blockers;
/// interception manager
pub mod intercept_manager;
/// linkedin blockers
pub mod linkedin_blockers;
/// medium blockers
Expand All @@ -24,7 +26,6 @@ pub mod upwork_blockers;
pub mod wikipedia_blockers;
/// x blockers
pub mod x_blockers;

/// xhr blockers
pub mod xhr;

Expand Down Expand Up @@ -75,3 +76,18 @@ impl Trie {
false
}
}

/// Whether `url` should be ignored as an embedded analytics or tracker script.
///
/// Returns the result of `contains_prefix(url)` on `URL_IGNORE_EMBEDED_TRIE`.
pub(crate) fn ignore_script_embedded(url: &str) -> bool {
    let ignore_trie = &crate::handler::blockers::scripts::URL_IGNORE_EMBEDED_TRIE;

    ignore_trie.contains_prefix(url)
}

/// Whether `url` should be ignored as an analytics or tracker XHR request.
///
/// Returns the result of `contains_prefix(url)` on `URL_IGNORE_XHR_TRIE`.
pub(crate) fn ignore_script_xhr(url: &str) -> bool {
    let ignore_trie = &crate::handler::blockers::xhr::URL_IGNORE_XHR_TRIE;

    ignore_trie.contains_prefix(url)
}

/// Whether `url` should be ignored as a media request.
///
/// Returns the result of `contains_prefix(url)` on `URL_IGNORE_XHR_MEDIA_TRIE`.
pub(crate) fn ignore_script_xhr_media(url: &str) -> bool {
    let ignore_trie = &crate::handler::blockers::xhr::URL_IGNORE_XHR_MEDIA_TRIE;

    ignore_trie.contains_prefix(url)
}
Loading

0 comments on commit ca8d5e0

Please sign in to comment.