From 4928ee30e36b7955e9131482493541301419886a Mon Sep 17 00:00:00 2001 From: j-mendez Date: Fri, 13 Dec 2024 13:46:25 -0500 Subject: [PATCH] chore(chrome): add build generate blocking --- Cargo.lock | 1 + spider/src/features/chrome_common.rs | 4 +- spider_chrome/Cargo.toml | 4 + spider_chrome/build.rs | 223 ++++++++++++++++++ spider_chrome/src/browser.rs | 2 +- .../src/handler/blockers/amazon_blockers.rs | 55 ----- .../src/handler/blockers/ebay_blockers.rs | 29 --- .../handler/blockers/glassdoor_blockers.rs | 54 ----- .../src/handler/blockers/intercept_manager.rs | 94 ++------ .../src/handler/blockers/linkedin_blockers.rs | 29 --- .../src/handler/blockers/medium_blockers.rs | 29 --- spider_chrome/src/handler/blockers/mod.rs | 41 +--- .../src/handler/blockers/netflix_blockers.rs | 24 -- .../src/handler/blockers/nytimes_blockers.rs | 50 ---- .../handler/blockers/tcgplayer_blockers.rs | 27 --- .../src/handler/blockers/tiktok_blockers.rs | 45 ---- .../src/handler/blockers/upwork_blockers.rs | 65 ----- .../handler/blockers/wikipedia_blockers.rs | 29 --- .../src/handler/blockers/x_blockers.rs | 28 --- spider_chrome/src/handler/mod.rs | 2 +- spider_chrome/src/handler/network.rs | 110 +-------- spider_chrome/src/handler/target.rs | 2 +- .../domains/amazon.com/scripts/pattern1.txt | 15 ++ .../domains/amazon.com/styles/pattern1.txt | 0 .../domains/amazon.com/xhr/pattern1.txt | 2 + .../domains/ebay.com/scripts/pattern1.txt | 8 + .../domains/ebay.com/styles/pattern1.txt | 0 .../domains/ebay.com/xhr/pattern1.txt | 0 .../domains/facebook.com/scripts/pattern1.txt | 0 .../domains/facebook.com/styles/pattern1.txt | 0 .../domains/facebook.com/xhr/pattern1.txt | 1 + .../glassdoor.com/scripts/pattern1.txt | 3 + .../domains/glassdoor.com/styles/pattern1.txt | 6 + .../domains/glassdoor.com/xhr/pattern1.txt | 0 .../domains/linkedin.com/scripts/pattern1.txt | 8 + .../domains/linkedin.com/styles/pattern1.txt | 0 .../domains/linkedin.com/xhr/pattern1.txt | 0 .../domains/medium.com/scripts/pattern1.txt | 8 + .../domains/medium.com/styles/pattern1.txt | 0 .../domains/medium.com/xhr/pattern1.txt | 0 .../domains/netflix.com/scripts/pattern1.txt | 3 + .../domains/netflix.com/styles/pattern1.txt | 0 .../domains/netflix.com/xhr/pattern1.txt | 0 .../domains/nytimes.com/scripts/pattern1.txt | 9 + .../domains/nytimes.com/styles/pattern1.txt | 3 + .../domains/nytimes.com/xhr/pattern1.txt | 0 .../tcgplayer.com/scripts/pattern1.txt | 6 + .../domains/tcgplayer.com/styles/pattern1.txt | 0 .../domains/tcgplayer.com/xhr/pattern1.txt | 0 .../domains/tiktok.com/scripts/pattern1.txt | 21 ++ .../domains/tiktok.com/styles/pattern1.txt | 0 .../domains/tiktok.com/xhr/pattern1.txt | 0 .../domains/upwork.com/scripts/pattern1.txt | 14 ++ .../domains/upwork.com/styles/pattern1.txt | 4 + .../domains/upwork.com/xhr/pattern1.txt | 0 .../wikipedia.org/scripts/pattern1.txt | 8 + .../domains/wikipedia.org/styles/pattern1.txt | 0 .../domains/wikipedia.org/xhr/pattern1.txt | 0 .../domains/x.com/scripts/pattern1.txt | 6 + .../domains/x.com/styles/pattern1.txt | 0 .../domains/x.com/xhr/pattern1.txt | 0 61 files changed, 399 insertions(+), 673 deletions(-) create mode 100644 spider_chrome/build.rs delete mode 100644 spider_chrome/src/handler/blockers/amazon_blockers.rs delete mode 100644 spider_chrome/src/handler/blockers/ebay_blockers.rs delete mode 100644 spider_chrome/src/handler/blockers/glassdoor_blockers.rs delete mode 100644 spider_chrome/src/handler/blockers/linkedin_blockers.rs delete mode 100644 spider_chrome/src/handler/blockers/medium_blockers.rs delete mode 100644 spider_chrome/src/handler/blockers/netflix_blockers.rs delete mode 100644 spider_chrome/src/handler/blockers/nytimes_blockers.rs delete mode 100644 spider_chrome/src/handler/blockers/tcgplayer_blockers.rs delete mode 100644 spider_chrome/src/handler/blockers/tiktok_blockers.rs delete mode 100644 spider_chrome/src/handler/blockers/upwork_blockers.rs delete mode 100644 spider_chrome/src/handler/blockers/wikipedia_blockers.rs delete mode 100644 spider_chrome/src/handler/blockers/x_blockers.rs create mode 100644 spider_chrome/url_patterns/domains/amazon.com/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/amazon.com/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/amazon.com/xhr/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/ebay.com/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/ebay.com/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/ebay.com/xhr/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/facebook.com/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/facebook.com/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/facebook.com/xhr/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/glassdoor.com/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/glassdoor.com/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/glassdoor.com/xhr/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/linkedin.com/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/linkedin.com/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/linkedin.com/xhr/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/medium.com/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/medium.com/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/medium.com/xhr/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/netflix.com/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/netflix.com/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/netflix.com/xhr/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/nytimes.com/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/nytimes.com/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/nytimes.com/xhr/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/tcgplayer.com/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/tcgplayer.com/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/tcgplayer.com/xhr/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/tiktok.com/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/tiktok.com/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/tiktok.com/xhr/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/upwork.com/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/upwork.com/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/upwork.com/xhr/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/wikipedia.org/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/wikipedia.org/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/wikipedia.org/xhr/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/x.com/scripts/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/x.com/styles/pattern1.txt create mode 100644 spider_chrome/url_patterns/domains/x.com/xhr/pattern1.txt diff --git a/Cargo.lock b/Cargo.lock index 45cb66c08..8f017fded 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4354,6 +4354,7 @@ dependencies = [ "hashbrown 0.15.2", "lazy_static", "phf 0.11.2", + "phf_codegen 0.11.2", "pin-project-lite", "proc-macro2", "quote", diff --git a/spider/src/features/chrome_common.rs b/spider/src/features/chrome_common.rs index ec8da2eef..a92b81c31 100644 --- a/spider/src/features/chrome_common.rs +++ b/spider/src/features/chrome_common.rs @@ -10,14 +10,14 @@ use chromiumoxide::handler::blockers::intercept_manager::NetworkInterceptManager pub enum NetworkInterceptManager { #[default] /// Unknown - Unknown, + UNKNOWN, } #[cfg(not(feature = "chrome"))] impl NetworkInterceptManager { /// a custom intercept handle. pub fn new(_url: &Option>) -> NetworkInterceptManager { - NetworkInterceptManager::Unknown + NetworkInterceptManager::UNKNOWN } } diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 1dd610542..afd2729b7 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -62,6 +62,10 @@ chrono = "0.4" tracing-subscriber = "0.3" tokio = { version = "1", features = ["rt-multi-thread", "time", "macros"] } +[build-dependencies] +phf = { version = "0.11", default-features = false } +phf_codegen = "0.11" + [features] default = ["bytes"] stream = ["tokio-tungstenite/stream"] diff --git a/spider_chrome/build.rs b/spider_chrome/build.rs new file mode 100644 index 000000000..3518b67bf --- /dev/null +++ b/spider_chrome/build.rs @@ -0,0 +1,223 @@ +extern crate phf_codegen; +use std::env; +use std::fs::{self, File}; +use std::io::{BufWriter, Write}; +use std::path::Path; + +fn main() { + let out_dir = env::var("OUT_DIR").unwrap(); + let domain_map_path = Path::new(&out_dir).join("domain_map.rs"); + let url_trie_path = Path::new(&out_dir).join("url_ignore_trie.rs"); + let blockers_dir = Path::new(&out_dir).join("blockers"); + fs::create_dir_all(&blockers_dir).unwrap(); + + let pattern_dir = "url_patterns/domains"; + + generate_domain_map(&domain_map_path, pattern_dir); + generate_url_ignore_tries(&url_trie_path, pattern_dir); + generate_blockers(&blockers_dir, pattern_dir); + generate_blockers_mod(&blockers_dir, pattern_dir); +} + +fn generate_domain_map(domain_map_path: &Path, pattern_dir: &str) { + let mut file = BufWriter::new(File::create(&domain_map_path).unwrap()); + let mut map = phf_codegen::Map::new(); + + writeln!(file, "mod blockers;\nmod url_ignore_trie;").unwrap(); + writeln!( + &mut file, + "#[derive(Default, Debug, Clone, Copy, PartialEq)]" + ) + .unwrap(); + writeln!( + &mut file, + r#"#[derive(serde::Serialize, serde::Deserialize)]"# + ) + .unwrap(); + writeln!(&mut file, "pub enum NetworkInterceptManager {{").unwrap(); + + let mut domain_variants = vec![]; + + for entry in fs::read_dir(pattern_dir).unwrap() { + let entry = entry.unwrap(); + let path = entry.path(); + + if let Some(domain_name) = path.file_stem().unwrap().to_str() { + let enum_name = format_ident(domain_name); + writeln!(&mut file, " {},", enum_name).unwrap(); + domain_variants.push((domain_name.to_string(), enum_name.clone())); + map.entry( + format!("{}", domain_name), + &format!("NetworkInterceptManager::{}", enum_name), + ); + } + } + + writeln!(&mut file, " #[default]\n UNKNOWN,").unwrap(); // Default case + writeln!(&mut file, "}}\n").unwrap(); + + write!( + file, + "static DOMAIN_MAP: phf::Map<&'static str, NetworkInterceptManager> = {};\n", + map.build() + ) + .unwrap(); + + writeln!(file, "impl NetworkInterceptManager {{").unwrap(); + writeln!(file, " pub fn intercept_detection(&self, url: &str, ignore_visuals: bool, is_xhr: bool) -> bool {{").unwrap(); + writeln!(file, " let mut should_block = false;").unwrap(); + writeln!(file, " match self {{").unwrap(); + + for (domain_name, enum_name) in domain_variants { + let clean_name = domain_name.split('.').next().unwrap().to_lowercase(); + writeln!( + file, + " NetworkInterceptManager::{} => {{", + enum_name + ) + .unwrap(); + writeln!(file, " if is_xhr {{").unwrap(); + writeln!( + file, + " should_block = blockers::{}_blockers::block_xhr(url);", + clean_name + ) + .unwrap(); + writeln!(file, " }} else {{").unwrap(); + writeln!( + file, + " should_block = blockers::{}_blockers::block_scripts(url);", + clean_name + ) + .unwrap(); + writeln!( + file, + " if !should_block && ignore_visuals {{" + ) + .unwrap(); + writeln!( + file, + " should_block = blockers::{}_blockers::block_styles(url);", + clean_name + ) + .unwrap(); + writeln!(file, " }}").unwrap(); + writeln!(file, " }}").unwrap(); + writeln!(file, " }},").unwrap(); + } + + writeln!(file, " NetworkInterceptManager::UNKNOWN => (),").unwrap(); + + writeln!(file, " }}").unwrap(); + writeln!(file, " should_block").unwrap(); + writeln!(file, " }}").unwrap(); + writeln!(file, "}}").unwrap(); +} + +fn generate_url_ignore_tries(url_trie_path: &Path, pattern_dir: &str) { + let mut file = BufWriter::new(File::create(url_trie_path).unwrap()); + + writeln!(file, "use crate::handler::blockers::Trie;").unwrap(); + writeln!(file, "lazy_static::lazy_static! {{").unwrap(); + + for category in &["scripts", "xhr", "styles"] { + if let Ok(domain_entries) = fs::read_dir(pattern_dir) { + for domain_entry in domain_entries { + let domain_entry = domain_entry.unwrap(); + let domain_path = domain_entry.path(); + + if domain_path.is_dir() { + let domain_name = domain_path.file_name().unwrap().to_str().unwrap(); + let category_domain_path = domain_path.join(category); + + if let Ok(category_entries) = fs::read_dir(&category_domain_path) { + let trie_name = format_ident(&format!("{}_{}", domain_name, category)); + writeln!( + file, + "pub static ref {}_TRIE: Trie = {{", + trie_name.to_uppercase() + ) + .unwrap(); + writeln!(file, "let mut trie = Trie::new();").unwrap(); + + for entry in category_entries { + let entry = entry.unwrap(); + let path = entry.path(); + + if path.is_file() { + let contents = fs::read_to_string(path).unwrap(); + for pattern in contents.lines() { + writeln!(file, "trie.insert({:?});", pattern.trim()).unwrap(); + } + } + } + + writeln!(file, "trie").unwrap(); + writeln!(file, "}};").unwrap(); + } + } + } + } + } + + writeln!(file, "}}").unwrap(); +} + +fn generate_blockers(blockers_dir: &Path, pattern_dir: &str) { + if let Ok(domain_entries) = fs::read_dir(pattern_dir) { + for domain_entry in domain_entries { + let domain_entry = domain_entry.unwrap(); + let domain_path = domain_entry.path(); + + if domain_path.is_dir() { + let domain_name = domain_path.file_name().unwrap().to_str().unwrap(); + let file_name = format!("{}_blockers.rs", domain_name.split('.').next().unwrap()); + let file_path = blockers_dir.join(file_name); + let mut file = BufWriter::new(File::create(file_path).unwrap()); + + // Generate block_scripts + let scripts_trie_name = format_ident(&format!("{}_scripts", domain_name)); + writeln!(file, "pub fn block_scripts(url: &str) -> bool {{").unwrap(); + writeln!(file, " crate::handler::blockers::intercept_manager::url_ignore_trie::{}_TRIE.contains_prefix(url)", scripts_trie_name.to_uppercase()).unwrap(); + writeln!(file, "}}\n").unwrap(); + + // Generate block_styles + let styles_trie_name = format_ident(&format!("{}_styles", domain_name)); + writeln!(file, "pub fn block_styles(url: &str) -> bool {{").unwrap(); + writeln!(file, " crate::handler::blockers::intercept_manager::url_ignore_trie::{}_TRIE.contains_prefix(url)", styles_trie_name.to_uppercase()).unwrap(); + writeln!(file, "}}\n").unwrap(); + + // Generate block_xhr + let xhr_trie_name = format_ident(&format!("{}_xhr", domain_name)); + writeln!(file, "pub fn block_xhr(url: &str) -> bool {{").unwrap(); + writeln!(file, " crate::handler::blockers::intercept_manager::url_ignore_trie::{}_TRIE.contains_prefix(url)", xhr_trie_name.to_uppercase()).unwrap(); + writeln!(file, "}}\n").unwrap(); + } + } + } +} + +fn generate_blockers_mod(blockers_dir: &Path, pattern_dir: &str) { + let mod_file_path = blockers_dir.join("mod.rs"); + let mut mod_file = BufWriter::new(File::create(mod_file_path).unwrap()); + + if let Ok(domain_entries) = fs::read_dir(pattern_dir) { + for domain_entry in domain_entries { + let domain_entry = domain_entry.unwrap(); + let clean_name = domain_entry + .file_name() + .to_str() + .unwrap_or_default() + .split('.') + .next() + .unwrap() + .to_lowercase(); + + writeln!(mod_file, "pub mod {}_blockers;", clean_name).unwrap(); + } + } +} + +fn format_ident(name: &str) -> String { + name.replace('.', "_").replace('-', "_").to_uppercase() +} diff --git a/spider_chrome/src/browser.rs b/spider_chrome/src/browser.rs index f1dc2fc8d..eafbafb88 100644 --- a/spider_chrome/src/browser.rs +++ b/spider_chrome/src/browser.rs @@ -770,7 +770,7 @@ impl Default for BrowserConfigBuilder { ignore_stylesheets: false, only_html: false, extra_headers: Default::default(), - intercept_manager: NetworkInterceptManager::Unknown, + intercept_manager: NetworkInterceptManager::UNKNOWN, } } } diff --git a/spider_chrome/src/handler/blockers/amazon_blockers.rs b/spider_chrome/src/handler/blockers/amazon_blockers.rs deleted file mode 100644 index 3b603c895..000000000 --- a/spider_chrome/src/handler/blockers/amazon_blockers.rs +++ /dev/null @@ -1,55 +0,0 @@ -use crate::handler::blockers::Trie; - -lazy_static::lazy_static! { - /// Ignore list of urls. - static ref URL_IGNORE_TRIE: Trie = { - let mut trie = Trie::new(); - let patterns = [ - // images - // "https://m.media-amazon.com", - // "https://images-na.ssl-images-amazon.com/images/", - // analytics and ads - "https://cognito-identity.us-east-1.amazonaws.com", - "https://completion.amazon.com/api/2017/suggestions", - "https://sts.us-east-1.amazonaws.com/", - "https://www.amazon.com/cross_border_interstitial_sp/render", - "https://aax-us-east-retail-direct.amazon.com/e/xsp/getAd", - "https://fls-na.amazon.com/1/batch/1/OE/", - "https://unagi.amazon.com/1/events/", - "https://images-na.ssl-images-amazon.com/images/S/apesafeframe/ape/sf/desktop/", - // ads - "https://m.media-amazon.com/images/G/01/csm/showads", - // we can prob search for rum subs uptop instead. - "https://dataplane.rum", - "https://client.rum", - ".amazon-adsystem.com", - "SearchPartnerAssets", - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; -} - -// Block amazon events that are not required -pub fn block_amazon( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, -) -> bool { - let mut block_request = URL_IGNORE_TRIE.contains_prefix(&event.request.url); - - if !block_request { - let u = &event.request.url; - - if u.ends_with("?pageViewLogging=1") - || u.starts_with("https://s.amazon-adsystem.com/") - || u.ends_with("inner-host.min.js") - || u.ends_with(".js?xcp") - || u.contains(".amazon-adsystem.com/") - { - block_request = true; - } - } - - block_request -} diff --git a/spider_chrome/src/handler/blockers/ebay_blockers.rs b/spider_chrome/src/handler/blockers/ebay_blockers.rs deleted file mode 100644 index 4c2a9c09f..000000000 --- a/spider_chrome/src/handler/blockers/ebay_blockers.rs +++ /dev/null @@ -1,29 +0,0 @@ -use crate::handler::blockers::Trie; - -lazy_static::lazy_static! { - /// Ignore list of urls. - static ref URL_IGNORE_TRIE: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "https://www.ebay.com/sch/ajax/autocomplete", - "https://www.ebay.com/blueberry/v1/ads/identity/pixelUrls", - "https://svcs.ebay.com/ufeservice/v1/events", - "https://www.ebay.com/gh/useracquisition?", - "https://vi.vipr.ebaydesc.com/", - "https://srv.main.ebayrtm.com/", - "https://www.ebay.com/nap/napkinapi/", - "https://ir.ebaystatic.com/rs/c/scandal/ScandalJS-" - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; -} - -// Block ebay events that are not required -pub fn block_ebay( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, -) -> bool { - URL_IGNORE_TRIE.contains_prefix(&event.request.url) -} diff --git a/spider_chrome/src/handler/blockers/glassdoor_blockers.rs b/spider_chrome/src/handler/blockers/glassdoor_blockers.rs deleted file mode 100644 index eca04ee2a..000000000 --- a/spider_chrome/src/handler/blockers/glassdoor_blockers.rs +++ /dev/null @@ -1,54 +0,0 @@ -use crate::handler::blockers::Trie; - -lazy_static::lazy_static! { - /// Ignore list of urls. - static ref URL_IGNORE_TRIE: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "https://www.glassdoor.com/garnish/static/js/gd-sw-register.", - "https://cdnjs.cloudflare.com/ajax/libs/prop-types/15.7.2/prop-types.min.js", - "https://www.glassdoor.com/autocomplete/location?", - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; - - /// Ignore list of urls styles. - static ref URL_IGNORE_TRIE_STYLES: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "https://www.glassdoor.com/sam-global-nav/static/", - "https://www.glassdoor.com/garnish/static/js/gd-", - "https://unpkg.com/@dotlottie/player-component@", - "https://www.glassdoor.com/job-search-next/assets/_next/static/", - "https://www.glassdoor.com/ei-overview-next/assets/_next/static/", - "https://www.glassdoor.com/occ-salaries-web/assets/_next/static/" - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; -} - -// Block glassdoor events that are not required -pub fn block_glassdoor_styles( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, -) -> bool { - URL_IGNORE_TRIE_STYLES.contains_prefix(&event.request.url) -} - -// Block glassdoor events that are not required -pub fn block_glassdoor( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, - ignore_visuals: bool, -) -> bool { - let blocked = URL_IGNORE_TRIE.contains_prefix(&event.request.url); - if !blocked && ignore_visuals { - block_glassdoor_styles(event) - } else { - blocked - } -} diff --git a/spider_chrome/src/handler/blockers/intercept_manager.rs b/spider_chrome/src/handler/blockers/intercept_manager.rs index 341f84568..b99c79841 100644 --- a/spider_chrome/src/handler/blockers/intercept_manager.rs +++ b/spider_chrome/src/handler/blockers/intercept_manager.rs @@ -1,79 +1,23 @@ -use phf::phf_map; - -/// Custom network intercept types to expect on a domain -#[derive(Debug, Default, Clone, Copy, serde::Serialize, serde::Deserialize, PartialEq)] -pub enum NetworkInterceptManager { - /// tiktok.com - TikTok, - /// facebook.com - Facebook, - /// amazon.com - Amazon, - /// x.com - X, - /// LinkedIn, - LinkedIn, - /// netflix.com - Netflix, - /// medium.com - Medium, - /// upwork.com, - Upwork, - /// glassdoor.com - Glassdoor, - /// ebay.com - Ebay, - /// nytimes.com - Nytimes, - /// wikipedia.com - Wikipedia, - /// tcgplayer.com - Tcgplayer, - #[default] - /// Unknown - Unknown, -} - -/// Top tier 100 domain list. -static DOMAIN_MAP: phf::Map<&'static str, NetworkInterceptManager> = phf_map! { - "tiktok.com" => NetworkInterceptManager::TikTok, - "facebook.com" => NetworkInterceptManager::Facebook, - "amazon.com" => NetworkInterceptManager::Amazon, - "x.com" => NetworkInterceptManager::X, - "linkedin.com" => NetworkInterceptManager::LinkedIn, - "netflix.com" => NetworkInterceptManager::Netflix, - "medium.com" => NetworkInterceptManager::Medium, - "upwork.com" => NetworkInterceptManager::Upwork, - "glassdoor.com" => NetworkInterceptManager::Glassdoor, - "ebay.com" => NetworkInterceptManager::Ebay, - "nytimes.com" => NetworkInterceptManager::Nytimes, - "wikipedia.org" => NetworkInterceptManager::Wikipedia, - "tcgplayer.com" => NetworkInterceptManager::Tcgplayer, -}; +include!(concat!(env!("OUT_DIR"), "/domain_map.rs")); impl NetworkInterceptManager { pub fn new(url: &Option>) -> NetworkInterceptManager { if let Some(parsed_url) = url { if let Some(domain) = parsed_url.domain() { - // list of top websites should at most two - can always do a second pass. let domain_parts: Vec<&str> = domain.split('.').collect(); let base_domain = if domain_parts.len() > 2 { - format!( - "{}.{}", - domain_parts[domain_parts.len() - 2], - domain_parts[domain_parts.len() - 1] - ) + domain_parts[domain_parts.len() - 2..].join(".") } else { domain.to_string() }; return *DOMAIN_MAP .get(&base_domain) - .unwrap_or(&NetworkInterceptManager::Unknown); + .unwrap_or(&NetworkInterceptManager::UNKNOWN); } } - NetworkInterceptManager::Unknown + NetworkInterceptManager::UNKNOWN } } @@ -82,7 +26,7 @@ mod tests { use super::*; use url::Url; - /// Helper function to create an Option> from a string + // Helper function to create an Option> from a string fn create_url(url: &str) -> Option> { Url::parse(url).ok().map(Box::new) } @@ -90,33 +34,33 @@ mod tests { #[test] fn test_known_domains() { let cases = vec![ - ("http://www.tiktok.com", NetworkInterceptManager::TikTok), - ("https://facebook.com", NetworkInterceptManager::Facebook), - ("https://www.amazon.com", NetworkInterceptManager::Amazon), + ("http://www.tiktok.com", NetworkInterceptManager::TIKTOK), + ("https://facebook.com", NetworkInterceptManager::FACEBOOK), + ("https://www.amazon.com", NetworkInterceptManager::AMAZON), ("https://subdomain.x.com", NetworkInterceptManager::X), ( "https://linkedin.com/in/someone", - NetworkInterceptManager::LinkedIn, + NetworkInterceptManager::LINKEDIN, ), ( "https://www.netflix.com/browse", - NetworkInterceptManager::Netflix, + NetworkInterceptManager::NETFLIX, ), - ("https://medium.com", NetworkInterceptManager::Medium), - ("https://sub.upwork.com", NetworkInterceptManager::Upwork), - ("https://glassdoor.com", NetworkInterceptManager::Glassdoor), - ("https://ebay.com", NetworkInterceptManager::Ebay), + ("https://medium.com", NetworkInterceptManager::MEDIUM), + ("https://sub.upwork.com", NetworkInterceptManager::UPWORK), + ("https://glassdoor.com", NetworkInterceptManager::GLASSDOOR), + ("https://ebay.com", NetworkInterceptManager::EBAY), ( "https://nytimes.com/section/world", - NetworkInterceptManager::Nytimes, + NetworkInterceptManager::NYTIMES, ), ( "https://en.wikipedia.org/wiki/Rust", - NetworkInterceptManager::Wikipedia, + NetworkInterceptManager::WIKIPEDIA, ), ( "https://market.tcgplayer.com", - NetworkInterceptManager::Tcgplayer, + NetworkInterceptManager::TCGPLAYER, ), ]; @@ -137,7 +81,7 @@ mod tests { for url in cases { assert_eq!( NetworkInterceptManager::new(&create_url(url)), - NetworkInterceptManager::Unknown + NetworkInterceptManager::UNKNOWN ); } } @@ -149,7 +93,7 @@ mod tests { for url in cases { assert_eq!( NetworkInterceptManager::new(&create_url(url)), - NetworkInterceptManager::Unknown + NetworkInterceptManager::UNKNOWN ); } } diff --git a/spider_chrome/src/handler/blockers/linkedin_blockers.rs b/spider_chrome/src/handler/blockers/linkedin_blockers.rs deleted file mode 100644 index 685d8f392..000000000 --- a/spider_chrome/src/handler/blockers/linkedin_blockers.rs +++ /dev/null @@ -1,29 +0,0 @@ -use crate::handler::blockers::Trie; - -lazy_static::lazy_static! { - /// Ignore list of urls. - static ref URL_IGNORE_TRIE: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "/log", - "https://www.linkedin.com/li/track", - "https://li.protechts.net", - "https://www.linkedin.com/platform-telemetry/li", - "https://www.linkedin.com/organization-guest/api/feedUpdates/", - "https://www.linkedin.com/feedcontent-guest/api/ingraphs/gauge", - "https://www.linkedin.com/voyager/api/", - "https://platform.linkedin.com/litms/allowlist/voyager-web-global" - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; -} - -// Block linkedin events that are not required -pub fn block_linkedin( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, -) -> bool { - URL_IGNORE_TRIE.contains_prefix(&event.request.url) -} diff --git a/spider_chrome/src/handler/blockers/medium_blockers.rs b/spider_chrome/src/handler/blockers/medium_blockers.rs deleted file mode 100644 index 9c2ecf2e4..000000000 --- a/spider_chrome/src/handler/blockers/medium_blockers.rs +++ /dev/null @@ -1,29 +0,0 @@ -use crate::handler::blockers::Trie; - -lazy_static::lazy_static! { - /// Ignore list of urls. - static ref URL_IGNORE_TRIE: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "https://cdn-client.medium.com/lite/static/js/instrumentation.", - "https://medium.com/_/clientele/reports/performance/", - "https://cdn-client.medium.com/lite/static/js/reporting.f", - "https://medium.com/_/clientele/reports/performance/", - "https://cdn-client.medium.com/lite/static/js/manifest.", - "clientele/reports/performance/", - "https://www.google.com/js/bg/", - "https://chitaranjanbiswal93.medium.com/_/clientele/reports/performance/" - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; -} - -// Block medium events that are not required -pub fn block_medium( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, -) -> bool { - URL_IGNORE_TRIE.contains_prefix(&event.request.url) -} diff --git a/spider_chrome/src/handler/blockers/mod.rs b/spider_chrome/src/handler/blockers/mod.rs index 694e65fc9..8972242b6 100644 --- a/spider_chrome/src/handler/blockers/mod.rs +++ b/spider_chrome/src/handler/blockers/mod.rs @@ -1,47 +1,26 @@ /// adblock patterns pub mod adblock_patterns; -/// amazon blockers -pub mod amazon_blockers; -/// ebay blockers -pub mod ebay_blockers; -/// glassdoor blockers -pub mod glassdoor_blockers; /// interception manager pub mod intercept_manager; -/// linkedin blockers -pub mod linkedin_blockers; -/// medium blockers -pub mod medium_blockers; -/// netflix blockers -pub mod netflix_blockers; -/// nytimes blockers -pub mod nytimes_blockers; /// script blockers pub mod scripts; -/// block tcgplayer.com -pub mod tcgplayer_blockers; -/// tiktok blockers -pub mod tiktok_blockers; -/// upwork blockers -pub mod upwork_blockers; -/// wikipedia blockers -pub mod wikipedia_blockers; -/// x blockers -pub mod x_blockers; - /// xhr blockers pub mod xhr; // Trie node for ignore. -#[derive(Default)] -pub(crate) struct TrieNode { - children: hashbrown::HashMap, - is_end_of_word: bool, +#[derive(Default, Debug)] +pub struct TrieNode { + /// Children for trie. + pub children: hashbrown::HashMap, + /// End of word match. + pub is_end_of_word: bool, } /// Basic Ignore trie. -pub(crate) struct Trie { - root: TrieNode, +#[derive(Debug)] +pub struct Trie { + /// The trie node. + pub root: TrieNode, } impl Trie { diff --git a/spider_chrome/src/handler/blockers/netflix_blockers.rs b/spider_chrome/src/handler/blockers/netflix_blockers.rs deleted file mode 100644 index abda5c0a4..000000000 --- a/spider_chrome/src/handler/blockers/netflix_blockers.rs +++ /dev/null @@ -1,24 +0,0 @@ -use crate::handler::blockers::Trie; - -lazy_static::lazy_static! { - /// Ignore list of urls. - static ref URL_IGNORE_TRIE: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "/log", - "https://assets.nflxext.com/web/", - "https://ae.nflximg.net/monet/scripts/adtech_iframe", - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; -} - -// Block netflix events that are not required -pub fn block_netflix( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, -) -> bool { - URL_IGNORE_TRIE.contains_prefix(&event.request.url) -} diff --git a/spider_chrome/src/handler/blockers/nytimes_blockers.rs b/spider_chrome/src/handler/blockers/nytimes_blockers.rs deleted file mode 100644 index f3a84774f..000000000 --- a/spider_chrome/src/handler/blockers/nytimes_blockers.rs +++ /dev/null @@ -1,50 +0,0 @@ -use crate::handler::blockers::Trie; - -lazy_static::lazy_static! { - /// Ignore list of urls. - static ref URL_IGNORE_TRIE: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "https://purr.nytimes.com/v1/purr-cache", - "https://static01.nyt.com/ads/tpc-check.html", - "https://www.nytimes.com/vi-assets/static-assets/adslot", - "https://purr.nytimes.com/v2/tcf", - "https://a.et.nytimes.com//.status", - "https://www.nytimes.com/fides/api/v1/privacy-experience?", - "https://o82024.ingest.us.sentry.io/", - "https://a.nytimes.com/svc/nyt/data-layer?", - "https://www.nytimes.com/ads/prebid9.11.0.js" - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; - /// Ignore list of urls. - static ref URL_IGNORE_TRIE_VISUALS: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "https://static01.nyt.com/video-static/vhs3/vhs.min.js", - "https://www.nytimes.com/vi-assets/static-assets/vendors~", - "https://als-svc.nytimes.com/als" - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; -} - -// Block nytimes events that are not required -pub fn block_nytimes( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, - ignore_visuals: bool, -) -> bool { - let mut allowed = URL_IGNORE_TRIE.contains_prefix(&event.request.url); - - if !allowed && ignore_visuals { - allowed = URL_IGNORE_TRIE_VISUALS.contains_prefix(&event.request.url) - } - - allowed -} diff --git a/spider_chrome/src/handler/blockers/tcgplayer_blockers.rs b/spider_chrome/src/handler/blockers/tcgplayer_blockers.rs deleted file mode 100644 index bdf79b05d..000000000 --- a/spider_chrome/src/handler/blockers/tcgplayer_blockers.rs +++ /dev/null @@ -1,27 +0,0 @@ -use crate::handler::blockers::Trie; - -lazy_static::lazy_static! { - /// Ignore list of urls. - static ref URL_IGNORE_TRIE: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "https://data.tcgplayer.com/suggestions/trending", - "https://mpapi.tcgplayer.com/v2/kickbacks?active=true", - "https://homepage.marketplace.tcgplayer.com/sitealert.json", - "https://infinite-api.tcgplayer.com/signup/?", - "https://features.tcgplayer.com/v1/optimizely/Variation/", - "https://mpapi.tcgplayer.com/v2/address/countryCodes?mpfev=3031" - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; -} - -// Block tcgplayer events that are not required -pub fn block_tcgplayer( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, -) -> bool { - URL_IGNORE_TRIE.contains_prefix(&event.request.url) -} diff --git a/spider_chrome/src/handler/blockers/tiktok_blockers.rs b/spider_chrome/src/handler/blockers/tiktok_blockers.rs deleted file mode 100644 index 69298fe70..000000000 --- a/spider_chrome/src/handler/blockers/tiktok_blockers.rs +++ /dev/null @@ -1,45 +0,0 @@ -use crate::handler::blockers::Trie; - -lazy_static::lazy_static! { - /// Ignore list of urls. - static ref URL_IGNORE_TRIE: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "https://mcs.tiktokw.us/v1/list", - "https://www.tiktok.com/ttwid/check", - "https://www.tiktok.com/api/share/settings", - "https://webcast.us.tiktok.com/", - "https://www.tiktok.com/api/ba/business/suite/permission/list", - "https://www.tiktok.com/api/policy/notice/", - "https://www.tiktok.com/api/v1/web-cookie-privacy", - "https://www.tiktok.com/aweme/v1/report/inbox/notice", - "https://www.tiktok.com/api/inbox/notice_count/", - "https://mcs.tiktokv.us/v1/user/webid", - "https://mon16-normal-useast5.tiktokv.us/monitor_browser/collect/batch/?bid=tiktok_pns_web_runtime", - "https://webcast.tiktok.com/webcast/wallet_api/fs/diamond_buy", - "https://lf16-tiktok-web.tiktokcdn-us.com/obj/tiktok-web-tx/tiktok_privacy_protection_framework/loader/", - "https://lf16-tiktok-web.tiktokcdn-us.com/obj/tiktok-web-tx/tiktok/webapp/main/webapp-desktop/npm-async-bric_verify_sec_sdk_build_captcha", - "/tiktok_privacy_protection_framework/loader", - "/obj/tiktok-web-tx/tiktok_privacy_protection_framework/loader", - "/service/2/abtest_config/", - "collect/batch/?bid=tiktok_pns_web_runtime", - // "https://libraweb.tiktokw.us/service/2/abtest_config/", - // "https://lf16-cdn-tos.tiktokcdn-us.com/obj/static-tx/secsdk/secsdk-lastest.umd.js", - "monitor_browser/collect/batch/?bid=tiktok_pns_web_runtime", - "/tiktok-cookie-banner/", - // custom framework - "/tiktok/webapp/main/webapp-desktop-islands/npm-async-bric_verify_sec_sdk_build_captcha_", - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; -} - -// Block tiktok events that are not required -pub fn block_tiktok( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, -) -> bool { - URL_IGNORE_TRIE.contains_prefix(&event.request.url) -} diff --git a/spider_chrome/src/handler/blockers/upwork_blockers.rs b/spider_chrome/src/handler/blockers/upwork_blockers.rs deleted file mode 100644 index dfef50ff7..000000000 --- a/spider_chrome/src/handler/blockers/upwork_blockers.rs +++ /dev/null @@ -1,65 +0,0 @@ -use crate::handler::blockers::Trie; - -lazy_static::lazy_static! { - /// Ignore list of urls. - static ref URL_IGNORE_TRIE: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "https://www.upwork.com/shitake/suit", - "https://www.upwork.com/upi/jslogger", - "https://mpsnare.iesnare.com/5.8.1/logo.js", - "https://first.iovation.com/", - "https://zn0izjiulta2j2t4o-upwork.siteintercept.qualtrics.com/", - "https://cdn123.forter.com/", - "https://www.upwork.com/static/assets/TopNavSsi/visitor-v2/js/manifest.", - "https://www.upwork.com/iojs/general5/static_wdp.js", - "https://www.upwork.com/static/suit2-tracker/", - "https://www.upwork.com/api/graphql/v1?alias=spellCheck", - "https://www.upwork.com/api/graphql/v1?alias=relatedSuggestions", - "https://www.upwork.com/api/graphql/v1?alias=autoSuggestions", - ".siteintercept.qualtrics.com/", - ".forter.com", - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; - - /// Ignore list of urls styles. - static ref URL_IGNORE_TRIE_STYLES: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "https://www.upwork.com/static/assets/TopNavSsi/visitor-v2/", - // 1 missing link needs further looking into for each of the styles - "https://www.upwork.com/static/assets/UniversalSearchNuxt/styles~", - "https://www.upwork.com/static/assets/Brontes/styles", - "https://www.upwork.com/static/assets/Brontes/google-one-tap.6226625d.js" - - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; -} - -// Block upwork events that are not required -pub fn block_upwork_styles( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, -) -> bool { - URL_IGNORE_TRIE_STYLES.contains_prefix(&event.request.url) -} - -// Block upwork events that are not required -pub fn block_upwork( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, - ignore_visuals: bool, -) -> bool { - let blocked = URL_IGNORE_TRIE.contains_prefix(&event.request.url); - if !blocked && ignore_visuals { - block_upwork_styles(event) - } else { - blocked - } -} diff --git a/spider_chrome/src/handler/blockers/wikipedia_blockers.rs b/spider_chrome/src/handler/blockers/wikipedia_blockers.rs deleted file mode 100644 index 0ec2886cc..000000000 --- a/spider_chrome/src/handler/blockers/wikipedia_blockers.rs +++ /dev/null @@ -1,29 +0,0 @@ -use crate::handler::blockers::Trie; - -lazy_static::lazy_static! { - /// Ignore list of urls. - static ref URL_IGNORE_TRIE: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "https://meta.wikimedia.org/w/index.php?title=MediaWiki:Wikiminiatlas.js&action=raw&ctype=text/javascript", - "https://login.wikimedia.org/wiki/Special:CentralAutoLogin/checkLoggedIn?useformat=desktop&wikiid=ptwiki&type=script&wikiid=ptwiki&type=script", - ".wikipedia.org/w/load.php?lang=pt&modules=ext.centralNotice.choiceData%2CgeoIP%2CstartUp%7Cext.centralauth.ForeignApi%2Ccentralautologin%7Cext.checkUser.clientHints%7Cext.cite.ux-enhancements%7Cext.cx.eventlogging.campaigns", - ".wikipedia.org/w/load.php?lang=pt&modules=startup&only=scripts&raw=1&skin=vector-2022", - ".eventlogging.campaigns", - "%2CFeedbackHighlight%2", - ".quicksurveys.", - "Special:CentralAutoLogin/start?type=script", - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; -} - -// Block wikipedia events that are not required -pub fn block_wikipedia( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, -) -> bool { - URL_IGNORE_TRIE.contains_prefix(&event.request.url) -} diff --git a/spider_chrome/src/handler/blockers/x_blockers.rs b/spider_chrome/src/handler/blockers/x_blockers.rs deleted file mode 100644 index bb98e55eb..000000000 --- a/spider_chrome/src/handler/blockers/x_blockers.rs +++ /dev/null @@ -1,28 +0,0 @@ -use crate::handler::blockers::Trie; - -lazy_static::lazy_static! { - /// Ignore list of urls. - static ref URL_IGNORE_TRIE: Trie = { - let mut trie = Trie::new(); - let patterns = [ - "https://accounts.google.com/gsi/", - "https://appleid.cdn-apple.com/appleauth/static/jsapi/appleid/1/en_US/appleid.auth.js", - "https://api.x.com/1.1/onboarding/sso_init.json", - "https://api.x.com/1.1/jot/client_event.json", - "https://api.x.com/1.1/jot/error_log.json", - "https://api.x.com/1.1/hashflags.json", - // "https://abs.twimg.com/responsive-web/client-web/" - ]; - for pattern in &patterns { - trie.insert(pattern); - } - trie - }; -} - -// Block x events that are not required -pub fn block_x( - event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, -) -> bool { - URL_IGNORE_TRIE.contains_prefix(&event.request.url) -} diff --git a/spider_chrome/src/handler/mod.rs b/spider_chrome/src/handler/mod.rs index 116efd56f..9b0534275 100644 --- a/spider_chrome/src/handler/mod.rs +++ b/spider_chrome/src/handler/mod.rs @@ -730,7 +730,7 @@ impl Default for HandlerConfig { only_html: false, extra_headers: Default::default(), created_first_target: false, - intercept_manager: NetworkInterceptManager::Unknown, + intercept_manager: NetworkInterceptManager::UNKNOWN, } } } diff --git a/spider_chrome/src/handler/network.rs b/spider_chrome/src/handler/network.rs index 6d56f4d46..8f53b4470 100644 --- a/spider_chrome/src/handler/network.rs +++ b/spider_chrome/src/handler/network.rs @@ -155,7 +155,7 @@ impl NetworkManager { block_stylesheets: false, block_analytics: true, only_html: false, - intercept_manager: NetworkInterceptManager::Unknown, + intercept_manager: NetworkInterceptManager::UNKNOWN, } } @@ -273,7 +273,7 @@ impl NetworkManager { // ignore assets we do not need for frameworks if !ignore_script - && intercept_manager == NetworkInterceptManager::Unknown + && intercept_manager == NetworkInterceptManager::UNKNOWN { let hydration_file = JS_FRAMEWORK_PATH.iter().any(|p| new_url.starts_with(p)); @@ -434,54 +434,11 @@ impl NetworkManager { || network_resource || event.resource_type == ResourceType::Document) { - match self.intercept_manager { - NetworkInterceptManager::TikTok => { - super::blockers::tiktok_blockers::block_tiktok(event) - } - NetworkInterceptManager::Amazon => { - super::blockers::amazon_blockers::block_amazon(event) - } - NetworkInterceptManager::X => { - super::blockers::x_blockers::block_x(event) - } - NetworkInterceptManager::Netflix => { - super::blockers::netflix_blockers::block_netflix(event) - } - NetworkInterceptManager::LinkedIn => { - super::blockers::linkedin_blockers::block_linkedin(event) - } - NetworkInterceptManager::Medium => { - super::blockers::medium_blockers::block_medium(event) - } - NetworkInterceptManager::Ebay => { - super::blockers::ebay_blockers::block_ebay(event) - } - NetworkInterceptManager::Wikipedia => { - super::blockers::wikipedia_blockers::block_wikipedia(event) - } - NetworkInterceptManager::Tcgplayer => { - super::blockers::tcgplayer_blockers::block_tcgplayer(event) - } - NetworkInterceptManager::Nytimes => { - super::blockers::nytimes_blockers::block_nytimes( - event, - self.ignore_visuals, - ) - } - NetworkInterceptManager::Glassdoor => { - super::blockers::glassdoor_blockers::block_glassdoor( - event, - self.ignore_visuals, - ) - } - NetworkInterceptManager::Upwork => { - super::blockers::upwork_blockers::block_upwork( - event, - self.ignore_visuals, - ) - } - _ => skip_networking, - } + self.intercept_manager.intercept_detection( + &event.request.url, + self.ignore_visuals, + network_resource, + ) } else { skip_networking }; @@ -585,54 +542,11 @@ impl NetworkManager { || network_resource || event.resource_type == ResourceType::Document) { - match self.intercept_manager { - NetworkInterceptManager::TikTok => { - super::blockers::tiktok_blockers::block_tiktok(event) - } - NetworkInterceptManager::Amazon => { - super::blockers::amazon_blockers::block_amazon(event) - } - NetworkInterceptManager::X => { - super::blockers::x_blockers::block_x(event) - } - NetworkInterceptManager::Netflix => { - super::blockers::netflix_blockers::block_netflix(event) - } - NetworkInterceptManager::LinkedIn => { - super::blockers::linkedin_blockers::block_linkedin(event) - } - NetworkInterceptManager::Tcgplayer => { - super::blockers::tcgplayer_blockers::block_tcgplayer(event) - } - NetworkInterceptManager::Medium => { - super::blockers::medium_blockers::block_medium(event) - } - NetworkInterceptManager::Ebay => { - super::blockers::ebay_blockers::block_ebay(event) - } - NetworkInterceptManager::Wikipedia => { - super::blockers::wikipedia_blockers::block_wikipedia(event) - } - NetworkInterceptManager::Nytimes => { - super::blockers::nytimes_blockers::block_nytimes( - event, - self.ignore_visuals, - ) - } - NetworkInterceptManager::Glassdoor => { - super::blockers::glassdoor_blockers::block_glassdoor( - event, - self.ignore_visuals, - ) - } - NetworkInterceptManager::Upwork => { - super::blockers::upwork_blockers::block_upwork( - event, - self.ignore_visuals, - ) - } - _ => skip_networking, - } + self.intercept_manager.intercept_detection( + &event.request.url, + self.ignore_visuals, + network_resource, + ) } else { skip_networking }; diff --git a/spider_chrome/src/handler/target.rs b/spider_chrome/src/handler/target.rs index d4e669cda..25b7c4e50 100644 --- a/spider_chrome/src/handler/target.rs +++ b/spider_chrome/src/handler/target.rs @@ -654,7 +654,7 @@ impl Default for TargetConfig { ignore_analytics: true, only_html: false, extra_headers: Default::default(), - intercept_manager: NetworkInterceptManager::Unknown, + intercept_manager: NetworkInterceptManager::UNKNOWN, } } } diff --git a/spider_chrome/url_patterns/domains/amazon.com/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/amazon.com/scripts/pattern1.txt new file mode 100644 index 000000000..99f8e5f08 --- /dev/null +++ b/spider_chrome/url_patterns/domains/amazon.com/scripts/pattern1.txt @@ -0,0 +1,15 @@ +https://cognito-identity.us-east-1.amazonaws.com +https://completion.amazon.com/api/2017/suggestions +https://sts.us-east-1.amazonaws.com/ +https://www.amazon.com/cross_border_interstitial_sp/render +https://aax-us-east-retail-direct.amazon.com/e/xsp/getAd +https://fls-na.amazon.com/1/batch/1/OE/ +https://unagi.amazon.com/1/events/ +https://images-na.ssl-images-amazon.com/images/S/apesafeframe/ape/sf/desktop/ +https://m.media-amazon.com/images/G/01/csm/showads +https://dataplane.rum +https://client.rum +.amazon-adsystem.com +SearchPartnerAssets +inner-host.min.js +https://s.amazon-adsystem.com/ \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/amazon.com/styles/pattern1.txt b/spider_chrome/url_patterns/domains/amazon.com/styles/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/amazon.com/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/amazon.com/xhr/pattern1.txt new file mode 100644 index 000000000..5a1c80c21 --- /dev/null +++ b/spider_chrome/url_patterns/domains/amazon.com/xhr/pattern1.txt @@ -0,0 +1,2 @@ +?pageViewLogging=1 +.amazon-adsystem.com/ \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/ebay.com/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/ebay.com/scripts/pattern1.txt new file mode 100644 index 000000000..4a4bf3b8f --- /dev/null +++ b/spider_chrome/url_patterns/domains/ebay.com/scripts/pattern1.txt @@ -0,0 +1,8 @@ +https://www.ebay.com/sch/ajax/autocomplete +https://www.ebay.com/blueberry/v1/ads/identity/pixelUrls +https://svcs.ebay.com/ufeservice/v1/events +https://www.ebay.com/gh/useracquisition? +https://vi.vipr.ebaydesc.com/ +https://srv.main.ebayrtm.com/ +https://www.ebay.com/nap/napkinapi/ +https://ir.ebaystatic.com/rs/c/scandal/ScandalJS- \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/ebay.com/styles/pattern1.txt b/spider_chrome/url_patterns/domains/ebay.com/styles/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/ebay.com/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/ebay.com/xhr/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/facebook.com/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/facebook.com/scripts/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/facebook.com/styles/pattern1.txt b/spider_chrome/url_patterns/domains/facebook.com/styles/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/facebook.com/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/facebook.com/xhr/pattern1.txt new file mode 100644 index 000000000..db8d9c82a --- /dev/null +++ b/spider_chrome/url_patterns/domains/facebook.com/xhr/pattern1.txt @@ -0,0 +1 @@ +https://www.facebook.com/ajax/webstorage/process_keys/?state=1 diff --git a/spider_chrome/url_patterns/domains/glassdoor.com/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/glassdoor.com/scripts/pattern1.txt new file mode 100644 index 000000000..08ececdf1 --- /dev/null +++ b/spider_chrome/url_patterns/domains/glassdoor.com/scripts/pattern1.txt @@ -0,0 +1,3 @@ +https://www.glassdoor.com/garnish/static/js/gd-sw-register. +https://cdnjs.cloudflare.com/ajax/libs/prop-types/15.7.2/prop-types.min.js +https://www.glassdoor.com/autocomplete/location? \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/glassdoor.com/styles/pattern1.txt b/spider_chrome/url_patterns/domains/glassdoor.com/styles/pattern1.txt new file mode 100644 index 000000000..e39868ae1 --- /dev/null +++ b/spider_chrome/url_patterns/domains/glassdoor.com/styles/pattern1.txt @@ -0,0 +1,6 @@ +https://www.glassdoor.com/sam-global-nav/static/ +https://www.glassdoor.com/garnish/static/js/gd- +https://unpkg.com/@dotlottie/player-component@ +https://www.glassdoor.com/job-search-next/assets/_next/static/ +https://www.glassdoor.com/ei-overview-next/assets/_next/static/ +https://www.glassdoor.com/occ-salaries-web/assets/_next/static/ \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/glassdoor.com/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/glassdoor.com/xhr/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/linkedin.com/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/linkedin.com/scripts/pattern1.txt new file mode 100644 index 000000000..728d26a4c --- /dev/null +++ b/spider_chrome/url_patterns/domains/linkedin.com/scripts/pattern1.txt @@ -0,0 +1,8 @@ +/log +https://www.linkedin.com/li/track +https://li.protechts.net +https://www.linkedin.com/platform-telemetry/li +https://www.linkedin.com/organization-guest/api/feedUpdates/ +https://www.linkedin.com/feedcontent-guest/api/ingraphs/gauge +https://www.linkedin.com/voyager/api/ +https://platform.linkedin.com/litms/allowlist/voyager-web-global \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/linkedin.com/styles/pattern1.txt b/spider_chrome/url_patterns/domains/linkedin.com/styles/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/linkedin.com/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/linkedin.com/xhr/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/medium.com/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/medium.com/scripts/pattern1.txt new file mode 100644 index 000000000..29c243b2c --- /dev/null +++ b/spider_chrome/url_patterns/domains/medium.com/scripts/pattern1.txt @@ -0,0 +1,8 @@ +https://cdn-client.medium.com/lite/static/js/instrumentation. +https://medium.com/_/clientele/reports/performance/ +https://cdn-client.medium.com/lite/static/js/reporting.f +https://medium.com/_/clientele/reports/performance/ +https://cdn-client.medium.com/lite/static/js/manifest. +clientele/reports/performance/ +https://www.google.com/js/bg/ +https://chitaranjanbiswal93.medium.com/_/clientele/reports/performance/ \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/medium.com/styles/pattern1.txt b/spider_chrome/url_patterns/domains/medium.com/styles/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/medium.com/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/medium.com/xhr/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/netflix.com/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/netflix.com/scripts/pattern1.txt new file mode 100644 index 000000000..23f646c2f --- /dev/null +++ b/spider_chrome/url_patterns/domains/netflix.com/scripts/pattern1.txt @@ -0,0 +1,3 @@ +/log +https://assets.nflxext.com/web/ +https://ae.nflximg.net/monet/scripts/adtech_iframe \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/netflix.com/styles/pattern1.txt b/spider_chrome/url_patterns/domains/netflix.com/styles/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/netflix.com/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/netflix.com/xhr/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/nytimes.com/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/nytimes.com/scripts/pattern1.txt new file mode 100644 index 000000000..1a2ee0b3e --- /dev/null +++ b/spider_chrome/url_patterns/domains/nytimes.com/scripts/pattern1.txt @@ -0,0 +1,9 @@ +https://purr.nytimes.com/v1/purr-cache +https://static01.nyt.com/ads/tpc-check.html +https://www.nytimes.com/vi-assets/static-assets/adslot +https://purr.nytimes.com/v2/tcf +https://a.et.nytimes.com//.status +https://www.nytimes.com/fides/api/v1/privacy-experience? +https://o82024.ingest.us.sentry.io/ +https://a.nytimes.com/svc/nyt/data-layer? +https://www.nytimes.com/ads/prebid9.11.0.js \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/nytimes.com/styles/pattern1.txt b/spider_chrome/url_patterns/domains/nytimes.com/styles/pattern1.txt new file mode 100644 index 000000000..f2f38f1b0 --- /dev/null +++ b/spider_chrome/url_patterns/domains/nytimes.com/styles/pattern1.txt @@ -0,0 +1,3 @@ +https://static01.nyt.com/video-static/vhs3/vhs.min.js +https://www.nytimes.com/vi-assets/static-assets/vendors~ +https://als-svc.nytimes.com/als \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/nytimes.com/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/nytimes.com/xhr/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/tcgplayer.com/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/tcgplayer.com/scripts/pattern1.txt new file mode 100644 index 000000000..8ed686297 --- /dev/null +++ b/spider_chrome/url_patterns/domains/tcgplayer.com/scripts/pattern1.txt @@ -0,0 +1,6 @@ +https://data.tcgplayer.com/suggestions/trending +https://mpapi.tcgplayer.com/v2/kickbacks?active=true +https://homepage.marketplace.tcgplayer.com/sitealert.json +https://infinite-api.tcgplayer.com/signup/? +https://features.tcgplayer.com/v1/optimizely/Variation/ +https://mpapi.tcgplayer.com/v2/address/countryCodes?mpfev=3031 \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/tcgplayer.com/styles/pattern1.txt b/spider_chrome/url_patterns/domains/tcgplayer.com/styles/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/tcgplayer.com/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/tcgplayer.com/xhr/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/tiktok.com/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/tiktok.com/scripts/pattern1.txt new file mode 100644 index 000000000..e39e423f8 --- /dev/null +++ b/spider_chrome/url_patterns/domains/tiktok.com/scripts/pattern1.txt @@ -0,0 +1,21 @@ +https://mcs.tiktokw.us/v1/list +https://www.tiktok.com/ttwid/check +https://www.tiktok.com/api/share/settings +https://webcast.us.tiktok.com/ +https://www.tiktok.com/api/ba/business/suite/permission/list +https://www.tiktok.com/api/policy/notice/ +https://www.tiktok.com/api/v1/web-cookie-privacy +https://www.tiktok.com/aweme/v1/report/inbox/notice +https://www.tiktok.com/api/inbox/notice_count/ +https://mcs.tiktokv.us/v1/user/webid +https://mon16-normal-useast5.tiktokv.us/monitor_browser/collect/batch/?bid=tiktok_pns_web_runtime +https://webcast.tiktok.com/webcast/wallet_api/fs/diamond_buy +https://lf16-tiktok-web.tiktokcdn-us.com/obj/tiktok-web-tx/tiktok_privacy_protection_framework/loader/ +https://lf16-tiktok-web.tiktokcdn-us.com/obj/tiktok-web-tx/tiktok/webapp/main/webapp-desktop/npm-async-bric_verify_sec_sdk_build_captcha +/tiktok_privacy_protection_framework/loader +/obj/tiktok-web-tx/tiktok_privacy_protection_framework/loader +/service/2/abtest_config/ +collect/batch/?bid=tiktok_pns_web_runtime +monitor_browser/collect/batch/?bid=tiktok_pns_web_runtime +/tiktok-cookie-banner/ +/tiktok/webapp/main/webapp-desktop-islands/npm-async-bric_verify_sec_sdk_build_captcha_ \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/tiktok.com/styles/pattern1.txt b/spider_chrome/url_patterns/domains/tiktok.com/styles/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/tiktok.com/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/tiktok.com/xhr/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/upwork.com/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/upwork.com/scripts/pattern1.txt new file mode 100644 index 000000000..03ed15ab0 --- /dev/null +++ b/spider_chrome/url_patterns/domains/upwork.com/scripts/pattern1.txt @@ -0,0 +1,14 @@ +https://www.upwork.com/shitake/suit +https://www.upwork.com/upi/jslogger +https://mpsnare.iesnare.com/5.8.1/logo.js +https://first.iovation.com/ +https://zn0izjiulta2j2t4o-upwork.siteintercept.qualtrics.com/ +https://cdn123.forter.com/ +https://www.upwork.com/static/assets/TopNavSsi/visitor-v2/js/manifest. +https://www.upwork.com/iojs/general5/static_wdp.js +https://www.upwork.com/static/suit2-tracker/ +https://www.upwork.com/api/graphql/v1?alias=spellCheck +https://www.upwork.com/api/graphql/v1?alias=relatedSuggestions +https://www.upwork.com/api/graphql/v1?alias=autoSuggestions +.siteintercept.qualtrics.com/ +.forter.com \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/upwork.com/styles/pattern1.txt b/spider_chrome/url_patterns/domains/upwork.com/styles/pattern1.txt new file mode 100644 index 000000000..34487841e --- /dev/null +++ b/spider_chrome/url_patterns/domains/upwork.com/styles/pattern1.txt @@ -0,0 +1,4 @@ +https://www.upwork.com/static/assets/TopNavSsi/visitor-v2/ +https://www.upwork.com/static/assets/UniversalSearchNuxt/styles~ +https://www.upwork.com/static/assets/Brontes/styles +https://www.upwork.com/static/assets/Brontes/google-one-tap.6226625d.js \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/upwork.com/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/upwork.com/xhr/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/wikipedia.org/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/wikipedia.org/scripts/pattern1.txt new file mode 100644 index 000000000..0706a79a2 --- /dev/null +++ b/spider_chrome/url_patterns/domains/wikipedia.org/scripts/pattern1.txt @@ -0,0 +1,8 @@ +https://meta.wikimedia.org/w/index.php?title=MediaWiki:Wikiminiatlas.js&action=raw&ctype=text/javascript +https://login.wikimedia.org/wiki/Special:CentralAutoLogin/checkLoggedIn?useformat=desktop&wikiid=ptwiki&type=script&wikiid=ptwiki&type=script +.wikipedia.org/w/load.php?lang=pt&modules=ext.centralNotice.choiceData%2CgeoIP%2CstartUp%7Cext.centralauth.ForeignApi%2Ccentralautologin%7Cext.checkUser.clientHints%7Cext.cite.ux-enhancements%7Cext.cx.eventlogging.campaigns +.wikipedia.org/w/load.php?lang=pt&modules=startup&only=scripts&raw=1&skin=vector-2022 +.eventlogging.campaigns +%2CFeedbackHighlight%2 +.quicksurveys. +Special:CentralAutoLogin/start?type=script \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/wikipedia.org/styles/pattern1.txt b/spider_chrome/url_patterns/domains/wikipedia.org/styles/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/wikipedia.org/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/wikipedia.org/xhr/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/x.com/scripts/pattern1.txt b/spider_chrome/url_patterns/domains/x.com/scripts/pattern1.txt new file mode 100644 index 000000000..c605d5751 --- /dev/null +++ b/spider_chrome/url_patterns/domains/x.com/scripts/pattern1.txt @@ -0,0 +1,6 @@ +https://accounts.google.com/gsi/ +https://appleid.cdn-apple.com/appleauth/static/jsapi/appleid/1/en_US/appleid.auth.js +https://api.x.com/1.1/onboarding/sso_init.json +https://api.x.com/1.1/jot/client_event.json +https://api.x.com/1.1/jot/error_log.json +https://api.x.com/1.1/hashflags.json \ No newline at end of file diff --git a/spider_chrome/url_patterns/domains/x.com/styles/pattern1.txt b/spider_chrome/url_patterns/domains/x.com/styles/pattern1.txt new file mode 100644 index 000000000..e69de29bb diff --git a/spider_chrome/url_patterns/domains/x.com/xhr/pattern1.txt b/spider_chrome/url_patterns/domains/x.com/xhr/pattern1.txt new file mode 100644 index 000000000..e69de29bb