diff --git a/Cargo.lock b/Cargo.lock index 20524a770..4cfc702dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4273,7 +4273,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.21.12" +version = "2.21.13" dependencies = [ "ahash", "aho-corasick", @@ -4336,7 +4336,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.21.12" +version = "2.21.13" dependencies = [ "adblock", "base64 0.22.1", @@ -4373,7 +4373,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.21.12" +version = "2.21.13" dependencies = [ "clap", "env_logger", @@ -4398,7 +4398,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.21.12" +version = "2.21.13" dependencies = [ "aho-corasick", "fast_html2md", @@ -4420,7 +4420,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.21.12" +version = "2.21.13" dependencies = [ "indexmap 1.9.3", "serde", @@ -4432,7 +4432,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.21.12" +version = "2.21.13" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 69649665b..921306fac 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.21.12" +version = "2.21.13" authors = [ "j-mendez " ] diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 6d39ad87e..6caf2c285 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.21.12" +version = "2.21.13" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_chrome/src/handler/blockers/mod.rs b/spider_chrome/src/handler/blockers/mod.rs index ca0829cb6..44c8df052 100644 --- a/spider_chrome/src/handler/blockers/mod.rs +++ b/spider_chrome/src/handler/blockers/mod.rs @@ -20,6 +20,8 @@ pub mod scripts; pub mod tiktok_blockers; /// upwork blockers pub mod upwork_blockers; +/// wikipedia blockers +pub mod wikipedia_blockers; /// x 
blockers pub mod x_blockers; diff --git a/spider_chrome/src/handler/blockers/wikipedia_blockers.rs b/spider_chrome/src/handler/blockers/wikipedia_blockers.rs new file mode 100644 index 000000000..6ae471ea7 --- /dev/null +++ b/spider_chrome/src/handler/blockers/wikipedia_blockers.rs @@ -0,0 +1,30 @@ +use crate::handler::blockers::Trie; + +lazy_static::lazy_static! { + /// Ignore list of urls. + static ref URL_IGNORE_TRIE: Trie = { + let mut trie = Trie::new(); + let patterns = [ + "https://meta.wikimedia.org/w/index.php?title=MediaWiki:Wikiminiatlas.js&action=raw&ctype=text/javascript", + "https://login.wikimedia.org/wiki/Special:CentralAutoLogin/checkLoggedIn?useformat=desktop&wikiid=ptwiki&type=script&wikiid=ptwiki&type=script", + // NOTE(review): the entries below have no scheme/host prefix; confirm Trie::contains_prefix can match them against a full request URL. + ".wikipedia.org/w/load.php?lang=pt&modules=ext.centralNotice.choiceData%2CgeoIP%2CstartUp%7Cext.centralauth.ForeignApi%2Ccentralautologin%7Cext.checkUser.clientHints%7Cext.cite.ux-enhancements%7Cext.cx.eventlogging.campaigns", + ".wikipedia.org/w/load.php?lang=pt&modules=startup&only=scripts&raw=1&skin=vector-2022", + ".eventlogging.campaigns", + "%2CFeedbackHighlight%2", + ".quicksurveys.", + "Special:CentralAutoLogin/start?type=script", + ]; + for pattern in &patterns { + trie.insert(pattern); + } + trie + }; +} + +/// Block wikipedia events that are not required. +pub fn block_wikipedia( + event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, +) -> bool { + URL_IGNORE_TRIE.contains_prefix(&event.request.url) +} diff --git a/spider_chrome/src/handler/network.rs b/spider_chrome/src/handler/network.rs index 074b6ddfb..b03434953 100644 --- a/spider_chrome/src/handler/network.rs +++ b/spider_chrome/src/handler/network.rs @@ -26,8 +26,8 @@ use chromiumoxide_cdp::cdp::browser_protocol::{ use chromiumoxide_types::{Command, Method, MethodId}; use hashbrown::{HashMap, HashSet}; use lazy_static::lazy_static; -use 
std::collections::VecDeque; use std::time::Duration; +use std::collections::VecDeque; lazy_static! { /// allowed js frameworks and libs excluding some and adding additional URLs @@ -162,6 +162,8 @@ pub enum NetworkInterceptManager { Ebay, /// nytimes.com Nytimes, + /// wikipedia.org + Wikipedia, #[default] /// Unknown Unknown, @@ -169,7 +171,7 @@ pub enum NetworkInterceptManager { lazy_static! { /// Top tier list of the most common websites visited. - pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 20] = [ + pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 21] = [ ("https://www.tiktok.com", NetworkInterceptManager::TikTok), ("https://tiktok.com", NetworkInterceptManager::TikTok), ("https://www.amazon.", NetworkInterceptManager::Amazon), @@ -193,15 +195,32 @@ lazy_static! { ("https://ebay.", NetworkInterceptManager::Ebay), ("https://www.nytimes.com", NetworkInterceptManager::Nytimes), ("https://nytimes.com", NetworkInterceptManager::Nytimes), + ("wikipedia.org", NetworkInterceptManager::Wikipedia), ]; } +/// How a `TOP_TIER_LIST` pattern is compared against a url. +#[derive(Default, Debug, Clone, Hash, PartialEq, Eq)] +enum FindType { + #[default] + /// Starts with. + StartsWith, + /// Contains. + Contains, +} + impl NetworkInterceptManager { /// a custom intercept handle. pub fn new(url: &str) -> NetworkInterceptManager { TOP_TIER_LIST .iter() - .find(|&(pattern, _)| url.starts_with(pattern)) + .find(|&(pattern, nm)| { + if nm.get_pattern() == FindType::StartsWith { + url.starts_with(pattern) + } else { + url.contains(pattern) + } + }) .map(|&(_, manager_type)| manager_type) .unwrap_or(NetworkInterceptManager::Unknown) } @@ -209,6 +228,14 @@ impl NetworkInterceptManager { pub fn setup(&mut self, url: &str) -> Self { NetworkInterceptManager::new(url) } + + /// determine the pattern to use. 
+ fn get_pattern(&self) -> FindType { + match self { + NetworkInterceptManager::Wikipedia => FindType::Contains, + _ => FindType::StartsWith, + } + } } #[derive(Debug)] @@ -564,6 +591,9 @@ impl NetworkManager { NetworkInterceptManager::Ebay => { super::blockers::ebay_blockers::block_ebay(event) } + NetworkInterceptManager::Wikipedia => { + super::blockers::wikipedia_blockers::block_wikipedia(event) + } NetworkInterceptManager::Nytimes => { super::blockers::nytimes_blockers::block_nytimes( event, @@ -704,6 +734,9 @@ impl NetworkManager { NetworkInterceptManager::Ebay => { super::blockers::ebay_blockers::block_ebay(event) } + NetworkInterceptManager::Wikipedia => { + super::blockers::wikipedia_blockers::block_wikipedia(event) + } NetworkInterceptManager::Nytimes => { super::blockers::nytimes_blockers::block_nytimes( event, diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 7df85749c..1a77160dc 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.21.12" +version = "2.21.13" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index c76d9f4a3..8ecde0a32 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.21.12" +version = "2.21.13" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 220fb6f51..228a200ab 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.21.12" +version = "2.21.13" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 2a81a24ed..6a3315491 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.21.12" +version = "2.21.13" authors = [ "j-mendez " ]