Skip to content

Commit

Permalink
chore(chrome): add to block list
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 11, 2024
1 parent 5ac0a1f commit 75f3022
Show file tree
Hide file tree
Showing 10 changed files with 80 additions and 15 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.21.12"
version = "2.21.13"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.21.12"
version = "2.21.13"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 2 additions & 0 deletions spider_chrome/src/handler/blockers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ pub mod scripts;
pub mod tiktok_blockers;
/// upwork blockers
pub mod upwork_blockers;
/// wikipedia blockers
pub mod wikipedia_blockers;
/// x blockers
pub mod x_blockers;

Expand Down
30 changes: 30 additions & 0 deletions spider_chrome/src/handler/blockers/wikipedia_blockers.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
use crate::handler::blockers::Trie;

lazy_static::lazy_static! {
    /// Trie of URL patterns for Wikipedia/Wikimedia requests that should be blocked.
    ///
    /// Initialized lazily on first access and queried by `block_wikipedia`
    /// through `Trie::contains_prefix`.
    static ref URL_IGNORE_TRIE: Trie = {
        let mut trie = Trie::new();
        // NOTE(review): several entries below are URL fragments (they start with
        // `.`, `%2C` or a path segment) rather than full URLs. The lookup uses
        // `contains_prefix`; confirm that `Trie` matches these somewhere inside
        // the request URL, otherwise only the full `https://` entries can match.
        let patterns = [
            "https://meta.wikimedia.org/w/index.php?title=MediaWiki:Wikiminiatlas.js&action=raw&ctype=text/javascript",
            "https://login.wikimedia.org/wiki/Special:CentralAutoLogin/checkLoggedIn?useformat=desktop&wikiid=ptwiki&type=script&wikiid=ptwiki&type=script",
            ".wikipedia.org/w/load.php?lang=pt&modules=ext.centralNotice.choiceData%2CgeoIP%2CstartUp%7Cext.centralauth.ForeignApi%2Ccentralautologin%7Cext.checkUser.clientHints%7Cext.cite.ux-enhancements%7Cext.cx.eventlogging.campaigns",
            ".wikipedia.org/w/load.php?lang=pt&modules=startup&only=scripts&raw=1&skin=vector-2022",
            ".eventlogging.campaigns",
            "%2CFeedbackHighlight%2",
            ".quicksurveys.",
            "Special:CentralAutoLogin/start?type=script",
        ];
        for pattern in &patterns {
            trie.insert(pattern);
        }
        trie
    };
}

/// Decide whether a paused Wikipedia request should be blocked.
///
/// Returns the result of looking up the request URL in `URL_IGNORE_TRIE`
/// with `contains_prefix`; `true` means the request is on the ignore list.
pub fn block_wikipedia(
    event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused,
) -> bool {
    let requested_url = &event.request.url;

    URL_IGNORE_TRIE.contains_prefix(requested_url)
}
39 changes: 36 additions & 3 deletions spider_chrome/src/handler/network.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ use chromiumoxide_cdp::cdp::browser_protocol::{
use chromiumoxide_types::{Command, Method, MethodId};
use hashbrown::{HashMap, HashSet};
use lazy_static::lazy_static;
use std::collections::VecDeque;
use std::time::Duration;
use std::{collections::VecDeque, default};

lazy_static! {
/// allowed js frameworks and libs excluding some and adding additional URLs
Expand Down Expand Up @@ -162,14 +162,16 @@ pub enum NetworkInterceptManager {
Ebay,
/// nytimes.com
Nytimes,
/// wikipedia.org
Wikipedia,
#[default]
/// Unknown
Unknown,
}

lazy_static! {
/// Top tier list of the most common websites visited.
pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 20] = [
pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 21] = [
("https://www.tiktok.com", NetworkInterceptManager::TikTok),
("https://tiktok.com", NetworkInterceptManager::TikTok),
("https://www.amazon.", NetworkInterceptManager::Amazon),
Expand All @@ -193,22 +195,47 @@ lazy_static! {
("https://ebay.", NetworkInterceptManager::Ebay),
("https://www.nytimes.com", NetworkInterceptManager::Nytimes),
("https://nytimes.com", NetworkInterceptManager::Nytimes),
("wikipedia.org", NetworkInterceptManager::Wikipedia),
];
}

/// How a site pattern is compared against a URL when selecting an intercept manager.
#[derive(Default, Debug, Clone, Copy, Hash, PartialEq, Eq)]
enum FindType {
    #[default]
    /// The URL must start with the pattern.
    StartsWith,
    /// The pattern may appear anywhere in the URL.
    Contains,
}

impl NetworkInterceptManager {
/// Select the intercept manager for `url`.
///
/// Scans `TOP_TIER_LIST` for the first entry whose pattern matches `url`
/// and returns the manager paired with it, or `Unknown` when no entry matches.
pub fn new(url: &str) -> NetworkInterceptManager {
TOP_TIER_LIST
.iter()
// NOTE(review): this `starts_with`-only predicate looks like the pre-change
// line carried over from the diff view; the `.find` below appears to be its
// replacement. Confirm that only one of the two should remain.
.find(|&(pattern, _)| url.starts_with(pattern))
// Each manager decides how its pattern is compared: prefix match or substring match.
.find(|&(pattern, nm)| {
if nm.get_pattern() == FindType::StartsWith {
url.starts_with(pattern)
} else {
url.contains(pattern)
}
})
.map(|&(_, manager_type)| manager_type)
.unwrap_or(NetworkInterceptManager::Unknown)
}
/// Return the intercept manager for `url`.
///
/// Delegates to `new`; `self` is not modified, the caller receives the new value.
pub fn setup(&mut self, url: &str) -> Self {
NetworkInterceptManager::new(url)
}

/// Determine how this manager's pattern is matched against a URL.
///
/// `Wikipedia` uses `FindType::Contains`; every other variant uses
/// `FindType::StartsWith`.
fn get_pattern(&self) -> FindType {
match self {
NetworkInterceptManager::Wikipedia => FindType::Contains,
_ => FindType::StartsWith,
}
}
}

#[derive(Debug)]
Expand Down Expand Up @@ -564,6 +591,9 @@ impl NetworkManager {
NetworkInterceptManager::Ebay => {
super::blockers::ebay_blockers::block_ebay(event)
}
NetworkInterceptManager::Wikipedia => {
super::blockers::wikipedia_blockers::block_wikipedia(event)
}
NetworkInterceptManager::Nytimes => {
super::blockers::nytimes_blockers::block_nytimes(
event,
Expand Down Expand Up @@ -704,6 +734,9 @@ impl NetworkManager {
NetworkInterceptManager::Ebay => {
super::blockers::ebay_blockers::block_ebay(event)
}
NetworkInterceptManager::Wikipedia => {
super::blockers::wikipedia_blockers::block_wikipedia(event)
}
NetworkInterceptManager::Nytimes => {
super::blockers::nytimes_blockers::block_nytimes(
event,
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.21.12"
version = "2.21.13"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.21.12"
version = "2.21.13"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.21.12"
version = "2.21.13"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.21.12"
version = "2.21.13"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 75f3022

Please sign in to comment.