chore(chrome): add to block list
j-mendez committed Dec 12, 2024
1 parent 4df8e5b commit c2dbd96
Showing 18 changed files with 145 additions and 111 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default.

65 changes: 38 additions & 27 deletions examples/real_world.rs
@@ -1,7 +1,8 @@
-//! cargo run --example real_world --features="chrome chrome_intercept spider_utils/transformations"
+//! cargo run --example real_world --features="chrome chrome_intercept real_browser spider_utils/transformations"
extern crate spider;
use crate::spider::tokio::io::AsyncWriteExt;
+use spider::configuration::{WaitForDelay, WaitForSelector};
use spider::tokio;
use spider::website::Website;
use spider::{
@@ -14,12 +15,18 @@ async fn crawl_website(url: &str) -> Result<()> {
let mut stdout = tokio::io::stdout();

let mut website: Website = Website::new(url)
-        .with_limit(1)
+        .with_limit(5)
.with_chrome_intercept(RequestInterceptConfiguration::new(true))
.with_wait_for_idle_network(Some(WaitForIdleNetwork::new(Some(Duration::from_millis(
-            200,
+            500,
)))))
+        .with_subdomains(true)
+        .with_wait_for_idle_dom(Some(WaitForSelector::new(
+            Some(Duration::from_millis(100)),
+            "body".into(),
+        )))
+        .with_block_assets(true)
+        // .with_wait_for_delay(Some(WaitForDelay::new(Some(Duration::from_millis(10000)))))
.with_stealth(true)
.with_return_page_links(true)
.with_fingerprint(true)
@@ -30,36 +37,40 @@

let mut rx2 = website.subscribe(16).unwrap();

-    tokio::spawn(async move {
-        while let Ok(page) = rx2.recv().await {
-            let _ = stdout
-                .write_all(
-                    format!(
-                        "- {} -- Bytes transferred {:?} -- HTML Size {:?} -- Links: {:?}\n",
-                        page.get_url(),
-                        page.bytes_transferred.unwrap_or_default(),
-                        page.get_html_bytes_u8().len(),
-                        match page.page_links {
-                            Some(ref l) => l.len(),
-                            _ => 0,
-                        }
-                    )
-                    .as_bytes(),
-                )
-                .await;
-        }
-    });
-
-    let start = crate::tokio::time::Instant::now();
-    website.crawl().await;
+    let start = crate::tokio::time::Instant::now();
+
+    let (links, _) = tokio::join!(
+        async move {
+            website.crawl().await;
+            website.unsubscribe();
+            website.get_links()
+        },
+        async move {
+            while let Ok(page) = rx2.recv().await {
+                let _ = stdout
+                    .write_all(
+                        format!(
+                            "- {} -- Bytes transferred {:?} -- HTML Size {:?} -- Links: {:?}\n",
+                            page.get_url(),
+                            page.bytes_transferred.unwrap_or_default(),
+                            page.get_html_bytes_u8().len(),
+                            match page.page_links {
+                                Some(ref l) => l.len(),
+                                _ => 0,
+                            }
+                        )
+                        .as_bytes(),
+                    )
+                    .await;
+            }
+        }
+    );

let duration = start.elapsed();

-    let links = website.get_links();

println!(
"Time elapsed in website.crawl({}) is: {:?} for total pages: {:?}",
-        website.get_url(),
+        url,
duration,
links.len()
);
@@ -71,7 +82,7 @@ async fn crawl_website(url: &str) -> Result<()> {
async fn main() -> Result<()> {
env_logger::init();
let _ = tokio::join!(
crawl_website("https://choosealicense.com"),
crawl_website("https://www.choosealicense.com"),
crawl_website("https://jeffmendez.com"),
crawl_website("https://example.com"),
);
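Note on the example rewrite above: replacing the detached tokio::spawn with tokio::join! ties the subscriber's lifetime to the crawl. The crawl arm calls website.unsubscribe() when it finishes, which closes the subscription channel, so the receive loop in the second arm ends and the join resolves with the collected links. A minimal sketch of the same pattern, assuming only the subscribe/unsubscribe API shown in this diff:

    // Sketch only: drain a crawl subscription concurrently with the crawl.
    let mut rx = website.subscribe(16).unwrap();

    let (links, _) = tokio::join!(
        async move {
            website.crawl().await;
            website.unsubscribe(); // closes the channel so the loop below exits
            website.get_links()
        },
        async move {
            while let Ok(page) = rx.recv().await {
                println!("{}", page.get_url());
            }
        },
    );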
2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.21.16"
version = "2.21.17"
authors = [
"j-mendez <[email protected]>"
]
4 changes: 2 additions & 2 deletions spider/src/configuration.rs
@@ -853,7 +853,7 @@ impl Configuration {
pub fn with_chrome_intercept(
&mut self,
chrome_intercept: RequestInterceptConfiguration,
url: &str,
url: &Option<Box<url::Url>>,
) -> &mut Self {
self.chrome_intercept = chrome_intercept;
self.chrome_intercept.setup_intercept_manager(url);
@@ -865,7 +865,7 @@
pub fn with_chrome_intercept(
&mut self,
_chrome_intercept: RequestInterceptConfiguration,
_url: &str,
_url: &Option<Box<url::Url>>,
) -> &mut Self {
self
}
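The signature change above means callers of Configuration::with_chrome_intercept now hand over a pre-parsed URL instead of a raw &str. A hedged sketch of a caller after this commit (the variable names are illustrative, not from the codebase):

    // Sketch only: parse once, then pass the Option<Box<url::Url>> along.
    let parsed = url::Url::parse("https://example.com").ok().map(Box::new);
    configuration.with_chrome_intercept(RequestInterceptConfiguration::new(true), &parsed);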
13 changes: 6 additions & 7 deletions spider/src/features/chrome_common.rs
@@ -16,13 +16,9 @@ pub enum NetworkInterceptManager {
#[cfg(not(feature = "chrome"))]
impl NetworkInterceptManager {
/// a custom intercept handle.
pub fn new(_url: &str) -> NetworkInterceptManager {
pub fn new(_url: &Option<Box<url::Url>>) -> NetworkInterceptManager {
NetworkInterceptManager::Unknown
}
-    /// Setup the intercept handle
-    pub fn setup(&mut self, url: &str) -> Self {
-        NetworkInterceptManager::new(url)
-    }
}

#[derive(Debug, Default, Clone, PartialEq)]
@@ -682,7 +678,10 @@ impl RequestInterceptConfiguration {
}
}
/// Setup a new intercept config with a custom intercept manager.
-    pub fn new_manager(enabled: bool, url: &str) -> RequestInterceptConfiguration {
+    pub fn new_manager(
+        enabled: bool,
+        url: &Option<Box<url::Url>>,
+    ) -> RequestInterceptConfiguration {
RequestInterceptConfiguration {
enabled,
block_javascript: false,
Expand All @@ -695,7 +694,7 @@ impl RequestInterceptConfiguration {
}

/// Setup the network request manager type.
pub fn setup_intercept_manager(&mut self, url: &str) {
pub fn setup_intercept_manager(&mut self, url: &Option<Box<url::Url>>) {
self.intercept_manager = NetworkInterceptManager::new(url);
}

1 change: 1 addition & 0 deletions spider/src/page.rs
@@ -229,6 +229,7 @@ pub fn push_link<A: PartialEq + Eq + std::hash::Hash + From<String>>(
) {
if let Some(b) = base {
let mut abs = convert_abs_path(b, href);
+
let new_page = abs != **b;

if let Some(link_map) = links_pages {
2 changes: 1 addition & 1 deletion spider/src/website.rs
@@ -4046,7 +4046,7 @@ impl Website {
chrome_intercept: RequestInterceptConfiguration,
) -> &mut Self {
self.configuration
.with_chrome_intercept(chrome_intercept, &self.url);
.with_chrome_intercept(chrome_intercept, &self.domain_parsed);
self
}

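At the Website level the public builder is unchanged; only the argument forwarded internally moves from the raw url string to the already-parsed domain_parsed field. A sketch of the unchanged caller side, matching the example at the top of this commit:

    // Sketch only: the parsed domain is supplied internally by Website.
    let mut website = Website::new("https://tcgplayer.com");
    website.with_chrome_intercept(RequestInterceptConfiguration::new(true));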
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.21.16"
version = "2.21.17"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
100 changes: 39 additions & 61 deletions spider_chrome/src/handler/blockers/intercept_manager.rs
@@ -1,3 +1,5 @@
+use phf::phf_map;
+
/// Custom network intercept types to expect on a domain
#[derive(Debug, Default, Clone, Copy, serde::Serialize, serde::Deserialize, PartialEq)]
pub enum NetworkInterceptManager {
@@ -25,76 +27,52 @@ pub enum NetworkInterceptManager {
Nytimes,
/// wikipedia.com
Wikipedia,
+    /// tcgplayer.com
+    Tcgplayer,
#[default]
/// Unknown
Unknown,
}

-lazy_static::lazy_static! {
-    /// Top tier list of the most common websites visited.
-    pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 21] = [
-        ("https://www.tiktok.com", NetworkInterceptManager::TikTok),
-        ("https://tiktok.com", NetworkInterceptManager::TikTok),
-        ("https://www.amazon.", NetworkInterceptManager::Amazon),
-        ("https://amazon.", NetworkInterceptManager::Amazon),
-        ("https://www.x.com", NetworkInterceptManager::X),
-        ("https://x.com", NetworkInterceptManager::X),
-        ("https://www.netflix.com", NetworkInterceptManager::Netflix),
-        ("https://netflix.com", NetworkInterceptManager::Netflix),
-        (
-            "https://www.linkedin.com",
-            NetworkInterceptManager::LinkedIn
-        ),
-        ("https://linkedin.com", NetworkInterceptManager::LinkedIn),
-        ("https://www.upwork.com", NetworkInterceptManager::Upwork),
-        ("https://upwork.com", NetworkInterceptManager::Upwork),
-        ("https://www.glassdoor.", NetworkInterceptManager::Glassdoor),
-        ("https://glassdoor.", NetworkInterceptManager::Glassdoor),
-        ("https://www.medium.com", NetworkInterceptManager::Medium),
-        ("https://medium.com", NetworkInterceptManager::Medium),
-        ("https://www.ebay.", NetworkInterceptManager::Ebay),
-        ("https://ebay.", NetworkInterceptManager::Ebay),
-        ("https://www.nytimes.com", NetworkInterceptManager::Nytimes),
-        ("https://nytimes.com", NetworkInterceptManager::Nytimes),
-        ("wikipedia.org", NetworkInterceptManager::Wikipedia),
-    ];
-}
-
-/// The find type is own.
-#[derive(Default, Debug, Clone, Hash, PartialEq, Eq)]
-enum FindType {
-    #[default]
-    /// Starts with.
-    StartsWith,
-    /// Contains.
-    Contains,
-}
+// A constant map using `phf` that maps domains to their respective intercept manager
+static DOMAIN_MAP: phf::Map<&'static str, NetworkInterceptManager> = phf_map! {
+    "tiktok.com" => NetworkInterceptManager::TikTok,
+    "facebook.com" => NetworkInterceptManager::Facebook,
+    "amazon.com" => NetworkInterceptManager::Amazon,
+    "x.com" => NetworkInterceptManager::X,
+    "linkedin.com" => NetworkInterceptManager::LinkedIn,
+    "netflix.com" => NetworkInterceptManager::Netflix,
+    "medium.com" => NetworkInterceptManager::Medium,
+    "upwork.com" => NetworkInterceptManager::Upwork,
+    "glassdoor.com" => NetworkInterceptManager::Glassdoor,
+    "ebay.com" => NetworkInterceptManager::Ebay,
+    "nytimes.com" => NetworkInterceptManager::Nytimes,
+    "wikipedia.org" => NetworkInterceptManager::Wikipedia,
+    "tcgplayer.com" => NetworkInterceptManager::Tcgplayer,
+};

 impl NetworkInterceptManager {
     /// a custom intercept handle.
-    pub fn new(url: &str) -> NetworkInterceptManager {
-        TOP_TIER_LIST
-            .iter()
-            .find(|&(pattern, nm)| {
-                if nm.get_pattern() == FindType::StartsWith {
-                    url.starts_with(pattern)
-                } else {
-                    url.contains(pattern)
-                }
-            })
-            .map(|&(_, manager_type)| manager_type)
-            .unwrap_or(NetworkInterceptManager::Unknown)
-    }
-    /// Setup the intercept handle
-    pub fn setup(&mut self, url: &str) -> Self {
-        NetworkInterceptManager::new(url)
-    }
-
-    /// determine the pattern to use.
-    fn get_pattern(&self) -> FindType {
-        match self {
-            NetworkInterceptManager::Wikipedia => FindType::Contains,
-            _ => FindType::StartsWith,
-        }
-    }
+    pub fn new(url: &Option<Box<url::Url>>) -> NetworkInterceptManager {
+        if let Some(parsed_url) = url {
+            if let Some(domain) = parsed_url.domain() {
+                // list of top websites should at most two - can always do a second pass.
+                let domain_parts: Vec<&str> = domain.split('.').collect();
+
+                let base_domain = if domain_parts.len() > 2 {
+                    format!(
+                        "{}.{}",
+                        domain_parts[domain_parts.len() - 2],
+                        domain_parts[domain_parts.len() - 1]
+                    )
+                } else {
+                    domain.to_string()
+                };
+
+                return *DOMAIN_MAP
+                    .get(&base_domain)
+                    .unwrap_or(&NetworkInterceptManager::Unknown);
+            }
+        }
+        NetworkInterceptManager::Unknown
+    }
 }
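The rewrite above swaps the lazy_static array scan (prefix/contains matching on full URL strings) for a compile-time phf map keyed by base domain: the host is collapsed to its last two labels before the lookup, which is why a single "tcgplayer.com" entry now covers shop.tcgplayer.com and other subdomains. As the inline comment concedes, two labels is a heuristic; multi-label suffixes such as co.uk would need a second pass. A sketch of the expected behavior, assuming the map and derives shown above:

    // Sketch only: subdomains collapse to their last two labels before lookup.
    let url = url::Url::parse("https://shop.tcgplayer.com/magic").ok().map(Box::new);
    assert_eq!(
        NetworkInterceptManager::new(&url),
        NetworkInterceptManager::Tcgplayer
    );

    // Hosts without a domain (e.g. IP addresses) or unlisted domains fall back to Unknown.
    let ip = url::Url::parse("http://127.0.0.1:8080/").ok().map(Box::new);
    assert_eq!(NetworkInterceptManager::new(&ip), NetworkInterceptManager::Unknown);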
3 changes: 3 additions & 0 deletions spider_chrome/src/handler/blockers/mod.rs
@@ -18,6 +18,8 @@ pub mod netflix_blockers;
pub mod nytimes_blockers;
/// script blockers
pub mod scripts;
/// block tcgplayer.com
pub mod tcgplayer_blockers;
/// tiktok blockers
pub mod tiktok_blockers;
/// upwork blockers
@@ -26,6 +28,7 @@ pub mod upwork_blockers;
pub mod wikipedia_blockers;
/// x blockers
pub mod x_blockers;

+
pub mod xhr;


