chore(chrome): add to block list
j-mendez committed Dec 12, 2024
1 parent 4df8e5b commit c2dbd96
Showing 18 changed files with 145 additions and 111 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default.

65 changes: 38 additions & 27 deletions examples/real_world.rs
@@ -1,7 +1,8 @@
-//! cargo run --example real_world --features="chrome chrome_intercept spider_utils/transformations"
+//! cargo run --example real_world --features="chrome chrome_intercept real_browser spider_utils/transformations"
extern crate spider;
use crate::spider::tokio::io::AsyncWriteExt;
+use spider::configuration::{WaitForDelay, WaitForSelector};
use spider::tokio;
use spider::website::Website;
use spider::{
@@ -14,12 +15,18 @@ async fn crawl_website(url: &str) -> Result<()> {
let mut stdout = tokio::io::stdout();

let mut website: Website = Website::new(url)
-        .with_limit(1)
+        .with_limit(5)
.with_chrome_intercept(RequestInterceptConfiguration::new(true))
.with_wait_for_idle_network(Some(WaitForIdleNetwork::new(Some(Duration::from_millis(
-            200,
+            500,
)))))
+        .with_subdomains(true)
+        .with_wait_for_idle_dom(Some(WaitForSelector::new(
+            Some(Duration::from_millis(100)),
+            "body".into(),
+        )))
+        .with_block_assets(true)
+        // .with_wait_for_delay(Some(WaitForDelay::new(Some(Duration::from_millis(10000)))))
.with_stealth(true)
.with_return_page_links(true)
.with_fingerprint(true)
@@ -30,36 +37,40 @@

let mut rx2 = website.subscribe(16).unwrap();

-    tokio::spawn(async move {
-        while let Ok(page) = rx2.recv().await {
-            let _ = stdout
-                .write_all(
-                    format!(
-                        "- {} -- Bytes transferred {:?} -- HTML Size {:?} -- Links: {:?}\n",
-                        page.get_url(),
-                        page.bytes_transferred.unwrap_or_default(),
-                        page.get_html_bytes_u8().len(),
-                        match page.page_links {
-                            Some(ref l) => l.len(),
-                            _ => 0,
-                        }
-                    )
-                    .as_bytes(),
-                )
-                .await;
-        }
-    });
-
-    let start = crate::tokio::time::Instant::now();
-    website.crawl().await;
+    let start = crate::tokio::time::Instant::now();
+
+    let (links, _) = tokio::join!(
+        async move {
+            website.crawl().await;
+            website.unsubscribe();
+            website.get_links()
+        },
+        async move {
+            while let Ok(page) = rx2.recv().await {
+                let _ = stdout
+                    .write_all(
+                        format!(
+                            "- {} -- Bytes transferred {:?} -- HTML Size {:?} -- Links: {:?}\n",
+                            page.get_url(),
+                            page.bytes_transferred.unwrap_or_default(),
+                            page.get_html_bytes_u8().len(),
+                            match page.page_links {
+                                Some(ref l) => l.len(),
+                                _ => 0,
+                            }
+                        )
+                        .as_bytes(),
+                    )
+                    .await;
+            }
+        }
+    );

let duration = start.elapsed();

-    let links = website.get_links();

println!(
"Time elapsed in website.crawl({}) is: {:?} for total pages: {:?}",
-        website.get_url(),
+        url,
duration,
links.len()
);
@@ -71,7 +82,7 @@ async fn crawl_website(url: &str) -> Result<()> {
async fn main() -> Result<()> {
env_logger::init();
let _ = tokio::join!(
crawl_website("https://choosealicense.com"),
crawl_website("https://www.choosealicense.com"),
crawl_website("https://jeffmendez.com"),
crawl_website("https://example.com"),
);
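Note on the example rewrite above: replacing the detached tokio::spawn with tokio::join! ties the subscriber's lifetime to the crawl. The crawl arm calls website.unsubscribe() when it finishes, which closes the subscription channel, so the receive loop in the second arm ends and the join resolves with the collected links. A minimal sketch of the same pattern, assuming only the subscribe/unsubscribe API shown in this diff:

    // Sketch only: drain a crawl subscription concurrently with the crawl.
    let mut rx = website.subscribe(16).unwrap();

    let (links, _) = tokio::join!(
        async move {
            website.crawl().await;
            website.unsubscribe(); // closes the channel so the loop below exits
            website.get_links()
        },
        async move {
            while let Ok(page) = rx.recv().await {
                println!("{}", page.get_url());
            }
        },
    );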
2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.21.16"
version = "2.21.17"
authors = [
"j-mendez <[email protected]>"
]
4 changes: 2 additions & 2 deletions spider/src/configuration.rs
@@ -853,7 +853,7 @@ impl Configuration {
pub fn with_chrome_intercept(
&mut self,
chrome_intercept: RequestInterceptConfiguration,
url: &str,
url: &Option<Box<url::Url>>,
) -> &mut Self {
self.chrome_intercept = chrome_intercept;
self.chrome_intercept.setup_intercept_manager(url);
@@ -865,7 +865,7 @@
pub fn with_chrome_intercept(
&mut self,
_chrome_intercept: RequestInterceptConfiguration,
_url: &str,
_url: &Option<Box<url::Url>>,
) -> &mut Self {
self
}
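The signature change above means callers of Configuration::with_chrome_intercept now hand over a pre-parsed URL instead of a raw &str. A hedged sketch of a caller after this commit (the variable names are illustrative, not from the codebase):

    // Sketch only: parse once, then pass the Option<Box<url::Url>> along.
    let parsed = url::Url::parse("https://example.com").ok().map(Box::new);
    configuration.with_chrome_intercept(RequestInterceptConfiguration::new(true), &parsed);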
13 changes: 6 additions & 7 deletions spider/src/features/chrome_common.rs
@@ -16,13 +16,9 @@ pub enum NetworkInterceptManager {
#[cfg(not(feature = "chrome"))]
impl NetworkInterceptManager {
/// a custom intercept handle.
pub fn new(_url: &str) -> NetworkInterceptManager {
pub fn new(_url: &Option<Box<url::Url>>) -> NetworkInterceptManager {
NetworkInterceptManager::Unknown
}
-    /// Setup the intercept handle
-    pub fn setup(&mut self, url: &str) -> Self {
-        NetworkInterceptManager::new(url)
-    }
}

#[derive(Debug, Default, Clone, PartialEq)]
@@ -682,7 +678,10 @@ impl RequestInterceptConfiguration {
}
}
/// Setup a new intercept config with a custom intercept manager.
-    pub fn new_manager(enabled: bool, url: &str) -> RequestInterceptConfiguration {
+    pub fn new_manager(
+        enabled: bool,
+        url: &Option<Box<url::Url>>,
+    ) -> RequestInterceptConfiguration {
RequestInterceptConfiguration {
enabled,
block_javascript: false,
Expand All @@ -695,7 +694,7 @@ impl RequestInterceptConfiguration {
}

/// Setup the network request manager type.
pub fn setup_intercept_manager(&mut self, url: &str) {
pub fn setup_intercept_manager(&mut self, url: &Option<Box<url::Url>>) {
self.intercept_manager = NetworkInterceptManager::new(url);
}

1 change: 1 addition & 0 deletions spider/src/page.rs
@@ -229,6 +229,7 @@ pub fn push_link<A: PartialEq + Eq + std::hash::Hash + From<String>>(
) {
if let Some(b) = base {
let mut abs = convert_abs_path(b, href);
+
let new_page = abs != **b;

if let Some(link_map) = links_pages {
2 changes: 1 addition & 1 deletion spider/src/website.rs
@@ -4046,7 +4046,7 @@ impl Website {
chrome_intercept: RequestInterceptConfiguration,
) -> &mut Self {
self.configuration
.with_chrome_intercept(chrome_intercept, &self.url);
.with_chrome_intercept(chrome_intercept, &self.domain_parsed);
self
}

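At the Website level the public builder is unchanged; only the argument forwarded internally moves from the raw url string to the already-parsed domain_parsed field. A sketch of the unchanged caller side, matching the example at the top of this commit:

    // Sketch only: the parsed domain is supplied internally by Website.
    let mut website = Website::new("https://tcgplayer.com");
    website.with_chrome_intercept(RequestInterceptConfiguration::new(true));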
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.21.16"
version = "2.21.17"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
100 changes: 39 additions & 61 deletions spider_chrome/src/handler/blockers/intercept_manager.rs
@@ -1,3 +1,5 @@
+use phf::phf_map;
+
/// Custom network intercept types to expect on a domain
#[derive(Debug, Default, Clone, Copy, serde::Serialize, serde::Deserialize, PartialEq)]
pub enum NetworkInterceptManager {
@@ -25,76 +27,52 @@ pub enum NetworkInterceptManager {
Nytimes,
/// wikipedia.com
Wikipedia,
+    /// tcgplayer.com
+    Tcgplayer,
#[default]
/// Unknown
Unknown,
}

-lazy_static::lazy_static! {
-    /// Top tier list of the most common websites visited.
-    pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 21] = [
-        ("https://www.tiktok.com", NetworkInterceptManager::TikTok),
-        ("https://tiktok.com", NetworkInterceptManager::TikTok),
-        ("https://www.amazon.", NetworkInterceptManager::Amazon),
-        ("https://amazon.", NetworkInterceptManager::Amazon),
-        ("https://www.x.com", NetworkInterceptManager::X),
-        ("https://x.com", NetworkInterceptManager::X),
-        ("https://www.netflix.com", NetworkInterceptManager::Netflix),
-        ("https://netflix.com", NetworkInterceptManager::Netflix),
-        (
-            "https://www.linkedin.com",
-            NetworkInterceptManager::LinkedIn
-        ),
-        ("https://linkedin.com", NetworkInterceptManager::LinkedIn),
-        ("https://www.upwork.com", NetworkInterceptManager::Upwork),
-        ("https://upwork.com", NetworkInterceptManager::Upwork),
-        ("https://www.glassdoor.", NetworkInterceptManager::Glassdoor),
-        ("https://glassdoor.", NetworkInterceptManager::Glassdoor),
-        ("https://www.medium.com", NetworkInterceptManager::Medium),
-        ("https://medium.com", NetworkInterceptManager::Medium),
-        ("https://www.ebay.", NetworkInterceptManager::Ebay),
-        ("https://ebay.", NetworkInterceptManager::Ebay),
-        ("https://www.nytimes.com", NetworkInterceptManager::Nytimes),
-        ("https://nytimes.com", NetworkInterceptManager::Nytimes),
-        ("wikipedia.org", NetworkInterceptManager::Wikipedia),
-    ];
-}
-
-/// The find type is own.
-#[derive(Default, Debug, Clone, Hash, PartialEq, Eq)]
-enum FindType {
-    #[default]
-    /// Starts with.
-    StartsWith,
-    /// Contains.
-    Contains,
-}
+// A constant map using `phf` that maps domains to their respective intercept manager
+static DOMAIN_MAP: phf::Map<&'static str, NetworkInterceptManager> = phf_map! {
+    "tiktok.com" => NetworkInterceptManager::TikTok,
+    "facebook.com" => NetworkInterceptManager::Facebook,
+    "amazon.com" => NetworkInterceptManager::Amazon,
+    "x.com" => NetworkInterceptManager::X,
+    "linkedin.com" => NetworkInterceptManager::LinkedIn,
+    "netflix.com" => NetworkInterceptManager::Netflix,
+    "medium.com" => NetworkInterceptManager::Medium,
+    "upwork.com" => NetworkInterceptManager::Upwork,
+    "glassdoor.com" => NetworkInterceptManager::Glassdoor,
+    "ebay.com" => NetworkInterceptManager::Ebay,
+    "nytimes.com" => NetworkInterceptManager::Nytimes,
+    "wikipedia.org" => NetworkInterceptManager::Wikipedia,
+    "tcgplayer.com" => NetworkInterceptManager::Tcgplayer,
+};

 impl NetworkInterceptManager {
     /// a custom intercept handle.
-    pub fn new(url: &str) -> NetworkInterceptManager {
-        TOP_TIER_LIST
-            .iter()
-            .find(|&(pattern, nm)| {
-                if nm.get_pattern() == FindType::StartsWith {
-                    url.starts_with(pattern)
-                } else {
-                    url.contains(pattern)
-                }
-            })
-            .map(|&(_, manager_type)| manager_type)
-            .unwrap_or(NetworkInterceptManager::Unknown)
-    }
-    /// Setup the intercept handle
-    pub fn setup(&mut self, url: &str) -> Self {
-        NetworkInterceptManager::new(url)
-    }
-
-    /// determine the pattern to use.
-    fn get_pattern(&self) -> FindType {
-        match self {
-            NetworkInterceptManager::Wikipedia => FindType::Contains,
-            _ => FindType::StartsWith,
-        }
-    }
+    pub fn new(url: &Option<Box<url::Url>>) -> NetworkInterceptManager {
+        if let Some(parsed_url) = url {
+            if let Some(domain) = parsed_url.domain() {
+                // list of top websites should at most two - can always do a second pass.
+                let domain_parts: Vec<&str> = domain.split('.').collect();
+
+                let base_domain = if domain_parts.len() > 2 {
+                    format!(
+                        "{}.{}",
+                        domain_parts[domain_parts.len() - 2],
+                        domain_parts[domain_parts.len() - 1]
+                    )
+                } else {
+                    domain.to_string()
+                };
+
+                return *DOMAIN_MAP
+                    .get(&base_domain)
+                    .unwrap_or(&NetworkInterceptManager::Unknown);
+            }
+        }
+        NetworkInterceptManager::Unknown
+    }
 }
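The rewrite above swaps the lazy_static array scan (prefix/contains matching on full URL strings) for a compile-time phf map keyed by base domain: the host is collapsed to its last two labels before the lookup, which is why a single "tcgplayer.com" entry now covers shop.tcgplayer.com and other subdomains. As the inline comment concedes, two labels is a heuristic; multi-label suffixes such as co.uk would need a second pass. A sketch of the expected behavior, assuming the map and derives shown above:

    // Sketch only: subdomains collapse to their last two labels before lookup.
    let url = url::Url::parse("https://shop.tcgplayer.com/magic").ok().map(Box::new);
    assert_eq!(
        NetworkInterceptManager::new(&url),
        NetworkInterceptManager::Tcgplayer
    );

    // Hosts without a domain (e.g. IP addresses) or unlisted domains fall back to Unknown.
    let ip = url::Url::parse("http://127.0.0.1:8080/").ok().map(Box::new);
    assert_eq!(NetworkInterceptManager::new(&ip), NetworkInterceptManager::Unknown);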
3 changes: 3 additions & 0 deletions spider_chrome/src/handler/blockers/mod.rs
@@ -18,6 +18,8 @@ pub mod netflix_blockers;
pub mod nytimes_blockers;
/// script blockers
pub mod scripts;
/// block tcgplayer.com
pub mod tcgplayer_blockers;
/// tiktok blockers
pub mod tiktok_blockers;
/// upwork blockers
@@ -26,6 +28,7 @@ pub mod upwork_blockers;
pub mod wikipedia_blockers;
/// x blockers
pub mod x_blockers;

+
pub mod xhr;


