From c2dbd969bf615e7ef83dd4bbb052eef48c20ef11 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Thu, 12 Dec 2024 13:35:30 -0500 Subject: [PATCH] chore(chrome): add to block list --- Cargo.lock | 12 +-- examples/real_world.rs | 65 +++++++----- spider/Cargo.toml | 2 +- spider/src/configuration.rs | 4 +- spider/src/features/chrome_common.rs | 13 ++- spider/src/page.rs | 1 + spider/src/website.rs | 2 +- spider_chrome/Cargo.toml | 2 +- .../src/handler/blockers/intercept_manager.rs | 100 +++++++----------- spider_chrome/src/handler/blockers/mod.rs | 3 + spider_chrome/src/handler/blockers/scripts.rs | 3 + .../handler/blockers/tcgplayer_blockers.rs | 27 +++++ spider_chrome/src/handler/blockers/xhr.rs | 1 + spider_chrome/src/handler/network.rs | 13 ++- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 18 files changed, 145 insertions(+), 111 deletions(-) create mode 100644 spider_chrome/src/handler/blockers/tcgplayer_blockers.rs diff --git a/Cargo.lock b/Cargo.lock index d8345eb93..0c310f86e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4273,7 +4273,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.21.16" +version = "2.21.17" dependencies = [ "ahash", "aho-corasick", @@ -4336,7 +4336,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.21.16" +version = "2.21.17" dependencies = [ "adblock", "base64 0.22.1", @@ -4373,7 +4373,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.21.16" +version = "2.21.17" dependencies = [ "clap", "env_logger", @@ -4398,7 +4398,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.21.16" +version = "2.21.17" dependencies = [ "aho-corasick", "fast_html2md", @@ -4420,7 +4420,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.21.16" +version = "2.21.17" dependencies = [ "indexmap 1.9.3", "serde", @@ -4432,7 +4432,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.21.16" +version = "2.21.17" dependencies = [ "env_logger", "lazy_static", diff --git a/examples/real_world.rs b/examples/real_world.rs index faff6bacc..488875f32 100644 --- a/examples/real_world.rs +++ b/examples/real_world.rs @@ -1,7 +1,8 @@ -//! cargo run --example real_world --features="chrome chrome_intercept spider_utils/transformations" +//! 
cargo run --example real_world --features="chrome chrome_intercept real_browser spider_utils/transformations" extern crate spider; use crate::spider::tokio::io::AsyncWriteExt; +use spider::configuration::{WaitForDelay, WaitForSelector}; use spider::tokio; use spider::website::Website; use spider::{ @@ -14,12 +15,18 @@ async fn crawl_website(url: &str) -> Result<()> { let mut stdout = tokio::io::stdout(); let mut website: Website = Website::new(url) - .with_limit(1) + .with_limit(5) .with_chrome_intercept(RequestInterceptConfiguration::new(true)) .with_wait_for_idle_network(Some(WaitForIdleNetwork::new(Some(Duration::from_millis( - 200, + 500, ))))) + .with_subdomains(true) + .with_wait_for_idle_dom(Some(WaitForSelector::new( + Some(Duration::from_millis(100)), + "body".into(), + ))) .with_block_assets(true) + // .with_wait_for_delay(Some(WaitForDelay::new(Some(Duration::from_millis(10000))))) .with_stealth(true) .with_return_page_links(true) .with_fingerprint(true) @@ -30,36 +37,40 @@ async fn crawl_website(url: &str) -> Result<()> { let mut rx2 = website.subscribe(16).unwrap(); - tokio::spawn(async move { - while let Ok(page) = rx2.recv().await { - let _ = stdout - .write_all( - format!( - "- {} -- Bytes transferred {:?} -- HTML Size {:?} -- Links: {:?}\n", - page.get_url(), - page.bytes_transferred.unwrap_or_default(), - page.get_html_bytes_u8().len(), - match page.page_links { - Some(ref l) => l.len(), - _ => 0, - } + let start = crate::tokio::time::Instant::now(); + + let (links, _) = tokio::join!( + async move { + website.crawl().await; + website.unsubscribe(); + website.get_links() + }, + async move { + while let Ok(page) = rx2.recv().await { + let _ = stdout + .write_all( + format!( + "- {} -- Bytes transferred {:?} -- HTML Size {:?} -- Links: {:?}\n", + page.get_url(), + page.bytes_transferred.unwrap_or_default(), + page.get_html_bytes_u8().len(), + match page.page_links { + Some(ref l) => l.len(), + _ => 0, + } + ) + .as_bytes(), ) - .as_bytes(), - ) - .await; + .await; + } } - }); - - let start = crate::tokio::time::Instant::now(); - website.crawl().await; + ); let duration = start.elapsed(); - let links = website.get_links(); - println!( "Time elapsed in website.crawl({}) is: {:?} for total pages: {:?}", - website.get_url(), + url, duration, links.len() ); @@ -71,7 +82,7 @@ async fn crawl_website(url: &str) -> Result<()> { async fn main() -> Result<()> { env_logger::init(); let _ = tokio::join!( - crawl_website("https://choosealicense.com"), + crawl_website("https://www.choosealicense.com"), crawl_website("https://jeffmendez.com"), crawl_website("https://example.com"), ); diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 992df9431..087e50fa2 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.21.16" +version = "2.21.17" authors = [ "j-mendez " ] diff --git a/spider/src/configuration.rs b/spider/src/configuration.rs index 126b03ade..ff90236e3 100644 --- a/spider/src/configuration.rs +++ b/spider/src/configuration.rs @@ -853,7 +853,7 @@ impl Configuration { pub fn with_chrome_intercept( &mut self, chrome_intercept: RequestInterceptConfiguration, - url: &str, + url: &Option>, ) -> &mut Self { self.chrome_intercept = chrome_intercept; self.chrome_intercept.setup_intercept_manager(url); @@ -865,7 +865,7 @@ impl Configuration { pub fn with_chrome_intercept( &mut self, _chrome_intercept: RequestInterceptConfiguration, - _url: &str, + _url: &Option>, ) -> &mut Self { self } diff --git 
a/spider/src/features/chrome_common.rs b/spider/src/features/chrome_common.rs index cdd28bbb7..ec8da2eef 100644 --- a/spider/src/features/chrome_common.rs +++ b/spider/src/features/chrome_common.rs @@ -16,13 +16,9 @@ pub enum NetworkInterceptManager { #[cfg(not(feature = "chrome"))] impl NetworkInterceptManager { /// a custom intercept handle. - pub fn new(_url: &str) -> NetworkInterceptManager { + pub fn new(_url: &Option>) -> NetworkInterceptManager { NetworkInterceptManager::Unknown } - /// Setup the intercept handle - pub fn setup(&mut self, url: &str) -> Self { - NetworkInterceptManager::new(url) - } } #[derive(Debug, Default, Clone, PartialEq)] @@ -682,7 +678,10 @@ impl RequestInterceptConfiguration { } } /// Setup a new intercept config with a custom intercept manager. - pub fn new_manager(enabled: bool, url: &str) -> RequestInterceptConfiguration { + pub fn new_manager( + enabled: bool, + url: &Option>, + ) -> RequestInterceptConfiguration { RequestInterceptConfiguration { enabled, block_javascript: false, @@ -695,7 +694,7 @@ impl RequestInterceptConfiguration { } /// Setup the network request manager type. - pub fn setup_intercept_manager(&mut self, url: &str) { + pub fn setup_intercept_manager(&mut self, url: &Option>) { self.intercept_manager = NetworkInterceptManager::new(url); } diff --git a/spider/src/page.rs b/spider/src/page.rs index 978ab62ca..bfde640ad 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -229,6 +229,7 @@ pub fn push_link>( ) { if let Some(b) = base { let mut abs = convert_abs_path(b, href); + let new_page = abs != **b; if let Some(link_map) = links_pages { diff --git a/spider/src/website.rs b/spider/src/website.rs index f168d16eb..7a19898e5 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -4046,7 +4046,7 @@ impl Website { chrome_intercept: RequestInterceptConfiguration, ) -> &mut Self { self.configuration - .with_chrome_intercept(chrome_intercept, &self.url); + .with_chrome_intercept(chrome_intercept, &self.domain_parsed); self } diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 53287fdf1..ead6166eb 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.21.16" +version = "2.21.17" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_chrome/src/handler/blockers/intercept_manager.rs b/spider_chrome/src/handler/blockers/intercept_manager.rs index fdc8f239e..963e9260c 100644 --- a/spider_chrome/src/handler/blockers/intercept_manager.rs +++ b/spider_chrome/src/handler/blockers/intercept_manager.rs @@ -1,3 +1,5 @@ +use phf::phf_map; + /// Custom network intercept types to expect on a domain #[derive(Debug, Default, Clone, Copy, serde::Serialize, serde::Deserialize, PartialEq)] pub enum NetworkInterceptManager { @@ -25,76 +27,52 @@ pub enum NetworkInterceptManager { Nytimes, /// wikipedia.com Wikipedia, + /// tcgplayer.com + Tcgplayer, #[default] /// Unknown Unknown, } -lazy_static::lazy_static! { - /// Top tier list of the most common websites visited. 
-    pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 21] = [
-        ("https://www.tiktok.com", NetworkInterceptManager::TikTok),
-        ("https://tiktok.com", NetworkInterceptManager::TikTok),
-        ("https://www.amazon.", NetworkInterceptManager::Amazon),
-        ("https://amazon.", NetworkInterceptManager::Amazon),
-        ("https://www.x.com", NetworkInterceptManager::X),
-        ("https://x.com", NetworkInterceptManager::X),
-        ("https://www.netflix.com", NetworkInterceptManager::Netflix),
-        ("https://netflix.com", NetworkInterceptManager::Netflix),
-        (
-            "https://www.linkedin.com",
-            NetworkInterceptManager::LinkedIn
-        ),
-        ("https://linkedin.com", NetworkInterceptManager::LinkedIn),
-        ("https://www.upwork.com", NetworkInterceptManager::Upwork),
-        ("https://upwork.com", NetworkInterceptManager::Upwork),
-        ("https://www.glassdoor.", NetworkInterceptManager::Glassdoor),
-        ("https://glassdoor.", NetworkInterceptManager::Glassdoor),
-        ("https://www.medium.com", NetworkInterceptManager::Medium),
-        ("https://medium.com", NetworkInterceptManager::Medium),
-        ("https://www.ebay.", NetworkInterceptManager::Ebay),
-        ("https://ebay.", NetworkInterceptManager::Ebay),
-        ("https://www.nytimes.com", NetworkInterceptManager::Nytimes),
-        ("https://nytimes.com", NetworkInterceptManager::Nytimes),
-        ("wikipedia.org", NetworkInterceptManager::Wikipedia),
-    ];
-}
-
-/// The find type is own.
-#[derive(Default, Debug, Clone, Hash, PartialEq, Eq)]
-enum FindType {
-    #[default]
-    /// Starts with.
-    StartsWith,
-    /// Contains.
-    Contains,
-}
+// A constant map using `phf` that maps base domains to their respective intercept manager.
+static DOMAIN_MAP: phf::Map<&'static str, NetworkInterceptManager> = phf_map! {
+    "tiktok.com" => NetworkInterceptManager::TikTok,
+    "facebook.com" => NetworkInterceptManager::Facebook,
+    "amazon.com" => NetworkInterceptManager::Amazon,
+    "x.com" => NetworkInterceptManager::X,
+    "linkedin.com" => NetworkInterceptManager::LinkedIn,
+    "netflix.com" => NetworkInterceptManager::Netflix,
+    "medium.com" => NetworkInterceptManager::Medium,
+    "upwork.com" => NetworkInterceptManager::Upwork,
+    "glassdoor.com" => NetworkInterceptManager::Glassdoor,
+    "ebay.com" => NetworkInterceptManager::Ebay,
+    "nytimes.com" => NetworkInterceptManager::Nytimes,
+    "wikipedia.org" => NetworkInterceptManager::Wikipedia,
+    "tcgplayer.com" => NetworkInterceptManager::Tcgplayer,
+};
 
 impl NetworkInterceptManager {
     /// a custom intercept handle.
-    pub fn new(url: &str) -> NetworkInterceptManager {
-        TOP_TIER_LIST
-            .iter()
-            .find(|&(pattern, nm)| {
-                if nm.get_pattern() == FindType::StartsWith {
-                    url.starts_with(pattern)
-                } else {
-                    url.contains(pattern)
-                }
-            })
-            .map(|&(_, manager_type)| manager_type)
-            .unwrap_or(NetworkInterceptManager::Unknown)
-    }
-    /// Setup the intercept handle
-    pub fn setup(&mut self, url: &str) -> Self {
-        NetworkInterceptManager::new(url)
-    }
+    pub fn new(url: &Option<Box<Url>>) -> NetworkInterceptManager {
+        if let Some(parsed_url) = url {
+            if let Some(domain) = parsed_url.domain() {
+                // Reduce the domain to at most its last two labels; a second pass can refine this.
+                let domain_parts: Vec<&str> = domain.split('.').collect();
+
+                let base_domain = if domain_parts.len() > 2 {
+                    format!(
+                        "{}.{}",
+                        domain_parts[domain_parts.len() - 2],
+                        domain_parts[domain_parts.len() - 1]
+                    )
+                } else {
+                    domain.to_string()
+                };
 
-    /// determine the pattern to use.
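The new `new()` above reduces the parsed domain to its last two labels before the `DOMAIN_MAP` lookup, so a subdomain such as shop.tcgplayer.com still resolves to the "tcgplayer.com" entry, replacing the linear `starts_with`/`contains` scan removed below. A minimal standalone sketch of that reduction and its main caveat, assuming the `url` crate; `base_domain` is an illustrative name, not a helper from this patch:

use url::Url;

/// Keep at most the last two labels of a host: "shop.tcgplayer.com" -> "tcgplayer.com".
fn base_domain(url: &Url) -> Option<String> {
    let domain = url.domain()?;
    let parts: Vec<&str> = domain.split('.').collect();
    Some(if parts.len() > 2 {
        format!("{}.{}", parts[parts.len() - 2], parts[parts.len() - 1])
    } else {
        domain.to_string()
    })
}

fn main() {
    let u = Url::parse("https://shop.tcgplayer.com/magic").unwrap();
    assert_eq!(base_domain(&u).as_deref(), Some("tcgplayer.com"));
    // Caveat: multi-label public suffixes collapse to the suffix itself.
    let uk = Url::parse("https://news.bbc.co.uk/").unwrap();
    assert_eq!(base_domain(&uk).as_deref(), Some("co.uk"));
}

For the mostly .com/.org keys in DOMAIN_MAP this heuristic suffices; a public-suffix check would be needed before adding an entry like bbc.co.uk.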
- fn get_pattern(&self) -> FindType { - match self { - NetworkInterceptManager::Wikipedia => FindType::Contains, - _ => FindType::StartsWith, + return *DOMAIN_MAP + .get(&base_domain) + .unwrap_or(&NetworkInterceptManager::Unknown); + } } + NetworkInterceptManager::Unknown } } diff --git a/spider_chrome/src/handler/blockers/mod.rs b/spider_chrome/src/handler/blockers/mod.rs index c923c7d8f..694e65fc9 100644 --- a/spider_chrome/src/handler/blockers/mod.rs +++ b/spider_chrome/src/handler/blockers/mod.rs @@ -18,6 +18,8 @@ pub mod netflix_blockers; pub mod nytimes_blockers; /// script blockers pub mod scripts; +/// block tcgplayer.com +pub mod tcgplayer_blockers; /// tiktok blockers pub mod tiktok_blockers; /// upwork blockers @@ -26,6 +28,7 @@ pub mod upwork_blockers; pub mod wikipedia_blockers; /// x blockers pub mod x_blockers; + /// xhr blockers pub mod xhr; diff --git a/spider_chrome/src/handler/blockers/scripts.rs b/spider_chrome/src/handler/blockers/scripts.rs index 080d4bff7..67bbe337d 100644 --- a/spider_chrome/src/handler/blockers/scripts.rs +++ b/spider_chrome/src/handler/blockers/scripts.rs @@ -92,6 +92,7 @@ lazy_static::lazy_static! { "https://assets.adobedtm.com/extensions/", "https://macro.adnami.io/macro/spec/adsm.macro.", "https://log.medietall.no/analytics.js", + "https://cdn.siftscience.com/s.js", "https://lwadm.com/lw/pbjs?", "https://cl.k5a.io/", "https://cdn-cookieyes.com/", @@ -139,6 +140,7 @@ lazy_static::lazy_static! { let patterns = [ "https://www.youtube.com/embed/", // YouTube video embeds "https://www.google.com/maps/embed?", // Google Maps embeds + "https://maps.google.com", // Google maps iframe. "https://player.vimeo.com/video/", // Vimeo video embeds "https://player.vimeo.com/api/player.js", // Vimeo video embeds "https://open.spotify.com/embed/", // Spotify music embeds @@ -193,6 +195,7 @@ lazy_static::lazy_static! { ".googlesyndication.com/safeframe/", // repeat consent js "/ccpa/user-consent.min.js", + "consent-manager", "/cookiebanner/js/", "cookielaw.org", // privacy diff --git a/spider_chrome/src/handler/blockers/tcgplayer_blockers.rs b/spider_chrome/src/handler/blockers/tcgplayer_blockers.rs new file mode 100644 index 000000000..bdf79b05d --- /dev/null +++ b/spider_chrome/src/handler/blockers/tcgplayer_blockers.rs @@ -0,0 +1,27 @@ +use crate::handler::blockers::Trie; + +lazy_static::lazy_static! { + /// Ignore list of urls. + static ref URL_IGNORE_TRIE: Trie = { + let mut trie = Trie::new(); + let patterns = [ + "https://data.tcgplayer.com/suggestions/trending", + "https://mpapi.tcgplayer.com/v2/kickbacks?active=true", + "https://homepage.marketplace.tcgplayer.com/sitealert.json", + "https://infinite-api.tcgplayer.com/signup/?", + "https://features.tcgplayer.com/v1/optimizely/Variation/", + "https://mpapi.tcgplayer.com/v2/address/countryCodes?mpfev=3031" + ]; + for pattern in &patterns { + trie.insert(pattern); + } + trie + }; +} + +// Block tcgplayer events that are not required +pub fn block_tcgplayer( + event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, +) -> bool { + URL_IGNORE_TRIE.contains_prefix(&event.request.url) +} diff --git a/spider_chrome/src/handler/blockers/xhr.rs b/spider_chrome/src/handler/blockers/xhr.rs index 28e15e351..90acf9f49 100644 --- a/spider_chrome/src/handler/blockers/xhr.rs +++ b/spider_chrome/src/handler/blockers/xhr.rs @@ -87,6 +87,7 @@ lazy_static::lazy_static! 
{ "https://events.api.secureserver.net/", "https://csp.secureserver.net/eventbus", "https://doh.cq0.co/resolve", + "https://cdn.segment.", ".wixapps.net/api/v1/bulklog", "https://error-analytics-sessions-production.shopifysvc.com/", "https://static-forms.", diff --git a/spider_chrome/src/handler/network.rs b/spider_chrome/src/handler/network.rs index a278ba009..6d56f4d46 100644 --- a/spider_chrome/src/handler/network.rs +++ b/spider_chrome/src/handler/network.rs @@ -459,6 +459,9 @@ impl NetworkManager { NetworkInterceptManager::Wikipedia => { super::blockers::wikipedia_blockers::block_wikipedia(event) } + NetworkInterceptManager::Tcgplayer => { + super::blockers::tcgplayer_blockers::block_tcgplayer(event) + } NetworkInterceptManager::Nytimes => { super::blockers::nytimes_blockers::block_nytimes( event, @@ -484,6 +487,11 @@ impl NetworkManager { }; if skip_networking { + tracing::debug!( + "Blocked: {:?} - {}", + event.resource_type, + event.request.url + ); let fullfill_params = crate::handler::network::fetch::FulfillRequestParams::new( event.request_id.clone(), @@ -492,7 +500,7 @@ impl NetworkManager { self.push_cdp_request(fullfill_params); } else { tracing::debug!( - "Network Allowed: {:?} - {}", + "Allowed: {:?} - {}", event.resource_type, event.request.url ); @@ -593,6 +601,9 @@ impl NetworkManager { NetworkInterceptManager::LinkedIn => { super::blockers::linkedin_blockers::block_linkedin(event) } + NetworkInterceptManager::Tcgplayer => { + super::blockers::tcgplayer_blockers::block_tcgplayer(event) + } NetworkInterceptManager::Medium => { super::blockers::medium_blockers::block_medium(event) } diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index e2f22f734..f08e892e6 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.21.16" +version = "2.21.17" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index d0e8b1055..86c0e5442 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.21.16" +version = "2.21.17" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 172fce90e..e4a51132c 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.21.16" +version = "2.21.17" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 97f14725a..4d9e993d2 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.21.16" +version = "2.21.17" authors = [ "j-mendez " ]
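Taken together, `Website::with_chrome_intercept` now forwards the crawl target's pre-parsed URL (`self.domain_parsed`) instead of a raw `&str`, so the intercept manager, including the new `Tcgplayer` variant and its URL_IGNORE_TRIE, is selected from the domain automatically. A trimmed sketch of that wiring, modeled on examples/real_world.rs above; the import paths and the "chrome chrome_intercept" feature flags are assumed from that example:

extern crate spider;

use spider::features::chrome_common::RequestInterceptConfiguration;
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    // Building the site parses the URL once; with_chrome_intercept passes the
    // parsed domain to NetworkInterceptManager::new, which maps tcgplayer.com
    // to the Tcgplayer blocker set during the crawl.
    let mut website: Website = Website::new("https://www.tcgplayer.com")
        .with_chrome_intercept(RequestInterceptConfiguration::new(true))
        .with_block_assets(true)
        .build()
        .unwrap();

    website.crawl().await;
    println!("pages crawled: {}", website.get_links().len());
}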