chore(chrome): add to block list
j-mendez committed Dec 12, 2024
1 parent 4df8e5b commit ca712b5
Showing 18 changed files with 208 additions and 110 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock


65 changes: 38 additions & 27 deletions examples/real_world.rs
@@ -1,7 +1,8 @@
//! cargo run --example real_world --features="chrome chrome_intercept spider_utils/transformations"
//! cargo run --example real_world --features="chrome chrome_intercept real_browser spider_utils/transformations"
extern crate spider;
use crate::spider::tokio::io::AsyncWriteExt;
use spider::configuration::{WaitForDelay, WaitForSelector};
use spider::tokio;
use spider::website::Website;
use spider::{
@@ -14,12 +15,18 @@ async fn crawl_website(url: &str) -> Result<()> {
let mut stdout = tokio::io::stdout();

let mut website: Website = Website::new(url)
.with_limit(1)
.with_limit(5)
.with_chrome_intercept(RequestInterceptConfiguration::new(true))
.with_wait_for_idle_network(Some(WaitForIdleNetwork::new(Some(Duration::from_millis(
200,
500,
)))))
.with_subdomains(true)
.with_wait_for_idle_dom(Some(WaitForSelector::new(
Some(Duration::from_millis(100)),
"body".into(),
)))
.with_block_assets(true)
// .with_wait_for_delay(Some(WaitForDelay::new(Some(Duration::from_millis(10000)))))
.with_stealth(true)
.with_return_page_links(true)
.with_fingerprint(true)
@@ -30,36 +37,40 @@ async fn crawl_website(url: &str) -> Result<()> {

let mut rx2 = website.subscribe(16).unwrap();

tokio::spawn(async move {
while let Ok(page) = rx2.recv().await {
let _ = stdout
.write_all(
format!(
"- {} -- Bytes transferred {:?} -- HTML Size {:?} -- Links: {:?}\n",
page.get_url(),
page.bytes_transferred.unwrap_or_default(),
page.get_html_bytes_u8().len(),
match page.page_links {
Some(ref l) => l.len(),
_ => 0,
}
let start = crate::tokio::time::Instant::now();

let (links, _) = tokio::join!(
async move {
website.crawl().await;
website.unsubscribe();
website.get_links()
},
async move {
while let Ok(page) = rx2.recv().await {
let _ = stdout
.write_all(
format!(
"- {} -- Bytes transferred {:?} -- HTML Size {:?} -- Links: {:?}\n",
page.get_url(),
page.bytes_transferred.unwrap_or_default(),
page.get_html_bytes_u8().len(),
match page.page_links {
Some(ref l) => l.len(),
_ => 0,
}
)
.as_bytes(),
)
.as_bytes(),
)
.await;
.await;
}
}
});

let start = crate::tokio::time::Instant::now();
website.crawl().await;
);

let duration = start.elapsed();

let links = website.get_links();

println!(
"Time elapsed in website.crawl({}) is: {:?} for total pages: {:?}",
website.get_url(),
url,
duration,
links.len()
);
@@ -71,7 +82,7 @@ async fn crawl_website(url: &str) -> Result<()> {
async fn main() -> Result<()> {
env_logger::init();
let _ = tokio::join!(
crawl_website("https://choosealicense.com"),
crawl_website("https://www.choosealicense.com"),
crawl_website("https://jeffmendez.com"),
crawl_website("https://example.com"),
);
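
The updated example replaces the detached tokio::spawn subscriber with a tokio::join! that drives the crawl and the page stream together on one task, so the timer and the link count are read in one place after both sides finish. A minimal sketch of that pattern, assuming spider's sync feature (which provides subscribe) and its tokio re-export; the target URL is a placeholder:

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://example.com");
    // Broadcast receiver of crawled pages; capacity 16 as in the example.
    let mut rx = website.subscribe(16).unwrap();

    let start = tokio::time::Instant::now();

    // Run the crawl and the subscriber concurrently: pages print as they
    // arrive, and unsubscribing closes the channel so the receiver loop
    // ends once the crawl completes.
    let (links, _) = tokio::join!(
        async move {
            website.crawl().await;
            website.unsubscribe();
            website.get_links()
        },
        async move {
            while let Ok(page) = rx.recv().await {
                println!("- {}", page.get_url());
            }
        }
    );

    println!("crawled {} pages in {:?}", links.len(), start.elapsed());
}
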
2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.21.16"
version = "2.21.17"
authors = [
"j-mendez <[email protected]>"
]
4 changes: 2 additions & 2 deletions spider/src/configuration.rs
@@ -853,7 +853,7 @@ impl Configuration {
pub fn with_chrome_intercept(
&mut self,
chrome_intercept: RequestInterceptConfiguration,
url: &str,
url: &Option<Box<url::Url>>,
) -> &mut Self {
self.chrome_intercept = chrome_intercept;
self.chrome_intercept.setup_intercept_manager(url);
@@ -865,7 +865,7 @@ impl Configuration {
pub fn with_chrome_intercept(
&mut self,
_chrome_intercept: RequestInterceptConfiguration,
_url: &str,
_url: &Option<Box<url::Url>>,
) -> &mut Self {
self
}
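
with_chrome_intercept now receives the crawl target as a pre-parsed Option<Box<url::Url>> instead of a raw &str, so the intercept manager can be matched against the host without re-parsing on every call. A hypothetical call site under the new signature (the module paths, helper name, and direct use of the url crate are assumptions, not part of this commit):

use spider::configuration::Configuration;
use spider::features::chrome_common::RequestInterceptConfiguration;

fn enable_intercept(conf: &mut Configuration) {
    // Parse once up front; a failed parse yields None, which maps to the
    // default (Unknown) intercept manager.
    let parsed: Option<Box<url::Url>> =
        url::Url::parse("https://example.com").ok().map(Box::new);
    conf.with_chrome_intercept(RequestInterceptConfiguration::new(true), &parsed);
}
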
13 changes: 6 additions & 7 deletions spider/src/features/chrome_common.rs
@@ -16,13 +16,9 @@ pub enum NetworkInterceptManager {
#[cfg(not(feature = "chrome"))]
impl NetworkInterceptManager {
/// a custom intercept handle.
pub fn new(_url: &str) -> NetworkInterceptManager {
pub fn new(_url: &Option<Box<url::Url>>) -> NetworkInterceptManager {
NetworkInterceptManager::Unknown
}
/// Setup the intercept handle
pub fn setup(&mut self, url: &str) -> Self {
NetworkInterceptManager::new(url)
}
}

#[derive(Debug, Default, Clone, PartialEq)]
@@ -682,7 +678,10 @@ impl RequestInterceptConfiguration {
}
}
/// Setup a new intercept config with a custom intercept manager.
pub fn new_manager(enabled: bool, url: &str) -> RequestInterceptConfiguration {
pub fn new_manager(
enabled: bool,
url: &Option<Box<url::Url>>,
) -> RequestInterceptConfiguration {
RequestInterceptConfiguration {
enabled,
block_javascript: false,
@@ -695,7 +694,7 @@ impl RequestInterceptConfiguration {
}

/// Setup the network request manager type.
pub fn setup_intercept_manager(&mut self, url: &str) {
pub fn setup_intercept_manager(&mut self, url: &Option<Box<url::Url>>) {
self.intercept_manager = NetworkInterceptManager::new(url);
}

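The same parsed-URL type flows through NetworkInterceptManager::new, new_manager, and setup_intercept_manager. A short sketch of building an intercept configuration from a raw string under the new signatures (the helper function is hypothetical):

use spider::features::chrome_common::RequestInterceptConfiguration;

// Build an enabled intercept config whose manager is derived from the
// parsed URL; an unparseable URL yields None and the Unknown manager.
fn intercept_for(raw: &str) -> RequestInterceptConfiguration {
    let parsed = url::Url::parse(raw).ok().map(Box::new);
    RequestInterceptConfiguration::new_manager(true, &parsed)
}
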
1 change: 1 addition & 0 deletions spider/src/page.rs
@@ -229,6 +229,7 @@ pub fn push_link<A: PartialEq + Eq + std::hash::Hash + From<String>>(
) {
if let Some(b) = base {
let mut abs = convert_abs_path(b, href);

let new_page = abs != **b;

if let Some(link_map) = links_pages {
2 changes: 1 addition & 1 deletion spider/src/website.rs
@@ -4046,7 +4046,7 @@ impl Website {
chrome_intercept: RequestInterceptConfiguration,
) -> &mut Self {
self.configuration
.with_chrome_intercept(chrome_intercept, &self.url);
.with_chrome_intercept(chrome_intercept, &self.domain_parsed);
self
}

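At the Website level the public API is unchanged; the builder now forwards its already-parsed domain_parsed to the configuration instead of the raw URL string. A usage sketch, assuming the chrome feature is enabled (paths as in the sketches above):

use spider::features::chrome_common::RequestInterceptConfiguration;
use spider::website::Website;

fn main() {
    let mut website = Website::new("https://example.com");
    // Same call as before this commit; the pre-parsed domain is threaded
    // through to Configuration::with_chrome_intercept internally.
    website.with_chrome_intercept(RequestInterceptConfiguration::new(true));
}
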
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.21.16"
version = "2.21.17"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"