diff --git a/Cargo.lock b/Cargo.lock
index aa2d1c0bb..89680a897 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3550,7 +3550,7 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "1.41.0"
+version = "1.41.1"
 dependencies = [
  "ahash",
  "bytes",
@@ -3586,7 +3586,7 @@ dependencies = [
 
 [[package]]
 name = "spider_cli"
-version = "1.41.0"
+version = "1.41.1"
 dependencies = [
  "clap 3.2.25",
  "env_logger 0.9.3",
@@ -3598,7 +3598,7 @@ dependencies = [
 
 [[package]]
 name = "spider_examples"
-version = "1.41.0"
+version = "1.41.1"
 dependencies = [
  "convert_case",
  "env_logger 0.9.3",
@@ -3619,7 +3619,7 @@ dependencies = [
 
 [[package]]
 name = "spider_worker"
-version = "1.41.0"
+version = "1.41.1"
 dependencies = [
  "env_logger 0.10.0",
  "lazy_static",
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index df4e4992f..d9b2dd29e 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_examples"
-version = "1.41.0"
+version = "1.41.1"
 authors = ["madeindjs ", "j-mendez "]
 description = "Multithreaded web crawler written in Rust."
 repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
 flexbuffers = "2.0.0"
 
 [dependencies.spider]
-version = "1.41.0"
+version = "1.41.1"
 path = "../spider"
 features = ["serde"]
 
@@ -57,3 +57,8 @@ path = "subscribe.rs"
 [[example]]
 name = "callback"
 path = "callback.rs"
+
+[[example]]
+name = "sitemap"
+path = "sitemap.rs"
+features = ["sitemap"]
diff --git a/examples/sitemap.rs b/examples/sitemap.rs
new file mode 100644
index 000000000..a7117aec3
--- /dev/null
+++ b/examples/sitemap.rs
@@ -0,0 +1,36 @@
+//! `cargo run --example sitemap`
+extern crate spider;
+
+use spider::tokio;
+use spider::website::Website;
+use std::time::Instant;
+
+#[tokio::main]
+async fn main() {
+    let mut website: Website = Website::new("https://rsseau.fr");
+    website
+        .configuration
+        .blacklist_url
+        .insert(Default::default())
+        .push("https://rsseau.fr/resume".into());
+    website.configuration.respect_robots_txt = true;
+    website.configuration.subdomains = false;
+    website.configuration.delay = 0; // Defaults to 0 ms
+    website.configuration.user_agent = Some(Box::new("SpiderBot".into())); // Defaults to spider/x.y.z, where x.y.z is the library version
+
+    let start = Instant::now();
+    website.crawl().await;
+    let duration = start.elapsed();
+
+    let links = website.get_links();
+
+    for link in links {
+        println!("- {:?}", link.as_ref());
+    }
+
+    println!(
+        "Time elapsed in website.crawl() is: {:?} for total pages: {:?}",
+        duration,
+        links.len()
+    )
+}
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 7379a5747..86bd9e679 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "1.41.0"
+version = "1.41.1"
 authors = ["madeindjs ", "j-mendez "]
 description = "The fastest web crawler written in Rust."
 repository = "https://github.com/spider-rs/spider"
diff --git a/spider/README.md b/spider/README.md
index b62a86734..36c18f99a 100644
--- a/spider/README.md
+++ b/spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom
 
 ```toml
 [dependencies]
-spider = "1.41.0"
+spider = "1.41.1"
 ```
 
 And then the code:
@@ -87,7 +87,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl
 
 ```toml
 [dependencies]
-spider = { version = "1.41.0", features = ["regex", "ua_generator"] }
+spider = { version = "1.41.1", features = ["regex", "ua_generator"] }
 ```
 
 1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -115,7 +115,7 @@ Move processing to a worker, drastically increases performance even if worker is
 
 ```toml
 [dependencies]
-spider = { version = "1.41.0", features = ["decentralized"] }
+spider = { version = "1.41.1", features = ["decentralized"] }
 ```
 
 ```sh
@@ -136,7 +136,7 @@ Use the subscribe method to get a broadcast channel.
 
 ```toml
 [dependencies]
-spider = { version = "1.41.0", features = ["sync"] }
+spider = { version = "1.41.1", features = ["sync"] }
 ```
 
 ```rust,no_run
@@ -166,7 +166,7 @@ Allow regex for blacklisting routes
 
 ```toml
 [dependencies]
-spider = { version = "1.41.0", features = ["regex"] }
+spider = { version = "1.41.1", features = ["regex"] }
 ```
 
 ```rust,no_run
@@ -193,7 +193,7 @@ If you are performing large workloads you may need to control the crawler by ena
 
 ```toml
 [dependencies]
-spider = { version = "1.41.0", features = ["control"] }
+spider = { version = "1.41.1", features = ["control"] }
 ```
 
 ```rust
diff --git a/spider/src/configuration.rs b/spider/src/configuration.rs
index 6065edcae..ba8b7d441 100644
--- a/spider/src/configuration.rs
+++ b/spider/src/configuration.rs
@@ -131,10 +131,20 @@ impl Configuration {
         self
     }
 
+    #[cfg(feature = "sitemap")]
+    /// Set the sitemap url.
+    pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
+        match sitemap_url {
+            Some(sitemap_url) => self.sitemap_url = Some(CompactString::new(sitemap_url.to_string()).into()),
+            _ => self.sitemap_url = None,
+        };
+        self
+    }
+
     /// Add user agent to request.
-    pub fn with_user_agent(&mut self, user_agent: Option) -> &mut Self {
+    pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
         match user_agent {
-            Some(agent) => self.user_agent = Some(agent.into()),
+            Some(agent) => self.user_agent = Some(CompactString::new(agent.to_string()).into()),
             _ => self.user_agent = None,
         };
         self
diff --git a/spider/src/website.rs b/spider/src/website.rs
index a4e191dd4..13152101d 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -1575,6 +1575,7 @@ impl Website {
                             _ => (),
                         }
                     });
+                }
                 Location::None | Location::ParseErr(_) => (),
             },
 
@@ -1601,6 +1602,8 @@ impl Website {
             Err(err) => log("http network error: ", err.to_string()),
         };
 
+        drop(tx);
+
         if let Ok(handle) = handles.await {
             match self.pages.as_mut() {
                 Some(p) => p.extend(handle),
@@ -1653,11 +1656,18 @@ impl Website {
     }
     /// Add user agent to request.
-    pub fn with_user_agent(&mut self, user_agent: Option) -> &mut Self {
+    pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
         self.configuration.with_user_agent(user_agent);
         self
     }
+    #[cfg(feature = "sitemap")]
+    /// Set the sitemap url.
+    pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
+        self.configuration.with_sitemap(sitemap_url);
+        self
+    }
+
     /// Use proxies for request.
     pub fn with_proxies(&mut self, proxies: Option>) -> &mut Self {
         self.configuration.with_proxies(proxies);
         self
     }
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index b5c6c0a58..a8eaefa22 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "1.41.0"
+version = "1.41.1"
 authors = ["madeindjs ", "j-mendez "]
 description = "The fastest web crawler CLI written in Rust."
 repository = "https://github.com/spider-rs/spider"
@@ -26,7 +26,7 @@ quote = "1.0.18"
 failure_derive = "0.1.8"
 
 [dependencies.spider]
-version = "1.41.0"
+version = "1.41.1"
 path = "../spider"
 
 [[bin]]
diff --git a/spider_cli/README.md b/spider_cli/README.md
index 00fdf1f68..fb6faf5f1 100644
--- a/spider_cli/README.md
+++ b/spider_cli/README.md
@@ -40,7 +40,7 @@ spider --domain http://localhost:3000 download -t _temp_spider_downloads
 ```
 
 ```sh
-spider_cli 1.41.0
+spider_cli 1.41.1
 madeindjs , j-mendez 
 The fastest web crawler CLI written in Rust.
 
diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml
index 7f4182cb4..1daf88412 100644
--- a/spider_worker/Cargo.toml
+++ b/spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "1.41.0"
+version = "1.41.1"
 authors = ["madeindjs ", "j-mendez "]
 description = "The fastest web crawler CLI written in Rust."
 repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ lazy_static = "1.4.0"
 env_logger = "0.10.0"
 
 [dependencies.spider]
-version = "1.41.0"
+version = "1.41.1"
 path = "../spider"
 features = ["serde", "flexbuffers"]
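
For downstream callers, the user-facing changes in this patch are the borrowed `&str` builder setters: `with_user_agent` now takes `Option<&str>`, and a feature-gated `with_sitemap` setter is added to both `Configuration` and `Website`. The sketch below is not part of the diff; it assumes the crate exposes a `sitemap` cargo feature (as the `#[cfg(feature = "sitemap")]` attributes imply), the sitemap URL is a placeholder, and the diff itself does not show how `crawl()` consumes the configured `sitemap_url`.

```rust
//! Minimal sketch of the 1.41.1 builder API; assumes the `sitemap`
//! feature is enabled on the `spider` dependency.
extern crate spider;

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://rsseau.fr");

    website
        // `with_user_agent` now borrows the agent string instead of taking an owned value.
        .with_user_agent(Some("SpiderBot"))
        // New feature-gated setter; the URL is a placeholder and `None` clears it.
        .with_sitemap(Some("https://rsseau.fr/sitemap.xml"));

    website.crawl().await;

    for link in website.get_links() {
        println!("- {:?}", link.as_ref());
    }
}
```

Accepting `Option<&str>` and converting to `CompactString` internally keeps the string representation an implementation detail, so callers can pass plain string literals without allocating first.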
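
The `drop(tx)` added before `handles.await` in the second `website.rs` hunk follows a standard tokio channel rule: a receive loop only ends once every sender handle has been dropped, so awaiting the collector task while still holding `tx` would hang. The snippet below is a self-contained illustration of that rule, not spider's actual internals; it assumes a plain `tokio` dependency with the `rt-multi-thread`, `macros`, and `sync` features.

```rust
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel::<u32>();

    // Collector task: `recv()` yields `None` only after every sender is gone,
    // which is what lets this loop (and the task) finish.
    let handles = tokio::spawn(async move {
        let mut pages = Vec::new();
        while let Some(page) = rx.recv().await {
            pages.push(page);
        }
        pages
    });

    for page in 0..3u32 {
        tx.send(page).unwrap();
    }

    // Without this drop, `handles.await` below would never resolve: the
    // receiver would still be parked waiting on the live sender.
    drop(tx);

    let pages = handles.await.unwrap();
    println!("collected {} pages", pages.len());
}
```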