Commit

chore(sitemap): fix sender drop
j-mendez committed Sep 14, 2023
1 parent 3680887 commit d752534
Showing 10 changed files with 82 additions and 21 deletions.
8 changes: 4 additions & 4 deletions Cargo.lock

Cargo.lock is a generated file, so its diff is not rendered.

9 changes: 7 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.41.0"
version = "1.41.1"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.41.0"
version = "1.41.1"
path = "../spider"
features = ["serde"]

@@ -57,3 +57,8 @@ path = "subscribe.rs"
[[example]]
name = "callback"
path = "callback.rs"

+[[example]]
+name = "sitemap"
+path = "sitemap.rs"
+features = ["sitemap"]
36 changes: 36 additions & 0 deletions examples/sitemap.rs
@@ -0,0 +1,36 @@
//! `cargo run --example sitemap`
extern crate spider;

use spider::tokio;
use spider::website::Website;
use std::time::Instant;

#[tokio::main]
async fn main() {
let mut website: Website = Website::new("https://rsseau.fr");
website
.configuration
.blacklist_url
.insert(Default::default())
.push("https://rsseau.fr/resume".into());
website.configuration.respect_robots_txt = true;
website.configuration.subdomains = false;
website.configuration.delay = 0; // Defaults to 0 ms
website.configuration.user_agent = Some(Box::new("SpiderBot".into())); // Defaults to spider/x.y.z, where x.y.z is the library version

let start = Instant::now();
website.crawl().await;
let duration = start.elapsed();

let links = website.get_links();

for link in links {
println!("- {:?}", link.as_ref());
}

println!(
"Time elapsed in website.crawl() is: {:?} for total pages: {:?}",
duration,
links.len()
)
}
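
The example as committed drives a plain `crawl()`; presumably the sitemap-specific behavior comes from also pointing the crawler at a sitemap with the `with_sitemap` setter added later in this commit. A minimal sketch of that wiring, with the caveat that whether the setter expects an absolute URL or a path relative to the start domain is not shown in this diff:

```rust
//! Hypothetical variant of examples/sitemap.rs; needs spider's `sitemap` feature.
extern crate spider;

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://rsseau.fr");

    // Added in this commit behind the `sitemap` feature; the argument form
    // (absolute URL vs. relative path) is an assumption.
    website.with_sitemap(Some("https://rsseau.fr/sitemap.xml"));

    website.crawl().await;

    for link in website.get_links() {
        println!("- {:?}", link.as_ref());
    }
}
```

Note that Cargo gates example targets with `required-features`; the bare `features` key in the `[[example]]` entry above is likely ignored, so running this probably still needs the feature enabled explicitly, e.g. `cargo run --example sitemap --features spider/sitemap`.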
2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.41.0"
version = "1.41.1"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
12 changes: 6 additions & 6 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.toml`

```toml
[dependencies]
spider = "1.41.0"
spider = "1.41.1"
```

And then the code:
@@ -87,7 +87,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl

```toml
[dependencies]
-spider = { version = "1.41.0", features = ["regex", "ua_generator"] }
+spider = { version = "1.41.1", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -115,7 +115,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
-spider = { version = "1.41.0", features = ["decentralized"] }
+spider = { version = "1.41.1", features = ["decentralized"] }
```

```sh
@@ -136,7 +136,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
-spider = { version = "1.41.0", features = ["sync"] }
+spider = { version = "1.41.1", features = ["sync"] }
```

```rust,no_run
@@ -166,7 +166,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
-spider = { version = "1.41.0", features = ["regex"] }
+spider = { version = "1.41.1", features = ["regex"] }
```

```rust,no_run
@@ -193,7 +193,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
-spider = { version = "1.41.0", features = ["control"] }
+spider = { version = "1.41.1", features = ["control"] }
```

```rust
14 changes: 12 additions & 2 deletions spider/src/configuration.rs
@@ -131,10 +131,20 @@ impl Configuration {
self
}

#[cfg(feature = "sitemap")]
/// Set the sitemap url.
pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
match sitemap_url {
Some(sitemap_url) => self.sitemap_url = Some(CompactString::new(sitemap_url.to_string()).into()),
_ => self.sitemap_url = None,
};
self
}

/// Add user agent to request.
-pub fn with_user_agent(&mut self, user_agent: Option<CompactString>) -> &mut Self {
+pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
match user_agent {
-Some(agent) => self.user_agent = Some(agent.into()),
+Some(agent) => self.user_agent = Some(CompactString::new(agent.to_string()).into()),
_ => self.user_agent = None,
};
self
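Both setters touched here now take plain string slices and build the `CompactString` internally, so downstream call sites no longer construct one themselves. A minimal sketch of a call site under that assumption, using the public `configuration` field seen in the example above:

```rust
use spider::website::Website;

fn configure(website: &mut Website) {
    // Before this commit the setter took Option<CompactString>, so callers
    // built the string themselves, roughly:
    //     website.configuration.with_user_agent(Some(CompactString::new("SpiderBot")));
    // After this commit a plain &str is enough; the CompactString is created
    // inside the setter.
    website.configuration.with_user_agent(Some("SpiderBot"));
}
```

The `Website::with_user_agent` and `Website::with_sitemap` wrappers in the next file forward to these setters.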
12 changes: 11 additions & 1 deletion spider/src/website.rs
@@ -1575,6 +1575,7 @@ impl Website {
_ => (),
}
});

}
Location::None | Location::ParseErr(_) => (),
},
@@ -1601,6 +1602,8 @@ impl Website {
Err(err) => log("http network error: ", err.to_string()),
};

+drop(tx);

if let Ok(handle) = handles.await {
match self.pages.as_mut() {
Some(p) => p.extend(handle),
@@ -1653,11 +1656,18 @@ impl Website {
}

/// Add user agent to request.
-pub fn with_user_agent(&mut self, user_agent: Option<CompactString>) -> &mut Self {
+pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
self.configuration.with_user_agent(user_agent);
self
}

#[cfg(feature = "sitemap")]
/// Add user agent to request.
pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
self.configuration.with_sitemap(sitemap_url);
self
}

/// Use proxies for request.
pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
self.configuration.with_proxies(proxies);
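The `drop(tx)` added above is the fix named in the commit title: the task collecting crawled pages only finishes once every sender handle has been dropped, so holding the original sender while awaiting that task left the sitemap crawl hanging. A minimal sketch of the pattern, with an assumed channel type and names rather than the crate's actual internals:

```rust
use tokio::sync::mpsc::unbounded_channel;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = unbounded_channel::<String>();

    // Collector task: drains the channel until every sender is gone.
    let handles = tokio::spawn(async move {
        let mut pages = Vec::new();
        while let Some(page) = rx.recv().await {
            pages.push(page);
        }
        pages
    });

    // Producers each hold their own clone of the sender.
    for i in 0..4 {
        let tx = tx.clone();
        tokio::spawn(async move {
            let _ = tx.send(format!("https://example.com/page-{}", i));
        });
    }

    // Without this, `rx.recv()` never returns `None` and the await below
    // never completes; that is the hang this commit fixes.
    drop(tx);

    let pages = handles.await.unwrap();
    println!("collected {} pages", pages.len());
}
```

In the crate the receiver side extends `self.pages`, as the hunk above shows, but the rule is the same: the original `tx` has to go out of scope before `handles.await`.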
4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.41.0"
version = "1.41.1"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "The fastest web crawler CLI written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -26,7 +26,7 @@ quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.41.0"
version = "1.41.1"
path = "../spider"

[[bin]]
2 changes: 1 addition & 1 deletion spider_cli/README.md
@@ -40,7 +40,7 @@ spider --domain http://localhost:3000 download -t _temp_spider_downloads
```

```sh
-spider_cli 1.41.0
+spider_cli 1.41.1
madeindjs <[email protected]>, j-mendez <[email protected]>
The fastest web crawler CLI written in Rust.

4 changes: 2 additions & 2 deletions spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "1.41.0"
version = "1.41.1"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "The fastest web crawler CLI written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ lazy_static = "1.4.0"
env_logger = "0.10.0"

[dependencies.spider]
version = "1.41.0"
version = "1.41.1"
path = "../spider"
features = ["serde", "flexbuffers"]

