Skip to content

Commit

Permalink
perf(links): filter dup links after async batch
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Apr 22, 2022
1 parent 1e72ec9 commit 053eea4
Show file tree
Hide file tree
Showing 7 changed files with 18 additions and 9 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

## Unreleased

## v1.6.1

- perf(links): filter dup links after async batch
- chore(delay): fix crawl delay thread groups
- perf(page): slim channel page sending required props

## v1.5.3

- feat(regex): add optional regex black listing
Expand Down
2 changes: 1 addition & 1 deletion benches/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ publish = false
edition = "2021"

[dependencies]
spider = { version = "1.6.0", path = "../spider" }
spider = { version = "1.6.1", path = "../spider" }
criterion = "0.3"

[[bench]]
Expand Down
2 changes: 1 addition & 1 deletion benches/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ We have comparisons set against 3 different languages and libs that can be used

How fast can we crawl all pages on a medium-sized website. Tests are ordered from the largest to the smallest runtimes needed. All examples use the same HTML selector to gather the pages for a website.

### v1.6.0
### v1.6.1

Case: `https://rsseau.fr`

Expand Down
4 changes: 2 additions & 2 deletions examples/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.6.0"
version = "1.6.1"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
Expand All @@ -15,7 +15,7 @@ publish = false
maintenance = { status = "as-is" }

[dependencies.spider]
version = "1.6.0"
version = "1.6.1"
path = "../spider"
default-features = false

Expand Down
2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.6.0"
version = "1.6.1"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
Expand Down
7 changes: 5 additions & 2 deletions spider/src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use std::collections::HashSet;
use std::{sync, thread, time::Duration};
use reqwest::header::CONNECTION;
use reqwest::header;
use sync::mpsc::{channel, Sender, Receiver};

/// Represents a website to crawl. To start crawling, instantiate a new `struct` using
/// <pre>
Expand Down Expand Up @@ -43,6 +44,8 @@ pub struct Website<'a> {
pub page_store_ignore: bool,
}

type Message = (Page, Vec<String>);

impl<'a> Website<'a> {
/// Initialize Website object with a start link to crawl.
pub fn new(domain: &str) -> Self {
Expand Down Expand Up @@ -110,7 +113,7 @@ impl<'a> Website<'a> {

// crawl while links exists
while !self.links.is_empty() {
let (tx, rx) = sync::mpsc::channel();
let (tx, rx): (Sender<Message>, Receiver<Message>) = channel();

for link in self.links.iter() {
if !self.is_allowed(link) {
Expand Down Expand Up @@ -152,7 +155,7 @@ impl<'a> Website<'a> {

});

self.links = new_links;
self.links = new_links.difference(&self.links_visited).cloned().collect();
}
}

Expand Down
4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.6.0"
version = "1.6.1"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
Expand All @@ -23,7 +23,7 @@ quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.6.0"
version = "1.6.1"
path = "../spider"
default-features = false

Expand Down

0 comments on commit 053eea4

Please sign in to comment.