feat(scrape): add html scrape ability (#50)
j-mendez authored May 16, 2022
1 parent 17bafa7 commit 675c040
Showing 10 changed files with 142 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@

![crate version](https://img.shields.io/crates/v/spider.svg)

The fastest web indexer. (SpiderBot)
The fastest web crawler and indexer.

## Getting Started

2 changes: 1 addition & 1 deletion benches/Cargo.toml
@@ -5,7 +5,7 @@ publish = false
edition = "2021"

[dependencies]
spider = { version = "1.7.23", path = "../spider" }
spider = { version = "1.8.0", path = "../spider" }
criterion = "0.3"

[[bench]]
4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.7.23"
version = "1.8.0"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -15,7 +15,7 @@ publish = false
maintenance = { status = "as-is" }

[dependencies.spider]
version = "1.7.23"
version = "1.8.0"
path = "../spider"
default-features = false

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.7.23"
version = "1.8.0"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
4 changes: 2 additions & 2 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic blocking example crawling a web page, add spider to your `Cargo.toml`:

```toml
[dependencies]
spider = "1.7.23"
spider = "1.8.0"
```

And then the code:
@@ -57,7 +57,7 @@ There is an optional "regex" crate that can be enabled:

```toml
[dependencies]
spider = { version = "1.7.23", features = ["regex"] }
spider = { version = "1.8.0", features = ["regex"] }
```

```rust,no_run
5 changes: 5 additions & 0 deletions spider/src/page.rs
@@ -56,6 +56,11 @@ impl Page {
&self.url
}

/// Html getter for page.
pub fn get_html(&self) -> &String {
&self.html
}

/// HTML returned from Scraper.
fn parse_html(&self) -> Html {
Html::parse_document(&self.html)
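The new getter pairs with the `scrape` API added to `Website` later in this commit. A minimal, hypothetical usage sketch (assuming spider `1.8.0` as a dependency; the target URL is only illustrative):

```rust,no_run
extern crate spider;

use spider::website::Website;

fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");

    // Unlike `crawl()`, `scrape()` keeps the fetched HTML on each page.
    website.scrape();

    for page in website.get_pages() {
        println!("{}: {} bytes of html", page.get_url(), page.get_html().len());
    }
}
```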
84 changes: 82 additions & 2 deletions spider/src/website.rs
@@ -33,6 +33,8 @@ pub struct Website<'a> {
links: HashSet<String>,
/// contains all visited URL.
links_visited: HashSet<String>,
/// contains pages visited
pages: Vec<Page>,
/// callback when a link is found.
pub on_link_find_callback: fn(String) -> String,
/// Robot.txt parser holder.
@@ -47,6 +49,7 @@ impl<'a> Website<'a> {
Self {
configuration: Configuration::new(),
links_visited: HashSet::new(),
pages: Vec::new(),
robot_file_parser: RobotFileParser::new(&format!("{}/robots.txt", domain)), // TODO: lazy establish
links: HashSet::from([format!("{}/", domain)]),
on_link_find_callback: |s| s,
@@ -56,7 +59,11 @@

/// page getter
pub fn get_pages(&self) -> Vec<Page> {
self.links_visited.iter().map(|l| Page::build(l, "")).collect()
if !self.pages.is_empty(){
self.pages.clone()
} else {
self.links_visited.iter().map(|l| Page::build(l, "")).collect()
}
}

/// links visited getter
@@ -110,13 +117,20 @@ impl<'a> Website<'a> {
client
}

/// Start to crawl website blocking with async parallelization
/// Start to crawl website with async parallelization
pub fn crawl(&mut self) {
let client = self.setup();

self.crawl_concurrent(&client);
}

/// Start to scrape website with async parallelization
pub fn scrape(&mut self) {
let client = self.setup();

self.scrape_concurrent(&client);
}

/// Start to crawl website in sync
pub fn crawl_sync(&mut self) {
let client = self.setup();
@@ -203,6 +217,54 @@ impl<'a> Website<'a> {
self.links = &new_links - &self.links_visited;
}
}

/// Start to scrape website concurrently and store html
fn scrape_concurrent(&mut self, client: &Client) {
let pool = self.create_thread_pool();
let delay = self.configuration.delay;
let delay_enabled = delay > 0;
let on_link_find_callback = self.on_link_find_callback;

// crawl while links exists
while !self.links.is_empty() {
let (tx, rx): (Sender<Page>, Receiver<Page>) = channel();

for link in self.links.iter() {
if !self.is_allowed(link) {
continue;
}
log("fetch", link);

self.links_visited.insert(link.into());

let link = link.clone();
let tx = tx.clone();
let cx = client.clone();

pool.spawn(move || {
if delay_enabled {
tokio_sleep(&Duration::from_millis(delay));
}
let link_result = on_link_find_callback(link);
let page = Page::new(&link_result, &cx);

tx.send(page).unwrap();
});
}

drop(tx);

let mut new_links: HashSet<String> = HashSet::new();

rx.into_iter().for_each(|page| {
let links = page.links();
new_links.extend(links);
self.pages.push(page);
});

self.links = &new_links - &self.links_visited;
}
}

/// return `true` if URL:
///
@@ -255,6 +317,24 @@ fn crawl() {
);
}

#[test]
fn scrape() {
let mut website: Website = Website::new("https://choosealicense.com");
website.scrape();
assert!(
website
.links_visited
.contains(&"https://choosealicense.com/licenses/".to_string()),
"{:?}",
website.links_visited
);

assert_eq!(
website.get_pages()[0].get_html().is_empty(),
false
);
}

#[test]
fn crawl_subsequential() {
let mut website: Website = Website::new("https://choosealicense.com");
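For readers skimming the diff, `scrape_concurrent` mirrors `crawl_concurrent`: it fans work out to the pool (one job per pending link), fans the resulting `Page` values back in over an mpsc channel, and only then computes the next frontier of links. A stripped-down sketch of that pattern, with plain `std::thread` standing in for the crate's thread pool and a `String` standing in for the fetched page:

```rust
use std::sync::mpsc::channel;
use std::thread;

fn main() {
    let links = vec![
        "https://example.com/a".to_string(),
        "https://example.com/b".to_string(),
    ];
    let (tx, rx) = channel::<String>();

    for link in links {
        let tx = tx.clone();
        thread::spawn(move || {
            // A real worker would fetch and parse the page here.
            tx.send(format!("fetched {}", link)).unwrap();
        });
    }

    // Drop the original sender so `rx` stops blocking once every worker is done.
    drop(tx);

    for result in rx {
        println!("{}", result);
    }
}
```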
5 changes: 3 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.7.23"
version = "1.8.0"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -18,13 +18,14 @@ maintenance = { status = "as-is" }
[dependencies]
clap = { version = "3.1.9", features = ["derive"] }
env_logger = "0.9.0"
serde_json = "1.0.81"

[build-dependencies]
quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.7.23"
version = "1.8.0"
path = "../spider"
default-features = false

Expand Down
35 changes: 35 additions & 0 deletions spider_cli/src/main.rs
@@ -1,5 +1,6 @@
extern crate spider;
extern crate env_logger;
extern crate serde_json;

pub mod options;

@@ -53,6 +54,40 @@ fn main() {
io::stdout().write_all(format!("{:?}", links).as_bytes()).unwrap();
}

}
Some(Commands::SCRAPE { output_html, output_links }) => {
use serde_json::{json};

website.scrape();

let mut page_objects: Vec<_> = vec![];

for page in website.get_pages() {
let mut links: Vec<String> = vec![];
let mut html: &String = &String::new();

if *output_links {
let page_links = page.links();
links.extend(page_links);
}

if *output_html {
html = page.get_html();
}

let page_json = json!({
"url": page.get_url(),
"links": links,
"html": html,
});

page_objects.push(page_json);
}

let j = serde_json::to_string_pretty(&page_objects).unwrap();

io::stdout().write_all(j.as_bytes()).unwrap();

}
None => {}
}
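The SCRAPE handler prints a pretty-printed JSON array to stdout with one object per scraped page, carrying the `url`, `links`, and `html` keys built above. A hypothetical downstream consumer could parse that output roughly like this (a sketch only; it assumes a `serde_json` dependency, and that `links`/`html` simply come through as an empty array/string when the matching flag is off):

```rust
extern crate serde_json;

use serde_json::Value;
use std::io::{self, Read};

fn main() {
    let mut input = String::new();
    io::stdin().read_to_string(&mut input).unwrap();

    // One object per page: { "url": ..., "links": [...], "html": "..." }.
    let pages: Vec<Value> = serde_json::from_str(&input).expect("valid JSON array");

    for page in pages {
        let url = page["url"].as_str().unwrap_or_default();
        let links = page["links"].as_array().map(|l| l.len()).unwrap_or(0);
        let html = page["html"].as_str().map(|h| h.len()).unwrap_or(0);
        println!("{}: {} links, {} bytes of html", url, links, html);
    }
}
```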
11 changes: 10 additions & 1 deletion spider_cli/src/options/sub_command.rs
@@ -2,7 +2,7 @@ use clap::Subcommand;

#[derive(Subcommand)]
pub enum Commands {
/// crawl the website.
/// crawl the website extracting links.
CRAWL {
/// sequentially one by one crawl pages
#[clap(short, long)]
@@ -11,4 +11,13 @@
#[clap(short, long)]
output_links: bool,
},
/// scrape the website extracting html and links.
SCRAPE {
/// stdout all pages links crawled
#[clap(short, long)]
output_links: bool,
/// stdout all pages html crawled
#[clap(long)]
output_html: bool,
},
}
