feat(scrape): add html scrape ability (#50)
j-mendez authored May 16, 2022
1 parent 17bafa7 commit 675c040
Showing 10 changed files with 142 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@

![crate version](https://img.shields.io/crates/v/spider.svg)

The fastest web indexer. (SpiderBot)
The fastest web crawler and indexer.

## Getting Started

2 changes: 1 addition & 1 deletion benches/Cargo.toml
@@ -5,7 +5,7 @@ publish = false
edition = "2021"

[dependencies]
spider = { version = "1.7.23", path = "../spider" }
spider = { version = "1.8.0", path = "../spider" }
criterion = "0.3"

[[bench]]
4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.7.23"
version = "1.8.0"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -15,7 +15,7 @@ publish = false
maintenance = { status = "as-is" }

[dependencies.spider]
version = "1.7.23"
version = "1.8.0"
path = "../spider"
default-features = false

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.7.23"
version = "1.8.0"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
4 changes: 2 additions & 2 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic blocking example crawling a web page, add spider to your `Cargo.toml`:

```toml
[dependencies]
spider = "1.7.23"
spider = "1.8.0"
```

And then the code:
@@ -57,7 +57,7 @@ There is an optional "regex" crate that can be enabled:

```toml
[dependencies]
spider = { version = "1.7.23", features = ["regex"] }
spider = { version = "1.8.0", features = ["regex"] }
```

```rust,no_run
5 changes: 5 additions & 0 deletions spider/src/page.rs
@@ -56,6 +56,11 @@ impl Page {
&self.url
}

/// Html getter for page.
pub fn get_html(&self) -> &String {
&self.html
}

/// HTML returned from Scraper.
fn parse_html(&self) -> Html {
Html::parse_document(&self.html)
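The new getter pairs with the `scrape` API added to `Website` later in this commit. A minimal, hypothetical usage sketch (assuming spider `1.8.0` as a dependency; the target URL is only illustrative):

```rust,no_run
extern crate spider;

use spider::website::Website;

fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");

    // Unlike `crawl()`, `scrape()` keeps the fetched HTML on each page.
    website.scrape();

    for page in website.get_pages() {
        println!("{}: {} bytes of html", page.get_url(), page.get_html().len());
    }
}
```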
84 changes: 82 additions & 2 deletions spider/src/website.rs
@@ -33,6 +33,8 @@ pub struct Website<'a> {
links: HashSet<String>,
/// contains all visited URL.
links_visited: HashSet<String>,
/// contains pages visited
pages: Vec<Page>,
/// callback when a link is found.
pub on_link_find_callback: fn(String) -> String,
/// Robot.txt parser holder.
@@ -47,6 +49,7 @@ impl<'a> Website<'a> {
Self {
configuration: Configuration::new(),
links_visited: HashSet::new(),
pages: Vec::new(),
robot_file_parser: RobotFileParser::new(&format!("{}/robots.txt", domain)), // TODO: lazy establish
links: HashSet::from([format!("{}/", domain)]),
on_link_find_callback: |s| s,
@@ -56,7 +59,11 @@

/// page getter
pub fn get_pages(&self) -> Vec<Page> {
self.links_visited.iter().map(|l| Page::build(l, "")).collect()
if !self.pages.is_empty(){
self.pages.clone()
} else {
self.links_visited.iter().map(|l| Page::build(l, "")).collect()
}
}

/// links visited getter
@@ -110,13 +117,20 @@ impl<'a> Website<'a> {
client
}

/// Start to crawl website blocking with async parallelization
/// Start to crawl website with async parallelization
pub fn crawl(&mut self) {
let client = self.setup();

self.crawl_concurrent(&client);
}

/// Start to scrape website with async parallelization
pub fn scrape(&mut self) {
let client = self.setup();

self.scrape_concurrent(&client);
}

/// Start to crawl website in sync
pub fn crawl_sync(&mut self) {
let client = self.setup();
@@ -203,6 +217,54 @@ impl<'a> Website<'a> {
self.links = &new_links - &self.links_visited;
}
}

/// Start to scrape website concurrently and store html
fn scrape_concurrent(&mut self, client: &Client) {
let pool = self.create_thread_pool();
let delay = self.configuration.delay;
let delay_enabled = delay > 0;
let on_link_find_callback = self.on_link_find_callback;

// crawl while links exists
while !self.links.is_empty() {
let (tx, rx): (Sender<Page>, Receiver<Page>) = channel();

for link in self.links.iter() {
if !self.is_allowed(link) {
continue;
}
log("fetch", link);

self.links_visited.insert(link.into());

let link = link.clone();
let tx = tx.clone();
let cx = client.clone();

pool.spawn(move || {
if delay_enabled {
tokio_sleep(&Duration::from_millis(delay));
}
let link_result = on_link_find_callback(link);
let page = Page::new(&link_result, &cx);

tx.send(page).unwrap();
});
}

drop(tx);

let mut new_links: HashSet<String> = HashSet::new();

rx.into_iter().for_each(|page| {
let links = page.links();
new_links.extend(links);
self.pages.push(page);
});

self.links = &new_links - &self.links_visited;
}
}

/// return `true` if URL:
///
@@ -255,6 +317,24 @@ fn crawl() {
);
}

#[test]
fn scrape() {
let mut website: Website = Website::new("https://choosealicense.com");
website.scrape();
assert!(
website
.links_visited
.contains(&"https://choosealicense.com/licenses/".to_string()),
"{:?}",
website.links_visited
);

assert_eq!(
website.get_pages()[0].get_html().is_empty(),
false
);
}

#[test]
fn crawl_subsequential() {
let mut website: Website = Website::new("https://choosealicense.com");
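For readers skimming the diff, `scrape_concurrent` mirrors `crawl_concurrent`: it fans work out to the pool (one job per pending link), fans the resulting `Page` values back in over an mpsc channel, and only then computes the next frontier of links. A stripped-down sketch of that pattern, with plain `std::thread` standing in for the crate's thread pool and a `String` standing in for the fetched page:

```rust
use std::sync::mpsc::channel;
use std::thread;

fn main() {
    let links = vec![
        "https://example.com/a".to_string(),
        "https://example.com/b".to_string(),
    ];
    let (tx, rx) = channel::<String>();

    for link in links {
        let tx = tx.clone();
        thread::spawn(move || {
            // A real worker would fetch and parse the page here.
            tx.send(format!("fetched {}", link)).unwrap();
        });
    }

    // Drop the original sender so `rx` stops blocking once every worker is done.
    drop(tx);

    for result in rx {
        println!("{}", result);
    }
}
```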
5 changes: 3 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.7.23"
version = "1.8.0"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -18,13 +18,14 @@ maintenance = { status = "as-is" }
[dependencies]
clap = { version = "3.1.9", features = ["derive"] }
env_logger = "0.9.0"
serde_json = "1.0.81"

[build-dependencies]
quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.7.23"
version = "1.8.0"
path = "../spider"
default-features = false

Expand Down
35 changes: 35 additions & 0 deletions spider_cli/src/main.rs
@@ -1,5 +1,6 @@
extern crate spider;
extern crate env_logger;
extern crate serde_json;

pub mod options;

@@ -53,6 +54,40 @@ fn main() {
io::stdout().write_all(format!("{:?}", links).as_bytes()).unwrap();
}

}
Some(Commands::SCRAPE { output_html, output_links }) => {
use serde_json::{json};

website.scrape();

let mut page_objects: Vec<_> = vec![];

for page in website.get_pages() {
let mut links: Vec<String> = vec![];
let mut html: &String = &String::new();

if *output_links {
let page_links = page.links();
links.extend(page_links);
}

if *output_html {
html = page.get_html();
}

let page_json = json!({
"url": page.get_url(),
"links": links,
"html": html,
});

page_objects.push(page_json);
}

let j = serde_json::to_string_pretty(&page_objects).unwrap();

io::stdout().write_all(j.as_bytes()).unwrap();

}
None => {}
}
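The SCRAPE handler prints a pretty-printed JSON array to stdout with one object per scraped page, carrying the `url`, `links`, and `html` keys built above. A hypothetical downstream consumer could parse that output roughly like this (a sketch only; it assumes a `serde_json` dependency, and that `links`/`html` simply come through as an empty array/string when the matching flag is off):

```rust
extern crate serde_json;

use serde_json::Value;
use std::io::{self, Read};

fn main() {
    let mut input = String::new();
    io::stdin().read_to_string(&mut input).unwrap();

    // One object per page: { "url": ..., "links": [...], "html": "..." }.
    let pages: Vec<Value> = serde_json::from_str(&input).expect("valid JSON array");

    for page in pages {
        let url = page["url"].as_str().unwrap_or_default();
        let links = page["links"].as_array().map(|l| l.len()).unwrap_or(0);
        let html = page["html"].as_str().map(|h| h.len()).unwrap_or(0);
        println!("{}: {} links, {} bytes of html", url, links, html);
    }
}
```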
11 changes: 10 additions & 1 deletion spider_cli/src/options/sub_command.rs
@@ -2,7 +2,7 @@ use clap::Subcommand;

#[derive(Subcommand)]
pub enum Commands {
/// crawl the website.
/// crawl the website extracting links.
CRAWL {
/// sequentially one by one crawl pages
#[clap(short, long)]
@@ -11,4 +11,13 @@
#[clap(short, long)]
output_links: bool,
},
/// scrape the website extracting html and links.
SCRAPE {
/// stdout all pages links crawled
#[clap(short, long)]
output_links: bool,
/// stdout all pages html crawled
#[clap(long)]
output_html: bool,
},
}
