feat(chrome): add headless chrome
j-mendez committed Nov 29, 2023
1 parent 5517f15 commit 97188f8
Showing 10 changed files with 196 additions and 49 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -14,7 +14,7 @@ indexmap = "2.1.0"
napi = { version = "2.14.1", default-features = false, features = ["napi4", "async", "tokio_rt"] }
napi-derive = "2.14.2"
num_cpus = "1.16.0"
spider = { version = "1.50.14", features = ["napi", "budget", "cron", "regex", "cookies", "socks"] }
spider = { version = "1.50.14", features = ["napi", "budget", "cron", "regex", "cookies", "socks", "chrome"] }

[target.x86_64-unknown-linux-gnu.dependencies]
openssl-sys = { version = "0.9.96", features = ["vendored"] }
34 changes: 32 additions & 2 deletions README.md
@@ -1,6 +1,6 @@
# spider-rs

The [spider](https://github.com/spider-rs/spider) project ported to nodejs via napi.
The [spider](https://github.com/spider-rs/spider) project ported to nodejs.

## Getting Started

@@ -19,7 +19,7 @@ await website.crawl(onPageEvent);
console.log(website.getLinks());
```

Collect the resources for a website. View [config](https://docs.rs/spider/latest/spider/website/struct.Website.html) for options, when using convert the method to camelCase.
Collect the resources for a website.

```ts
import { Website } from "@spider-rs/spider-rs";
@@ -39,6 +39,36 @@ await website.scrape();
console.log(website.getPages());
```

Run the crawl in the background on another thread.

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

const onPageEvent = (_err, value) => {
console.log(value);
};

await website.crawl(onPageEvent, true);
// returns immediately while the crawl runs in the background
```

Use headless Chrome rendering for crawls.

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

const onPageEvent = (_err, value) => {
console.log(value);
};

await website.crawl(onPageEvent, false, true);
console.log(website.getLinks());
```

Cron jobs can be done with the following.

9 changes: 8 additions & 1 deletion __test__/index.spec.ts
@@ -3,7 +3,7 @@ import { crawl, Website, Page, type NPage, Cron } from "../index.js";

const TEST_URL = "https://choosealicense.com";

test("crawl native", async (t) => {
test("crawl shortcut native", async (t) => {
const { links, pages } = await crawl(TEST_URL);

t.assert(links.length > 1, "should be more than one link");
@@ -138,3 +138,10 @@ test("new single page", async (t) => {
t.assert(page.getHtml().length >= 100, "should be valid html");
t.assert(page.getBytes().length >= 100, "should be valid bytes");
});

test("new website native headless", async (t) => {
const website = new Website(TEST_URL);
await website.crawl(undefined, false, true);

t.assert(website.getLinks().length > 1, "should be more than one link");
});
1 change: 1 addition & 0 deletions book/src/SUMMARY.md
@@ -11,6 +11,7 @@

- [Website](./website.md)
- [Page](./page.md)
- [Environment](./env.md)

# Usage

20 changes: 19 additions & 1 deletion book/src/crawl.md
@@ -47,7 +47,6 @@ await website.crawl(onPageEvent, true);
// this will run instantly as the crawl is in the background
```


## Subscriptions

You can setup many subscriptions to run events when a crawl happens.
@@ -68,3 +67,22 @@ await website.crawl(onPageEvent);
website.unsubscribe(subscriptionID);
// this will run instantly as the crawl is in the background
```

## Headless Chrome

Headless Chrome rendering can be enabled by setting the third param of `crawl` or `scrape` to `true`.
If the `CHROME_URL` env variable is set, the crawler attempts to connect to a remote Chrome instance, falling back to launching Chrome locally. Using a remote connection via `CHROME_URL` can drastically speed up runs.

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

const onPageEvent = (err, value) => {
console.log(value);
};

// all params are optional. The third param determines headless rendering.
await website.crawl(onPageEvent, false, true);
```
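
Based on the signature above, the background and headless params can also be combined to stream a headless crawl from a background thread; a minimal sketch:

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

const onPageEvent = (err, value) => {
  console.log(value.url);
};

// second param runs the crawl in the background,
// third param enables headless Chrome rendering.
await website.crawl(onPageEvent, true, true);
// returns immediately; pages stream through onPageEvent.
```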
11 changes: 11 additions & 0 deletions book/src/env.md
@@ -0,0 +1,11 @@
# Environment

Environment variables that adjust how the project runs.

## CHROME_URL

Set the Chrome URL to connect to a remote instance instead of launching Chrome locally.

```sh
CHROME_URL=http://localhost:9222
```
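
A sketch of how this fits together, assuming a Chrome instance is already listening on that port (for example, one launched with `--headless --remote-debugging-port=9222`) and that the variable is exported before Node starts:

```ts
import { Website } from "@spider-rs/spider-rs";

// run with the variable set in the shell, e.g.
//   CHROME_URL=http://localhost:9222 node crawl.mjs
const website = new Website("https://rsseau.fr");

// the third param enables headless Chrome rendering; with CHROME_URL set,
// the crawler connects to the remote instance instead of launching Chrome.
await website.crawl(undefined, false, true);
console.log(website.getLinks());
```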
19 changes: 19 additions & 0 deletions book/src/scrape.md
@@ -13,3 +13,22 @@ await website.scrape();
// [ { url: "https://rsseau.fr/blog", html: "<html>...</html>"}, ...]
console.log(website.getPages());
```

## Headless Chrome

Headless Chrome rendering can be enabled by setting the third param of `crawl` or `scrape` to `true`.
If the `CHROME_URL` env variable is set, the crawler attempts to connect to a remote Chrome instance, falling back to launching Chrome locally. Using a remote connection via `CHROME_URL` can drastically speed up runs.

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

const onPageEvent = (err, value) => {
console.log(value);
};

// all params are optional. The third param determines headless rendering.
await website.scrape(onPageEvent, false, true);
```
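
The scraped results can also be collected without a page event and read back afterwards, as in the plain scrape earlier on this page; a short sketch:

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");
// third param enables headless Chrome rendering.
await website.scrape(undefined, false, true);

// the scraped pages (url + html) are then available:
const pages = website.getPages();
console.log(pages.length, "pages scraped");
```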
8 changes: 4 additions & 4 deletions index.d.ts
@@ -10,7 +10,7 @@ export interface NPage {
/** the content of the page found */
content: string
}
/** crawl a website gathering all links to array */
/** crawl a website using HTTP gathering all links and html. */
export function crawl(url: string): Promise<NWebsite>
/** a simple page object */
export class Page {
@@ -43,11 +43,11 @@ export class Website {
/** remove a subscription listener */
unsubscribe(id?: number | undefined | null): boolean
/** crawl a website */
crawl(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null): Promise<void>
crawl(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise<void>
/** scrape a website */
scrape(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise<void>
/** run the cron */
runCron(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null): Promise<Cron>
/** scrape a website */
scrape(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null): Promise<void>
/** get all the links of a website */
getLinks(): Array<string>
/** get all the pages of a website - requires calling website.scrape */
8 changes: 4 additions & 4 deletions src/page.rs
@@ -24,17 +24,17 @@ impl Page {
/// a new page
pub fn new(url: String, subdomains: Option<bool>, tld: Option<bool>) -> Self {
Page {
url,
subdomains,
tld,
url,
subdomains,
tld,
..Default::default()
}
}

#[napi]
/// get the page content
pub async unsafe fn fetch(&mut self) -> &Self {
self.inner = Some(spider::page::Page::new(&self.url, &Default::default()).await);
self.inner = Some(spider::page::Page::new_page(&self.url, &Default::default()).await);
self.selectors = spider::page::get_page_selectors(
&self.url,
self.subdomains.unwrap_or_default(),
133 changes: 97 additions & 36 deletions src/website.rs
@@ -28,7 +28,7 @@ pub struct NWebsite {
}

#[napi]
/// crawl a website gathering all links to array
/// crawl a website using HTTP gathering all links and html.
pub async fn crawl(url: String) -> NWebsite {
let mut website = spider::website::Website::new(&url);
let mut rx2 = website
@@ -54,7 +54,7 @@ pub async fn crawl(url: String) -> NWebsite {
});

spider::tokio::spawn(async move {
website.crawl().await;
website.crawl_raw().await;
});

let mut pages = Vec::new();
@@ -161,10 +161,14 @@ impl Website {
pub async unsafe fn crawl(
&mut self,
on_page_event: Option<napi::threadsafe_function::ThreadsafeFunction<NPage>>,
// run the crawl in the background
background: Option<bool>,
// headless chrome rendering
headless: Option<bool>,
) {
// only run in background if on_page_event is handled for streaming.
let background = background.is_some() && background.unwrap_or_default() == true;
let background = background.is_some() && background.unwrap_or_default();
let headless = headless.is_some() && headless.unwrap_or_default();

match on_page_event {
Some(callback) => {
@@ -188,7 +192,11 @@ });
});

spider::tokio::spawn(async move {
website.crawl().await;
if headless {
website.crawl().await;
} else {
website.crawl_raw().await;
}
});
} else {
let mut rx2 = self
@@ -207,10 +215,94 @@ impl Website {
);
}
});

if headless {
self.inner.crawl().await;
} else {
self.inner.crawl_raw().await;
}
}
}
_ => {
if headless {
self.inner.crawl().await;
} else {
self.inner.crawl_raw().await;
}
}
}
}

#[napi]
/// scrape a website
pub async unsafe fn scrape(
&mut self,
on_page_event: Option<napi::threadsafe_function::ThreadsafeFunction<NPage>>,
background: Option<bool>,
headless: Option<bool>,
) {
let headless = headless.is_some() && headless.unwrap_or_default();

match on_page_event {
Some(callback) => {
if background.unwrap_or_default() {
let mut website = self.inner.clone();

let mut rx2 = website
.subscribe(*BUFFER / 2)
.expect("sync feature should be enabled");

spider::tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
callback.call(
Ok(NPage {
url: res.get_url().into(),
content: res.get_html().into(),
}),
napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking,
);
}
});

spider::tokio::spawn(async move {
if headless {
website.scrape().await;
} else {
website.scrape_raw().await;
}
});
} else {
let mut rx2 = self
.inner
.subscribe(*BUFFER / 2)
.expect("sync feature should be enabled");

spider::tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
callback.call(
Ok(NPage {
url: res.get_url().into(),
content: res.get_html().into(),
}),
napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking,
);
}
});

if headless {
self.inner.scrape().await;
} else {
self.inner.scrape_raw().await;
}
}
}
_ => {
if headless {
self.inner.scrape().await;
} else {
self.inner.scrape_raw().await;
}
}
_ => self.inner.crawl().await,
}
}

@@ -249,37 +341,6 @@ impl Website {
Cron { inner, cron_handle }
}

#[napi]
/// scrape a website
pub async unsafe fn scrape(
&mut self,
on_page_event: Option<napi::threadsafe_function::ThreadsafeFunction<NPage>>,
) {
match on_page_event {
Some(callback) => {
let mut rx2 = self
.inner
.subscribe(*BUFFER / 2)
.expect("sync feature should be enabled");

spider::tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
callback.call(
Ok(NPage {
url: res.get_url().into(),
content: res.get_html().into(),
}),
napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking,
);
}
});

self.inner.scrape().await;
}
_ => self.inner.scrape().await,
}
}

#[napi]
/// get all the links of a website
pub fn get_links(&self) -> Vec<String> {
