From 97188f84fe1f37d604f653634c6b23ec63079f8a Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Wed, 29 Nov 2023 12:20:55 -0500
Subject: [PATCH] feat(chrome): add headless chrome

---
 Cargo.toml             |   2 +-
 README.md              |  34 ++++++++++-
 __test__/index.spec.ts |   9 ++-
 book/src/SUMMARY.md    |   1 +
 book/src/crawl.md      |  20 ++++++-
 book/src/env.md        |  11 ++++
 book/src/scrape.md     |  19 ++++++
 index.d.ts             |   8 +--
 src/page.rs            |   8 +--
 src/website.rs         | 133 ++++++++++++++++++++++++++++++-----------
 10 files changed, 196 insertions(+), 49 deletions(-)
 create mode 100644 book/src/env.md

diff --git a/Cargo.toml b/Cargo.toml
index d1deb33..1a0fa65 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,7 +14,7 @@ indexmap = "2.1.0"
 napi = { version = "2.14.1", default-features = false, features = ["napi4", "async", "tokio_rt"] }
 napi-derive = "2.14.2"
 num_cpus = "1.16.0"
-spider = { version = "1.50.14", features = ["napi", "budget", "cron", "regex", "cookies", "socks"] }
+spider = { version = "1.50.14", features = ["napi", "budget", "cron", "regex", "cookies", "socks", "chrome"] }

 [target.x86_64-unknown-linux-gnu.dependencies]
 openssl-sys = { version = "0.9.96", features = ["vendored"] }
diff --git a/README.md b/README.md
index de42d56..00f322a 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # spider-rs

-The [spider](https://github.com/spider-rs/spider) project ported to nodejs via napi.
+The [spider](https://github.com/spider-rs/spider) project ported to nodejs.

 ## Getting Started

@@ -19,7 +19,7 @@ await website.crawl(onPageEvent);
 console.log(website.getLinks());
 ```

-Collect the resources for a website. View [config](https://docs.rs/spider/latest/spider/website/struct.Website.html) for options, when using convert the method to camelCase.
+Collect the resources for a website.

 ```ts
 import { Website } from "@spider-rs/spider-rs";
@@ -39,6 +39,44 @@ await website.scrape();
 console.log(website.getPages());
 ```

+Run the crawls in the background on another thread.
+
+```ts
+import { Website } from "@spider-rs/spider-rs";
+
+const website = new Website("https://rsseau.fr");
+
+const onPageEvent = (_err, value) => {
+  console.log(value);
+};
+
+await website.crawl(onPageEvent, true);
+// runs immediately
+```
+
+Use headless Chrome rendering for crawls.
+
+```ts
+import { Website } from "@spider-rs/spider-rs";
+
+const website = new Website("https://rsseau.fr");
+
+const onPageEvent = (_err, value) => {
+  console.log(value);
+};
+
+await website.crawl(onPageEvent, false, true);
+console.log(website.getLinks());
+```
+
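+If a Chrome instance is already running with remote debugging enabled, you can set the `CHROME_URL` env variable and crawls will connect to it instead of launching Chrome locally. The commands below are only an example setup; the Chrome binary name, debugging port, and script name depend on your environment.
+
+```sh
+# example: start headless Chrome with remote debugging, then point the crawler at it
+google-chrome --headless --remote-debugging-port=9222 &
+CHROME_URL=http://localhost:9222 node my-crawl.mjs
+```
+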
 Cron jobs can be done with the following.

 ```ts
diff --git a/__test__/index.spec.ts b/__test__/index.spec.ts
index 2c5c530..6b877ae 100644
--- a/__test__/index.spec.ts
+++ b/__test__/index.spec.ts
@@ -3,7 +3,7 @@ import { crawl, Website, Page, type NPage, Cron } from "../index.js";

 const TEST_URL = "https://choosealicense.com";

-test("crawl native", async (t) => {
+test("crawl shortcut native", async (t) => {
   const { links, pages } = await crawl(TEST_URL);

   t.assert(links.length > 1, "should be more than one link");
@@ -138,3 +138,10 @@ test("new single page", async (t) => {
   t.assert(page.getHtml().length >= 100, "should be valid html");
   t.assert(page.getBytes().length >= 100, "should be valid bytes");
 });
+
+test("new website native headless", async (t) => {
+  const website = new Website(TEST_URL);
+  await website.crawl(undefined, false, true);
+
+  t.assert(website.getLinks().length > 1, "should be more than one link");
+});
\ No newline at end of file
diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md
index 5a5f37e..ceb1649 100644
--- a/book/src/SUMMARY.md
+++ b/book/src/SUMMARY.md
@@ -11,6 +11,7 @@

 - [Website](./website.md)
 - [Page](./page.md)
+- [Environment](./env.md)

 # Usage

diff --git a/book/src/crawl.md b/book/src/crawl.md
index 88a6d93..847561d 100644
--- a/book/src/crawl.md
+++ b/book/src/crawl.md
@@ -47,7 +47,6 @@ await website.crawl(onPageEvent, true);
 // this will run instantly as the crawl is in the background
 ```

-
 ## Subscriptions

 You can setup many subscriptions to run events when a crawl happens.
@@ -68,3 +67,22 @@ await website.crawl(onPageEvent);
 website.unsubscribe(subscriptionID);
 // this will run instantly as the crawl is in the background
 ```
+
+## Headless Chrome
+
+Headless Chrome rendering can be enabled by setting the third param of `crawl` or `scrape` to `true`.
+If the `CHROME_URL` env variable is set, the crawler will attempt to connect to that remote Chrome instance, falling back to launching Chrome locally. Using a remote connection with `CHROME_URL` can
+drastically speed up runs.
+
+```ts
+import { Website } from "@spider-rs/spider-rs";
+
+const website = new Website("https://rsseau.fr");
+
+const onPageEvent = (err, value) => {
+  console.log(value);
+};
+
+// all params are optional. The third param determines headless rendering.
+await website.crawl(onPageEvent, false, true);
+```
diff --git a/book/src/env.md b/book/src/env.md
new file mode 100644
index 0000000..3d80cbb
--- /dev/null
+++ b/book/src/env.md
@@ -0,0 +1,11 @@
+# Environment
+
+Environment variables that adjust how the project runs.
+
+## CHROME_URL
+
+Set the Chrome URL to connect to a remote Chrome instance instead of launching one locally.
+
+```sh
+CHROME_URL=http://localhost:9222
+```
\ No newline at end of file
diff --git a/book/src/scrape.md b/book/src/scrape.md
index 705945b..86f9c66 100644
--- a/book/src/scrape.md
+++ b/book/src/scrape.md
@@ -13,3 +13,23 @@ await website.scrape();
 // [ { url: "https://rsseau.fr/blog", html: "..."}, ...]
 console.log(website.getPages());
 ```
+
+## Headless Chrome
+
+Headless Chrome rendering can be enabled by setting the third param of `crawl` or `scrape` to `true`.
+If the `CHROME_URL` env variable is set, the crawler will attempt to connect to that remote Chrome instance, falling back to launching Chrome locally. Using a remote connection with `CHROME_URL` can
+drastically speed up runs.
+
+```ts
+import { Website } from "@spider-rs/spider-rs";
+
+const website = new Website("https://rsseau.fr");
+
+const onPageEvent = (err, value) => {
+  console.log(value);
+};
+
+// all params are optional. The third param determines headless rendering.
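+// if CHROME_URL is set, the crawler connects to that Chrome instance and only launches Chrome locally as a fallback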
+await website.scrape(onPageEvent, false, true);
+```
\ No newline at end of file
diff --git a/index.d.ts b/index.d.ts
index 8d8f27d..e2c24a0 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -10,7 +10,7 @@ export interface NPage {
   /** the content of the page found */
   content: string
 }
-/** crawl a website gathering all links to array */
+/** crawl a website using HTTP, gathering all links and html. */
 export function crawl(url: string): Promise<NWebsite>
 /** a simple page object */
 export class Page {
@@ -43,11 +43,11 @@ export class Website {
   /** remove a subscription listener */
   unsubscribe(id?: number | undefined | null): boolean
   /** crawl a website */
-  crawl(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null): Promise<void>
+  crawl(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise<void>
+  /** scrape a website */
+  scrape(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise<void>
   /** run the cron */
   runCron(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null): Promise<Cron>
-  /** scrape a website */
-  scrape(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null): Promise<void>
   /** get all the links of a website */
   getLinks(): Array<string>
   /** get all the pages of a website - requires calling website.scrape */
diff --git a/src/page.rs b/src/page.rs
index e18c815..609074c 100644
--- a/src/page.rs
+++ b/src/page.rs
@@ -24,9 +24,9 @@ impl Page {
   /// a new page
   pub fn new(url: String, subdomains: Option<bool>, tld: Option<bool>) -> Self {
     Page {
-        url,
-        subdomains,
-        tld,
+      url,
+      subdomains,
+      tld,
       ..Default::default()
     }
   }
@@ -34,7 +34,7 @@ impl Page {
   #[napi]
   /// get the page content
   pub async unsafe fn fetch(&mut self) -> &Self {
-    self.inner = Some(spider::page::Page::new(&self.url, &Default::default()).await);
+    self.inner = Some(spider::page::Page::new_page(&self.url, &Default::default()).await);
     self.selectors = spider::page::get_page_selectors(
       &self.url,
       self.subdomains.unwrap_or_default(),
diff --git a/src/website.rs b/src/website.rs
index 1f6dab1..1308645 100644
--- a/src/website.rs
+++ b/src/website.rs
@@ -28,7 +28,7 @@ pub struct NWebsite {
 }

 #[napi]
-/// crawl a website gathering all links to array
+/// crawl a website using HTTP, gathering all links and html.
 pub async fn crawl(url: String) -> NWebsite {
   let mut website = spider::website::Website::new(&url);
   let mut rx2 = website
@@ -54,7 +54,7 @@ pub async fn crawl(url: String) -> NWebsite {
   });

   spider::tokio::spawn(async move {
-    website.crawl().await;
+    website.crawl_raw().await;
   });

   let mut pages = Vec::new();
@@ -161,10 +161,14 @@ impl Website {
   pub async unsafe fn crawl(
     &mut self,
     on_page_event: Option<napi::threadsafe_function::ThreadsafeFunction<NPage>>,
+    // run the crawl in the background
     background: Option<bool>,
+    // headless chrome rendering
+    headless: Option<bool>,
   ) {
     // only run in background if on_page_event is handled for streaming.
-    let background = background.is_some() && background.unwrap_or_default() == true;
+    let background = background.is_some() && background.unwrap_or_default();
+    let headless = headless.is_some() && headless.unwrap_or_default();

     match on_page_event {
       Some(callback) => {
@@ -188,7 +192,11 @@ impl Website {
         });

         spider::tokio::spawn(async move {
-            website.crawl().await;
+            if headless {
+              website.crawl().await;
+            } else {
+              website.crawl_raw().await;
+            }
         });
       } else {
         let mut rx2 = self
@@ -207,10 +215,94 @@ impl Website {
               );
             }
           });
+
+          if headless {
           self.inner.crawl().await;
+          } else {
+            self.inner.crawl_raw().await;
+          }
+        }
+      }
+      _ => {
+        if headless {
+          self.inner.crawl().await;
+        } else {
+          self.inner.crawl_raw().await;
+        }
+      }
+    }
+  }
+
+  #[napi]
+  /// scrape a website
+  pub async unsafe fn scrape(
+    &mut self,
+    on_page_event: Option<napi::threadsafe_function::ThreadsafeFunction<NPage>>,
+    background: Option<bool>,
+    headless: Option<bool>,
+  ) {
+    let headless = headless.is_some() && headless.unwrap_or_default();
+
+    match on_page_event {
+      Some(callback) => {
+        if background.unwrap_or_default() {
+          let mut website = self.inner.clone();
+
+          let mut rx2 = website
+            .subscribe(*BUFFER / 2)
+            .expect("sync feature should be enabled");
+
+          spider::tokio::spawn(async move {
+            while let Ok(res) = rx2.recv().await {
+              callback.call(
+                Ok(NPage {
+                  url: res.get_url().into(),
+                  content: res.get_html().into(),
+                }),
+                napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking,
+              );
+            }
+          });
+
+          spider::tokio::spawn(async move {
+            if headless {
+              website.scrape().await;
+            } else {
+              website.scrape_raw().await;
+            }
+          });
+        } else {
+          let mut rx2 = self
+            .inner
+            .subscribe(*BUFFER / 2)
+            .expect("sync feature should be enabled");
+
+          spider::tokio::spawn(async move {
+            while let Ok(res) = rx2.recv().await {
+              callback.call(
+                Ok(NPage {
+                  url: res.get_url().into(),
+                  content: res.get_html().into(),
+                }),
+                napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking,
+              );
+            }
+          });
+
+          if headless {
+            self.inner.scrape().await;
+          } else {
+            self.inner.scrape_raw().await;
+          }
+        }
+      }
+      _ => {
+        if headless {
+          self.inner.scrape().await;
+        } else {
+          self.inner.scrape_raw().await;
         }
       }
-      _ => self.inner.crawl().await,
     }
   }

@@ -249,37 +341,6 @@ impl Website {
     Cron { inner, cron_handle }
   }

-  #[napi]
-  /// scrape a website
-  pub async unsafe fn scrape(
-    &mut self,
-    on_page_event: Option<napi::threadsafe_function::ThreadsafeFunction<NPage>>,
-  ) {
-    match on_page_event {
-      Some(callback) => {
-        let mut rx2 = self
-          .inner
-          .subscribe(*BUFFER / 2)
-          .expect("sync feature should be enabled");
-
-        spider::tokio::spawn(async move {
-          while let Ok(res) = rx2.recv().await {
-            callback.call(
-              Ok(NPage {
-                url: res.get_url().into(),
-                content: res.get_html().into(),
-              }),
-              napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking,
-            );
-          }
-        });
-
-        self.inner.scrape().await;
-      }
-      _ => self.inner.scrape().await,
-    }
-  }
-
   #[napi]
   /// get all the links of a website
   pub fn get_links(&self) -> Vec<String> {