feat(chrome): add headless chrome
j-mendez committed Nov 29, 2023
1 parent 5517f15 commit 97188f8
Showing 10 changed files with 196 additions and 49 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -14,7 +14,7 @@ indexmap = "2.1.0"
napi = { version = "2.14.1", default-features = false, features = ["napi4", "async", "tokio_rt"] }
napi-derive = "2.14.2"
num_cpus = "1.16.0"
spider = { version = "1.50.14", features = ["napi", "budget", "cron", "regex", "cookies", "socks"] }
spider = { version = "1.50.14", features = ["napi", "budget", "cron", "regex", "cookies", "socks", "chrome"] }

[target.x86_64-unknown-linux-gnu.dependencies]
openssl-sys = { version = "0.9.96", features = ["vendored"] }
34 changes: 32 additions & 2 deletions README.md
@@ -1,6 +1,6 @@
# spider-rs

The [spider](https://github.com/spider-rs/spider) project ported to nodejs via napi.
The [spider](https://github.com/spider-rs/spider) project ported to nodejs.

## Getting Started

@@ -19,7 +19,7 @@ await website.crawl(onPageEvent);
console.log(website.getLinks());
```

Collect the resources for a website. View [config](https://docs.rs/spider/latest/spider/website/struct.Website.html) for options, when using convert the method to camelCase.
Collect the resources for a website.

```ts
import { Website } from "@spider-rs/spider-rs";
@@ -39,6 +39,36 @@ await website.scrape();
console.log(website.getPages());
```

Run the crawl in the background on another thread.

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

const onPageEvent = (_err, value) => {
console.log(value);
};

await website.crawl(onPageEvent, true);
// returns immediately while the crawl runs in the background
```

Use headless Chrome rendering for crawls.

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

const onPageEvent = (_err, value) => {
console.log(value);
};

await website.crawl(onPageEvent, false, true);
console.log(website.getLinks());
```

Cron jobs can be done with the following.

9 changes: 8 additions & 1 deletion __test__/index.spec.ts
@@ -3,7 +3,7 @@ import { crawl, Website, Page, type NPage, Cron } from "../index.js";

const TEST_URL = "https://choosealicense.com";

test("crawl native", async (t) => {
test("crawl shortcut native", async (t) => {
const { links, pages } = await crawl(TEST_URL);

t.assert(links.length > 1, "should be more than one link");
@@ -138,3 +138,10 @@ test("new single page", async (t) => {
t.assert(page.getHtml().length >= 100, "should be valid html");
t.assert(page.getBytes().length >= 100, "should be valid bytes");
});

test("new website native headless", async (t) => {
const website = new Website(TEST_URL);
await website.crawl(undefined, false, true);

t.assert(website.getLinks().length > 1, "should be more than one link");
});
1 change: 1 addition & 0 deletions book/src/SUMMARY.md
@@ -11,6 +11,7 @@

- [Website](./website.md)
- [Page](./page.md)
- [Environment](./env.md)

# Usage

20 changes: 19 additions & 1 deletion book/src/crawl.md
@@ -47,7 +47,6 @@ await website.crawl(onPageEvent, true);
// this will run instantly as the crawl is in the background
```


## Subscriptions

You can setup many subscriptions to run events when a crawl happens.
@@ -68,3 +67,22 @@ await website.crawl(onPageEvent);
website.unsubscribe(subscriptionID);
// this will run instantly as the crawl is in the background
```

## Headless Chrome

Headless Chrome rendering can be enabled by setting the third param of `crawl` or `scrape` to `true`.
If the `CHROME_URL` env variable is set, the crawler attempts to connect to a remote Chrome instance, falling back to launching Chrome locally. Using a remote connection via `CHROME_URL` can drastically speed up runs.

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

const onPageEvent = (err, value) => {
console.log(value);
};

// all params are optional. The third param determines headless rendering.
await website.crawl(onPageEvent, false, true);
```
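
Based on the signature above, the background and headless params can also be combined to stream a headless crawl from a background thread; a minimal sketch:

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

const onPageEvent = (err, value) => {
  console.log(value.url);
};

// second param runs the crawl in the background,
// third param enables headless Chrome rendering.
await website.crawl(onPageEvent, true, true);
// returns immediately; pages stream through onPageEvent.
```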
11 changes: 11 additions & 0 deletions book/src/env.md
@@ -0,0 +1,11 @@
# Environment

Environment variables that adjust how the project runs.

## CHROME_URL

Set the Chrome URL to connect to a remote instance instead of launching Chrome locally.

```sh
CHROME_URL=http://localhost:9222
```
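
A sketch of how this fits together, assuming a Chrome instance is already listening on that port (for example, one launched with `--headless --remote-debugging-port=9222`) and that the variable is exported before Node starts:

```ts
import { Website } from "@spider-rs/spider-rs";

// run with the variable set in the shell, e.g.
//   CHROME_URL=http://localhost:9222 node crawl.mjs
const website = new Website("https://rsseau.fr");

// the third param enables headless Chrome rendering; with CHROME_URL set,
// the crawler connects to the remote instance instead of launching Chrome.
await website.crawl(undefined, false, true);
console.log(website.getLinks());
```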
19 changes: 19 additions & 0 deletions book/src/scrape.md
@@ -13,3 +13,22 @@ await website.scrape();
// [ { url: "https://rsseau.fr/blog", html: "<html>...</html>"}, ...]
console.log(website.getPages());
```

## Headless Chrome

Headless Chrome rendering can be enabled by setting the third param of `crawl` or `scrape` to `true`.
If the `CHROME_URL` env variable is set, the crawler attempts to connect to a remote Chrome instance, falling back to launching Chrome locally. Using a remote connection via `CHROME_URL` can drastically speed up runs.

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

const onPageEvent = (err, value) => {
console.log(value);
};

// all params are optional. The third param determines headless rendering.
await website.scrape(onPageEvent, false, true);
```
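
The scraped results can also be collected without a page event and read back afterwards, as in the plain scrape earlier on this page; a short sketch:

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");
// third param enables headless Chrome rendering.
await website.scrape(undefined, false, true);

// the scraped pages (url + html) are then available:
const pages = website.getPages();
console.log(pages.length, "pages scraped");
```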
8 changes: 4 additions & 4 deletions index.d.ts
@@ -10,7 +10,7 @@ export interface NPage {
/** the content of the page found */
content: string
}
/** crawl a website gathering all links to array */
/** crawl a website using HTTP gathering all links and html. */
export function crawl(url: string): Promise<NWebsite>
/** a simple page object */
export class Page {
@@ -43,11 +43,11 @@ export class Website {
/** remove a subscription listener */
unsubscribe(id?: number | undefined | null): boolean
/** crawl a website */
crawl(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null): Promise<void>
crawl(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise<void>
/** scrape a website */
scrape(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise<void>
/** run the cron */
runCron(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null): Promise<Cron>
/** scrape a website */
scrape(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null): Promise<void>
/** get all the links of a website */
getLinks(): Array<string>
/** get all the pages of a website - requires calling website.scrape */
8 changes: 4 additions & 4 deletions src/page.rs
@@ -24,17 +24,17 @@ impl Page {
/// a new page
pub fn new(url: String, subdomains: Option<bool>, tld: Option<bool>) -> Self {
Page {
url,
subdomains,
tld,
url,
subdomains,
tld,
..Default::default()
}
}

#[napi]
/// get the page content
pub async unsafe fn fetch(&mut self) -> &Self {
self.inner = Some(spider::page::Page::new(&self.url, &Default::default()).await);
self.inner = Some(spider::page::Page::new_page(&self.url, &Default::default()).await);
self.selectors = spider::page::get_page_selectors(
&self.url,
self.subdomains.unwrap_or_default(),
133 changes: 97 additions & 36 deletions src/website.rs
@@ -28,7 +28,7 @@ pub struct NWebsite {
}

#[napi]
/// crawl a website gathering all links to array
/// crawl a website using HTTP gathering all links and html.
pub async fn crawl(url: String) -> NWebsite {
let mut website = spider::website::Website::new(&url);
let mut rx2 = website
@@ -54,7 +54,7 @@ pub async fn crawl(url: String) -> NWebsite {
});

spider::tokio::spawn(async move {
website.crawl().await;
website.crawl_raw().await;
});

let mut pages = Vec::new();
@@ -161,10 +161,14 @@ impl Website {
pub async unsafe fn crawl(
&mut self,
on_page_event: Option<napi::threadsafe_function::ThreadsafeFunction<NPage>>,
// run the crawl in the background
background: Option<bool>,
// headless chrome rendering
headless: Option<bool>,
) {
// only run in background if on_page_event is handled for streaming.
let background = background.is_some() && background.unwrap_or_default() == true;
let background = background.is_some() && background.unwrap_or_default();
let headless = headless.is_some() && headless.unwrap_or_default();

match on_page_event {
Some(callback) => {
@@ -188,7 +192,11 @@ });
});

spider::tokio::spawn(async move {
website.crawl().await;
if headless {
website.crawl().await;
} else {
website.crawl_raw().await;
}
});
} else {
let mut rx2 = self
@@ -207,10 +215,94 @@ impl Website {
);
}
});

if headless {
self.inner.crawl().await;
} else {
self.inner.crawl_raw().await;
}
}
}
_ => {
if headless {
self.inner.crawl().await;
} else {
self.inner.crawl_raw().await;
}
}
}
}

#[napi]
/// scrape a website
pub async unsafe fn scrape(
&mut self,
on_page_event: Option<napi::threadsafe_function::ThreadsafeFunction<NPage>>,
background: Option<bool>,
headless: Option<bool>,
) {
let headless = headless.is_some() && headless.unwrap_or_default();

match on_page_event {
Some(callback) => {
if background.unwrap_or_default() {
let mut website = self.inner.clone();

let mut rx2 = website
.subscribe(*BUFFER / 2)
.expect("sync feature should be enabled");

spider::tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
callback.call(
Ok(NPage {
url: res.get_url().into(),
content: res.get_html().into(),
}),
napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking,
);
}
});

spider::tokio::spawn(async move {
if headless {
website.scrape().await;
} else {
website.scrape_raw().await;
}
});
} else {
let mut rx2 = self
.inner
.subscribe(*BUFFER / 2)
.expect("sync feature should be enabled");

spider::tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
callback.call(
Ok(NPage {
url: res.get_url().into(),
content: res.get_html().into(),
}),
napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking,
);
}
});

if headless {
self.inner.scrape().await;
} else {
self.inner.scrape_raw().await;
}
}
}
_ => {
if headless {
self.inner.scrape().await;
} else {
self.inner.scrape_raw().await;
}
}
_ => self.inner.crawl().await,
}
}

@@ -249,37 +341,6 @@ impl Website {
Cron { inner, cron_handle }
}

#[napi]
/// scrape a website
pub async unsafe fn scrape(
&mut self,
on_page_event: Option<napi::threadsafe_function::ThreadsafeFunction<NPage>>,
) {
match on_page_event {
Some(callback) => {
let mut rx2 = self
.inner
.subscribe(*BUFFER / 2)
.expect("sync feature should be enabled");

spider::tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
callback.call(
Ok(NPage {
url: res.get_url().into(),
content: res.get_html().into(),
}),
napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking,
);
}
});

self.inner.scrape().await;
}
_ => self.inner.scrape().await,
}
}

#[napi]
/// get all the links of a website
pub fn get_links(&self) -> Vec<String> {
