diff --git a/README.md b/README.md
index 00f322a..d66ce05 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,8 @@ const website = new Website("https://rsseau.fr")
   .withBudget({
     // limit up to 200 pages crawled for the entire website
     "*": 200,
+    // limit the /docs section to 10 pages
+    "/docs": 10
   })
   .withBlacklistUrl([new RegExp("/books").source, "/resume"])
   .build();
@@ -78,7 +80,7 @@ const website = new Website("https://choosealicense.com").withCron(
   "1/5 * * * * *",
 );
 // sleep function to test cron
-const stopCron = (time: number, handle: Cron) => {
+const stopCron = (time: number, handle) => {
   return new Promise((resolve) => {
     setTimeout(() => {
       resolve(handle.stop());
@@ -86,9 +88,9 @@
   });
 };
 
-const links: NPage[] = [];
+const links = [];
 
-const onPageEvent = (err: Error | null, value: NPage) => {
+const onPageEvent = (err, value) => {
   links.push(value);
 };
 
@@ -103,12 +105,14 @@ Use the crawl shortcut to get the page content and url.
 
 ```ts
 import { crawl } from "@spider-rs/spider-rs";
 
-const { links, pages } = new crawl("https://rsseau.fr");
+const { links, pages } = await crawl("https://rsseau.fr");
 
 console.log(pages);
 ```
 
 ## Benchmarks
 
+Spider is about 1,000x (small websites), 10,000x (medium websites), and 100,000x (production-grade websites) faster than the popular crawlee library, even with the Node port's performance overhead.
+
 ```sh
@@ -119,19 +123,17 @@ mac Apple M1 Max
 ```
 
 Test url: `https://choosealicense.com` (small)
-32 pages
 
-|                                   | `libraries`           |
+| `libraries`                       | `speed`               |
 | :-------------------------------- | :-------------------- |
 | **`spider-rs: crawl 10 samples`** | `286ms` (✅ **1.00x**) |
-| **`crawlee: crawl 10 samples`**   | `1s` (✅ **1.00x**)    |
+| **`crawlee: crawl 10 samples`**   | `1.7s` (✅ **1.00x**)  |
 
 Test url: `https://rsseau.fr` (medium)
-211 pages
 
-|                                   | `libraries`           |
+| `libraries`                       | `speed`               |
 | :-------------------------------- | :-------------------- |
 | **`spider-rs: crawl 10 samples`** | `2.5s` (✅ **1.00x**) |
 | **`crawlee: crawl 10 samples`**   | `75s` (✅ **1.00x**)  |
 
 The performance scales the larger the website and if throttling is needed.
diff --git a/book/src/README.md b/book/src/README.md
index f3192f4..a7d85a3 100644
--- a/book/src/README.md
+++ b/book/src/README.md
@@ -7,7 +7,7 @@ Spider powers some big tools and helps bring the crawling aspect to almost no do
 
 ```ts
 import { Website } from "@spider-rs/spider-rs";
 
-const website = new Website("https://rsseau.fr");
+const website = new Website("https://choosealicense.com");
 
 await website.crawl();
diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md
index ceb1649..f87525d 100644
--- a/book/src/SUMMARY.md
+++ b/book/src/SUMMARY.md
@@ -18,3 +18,7 @@
 - [Crawl](./crawl.md)
 - [Scrape](./scrape.md)
 - [Cron Job](./cron-job.md)
+
+# Benchmarks
+
+- [Compare](./benchmarks.md)
diff --git a/book/src/benchmarks.md b/book/src/benchmarks.md
new file mode 100644
index 0000000..00d1112
--- /dev/null
+++ b/book/src/benchmarks.md
@@ -0,0 +1,49 @@
+# Benchmarks
+
+How the speed of the Spider-RS port compares to other tools.
+
+Spider is about 1,000x (small websites), 10,000x (medium websites), and 100,000x (production-grade websites) faster than the popular crawlee library, even with the Node port's performance overhead.
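+
+A minimal sketch of how a timing like this can be measured locally (the target URL and sample count are illustrative, not the exact benchmark harness used for the numbers below):
+
+```ts
+import { crawl } from "@spider-rs/spider-rs";
+
+const samples = 10;
+const start = performance.now();
+
+// run the sampled crawls sequentially and average the wall time
+for (let i = 0; i < samples; i++) {
+  await crawl("https://choosealicense.com");
+}
+
+console.log(`${(performance.now() - start) / samples} ms avg per crawl`);
+```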
+
+```sh
+----------------------
+mac Apple M1 Max
+10-core CPU
+64 GB of RAM memory
+1 TB of SSD disk space
+-----------------------
+```
+
+Test url: `https://choosealicense.com` (small)
+32 pages
+
+| `libraries`                       | `speed`               |
+| :-------------------------------- | :-------------------- |
+| **`spider-rs: crawl 10 samples`** | `286ms` (✅ **1.00x**) |
+| **`crawlee: crawl 10 samples`**   | `1.7s` (✅ **1.00x**)  |
+
+Test url: `https://rsseau.fr` (medium)
+211 pages
+
+| `libraries`                       | `speed`               |
+| :-------------------------------- | :-------------------- |
+| **`spider-rs: crawl 10 samples`** | `2.5s` (✅ **1.00x**) |
+| **`crawlee: crawl 10 samples`**   | `75s` (✅ **1.00x**)  |
+
+The performance gap grows with the size of the website and when throttling is required.
+
+On Linux, spider-rs benchmarks run about 10x faster than on macOS.
diff --git a/book/src/crawl.md b/book/src/crawl.md
index 847561d..abb53af 100644
--- a/book/src/crawl.md
+++ b/book/src/crawl.md
@@ -70,9 +70,9 @@ website.unsubscribe(subscriptionID);
 
 ## Headless Chrome
 
-Headless Chrome rendering can be done by setting the third param in `crawl` or `scrape` to `true`.
-It will attempt to connect to chrome running remotely if the `CHROME_URL` env variable is set with chrome launching as a fallback. Using a remote connection with `CHROME_URL` will
-drastically speed up runs.
+Headless Chrome rendering can be enabled by setting the third param in `crawl` or `scrape` to `true`.
+It will attempt to connect to Chrome running remotely if the `CHROME_URL` env variable is set, falling back to launching Chrome locally. Using a remote connection with `CHROME_URL` will
+drastically speed up runs.
 
 ```ts
 import { Website } from "@spider-rs/spider-rs";
diff --git a/book/src/env.md b/book/src/env.md
index 3d80cbb..c99807b 100644
--- a/book/src/env.md
+++ b/book/src/env.md
@@ -8,4 +8,10 @@ You can set the chrome URL to connect remotely.
 
 ```sh
 CHROME_URL=http://localhost:9222
-```
\ No newline at end of file
+```
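+
+To test a remote connection locally, Chrome can be launched with remote debugging enabled (the binary name varies by platform, e.g. `google-chrome` or `chromium`, and the port must match `CHROME_URL`):
+
+```sh
+chrome --headless --remote-debugging-port=9222
+```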
diff --git a/book/src/page.md b/book/src/page.md
index c436899..7375fa5 100644
--- a/book/src/page.md
+++ b/book/src/page.md
@@ -22,7 +22,6 @@ await page.fetch();
 get all the links related to a page.
 ```ts
-
 const page = new Page("https://choosealicense.com", false, false);
 await page.fetch();
 const links = await page.getLinks();
 console.log(links);
@@ -34,7 +33,6 @@
 Get the markup for the page or HTML.
 ```ts
-
 const page = new Page("https://choosealicense.com", false, false);
 await page.fetch();
 const html = page.getHtml();
 console.log(html);
@@ -46,9 +44,8 @@
 Get the raw bytes of a page to store the files in a database.
 
 ```ts
-
 const page = new Page("https://choosealicense.com", false, false);
 await page.fetch();
 const bytes = page.getBytes();
 console.log(bytes);
-```
\ No newline at end of file
+```
diff --git a/book/src/scrape.md b/book/src/scrape.md
index 86f9c66..5b7618b 100644
--- a/book/src/scrape.md
+++ b/book/src/scrape.md
@@ -16,8 +16,8 @@ console.log(website.getPages());
 
 ## Headless Chrome
 
-Headless Chrome rendering can be done by setting the third param in `crawl` or `scrape` to `true`.
-It will attempt to connect to chrome running remotely if the `CHROME_URL` env variable is set with chrome launching as a fallback. Using a remote connection with `CHROME_URL` will
+Headless Chrome rendering can be enabled by setting the third param in `crawl` or `scrape` to `true`.
+It will attempt to connect to Chrome running remotely if the `CHROME_URL` env variable is set, falling back to launching Chrome locally. Using a remote connection with `CHROME_URL` will
 drastically speed up runs.
 
 ```ts
@@ -31,4 +31,4 @@ const onPageEvent = (err, value) => {
 
 // all params are optional. The third param determines headless rendering.
 await website.scrape(onPageEvent, false, true);
-```
\ No newline at end of file
+```
diff --git a/book/src/simple.md b/book/src/simple.md
index 4732501..e31adc4 100644
--- a/book/src/simple.md
+++ b/book/src/simple.md
@@ -15,7 +15,7 @@ A basic example.
 
 ```ts
 import { Website } from "@spider-rs/spider-rs";
 
-const website = new Website("https://rsseau.fr");
+const website = new Website("https://choosealicense.com");
 
 await website.crawl();
@@ -28,7 +28,7 @@ You can pass a function that could be async as param to `crawl` and `scrape`.
 
 ```ts
 import { Website, type NPage } from "@spider-rs/spider-rs";
 
-const website = new Website("https://rsseau.fr");
+const website = new Website("https://choosealicense.com");
 
 const links: NPage[] = [];
@@ -36,6 +36,9 @@ const onPageEvent = (err: Error | null, value: NPage) => {
   links.push(value);
 };
 
+// params in order: event, background, and headless Chrome
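+// e.g., a background run with headless Chrome would be (illustrative):
+// await website.crawl(onPageEvent, true, true);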
 await website.crawl(onPageEvent);
 console.log(website.getLinks());
 ```
diff --git a/npm/android-arm-eabi/package.json b/npm/android-arm-eabi/package.json
index 08d3bd6..99ccb19 100644
--- a/npm/android-arm-eabi/package.json
+++ b/npm/android-arm-eabi/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-android-arm-eabi",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "android"
diff --git a/npm/android-arm64/package.json b/npm/android-arm64/package.json
index 21bc1a1..8dad63a 100644
--- a/npm/android-arm64/package.json
+++ b/npm/android-arm64/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-android-arm64",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "android"
diff --git a/npm/darwin-arm64/package.json b/npm/darwin-arm64/package.json
index 70c8d2d..62254a7 100644
--- a/npm/darwin-arm64/package.json
+++ b/npm/darwin-arm64/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-darwin-arm64",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "darwin"
diff --git a/npm/darwin-universal/package.json b/npm/darwin-universal/package.json
index fd0617e..0d037f1 100644
--- a/npm/darwin-universal/package.json
+++ b/npm/darwin-universal/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-darwin-universal",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "darwin"
diff --git a/npm/darwin-x64/package.json b/npm/darwin-x64/package.json
index 8f75e95..9ff77c9 100644
--- a/npm/darwin-x64/package.json
+++ b/npm/darwin-x64/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-darwin-x64",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "darwin"
diff --git a/npm/freebsd-x64/package.json b/npm/freebsd-x64/package.json
index 1f714af..ceaf32f 100644
--- a/npm/freebsd-x64/package.json
+++ b/npm/freebsd-x64/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-freebsd-x64",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "freebsd"
diff --git a/npm/linux-arm-gnueabihf/package.json b/npm/linux-arm-gnueabihf/package.json
index 80661cc..ade68e1 100644
--- a/npm/linux-arm-gnueabihf/package.json
+++ b/npm/linux-arm-gnueabihf/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-linux-arm-gnueabihf",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "linux"
diff --git a/npm/linux-arm64-gnu/package.json b/npm/linux-arm64-gnu/package.json
index 83e05ba..b81740a 100644
--- a/npm/linux-arm64-gnu/package.json
+++ b/npm/linux-arm64-gnu/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-linux-arm64-gnu",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "linux"
diff --git a/npm/linux-arm64-musl/package.json b/npm/linux-arm64-musl/package.json
index 137e40d..b59ab6e 100644
--- a/npm/linux-arm64-musl/package.json
+++ b/npm/linux-arm64-musl/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-linux-arm64-musl",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "linux"
diff --git a/npm/linux-x64-gnu/package.json b/npm/linux-x64-gnu/package.json
index 3135e59..e7105af 100644
--- a/npm/linux-x64-gnu/package.json
+++ b/npm/linux-x64-gnu/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-linux-x64-gnu",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "linux"
diff --git a/npm/linux-x64-musl/package.json b/npm/linux-x64-musl/package.json
index a4fba8e..a4f1fbe 100644
--- a/npm/linux-x64-musl/package.json
+++ b/npm/linux-x64-musl/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-linux-x64-musl",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "linux"
diff --git a/npm/win32-arm64-msvc/package.json b/npm/win32-arm64-msvc/package.json
index cad1ff2..417425f 100644
--- a/npm/win32-arm64-msvc/package.json
+++ b/npm/win32-arm64-msvc/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-win32-arm64-msvc",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "win32"
diff --git a/npm/win32-ia32-msvc/package.json b/npm/win32-ia32-msvc/package.json
index e6d0eab..42f1f71 100644
--- a/npm/win32-ia32-msvc/package.json
+++ b/npm/win32-ia32-msvc/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-win32-ia32-msvc",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "win32"
diff --git a/npm/win32-x64-msvc/package.json b/npm/win32-x64-msvc/package.json
index 2e0d0b9..9887c91 100644
--- a/npm/win32-x64-msvc/package.json
+++ b/npm/win32-x64-msvc/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs-win32-x64-msvc",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "os": [
     "win32"
diff --git a/package.json b/package.json
index faaf157..4924d7c 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@spider-rs/spider-rs",
-  "version": "0.0.20",
+  "version": "0.0.21",
   "main": "index.js",
   "types": "index.d.ts",
   "napi": {