From d1665257817617bf811be55e76e194e49ea42ad5 Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Tue, 28 Nov 2023 12:34:08 -0500
Subject: [PATCH] chore(docs): add full website config examples

---
 Cargo.toml           |   2 +-
 book/src/SUMMARY.md  |   6 ++-
 book/src/crawl.md    |  48 ++++++++++++++++++++
 book/src/cron-job.md |   7 +--
 book/src/scrape.md   |  15 +++++++
 book/src/website.md  | 102 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 174 insertions(+), 6 deletions(-)
 create mode 100644 book/src/crawl.md
 create mode 100644 book/src/scrape.md

diff --git a/Cargo.toml b/Cargo.toml
index 16b03e5..1b43ab0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ compact_str = "0.7.1"
 napi = { version = "2.14.1", default-features = false, features = ["napi4", "async", "tokio_rt"] }
 napi-derive = "2.14.2"
 num_cpus = "1.16.0"
-spider = { version = "1.50.8", features = ["napi", "budget", "cron", "regex", "cookies"] }
+spider = { version = "1.50.8", features = ["napi", "budget", "cron", "regex", "cookies", "socks"] }
 
 [target.x86_64-unknown-linux-gnu.dependencies]
 openssl-sys = { version = "0.9.96", features = ["vendored"] }
diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md
index 90e6514..d72d213 100644
--- a/book/src/SUMMARY.md
+++ b/book/src/SUMMARY.md
@@ -4,13 +4,15 @@
 
 # User Guide
 
-- [Getting Started](./getting-started.md)
+- [Getting started](./getting-started.md)
 - [A simple example](./simple.md)
 
 # Config
 
 - [Website](./website.md)
 
-# Features
+# Usage
 
+- [Crawl](./crawl.md)
+- [Scrape](./scrape.md)
 - [Cron Job](./cron-job.md)
diff --git a/book/src/crawl.md b/book/src/crawl.md
new file mode 100644
index 0000000..7ef5dc9
--- /dev/null
+++ b/book/src/crawl.md
@@ -0,0 +1,48 @@
+# Crawl
+
+Crawl a website concurrently.
+
+```ts
+import { Website } from "@spider-rs/spider-rs";
+
+// pass in the website url
+const website = new Website("https://rsseau.fr");
+
+await website.crawl();
+
+// [ "https://rsseau.fr/blog", ...]
+console.log(website.getLinks());
+```
+
+## Async Event
+
+You can pass an async function as the first param to the crawl function to stream page updates in real time.
+
+```ts
+import { Website } from "@spider-rs/spider-rs";
+
+const website = new Website("https://rsseau.fr");
+
+const onPageEvent = (err, value) => {
+  console.log(value);
+};
+
+await website.crawl(onPageEvent);
+```
+
+## Background
+
+You can run the request in the background and receive events with the second param set to `true`.
+
+```ts
+import { Website } from "@spider-rs/spider-rs";
+
+const website = new Website("https://rsseau.fr");
+
+const onPageEvent = (err, value) => {
+  console.log(value);
+};
+
+await website.crawl(onPageEvent, true);
+// this will return instantly as the crawl runs in the background
+```
diff --git a/book/src/cron-job.md b/book/src/cron-job.md
index 746485a..68f172d 100644
--- a/book/src/cron-job.md
+++ b/book/src/cron-job.md
@@ -3,14 +3,15 @@
 Use a cron job that can run any time of day to gather website data.
 
 ```ts
-import { Website, type NPage } from "@spider-rs/spider-rs";
+import { Website } from "@spider-rs/spider-rs";
 
 const website = new Website("https://choosealicense.com")
   .withCron("1/5 * * * * *")
   .build();
 
-const onPageEvent = (err: Error | null, value: NPage) => {
-  links.push(value);
+// get each page of the website, streamed as the cron runs.
+const onPageEvent = (err, value) => {
+  console.log(value);
 };
 
 const handle = await website.runCron(onPageEvent);
diff --git a/book/src/scrape.md b/book/src/scrape.md
new file mode 100644
index 0000000..705945b
--- /dev/null
+++ b/book/src/scrape.md
@@ -0,0 +1,15 @@
+# Scrape
+
+Scrape a website and collect the resource data.
+
+```ts
+import { Website } from "@spider-rs/spider-rs";
+
+// pass in the website url
+const website = new Website("https://rsseau.fr");
+
+await website.scrape();
+
+// [ { url: "https://rsseau.fr/blog", html: "..."}, ...]
+console.log(website.getPages());
+```
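+
+The collected pages can then be processed like any array. A minimal usage sketch iterating the result of `getPages()`, relying only on the `url` and `html` fields shown in the example above:
+
+```ts
+// iterate the pages collected by scrape()
+for (const page of website.getPages()) {
+  // each entry exposes the page url and the raw html that was collected
+  console.log(page.url, page.html?.length);
+}
+```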
diff --git a/book/src/website.md b/book/src/website.md
index 8f700e2..59cc98e 100644
--- a/book/src/website.md
+++ b/book/src/website.md
@@ -49,3 +49,105 @@ const website = new Website("https://choosealicense.com")
   .build();
 ```
 View the [cron](./cron-job.md) section for details how to use the cron.
+
+### Budget
+
+Add a crawl budget that limits the crawl to `x` amount of pages.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withBudget({
+    "*": 1,
+  })
+  .build();
+```
+
+### Subdomains
+
+Include subdomains in the request.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withSubdomains(true)
+  .build();
+```
+
+### TLD
+
+Include TLDs in the request.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withTlds(true)
+  .build();
+```
+
+### External Domains
+
+Add external domains to include in the crawl.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withExternalDomains(["https://www.myotherdomain.com"])
+  .build();
+```
+
+### Proxy
+
+Use a proxy to crawl a website.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withProxies(["https://www.myproxy.com"])
+  .build();
+```
+
+### Delays
+
+Add delays between page requests.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withDelays(200)
+  .build();
+```
+
+### User-Agent
+
+Use a custom User-Agent.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withUserAgent("mybot/v1")
+  .build();
+```
+
+### Request Timeout
+
+Add a request timeout per page in milliseconds. The example shows 30 seconds.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withRequestTimeout(30000)
+  .build();
+```
+
+### Respect Robots
+
+Respect the robots.txt file.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withRespectRobotsTxt(true)
+  .build();
+```
+
+### Http2 Prior Knowledge
+
+Use http2 to connect if you know the website server supports this.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withHttp2PriorKnowledge(true)
+  .build();
+```
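+
+### Chaining config
+
+The `with*` options above can be combined in a single chain before calling `build()`. A minimal sketch combining several of the options documented on this page; it assumes each builder method returns the builder, as the individual examples suggest, and the values are placeholders only:
+
+```ts
+const website = new Website("https://choosealicense.com")
+  // only crawl up to 100 pages in total
+  .withBudget({ "*": 100 })
+  // include subdomains and identify the crawler
+  .withSubdomains(true)
+  .withUserAgent("mybot/v1")
+  // give each page request up to 30 seconds
+  .withRequestTimeout(30000)
+  .withRespectRobotsTxt(true)
+  .build();
+```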