From 23f97e0faedce2cb3e78bd1036dd6474d1af5633 Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Tue, 28 Nov 2023 12:34:08 -0500
Subject: [PATCH] chore(docs): add full website config examples

---
 Cargo.toml           |   2 +-
 book/src/cron-job.md |   6 +--
 book/src/website.md  | 102 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+), 4 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 16b03e5..1b43ab0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ compact_str = "0.7.1"
 napi = { version = "2.14.1", default-features = false, features = ["napi4", "async", "tokio_rt"] }
 napi-derive = "2.14.2"
 num_cpus = "1.16.0"
-spider = { version = "1.50.8", features = ["napi", "budget", "cron", "regex", "cookies"] }
+spider = { version = "1.50.8", features = ["napi", "budget", "cron", "regex", "cookies", "socks"] }
 
 [target.x86_64-unknown-linux-gnu.dependencies]
 openssl-sys = { version = "0.9.96", features = ["vendored"] }
diff --git a/book/src/cron-job.md b/book/src/cron-job.md
index 746485a..7a84a87 100644
--- a/book/src/cron-job.md
+++ b/book/src/cron-job.md
@@ -3,14 +3,14 @@
 Use a cron job that can run any time of day to gather website data.
 
 ```ts
-import { Website, type NPage } from "@spider-rs/spider-rs";
+import { Website } from "@spider-rs/spider-rs";
 
 const website = new Website("https://choosealicense.com")
   .withCron("1/5 * * * * *")
   .build();
 
-const onPageEvent = (err: Error | null, value: NPage) => {
-  links.push(value);
+const onPageEvent = (err, value) => {
+  console.log(value);
 };
 
 const handle = await website.runCron(onPageEvent);
diff --git a/book/src/website.md b/book/src/website.md
index 8f700e2..59cc98e 100644
--- a/book/src/website.md
+++ b/book/src/website.md
@@ -49,3 +49,105 @@ const website = new Website("https://choosealicense.com")
 ```
 
 View the [cron](./cron-job.md) section for details how to use the cron.
+
+### Budget
+
+Add a crawl budget that prevents crawling more than `x` pages.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withBudget({
+    "*": 1,
+  })
+  .build();
+```
+
+### Subdomains
+
+Include subdomains in the request.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withSubdomains(true)
+  .build();
+```
+
+### TLD
+
+Include TLDs in the request.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withTlds(true)
+  .build();
+```
+
+### External Domains
+
+Add external domains to include with the website.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withExternalDomains(["https://www.myotherdomain.com"])
+  .build();
+```
+
+### Proxy
+
+Use a proxy to crawl a website.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withProxies(["https://www.myproxy.com"])
+  .build();
+```
+
+### Delays
+
+Add delays between pages.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withDelays(200)
+  .build();
+```
+
+### User-Agent
+
+Use a custom User-Agent.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withUserAgent("mybot/v1")
+  .build();
+```
+
+### Request Timeout
+
+Add a request timeout per page in milliseconds. The example shows 30 seconds.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withRequestTimeout(30000)
+  .build();
+```
+
+### Respect Robots
+
+Respect the robots.txt file.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withRespectRobotsTxt(true)
+  .build();
+```
+
+### Http2 Prior Knowledge
+
+Use http2 to connect if you know the website's server supports this.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withHttp2PriorKnowledge(true)
+  .build();
+```
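
The configuration options documented above can also be combined on a single `Website` builder. Below is a minimal sketch, assuming the `with*` methods chain the same way `.withCron` does in the cron example; the values are illustrative only.

```ts
import { Website } from "@spider-rs/spider-rs";

// Combine several of the documented options on one builder.
const website = new Website("https://choosealicense.com")
  .withBudget({ "*": 100 }) // illustrative budget: stop after 100 pages
  .withSubdomains(true) // include subdomains
  .withUserAgent("mybot/v1") // custom User-Agent
  .withRequestTimeout(30000) // per-page request timeout
  .withRespectRobotsTxt(true) // honor robots.txt
  .build();
```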