chore(docs): add full website config examples
j-mendez committed Nov 28, 2023
1 parent 7b5800c commit 0179995
Showing 5 changed files with 156 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -13,7 +13,7 @@ compact_str = "0.7.1"
napi = { version = "2.14.1", default-features = false, features = ["napi4", "async", "tokio_rt"] }
napi-derive = "2.14.2"
num_cpus = "1.16.0"
-spider = { version = "1.50.8", features = ["napi", "budget", "cron", "regex", "cookies"] }
+spider = { version = "1.50.8", features = ["napi", "budget", "cron", "regex", "cookies", "socks"] }

[target.x86_64-unknown-linux-gnu.dependencies]
openssl-sys = { version = "0.9.96", features = ["vendored"] }
5 changes: 3 additions & 2 deletions book/src/SUMMARY.md
@@ -4,13 +4,14 @@

# User Guide

-- [Getting Started](./getting-started.md)
+- [Getting started](./getting-started.md)
- [A simple example](./simple.md)

# Config

- [Website](./website.md)

-# Features
+# Usage

+- [Crawl](./crawl.md)
- [Cron Job](./cron-job.md)
46 changes: 46 additions & 0 deletions book/src/crawl.md
@@ -0,0 +1,46 @@
# Crawl

Crawl a website concurrently.

```ts
import { Website } from "@spider-rs/spider-rs";

// pass in the website url
const website = new Website("https://rsseau.fr");

await website.crawl();
console.log(website.getLinks());
```

## Async Event

You can pass an async function as the first param of the crawl function to stream updates in real time.

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

const onPageEvent = (err, value) => {
console.log(value);
};

await website.crawl(onPageEvent);
```
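
In TypeScript, the page callback can be typed with the `NPage` type exported by the package, matching the signature used in the cron-job docs:

```ts
import { Website, type NPage } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

// typed page event callback: an Error (or null) and the crawled page
const onPageEvent = (err: Error | null, value: NPage) => {
  console.log(value);
};

await website.crawl(onPageEvent);
```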

## Background

You can run the crawl in the background and still receive events by setting the second param to `true`.

```ts
import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://rsseau.fr");

const onPageEvent = (err, value) => {
console.log(value);
};

await website.crawl(onPageEvent, true);
// this resolves immediately since the crawl runs in the background
```
7 changes: 4 additions & 3 deletions book/src/cron-job.md
@@ -3,14 +3,15 @@
Use a cron job that can run any time of day to gather website data.

```ts
-import { Website, type NPage } from "@spider-rs/spider-rs";
+import { Website } from "@spider-rs/spider-rs";

const website = new Website("https://choosealicense.com")
.withCron("1/5 * * * * *")
.build();

-const onPageEvent = (err: Error | null, value: NPage) => {
-  links.push(value);
+// stream each page of the website as the cron job runs
+const onPageEvent = (err, value) => {
+  console.log(value);
};

const handle = await website.runCron(onPageEvent);
102 changes: 102 additions & 0 deletions book/src/website.md
@@ -49,3 +49,105 @@ const website = new Website("https://choosealicense.com")
```

View the [cron](./cron-job.md) section for details on how to use the cron.

### Budget

Add a crawl budget that limits the number of pages crawled.

```ts
const website = new Website("https://choosealicense.com")
.withBudget({
"*": 1,
})
.build();
```
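
The keys of the budget object look like path patterns, with `*` acting as a wildcard covering every path. A sketch assuming more specific path keys are honored alongside the wildcard (the `/licenses` key is illustrative):

```ts
const website = new Website("https://choosealicense.com")
  .withBudget({
    "*": 100, // cap the crawl at 100 pages site-wide
    "/licenses": 10, // assumed: cap pages under /licenses at 10
  })
  .build();
```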

### Subdomains

Include subdomains in the crawl.

```ts
const website = new Website("https://choosealicense.com")
.withSubdomains(true)
.build();
```

### TLD

Include TLD variants in the crawl.

```ts
const website = new Website("https://choosealicense.com")
.withTlds(true)
.build();
```

### External Domains

Add external domains to include in the crawl.

```ts
const website = new Website("https://choosealicense.com")
.withExternalDomains(["https://www.myotherdomain.com"])
.build();
```

### Proxy

Use a proxy to crawl a website.

```ts
const website = new Website("https://choosealicense.com")
.withProxies(["https://www.myproxy.com"])
.build();
```
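
This commit also enables the `socks` feature in Cargo.toml, so a SOCKS proxy URL should presumably work as well; the address below is hypothetical:

```ts
// assumes the socks feature added in Cargo.toml accepts socks5:// URLs
const website = new Website("https://choosealicense.com")
  .withProxies(["socks5://127.0.0.1:1080"])
  .build();
```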

### Delays

Add delays between page requests, in milliseconds.

```ts
const website = new Website("https://choosealicense.com")
.withDelays(200)
.build();
```

### User-Agent

Use a custom User-Agent.

```ts
const website = new Website("https://choosealicense.com")
.withUserAgent("mybot/v1")
.build();
```

### Request Timeout

Add a request timeout per page in milliseconds. The example shows 30 seconds.

```ts
const website = new Website("https://choosealicense.com")
.withRequestTimeout(30000)
.build();
```

### Respect Robots

Respect the robots.txt file.

```ts
const website = new Website("https://choosealicense.com")
.withRespectRobotsTxt(true)
.build();
```

### Http2 Prior Knowledge

Use HTTP/2 with prior knowledge to connect if you know the server supports it.

```ts
const website = new Website("https://choosealicense.com")
.withHttp2PriorKnowledge(true)
.build();
```
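
The builder methods above can be chained into one configuration. A sketch combining options from this section, assuming the methods compose as the individual examples suggest:

```ts
const website = new Website("https://choosealicense.com")
  .withBudget({ "*": 50 }) // at most 50 pages
  .withSubdomains(true)
  .withUserAgent("mybot/v1")
  .withRequestTimeout(30000) // 30 second timeout per page
  .withRespectRobotsTxt(true)
  .build();
```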
