chore(docs): add full website config examples

spider-rs · Nov 29, 2023 · 302a328 · 302a328
1 parent 7b5800c
commit 302a328
Show file tree

Hide file tree

Showing 7 changed files with 190 additions and 9 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -13,7 +13,7 @@ compact_str = "0.7.1"
 napi = { version = "2.14.1", default-features = false, features = ["napi4", "async", "tokio_rt"] }
 napi-derive = "2.14.2"
 num_cpus = "1.16.0"
-spider = { version = "1.50.8", features = ["napi", "budget", "cron", "regex", "cookies"] }
+spider = { version = "1.50.8", features = ["napi", "budget", "cron", "regex", "cookies", "socks"] }
 
 [target.x86_64-unknown-linux-gnu.dependencies]
 openssl-sys = { version = "0.9.96", features = ["vendored"] }

diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md
@@ -4,13 +4,15 @@
 
 # User Guide
 
-- [Getting Started](./getting-started.md)
+- [Getting started](./getting-started.md)
 - [A simple example](./simple.md)
 
 # Config
 
 - [Website](./website.md)
 
-# Features
+# Usage
 
+- [Crawl](./crawl.md)
+- [Scrape](./scrape.md)
 - [Cron Job](./cron-job.md)
diff --git a/book/src/crawl.md b/book/src/crawl.md
@@ -0,0 +1,48 @@
+# Crawl
+
+Crawl a website concurrently.
+
+```ts
+import { Website } from "@spider-rs/spider-rs";
+
+// pass in the website url
+const website = new Website("https://rsseau.fr");
+
+await website.crawl();
+
+// [ "https://rsseau.fr/blog", ...]
+console.log(website.getLinks());
+```
+
+## Async Event
+
+You can pass in an async function as the first param to the crawl function to receive realtime updates as pages are streamed.
+
+```ts
+import { Website } from "@spider-rs/spider-rs";
+
+const website = new Website("https://rsseau.fr");
+
+const onPageEvent = (err, value) => {
+  console.log(value);
+};
+
+await website.crawl(onPageEvent);
+```
+
+## Background
+
+You can run the request in the background and receive events with the second param set to `true`.
+
+```ts
+import { Website } from "@spider-rs/spider-rs";
+
+const website = new Website("https://rsseau.fr");
+
+const onPageEvent = (err, value) => {
+  console.log(value);
+};
+
+await website.crawl(onPageEvent, true);
+// this will run instantly as the crawl is in the background
+```
diff --git a/book/src/cron-job.md b/book/src/cron-job.md
@@ -3,14 +3,15 @@
 Use a cron job that can run any time of day to gather website data.
 
 ```ts
-import { Website, type NPage } from "@spider-rs/spider-rs";
+import { Website } from "@spider-rs/spider-rs";
 
 const website = new Website("https://choosealicense.com")
   .withCron("1/5 * * * * *")
   .build();
 
-const onPageEvent = (err: Error | null, value: NPage) => {
-  links.push(value);
+// receive the pages of the website, streamed each time the cron runs.
+const onPageEvent = (err, value) => {
+  console.log(value);
 };
 
 const handle = await website.runCron(onPageEvent);

diff --git a/book/src/getting-started.md b/book/src/getting-started.md
@@ -1,6 +1,8 @@
 # Getting Started
 
-Install the package.
+Make sure to have [node](https://nodejs.org/en/download) v10 or higher installed.
+
+Install the package with your favorite package manager.
 
 ```sh
 yarn add @spider-rs/spider-rs

diff --git a/book/src/scrape.md b/book/src/scrape.md
@@ -0,0 +1,15 @@
+# Scrape
+
+Scrape a website and collect the resource data.
+
+```ts
+import { Website } from "@spider-rs/spider-rs";
+
+// pass in the website url
+const website = new Website("https://rsseau.fr");
+
+await website.scrape();
+
+// [ { url: "https://rsseau.fr/blog", html: "<html>...</html>"}, ...]
+console.log(website.getPages());
+```
diff --git a/book/src/website.md b/book/src/website.md
@@ -8,8 +8,6 @@ We use the builder pattern to configure the website for crawling.
 
 \*note: Replace `https://choosealicense.com` from the examples below with your website target URL.
 
-All of the examples use typescript by default.
-
 ```ts
 import { Website } from "@spider-rs/spider-rs";
 
@@ -49,3 +47,118 @@ const website = new Website("https://choosealicense.com")
 ```
 
 View the [cron](./cron-job.md) section for details on how to use the cron.
+
+### Budget
+
+Add a crawl budget that prevents crawling more than `x` pages.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withBudget({
+    "*": 1,
+  })
+  .build();
+```
+
+### Subdomains
+
+Include subdomains in request.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withSubdomains(true)
+  .build();
+```
+
+### TLD
+
+Include TLDs in request.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withTlds(true)
+  .build();
+```
+
+### External Domains
+
+Add external domains to include with the website.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withExternalDomains(["https://www.myotherdomain.com"])
+  .build();
+```
+
+### Proxy
+
+Use a proxy to crawl a website.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withProxies(["https://www.myproxy.com"])
+  .build();
+```
+
+### Delays
+
+Add delays between pages. Defaults to none.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withDelays(200)
+  .build();
+```
+
+### User-Agent
+
+Use a custom User-Agent.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withUserAgent("mybot/v1")
+  .build();
+```
+
+### Request Timeout
+
+Add a request timeout per page in milliseconds. Example shows 30 seconds.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withRequestTimeout(30000)
+  .build();
+```
+
+### Respect Robots
+
+Respect the robots.txt file.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withRespectRobotsTxt(true)
+  .build();
+```
+
+### Http2 Prior Knowledge
+
+Use http2 to connect if you know the website's server supports it.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withHttp2PriorKnowledge(true)
+  .build();
+```
+
+## Chaining
+
+You can chain all of the configs together for simple configuration.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withSubdomains(true)
+  .withTlds(true)
+  .withUserAgent("mybot/v1")
+  .withRespectRobotsTxt(true)
+  .build();
+```