Commit

chore(builder): add depth and caching

j-mendez committed Dec 27, 2023
1 parent ea796bb commit 04c0335
Showing 11 changed files with 234 additions and 19 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.11"
version = "0.0.12"
description = "The fastest web crawler written in Rust ported to nodejs."
repository = "https://github.com/spider-rs/spider-nodejs"

@@ -11,7 +11,7 @@ crate-type = ["cdylib"]
[dependencies]
indexmap = "2.1.0"
num_cpus = "1.16.0"
spider = { version = "1.80.24", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept" ] }
spider = { version = "1.80.26", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache" ] }
pyo3 = { version = "0.20.0", features = ["extension-module"] }
pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }

9 changes: 4 additions & 5 deletions README.md
@@ -36,16 +36,15 @@ asyncio.run(main())

Setting up real-time subscriptions can be done too.


```python
import asyncio

from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
@@ -69,4 +68,4 @@ View [bench](./bench/) to see the results.

## Issues

Please submit a GitHub issue for any issues found.
2 changes: 1 addition & 1 deletion book/src/README.md
@@ -12,4 +12,4 @@
- Blacklisting and Budgeting Depth
- Written in [Rust](https://www.rust-lang.org/) for speed, safety, and simplicity

Spider powers some big tools and, with the correct setup, helps keep crawl downtime close to zero. View the [spider](https://github.com/spider-rs/spider) project to learn more.
2 changes: 2 additions & 0 deletions book/src/SUMMARY.md
@@ -15,4 +15,6 @@

# Usage

- [Crawl](./crawl.md)
- [Scrape](./scrape.md)
- [Cron Job](./cron-job.md)
105 changes: 105 additions & 0 deletions book/src/crawl.md
@@ -0,0 +1,105 @@
# Crawl

Crawl a website concurrently.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://rsseau.fr")
    website.crawl()
    print(website.get_links())

asyncio.run(main())
```

## Async Event

You can pass in an async function as the first param to the crawl function to stream real-time updates.

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(Subscription())

asyncio.run(main())
```

## Background

You can run the request in the background and receive events with the second param set to `True`.

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(Subscription(), True)
    # this will run instantly as the crawl is in the background

asyncio.run(main())
```
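
Because the crawl is detached from `main()`, the event loop can exit before any pages are reported. A minimal sketch, assuming the background task keeps delivering events while the loop is alive, that simply waits before returning:

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(Subscription(), True)  # returns immediately; the crawl continues in the background
    await asyncio.sleep(10)  # keep the event loop alive long enough to receive events

asyncio.run(main())
```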

## Subscriptions

You can set up multiple subscriptions to receive events while a crawl runs.

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com")
    website.crawl()
    subscription_id = website.subscribe(Subscription())
    website.crawl()
    website.unsubscribe(subscription_id)

asyncio.run(main())
```

## Headless Chrome

Headless Chrome rendering can be done by setting the third param in `crawl` or `scrape` to `True`.
If the `CHROME_URL` env variable is set, the crawler will attempt to connect to that remote Chrome instance, falling back to launching Chrome locally. Using a remote connection with `CHROME_URL` can drastically speed up runs.

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(Subscription(), False, True)

asyncio.run(main())
```
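
If a Chrome instance is already running somewhere, you can point the crawler at it through the `CHROME_URL` environment variable instead of letting it launch a local browser. A minimal sketch; the address and port are illustrative assumptions:

```py
import asyncio
import os
from spider_rs import Website

# assumed address of a Chrome instance started with remote debugging enabled
os.environ["CHROME_URL"] = "http://localhost:9222"

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(None, False, True)  # the third param enables headless Chrome rendering
    print(website.get_links())

asyncio.run(main())
```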
6 changes: 3 additions & 3 deletions book/src/cron-job.md
@@ -7,9 +7,9 @@ import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Cron Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
35 changes: 35 additions & 0 deletions book/src/scrape.md
@@ -0,0 +1,35 @@
# Scrape

Scrape a website and collect the resource data.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com")
    website.scrape()
    print(website.get_pages())
    # [ { url: "https://choosealicense.com/...", html: "<html>...</html>"}, ...]

asyncio.run(main())
```
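
To post-process the collected resources, you can iterate over the result of `get_pages()`. A minimal sketch that only relies on the `url` field; any other per-page fields follow the output comment above and are assumptions here:

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com")
    website.scrape()
    for page in website.get_pages():
        # each entry corresponds to one collected resource
        print(page.url)

asyncio.run(main())
```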

## Headless Chrome

Headless Chrome rendering can be done by setting the third param in `crawl` or `scrape` to `True`.
If the `CHROME_URL` env variable is set, the crawler will attempt to connect to that remote Chrome instance, falling back to launching Chrome locally. Using a remote connection with `CHROME_URL` can drastically speed up runs.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com")
    website.scrape(None, None, True)
    print(website.get_pages())
    # [ { url: "https://choosealicense.com/...", html: "<html>...</html>"}, ...]

asyncio.run(main())
```
32 changes: 30 additions & 2 deletions book/src/simple.md
@@ -1,14 +1,42 @@
# Simple Example

We use [pyo3](https://pyo3.rs/v0.20.0/) to port the Rust project to Python.

There are some performance drawbacks from the binding layer; even so, the crawls are lightning fast and efficient.

## Usage

The examples below can help you get started with spider.

### Basic

```python
import asyncio

from spider_rs import Website

async def main():
    website = Website("https://jeffmendez.com")
    website.crawl()
    print(website.links)
    # print(website.pages)

asyncio.run(main())
```

## Shortcut

You can use the `crawl` shortcut method to collect contents quickly without configuration.

```python
import asyncio

from spider_rs import crawl

async def main():
    website = await crawl("https://jeffmendez.com")
    website = crawl("https://jeffmendez.com")
    print(website.links)
    # print(website.pages)

asyncio.run(main())
```
35 changes: 31 additions & 4 deletions book/src/website.md
@@ -136,6 +136,34 @@ async def main():
asyncio.run(main())
```

### Depth Limit

Set the depth limit for how many link hops the crawler follows from the start page.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_depth(3)

asyncio.run(main())
```
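
As a quick sketch of the effect, a small depth keeps the collected links to pages reachable within that many hops (the exact counts depend on the site):

```py
import asyncio
from spider_rs import Website

async def main():
    # depth 1 limits the crawl to the start page and the pages it links to directly
    website = Website("https://choosealicense.com").with_depth(1)
    website.crawl()
    print(len(website.get_links()))

asyncio.run(main())
```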
### Cache

Enable HTTP caching; this is useful when running the spider on a server.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_caching(True)

asyncio.run(main())
```
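
A short sketch of how caching helps on repeat runs, assuming unchanged pages can be answered from the HTTP cache on the second pass:

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_caching(True)
    website.crawl()  # the first run fills the HTTP cache
    website.crawl()  # repeat runs can reuse cached responses for unchanged pages
    print(website.get_links())

asyncio.run(main())
```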
### Delays

Add delays between pages. Defaults to none.
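
The full example is collapsed in this view; a minimal sketch of configuring a delay, assuming the binding exposes `with_delay` in milliseconds like the other builder methods:

```py
import asyncio
from spider_rs import Website

async def main():
    # with_delay is assumed to mirror the spider crate's builder method (milliseconds)
    website = Website("https://choosealicense.com").with_delay(200)
    website.crawl()

asyncio.run(main())
```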
@@ -258,15 +286,14 @@ asyncio.run(main())
To stop a crawl, use `website.stopCrawl(id)`; pass in the crawl ID to stop a specific run, or leave it empty to stop all crawls.
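
A minimal sketch of stopping a background run, assuming `stopCrawl` accepts an optional crawl ID as described above:

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(Subscription(), True)  # start the crawl in the background
    await asyncio.sleep(2)
    website.stopCrawl()  # no ID given, so all active runs are asked to stop

asyncio.run(main())
```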

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
15 changes: 14 additions & 1 deletion src/page.rs
@@ -34,9 +34,22 @@ impl Page {

  /// get the page content
  pub fn fetch(mut slf: PyRefMut<'_, Self>) -> PyRefMut<'_, Self> {
    use spider::{
      lazy_static::lazy_static, reqwest::Client, reqwest_middleware::ClientWithMiddleware,
      ClientBuilder,
    };
    lazy_static! {
      /// top level single page client to re-use.
      pub static ref PAGE_CLIENT: ClientWithMiddleware = {
        let reqwest_client = Client::builder().build().unwrap_or_default();
        let client = ClientBuilder::new(reqwest_client).build();

        client
      };
    }
    let s = pyo3_asyncio::tokio::get_runtime()
      .block_on(async move {
        let page = spider::page::Page::new_page(&slf.url, &Default::default()).await;
        let page = spider::page::Page::new_page(&slf.url, &PAGE_CLIENT).await;
        slf.status_code = page.status_code.into();
        slf.inner = Some(page);
        slf.selectors = spider::page::get_page_selectors(
8 changes: 7 additions & 1 deletion src/website.rs
@@ -746,12 +746,18 @@ impl Website {
    slf
  }

  /// Set a crawl depth limit. If the value is 0 there is no limit. This does nothing without the feat flag [budget] enabled.
  /// Set a crawl depth limit. If the value is 0 there is no limit.
  pub fn with_depth(mut slf: PyRefMut<'_, Self>, depth: usize) -> PyRefMut<'_, Self> {
    slf.inner.with_depth(depth);
    slf
  }

  /// Cache the page following HTTP rules.
  pub fn with_caching(mut slf: PyRefMut<'_, Self>, cache: bool) -> PyRefMut<'_, Self> {
    slf.inner.with_caching(cache);
    slf
  }

  /// add external domains
  pub fn with_external_domains(
    mut slf: PyRefMut<'_, Self>,
