From e2af729061f4065ef78e3202dbaf3293a432aedb Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Wed, 27 Dec 2023 07:16:20 -0500
Subject: [PATCH] chore(builder): add depth and caching

---
 Cargo.toml          |  4 ++--
 book/src/website.md | 28 ++++++++++++++++++++++++++++
 src/page.rs         | 15 ++++++++++++++-
 src/website.rs      |  8 +++++++-
 4 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index dd9dd2b..e1d23a9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 edition = "2021"
 name = "spider_rs"
-version = "0.0.11"
+version = "0.0.12"
 description = "The fastest web crawler written in Rust ported to nodejs."
 repository = "https://github.com/spider-rs/spider-nodejs"
 
@@ -11,7 +11,7 @@ crate-type = ["cdylib"]
 [dependencies]
 indexmap = "2.1.0"
 num_cpus = "1.16.0"
-spider = { version = "1.80.24", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept" ] }
+spider = { version = "1.80.26", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache" ] }
 pyo3 = { version = "0.20.0", features = ["extension-module"] }
 pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
 
diff --git a/book/src/website.md b/book/src/website.md
index 6b12a28..0147899 100644
--- a/book/src/website.md
+++ b/book/src/website.md
@@ -136,6 +136,34 @@ async def main():
 asyncio.run(main())
 ```
 
+### Depth Limit
+
+Set the crawl depth limit to control how many levels of forward links are followed.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = Website("https://choosealicense.com").with_depth(3)
+
+asyncio.run(main())
+```
+
+### Cache
+
+Enable HTTP caching; this is useful when running the crawler repeatedly on a server.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = Website("https://choosealicense.com").with_caching(True)
+
+asyncio.run(main())
+```
+
 ### Delays
 
 Add delays between pages. Defaults to none.
 
diff --git a/src/page.rs b/src/page.rs
index 3f3b4cb..9202ba4 100644
--- a/src/page.rs
+++ b/src/page.rs
@@ -34,9 +34,22 @@ impl Page {
 
   /// get the page content
   pub fn fetch(mut slf: PyRefMut<'_, Self>) -> PyRefMut<'_, Self> {
+    use spider::{
+      lazy_static::lazy_static, reqwest::Client, reqwest_middleware::ClientWithMiddleware,
+      ClientBuilder,
+    };
+    lazy_static! {
+      /// Top level single page client to re-use.
+      pub static ref PAGE_CLIENT: ClientWithMiddleware = {
+        let reqwest_client = Client::builder().build().unwrap_or_default();
+        let client = ClientBuilder::new(reqwest_client).build();
+
+        client
+      };
+    }
     let s = pyo3_asyncio::tokio::get_runtime()
       .block_on(async move {
-        let page = spider::page::Page::new_page(&slf.url, &Default::default()).await;
+        let page = spider::page::Page::new_page(&slf.url, &PAGE_CLIENT).await;
         slf.status_code = page.status_code.into();
         slf.inner = Some(page);
         slf.selectors = spider::page::get_page_selectors(
diff --git a/src/website.rs b/src/website.rs
index 94827bf..d3f563d 100644
--- a/src/website.rs
+++ b/src/website.rs
@@ -746,12 +746,18 @@ impl Website {
     slf
   }
 
-  /// Set a crawl depth limit. If the value is 0 there is no limit. This does nothing without the feat flag [budget] enabled.
+  /// Set a crawl depth limit. If the value is 0 there is no limit.
   pub fn with_depth(mut slf: PyRefMut<'_, Self>, depth: usize) -> PyRefMut<'_, Self> {
     slf.inner.with_depth(depth);
     slf
   }
 
+  /// Cache the page following HTTP caching rules.
+  pub fn with_caching(mut slf: PyRefMut<'_, Self>, cache: bool) -> PyRefMut<'_, Self> {
+    slf.inner.with_caching(cache);
+    slf
+  }
+
   /// add external domains
   pub fn with_external_domains(
     mut slf: PyRefMut<'_, Self>,
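
Below is a short usage sketch, outside the patch itself, that chains the two builder options this change introduces. It follows the examples added to `book/src/website.md`; the `crawl()` and `get_links()` calls are assumed from the rest of the spider_rs documentation and are not part of this diff.

```py
import asyncio

from spider_rs import Website

async def main():
    # Chain the new builder options from this patch: limit the crawl to three
    # levels of forward links and cache responses following HTTP caching rules.
    website = (
        Website("https://choosealicense.com")
        .with_depth(3)
        .with_caching(True)
    )
    # `crawl()` and `get_links()` are assumed from the existing spider_rs API;
    # this patch only adds the builder methods above.
    await website.crawl()
    print(website.get_links())

asyncio.run(main())
```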