chore(builder): add depth and caching
j-mendez committed Dec 27, 2023
1 parent ea796bb commit b97b190
Showing 5 changed files with 80 additions and 5 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.11"
version = "0.0.12"
description = "The fastest web crawler written in Rust ported to nodejs."
repository = "https://github.com/spider-rs/spider-nodejs"

@@ -11,7 +11,7 @@ crate-type = ["cdylib"]
[dependencies]
indexmap = "2.1.0"
num_cpus = "1.16.0"
spider = { version = "1.80.24", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept" ] }
spider = { version = "1.80.26", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache" ] }
pyo3 = { version = "0.20.0", features = ["extension-module"] }
pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }

30 changes: 29 additions & 1 deletion book/src/simple.md
@@ -1,12 +1,40 @@
# Simple Example

We use [pyo3](https://pyo3.rs/v0.20.0/) to port the Rust project to Python.

There are some performance drawbacks from the bindings, but the crawls are still lightning fast and efficient.

## Usage

The examples below can help you get started with spider.

### Basic

```python
import asyncio

from spider_rs import Website

async def main():
    website = Website("https://jeffmendez.com")
    website.crawl()
    print(website.links)
    # print(website.pages)

asyncio.run(main())
```

## Shortcut

You can use the `crawl` shortcut method to collect contents quickly without configuration.

```python
import asyncio

from spider_rs import crawl

async def main():
    website = await crawl("https://jeffmendez.com")
    website = crawl("https://jeffmendez.com")
    print(website.links)
    # print(website.pages)

asyncio.run(main())
```
28 changes: 28 additions & 0 deletions book/src/website.md
@@ -136,6 +136,34 @@ async def main():
asyncio.run(main())
```

### Depth Limit

Set the depth limit to control how many levels of forward pages are crawled.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_depth(3)

asyncio.run(main())
```

### Cache

Enable HTTP caching; this is useful when running the spider on a server.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_caching(True)

asyncio.run(main())
```

### Delays

Add delays between pages. Defaults to none.
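
A minimal sketch of configuring a delay, assuming the builder exposes a `with_delay` method that takes a delay in milliseconds (an assumption; that method is not shown in this diff):

```py
import asyncio
from spider_rs import Website

async def main():
    # assumes a with_delay(milliseconds) builder method, not part of this commit
    website = Website("https://choosealicense.com").with_delay(200)

asyncio.run(main())
```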
15 changes: 14 additions & 1 deletion src/page.rs
@@ -34,9 +34,22 @@ impl Page {

  /// get the page content
  pub fn fetch(mut slf: PyRefMut<'_, Self>) -> PyRefMut<'_, Self> {
    use spider::{
      lazy_static::lazy_static, reqwest::Client, reqwest_middleware::ClientWithMiddleware,
      ClientBuilder,
    };
    lazy_static! {
      /// top level single page client to re-use.
      pub static ref PAGE_CLIENT: ClientWithMiddleware = {
        let reqwest_client = Client::builder().build().unwrap_or_default();
        let client = ClientBuilder::new(reqwest_client).build();

        client
      };
    }
    let s = pyo3_asyncio::tokio::get_runtime()
      .block_on(async move {
        let page = spider::page::Page::new_page(&slf.url, &Default::default()).await;
        let page = spider::page::Page::new_page(&slf.url, &PAGE_CLIENT).await;
        slf.status_code = page.status_code.into();
        slf.inner = Some(page);
        slf.selectors = spider::page::get_page_selectors(
8 changes: 7 additions & 1 deletion src/website.rs
@@ -746,12 +746,18 @@ impl Website {
    slf
  }

  /// Set a crawl depth limit. If the value is 0 there is no limit. This does nothing without the feat flag [budget] enabled.
  /// Set a crawl depth limit. If the value is 0 there is no limit.
  pub fn with_depth(mut slf: PyRefMut<'_, Self>, depth: usize) -> PyRefMut<'_, Self> {
    slf.inner.with_depth(depth);
    slf
  }

  /// Cache the page following HTTP rules.
  pub fn with_caching(mut slf: PyRefMut<'_, Self>, cache: bool) -> PyRefMut<'_, Self> {
    slf.inner.with_caching(cache);
    slf
  }

  /// add external domains
  pub fn with_external_domains(
    mut slf: PyRefMut<'_, Self>,
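
Taken together, the new builder options can be chained from Python. A minimal sketch combining `with_depth` and `with_caching`, following the same pattern as the book examples above; the crawl and links calls mirror the basic example rather than this diff:

```py
import asyncio
from spider_rs import Website

async def main():
    # limit the crawl to 3 levels deep and cache responses following HTTP rules
    website = Website("https://choosealicense.com").with_depth(3).with_caching(True)
    website.crawl()
    print(website.links)

asyncio.run(main())
```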
