Commit

chore(builder): add depth and caching

j-mendez committed Dec 27, 2023
1 parent ea796bb commit 04c0335
Showing 11 changed files with 234 additions and 19 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.11"
version = "0.0.12"
description = "The fastest web crawler written in Rust ported to nodejs."
repository = "https://github.com/spider-rs/spider-nodejs"

@@ -11,7 +11,7 @@ crate-type = ["cdylib"]
[dependencies]
indexmap = "2.1.0"
num_cpus = "1.16.0"
spider = { version = "1.80.24", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept" ] }
spider = { version = "1.80.26", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache" ] }
pyo3 = { version = "0.20.0", features = ["extension-module"] }
pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }

9 changes: 4 additions & 5 deletions README.md
@@ -36,16 +36,15 @@ asyncio.run(main())

Setting up real-time subscriptions can be done too.


```python
import asyncio

from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
@@ -69,4 +68,4 @@ View [bench](./bench/) to see the results.

## Issues

Please submit a GitHub issue for any issues found.
2 changes: 1 addition & 1 deletion book/src/README.md
@@ -12,4 +12,4 @@
- Blacklisting and Budgeting Depth
- Written in [Rust](https://www.rust-lang.org/) for speed, safety, and simplicity

Spider powers some big tools and, with the correct setup, helps keep crawl downtime close to zero. View the [spider](https://github.com/spider-rs/spider) project to learn more.
2 changes: 2 additions & 0 deletions book/src/SUMMARY.md
@@ -15,4 +15,6 @@

# Usage

- [Crawl](./crawl.md)
- [Scrape](./scrape.md)
- [Cron Job](./cron-job.md)
105 changes: 105 additions & 0 deletions book/src/crawl.md
@@ -0,0 +1,105 @@
# Crawl

Crawl a website concurrently.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://rsseau.fr")
    website.crawl()
    print(website.get_links())

asyncio.run(main())
```

## Async Event

You can pass in an async function as the first param to the crawl function to stream real-time updates.

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(Subscription())

asyncio.run(main())
```

## Background

You can run the request in the background and receive events with the second param set to `True`.

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(Subscription(), True)
    # this will run instantly as the crawl is in the background

asyncio.run(main())
```
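
Because the crawl is detached from `main()`, the event loop can exit before any pages are reported. A minimal sketch, assuming the background task keeps delivering events while the loop is alive, that simply waits before returning:

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(Subscription(), True)  # returns immediately; the crawl continues in the background
    await asyncio.sleep(10)  # keep the event loop alive long enough to receive events

asyncio.run(main())
```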

## Subscriptions

You can set up multiple subscriptions to receive events while a crawl runs.

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com")
    website.crawl()
    subscription_id = website.subscribe(Subscription())
    website.crawl()
    website.unsubscribe(subscription_id)

asyncio.run(main())
```

## Headless Chrome

Headless Chrome rendering can be done by setting the third param in `crawl` or `scrape` to `True`.
If the `CHROME_URL` env variable is set, the crawler will attempt to connect to that remote Chrome instance, falling back to launching Chrome locally. Using a remote connection with `CHROME_URL` can drastically speed up runs.

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(Subscription(), False, True)

asyncio.run(main())
```
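
If a Chrome instance is already running somewhere, you can point the crawler at it through the `CHROME_URL` environment variable instead of letting it launch a local browser. A minimal sketch; the address and port are illustrative assumptions:

```py
import asyncio
import os
from spider_rs import Website

# assumed address of a Chrome instance started with remote debugging enabled
os.environ["CHROME_URL"] = "http://localhost:9222"

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(None, False, True)  # the third param enables headless Chrome rendering
    print(website.get_links())

asyncio.run(main())
```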
6 changes: 3 additions & 3 deletions book/src/cron-job.md
@@ -7,9 +7,9 @@ import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Cron Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
35 changes: 35 additions & 0 deletions book/src/scrape.md
@@ -0,0 +1,35 @@
# Scrape

Scrape a website and collect the resource data.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com")
    website.scrape()
    print(website.get_pages())
    # [ { url: "https://choosealicense.com/...", html: "<html>...</html>"}, ...]

asyncio.run(main())
```
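
To post-process the collected resources, you can iterate over the result of `get_pages()`. A minimal sketch that only relies on the `url` field; any other per-page fields follow the output comment above and are assumptions here:

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com")
    website.scrape()
    for page in website.get_pages():
        # each entry corresponds to one collected resource
        print(page.url)

asyncio.run(main())
```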

## Headless Chrome

Headless Chrome rendering can be done by setting the third param in `crawl` or `scrape` to `True`.
If the `CHROME_URL` env variable is set, the crawler will attempt to connect to that remote Chrome instance, falling back to launching Chrome locally. Using a remote connection with `CHROME_URL` can drastically speed up runs.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com")
    website.scrape(None, None, True)
    print(website.get_pages())
    # [ { url: "https://choosealicense.com/...", html: "<html>...</html>"}, ...]

asyncio.run(main())
```
32 changes: 30 additions & 2 deletions book/src/simple.md
@@ -1,14 +1,42 @@
# Simple Example

We use [pyo3](https://pyo3.rs/v0.20.0/) to port the Rust project to Python.

There are some performance drawbacks from the binding layer; even so, the crawls are lightning fast and efficient.

## Usage

The examples below can help you get started with spider.

### Basic

```python
import asyncio

from spider_rs import Website

async def main():
    website = Website("https://jeffmendez.com")
    website.crawl()
    print(website.links)
    # print(website.pages)

asyncio.run(main())
```

## Shortcut

You can use the `crawl` shortcut method to collect contents quickly without configuration.

```python
import asyncio

from spider_rs import crawl

async def main():
    website = await crawl("https://jeffmendez.com")
    website = crawl("https://jeffmendez.com")
    print(website.links)
    # print(website.pages)

asyncio.run(main())
```
35 changes: 31 additions & 4 deletions book/src/website.md
@@ -136,6 +136,34 @@ async def main():
asyncio.run(main())
```

### Depth Limit

Set the depth limit for how many link hops the crawler follows from the start page.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_depth(3)

asyncio.run(main())
```
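
As a quick sketch of the effect, a small depth keeps the collected links to pages reachable within that many hops (the exact counts depend on the site):

```py
import asyncio
from spider_rs import Website

async def main():
    # depth 1 limits the crawl to the start page and the pages it links to directly
    website = Website("https://choosealicense.com").with_depth(1)
    website.crawl()
    print(len(website.get_links()))

asyncio.run(main())
```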
### Cache

Enable HTTP caching; this is useful when running the spider on a server.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_caching(True)

asyncio.run(main())
```
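
A short sketch of how caching helps on repeat runs, assuming unchanged pages can be answered from the HTTP cache on the second pass:

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_caching(True)
    website.crawl()  # the first run fills the HTTP cache
    website.crawl()  # repeat runs can reuse cached responses for unchanged pages
    print(website.get_links())

asyncio.run(main())
```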
### Delays

Add delays between pages. Defaults to none.
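
The full example is collapsed in this view; a minimal sketch of configuring a delay, assuming the binding exposes `with_delay` in milliseconds like the other builder methods:

```py
import asyncio
from spider_rs import Website

async def main():
    # with_delay is assumed to mirror the spider crate's builder method (milliseconds)
    website = Website("https://choosealicense.com").with_delay(200)
    website.crawl()

asyncio.run(main())
```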
@@ -258,15 +286,14 @@ asyncio.run(main())
To stop a crawl, use `website.stopCrawl(id)`; pass in the crawl ID to stop a specific run, or leave it empty to stop all crawls.
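
A minimal sketch of stopping a background run, assuming `stopCrawl` accepts an optional crawl ID as described above:

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(Subscription(), True)  # start the crawl in the background
    await asyncio.sleep(2)
    website.stopCrawl()  # no ID given, so all active runs are asked to stop

asyncio.run(main())
```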

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
15 changes: 14 additions & 1 deletion src/page.rs
@@ -34,9 +34,22 @@ impl Page {

  /// get the page content
  pub fn fetch(mut slf: PyRefMut<'_, Self>) -> PyRefMut<'_, Self> {
    use spider::{
      lazy_static::lazy_static, reqwest::Client, reqwest_middleware::ClientWithMiddleware,
      ClientBuilder,
    };
    lazy_static! {
      /// top level single page client to re-use.
      pub static ref PAGE_CLIENT: ClientWithMiddleware = {
        let reqwest_client = Client::builder().build().unwrap_or_default();
        let client = ClientBuilder::new(reqwest_client).build();

        client
      };
    }
    let s = pyo3_asyncio::tokio::get_runtime()
      .block_on(async move {
        let page = spider::page::Page::new_page(&slf.url, &Default::default()).await;
        let page = spider::page::Page::new_page(&slf.url, &PAGE_CLIENT).await;
        slf.status_code = page.status_code.into();
        slf.inner = Some(page);
        slf.selectors = spider::page::get_page_selectors(
8 changes: 7 additions & 1 deletion src/website.rs
@@ -746,12 +746,18 @@ impl Website {
    slf
  }

  /// Set a crawl depth limit. If the value is 0 there is no limit. This does nothing without the feat flag [budget] enabled.
  /// Set a crawl depth limit. If the value is 0 there is no limit.
  pub fn with_depth(mut slf: PyRefMut<'_, Self>, depth: usize) -> PyRefMut<'_, Self> {
    slf.inner.with_depth(depth);
    slf
  }

  /// Cache the page following HTTP rules.
  pub fn with_caching(mut slf: PyRefMut<'_, Self>, cache: bool) -> PyRefMut<'_, Self> {
    slf.inner.with_caching(cache);
    slf
  }

  /// add external domains
  pub fn with_external_domains(
    mut slf: PyRefMut<'_, Self>,
