From 04c0335bd64df368723d8da1501f03974aac79e4 Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Wed, 27 Dec 2023 07:16:20 -0500
Subject: [PATCH] chore(builder): add depth and caching

---
 Cargo.toml           |   4 +-
 README.md            |   9 ++--
 book/src/README.md   |   2 +-
 book/src/SUMMARY.md  |   2 +
 book/src/crawl.md    | 105 +++++++++++++++++++++++++++++++++++++++++++
 book/src/cron-job.md |   6 +--
 book/src/scrape.md   |  35 +++++++++++++++
 book/src/simple.md   |  32 ++++++++++++-
 book/src/website.md  |  35 +++++++++++++--
 src/page.rs          |  15 ++++++-
 src/website.rs       |   8 +++-
 11 files changed, 234 insertions(+), 19 deletions(-)
 create mode 100644 book/src/crawl.md
 create mode 100644 book/src/scrape.md

diff --git a/Cargo.toml b/Cargo.toml
index dd9dd2b..e1d23a9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 edition = "2021"
 name = "spider_rs"
-version = "0.0.11"
+version = "0.0.12"
 description = "The fastest web crawler written in Rust ported to nodejs."
 repository = "https://github.com/spider-rs/spider-nodejs"
 
@@ -11,7 +11,7 @@ crate-type = ["cdylib"]
 
 [dependencies]
 indexmap = "2.1.0"
 num_cpus = "1.16.0"
-spider = { version = "1.80.24", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept" ] }
+spider = { version = "1.80.26", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache" ] }
 pyo3 = { version = "0.20.0", features = ["extension-module"] }
 pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
diff --git a/README.md b/README.md
index 697fe6c..0864885 100644
--- a/README.md
+++ b/README.md
@@ -36,16 +36,15 @@ asyncio.run(main())
 
 Setting up real time subscriptions can be done too.
 
-
 ```python
 import asyncio
 from spider_rs import Website
 
 class Subscription:
-  def __init__(self):
-    print("Subscription Created...")
-  def __call__(self, page):
+    def __init__(self):
+        print("Subscription Created...")
+    def __call__(self, page):
         print(page.url + " - status: " + str(page.status_code))
 
 async def main():
@@ -69,4 +68,4 @@ View [bench](./bench/) to see the results.
 
 ## Issues
 
-Please submit a Github issue for any issues found.
\ No newline at end of file
+Please submit a Github issue for any issues found.
diff --git a/book/src/README.md b/book/src/README.md
index 518604d..f7a15e0 100644
--- a/book/src/README.md
+++ b/book/src/README.md
@@ -12,4 +12,4 @@
 - Blacklisting and Budgeting Depth
 - Written in [Rust](https://www.rust-lang.org/) for speed, safety, and simplicity
 
-Spider powers some big tools and helps bring the crawling aspect to almost no downtime with the correct setup, view the [spider](https://github.com/spider-rs/spider) project to learn more.
\ No newline at end of file
+Spider powers some big tools and helps bring the crawling aspect to almost no downtime with the correct setup, view the [spider](https://github.com/spider-rs/spider) project to learn more.
diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md
index abfd68a..ceb1649 100644
--- a/book/src/SUMMARY.md
+++ b/book/src/SUMMARY.md
@@ -15,4 +15,6 @@
 
 # Usage
 
+- [Crawl](./crawl.md)
+- [Scrape](./scrape.md)
 - [Cron Job](./cron-job.md)
diff --git a/book/src/crawl.md b/book/src/crawl.md
new file mode 100644
index 0000000..68599d8
--- /dev/null
+++ b/book/src/crawl.md
@@ -0,0 +1,105 @@
+# Crawl
+
+Crawl a website concurrently.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = Website("https://rsseau.fr")
+    website.crawl()
+    print(website.get_links())
+
+asyncio.run(main())
+```
+
+## Async Event
+
+You can pass in an async callback as the first param to the crawl function to stream realtime updates.
+
+```py
+import asyncio
+from spider_rs import Website
+
+class Subscription:
+    def __init__(self):
+        print("Subscription Created...")
+    def __call__(self, page):
+        print(page.url + " - status: " + str(page.status_code))
+
+async def main():
+    website = Website("https://choosealicense.com")
+    website.crawl(Subscription())
+
+asyncio.run(main())
+```
+
+## Background
+
+You can run the crawl in the background and receive events with the second param set to `True`.
+
+```py
+import asyncio
+from spider_rs import Website
+
+class Subscription:
+    def __init__(self):
+        print("Subscription Created...")
+    def __call__(self, page):
+        print(page.url + " - status: " + str(page.status_code))
+
+async def main():
+    website = Website("https://choosealicense.com")
+    website.crawl(Subscription(), True)
+    # this will return instantly as the crawl runs in the background
+
+asyncio.run(main())
```
+
+## Subscriptions
+
+You can set up multiple subscriptions to run events when a crawl happens.
+
+```py
+import asyncio
+from spider_rs import Website
+
+class Subscription:
+    def __init__(self):
+        print("Subscription Created...")
+    def __call__(self, page):
+        print(page.url + " - status: " + str(page.status_code))
+
+async def main():
+    website = Website("https://choosealicense.com")
+    website.crawl()
+    subscription_id = website.subscribe(Subscription())
+    website.crawl()
+    website.unsubscribe(subscription_id)
+
+asyncio.run(main())
+```
+
+## Headless Chrome
+
+Headless Chrome rendering can be done by setting the third param in `crawl` or `scrape` to `True`.
+It will attempt to connect to Chrome running remotely if the `CHROME_URL` env variable is set, with Chrome launching locally as a fallback. Using a remote connection with `CHROME_URL` will
+drastically speed up runs.
+
+```py
+import asyncio
+from spider_rs import Website
+
+class Subscription:
+    def __init__(self):
+        print("Subscription Created...")
+    def __call__(self, page):
+        print(page.url + " - status: " + str(page.status_code))
+
+async def main():
+    website = Website("https://choosealicense.com")
+    website.crawl(Subscription(), False, True)
+
+asyncio.run(main())
+```
diff --git a/book/src/cron-job.md b/book/src/cron-job.md
index c06fb73..3e8d925 100644
--- a/book/src/cron-job.md
+++ b/book/src/cron-job.md
@@ -7,9 +7,9 @@ import asyncio
 from spider_rs import Website
 
 class Subscription:
-  def __init__(self):
-    print("Cron Created...")
-  def __call__(self, page):
+    def __init__(self):
+        print("Cron Created...")
+    def __call__(self, page):
         print(page.url + " - status: " + str(page.status_code))
 
 async def main():
diff --git a/book/src/scrape.md b/book/src/scrape.md
new file mode 100644
index 0000000..02dfbe1
--- /dev/null
+++ b/book/src/scrape.md
@@ -0,0 +1,35 @@
+# Scrape
+
+Scrape a website and collect the resource data.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = Website("https://choosealicense.com")
+    website.scrape()
+    print(website.get_pages())
+    # [ { url: "https://rsseau.fr/blog", html: "..."}, ...]
+
+asyncio.run(main())
+```
+
+## Headless Chrome
+
+Headless Chrome rendering can be done by setting the third param in `crawl` or `scrape` to `True`.
+It will attempt to connect to Chrome running remotely if the `CHROME_URL` env variable is set, with Chrome launching locally as a fallback. Using a remote connection with `CHROME_URL` will
+drastically speed up runs.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = Website("https://choosealicense.com")
+    website.scrape(None, None, True)
+    print(website.get_pages())
+    # [ { url: "https://rsseau.fr/blog", html: "..."}, ...]
+
+asyncio.run(main())
+```
diff --git a/book/src/simple.md b/book/src/simple.md
index ee880a0..09596dc 100644
--- a/book/src/simple.md
+++ b/book/src/simple.md
@@ -1,14 +1,42 @@
 # Simple Example
 
+We use [pyo3](https://pyo3.rs/v0.20.0/) to port the Rust project to Python.
+
+There are some performance drawbacks from the bindings, but the crawls are still lightning fast and efficient.
+
+## Usage
+
+The examples below can help you get started with spider.
+
+### Basic
+
 ```python
 import asyncio
+from spider_rs import Website
+
+async def main():
+    website = Website("https://jeffmendez.com")
+    website.crawl()
+    print(website.links)
+    # print(website.pages)
+
+asyncio.run(main())
+```
+
+## Shortcut
+
+You can use the `crawl` shortcut method to collect contents quickly without configuration.
+
+```python
+import asyncio
+
 from spider_rs import crawl
 
 async def main():
-    website = await crawl("https://jeffmendez.com")
+    website = crawl("https://jeffmendez.com")
     print(website.links)
     # print(website.pages)
 
 asyncio.run(main())
-```
\ No newline at end of file
+```
diff --git a/book/src/website.md b/book/src/website.md
index 6b12a28..b0eba87 100644
--- a/book/src/website.md
+++ b/book/src/website.md
@@ -136,6 +136,34 @@ async def main():
 asyncio.run(main())
 ```
 
+### Depth Limit
+
+Set the depth limit for how many forward pages deep the crawl will follow.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = Website("https://choosealicense.com").with_depth(3)
+
+asyncio.run(main())
+```
+
+### Cache
+
+Enable HTTP caching; this is useful when running the spider on a server.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = Website("https://choosealicense.com").with_caching(True)
+
+asyncio.run(main())
+```
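+
+The builder methods chain, so both new options can be set together. A minimal sketch (assuming the target site's response headers allow caching, so a repeat crawl may be served from the HTTP cache):
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    # limit crawl depth to 3 and enable HTTP caching
+    website = Website("https://choosealicense.com").with_depth(3).with_caching(True)
+    website.crawl()
+    # a second run may reuse cached responses when the cache rules allow it
+    website.crawl()
+
+asyncio.run(main())
+```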
+
 ### Delays
 
 Add delays between pages. Defaults to none.
@@ -258,15 +286,14 @@ asyncio.run(main())
 ```
 
 To stop a crawl you can use `website.stopCrawl(id)`, pass in the crawl id to stop a run or leave empty for all crawls to stop.
 
-
 ```py
 import asyncio
 from spider_rs import Website
 
 class Subscription:
-  def __init__(self):
-    print("Subscription Created...")
-  def __call__(self, page):
+    def __init__(self):
+        print("Subscription Created...")
+    def __call__(self, page):
         print(page.url + " - status: " + str(page.status_code))
 
 async def main():
diff --git a/src/page.rs b/src/page.rs
index 3f3b4cb..9202ba4 100644
--- a/src/page.rs
+++ b/src/page.rs
@@ -34,9 +34,22 @@ impl Page {
 
   /// get the page content
   pub fn fetch(mut slf: PyRefMut<'_, Self>) -> PyRefMut<'_, Self> {
+    use spider::{
+      lazy_static::lazy_static, reqwest::Client, reqwest_middleware::ClientWithMiddleware,
+      ClientBuilder,
+    };
+    lazy_static! {
+      /// top level single page client to re-use.
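+      /// Built on first use and shared across `fetch` calls so the underlying
+      /// connection pool is reused instead of creating a new client per request.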
+      pub static ref PAGE_CLIENT: ClientWithMiddleware = {
+        let reqwest_client = Client::builder().build().unwrap_or_default();
+        let client = ClientBuilder::new(reqwest_client).build();
+
+        client
+      };
+    }
     let s = pyo3_asyncio::tokio::get_runtime()
       .block_on(async move {
-        let page = spider::page::Page::new_page(&slf.url, &Default::default()).await;
+        let page = spider::page::Page::new_page(&slf.url, &PAGE_CLIENT).await;
         slf.status_code = page.status_code.into();
         slf.inner = Some(page);
         slf.selectors = spider::page::get_page_selectors(
diff --git a/src/website.rs b/src/website.rs
index 94827bf..d3f563d 100644
--- a/src/website.rs
+++ b/src/website.rs
@@ -746,12 +746,18 @@ impl Website {
     slf
   }
 
-  /// Set a crawl depth limit. If the value is 0 there is no limit. This does nothing without the feat flag [budget] enabled.
+  /// Set a crawl depth limit. If the value is 0 there is no limit.
   pub fn with_depth(mut slf: PyRefMut<'_, Self>, depth: usize) -> PyRefMut<'_, Self> {
     slf.inner.with_depth(depth);
     slf
   }
 
+  /// Cache the page following HTTP rules.
+  pub fn with_caching(mut slf: PyRefMut<'_, Self>, cache: bool) -> PyRefMut<'_, Self> {
+    slf.inner.with_caching(cache);
+    slf
+  }
+
   /// add external domains
   pub fn with_external_domains(
     mut slf: PyRefMut<'_, Self>,