From 22b39f5cfd48bfc726a0b7259ee92a0afe86760c Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Thu, 5 Sep 2024 09:19:41 -0400
Subject: [PATCH] chore(book): add rust and cli getting started entry

---
 book/src/SUMMARY.md              |   4 +
 book/src/cli/getting-started.md  | 122 +++++++++++++
 book/src/rust/getting-started.md | 288 +++++++++++++++++++++++++++++++
 3 files changed, 414 insertions(+)
 create mode 100644 book/src/cli/getting-started.md
 create mode 100644 book/src/rust/getting-started.md

diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md
index dd9db9c..b72a9d6 100644
--- a/book/src/SUMMARY.md
+++ b/book/src/SUMMARY.md
@@ -21,4 +21,8 @@
 # Rust
 
+- [Getting Started](./rust/getting-started.md)
+
 # CLI
+
+- [Getting Started](./cli/getting-started.md)
 
diff --git a/book/src/cli/getting-started.md b/book/src/cli/getting-started.md
new file mode 100644
index 0000000..f66f501
--- /dev/null
+++ b/book/src/cli/getting-started.md
@@ -0,0 +1,122 @@
+# Getting Started
+
+Spider Cloud CLI is a command-line interface to interact with the [Spider Cloud](https://spider.cloud) web crawler. It allows you to scrape, crawl, search, and perform various other web-related tasks through simple commands.
+
+## Installation
+
+Install the CLI using [`homebrew`](https://brew.sh/) or [`cargo`](https://doc.rust-lang.org/cargo/) from [crates.io](https://crates.io):
+
+### Homebrew
+
+```sh
+brew tap spider-rs/spider-cloud-cli
+brew install spider-cloud-cli
+```
+
+### Cargo
+
+```sh
+cargo install spider-cloud-cli
+```
+
+## Usage
+
+After installing, you can use the CLI by typing `spider-cloud-cli` followed by a command and its respective arguments.
+
+### Authentication
+
+Before using most of the commands, you need to authenticate by providing an API key:
+
+```sh
+spider-cloud-cli auth --api_key YOUR_API_KEY
+```
+
+### Commands
+
+#### Scrape
+
+Scrape data from a specified URL.
+
+```sh
+spider-cloud-cli scrape --url http://example.com
+```
+
+#### Crawl
+
+Crawl a specified URL with an optional limit on the number of pages.
+
+```sh
+spider-cloud-cli crawl --url http://example.com --limit 10
+```
+
+#### Links
+
+Fetch links from a specified URL.
+
+```sh
+spider-cloud-cli links --url http://example.com
+```
+
+#### Screenshot
+
+Take a screenshot of a specified URL.
+
+```sh
+spider-cloud-cli screenshot --url http://example.com
+```
+
+#### Search
+
+Search for a query.
+
+```sh
+spider-cloud-cli search --query "example query"
+```
+
+#### Transform
+
+Transform specified data.
+
+```sh
+spider-cloud-cli transform --data "sample data"
+```
+
+#### Extract Contacts
+
+Extract contact information from a specified URL.
+
+```sh
+spider-cloud-cli extract_contacts --url http://example.com
+```
+
+#### Label
+
+Label data from a specified URL.
+
+```sh
+spider-cloud-cli label --url http://example.com
+```
+
+#### Get Crawl State
+
+Get the crawl state of a specified URL.
+
+```sh
+spider-cloud-cli get_crawl_state --url http://example.com
+```
+
+#### Query
+
+Query records of a specified domain.
+
+```sh
+spider-cloud-cli query --domain example.com
+```
+
+#### Get Credits
+
+Fetch the account credits left.
+
+```sh
+spider-cloud-cli get_credits
+```
\ No newline at end of file
diff --git a/book/src/rust/getting-started.md b/book/src/rust/getting-started.md
new file mode 100644
index 0000000..00e9ab5
--- /dev/null
+++ b/book/src/rust/getting-started.md
@@ -0,0 +1,288 @@
+# Getting Started
+
+The Spider Cloud Rust SDK offers a toolkit for straightforward website scraping, crawling at scale, and other utilities like extracting links and taking screenshots, enabling you to collect data formatted for compatibility with language models (LLMs). It features a user-friendly interface for seamless integration with the Spider Cloud API.
+
+## Installation
+
+To use the Spider Cloud Rust SDK, include the following in your `Cargo.toml`:
+
+```toml
+[dependencies]
+spider-client = "0.1"
+```
+
+## Usage
+
+1. Get an API key from [spider.cloud](https://spider.cloud)
+2. Set the API key as an environment variable named `SPIDER_API_KEY` or pass it as an argument when creating an instance of the `Spider` struct.
+
+Here's an example of how to use the SDK:
+
+```rust
+use serde_json::json;
+use spider_client::{RequestParams, RequestType, Spider};
+use std::env;
+
+#[tokio::main]
+async fn main() {
+    // Set the API key as an environment variable
+    env::set_var("SPIDER_API_KEY", "your_api_key");
+
+    // Initialize the Spider with your API key
+    let spider = Spider::new(None).expect("API key must be provided");
+
+    let url = "https://spider.cloud";
+
+    // Scrape a single URL
+    let scraped_data = spider.scrape_url(url, None, false, "application/json").await.expect("Failed to scrape the URL");
+
+    println!("Scraped Data: {:?}", scraped_data);
+
+    // Crawl a website
+    let crawler_params = RequestParams {
+        limit: Some(1),
+        proxy_enabled: Some(true),
+        store_data: Some(false),
+        metadata: Some(false),
+        request: Some(RequestType::Http),
+        ..Default::default()
+    };
+
+    let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL");
+
+    println!("Crawl Result: {:?}", crawl_result);
+}
+```
+
+### Scraping a URL
+
+To scrape data from a single URL:
+
+```rust
+let url = "https://example.com";
+let scraped_data = spider.scrape_url(url, None, false, "application/json").await.expect("Failed to scrape the URL");
+```
+
+### Crawling a Website
+
+To automate crawling a website:
+
+```rust
+let url = "https://example.com";
+let crawl_params = RequestParams {
+    limit: Some(200),
+    request: Some(RequestType::Smart),
+    ..Default::default()
+};
+let crawl_result = spider.crawl_url(url, Some(crawl_params), false, "application/json", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL");
+```
+
+#### Crawl Streaming
+
+Stream crawl the website in chunks to scale with a callback:
+
+```rust
+fn handle_json(json_obj: serde_json::Value) {
+    println!("Received chunk: {:?}", json_obj);
+}
+
+let url = "https://example.com";
+let crawl_params = RequestParams {
+    limit: Some(200),
+    store_data: Some(false),
+    ..Default::default()
+};
+
+spider.crawl_url(
+    url,
+    Some(crawl_params),
+    true,
+    "application/json",
+    Some(handle_json)
+).await.expect("Failed to crawl the URL");
+```
+
+### Search
+
+Perform a search for websites to crawl or gather search results:
+
+```rust
+let query = "a sports website";
+let crawl_params = RequestParams {
+    request: Some(RequestType::Smart),
+    search_limit: Some(5),
+    limit: Some(5),
+    fetch_page_content: Some(true),
+    ..Default::default()
+};
+let crawl_result = spider.search(query, Some(crawl_params), false, "application/json").await.expect("Failed to perform search");
+```
+
+### Retrieving Links from a URL(s)
+
+Extract all links from a specified URL:
+
+```rust
+let url = "https://example.com";
+let links = spider.links(url, None, false, "application/json").await.expect("Failed to retrieve links from URL");
+```
+
+### Transform
+
+Transform HTML to markdown or text lightning fast:
+
+```rust
+let data = vec![json!({"html": "<html><body><h1>Hello world</h1></body></html>"})];
+let params = RequestParams {
+    readability: Some(false),
+    return_format: Some(ReturnFormat::Markdown),
+    ..Default::default()
+};
+let result = spider.transform(data, Some(params), false, "application/json").await.expect("Failed to transform HTML to markdown");
+println!("Transformed Data: {:?}", result);
+```
+
+### Taking Screenshots of a URL(s)
+
+Capture a screenshot of a given URL:
+
+```rust
+let url = "https://example.com";
+let screenshot = spider.screenshot(url, None, false, "application/json").await.expect("Failed to take screenshot of URL");
+```
+
+### Extracting Contact Information
+
+Extract contact details from a specified URL:
+
+```rust
+let url = "https://example.com";
+let contacts = spider.extract_contacts(url, None, false, "application/json").await.expect("Failed to extract contacts from URL");
+println!("Extracted Contacts: {:?}", contacts);
+```
+
+### Labeling Data from a URL(s)
+
+Label the data extracted from a particular URL:
+
+```rust
+let url = "https://example.com";
+let labeled_data = spider.label(url, None, false, "application/json").await.expect("Failed to label data from URL");
+println!("Labeled Data: {:?}", labeled_data);
+```
+
+### Checking Crawl State
+
+You can check the crawl state of a specific URL:
+
+```rust
+let url = "https://example.com";
+let state = spider.get_crawl_state(url, None, false, "application/json").await.expect("Failed to get crawl state for URL");
+println!("Crawl State: {:?}", state);
+```
+
+### Downloading Files
+
+You can download the stored results of a website:
+
+```rust
+let url = "https://example.com";
+let options = hashmap!{
+    "page" => 0,
+    "limit" => 100,
+    "expiresIn" => 3600 // Optional, add if needed
+};
+let response = spider.create_signed_url(Some(url), Some(options)).await.expect("Failed to create signed URL");
+println!("Download URL: {:?}", response);
+```
+
+### Checking Available Credits
+
+You can check the remaining credits on your account:
+
+```rust
+let credits = spider.get_credits().await.expect("Failed to get credits");
+println!("Remaining Credits: {:?}", credits);
+```
+
+### Data Operations
+
+The Spider client can now interact with specific data tables to create, retrieve, and delete data.
+
+#### Retrieve Data from a Table
+
+To fetch data from a specified table by applying query parameters:
+
+```rust
+let table_name = "pages";
+let query_params = RequestParams {
+    limit: Some(20),
+    ..Default::default()
+};
+let response = spider.data_get(table_name, Some(query_params)).await.expect("Failed to retrieve data from table");
+println!("Data from table: {:?}", response);
+```
+
+#### Delete Data from a Table
+
+To delete data from a specified table based on certain conditions:
+
+```rust
+let table_name = "websites";
+let delete_params = RequestParams {
+    domain: Some("www.example.com".to_string()),
+    ..Default::default()
+};
+let response = spider.data_delete(table_name, Some(delete_params)).await.expect("Failed to delete data from table");
+println!("Delete Response: {:?}", response);
+```
+
+## Streaming
+
+If you need to use streaming, set the `stream` parameter to `true` and provide a callback function:
+
+```rust
+fn handle_json(json_obj: serde_json::Value) {
+    println!("Received chunk: {:?}", json_obj);
+}
+
+let url = "https://example.com";
+let crawler_params = RequestParams {
+    limit: Some(1),
+    proxy_enabled: Some(true),
+    store_data: Some(false),
+    metadata: Some(false),
+    request: Some(RequestType::Http),
+    ..Default::default()
+};
+
+spider.links(url, Some(crawler_params), true, "application/json").await.expect("Failed to retrieve links from URL");
+```
+
+## Content-Type
+
+The following Content-type headers are supported using the `content_type` parameter:
+
+- `application/json`
+- `text/csv`
+- `application/xml`
+- `application/jsonl`
+
+```rust
+let url = "https://example.com";
+
+let crawler_params = RequestParams {
+    limit: Some(1),
+    proxy_enabled: Some(true),
+    store_data: Some(false),
+    metadata: Some(false),
+    request: Some(RequestType::Http),
+    ..Default::default()
+};
+
+// Stream JSON lines back to the client
+spider.crawl_url(url, Some(crawler_params), true, "application/jsonl", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL");
+```
+
+## Error Handling
+
+The SDK surfaces errors returned by the Spider Cloud API as `Result` values. If an error occurs during a request, it is propagated to the caller with a descriptive error message.
\ No newline at end of file
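+
+For example, instead of unwrapping with `expect`, you can match on the returned `Result` and handle failures explicitly. This is a minimal sketch reusing the `scrape_url` call shown above; it assumes only that the crate's error type implements `Debug`, which the `expect` examples above already rely on:
+
+```rust
+let url = "https://example.com";
+
+match spider.scrape_url(url, None, false, "application/json").await {
+    Ok(scraped_data) => println!("Scraped Data: {:?}", scraped_data),
+    // The error value carries the descriptive message returned by the API.
+    Err(err) => eprintln!("Failed to scrape {}: {:?}", url, err),
+}
+```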