Skip to content

Commit

Permalink
feat(openai): add openai flag dynamic js snippets
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Mar 20, 2024
1 parent 473ed26 commit 367d585
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 18 deletions.
10 changes: 6 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.27"
description = "The fastest web crawler written in Rust ported to nodejs."
repository = "https://github.com/spider-rs/spider-nodejs"
version = "0.0.28"
description = "The fastest web crawler written in Rust ported to Python."
repository = "https://github.com/spider-rs/spider-py"
license = "MIT"

[lib]
crate-type = ["cdylib"]

[dependencies]
indexmap = "2.1.0"
num_cpus = "1.16.0"
spider = { version = "1.85.4", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache" ] }
spider = { version = "1.86.8", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache", "serde", "openai" ] }
pyo3 = { version = "0.20.3", features = ["extension-module"] }
pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
serde_json = "1.0.114"

[target.x86_64-unknown-linux-gnu.dependencies]
openssl-sys = { version = "0.9.96", features = ["vendored"] }
Expand Down
41 changes: 41 additions & 0 deletions book/src/website.md
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,47 @@ async def main():
asyncio.run(main())
```
### OpenAI
Use OpenAI to generate dynamic scripts to use with headless. Make sure to set the `OPENAI_API_KEY` env variable.
```py
import asyncio
from spider_rs import Website

async def main():
website = Website("https://choosealicense.com").with_openai({ "model": "gpt-3.5-turbo", "prompt": "Search for movies", "maxTokens": 300 })

asyncio.run(main())
```
### Screenshots
Take a screenshot of the pages on crawl when using headless chrome.
```py
import asyncio
from spider_rs import Website

async def main():
website = (
Website("https://choosealicense.com", False)
.with_screenshot({
"params": {
"cdp_params": None,
"full_page": True,
"omit_background": False
},
"bytes": False,
"save": True,
"output_dir": None
})
)

asyncio.run(main())
```
### Http2 Prior Knowledge
Use http2 to connect if you know the website server supports this.
Expand Down
9 changes: 7 additions & 2 deletions examples/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,13 @@
from spider_rs import Website

async def main():
    # Build the crawler; the second argument (False) disables headless chrome.
    # NOTE: the old `with_agent` call was renamed to `with_user_agent`.
    website = (
        Website("https://choosealicense.com", False)
        .with_user_agent("BotBot")
        .with_headers({"authorization": "Something "})
    )
    website.crawl()
    print(website.get_links())

asyncio.run(main())
25 changes: 25 additions & 0 deletions examples/screenshot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import asyncio

from spider_rs import Website

async def main():
    # Crawl with headless chrome (second arg False keeps it non-headless-strict)
    # and capture a full-page screenshot of each page, saved to the default
    # output directory rather than returned as bytes.
    screenshot_config = {
        "params": {
            "cdp_params": None,
            "full_page": True,
            "omit_background": False,
        },
        "bytes": False,
        "save": True,
        "output_dir": None,
    }

    site = Website("https://choosealicense.com", False)
    site = site.with_user_agent("BotBot")
    site = site.with_headers({"authorization": "Something "})
    site = site.with_screenshot(screenshot_config)

    site.crawl()
    print(site.get_links())

asyncio.run(main())
63 changes: 51 additions & 12 deletions src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -741,24 +741,63 @@ impl Website {
slf
}

/// Take a screenshot of the page when using chrome.
///
/// `screenshot_configs` is expected to be a JSON string matching
/// `spider::configuration::ScreenShotConfig`. Invalid input is logged
/// and ignored instead of panicking the Python caller.
pub fn with_screenshot<'a>(
    mut slf: PyRefMut<'a, Self>,
    screenshot_configs: Option<&'a PyAny>,
) -> PyRefMut<'a, Self> {
    if let Some(py_obj) = screenshot_configs {
        // Avoid `unwrap`: a non-string Python object (e.g. a dict passed
        // directly) would otherwise abort with a panic. Log and skip instead.
        match py_obj.extract::<String>() {
            Ok(config_json) => {
                match serde_json::from_str::<spider::configuration::ScreenShotConfig>(&config_json)
                {
                    Ok(configs) => {
                        slf.inner.with_screenshot(Some(configs));
                    }
                    Err(e) => {
                        spider::utils::log("", e.to_string());
                    }
                }
            }
            Err(e) => {
                spider::utils::log("", e.to_string());
            }
        }
    }

    slf
}

/// Use OpenAI to generate dynamic javascript snippets. Make sure to set the `OPENAI_API_KEY` env variable.
///
/// `openai_configs` is expected to be a JSON string matching
/// `spider::configuration::GPTConfigs`. Invalid input is logged
/// and ignored instead of panicking the Python caller.
pub fn with_openai<'a>(
    mut slf: PyRefMut<'a, Self>,
    openai_configs: Option<&'a PyAny>,
) -> PyRefMut<'a, Self> {
    if let Some(py_obj) = openai_configs {
        // Avoid `unwrap`: a non-string Python object would otherwise panic.
        // Mirror the serde error path and log the extraction failure instead.
        match py_obj.extract::<String>() {
            Ok(config_json) => {
                match serde_json::from_str::<spider::configuration::GPTConfigs>(&config_json) {
                    Ok(configs) => {
                        slf.inner.with_openai(Some(configs));
                    }
                    Err(e) => {
                        spider::utils::log("", e.to_string());
                    }
                }
            }
            Err(e) => {
                spider::utils::log("", e.to_string());
            }
        }
    }

    slf
}

/// Regex black list urls from the crawl
pub fn with_blacklist_url(
mut slf: PyRefMut<'_, Self>,
blacklist_url: Option<Vec<String>>,
) -> PyRefMut<'_, Self> {
slf
.inner
.configuration
.with_blacklist_url(match blacklist_url {
Some(v) => {
let mut blacklist: Vec<CompactString> = Vec::new();
for item in v {
blacklist.push(CompactString::new(item));
}
Some(blacklist)
slf.inner.with_blacklist_url(match blacklist_url {
Some(v) => {
let mut blacklist: Vec<CompactString> = Vec::new();
for item in v {
blacklist.push(CompactString::new(item));
}
_ => None,
});
Some(blacklist)
}
_ => None,
});

slf
}
Expand Down

0 comments on commit 367d585

Please sign in to comment.