Skip to content

Commit

Permalink
feat(whitelist): add whitelist only paths
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Jun 19, 2024
1 parent b509bbb commit 5d16991
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
- host: macos-latest
target: x86_64-apple-darwin
build: |
yarn build --target x86_64-apple-darwin
yarn build
strip -x *.node
- host: windows-latest
build: yarn build
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ napi-derive = "2"
num_cpus = "1.16.0"
serde = "1"
serde_json = "1"
spider = { version = "1.97.13", features = ["napi", "cron", "regex", "cookies", "socks", "chrome", "control", "chrome_intercept", "cache", "openai", "serde", "real_browser" ] }
spider = { version = "1.98.0", features = ["napi", "cron", "regex", "cookies", "socks", "chrome", "control", "chrome_intercept", "cache", "openai", "serde", "real_browser" ] }

[target.x86_64-unknown-linux-gnu.dependencies]
openssl-sys = { version = "0.9", features = ["vendored"] }
Expand Down
10 changes: 10 additions & 0 deletions book/src/website.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ const website = new Website("https://choosealicense.com")
.build();
```

### Whitelist

Only crawl URLs that match the given paths, URLs, or regex patterns.

```ts
const website = new Website("https://choosealicense.com")
.withWhitelistUrl(["/blog", new RegExp("/books").source, "/resume"])
.build();
```

### Crons

Setup a cron job that can run at any time in the background using cron-syntax.
Expand Down
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
"repository": "https://github.com/spider-rs/spider-nodejs",
"devDependencies": {
"@napi-rs/cli": "^2.18.3",
"@types/node": "^20.10.0",
"ava": "^5.1.1",
"typescript": "^5.3.2"
"@types/node": "^20.14.5",
"ava": "^6.1.3",
"typescript": "^5.4.5"
},
"ava": {
"timeout": "3m"
Expand Down
28 changes: 22 additions & 6 deletions src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -790,7 +790,7 @@ impl Website {
}

#[napi]
/// Regex black list urls from the crawl
/// Regex blacklist urls from the crawl
pub fn with_blacklist_url(&mut self, blacklist_url: Option<Vec<String>>) -> &Self {
self
.inner
Expand All @@ -809,6 +809,26 @@ impl Website {
self
}

#[napi]
/// Regex whitelist urls for the crawl: only matching paths, urls, or patterns are crawled.
///
/// Every entry is converted to a `CompactString` and handed to the inner
/// configuration's `with_whitelist_url`. Passing `None` forwards `None`
/// unchanged. Returns `self` so that builder calls can be chained.
pub fn with_whitelist_url(&mut self, whitelist_url: Option<Vec<String>>) -> &Self {
  // Convert in a single pass; `collect` pre-sizes the vector from the
  // iterator's size hint instead of growing it push by push.
  let whitelist: Option<Vec<CompactString>> =
    whitelist_url.map(|urls| urls.into_iter().map(CompactString::new).collect());

  self.inner.configuration.with_whitelist_url(whitelist);

  self
}

/// Setup cron jobs to run
#[napi]
pub fn with_cron(&mut self, cron_str: String, cron_type: Option<String>) -> &Self {
Expand Down Expand Up @@ -850,11 +870,7 @@ impl Website {

/// Take screenshots of web pages using chrome.
#[napi]
pub fn with_screenshot(
&mut self,
env: Env,
screenshot_configs: Option<napi::JsObject>,
) -> &Self {
pub fn with_screenshot(&mut self, env: Env, screenshot_configs: Option<napi::JsObject>) -> &Self {
use serde_json::Value;
use spider::configuration::ScreenShotConfig;
let screenshot_configs: Option<Value> = match screenshot_configs {
Expand Down

0 comments on commit 5d16991

Please sign in to comment.