diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 2ece755..2818450 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -31,7 +31,7 @@ jobs:
           - host: macos-latest
             target: x86_64-apple-darwin
             build: |
-              yarn build --target x86_64-apple-darwin
+              yarn build
               strip -x *.node
           - host: windows-latest
             build: yarn build
diff --git a/Cargo.toml b/Cargo.toml
index 98cad71..07bdec4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,7 +17,7 @@ napi-derive = "2"
 num_cpus = "1.16.0"
 serde = "1"
 serde_json = "1"
-spider = { version = "1.97.13", features = ["napi", "cron", "regex", "cookies", "socks", "chrome", "control", "chrome_intercept", "cache", "openai", "serde", "real_browser" ] }
+spider = { version = "1.98.0", features = ["napi", "cron", "regex", "cookies", "socks", "chrome", "control", "chrome_intercept", "cache", "openai", "serde", "real_browser" ] }
 
 [target.x86_64-unknown-linux-gnu.dependencies]
 openssl-sys = { version = "0.9", features = ["vendored"] }
diff --git a/book/src/website.md b/book/src/website.md
index 04b5f91..707a393 100644
--- a/book/src/website.md
+++ b/book/src/website.md
@@ -36,6 +36,16 @@ const website = new Website("https://choosealicense.com")
   .build();
 ```
 
+### Whitelist
+
+Only crawl specific paths, urls, or patterns with Regex.
+
+```ts
+const website = new Website("https://choosealicense.com")
+  .withWhitelistUrl(["/blog", new RegExp("/books").source, "/resume"])
+  .build();
+```
+
 ### Crons
 
 Setup a cron job that can run at any time in the background using cron-syntax.
diff --git a/package.json b/package.json
index 95b5214..0b28f36 100644
--- a/package.json
+++ b/package.json
@@ -29,9 +29,9 @@
   "repository": "https://github.com/spider-rs/spider-nodejs",
   "devDependencies": {
     "@napi-rs/cli": "^2.18.3",
-    "@types/node": "^20.10.0",
-    "ava": "^5.1.1",
-    "typescript": "^5.3.2"
+    "@types/node": "^20.14.5",
+    "ava": "^6.1.3",
+    "typescript": "^5.4.5"
   },
   "ava": {
     "timeout": "3m"
diff --git a/src/website.rs b/src/website.rs
index 5fc534e..d6526ea 100644
--- a/src/website.rs
+++ b/src/website.rs
@@ -790,7 +790,7 @@ impl Website {
   }
 
   #[napi]
-  /// Regex black list urls from the crawl
+  /// Regex blacklist urls from the crawl
   pub fn with_blacklist_url(&mut self, blacklist_url: Option<Vec<String>>) -> &Self {
     self
       .inner
@@ -809,6 +809,26 @@
     self
   }
 
+  #[napi]
+  /// Regex whitelist urls from the crawl
+  pub fn with_whitelist_url(&mut self, whitelist_url: Option<Vec<String>>) -> &Self {
+    self
+      .inner
+      .configuration
+      .with_whitelist_url(match whitelist_url {
+        Some(v) => {
+          let mut whitelist: Vec<CompactString> = Vec::new();
+          for item in v {
+            whitelist.push(CompactString::new(item));
+          }
+          Some(whitelist)
+        }
+        _ => None,
+      });
+
+    self
+  }
+
   /// Setup cron jobs to run
   #[napi]
   pub fn with_cron(&mut self, cron_str: String, cron_type: Option<String>) -> &Self {
@@ -850,11 +870,7 @@
 
   /// Take screenshots of web pages using chrome.
   #[napi]
-  pub fn with_screenshot(
-    &mut self,
-    env: Env,
-    screenshot_configs: Option<Object>,
-  ) -> &Self {
+  pub fn with_screenshot(&mut self, env: Env, screenshot_configs: Option<Object>) -> &Self {
     use serde_json::Value;
     use spider::configuration::ScreenShotConfig;
     let screenshot_configs: Option<Value> = match screenshot_configs {
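
A minimal usage sketch for the new `withWhitelistUrl` binding introduced in this diff, assuming the published `@spider-rs/spider-rs` package and its existing `crawl` and `getLinks` methods; the camelCase name mirrors the `with_whitelist_url` napi export added in `src/website.rs`.

```ts
import { Website } from "@spider-rs/spider-rs";

// Restrict the crawl to the whitelisted paths/patterns only
// (paths reused from the book example; package name assumed).
const website = new Website("https://choosealicense.com")
  .withWhitelistUrl(["/blog", new RegExp("/books").source, "/resume"])
  .build();

await website.crawl();

// Collected links should be limited to urls matching the whitelist.
console.log(website.getLinks());
```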