diff --git a/Cargo.toml b/Cargo.toml
index a491a78..0429de1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,7 +16,7 @@ napi-derive = "2.14.2"
 num_cpus = "1.16.0"
 serde = "1.0.193"
 serde_json = "1.0.108"
-spider = { version = "1.50.17", features = ["napi", "budget", "cron", "regex", "cookies", "socks", "chrome"] }
+spider = { version = "1.50.18", features = ["napi", "budget", "cron", "regex", "cookies", "socks", "chrome", "control" ] }
 
 [target.x86_64-unknown-linux-gnu.dependencies]
 openssl-sys = { version = "0.9.96", features = ["vendored"] }
diff --git a/__test__/index.spec.ts b/__test__/index.spec.ts
index cead78c..05013af 100644
--- a/__test__/index.spec.ts
+++ b/__test__/index.spec.ts
@@ -3,7 +3,7 @@ import { crawl, Website, Page, type NPage, Cron, pageTitle } from "../index.js";
 
 const TEST_URL = "https://choosealicense.com";
 
-test("crawl shortcut native", async (t) => {
+test("shortcut crawl native", async (t) => {
   const { links, pages } = await crawl(TEST_URL);
 
   t.assert(links.length > 1, "should be more than one link");
@@ -194,3 +194,17 @@ test("new website data store and export", async (t) => {
   t.assert(!!data, "should contain valid json file");
 });
 
+test("new website stop background", async (t) => {
+  const website = new Website(TEST_URL);
+
+  const onPageEvent = (_err: Error | null, page: NPage) => {
+    if (website.size >= 8) {
+      website.stop();
+    }
+  };
+
+  await website.crawl(onPageEvent);
+
+  t.assert(website.size < 15, "should only have crawled a couple pages concurrently");
+});
+
diff --git a/book/src/website.md b/book/src/website.md
index 0af733e..474620a 100644
--- a/book/src/website.md
+++ b/book/src/website.md
@@ -207,3 +207,21 @@ await website.crawl(onPageEvent);
 // we only have one export method atm. Optional file path. All data by default goes to storage
 await website.exportJsonlData("./storage/test.jsonl");
 ```
+
+## Stop crawl
+
+To stop a crawl, use `website.stop(id)`. Pass in the crawl id to stop a specific run, or leave it empty to stop all crawls.
+
+```ts
+const website = new Website("https://choosealicense.com");
+
+const onPageEvent = (_err, page) => {
+  console.log(page)
+  // stop the concurrent crawl when 8 pages are found.
+  if (website.size >= 8) {
+    website.stop();
+  }
+};
+
+await website.crawl(onPageEvent);
+```
diff --git a/index.d.ts b/index.d.ts
index aa5dc4c..bf85c6c 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -21,6 +21,13 @@ export function crawl(url: string, rawContent?: boolean | undefined | null): Promise<NWebsite>
 export interface PageEvent {
   page: NPage
 }
+/** website main data from rust to node. */
+export class NWebsite {
+  /** all of the website links. */
+  links: Array<string>
+  /** the pages found. */
+  pages: Array<NPage>
+}
 /** a simple page object */
 export class Page {
   /** the url for the page */
@@ -55,14 +62,18 @@ export class Website {
   subscribe(onPageEvent: (err: Error | null, value: NPage) => any): number
   /** remove a subscription listener. */
   unsubscribe(id?: number | undefined | null): boolean
+  /** stop a crawl */
+  stop(id?: number | undefined | null): Promise<boolean>
   /** crawl a website */
   crawl(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise<void>
   /** scrape a website */
   scrape(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise<void>
-  /** run the cron */
+  /** run a cron job */
   runCron(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null): Promise<Cron>
   /** get all the links of a website */
   getLinks(): Array<string>
+  /** get the size of the website as the amount of pages crawled. If you ran the crawl in the background, this value will not update. */
+  get size(): number
   /** get all the pages of a website - requires calling website.scrape */
   getPages(): Array<NPage>
   /** drain all links from storing */
@@ -103,10 +114,3 @@ export class Cron {
   /** stop the cron instance */
   stop(): Promise<void>
 }
-/** website main data from rust to node. */
-export class NWebsite {
-  /** all of the website links. */
-  links: Array<string>
-  /** the pages found. */
-  pages: Array<NPage>
-}
diff --git a/index.js b/index.js
index d338702..09dcf5a 100644
--- a/index.js
+++ b/index.js
@@ -252,11 +252,11 @@ if (!nativeBinding) {
   throw new Error(`Failed to load native binding`)
 }
 
-const { pageTitle, Page, crawl, Website, Cron, NWebsite } = nativeBinding
+const { pageTitle, NWebsite, Page, crawl, Website, Cron } = nativeBinding
 
 module.exports.pageTitle = pageTitle
+module.exports.NWebsite = NWebsite
 module.exports.Page = Page
 module.exports.crawl = crawl
 module.exports.Website = Website
 module.exports.Cron = Cron
-module.exports.NWebsite = NWebsite
diff --git a/src/website.rs b/src/website.rs
index cc26346..6c89e6d 100644
--- a/src/website.rs
+++ b/src/website.rs
@@ -76,8 +76,10 @@ fn object_to_u8(
 pub struct Website {
   /// the website from spider.
   inner: spider::website::Website,
-  /// spawn subscription handles.
+  /// spawned subscription handles.
   subscription_handles: IndexMap<u32, JoinHandle<()>>,
+  /// spawned crawl handles.
+  crawl_handles: IndexMap<u32, JoinHandle<()>>,
   /// do not convert content to UT8.
   raw_content: bool,
   /// the dataset collected
@@ -98,6 +100,7 @@ impl Website {
     Website {
       inner: spider::website::Website::new(&url),
       subscription_handles: IndexMap::new(),
+      crawl_handles: IndexMap::new(),
       raw_content: raw_content.unwrap_or_default(),
       collected_data: Box::new(Vec::new()),
       // file_handle: None,
@@ -232,6 +235,34 @@
     }
   }
 
+  #[napi]
+  /// stop a crawl
+  pub async unsafe fn stop(&mut self, id: Option<u32>) -> bool {
+    self.inner.stop();
+
+    match id {
+      Some(id) => {
+        let handle = self.crawl_handles.get(&id);
+
+        match handle {
+          Some(h) => {
+            h.abort();
+            self.crawl_handles.remove_entry(&id);
+            true
+          }
+          _ => false,
+        }
+      }
+      _ => {
+        let keys = self.crawl_handles.len();
+        for k in self.crawl_handles.drain(..) {
+          k.1.abort();
+        }
+        keys > 0
+      }
+    }
+  }
+
   #[napi]
   /// crawl a website
   pub async unsafe fn crawl(
@@ -264,7 +295,12 @@
           }
         });
 
-        spider::tokio::spawn(async move {
+        let crawl_id = match self.crawl_handles.last() {
+          Some(handle) => handle.0 + 1,
+          _ => 0,
+        };
+
+        let crawl_handle = spider::tokio::spawn(async move {
           if headless {
             website.crawl().await;
           } else {
@@ -277,6 +313,7 @@
           _ => 0,
         };
 
+        self.crawl_handles.insert(crawl_id, crawl_handle);
         self.subscription_handles.insert(id, handle);
       } else {
         let mut rx2 = self
@@ -345,7 +382,12 @@
           }
         });
 
-        spider::tokio::spawn(async move {
+        let crawl_id = match self.crawl_handles.last() {
+          Some(handle) => handle.0 + 1,
+          _ => 0,
+        };
+
+        let crawl_handle = spider::tokio::spawn(async move {
           if headless {
             website.scrape().await;
           } else {
@@ -358,6 +400,7 @@
           _ => 0,
         };
 
+        self.crawl_handles.insert(crawl_id, crawl_handle);
         self.subscription_handles.insert(id, handle);
       } else {
         let mut rx2 = self
@@ -398,7 +441,7 @@
     }
   }
 
-  /// run the cron
+  /// run a cron job
   #[napi]
   pub async unsafe fn run_cron(
     &mut self,
@@ -443,6 +486,12 @@
     links
   }
 
+  #[napi(getter)]
+  /// get the size of the website as the amount of pages crawled. If you ran the crawl in the background, this value will not update.
+  pub fn size(&mut self) -> u32 {
+    self.inner.size() as u32
+  }
+
   /// get all the pages of a website - requires calling website.scrape
   #[napi]
   pub fn get_pages(&self) -> Vec<NPage> {
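
Taken together, the patch exposes a `stop(id?)` method and a read-only `size` getter on `Website`. Below is a minimal usage sketch, not part of the patch itself, that mirrors the new test and the book example; the target URL, the 8-page threshold, and the relative import path are illustrative only:

```ts
import { Website, type NPage } from "../index.js";

const website = new Website("https://choosealicense.com");

const onPageEvent = (_err: Error | null, page: NPage) => {
  console.log(page);
  if (website.size >= 8) {
    // calling stop() with no id aborts all active crawl handles; passing an id targets a single run.
    website.stop();
  }
};

// foreground crawl so `size` updates as pages stream in.
await website.crawl(onPageEvent);

console.log(`stopped after ${website.size} pages`);
```

Per the new doc comment, `size` does not update when the crawl runs in the background, so a threshold check like the one above only applies to foreground crawls; `stop()` itself resolves to a boolean indicating whether any crawl handle was aborted.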