feat(website): add crawl stop

j-mendez committed Dec 4, 2023
1 parent d890b40 commit 9b32f01
Showing 6 changed files with 101 additions and 16 deletions.
Cargo.toml (2 changes: 1 addition & 1 deletion)
@@ -16,7 +16,7 @@ napi-derive = "2.14.2"
num_cpus = "1.16.0"
serde = "1.0.193"
serde_json = "1.0.108"
spider = { version = "1.50.17", features = ["napi", "budget", "cron", "regex", "cookies", "socks", "chrome"] }
spider = { version = "1.50.18", features = ["napi", "budget", "cron", "regex", "cookies", "socks", "chrome", "control" ] }

[target.x86_64-unknown-linux-gnu.dependencies]
openssl-sys = { version = "0.9.96", features = ["vendored"] }
__test__/index.spec.ts (16 changes: 15 additions & 1 deletion)
@@ -3,7 +3,7 @@ import { crawl, Website, Page, type NPage, Cron, pageTitle } from "../index.js";

const TEST_URL = "https://choosealicense.com";

test("crawl shortcut native", async (t) => {
test("shortcut crawl native", async (t) => {
const { links, pages } = await crawl(TEST_URL);

t.assert(links.length > 1, "should be more than one link");
@@ -194,3 +194,17 @@ test("new website data store and export", async (t) => {
t.assert(!!data, "should contain valid json file");
});

test("new website stop background", async (t) => {
const website = new Website(TEST_URL);

const onPageEvent = (_err: Error | null, page: NPage) => {
if (website.size >= 8) {
website.stop();
}
};

await website.crawl(onPageEvent);

t.assert(website.size < 15, "should only have crawled a couple pages concurrently");
});

book/src/website.md (18 changes: 18 additions & 0 deletions)
@@ -207,3 +207,21 @@ await website.crawl(onPageEvent);
// we only have one export method atm. Optional file path. All data by default goes to storage
await website.exportJsonlData("./storage/test.jsonl");
```

## Stop crawl

To stop a crawl, call `website.stop(id)`. Pass the crawl id to stop a specific run, or leave it empty to stop all active crawls.

```ts
const website = new Website("https://choosealicense.com");

const onPageEvent = (_err, page) => {
console.log(page)
// stop the concurrent crawl when 8 pages are found.
if (website.size >= 8) {
website.stop();
}
};

await website.crawl(onPageEvent);
```
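
When a crawl runs in the background, `stop` can also be called from outside the page callback. The snippet below is a minimal sketch based on the `crawl(onPageEvent, background)` and `stop(id)` signatures in `index.d.ts`; background mode is assumed to return immediately while the crawl continues.

```ts
const website = new Website("https://choosealicense.com");

// background mode: the promise is assumed to resolve while the crawl keeps running.
await website.crawl(undefined, true);

setTimeout(async () => {
  // no id: stop every active run for this instance.
  // pass a crawl id instead to stop a single run.
  await website.stop();
}, 5000);
```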
index.d.ts (20 changes: 12 additions & 8 deletions)
@@ -21,6 +21,13 @@ export function crawl(url: string, rawContent?: boolean | undefined | null): Pro
export interface PageEvent {
page: NPage
}
/** website main data from rust to node. */
export class NWebsite {
/** all of the website links. */
links: Array<string>
/** the pages found. */
pages: Array<NPage>
}
/** a simple page object */
export class Page {
/** the url for the page */
@@ -55,14 +62,18 @@ export class Website {
subscribe(onPageEvent: (err: Error | null, value: NPage) => any): number
/** remove a subscription listener. */
unsubscribe(id?: number | undefined | null): boolean
/** stop a crawl */
stop(id?: number | undefined | null): Promise<boolean>
/** crawl a website */
crawl(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise<void>
/** scrape a website */
scrape(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise<void>
/** run the cron */
/** run a cron job */
runCron(onPageEvent?: (err: Error | null, value: NPage) => any | undefined | null): Promise<Cron>
/** get all the links of a website */
getLinks(): Array<string>
/** get the size of the website as the number of pages crawled. If the crawl was run in the background, this value will not update. */
get size(): number
/** get all the pages of a website - requires calling website.scrape */
getPages(): Array<NPage>
/** drain all links from storing */
@@ -103,10 +114,3 @@
/** stop the cron instance */
stop(): Promise<void>
}
/** website main data from rust to node. */
export class NWebsite {
/** all of the website links. */
links: Array<string>
/** the pages found. */
pages: Array<NPage>
}
index.js (4 changes: 2 additions & 2 deletions)
@@ -252,11 +252,11 @@ if (!nativeBinding) {
throw new Error(`Failed to load native binding`)
}

const { pageTitle, Page, crawl, Website, Cron, NWebsite } = nativeBinding
const { pageTitle, NWebsite, Page, crawl, Website, Cron } = nativeBinding

module.exports.pageTitle = pageTitle
module.exports.NWebsite = NWebsite
module.exports.Page = Page
module.exports.crawl = crawl
module.exports.Website = Website
module.exports.Cron = Cron
module.exports.NWebsite = NWebsite
src/website.rs (57 changes: 53 additions & 4 deletions)
@@ -76,8 +76,10 @@ fn object_to_u8(
pub struct Website {
/// the website from spider.
inner: spider::website::Website,
/// spawn subscription handles.
/// spawned subscription handles.
subscription_handles: IndexMap<u32, JoinHandle<()>>,
/// spawned crawl handles.
crawl_handles: IndexMap<u32, JoinHandle<()>>,
/// do not convert content to UTF-8.
raw_content: bool,
/// the dataset collected
@@ -98,6 +100,7 @@ impl Website {
Website {
inner: spider::website::Website::new(&url),
subscription_handles: IndexMap::new(),
crawl_handles: IndexMap::new(),
raw_content: raw_content.unwrap_or_default(),
collected_data: Box::new(Vec::new()),
// file_handle: None,
@@ -232,6 +235,34 @@
}
}

#[napi]
/// stop a crawl
pub async unsafe fn stop(&mut self, id: Option<u32>) -> bool {
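// signal the inner spider instance to stop, then abort the spawned crawl
// task(s): the matching handle when an id is given, every handle otherwise.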
self.inner.stop();

match id {
Some(id) => {
let handle = self.crawl_handles.get(&id);

match handle {
Some(h) => {
h.abort();
self.crawl_handles.remove_entry(&id);
true
}
_ => false,
}
}
_ => {
let keys = self.crawl_handles.len();
for k in self.crawl_handles.drain(..) {
k.1.abort();
}
keys > 0
}
}
}

#[napi]
/// crawl a website
pub async unsafe fn crawl(
@@ -264,7 +295,12 @@
}
});

spider::tokio::spawn(async move {
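// derive the next crawl id from the most recently spawned handle (0 for the first crawl).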
let crawl_id = match self.crawl_handles.last() {
Some(handle) => handle.0 + 1,
_ => 0,
};

let crawl_handle = spider::tokio::spawn(async move {
if headless {
website.crawl().await;
} else {
@@ -277,6 +313,7 @@
_ => 0,
};

self.crawl_handles.insert(crawl_id, crawl_handle);
self.subscription_handles.insert(id, handle);
} else {
let mut rx2 = self
@@ -345,7 +382,12 @@
}
});

spider::tokio::spawn(async move {
let crawl_id = match self.crawl_handles.last() {
Some(handle) => handle.0 + 1,
_ => 0,
};

let crawl_handle = spider::tokio::spawn(async move {
if headless {
website.scrape().await;
} else {
Expand All @@ -358,6 +400,7 @@ impl Website {
_ => 0,
};

self.crawl_handles.insert(crawl_id, crawl_handle);
self.subscription_handles.insert(id, handle);
} else {
let mut rx2 = self
@@ -398,7 +441,7 @@
}
}

/// run the cron
/// run a cron job
#[napi]
pub async unsafe fn run_cron(
&mut self,
@@ -443,6 +486,12 @@
links
}

#[napi(getter)]
/// get the size of the website as the number of pages crawled. If the crawl was run in the background, this value will not update.
pub fn size(&mut self) -> u32 {
self.inner.size() as u32
}

/// get all the pages of a website - requires calling website.scrape
#[napi]
pub fn get_pages(&self) -> Vec<NPage> {
