Commit

chore(website): add background stop support
j-mendez committed Dec 4, 2023
1 parent 368bbe3 commit 079edbc
Showing 2 changed files with 100 additions and 16 deletions.
33 changes: 30 additions & 3 deletions __test__/index.spec.ts
@@ -194,12 +194,12 @@ test("new website data store and export", async (t) => {
t.assert(!!data, "should contain valid json file");
});

test("new website stop background", async (t) => {
test("new website stop", async (t) => {
const website = new Website(TEST_URL);

const onPageEvent = (_err: Error | null, page: NPage) => {
const onPageEvent = async (_err: Error | null, page: NPage) => {
if (website.size >= 8) {
website.stop();
await website.stop();
}
};

@@ -208,3 +208,30 @@ test("new website stop background", async (t) => {
t.assert(website.size < 15, "should only have crawled a couple pages concurrently");
});

test("new website stop background", async (t) => {
const sleep = (time: number) => {
return new Promise((resolve) => {
setTimeout(() => {
resolve(true);
}, time);
});
};

const website = new Website(TEST_URL);
let count = 0;

const onPageEvent = async (_err: Error | null, page: NPage) => {
if (count) {
await website.stop();
}
count++;
};

// wait for the other tests to finish first, since a background stop shuts down all crawls matching the url
await sleep(2000);
await website.crawl(onPageEvent, true);
await sleep(2000);

t.assert(count < 15, "should only have crawled a couple pages concurrently in the background");
});
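
A minimal consumer-side sketch of what the new test exercises: a crawl started with the background flag and stopped from the page callback. Only `Website`, `size`, `crawl(onPageEvent, true)`, and `await stop()` come from the tests above; the import path and URL are illustrative assumptions.

```ts
import { Website, type NPage } from "@spider-rs/spider-rs"; // assumed package name

const website = new Website("https://example.com"); // placeholder url

const onPageEvent = async (_err: Error | null, _page: NPage) => {
  // once a handful of pages have streamed in, stop the crawl;
  // for a background crawl this also shuts down the spawned task for the url
  if (website.size >= 8) {
    await website.stop();
  }
};

// the second argument runs the crawl in the background, as in the test above
await website.crawl(onPageEvent, true);
```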

83 changes: 70 additions & 13 deletions src/website.rs
@@ -5,6 +5,7 @@ use napi::{
bindgen_prelude::{Buffer, Object},
tokio::task::JoinHandle,
};
use spider::utils::shutdown;
use std::time::Duration;

/// build an object to jsonl - can be switched between json with changes
@@ -82,8 +83,11 @@ pub struct Website {
crawl_handles: IndexMap<u32, JoinHandle<()>>,
/// do not convert content to UTF8.
raw_content: bool,
/// the dataset collected
collected_data: Box<Vec<u8>>, // /// the file handle for storing data
/// the data collected.
collected_data: Box<Vec<u8>>,
/// is the crawl running in the background.
running_in_background: bool
// /// the file handle for storing data
// file_handle: Option<spider::tokio::fs::File>,
}

@@ -103,6 +107,7 @@ impl Website {
crawl_handles: IndexMap::new(),
raw_content: raw_content.unwrap_or_default(),
collected_data: Box::new(Vec::new()),
running_in_background: false
// file_handle: None,
}
}
@@ -239,7 +244,14 @@ impl Website {
/// stop a crawl
pub async unsafe fn stop(&mut self, id: Option<u32>) -> bool {
self.inner.stop();


// prevent the last background run
if self.running_in_background {
// we may want IDs to be used as an option along with urls for complete shutdowns.
shutdown(self.inner.get_domain().inner()).await;
self.running_in_background = false;
}

match id {
Some(id) => {
let handle = self.crawl_handles.get(&id);
@@ -262,21 +274,23 @@
}
}
}

#[napi]
/// crawl a website
pub async unsafe fn crawl(
&mut self,
on_page_event: Option<napi::threadsafe_function::ThreadsafeFunction<NPage>>,
// run the crawl in the background
background: Option<bool>,
// headless chrome rendering
headless: Option<bool>,
) {
// only run in background if on_page_event is handled for streaming.
let background = background.is_some() && background.unwrap_or_default();
let headless = headless.is_some() && headless.unwrap_or_default();
let raw_content = self.raw_content;

if background {
self.running_in_background = background;
}

match on_page_event {
Some(callback) => {
@@ -345,10 +359,29 @@
}
}
_ => {
if headless {
self.inner.crawl().await;
if background {
let mut website = self.inner.clone();

let crawl_id = match self.crawl_handles.last() {
Some(handle) => handle.0 + 1,
_ => 0,
};

let crawl_handle = spider::tokio::spawn(async move {
if headless {
website.crawl().await;
} else {
website.crawl_raw().await;
}
});

self.crawl_handles.insert(crawl_id, crawl_handle);
} else {
self.inner.crawl_raw().await;
if headless {
self.inner.crawl().await;
} else {
self.inner.crawl_raw().await;
}
}
}
}
@@ -364,10 +397,15 @@
) {
let headless = headless.is_some() && headless.unwrap_or_default();
let raw_content = self.raw_content;
let background = background.is_some() && background.unwrap_or_default();

if background {
self.running_in_background = background;
}

match on_page_event {
Some(callback) => {
if background.unwrap_or_default() {
if background {
let mut website = self.inner.clone();
let mut rx2 = website
.subscribe(*BUFFER / 2)
@@ -432,10 +470,29 @@
}
}
_ => {
if headless {
self.inner.scrape().await;
if background {
let mut website = self.inner.clone();

let crawl_id = match self.crawl_handles.last() {
Some(handle) => handle.0 + 1,
_ => 0,
};

let crawl_handle = spider::tokio::spawn(async move {
if headless {
website.scrape().await;
} else {
website.scrape_raw().await;
}
});

self.crawl_handles.insert(crawl_id, crawl_handle);
} else {
self.inner.scrape_raw().await;
if headless {
self.inner.scrape().await;
} else {
self.inner.scrape_raw().await;
}
}
}
}
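
In consumer terms, the Rust change above means a crawl or scrape invoked with `background` set now runs on a spawned tokio task tracked in `crawl_handles`, and `stop()` additionally calls `spider::utils::shutdown` for the site's url so the background task does not keep going. A hedged JavaScript sketch of the scrape path follows; the `scrape(onPageEvent, background, headless)` argument order is inferred from the Rust signature above, and the import path is an assumption.

```ts
import { Website } from "@spider-rs/spider-rs"; // assumed package name

const website = new Website("https://example.com"); // placeholder url

// no page callback: this takes the `_ =>` branch added in this commit, which
// spawns the scrape on a background task when `background` is true
await website.scrape(undefined, true, false);

// stop() ends the run and, because it was started in the background,
// also issues a shutdown for every crawl matching this url
await website.stop();
```

Per the `stop(&mut self, id: Option<u32>)` signature, passing an id should target a single stored crawl handle instead of the whole site.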
