chore(website): add crawl method class
j-mendez committed Dec 8, 2023
1 parent 139758e commit 1e43a5a
Showing 2 changed files with 114 additions and 134 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.0"
version = "0.0.1"
description = "The fastest web crawler written in Rust ported to nodejs."
repository = "https://github.com/spider-rs/spider-nodejs"

246 changes: 113 additions & 133 deletions src/website.rs
@@ -199,141 +199,121 @@ impl Website {
}
}

// /// crawl a website
// pub async unsafe fn crawl(
// &mut self,
// // on_page_event: Option<napi::threadsafe_function::ThreadsafeFunction<NPage>>,
// background: Option<bool>,
// headless: Option<bool>,
// ) {
// // only run in background if on_page_event is handled for streaming.
// let background = background.is_some() && background.unwrap_or_default();
// let headless = headless.is_some() && headless.unwrap_or_default();
// // let raw_content = self.raw_content;

// if background {
// self.running_in_background = background;
// }

// if background {
// let mut website = self.inner.clone();

// let crawl_id = match self.crawl_handles.last() {
// Some(handle) => handle.0 + 1,
// _ => 0,
// };

// let crawl_handle = spider::tokio::spawn(async move {
// if headless {
// website.crawl().await;
// } else {
// website.crawl_raw().await;
// }
// });

// self.crawl_handles.insert(crawl_id, crawl_handle);
// } else {
// if headless {
// self.inner.crawl().await;
// } else {
// self.inner.crawl_raw().await;
// }
// }

// // match on_page_event {
// // Some(callback) => {
// // if background {
// // let mut website = self.inner.clone();
// // let mut rx2 = website
// // .subscribe(*BUFFER / 2)
// // .expect("sync feature should be enabled");

// // let handle = spider::tokio::spawn(async move {
// // while let Ok(res) = rx2.recv().await {
// // callback.call(
// // Ok(NPage::new(&res, raw_content)),
// // napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking,
// // );
// // }
// // });

// // let crawl_id = match self.crawl_handles.last() {
// // Some(handle) => handle.0 + 1,
// // _ => 0,
// // };

// // let crawl_handle = spider::tokio::spawn(async move {
// // if headless {
// // website.crawl().await;
// // } else {
// // website.crawl_raw().await;
// // }
// // });

// // let id = match self.subscription_handles.last() {
// // Some(handle) => handle.0 + 1,
// // _ => 0,
// // };

// // self.crawl_handles.insert(crawl_id, crawl_handle);
// // self.subscription_handles.insert(id, handle);
// // } else {
// // let mut rx2 = self
// // .inner
// // .subscribe(*BUFFER / 2)
// // .expect("sync feature should be enabled");

// // let handle = spider::tokio::spawn(async move {
// // while let Ok(res) = rx2.recv().await {
// // callback.call(
// // Ok(NPage::new(&res, raw_content)),
// // napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking,
// // );
// // }
// // });

// // if headless {
// // self.inner.crawl().await;
// // } else {
// // self.inner.crawl_raw().await;
// // }

// // let id = match self.subscription_handles.last() {
// // Some(handle) => handle.0 + 1,
// // _ => 0,
// // };

// // self.subscription_handles.insert(id, handle);
// // }
// // }
// // _ => {
// // if background {
// // let mut website = self.inner.clone();

// // let crawl_id = match self.crawl_handles.last() {
// // Some(handle) => handle.0 + 1,
// // _ => 0,
// // };

// // let crawl_handle = spider::tokio::spawn(async move {
// // if headless {
// // website.crawl().await;
// // } else {
// // website.crawl_raw().await;
// // }
// // });

// // self.crawl_handles.insert(crawl_id, crawl_handle);
// // } else {
// // if headless {
// // self.inner.crawl().await;
// // } else {
// // self.inner.crawl_raw().await;
// // }
// // }
// // }
// // }
// }

/// crawl a website
pub fn crawl(
mut slf: PyRefMut<'_, Self>,
on_page_event: Option<PyObject>,
background: Option<bool>,
headless: Option<bool>,
) {
// only run in background if on_page_event is handled for streaming.
let background = background.is_some() && background.unwrap_or_default();
let headless = headless.is_some() && headless.unwrap_or_default();
let raw_content = slf.raw_content;

if background {
slf.running_in_background = background;
}

match on_page_event {
Some(callback) => {
if background {
let mut website = slf.inner.clone();
let mut rx2 = website
.subscribe(*BUFFER / 2)
.expect("sync feature should be enabled");

let handle = spider::tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
let page = NPage::new(&res, raw_content);
Python::with_gil(|py| {
let _ = callback.call(py, (page, 0), None);
});
}
});

let crawl_id = match slf.crawl_handles.last() {
Some(handle) => handle.0 + 1,
_ => 0,
};

let crawl_handle = spider::tokio::spawn(async move {
if headless {
website.crawl().await;
} else {
website.crawl_raw().await;
}
});

let id = match slf.subscription_handles.last() {
Some(handle) => handle.0 + 1,
_ => 0,
};

slf.crawl_handles.insert(crawl_id, crawl_handle);
slf.subscription_handles.insert(id, handle);
} else {
let mut rx2 = slf
.inner
.subscribe(*BUFFER / 2)
.expect("sync feature should be enabled");

let handle = pyo3_asyncio::tokio::get_runtime().spawn(async move {
while let Ok(res) = rx2.recv().await {
Python::with_gil(|py| {
let _ = callback.call(py, (NPage::new(&res, raw_content), 0), None);
});
}
});

let id = match slf.subscription_handles.last() {
Some(handle) => handle.0 + 1,
_ => 0,
};

slf.subscription_handles.insert(id, handle);

let _ = pyo3_asyncio::tokio::get_runtime().block_on(async move {
if headless {
slf.inner.crawl().await;
} else {
slf.inner.crawl_raw().await;
}
Ok::<(), ()>(())
});
}
}
_ => {
if background {
let mut website = slf.inner.clone();

let crawl_id = match slf.crawl_handles.last() {
Some(handle) => handle.0 + 1,
_ => 0,
};

let crawl_handle = spider::tokio::spawn(async move {
if headless {
website.crawl().await;
} else {
website.crawl_raw().await;
}
});

slf.crawl_handles.insert(crawl_id, crawl_handle);
} else {
let _ = pyo3_asyncio::tokio::get_runtime().block_on(async move {
if headless {
slf.inner.crawl().await;
} else {
slf.inner.crawl_raw().await;
}
Ok::<(), ()>(())
});
}
}
}
}

// /// scrape a website
// pub async unsafe fn scrape(
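For context, here is a minimal Python-side sketch of how the crawl method introduced above could be driven once the extension is built. The method name, the argument order (on_page_event, background, headless), and the fact that the callback receives the page plus a placeholder integer come from the diff; the spider_rs module name, the Website constructor argument, and the handler names are assumptions made only for illustration.

# Hypothetical usage sketch; module and constructor details are assumed, not confirmed by this commit.
from spider_rs import Website  # assumes the pyo3 module is exposed as `spider_rs`

def on_page(page, _placeholder):
    # The Rust side calls the callback with two arguments: the page and a
    # placeholder integer (see the `(page, 0)` tuple in the diff above).
    print(page)

website = Website("https://example.com")  # constructor argument is an assumption

# Blocking crawl that streams each page through the callback as it is received.
website.crawl(on_page)

# Background crawl: spawns the work on the tokio runtime and returns immediately.
website.crawl(None, True)

# headless=True selects the inner crawl() path instead of crawl_raw() on the Rust side.
website.crawl(on_page, False, True)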
