From f94d5d446948dce5426202b085d82aea7953e9b8 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Fri, 8 Dec 2023 10:50:00 -0500 Subject: [PATCH] chore(website): add builder method start --- README.md | 1 + book/src/getting-started.md | 4 +- book/src/simple.md | 1 + src/website.rs | 1058 +++++++++++++++++------------------ 4 files changed, 533 insertions(+), 531 deletions(-) diff --git a/README.md b/README.md index 1cc69b7..7e2ee7c 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ from spider_rs import crawl async def main(): website = await crawl("https://choosealicense.com") print(website.links) + # print(website.pages) asyncio.run(main()) ``` diff --git a/book/src/getting-started.md b/book/src/getting-started.md index c7a3d9a..aba989f 100644 --- a/book/src/getting-started.md +++ b/book/src/getting-started.md @@ -1,7 +1,7 @@ # Getting Started -Make sure to have python installed v10 and higher. +Make sure to have python installed. ```sh -pip install spider-py +pip install spider_rs ``` diff --git a/book/src/simple.md b/book/src/simple.md index 8e25ab0..ee880a0 100644 --- a/book/src/simple.md +++ b/book/src/simple.md @@ -8,6 +8,7 @@ from spider_rs import crawl async def main(): website = await crawl("https://jeffmendez.com") print(website.links) + # print(website.pages) asyncio.run(main()) ``` \ No newline at end of file diff --git a/src/website.rs b/src/website.rs index 3725beb..fa40d98 100644 --- a/src/website.rs +++ b/src/website.rs @@ -21,7 +21,8 @@ fn spider_rs(_py: Python<'_>, m: &PyModule) -> PyResult<()> { Ok(()) } -/// a website holding the inner spider::website::Website from Rust fit for nodejs. +/// a website holding the inner spider::website::Website from Rust fit for python. +#[pyclass] pub struct Website { /// the website from spider. inner: spider::website::Website, @@ -43,9 +44,10 @@ struct PageEvent { pub page: NPage, } +#[pymethods] impl Website { /// a new website. - pub fn new(url: String, raw_content: Option) -> Self { + pub fn new(&self, url: String, raw_content: Option) -> Self { Website { inner: spider::website::Website::new(&url), subscription_handles: IndexMap::new(), @@ -62,48 +64,48 @@ impl Website { self.inner.get_status().to_string() } - /// store data to memory for disk storing. This will create the path if not exist and defaults to ./storage. - pub async fn export_jsonl_data(&self, export_path: Option) -> std::io::Result<()> { - use spider::tokio::io::AsyncWriteExt; - let file = match export_path { - Some(p) => { - let base_dir = p - .split("/") - .into_iter() - .map(|f| { - if f.contains(".") { - "".to_string() - } else { - f.to_string() - } - }) - .collect::(); - - spider::tokio::fs::create_dir_all(&base_dir).await?; - - if !p.contains(".") { - p + ".jsonl" - } else { - p - } - } - _ => { - spider::tokio::fs::create_dir_all("./storage").await?; - "./storage/".to_owned() - + &self - .inner - .get_domain() - .inner() - .replace("http://", "") - .replace("https://", "") - + "jsonl" - } - }; - let mut file = spider::tokio::fs::File::create(file).await?; - // transform data step needed to auto convert type .. - file.write_all(&self.collected_data).await?; - Ok(()) - } + // /// store data to memory for disk storing. This will create the path if not exist and defaults to ./storage. + // pub async fn export_jsonl_data(&self, export_path: Option) -> std::io::Result<()> { + // use spider::tokio::io::AsyncWriteExt; + // let file = match export_path { + // Some(p) => { + // let base_dir = p + // .split("/") + // .into_iter() + // .map(|f| { + // if f.contains(".") { + // "".to_string() + // } else { + // f.to_string() + // } + // }) + // .collect::(); + + // spider::tokio::fs::create_dir_all(&base_dir).await?; + + // if !p.contains(".") { + // p + ".jsonl" + // } else { + // p + // } + // } + // _ => { + // spider::tokio::fs::create_dir_all("./storage").await?; + // "./storage/".to_owned() + // + &self + // .inner + // .get_domain() + // .inner() + // .replace("http://", "") + // .replace("https://", "") + // + "jsonl" + // } + // }; + // let mut file = spider::tokio::fs::File::create(file).await?; + // // transform data step needed to auto convert type .. + // file.write_all(&self.collected_data).await?; + // Ok(()) + // } // /// subscribe and add an event listener. @@ -138,342 +140,341 @@ impl Website { // } - /// remove a subscription listener. - pub fn unsubscribe(&mut self, id: Option) -> bool { - match id { - Some(id) => { - let handle = self.subscription_handles.get(&id); - - match handle { - Some(h) => { - h.abort(); - self.subscription_handles.remove_entry(&id); - true - } - _ => false, - } - } - // we may want to get all subs and remove them - _ => { - let keys = self.subscription_handles.len(); - for k in self.subscription_handles.drain(..) { - k.1.abort(); - } - keys > 0 - } - } - } + // /// remove a subscription listener. + // pub fn unsubscribe(&mut self, id: Option) -> bool { + // match id { + // Some(id) => { + // let handle = self.subscription_handles.get(&id); + + // match handle { + // Some(h) => { + // h.abort(); + // self.subscription_handles.remove_entry(&id); + // true + // } + // _ => false, + // } + // } + // // we may want to get all subs and remove them + // _ => { + // let keys = self.subscription_handles.len(); + // for k in self.subscription_handles.drain(..) { + // k.1.abort(); + // } + // keys > 0 + // } + // } + // } - /// stop a crawl - pub async unsafe fn stop(&mut self, id: Option) -> bool { - self.inner.stop(); - - // prevent the last background run - if self.running_in_background { - // we may want ID's to be used as an option along with urls for complete shutdowns. - shutdown(self.inner.get_domain().inner()).await; - self.running_in_background = false; - } - - match id { - Some(id) => { - let handle = self.crawl_handles.get(&id); - - match handle { - Some(h) => { - h.abort(); - self.crawl_handles.remove_entry(&id); - true - } - _ => false, - } - } - _ => { - let keys = self.crawl_handles.len(); - for k in self.crawl_handles.drain(..) { - k.1.abort(); - } - keys > 0 - } - } - } + // /// stop a crawl + // pub async unsafe fn stop(&mut self, id: Option) -> bool { + // self.inner.stop(); + + // // prevent the last background run + // if self.running_in_background { + // // we may want ID's to be used as an option along with urls for complete shutdowns. + // shutdown(self.inner.get_domain().inner()).await; + // self.running_in_background = false; + // } + + // match id { + // Some(id) => { + // let handle = self.crawl_handles.get(&id); + + // match handle { + // Some(h) => { + // h.abort(); + // self.crawl_handles.remove_entry(&id); + // true + // } + // _ => false, + // } + // } + // _ => { + // let keys = self.crawl_handles.len(); + // for k in self.crawl_handles.drain(..) { + // k.1.abort(); + // } + // keys > 0 + // } + // } + // } - /// crawl a website - pub async unsafe fn crawl( - &mut self, - // on_page_event: Option>, - background: Option, - headless: Option, - ) { - // only run in background if on_page_event is handled for streaming. - let background = background.is_some() && background.unwrap_or_default(); - let headless = headless.is_some() && headless.unwrap_or_default(); - // let raw_content = self.raw_content; + // /// crawl a website + // pub async unsafe fn crawl( + // &mut self, + // // on_page_event: Option>, + // background: Option, + // headless: Option, + // ) { + // // only run in background if on_page_event is handled for streaming. + // let background = background.is_some() && background.unwrap_or_default(); + // let headless = headless.is_some() && headless.unwrap_or_default(); + // // let raw_content = self.raw_content; - if background { - self.running_in_background = background; - } + // if background { + // self.running_in_background = background; + // } + + // if background { + // let mut website = self.inner.clone(); + + // let crawl_id = match self.crawl_handles.last() { + // Some(handle) => handle.0 + 1, + // _ => 0, + // }; + + // let crawl_handle = spider::tokio::spawn(async move { + // if headless { + // website.crawl().await; + // } else { + // website.crawl_raw().await; + // } + // }); + + // self.crawl_handles.insert(crawl_id, crawl_handle); + // } else { + // if headless { + // self.inner.crawl().await; + // } else { + // self.inner.crawl_raw().await; + // } + // } + + // // match on_page_event { + // // Some(callback) => { + // // if background { + // // let mut website = self.inner.clone(); + // // let mut rx2 = website + // // .subscribe(*BUFFER / 2) + // // .expect("sync feature should be enabled"); + + // // let handle = spider::tokio::spawn(async move { + // // while let Ok(res) = rx2.recv().await { + // // callback.call( + // // Ok(NPage::new(&res, raw_content)), + // // napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, + // // ); + // // } + // // }); + + // // let crawl_id = match self.crawl_handles.last() { + // // Some(handle) => handle.0 + 1, + // // _ => 0, + // // }; + + // // let crawl_handle = spider::tokio::spawn(async move { + // // if headless { + // // website.crawl().await; + // // } else { + // // website.crawl_raw().await; + // // } + // // }); + + // // let id = match self.subscription_handles.last() { + // // Some(handle) => handle.0 + 1, + // // _ => 0, + // // }; + + // // self.crawl_handles.insert(crawl_id, crawl_handle); + // // self.subscription_handles.insert(id, handle); + // // } else { + // // let mut rx2 = self + // // .inner + // // .subscribe(*BUFFER / 2) + // // .expect("sync feature should be enabled"); + + // // let handle = spider::tokio::spawn(async move { + // // while let Ok(res) = rx2.recv().await { + // // callback.call( + // // Ok(NPage::new(&res, raw_content)), + // // napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, + // // ); + // // } + // // }); + + // // if headless { + // // self.inner.crawl().await; + // // } else { + // // self.inner.crawl_raw().await; + // // } + + // // let id = match self.subscription_handles.last() { + // // Some(handle) => handle.0 + 1, + // // _ => 0, + // // }; + + // // self.subscription_handles.insert(id, handle); + // // } + // // } + // // _ => { + // // if background { + // // let mut website = self.inner.clone(); + + // // let crawl_id = match self.crawl_handles.last() { + // // Some(handle) => handle.0 + 1, + // // _ => 0, + // // }; + + // // let crawl_handle = spider::tokio::spawn(async move { + // // if headless { + // // website.crawl().await; + // // } else { + // // website.crawl_raw().await; + // // } + // // }); + + // // self.crawl_handles.insert(crawl_id, crawl_handle); + // // } else { + // // if headless { + // // self.inner.crawl().await; + // // } else { + // // self.inner.crawl_raw().await; + // // } + // // } + // // } + // // } + // } - if background { - let mut website = self.inner.clone(); - - let crawl_id = match self.crawl_handles.last() { - Some(handle) => handle.0 + 1, - _ => 0, - }; - - let crawl_handle = spider::tokio::spawn(async move { - if headless { - website.crawl().await; - } else { - website.crawl_raw().await; - } - }); - - self.crawl_handles.insert(crawl_id, crawl_handle); - } else { - if headless { - self.inner.crawl().await; - } else { - self.inner.crawl_raw().await; - } - } + + // /// scrape a website + // pub async unsafe fn scrape( + // &mut self, + // // on_page_event: Option>, + // background: Option, + // headless: Option, + // ) { + // let headless = headless.is_some() && headless.unwrap_or_default(); + // let raw_content = self.raw_content; + // let background = background.is_some() && background.unwrap_or_default(); - // match on_page_event { - // Some(callback) => { - // if background { - // let mut website = self.inner.clone(); - // let mut rx2 = website - // .subscribe(*BUFFER / 2) - // .expect("sync feature should be enabled"); - - // let handle = spider::tokio::spawn(async move { - // while let Ok(res) = rx2.recv().await { - // callback.call( - // Ok(NPage::new(&res, raw_content)), - // napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, - // ); - // } - // }); - - // let crawl_id = match self.crawl_handles.last() { - // Some(handle) => handle.0 + 1, - // _ => 0, - // }; - - // let crawl_handle = spider::tokio::spawn(async move { - // if headless { - // website.crawl().await; - // } else { - // website.crawl_raw().await; - // } - // }); - - // let id = match self.subscription_handles.last() { - // Some(handle) => handle.0 + 1, - // _ => 0, - // }; - - // self.crawl_handles.insert(crawl_id, crawl_handle); - // self.subscription_handles.insert(id, handle); - // } else { - // let mut rx2 = self - // .inner - // .subscribe(*BUFFER / 2) - // .expect("sync feature should be enabled"); - - // let handle = spider::tokio::spawn(async move { - // while let Ok(res) = rx2.recv().await { - // callback.call( - // Ok(NPage::new(&res, raw_content)), - // napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, - // ); - // } - // }); - - // if headless { - // self.inner.crawl().await; - // } else { - // self.inner.crawl_raw().await; - // } - - // let id = match self.subscription_handles.last() { - // Some(handle) => handle.0 + 1, - // _ => 0, - // }; - - // self.subscription_handles.insert(id, handle); - // } - // } - // _ => { - // if background { - // let mut website = self.inner.clone(); - - // let crawl_id = match self.crawl_handles.last() { - // Some(handle) => handle.0 + 1, - // _ => 0, - // }; - - // let crawl_handle = spider::tokio::spawn(async move { - // if headless { - // website.crawl().await; - // } else { - // website.crawl_raw().await; - // } - // }); - - // self.crawl_handles.insert(crawl_id, crawl_handle); - // } else { - // if headless { - // self.inner.crawl().await; - // } else { - // self.inner.crawl_raw().await; - // } - // } - // } - // } - } + // if background { + // self.running_in_background = background; + // } - - /// scrape a website - pub async unsafe fn scrape( - &mut self, - // on_page_event: Option>, - background: Option, - headless: Option, - ) { - let headless = headless.is_some() && headless.unwrap_or_default(); - let raw_content = self.raw_content; - let background = background.is_some() && background.unwrap_or_default(); - - if background { - self.running_in_background = background; - } + // if background { + // let mut website = self.inner.clone(); - if background { - let mut website = self.inner.clone(); - - let crawl_id = match self.crawl_handles.last() { - Some(handle) => handle.0 + 1, - _ => 0, - }; - - let crawl_handle = spider::tokio::spawn(async move { - if headless { - website.scrape().await; - } else { - website.scrape_raw().await; - } - }); - - self.crawl_handles.insert(crawl_id, crawl_handle); - } else { - if headless { - self.inner.scrape().await; - } else { - self.inner.scrape_raw().await; - } - } + // let crawl_id = match self.crawl_handles.last() { + // Some(handle) => handle.0 + 1, + // _ => 0, + // }; - // match on_page_event { - // Some(callback) => { - // if background { - // let mut website = self.inner.clone(); - // let mut rx2 = website - // .subscribe(*BUFFER / 2) - // .expect("sync feature should be enabled"); - - // let handle = spider::tokio::spawn(async move { - // while let Ok(res) = rx2.recv().await { - // callback.call( - // Ok(NPage::new(&res, raw_content)), - // napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, - // ); - // } - // }); - - // let crawl_id = match self.crawl_handles.last() { - // Some(handle) => handle.0 + 1, - // _ => 0, - // }; - - // let crawl_handle = spider::tokio::spawn(async move { - // if headless { - // website.scrape().await; - // } else { - // website.scrape_raw().await; - // } - // }); - - // let id = match self.subscription_handles.last() { - // Some(handle) => handle.0 + 1, - // _ => 0, - // }; - - // self.crawl_handles.insert(crawl_id, crawl_handle); - // self.subscription_handles.insert(id, handle); - // } else { - // let mut rx2 = self - // .inner - // .subscribe(*BUFFER / 2) - // .expect("sync feature should be enabled"); - - // let handle = spider::tokio::spawn(async move { - // while let Ok(res) = rx2.recv().await { - // callback.call( - // Ok(NPage::new(&res, raw_content)), - // napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, - // ); - // } - // }); - - // if headless { - // self.inner.scrape().await; - // } else { - // self.inner.scrape_raw().await; - // } - - // let id = match self.subscription_handles.last() { - // Some(handle) => handle.0 + 1, - // _ => 0, - // }; - - // self.subscription_handles.insert(id, handle); - // } - // } - // _ => { - // if background { - // let mut website = self.inner.clone(); - - // let crawl_id = match self.crawl_handles.last() { - // Some(handle) => handle.0 + 1, - // _ => 0, - // }; - - // let crawl_handle = spider::tokio::spawn(async move { - // if headless { - // website.scrape().await; - // } else { - // website.scrape_raw().await; - // } - // }); - - // self.crawl_handles.insert(crawl_id, crawl_handle); - // } else { - // if headless { - // self.inner.scrape().await; - // } else { - // self.inner.scrape_raw().await; - // } - // } - // } - // } - } + // let crawl_handle = spider::tokio::spawn(async move { + // if headless { + // website.scrape().await; + // } else { + // website.scrape_raw().await; + // } + // }); + + // self.crawl_handles.insert(crawl_id, crawl_handle); + // } else { + // if headless { + // self.inner.scrape().await; + // } else { + // self.inner.scrape_raw().await; + // } + // } + + // // match on_page_event { + // // Some(callback) => { + // // if background { + // // let mut website = self.inner.clone(); + // // let mut rx2 = website + // // .subscribe(*BUFFER / 2) + // // .expect("sync feature should be enabled"); + + // // let handle = spider::tokio::spawn(async move { + // // while let Ok(res) = rx2.recv().await { + // // callback.call( + // // Ok(NPage::new(&res, raw_content)), + // // napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, + // // ); + // // } + // // }); + + // // let crawl_id = match self.crawl_handles.last() { + // // Some(handle) => handle.0 + 1, + // // _ => 0, + // // }; + + // // let crawl_handle = spider::tokio::spawn(async move { + // // if headless { + // // website.scrape().await; + // // } else { + // // website.scrape_raw().await; + // // } + // // }); + + // // let id = match self.subscription_handles.last() { + // // Some(handle) => handle.0 + 1, + // // _ => 0, + // // }; + + // // self.crawl_handles.insert(crawl_id, crawl_handle); + // // self.subscription_handles.insert(id, handle); + // // } else { + // // let mut rx2 = self + // // .inner + // // .subscribe(*BUFFER / 2) + // // .expect("sync feature should be enabled"); + + // // let handle = spider::tokio::spawn(async move { + // // while let Ok(res) = rx2.recv().await { + // // callback.call( + // // Ok(NPage::new(&res, raw_content)), + // // napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, + // // ); + // // } + // // }); + + // // if headless { + // // self.inner.scrape().await; + // // } else { + // // self.inner.scrape_raw().await; + // // } + + // // let id = match self.subscription_handles.last() { + // // Some(handle) => handle.0 + 1, + // // _ => 0, + // // }; + + // // self.subscription_handles.insert(id, handle); + // // } + // // } + // // _ => { + // // if background { + // // let mut website = self.inner.clone(); + + // // let crawl_id = match self.crawl_handles.last() { + // // Some(handle) => handle.0 + 1, + // // _ => 0, + // // }; + + // // let crawl_handle = spider::tokio::spawn(async move { + // // if headless { + // // website.scrape().await; + // // } else { + // // website.scrape_raw().await; + // // } + // // }); + + // // self.crawl_handles.insert(crawl_id, crawl_handle); + // // } else { + // // if headless { + // // self.inner.scrape().await; + // // } else { + // // self.inner.scrape_raw().await; + // // } + // // } + // // } + // // } + // } - /// run a cron job - + // /// run a cron job // pub async unsafe fn run_cron( // &mut self, // on_page_event: Option>, @@ -506,57 +507,57 @@ impl Website { // } - /// get all the links of a website - pub fn get_links(&self) -> Vec { - let links = self - .inner - .get_links() - .iter() - .map(|x| x.as_ref().to_string()) - .collect::>(); - links - } + // /// get all the links of a website + // pub fn get_links(&self) -> Vec { + // let links = self + // .inner + // .get_links() + // .iter() + // .map(|x| x.as_ref().to_string()) + // .collect::>(); + // links + // } - /// get the size of the website in amount of pages crawled. If you ran the page in the background, this value will not update. - pub fn size(&mut self) -> u32 { - self.inner.size() as u32 - } + // /// get the size of the website in amount of pages crawled. If you ran the page in the background, this value will not update. + // pub fn size(&mut self) -> u32 { + // self.inner.size() as u32 + // } - /// get all the pages of a website - requires calling website.scrape + // /// get all the pages of a website - requires calling website.scrape - pub fn get_pages(&self) -> Vec { - let mut pages: Vec = Vec::new(); - let raw_content = self.raw_content; - - match self.inner.get_pages() { - Some(p) => { - for page in p.iter() { - pages.push(NPage::new(page, raw_content)); - } - } - _ => (), - } + // pub fn get_pages(&self) -> Vec { + // let mut pages: Vec = Vec::new(); + // let raw_content = self.raw_content; - pages - } + // match self.inner.get_pages() { + // Some(p) => { + // for page in p.iter() { + // pages.push(NPage::new(page, raw_content)); + // } + // } + // _ => (), + // } + + // pages + // } - /// drain all links from storing - pub fn drain_links(&mut self) -> Vec { - let links = self - .inner - .drain_links() - .map(|x| x.as_ref().to_string()) - .collect::>(); - - links - } + // /// drain all links from storing + // pub fn drain_links(&mut self) -> Vec { + // let links = self + // .inner + // .drain_links() + // .map(|x| x.as_ref().to_string()) + // .collect::>(); + + // links + // } - /// clear all links and page data - pub fn clear(&mut self) { - self.inner.clear(); - } + // /// clear all links and page data + // pub fn clear(&mut self) { + // self.inner.clear(); + // } // /// Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html). @@ -598,153 +599,152 @@ impl Website { // self // } - /// Add user agent to request. + // /// Add user agent to request. - pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &Self { - self.inner.configuration.with_user_agent(user_agent); - self - } + // pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &Self { + // self.inner.configuration.with_user_agent(user_agent); + // self + // } - /// Respect robots.txt file. + // /// Respect robots.txt file. - pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &Self { - self - .inner - .configuration - .with_respect_robots_txt(respect_robots_txt); - self - } + // pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &Self { + // self + // .inner + // .configuration + // .with_respect_robots_txt(respect_robots_txt); + // self + // } - /// Include subdomains detection. + // /// Include subdomains detection. - pub fn with_subdomains(&mut self, subdomains: bool) -> &Self { - self.inner.configuration.with_subdomains(subdomains); - self - } + // pub fn with_subdomains(&mut self, subdomains: bool) -> &Self { + // self.inner.configuration.with_subdomains(subdomains); + // self + // } - /// Include tld detection. + // /// Include tld detection. - pub fn with_tld(&mut self, tld: bool) -> &Self { - self.inner.configuration.with_tld(tld); - self - } + // pub fn with_tld(&mut self, tld: bool) -> &Self { + // self.inner.configuration.with_tld(tld); + // self + // } - /// Only use HTTP/2. + // /// Only use HTTP/2. - pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &Self { - self - .inner - .configuration - .with_http2_prior_knowledge(http2_prior_knowledge); - self - } + // pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &Self { + // self + // .inner + // .configuration + // .with_http2_prior_knowledge(http2_prior_knowledge); + // self + // } - /// Max time to wait for request duration to milliseconds. + // /// Max time to wait for request duration to milliseconds. - pub fn with_request_timeout(&mut self, request_timeout: Option) -> &Self { - self - .inner - .configuration - .with_request_timeout(match request_timeout { - Some(d) => Some(Duration::from_millis(d.into())), - _ => None, - }); - self - } + // pub fn with_request_timeout(&mut self, request_timeout: Option) -> &Self { + // self + // .inner + // .configuration + // .with_request_timeout(match request_timeout { + // Some(d) => Some(Duration::from_millis(d.into())), + // _ => None, + // }); + // self + // } /// add external domains - - pub fn with_external_domains(&mut self, external_domains: Option>) -> &Self { - self.inner.with_external_domains(match external_domains { + pub fn with_external_domains(mut slf: PyRefMut<'_, Self>, external_domains: Option>) -> PyRefMut<'_, Self> { + slf.inner.with_external_domains(match external_domains { Some(ext) => Some(ext.into_iter()), _ => None, }); - self + slf } - /// Set the crawling budget - pub fn with_budget(&mut self, budget: Option>) -> &Self { - use spider::hashbrown::hash_map::HashMap; - - match budget { - Some(d) => { - let v = d - .iter() - .map(|e| e.0.to_owned() + "," + &e.1.to_string()) - .collect::(); - let v = v - .split(",") - .collect::>() - .chunks(2) - .map(|x| (x[0], x[1].parse::().unwrap_or_default())) - .collect::>(); - - self.inner.with_budget(Some(v)); - } - _ => (), - } + // /// Set the crawling budget + // pub fn with_budget(&mut self, budget: Option>) -> &Self { + // use spider::hashbrown::hash_map::HashMap; + + // match budget { + // Some(d) => { + // let v = d + // .iter() + // .map(|e| e.0.to_owned() + "," + &e.1.to_string()) + // .collect::(); + // let v = v + // .split(",") + // .collect::>() + // .chunks(2) + // .map(|x| (x[0], x[1].parse::().unwrap_or_default())) + // .collect::>(); + + // self.inner.with_budget(Some(v)); + // } + // _ => (), + // } - self - } + // self + // } - /// Regex black list urls from the crawl - pub fn with_blacklist_url(&mut self, blacklist_url: Option>) -> &Self { - self - .inner - .configuration - .with_blacklist_url(match blacklist_url { - Some(v) => { - let mut blacklist: Vec = Vec::new(); - for item in v { - blacklist.push(CompactString::new(item)); - } - Some(blacklist) - } - _ => None, - }); - - self - } + // /// Regex black list urls from the crawl + // pub fn with_blacklist_url(&mut self, blacklist_url: Option>) -> &Self { + // self + // .inner + // .configuration + // .with_blacklist_url(match blacklist_url { + // Some(v) => { + // let mut blacklist: Vec = Vec::new(); + // for item in v { + // blacklist.push(CompactString::new(item)); + // } + // Some(blacklist) + // } + // _ => None, + // }); + + // self + // } - /// Setup cron jobs to run + // /// Setup cron jobs to run - pub fn with_cron(&mut self, cron_str: String, cron_type: Option) -> &Self { - self.inner.with_cron( - cron_str.as_str(), - if cron_type.unwrap_or_default() == "scrape" { - spider::website::CronType::Scrape - } else { - spider::website::CronType::Crawl - }, - ); - self - } + // pub fn with_cron(&mut self, cron_str: String, cron_type: Option) -> &Self { + // self.inner.with_cron( + // cron_str.as_str(), + // if cron_type.unwrap_or_default() == "scrape" { + // spider::website::CronType::Scrape + // } else { + // spider::website::CronType::Crawl + // }, + // ); + // self + // } - /// Delay between request as ms. + // /// Delay between request as ms. - pub fn with_delay(&mut self, delay: u32) -> &Self { - self.inner.configuration.with_delay(delay.into()); - self - } + // pub fn with_delay(&mut self, delay: u32) -> &Self { + // self.inner.configuration.with_delay(delay.into()); + // self + // } - /// Use proxies for request. + // /// Use proxies for request. - pub fn with_proxies(&mut self, proxies: Option>) -> &Self { - self.inner.configuration.with_proxies(proxies); - self - } + // pub fn with_proxies(&mut self, proxies: Option>) -> &Self { + // self.inner.configuration.with_proxies(proxies); + // self + // } - /// build the inner website - not required for all builder_steps - pub fn build(&mut self) -> &Self { - match self.inner.build() { - Ok(w) => self.inner = w, - _ => (), - } - self - } + // /// build the inner website - not required for all builder_steps + // pub fn build(&mut self) -> &Self { + // match self.inner.build() { + // Ok(w) => self.inner = w, + // _ => (), + // } + // self + // } } /// a runner for handling crons