From e993e2112554702ef2a7b7eb7aef980298b4aae7 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Fri, 8 Dec 2023 22:36:54 -0500 Subject: [PATCH] chore(website): add http header stub --- Cargo.toml | 2 +- README.md | 2 +- src/lib.rs | 20 +++++++- src/page.rs | 44 +++++++++++------ src/website.rs | 129 ++++++++++++++++++++++++++++--------------------- 5 files changed, 126 insertions(+), 71 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 40c68d3..6d386bd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2021" name = "spider_rs" -version = "0.0.4" +version = "0.0.5" description = "The fastest web crawler written in Rust ported to nodejs." repository = "https://github.com/spider-rs/spider-nodejs" diff --git a/README.md b/README.md index 3672aaa..6fec20e 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Install maturin `pipx install maturin` and python. ## Todo -1. Port Page function. +1. Fix http headers custom assign. 1. Add better docs. Once these items are done the base of the module should be complete. Most of the code comes from the initial port to Node.js that was done. diff --git a/src/lib.rs b/src/lib.rs index aa9f09d..362949e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ #![deny(clippy::all)] +use pyo3::prelude::*; use spider::lazy_static::lazy_static; lazy_static! { @@ -15,5 +16,22 @@ pub mod website; pub use npage::{page_title, NPage}; pub use nwebsite::NWebsite; pub use page::Page; -pub use shortcut::crawl; pub use website::Website; + +#[pyfunction] +fn crawl(py: Python, url: String, raw_content: Option) -> PyResult<&PyAny> { + pyo3_asyncio::tokio::future_into_py(py, async move { + let w = shortcut::crawl(url, raw_content).await; + + Ok(w) + }) +} + +#[pymodule] +fn spider_rs(_py: Python<'_>, m: &PyModule) -> PyResult<()> { + m.add_function(wrap_pyfunction!(crawl, m)?)?; + m.add_class::()?; + m.add_class::()?; + + Ok(()) +} diff --git a/src/page.rs b/src/page.rs index 8b46930..813bff1 100644 --- a/src/page.rs +++ b/src/page.rs @@ -1,7 +1,9 @@ use compact_str::CompactString; +use pyo3::{pyclass, pymethods, PyRef, PyRefMut}; /// a simple page object #[derive(Default)] +#[pyclass] pub struct Page { /// the page object from spider inner: Option, @@ -17,8 +19,10 @@ pub struct Page { pub status_code: u16, } +#[pymethods] impl Page { /// a new page + #[new] pub fn new(url: String, subdomains: Option, tld: Option) -> Self { Page { url, @@ -29,24 +33,36 @@ impl Page { } /// get the page content - pub async unsafe fn fetch(&mut self) -> &Self { - let page = spider::page::Page::new_page(&self.url, &Default::default()).await; - self.status_code = page.status_code.into(); - self.inner = Some(page); - self.selectors = spider::page::get_page_selectors( - &self.url, - self.subdomains.unwrap_or_default(), - self.tld.unwrap_or_default(), - ); - self + pub fn fetch(mut slf: PyRefMut<'_, Self>) -> PyRefMut<'_, Self> { + let s = pyo3_asyncio::tokio::get_runtime() + .block_on(async move { + let page = spider::page::Page::new_page(&slf.url, &Default::default()).await; + slf.status_code = page.status_code.into(); + slf.inner = Some(page); + slf.selectors = spider::page::get_page_selectors( + &slf.url, + slf.subdomains.unwrap_or_default(), + slf.tld.unwrap_or_default(), + ); + Ok::, ()>(slf) + }) + .unwrap(); + + s } /// all links on the page - pub async fn get_links(&self) -> Vec { - match &self.selectors { - Some(selectors) => match &self.inner { + pub fn get_links(slf: PyRef<'_, Self>) -> Vec { + match &slf.selectors { + Some(selectors) => match &slf.inner { Some(inner) => { - let links = inner.links(&selectors).await; + let links = pyo3_asyncio::tokio::get_runtime() + .block_on(async move { + let links = inner.links(&selectors).await; + Ok::, ()>(links) + }) + .unwrap_or_default(); + links .into_iter() .map(|i| i.as_ref().to_string()) diff --git a/src/website.rs b/src/website.rs index 644a7cf..0b0d7a0 100644 --- a/src/website.rs +++ b/src/website.rs @@ -7,23 +7,6 @@ use spider::tokio::task::JoinHandle; use spider::utils::shutdown; use std::time::Duration; -#[pyfunction] -fn crawl(py: Python, url: String, raw_content: Option) -> PyResult<&PyAny> { - pyo3_asyncio::tokio::future_into_py(py, async move { - let w = crate::crawl(url, raw_content).await; - - Ok(w) - }) -} - -#[pymodule] -fn spider_rs(_py: Python<'_>, m: &PyModule) -> PyResult<()> { - m.add_function(wrap_pyfunction!(crawl, m)?)?; - m.add_class::()?; - - Ok(()) -} - /// a website holding the inner spider::website::Website from Rust fit for python. #[pyclass] pub struct Website { @@ -481,6 +464,31 @@ impl Website { self.inner.size() as u32 } + /// get the configuration custom HTTP headers + pub fn get_configuration_headers(&self) -> Vec<(String, String)> { + let mut map = Vec::new(); + + match self.inner.configuration.headers.as_ref() { + Some(h) => { + for v in h.iter() { + let mut value = String::new(); + + match v.1.to_str() { + Ok(vv) => { + value.push_str(vv); + } + _ => (), + }; + + map.push((v.0.to_string(), value)) + } + } + _ => (), + } + + map + } + /// get all the pages of a website - requires calling website.scrape pub fn get_pages(&self) -> Vec { let mut pages: Vec = Vec::new(); @@ -514,44 +522,57 @@ impl Website { self.inner.clear(); } - // /// Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html). - // pub fn with_headers(mut slf: PyRefMut<'_, Self>, headers: Option) -> PyRefMut<'_, Self> { - // use std::str::FromStr; - - // match headers { - // Some(obj) => { - // let mut h = spider::reqwest::header::HeaderMap::new(); - // let keys = Object::keys(&obj).unwrap_or_default(); - - // for key in keys.into_iter() { - // let header_key = spider::reqwest::header::HeaderName::from_str(&key); - - // match header_key { - // Ok(hn) => { - // let header_value = obj - // .get::(key) - // .unwrap_or_default() - // .unwrap_or_default(); - - // match spider::reqwest::header::HeaderValue::from_str(&header_value) { - // Ok(hk) => { - // h.append(hn, hk); - // } - // _ => (), - // } - // } - // _ => (), - // } - // } - // slf.inner.with_headers(Some(h)); - // } - // _ => { - // slf.inner.with_headers(None); - // } - // }; + /// Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html). + pub fn with_headers( + mut slf: PyRefMut<'_, Self>, + headers: Option, + ) -> PyRefMut<'_, Self> { + use std::str::FromStr; + + match headers { + Some(obj) => { + let mut h = spider::reqwest::header::HeaderMap::new(); + + match obj.as_ref(slf.py()).iter() { + Ok(keys) => { + for key in keys.into_iter() { + match key { + Ok(k) => { + let key_name = k.to_string(); + let header_key = spider::reqwest::header::HeaderName::from_str(&key_name); + + match header_key { + Ok(hn) => match k.get_item(key_name) { + Ok(he) => { + let header_value = he.to_string(); + + match spider::reqwest::header::HeaderValue::from_str(&header_value) { + Ok(hk) => { + h.append(hn, hk); + } + _ => (), + } + } + _ => (), + }, + _ => (), + } + } + _ => (), + } + } + slf.inner.with_headers(Some(h)); + } + _ => (), + } + } + _ => { + slf.inner.with_headers(None); + } + }; - // slf - // } + slf + } /// Add user agent to request. pub fn with_user_agent(