From 8bc3ffe3471865d7ef90eb521a2b88b16f858b57 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Sat, 30 Nov 2024 16:52:49 -0500 Subject: [PATCH] chore(clippy): fix lint --- Cargo.lock | 20 +- spider/src/lib.rs | 2 +- spider/src/packages/robotparser/parser.rs | 16 +- spider/src/page.rs | 170 +++++++--------- spider/src/utils/header_utils.rs | 15 +- spider/src/utils/interner.rs | 5 + spider/src/utils/mod.rs | 20 +- spider/src/utils/trie.rs | 12 ++ spider/src/website.rs | 192 ++++++++++-------- spider_chrome/src/handler/network.rs | 16 +- spider_chrome/src/page.rs | 2 +- spider_cli/src/main.rs | 67 ++---- spider_transformations/src/html2text/mod.rs | 39 ++-- .../src/html2text/render/text_renderer.rs | 85 ++++---- .../src/transformation/content.rs | 7 +- spider_utils/src/lib.rs | 17 +- spider_worker/src/main.rs | 7 +- 17 files changed, 325 insertions(+), 367 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 044e6e8b4..7655bddd4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -307,9 +307,9 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "auto_encoder" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7541d8c3aaed36cd8840c9349d3d6d77be4f057a922e50612d8bbe1346b722bd" +checksum = "0f9fbdc2af8df1e9f42b362850d4d16ebef2b80430ba360a2c9f262ec161f002" dependencies = [ "chardetng", "encoding_rs", @@ -1339,9 +1339,9 @@ dependencies = [ [[package]] name = "event-listener-strategy" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f214dc438f977e6d4e3500aaa277f5ad94ca83fbbd9b1a15713ce2344ccc5a1" +checksum = "3c3e4e0dd3673c1139bf041f3008816d9cf2946bbfac2945c09e523b8d7b05b2" dependencies = [ "event-listener", "pin-project-lite", @@ -3020,9 +3020,9 @@ checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" [[package]] name = "os_info" -version = "3.8.2" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae99c7fa6dd38c7cafe1ec085e804f8f555a2f8659b0dbe03f1f9963a9b51092" +checksum = "e5ca711d8b83edbb00b44d504503cd247c9c0bd8b0fa2694f2a1a3d8165379ce" dependencies = [ "log", "windows-sys 0.52.0", @@ -3459,7 +3459,7 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash 2.0.0", + "rustc-hash 2.1.0", "rustls 0.23.19", "socket2", "thiserror 2.0.3", @@ -3477,7 +3477,7 @@ dependencies = [ "getrandom 0.2.15", "rand 0.8.5", "ring", - "rustc-hash 2.0.0", + "rustc-hash 2.1.0", "rustls 0.23.19", "rustls-pki-types", "slab", @@ -3860,9 +3860,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustc-hash" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" +checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" [[package]] name = "rustc_version" diff --git a/spider/src/lib.rs b/spider/src/lib.rs index 1a4fdacd7..bd3fe21e8 100644 --- a/spider/src/lib.rs +++ b/spider/src/lib.rs @@ -191,7 +191,7 @@ pub mod black_list { pub mod black_list { use crate::compact_str::CompactString; /// check if link exist in blacklists. 
- pub fn contains(blacklist_url: &Vec, link: &CompactString) -> bool { + pub fn contains(blacklist_url: &[CompactString], link: &CompactString) -> bool { blacklist_url.contains(link) } } diff --git a/spider/src/packages/robotparser/parser.rs b/spider/src/packages/robotparser/parser.rs index a2608c3fc..a735f6ebd 100644 --- a/spider/src/packages/robotparser/parser.rs +++ b/spider/src/packages/robotparser/parser.rs @@ -135,11 +135,10 @@ impl RuleLine { #[cfg(not(feature = "regex"))] fn applies_to(&self, pathname: &str) -> bool { - if self.path == "*" { - true - } else if self.path == "/" && pathname == "/" { - true - } else if self.path.ends_with("/") && pathname.starts_with(&self.path) { + if self.path == "*" + || self.path == "/" && pathname == "/" + || self.path.ends_with("/") && pathname.starts_with(&self.path) + { true } else { self.path @@ -317,11 +316,8 @@ impl RobotFileParser { /// Sets the time the robots.txt file was last fetched to the /// current time. pub fn modified(&mut self) { - match SystemTime::now().duration_since(UNIX_EPOCH) { - Ok(time) => { - self.last_checked = time.as_secs() as i64; - } - _ => (), + if let Ok(time) = SystemTime::now().duration_since(UNIX_EPOCH) { + self.last_checked = time.as_secs() as i64; } } diff --git a/spider/src/page.rs b/spider/src/page.rs index d8aebd9e3..1f934c0ce 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -217,12 +217,17 @@ pub fn push_link>( sub_matcher: &CompactString, external_domains_caseless: &Box>, full_resources: bool, + links_pages: &mut Option>, ) { if let Some(b) = base { - let mut abs = convert_abs_path(&b, href); + let mut abs = convert_abs_path(b, href); let scheme = abs.scheme(); if scheme == "https" || scheme == "http" { + if let Some(link_map) = links_pages { + link_map.insert(A::from(abs.as_str().to_string())); + } + let host_name = abs.host_str(); let mut can_process = parent_host_match( host_name, @@ -272,10 +277,7 @@ pub fn push_link>( /// get the clean domain name pub fn domain_name(domain: &Url) -> &str { - match domain.host_str() { - Some(host) => host, - _ => "", - } + domain.host_str().unwrap_or_default() } /// extract the valid domains from a url. @@ -290,7 +292,7 @@ fn extract_root_domain(domain: &str) -> &str { domain } } else if parts.len() == 2 { - &parts[0] + parts[0] } else { domain } @@ -316,7 +318,7 @@ pub fn parent_host_match( if base_domain.is_empty() { exact_match } else { - exact_match || is_subdomain(host, &parent_host) || is_subdomain(host, &sub_matcher) + exact_match || is_subdomain(host, parent_host) || is_subdomain(host, sub_matcher) } } _ => false, @@ -325,7 +327,7 @@ pub fn parent_host_match( /// html selector for valid web pages for domain. 
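The `black_list::contains` hunk above applies clippy's `ptr_arg` fix: take `&[CompactString]` rather than `&Vec<CompactString>`, since a slice borrow drops one level of indirection and accepts vectors, arrays, and slices alike. A minimal sketch of the pattern, with plain `String` standing in for `CompactString`:

// Slice parameter (clippy `ptr_arg`): callers may pass a &Vec<String>,
// a fixed-size array, or any other slice.
fn contains(blacklist_url: &[String], link: &str) -> bool {
    blacklist_url.iter().any(|b| b.as_str() == link)
}

fn main() {
    let blacklist = vec![String::from("https://example.com/private")];
    // &Vec<String> coerces to &[String] at the call site, so the
    // signature change is compatible with existing callers.
    assert!(contains(&blacklist, "https://example.com/private"));
    assert!(!contains(&blacklist, "https://example.com/"));
}
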
pub fn get_page_selectors_base(u: &Url, subdomains: bool, tld: bool) -> Option { - let u = convert_abs_url_base(&u); + let u = convert_abs_url_base(u); let b = match u.host_str() { Some(host) => host.to_ascii_lowercase(), @@ -371,13 +373,9 @@ pub fn get_page_selectors(url: &str, subdomains: bool, tld: bool) -> Option>, is_success: bool) -> bool { match content { Some(ref content) => { - if content.is_empty() || content.starts_with(b"") || is_success && - content.starts_with(b"\r\n\r\n\r\n\r\n\r\n\r\n") { - false - } else { - true - } + !(content.is_empty() || content.starts_with(b"") || is_success && + content.starts_with(b"\r\n\r\n\r\n\r\n\r\n\r\n")) } _ => false, } @@ -419,11 +417,7 @@ pub fn build(url: &str, res: PageResponse) -> Page { Ok(_) => None, Err(er) => { if er.is_status() || er.is_connect() || er.is_timeout() { - if er.to_string().contains("ENOTFOUND") { - should_retry = false; - } else { - should_retry = true; - } + should_retry = !er.to_string().contains("ENOTFOUND"); } Some(er.to_string()) } @@ -518,8 +512,8 @@ pub(crate) fn get_charset_from_content_type( let parts: Vec<&str> = content_type_str.split(';').collect(); for part in parts { let part = part.trim().to_lowercase(); - if part.starts_with("charset=") { - if let Some(encoding) = encoding_rs::Encoding::for_label(part[8..].as_bytes()) { + if let Some(stripped) = part.strip_prefix("charset=") { + if let Some(encoding) = encoding_rs::Encoding::for_label(stripped.as_bytes()) { if let Some(ascii_encoding) = AsciiCompatibleEncoding::new(encoding) { return Some(ascii_encoding); } @@ -549,13 +543,14 @@ impl Page { url: &str, client: &Client, only_html: bool, - mut selectors: &mut RelativeSelectors, + selectors: &mut RelativeSelectors, external_domains_caseless: &Box>, r_settings: &PageLinkBuildSettings, - mut map: &mut hashbrown::HashSet, + map: &mut hashbrown::HashSet, ssg_map: Option<&mut hashbrown::HashSet>, prior_domain: &Option>, - mut domain_parsed: &mut Option>, + domain_parsed: &mut Option>, + links_pages: &mut Option>, ) -> Self { use crate::utils::{ handle_response_bytes_writer, modify_selectors, setup_default_response, @@ -566,7 +561,7 @@ impl Page { let cell = tokio::sync::OnceCell::new(); let (encoding, adjust_charset_on_meta_tag) = - match get_charset_from_content_type(&res.headers()) { + match get_charset_from_content_type(res.headers()) { Some(h) => (h, false), _ => (AsciiCompatibleEncoding::utf_8(), true), }; @@ -580,11 +575,11 @@ impl Page { let mut url = Box::new(CaseInsensitiveString::new(&url)); modify_selectors( - &prior_domain, + prior_domain, domain, - &mut domain_parsed, + domain_parsed, &mut url, - &mut selectors, + selectors, AllowedDomainTypes::new(r_settings.subdomains, r_settings.tld), ); }; @@ -601,16 +596,15 @@ impl Page { let base_links_settings = if r_settings.full_resources { lol_html::element!("a[href],script[src],link[href]", |el| { - let attribute = if el.tag_name() == "script" { - "src" - } else { - "href" - }; + let tag_name = el.tag_name(); + + let attribute = if tag_name == "script" { "src" } else { "href" }; + if let Some(href) = el.get_attribute(attribute) { push_link( &base, &href, - &mut map, + map, &selectors.0, parent_host, parent_host_scheme, @@ -618,6 +612,7 @@ impl Page { sub_matcher, &external_domains_caseless, r_settings.full_resources, + links_pages, ); } Ok(()) @@ -628,7 +623,7 @@ impl Page { push_link( &base, &href, - &mut map, + map, &selectors.0, parent_host, parent_host_scheme, @@ -636,6 +631,7 @@ impl Page { sub_matcher, &external_domains_caseless, 
r_settings.full_resources, + links_pages, ); } Ok(()) @@ -664,8 +660,7 @@ impl Page { ..lol_html::send::Settings::new_for_handler_types() }; - let mut rewriter = - lol_html::send::HtmlRewriter::new(settings.into(), |_c: &[u8]| {}); + let mut rewriter = lol_html::send::HtmlRewriter::new(settings, |_c: &[u8]| {}); let mut response = handle_response_bytes_writer( res, @@ -685,20 +680,20 @@ impl Page { response .0 .content - .replace(Box::new(collected_bytes.freeze().into())); + .replace(Box::new(collected_bytes.freeze())); if r_settings.ssg_build { - if let Some(mut ssg_map) = ssg_map { + if let Some(ssg_map) = ssg_map { if let Some(source) = cell.get() { - if let Some(ref url_base) = base { - let build_ssg_path = convert_abs_path(&url_base, &source); + if let Some(url_base) = base { + let build_ssg_path = convert_abs_path(url_base, source); let build_page = - Page::new_page(build_ssg_path.as_str(), &client).await; + Page::new_page(build_ssg_path.as_str(), client).await; for cap in SSG_CAPTURE.captures_iter(build_page.get_html_bytes_u8()) { if let Some(matched) = cap.get(1) { - let href = auto_encode_bytes(&matched.as_bytes()) + let href = auto_encode_bytes(matched.as_bytes()) .replace(r#"\u002F"#, "/"); let last_segment = crate::utils::get_last_segment(&href); @@ -710,7 +705,7 @@ impl Page { push_link( &base, &href, - &mut ssg_map, + ssg_map, &selectors.0, parent_host, parent_host_scheme, @@ -718,6 +713,7 @@ impl Page { sub_matcher, &external_domains_caseless, r_settings.full_resources, + &mut None, ); } } @@ -1071,10 +1067,7 @@ impl Page { /// Set the html directly of the page pub fn set_html_bytes(&mut self, html: Option) { - self.html = match html { - Some(html) => Some(Box::new(html)), - _ => None, - }; + self.html = html.map(Box::new); } /// Set the url directly of the page. Useful for transforming the content and rewriting the url. @@ -1121,24 +1114,15 @@ impl Page { /// Html getter for bytes on the page. pub fn get_bytes(&self) -> Option<&Bytes> { - match self.html.as_ref() { - Some(html) => Some(html), - _ => None, - } + self.html.as_deref() } /// Html getter for bytes on the page as string. pub fn get_html(&self) -> String { - match self.html.as_ref() { - Some(html) => { - if html.is_empty() { - Default::default() - } else { - auto_encoder::auto_encode_bytes(html) - } - } - _ => Default::default(), - } + self.html + .as_ref() + .map(|v| auto_encoder::auto_encode_bytes(v)) + .unwrap_or_default() } /// Html getter for page to u8. 
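The page.rs hunks above collapse `match`-on-`Option` into combinators: `html.map(Box::new)` for the setter, `as_deref()` for the borrow, and `map(..).unwrap_or_default()` for the string conversion. A condensed sketch, with `Vec<u8>` standing in for `Bytes` and a lossy UTF-8 decode standing in for the `auto_encoder` call:

struct Page {
    html: Option<Box<Vec<u8>>>,
}

impl Page {
    // Was: match html { Some(html) => Some(Box::new(html)), _ => None }
    fn set_html_bytes(&mut self, html: Option<Vec<u8>>) {
        self.html = html.map(Box::new);
    }

    // Was: match self.html.as_ref() { Some(html) => Some(html), _ => None }
    fn get_bytes(&self) -> Option<&Vec<u8>> {
        self.html.as_deref()
    }

    // Was: a nested match with Default::default() on the empty arms.
    fn get_html(&self) -> String {
        self.html
            .as_ref()
            .map(|v| String::from_utf8_lossy(v).into_owned())
            .unwrap_or_default()
    }
}

fn main() {
    let mut page = Page { html: None };
    assert!(page.get_bytes().is_none());
    page.set_html_bytes(Some(b"<html></html>".to_vec()));
    assert_eq!(page.get_html(), "<html></html>");
}
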
@@ -1206,22 +1190,20 @@ impl Page { } Event::Text(e) => { if is_link_tag { - match e.unescape() { - Ok(v) => { - push_link( - &self.base.as_ref(), - &v, - map, - &selectors.0, - parent_host, - parent_host_scheme, - base_input_domain, - sub_matcher, - &self.external_domains_caseless, - false, - ); - } - _ => (), + if let Ok(v) = e.unescape() { + push_link( + &self.base.as_ref(), + &v, + map, + &selectors.0, + parent_host, + parent_host_scheme, + base_input_domain, + sub_matcher, + &self.external_domains_caseless, + false, + &mut None, + ); } } } @@ -1280,6 +1262,7 @@ impl Page { sub_matcher, &self.external_domains_caseless, false, + &mut None, ); } Ok(()) @@ -1291,7 +1274,7 @@ impl Page { let mut wrote_error = false; let mut rewriter = - lol_html::send::HtmlRewriter::new(rewriter_settings.into(), |_c: &[u8]| {}); + lol_html::send::HtmlRewriter::new(rewriter_settings, |_c: &[u8]| {}); let html_bytes = html.as_bytes(); let chunk_size = 8192; @@ -1301,7 +1284,7 @@ impl Page { while let Some(chunk) = stream.next().await { if let Ok(chunk) = chunk { - if let Err(_) = rewriter.write(chunk) { + if rewriter.write(chunk).is_err() { wrote_error = true; break; } @@ -1363,6 +1346,7 @@ impl Page { sub_matcher, &self.external_domains_caseless, false, + &mut None, ); } Ok(()) @@ -1385,7 +1369,7 @@ impl Page { }; let mut rewriter = - lol_html::send::HtmlRewriter::new(rewriter_settings.into(), |_c: &[u8]| {}); + lol_html::send::HtmlRewriter::new(rewriter_settings, |_c: &[u8]| {}); let html_bytes = html.as_bytes(); let chunk_size = 8192; @@ -1396,7 +1380,7 @@ impl Page { while let Some(chunk) = stream.next().await { if let Ok(chunk) = chunk { - if let Err(_) = rewriter.write(chunk) { + if rewriter.write(chunk).is_err() { wrote_error = true; break; } @@ -1409,12 +1393,12 @@ impl Page { if let Some(build_ssg_path) = cell.get() { if !build_ssg_path.is_empty() { - let build_page = Page::new_page(&build_ssg_path, &client).await; + let build_page = Page::new_page(build_ssg_path, client).await; for cap in SSG_CAPTURE.captures_iter(build_page.get_html_bytes_u8()) { if let Some(matched) = cap.get(1) { - let href = auto_encode_bytes(&matched.as_bytes()) - .replace(r#"\u002F"#, "/"); + let href = + auto_encode_bytes(matched.as_bytes()).replace(r#"\u002F"#, "/"); let last_segment = crate::utils::get_last_segment(&href); @@ -1431,6 +1415,7 @@ impl Page { sub_matcher, &self.external_domains_caseless, false, + &mut None, ); } } @@ -1836,6 +1821,7 @@ impl Page { sub_matcher, &external_domains_caseless, true, + &mut None, ); } Ok(()) @@ -1849,8 +1835,7 @@ impl Page { ..lol_html::send::Settings::new_for_handler_types() }; - let mut rewriter = - lol_html::send::HtmlRewriter::new(settings.into(), |_c: &[u8]| {}); + let mut rewriter = lol_html::send::HtmlRewriter::new(settings, |_c: &[u8]| {}); let html_bytes = html.as_bytes(); let chunk_size = 8192; @@ -1861,7 +1846,7 @@ impl Page { while let Some(chunk) = stream.next().await { if let Ok(chunk) = chunk { - if let Err(_) = rewriter.write(chunk) { + if rewriter.write(chunk).is_err() { wrote_error = true; break; } @@ -1927,7 +1912,7 @@ impl Page { if auto_encoder::is_binary_file(self.get_html_bytes_u8()) { return Default::default(); } - self.links_stream_full_resource::(&selectors) + self.links_stream_full_resource::(selectors) .await } } @@ -1971,10 +1956,7 @@ impl Page { #[inline] #[cfg(not(feature = "decentralized"))] fn abs_path(&self, href: &str) -> Option { - match &self.base { - Some(b) => Some(convert_abs_path(&b, href)), - _ => None, - } + self.base.as_ref().map(|b| 
convert_abs_path(b, href)) } /// Convert a URL to its absolute path without any fragments or params. [unused in the worker atm by default all is returned] diff --git a/spider/src/utils/header_utils.rs b/spider/src/utils/header_utils.rs index 840b1dfc5..56ce9da5d 100644 --- a/spider/src/utils/header_utils.rs +++ b/spider/src/utils/header_utils.rs @@ -19,7 +19,7 @@ pub fn setup_default_headers( if !headers.contains_key(REFERER) { if let Ok(ref_value) = - HeaderValue::from_str(&crate::features::spoof_referrer::spoof_referrer()) + HeaderValue::from_str(crate::features::spoof_referrer::spoof_referrer()) { if !ref_value.is_empty() { headers.insert(REFERER, ref_value); @@ -28,17 +28,14 @@ pub fn setup_default_headers( } if !headers.contains_key(HOST) && configuration.preserve_host_header { - match url { - Some(u) => { - if let Some(host) = u.host_str() { - if let Ok(ref_value) = HeaderValue::from_str(&host) { - if !ref_value.is_empty() { - headers.insert(HOST, ref_value); - } + if let Some(u) = url { + if let Some(host) = u.host_str() { + if let Ok(ref_value) = HeaderValue::from_str(host) { + if !ref_value.is_empty() { + headers.insert(HOST, ref_value); } } } - _ => (), } } diff --git a/spider/src/utils/interner.rs b/spider/src/utils/interner.rs index 7e92b6f66..b5c795756 100644 --- a/spider/src/utils/interner.rs +++ b/spider/src/utils/interner.rs @@ -98,6 +98,11 @@ where self.links_visited.len() } + /// The bucket is empty. + pub fn is_empty(&self) -> bool { + self.links_visited.is_empty() + } + /// Drain the bucket. pub fn drain(&mut self) -> hashbrown::hash_set::Drain<'_, SymbolUsize> { self.links_visited.drain() diff --git a/spider/src/utils/mod.rs b/spider/src/utils/mod.rs index a5ae1dfcf..130ecc3c7 100644 --- a/spider/src/utils/mod.rs +++ b/spider/src/utils/mod.rs @@ -832,20 +832,14 @@ pub fn convert_headers( let mut header_map = reqwest::header::HeaderMap::new(); for (index, items) in headers.iter().enumerate() { - match reqwest::header::HeaderValue::from_str(&items.1) { - Ok(head) => { - use std::str::FromStr; - match reqwest::header::HeaderName::from_str(&items.0) { - Ok(key) => { - header_map.insert(key, head); - } - _ => (), - } + if let Ok(head) = reqwest::header::HeaderValue::from_str(items.1) { + use std::str::FromStr; + if let Ok(key) = reqwest::header::HeaderName::from_str(items.0) { + header_map.insert(key, head); } - _ => (), } // mal headers - if index > 2000 { + if index > 1000 { break; } } @@ -1624,7 +1618,7 @@ pub(crate) fn setup_default_response(target_url: &str, res: &Response) -> PageRe #[cfg(feature = "headers")] headers: Some(res.headers().clone()), #[cfg(feature = "cookies")] - cookies: get_cookies(&res), + cookies: get_cookies(res), status_code: res.status(), final_url: rd, ..Default::default() @@ -2854,7 +2848,7 @@ pub(crate) fn setup_website_selectors( let tld = allowed.tld; match domain_parsed { - Some(u) => get_page_selectors_base(&u, subdomains, tld), + Some(u) => get_page_selectors_base(u, subdomains, tld), _ => get_page_selectors(url, subdomains, tld), } } diff --git a/spider/src/utils/trie.rs b/spider/src/utils/trie.rs index 7d922d044..bc4ccf27a 100644 --- a/spider/src/utils/trie.rs +++ b/spider/src/utils/trie.rs @@ -21,6 +21,12 @@ impl TrieNode { } } +impl Default for TrieNode { + fn default() -> Self { + Self::new() + } +} + #[derive(Debug, Clone, PartialEq)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] /// Trie value. 
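The trie.rs hunks (this one for `TrieNode`, the next for `Trie`) answer clippy's `new_without_default`: a type with a zero-argument `new` should also implement `Default`, so it composes with derived defaults and `..Default::default()` updates. The shape of the fix on a simplified node type; the single `children` map is a stand-in for the real fields:

use std::collections::HashMap;

struct TrieNode {
    children: HashMap<char, TrieNode>,
}

impl TrieNode {
    pub fn new() -> Self {
        TrieNode {
            children: HashMap::new(),
        }
    }
}

// Clippy's suggested impl simply delegates to `new`.
impl Default for TrieNode {
    fn default() -> Self {
        Self::new()
    }
}

fn main() {
    let node = TrieNode::default(); // equivalent to TrieNode::new()
    assert!(node.children.is_empty());
}
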
@@ -31,6 +37,12 @@ pub struct Trie { pub match_all: bool, } +impl Default for Trie { + fn default() -> Self { + Self::new() + } +} + impl Trie { /// A new trie node. pub fn new() -> Self { diff --git a/spider/src/website.rs b/spider/src/website.rs index 5281ba74c..9c027de82 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -423,8 +423,8 @@ impl Website { let whitelist = self.configuration.get_whitelist_compiled(); let blacklist = self.configuration.get_blacklist_compiled(); - let blocked_whitelist = !whitelist.is_empty() && !contains(&whitelist, link); - let blocked_blacklist = !blacklist.is_empty() && contains(&blacklist, link); + let blocked_whitelist = !whitelist.is_empty() && !contains(whitelist, link); + let blocked_blacklist = !blacklist.is_empty() && contains(blacklist, link); if blocked_whitelist || blocked_blacklist || !self.is_allowed_robots(link) { ProcessLinkStatus::Blocked @@ -482,18 +482,15 @@ impl Website { match self.configuration.inner_budget.as_mut() { Some(budget) => { let exceeded_wild_budget = if self.configuration.wild_card_budgeting { - match budget.get_mut(&*WILD_CARD_PATH) { - Some(budget) => { - if budget.abs_diff(0) == 1 { - true - } else if budget == &0 { - true - } else { - *budget -= 1; - false - } + if let Some(budget) = budget.get_mut(&*WILD_CARD_PATH) { + if budget.abs_diff(0) == 1 { + true + } else { + *budget -= 1; + false } - _ => false, + } else { + false } } else { false @@ -525,18 +522,15 @@ impl Website { joint_segment.push_str(seg); if budget.contains_key(&joint_segment) { - match budget.get_mut(&joint_segment) { - Some(budget) => { - if budget.abs_diff(0) == 0 || *budget == 0 { - over = true; - break; - } else { - *budget -= 1; - continue; - } + if let Some(budget) = budget.get_mut(&joint_segment) { + if budget.abs_diff(0) == 0 || *budget == 0 { + over = true; + break; + } else { + *budget -= 1; + continue; } - _ => (), - }; + } } } @@ -560,9 +554,9 @@ impl Website { if self.configuration.inner_budget.is_some() || has_depth_control { if self.configuration.inner_budget.is_none() && has_depth_control { - self.is_over_inner_depth_budget(&link) + self.is_over_inner_depth_budget(link) } else { - self.is_over_inner_budget(&link) + self.is_over_inner_budget(link) } } else { false @@ -661,15 +655,13 @@ impl Website { } _ => None, } - } else { - if let Some(mut d) = self.domain_parsed.as_deref().cloned() { - if let Ok(mut path) = d.path_segments_mut() { - path.clear(); - } - Some(d) - } else { - None + } else if let Some(mut d) = self.domain_parsed.as_deref().cloned() { + if let Ok(mut path) = d.path_segments_mut() { + path.clear(); } + Some(d) + } else { + None } } @@ -807,7 +799,7 @@ impl Website { let user_agent = match &self.configuration.user_agent { Some(ua) => ua.as_str(), - _ => &get_ua(self.only_chrome_agent()), + _ => get_ua(self.only_chrome_agent()), }; if cfg!(feature = "real_browser") { @@ -928,7 +920,7 @@ impl Website { client: reqwest::ClientBuilder, ) -> reqwest::ClientBuilder { let client = client.cookie_store(true); - let client = if !self.configuration.cookie_str.is_empty() && self.domain_parsed.is_some() { + if !self.configuration.cookie_str.is_empty() && self.domain_parsed.is_some() { match self.domain_parsed.clone() { Some(p) => { let cookie_store = reqwest::cookie::Jar::default(); @@ -939,8 +931,7 @@ impl Website { } } else { client - }; - client + } } /// Build the client with cookie configurations. This does nothing with [cookies] flag enabled. 
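The cookie-client hunk directly above is clippy's `let_and_return` fix: rather than binding the `if`/`else` result to `client` and returning the binding on the next line, the expression itself becomes the tail of the function. A small stand-in sketch (the builder type here only mimics `reqwest::ClientBuilder`):

struct ClientBuilder {
    cookie_str: Option<String>,
}

impl ClientBuilder {
    fn cookie_store(self, _enabled: bool) -> Self {
        self
    }
    fn cookie_provider(mut self, jar: String) -> Self {
        self.cookie_str = Some(jar);
        self
    }
}

// Before (clippy `let_and_return`):
//   let client = if !cookie_str.is_empty() { .. } else { client };
//   client
// After: the `if` expression is returned directly.
fn setup_cookies(client: ClientBuilder, cookie_str: &str) -> ClientBuilder {
    let client = client.cookie_store(true);
    if !cookie_str.is_empty() {
        client.cookie_provider(cookie_str.to_string())
    } else {
        client
    }
}

fn main() {
    let b = setup_cookies(ClientBuilder { cookie_str: None }, "sid=1");
    assert_eq!(b.cookie_str.as_deref(), Some("sid=1"));
}
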
@@ -1186,7 +1177,7 @@ impl Website { /// Setup selectors for handling link targets. fn setup_selectors(&self) -> Option { setup_website_selectors( - &self.get_url_parsed(), + self.get_url_parsed(), self.get_url().inner(), AllowedDomainTypes::new(self.configuration.subdomains, self.configuration.tld), ) @@ -1275,7 +1266,11 @@ impl Website { let mut links: HashSet = HashSet::new(); let mut links_ssg = links.clone(); - + let mut links_pages = if self.configuration.return_page_links { + Some(links.clone()) + } else { + None + }; let mut page_links_settings = PageLinkBuildSettings::new(true, self.configuration.full_resources); @@ -1295,6 +1290,7 @@ impl Website { Some(&mut links_ssg), &mut domain_parsed, &mut self.domain_parsed, + &mut links_pages, ) .await; @@ -1317,12 +1313,13 @@ impl Website { client, false, base, - &domains_caseless, + domains_caseless, &page_links_settings, &mut links, Some(&mut links_ssg), &mut domain_parsed, &mut domain_parsed_clone, + &mut links_pages, ) .await, ); @@ -1346,13 +1343,14 @@ impl Website { Some(&mut links_ssg), &mut domain_parsed, &mut self.domain_parsed, + &mut links_pages, ) .await, ); } } - emit_log(&url); + emit_log(url); self.links_visited.insert(match self.on_link_find_callback { Some(cb) => { @@ -1366,24 +1364,24 @@ impl Website { self.status = CrawlStatus::Empty; } + if self.configuration.return_page_links { + page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new); + if let Some(page_links) = page.page_links.as_mut() { + page_links.extend(links_ssg.clone()); + } + } + links.extend(links_ssg); self.initial_status_code = page.status_code; - if page.status_code == reqwest::StatusCode::FORBIDDEN && links.len() == 0 { + if page.status_code == reqwest::StatusCode::FORBIDDEN && links.is_empty() { self.status = CrawlStatus::Blocked; } else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS { self.status = CrawlStatus::RateLimited; } else if page.status_code.is_server_error() { self.status = CrawlStatus::ServerError; } - if self.configuration.return_page_links { - page.page_links = if links.is_empty() { - None - } else { - Some(Box::new(links.clone())) - }; - } channel_send_page(&self.channel, page, &self.channel_guard); @@ -2176,10 +2174,7 @@ impl Website { self.configuration.configure_allowlist(); - let mut q = match &self.channel_queue { - Some(q) => Some(q.0.subscribe()), - _ => None, - }; + let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe()); let semaphore = self.setup_semaphore(); @@ -2222,7 +2217,7 @@ impl Website { biased; Some(link) = stream.next(), if semaphore.available_permits() > 0 => { if !self.handle_process(handle, &mut interval, async { - emit_log_shutdown(&link.inner()); + emit_log_shutdown(link.inner()); let permits = set.len(); set.shutdown().await; semaphore.add_permits(permits); @@ -2240,7 +2235,7 @@ impl Website { continue; } - emit_log(&link.inner()); + emit_log(link.inner()); self.links_visited.insert(link.clone()); @@ -2254,6 +2249,11 @@ impl Website { }; let mut links: HashSet = HashSet::new(); + let mut links_pages = if return_page_links { + Some(links.clone()) + } else { + None + }; let mut relative_selectors = shared.1.clone(); let mut r_settings = shared.7; r_settings.ssg_build = true; @@ -2263,7 +2263,17 @@ impl Website { let mut domain_parsed = None; - let mut page = Page::new_page_streaming(target_url, client, only_html, &mut relative_selectors, external_domains_caseless, &r_settings, &mut links, None, &None, &mut domain_parsed).await; + let mut page = Page::new_page_streaming( 
+ target_url, + client, only_html, + &mut relative_selectors, + external_domains_caseless, + &r_settings, + &mut links, + None, + &None, + &mut domain_parsed, + &mut links_pages).await; let mut retry_count = shared.5; @@ -2277,7 +2287,17 @@ impl Website { if page.status_code == StatusCode::GATEWAY_TIMEOUT { if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async { let mut domain_parsed = None; - let next_page = Page::new_page_streaming(target_url, client, only_html, &mut relative_selectors.clone(), &external_domains_caseless, &r_settings, &mut links, None, &None, &mut domain_parsed).await; + let next_page = Page::new_page_streaming( + target_url, + client, only_html, + &mut relative_selectors.clone(), + external_domains_caseless, + &r_settings, + &mut links, + None, + &None, + &mut domain_parsed, + &mut links_pages).await; page.clone_from(&next_page); @@ -2287,16 +2307,23 @@ impl Website { } } else { - page.clone_from(&Page::new_page_streaming(target_url, &client, only_html, &mut relative_selectors.clone(), external_domains_caseless, &r_settings, &mut links, None, &None, &mut domain_parsed).await); + page.clone_from(&Page::new_page_streaming( + target_url, + client, + only_html, + &mut relative_selectors.clone(), + external_domains_caseless, + &r_settings, + &mut links, + None, + &None, + &mut domain_parsed, + &mut links_pages).await); } } if return_page_links { - page.page_links = if links.is_empty() { - None - } else { - Some(Box::new(links.clone())) - }; + page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new); } channel_send_page(&shared.2, page, &shared.4); @@ -3315,12 +3342,17 @@ impl Website { } } SiteMapEntity::Err(err) => { - log::info!("incorrect sitemap error: {:?}", err.msg()) + log::info!( + "incorrect sitemap error: {:?}", + err.msg() + ) } }; } } - Err(err) => log::info!("http parse error: {:?}", err.to_string()), + Err(err) => { + log::info!("http parse error: {:?}", err.to_string()) + } }; } Err(err) => log::info!("http network error: {}", err.to_string()), @@ -4152,21 +4184,17 @@ impl Website { self.configuration.wild_card_budgeting = wild_card_budget; } if self.configuration.depth > 0 && self.domain_parsed.is_some() { - match &self.domain_parsed { - Some(domain) => match domain.path_segments() { - Some(segments) => { - let segments_cnt = segments.count(); - - if segments_cnt > self.configuration.depth { - self.configuration.depth_distance = self.configuration.depth - + self.configuration.depth.abs_diff(segments_cnt); - } else { - self.configuration.depth_distance = self.configuration.depth; - } + if let Some(ref domain) = self.domain_parsed { + if let Some(segments) = domain.path_segments() { + let segments_cnt = segments.count(); + + if segments_cnt > self.configuration.depth { + self.configuration.depth_distance = self.configuration.depth + + self.configuration.depth.abs_diff(segments_cnt); + } else { + self.configuration.depth_distance = self.configuration.depth; } - _ => (), - }, - _ => (), + } } } } @@ -4237,7 +4265,7 @@ impl Website { let channel = self.channel.get_or_insert_with(|| { let (tx, rx) = broadcast::channel( (if capacity == 0 { - DEFAULT_PERMITS.clone() + *DEFAULT_PERMITS } else { capacity }) @@ -4401,7 +4429,7 @@ fn channel_send_page( channel_guard: &Option, ) { if let Some(c) = channel { - if let Ok(_) = c.0.send(page) { + if c.0.send(page).is_ok() { if let Some(guard) = channel_guard { ChannelGuard::inc_guard(&guard.0 .1) } diff --git a/spider_chrome/src/handler/network.rs b/spider_chrome/src/handler/network.rs 
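In the crawl hunks above, `return_page_links` handling moves from an explicit `if links.is_empty() { None } else { Some(Box::new(links.clone())) }` branch to a combinator chain over the new `links_pages` accumulator: keep the set only when it is non-empty, then box it. A sketch with `String` links standing in for `CaseInsensitiveString`:

use std::collections::HashSet;

fn finalize_page_links(
    links_pages: Option<HashSet<String>>,
) -> Option<Box<HashSet<String>>> {
    // `filter` drops the Some(..) when the predicate fails; `map` boxes the rest.
    links_pages.filter(|pages| !pages.is_empty()).map(Box::new)
}

fn main() {
    // Crawl configured without page-link tracking: stays None.
    assert!(finalize_page_links(None).is_none());
    // Tracking enabled but nothing found: collapses to None.
    assert!(finalize_page_links(Some(HashSet::new())).is_none());
    // Tracking enabled with results: boxed and kept.
    let mut links = HashSet::new();
    links.insert("https://example.com/about".to_string());
    assert!(finalize_page_links(Some(links)).is_some());
}
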
index 73ce69523..eb8cfc86f 100644 --- a/spider_chrome/src/handler/network.rs +++ b/spider_chrome/src/handler/network.rs @@ -230,24 +230,14 @@ impl NetworkManager { self.on_request(&request_will_be_sent, Some(event.request_id.clone().into())); } else { let skip_networking = IGNORE_NETWORKING_RESOURCE_MAP - .contains(&event.resource_type.as_ref()) + .contains(event.resource_type.as_ref()) || self.ignore_visuals - && (IGNORE_VISUAL_RESOURCE_MAP.contains(&event.resource_type.as_ref())) + && (IGNORE_VISUAL_RESOURCE_MAP.contains(event.resource_type.as_ref())) || self.block_stylesheets && ResourceType::Stylesheet == event.resource_type || self.block_javascript && ResourceType::Script == event.resource_type - && !JS_FRAMEWORK_ALLOW.contains(&event.request.url.as_str()) - || (!self.block_javascript - && event - .request - .url - .starts_with("https://www.google-analytics.com") - || event - .request - .url - .starts_with("https://www.googletagmanager.com") - || event.request.url.starts_with("https://px.ads.linkedin.com")); + && !JS_FRAMEWORK_ALLOW.contains(event.request.url.as_str()); if skip_networking { let fullfill_params = diff --git a/spider_chrome/src/page.rs b/spider_chrome/src/page.rs index f84f2aa46..4bcb01852 100644 --- a/spider_chrome/src/page.rs +++ b/spider_chrome/src/page.rs @@ -123,7 +123,7 @@ fn get_plugin_filenames() -> Vec { .collect(); for _ in 0..2 { - plugins.push(generate_random_plugin_filename().into()); + plugins.push(generate_random_plugin_filename()); } plugins.shuffle(&mut thread_rng()); diff --git a/spider_cli/src/main.rs b/spider_cli/src/main.rs index 679b26b32..f84e7f5e2 100644 --- a/spider_cli/src/main.rs +++ b/spider_cli/src/main.rs @@ -58,35 +58,20 @@ async fn main() { .collect::>() })); - match cli.agent { - Some(agent) => { - website.with_user_agent(Some(&agent)); - } - _ => (), + if let Some(ref agent) = cli.agent { + website.with_user_agent(Some(agent)); } - match cli.delay { - Some(delay) => { - website.with_delay(delay); - } - _ => (), + if let Some(delay) = cli.delay { + website.with_delay(delay); } - match cli.limit { - Some(limit) => { - website.with_limit(limit); - } - _ => (), + if let Some(limit) = cli.limit { + website.with_limit(limit); } - match cli.depth { - Some(depth) => { - website.with_depth(depth); - } - _ => (), + if let Some(depth) = cli.depth { + website.with_depth(depth); } - match cli.external_domains { - Some(domains) => { - website.with_external_domains(Some(domains.into_iter())); - } - _ => (), + if let Some(domains) = cli.external_domains { + website.with_external_domains(Some(domains.into_iter())); } match website @@ -113,9 +98,7 @@ async fn main() { if output_links { while let Ok(res) = rx2.recv().await { - match stdout.write_all(string_concat!(res.get_url(), "\n").as_bytes()).await { - _ => () - } + let _ = stdout.write_all(string_concat!(res.get_url(), "\n").as_bytes()).await; } } } @@ -127,9 +110,7 @@ async fn main() { let tmp_path = Path::new(&tmp_dir); if !Path::new(&tmp_path).exists() { - match tokio::fs::create_dir_all(tmp_path).await { - _ => (), - }; + let _ = tokio::fs::create_dir_all(tmp_path).await; } let download_path = PathBuf::from(tmp_path); @@ -139,10 +120,8 @@ async fn main() { }); while let Ok(res) = rx2.recv().await { - match res.get_url_parsed() { - Some(parsed_url) => { - log("Storing", parsed_url); - + if let Some(parsed_url) = res.get_url_parsed() { + log("Storing", parsed_url); let url_path = parsed_url.path(); let split_paths: Vec<&str> = url_path.split('/').collect(); @@ -155,9 +134,7 @@ async fn main() { 
download_path.push(p); if !Path::new(&download_path).exists() { - match tokio::fs::create_dir_all(&download_path).await { - _ => (), - }; + let _ = tokio::fs::create_dir_all(&download_path).await; } } else { match tokio::fs::OpenOptions::new() @@ -173,13 +150,8 @@ async fn main() { ) })).await { Ok(mut file) => { - match res.get_bytes() { - Some(b) => { - match file.write_all(b).await { - _ => () - } - } - _ => (), + if let Some(b) = res.get_bytes() { + let _ = file.write_all(b).await; } } _ => { @@ -188,10 +160,7 @@ async fn main() { } } } - } - _ => () } - } } Some(Commands::SCRAPE { @@ -219,7 +188,7 @@ async fn main() { Default::default() }, "links": match selectors { - Some(ref s) => res.links(&s).await.iter().map(|i| i.inner().to_string()).collect::(), + Some(ref s) => res.links(s).await.iter().map(|i| i.inner().to_string()).collect::(), _ => Default::default() } }); diff --git a/spider_transformations/src/html2text/mod.rs b/spider_transformations/src/html2text/mod.rs index 50ce32dc4..4d64873e8 100644 --- a/spider_transformations/src/html2text/mod.rs +++ b/spider_transformations/src/html2text/mod.rs @@ -854,17 +854,13 @@ where let next_node = next_node.to_process.next(); // Get the next child node to process if let Some(h) = next_node { - match pending_stack.last_mut() { - Some(pending) => { - pending - .prefn - .as_ref() - .map(|ref f| f(context, &h)) - .transpose()?; - } - _ => (), + if let Some(pending) = pending_stack.last_mut() { + pending + .prefn + .as_ref() + .map(|ref f| f(context, &h)) + .transpose()?; } - match process_node(context, h)? { TreeMapResult::Finished(result) => { if let Some(pending) = pending_stack.last_mut() { @@ -1544,7 +1540,7 @@ fn do_render_node<'b, T: Write, D: TextDecorator>( Sup(children) => { // Special case for digit-only superscripts - use superscript // characters. - fn sup_digits(children: &Vec) -> Option { + fn sup_digits(children: &[RenderNode]) -> Option { if children.len() != 1 { return None; } @@ -2134,10 +2130,9 @@ where R: io::Read, D: TextDecorator, { - match config::with_decorator(decorator).string_from_read(input, width) { - Ok(r) => r, - _ => Default::default(), - } + config::with_decorator(decorator) + .string_from_read(input, width) + .unwrap_or_default() } /// Reads HTML from `input`, and returns a `String` with text wrapped to @@ -2146,10 +2141,9 @@ pub fn from_read(input: R, width: usize) -> String where R: io::Read, { - match config::plain().string_from_read(input, width) { - Ok(v) => v, - _ => Default::default(), - } + config::plain() + .string_from_read(input, width) + .unwrap_or_default() } /// Reads HTML from `input`, and returns text wrapped to `width` columns. 
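The spider_cli hunks above trade one-armed `match expr { _ => () }` blocks for `let _ = expr;` where a `Result` is deliberately discarded (best-effort stdout writes and `create_dir_all`), and trade `match`-on-`Option` for `if let Some(..)`. Both idioms side by side, using only std IO:

use std::io::Write;

fn main() {
    let agent: Option<String> = Some("spider".into());

    // Was: match cli.agent { Some(agent) => { .. } _ => () }
    if let Some(ref agent) = agent {
        println!("user agent: {agent}");
    }

    // Was: match stdout.write_all(..) { _ => () }
    // `let _ =` states explicitly that the io::Result is discarded.
    let _ = std::io::stdout().write_all(b"done\n");
}
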
@@ -2159,10 +2153,9 @@ pub fn from_read_rich(input: R, width: usize) -> Vec v, - _ => Default::default(), - } + config::rich() + .lines_from_read(input, width) + .unwrap_or_default() } fn calc_ol_prefix_size(start: i64, num_items: usize, decorator: &D) -> usize { diff --git a/spider_transformations/src/html2text/render/text_renderer.rs b/spider_transformations/src/html2text/render/text_renderer.rs index cdfcdc77e..ca87aa1fb 100644 --- a/spider_transformations/src/html2text/render/text_renderer.rs +++ b/spider_transformations/src/html2text/render/text_renderer.rs @@ -30,7 +30,7 @@ impl Deref for TextRenderer { fn deref(&self) -> &Self::Target { match self.subrender.last() { Some(l) => l, - _ => &self, + _ => self, } } } @@ -57,11 +57,8 @@ impl TextRenderer { /// Add link to global link collection pub fn start_link(&mut self, target: &str) -> crate::html2text::Result<()> { self.links.push(target.to_string()); - match self.subrender.last_mut() { - Some(mt) => { - mt.start_link(target)?; - } - _ => (), + if let Some(mt) = self.subrender.last_mut() { + mt.start_link(target)?; } Ok(()) } @@ -182,16 +179,13 @@ impl TaggedLine { use self::TaggedLineElement::Str; if !self.v.is_empty() { - match self.v.last_mut() { - Some(mt) => { - if let Str(ref mut ts_prev) = mt { - if ts_prev.tag == ts.tag { - ts_prev.s.push_str(&ts.s); - return; - } + if let Some(mt) = self.v.last_mut() { + if let Str(ref mut ts_prev) = mt { + if ts_prev.tag == ts.tag { + ts_prev.s.push_str(&ts.s); + return; } } - _ => (), } } self.v.push(Str(ts)); @@ -220,16 +214,13 @@ impl TaggedLine { use self::TaggedLineElement::Str; if !self.v.is_empty() { - match self.v.last_mut() { - Some(mt) => { - if let Str(ref mut ts_prev) = mt { - if ts_prev.tag == *tag { - ts_prev.s.push(c); - return; - } + if let Some(mt) = self.v.last_mut() { + if let Str(ref mut ts_prev) = mt { + if ts_prev.tag == *tag { + ts_prev.s.push(c); + return; } } - _ => (), } } let mut s = String::new(); @@ -1213,11 +1204,8 @@ impl Renderer for SubRenderer { }; if self.pre_depth == 0 { - match self.wrapping.as_mut() { - Some(w) => { - w.add_text(filtered_text, &self.ann_stack)?; - } - _ => (), + if let Some(w) = self.wrapping.as_mut() { + w.add_text(filtered_text, &self.ann_stack)?; } } else { let mut tag_first = self.ann_stack.clone(); @@ -1226,11 +1214,8 @@ impl Renderer for SubRenderer { tag_first.push(self.decorator.decorate_preformat_first()); tag_cont.push(self.decorator.decorate_preformat_cont()); - match self.wrapping.as_mut() { - Some(w) => { - w.add_preformatted_text(filtered_text, &tag_first, &tag_cont)?; - } - _ => (), + if let Some(w) = self.wrapping.as_mut() { + w.add_preformatted_text(filtered_text, &tag_first, &tag_cont)?; } } Ok(()) @@ -1509,13 +1494,10 @@ impl Renderer for SubRenderer { Ok(()) } fn end_strikeout(&mut self) -> crate::html2text::Result<()> { - match self.text_filter_stack.pop() { - Some(_) => { - let s = self.decorator.decorate_strikeout_end(); - self.add_inline_text(&s)?; - self.ann_stack.pop(); - } - _ => (), + if self.text_filter_stack.pop().is_some() { + let s = self.decorator.decorate_strikeout_end(); + self.add_inline_text(&s)?; + self.ann_stack.pop(); } Ok(()) } @@ -1559,11 +1541,8 @@ impl Renderer for SubRenderer { use self::TaggedLineElement::FragmentStart; self.ensure_wrapping_exists(); - match self.wrapping.as_mut() { - Some(w) => { - w.add_element(FragmentStart(fragname.to_string())); - } - _ => (), + if let Some(w) = self.wrapping.as_mut() { + w.add_element(FragmentStart(fragname.to_string())); } } @@ -1621,6 +1600,12 @@ 
impl PlainDecorator { } } +impl Default for PlainDecorator { + fn default() -> Self { + Self::new() + } +} + impl TextDecorator for PlainDecorator { type Annotation = (); @@ -1714,6 +1699,12 @@ impl TrivialDecorator { } } +impl Default for TrivialDecorator { + fn default() -> Self { + Self::new() + } +} + impl TextDecorator for TrivialDecorator { type Annotation = (); @@ -1830,6 +1821,12 @@ impl RichDecorator { } } +impl Default for RichDecorator { + fn default() -> Self { + Self::new() + } +} + impl TextDecorator for RichDecorator { type Annotation = RichAnnotation; diff --git a/spider_transformations/src/transformation/content.rs b/spider_transformations/src/transformation/content.rs index d39749ea2..a99b16df5 100644 --- a/spider_transformations/src/transformation/content.rs +++ b/spider_transformations/src/transformation/content.rs @@ -180,13 +180,13 @@ pub fn aho_clean_markdown(html: &str) -> String { // handle the error on replace all // if the content is small just use an aho replacement if html.len() <= 40 { - match AHO.try_replace_all(&html, &*AHO_REPLACEMENTS) { + match AHO.try_replace_all(html, &*AHO_REPLACEMENTS) { Ok(r) => r, _ => html.into(), } } else { // regex smooth clean multiple - let cleaned_html = CLEAN_MARKDOWN_REGEX.replace_all(&html, |caps: ®ex::Captures| { + let cleaned_html = CLEAN_MARKDOWN_REGEX.replace_all(html, |caps: ®ex::Captures| { let matched = match caps.get(0) { Some(m) => m.as_str(), _ => Default::default(), @@ -218,8 +218,7 @@ pub fn clean_html_elements(html: &str, tags: Vec<&str>) -> String { Ok(()) }) }) - .collect::>() - .into(), + .collect::>(), ..RewriteStrSettings::default() }, ) { diff --git a/spider_utils/src/lib.rs b/spider_utils/src/lib.rs index 1c5129d47..8413b8ba6 100644 --- a/spider_utils/src/lib.rs +++ b/spider_utils/src/lib.rs @@ -55,8 +55,8 @@ where while let Some(selector) = stream.next().await { for s in selector.1 { - for element in fragment.select(&s) { - process_selector::(element, &selector.0, &mut map); + for element in fragment.select(s) { + process_selector::(element, selector.0, &mut map); } } } @@ -65,9 +65,8 @@ where if !selectors.xpath.is_empty() { if let Ok(package) = parser::parse(html) { let document = package.as_document(); - let mut stream = selectors.xpath.iter(); - while let Some(selector) = stream.next() { + for selector in selectors.xpath.iter() { for s in selector.1 { if let Ok(value) = evaluate_xpath(&document, s) { let text = value.into_string(); @@ -102,12 +101,11 @@ where if !selectors.css.is_empty() { let fragment = Html::parse_fragment(html); - let mut stream = selectors.css.iter(); - while let Some(selector) = stream.next() { + for selector in selectors.css.iter() { for s in selector.1 { - for element in fragment.select(&s) { - process_selector::(element, &selector.0, &mut map); + for element in fragment.select(s) { + process_selector::(element, selector.0, &mut map); } } } @@ -116,9 +114,8 @@ where if !selectors.xpath.is_empty() { if let Ok(package) = parser::parse(html) { let document = package.as_document(); - let mut stream = selectors.xpath.iter(); - while let Some(selector) = stream.next() { + for selector in selectors.xpath.iter() { for s in selector.1 { if let Ok(value) = evaluate_xpath(&document, s) { let text = value.into_string(); diff --git a/spider_worker/src/main.rs b/spider_worker/src/main.rs index 6d34acd21..a95c577f0 100644 --- a/spider_worker/src/main.rs +++ b/spider_worker/src/main.rs @@ -48,7 +48,7 @@ async fn forward( _ => (false, false), }; - let mut page = build(&"", 
Default::default()); + let mut page = build("", Default::default()); let extracted = match spider::page::get_page_selectors(&url_path, subdomains, tld) { Some(mut selectors) => { @@ -67,15 +67,14 @@ async fn forward( None, &None, &mut None, + &mut None, ) .await, ); let mut s = flexbuffers::FlexbufferSerializer::new(); - match links.serialize(&mut s) { - _ => (), - }; + let _ = links.serialize(&mut s); s.take_buffer() }
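One pattern recurs across the whole patch, in `rewriter.write(chunk)`, `channel_send_page`, and the final `links.serialize(&mut s)`: clippy's `redundant_pattern_matching`, where `if let Err(_) = expr` becomes `expr.is_err()` and `if let Ok(_) = expr` becomes `expr.is_ok()` once the matched value is unused. A self-contained sketch of the rewriter-loop shape, with a plain `Vec<u8>` writer instead of lol_html:

use std::io::Write;

fn main() {
    let chunks: Vec<&[u8]> = vec![b"<html>", b"</html>"];
    let mut sink: Vec<u8> = Vec::new();
    let mut wrote_error = false;

    for chunk in chunks {
        // Was: if let Err(_) = sink.write_all(chunk) { .. }
        if sink.write_all(chunk).is_err() {
            wrote_error = true;
            break;
        }
    }

    assert!(!wrote_error);
    assert_eq!(sink, b"<html></html>");
}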