From a405f82047501621ea2cff5a854a625c24ad0921 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Mon, 2 Dec 2024 23:16:13 -0500 Subject: [PATCH] chore(transform): fix streaming repeated text --- Cargo.lock | 12 ++++++------ spider/Cargo.toml | 2 +- spider_chrome/Cargo.toml | 2 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- .../src/transformation/text_extract.rs | 19 +++++++++++++++---- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 8 files changed, 27 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6d5aa170a..2c71740f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4297,7 +4297,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.13.96" +version = "2.13.97" dependencies = [ "ahash", "aho-corasick", @@ -4359,7 +4359,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.13.96" +version = "2.13.97" dependencies = [ "adblock", "async-tungstenite", @@ -4394,7 +4394,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.13.96" +version = "2.13.97" dependencies = [ "clap", "env_logger", @@ -4419,7 +4419,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.13.96" +version = "2.13.97" dependencies = [ "aho-corasick", "fast_html2md", @@ -4441,7 +4441,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.13.96" +version = "2.13.97" dependencies = [ "indexmap 1.9.3", "serde", @@ -4453,7 +4453,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.13.96" +version = "2.13.97" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 05c0d29f6..08f8be138 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.13.96" +version = "2.13.97" authors = [ "j-mendez " ] diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index c441b647c..360566eda 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.13.96" +version = "2.13.97" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index f94abe97f..1c415cebd 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.13.96" +version = "2.13.97" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index dd02fa9aa..6ff0ae411 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.13.96" +version = "2.13.97" authors = [ "j-mendez " ] diff --git a/spider_transformations/src/transformation/text_extract.rs b/spider_transformations/src/transformation/text_extract.rs index 45eb03519..2c508aab6 100644 --- a/spider_transformations/src/transformation/text_extract.rs +++ b/spider_transformations/src/transformation/text_extract.rs @@ -87,7 +87,7 @@ pub async fn extract_text_streaming_with_size( } let mut extracted_text = String::new(); - let txx1 = txx.clone(); + let mut last_sent_position = 0; element_content_handlers.push(text!( "*:not(script):not(style):not(svg):not(noscript):not(nav):not(footer)", @@ -105,7 +105,20 @@ pub async fn extract_text_streaming_with_size( extracted_text.push('\n'); } - let _ = txx1.send(extracted_text.clone()); + let new_slice = &extracted_text[last_sent_position..]; + + if !new_slice.is_empty() { + let _ = txx.send(new_slice.to_string()); + last_sent_position = extracted_text.len(); + } + + // clear the text tracker + if extracted_text.len() > 1024 { + if !extracted_text.ends_with(' ') { + extracted_text.clear(); + last_sent_position = 0; + } + } } Ok(()) @@ -138,8 +151,6 @@ pub async fn extract_text_streaming_with_size( let _ = rewriter.end(); } - drop(txx); - let mut rewrited_bytes: String = String::new(); while let Some(c) = rxx.recv().await { diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 734bf978a..38481fea2 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.13.96" +version = "2.13.97" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 9b6ec235a..ea28bf7af 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.13.96" +version = "2.13.97" authors = [ "j-mendez " ]