Skip to content

Commit

Permalink
chore(transform): fix streaming repeated text
Browse files: browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 3, 2024
1 parent 8d540e8 commit a405f82
Show file tree
Hide file tree
Showing 8 changed files with 27 additions and 16 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.13.96"
version = "2.13.97"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.13.96"
version = "2.13.97"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.13.96"
version = "2.13.97"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.13.96"
version = "2.13.97"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
19 changes: 15 additions & 4 deletions spider_transformations/src/transformation/text_extract.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ pub async fn extract_text_streaming_with_size(
}

let mut extracted_text = String::new();
let txx1 = txx.clone();
let mut last_sent_position = 0;

element_content_handlers.push(text!(
"*:not(script):not(style):not(svg):not(noscript):not(nav):not(footer)",
Expand All @@ -105,7 +105,20 @@ pub async fn extract_text_streaming_with_size(
extracted_text.push('\n');
}

let _ = txx1.send(extracted_text.clone());
let new_slice = &extracted_text[last_sent_position..];

if !new_slice.is_empty() {
let _ = txx.send(new_slice.to_string());
last_sent_position = extracted_text.len();
}

// clear the text tracker
if extracted_text.len() > 1024 {
if !extracted_text.ends_with(' ') {
extracted_text.clear();
last_sent_position = 0;
}
}
}

Ok(())
Expand Down Expand Up @@ -138,8 +151,6 @@ pub async fn extract_text_streaming_with_size(
let _ = rewriter.end();
}

drop(txx);

let mut rewrited_bytes: String = String::new();

while let Some(c) = rxx.recv().await {
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.13.96"
version = "2.13.97"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.13.96"
version = "2.13.97"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit a405f82

Please sign in to comment.