diff --git a/src/html/mod.rs b/src/html/mod.rs
index 73dfe336..7a33027e 100644
--- a/src/html/mod.rs
+++ b/src/html/mod.rs
@@ -18,12 +18,13 @@ pub use self::text_type::TextType;
pub(crate) fn escape_body_text(mut content: &str, output_handler: &mut impl FnMut(&str)) {
loop {
if let Some(pos) = memchr3(b'&', b'<', b'>', content.as_bytes()) {
- let Some((chunk_before, (matched, rest))) = content
- .split_at_checked(pos)
- .and_then(|(before, rest)| Some((before, rest.split_at_checked(1)?)))
- else {
+ let Some((chunk_before, rest)) = content.split_at_checked(pos) else {
return;
};
+ let Some((matched, rest)) = rest.split_at_checked(1) else {
+ return;
+ };
+
content = rest;
let matched = matched.as_bytes()[0];
diff --git a/src/rewritable_units/text_encoder.rs b/src/rewritable_units/text_encoder.rs
index 3fd002d0..c7b969da 100644
--- a/src/rewritable_units/text_encoder.rs
+++ b/src/rewritable_units/text_encoder.rs
@@ -125,9 +125,36 @@ impl<'output_handler> StreamingHandlerSinkInner<'output_handler> {
}
}
+/// Temporary buffer used for encoding_rs output
enum Buffer {
+ /// Stack buffer avoids heap allocation, and lets go back quickly to the ASCII fast path.
+ Stack([u8; 63]), // leave a byte for the enum's tag, so that the enum has 64-byte size
+ /// Used when encoding_rs asks for a larger buffer, or the content is large enough for small buffer roundtrips to add up
Heap(Vec),
- Stack([u8; 63]), // leave a byte for the tag
+}
+
+impl Buffer {
+ /// Arbitrary limit when to switch from a small on-stack buffer to heap allocation
+ const CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER: usize = 1 << 20;
+
+ /// Arbitrary, about a page size
+ const DEFAULT_HEAP_BUFFER_SIZE: usize = 4096;
+
+ fn buffer_for_length(&mut self, len: usize) -> &mut [u8] {
+ let buffer = match self {
+ Buffer::Heap(buf) => buf.as_mut_slice(),
+ // Long non-ASCII content could take lots of roundtrips through the encoder
+ buf if content.len() >= Self::CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER => {
+ *buf = Buffer::Heap(vec![0; Self::DEFAULT_HEAP_BUFFER_SIZE]);
+ match buf {
+ Buffer::Heap(buf) => buf.as_mut(),
+ _ => unreachable!(),
+ }
+ }
+ Buffer::Stack(buf) => buf.as_mut_slice(),
+ };
+ buffer
+ }
}
struct TextEncoder {
@@ -152,6 +179,7 @@ impl TextEncoder {
#[inline(never)]
fn encode(&mut self, mut content: &str, output_handler: &mut dyn FnMut(&[u8])) {
loop {
+ // First, fast path for ASCII-only prefix
debug_assert!(!self.encoder.has_pending_state()); // ASCII-compatible encodings are not supposed to have it
let ascii_len = Encoding::ascii_valid_up_to(content.as_bytes());
if let Some((ascii, remainder)) = content.split_at_checked(ascii_len) {
@@ -164,20 +192,12 @@ impl TextEncoder {
content = remainder;
}
- let buffer = match &mut self.buffer {
- Buffer::Heap(buf) => buf.as_mut_slice(),
- // Long non-ASCII content could take lots of roundtrips through the encoder
- buf if content.len() >= 1 << 20 => {
- *buf = Buffer::Heap(vec![0; 4096]);
- match buf {
- Buffer::Heap(buf) => buf.as_mut(),
- _ => unreachable!(),
- }
- }
- Buffer::Stack(buf) => buf.as_mut_slice(),
- };
+ // Now the content starts with non-ASCII byte, so encoding_rs may need a buffer to convert to.
+ let buffer = self.buffer.buffer_for_length(content.len());
+ // last == true is needed only for the stateful ISO-JP encoding, which this library doesn't allow
let (result, read, written, _) = self.encoder.encode_from_utf8(content, buffer, false);
+
if written > 0 && written <= buffer.len() {
(output_handler)(&buffer[..written]);
}
@@ -185,20 +205,21 @@ impl TextEncoder {
return;
}
content = &content[read..];
+
match result {
CoderResult::InputEmpty => {
debug_assert!(content.is_empty());
return;
}
+ // we've made progress, and can try again without growing the buffer
+ CoderResult::OutputFull if written > 0 => {}
CoderResult::OutputFull => {
- match &mut self.buffer {
- Buffer::Heap(buf) if buf.len() >= 1024 => {
- if written == 0 {
- panic!("encoding_rs infinite loop"); // encoding_rs only needs a dozen bytes
- }
- }
- buf => *buf = Buffer::Heap(vec![0; 1024]),
- }
+ // encoding_rs only needs a dozen bytes. If a large buffer is insufficient, it must be a bug.
+ assert!(
+ buffer.len() < Buffer::DEFAULT_HEAP_BUFFER_SIZE,
+ "encoding_rs infinite loop"
+ );
+ self.buffer = Buffer::Heap(vec![0; Buffer::DEFAULT_HEAP_BUFFER_SIZE]);
}
}
}
@@ -213,45 +234,60 @@ const fn utf8_width(b: u8) -> u8 {
b.leading_ones() as _
}
+/// Stitches together UTF-8 from byte writes that may split UTF-8 sequences into multiple fragments
struct IncompleteUtf8Resync {
- bytes: [u8; 4],
- len: u8,
+ /// Buffers an incomplete UTF-8 sequence
+ char_bytes: [u8; 4],
+ /// Number of bytes in `bytes`
+ char_len: u8,
}
impl IncompleteUtf8Resync {
pub fn new() -> Self {
Self {
- bytes: [0; 4],
- len: 0,
+ char_bytes: [0; 4],
+ char_len: 0,
}
}
+ /// Returns a valid UTF-8 fragment, and not-yet-checked remainder of the bytes.
+ ///
+ /// Call `discard_incomplete()` after the last write to flush any partially-written chars.
pub fn utf8_bytes_to_slice<'buf, 'src: 'buf>(
&'buf mut self,
mut content: &'src [u8],
) -> Result<(&'buf str, &'src [u8]), Utf8Error> {
- if self.len > 0 {
- let mut found_end_byte = false;
+ // There may be incomplete char buffered from previous write, that must be continued now
+ if self.char_len > 0 {
+ let mut must_emit_now = false;
while let Some((&next_byte, rest)) = content.split_first() {
if is_continuation_byte(next_byte) {
- if let Some(buf) = self.bytes.get_mut(self.len as usize) {
+ if let Some(buf) = self.char_bytes.get_mut(self.char_len as usize) {
*buf = next_byte;
- self.len += 1;
+ self.char_len += 1;
content = rest;
continue;
}
+ // overlong sequences fall here, and will be checked when the char_bytes is flushed
}
- found_end_byte = true;
+ must_emit_now = true;
break;
}
- if found_end_byte || self.len >= utf8_width(self.bytes[0]) {
- let char_buf = self.bytes.get(..self.len as usize).ok_or(Utf8Error)?;
- self.len = 0;
- std::str::from_utf8(char_buf)
- .map_err(|_| Utf8Error)
- .map(|ch| (ch, content))
+ if self.char_len >= utf8_width(self.char_bytes[0]) {
+ must_emit_now = true;
+ }
+
+ if must_emit_now {
+ let char_buf = self
+ .char_bytes
+ .get(..self.char_len as usize)
+ .ok_or(Utf8Error)?;
+ self.char_len = 0;
+ let ch = std::str::from_utf8(char_buf).map_err(|_| Utf8Error)?;
+ Ok((ch, content))
} else {
+ // a partial write has ended without fully completing a char (it's possible to write 1 byte at a time)
debug_assert!(content.is_empty());
Ok(("", b""))
}
@@ -264,11 +300,12 @@ impl IncompleteUtf8Resync {
let (valid, invalid) = content
.split_at_checked(err.valid_up_to())
.ok_or(Utf8Error)?;
- self.bytes
+ // save the incomplete bytes from the end for the next write
+ self.char_bytes
.get_mut(..invalid.len())
.ok_or(Utf8Error)?
.copy_from_slice(invalid);
- self.len = invalid.len() as _;
+ self.char_len = invalid.len() as _;
// valid_up_to promises it is valid
debug_assert!(std::str::from_utf8(valid).is_ok());
let valid = unsafe { std::str::from_utf8_unchecked(valid) };
@@ -280,8 +317,8 @@ impl IncompleteUtf8Resync {
/// True if there were incomplete invalid bytes in the buffer
pub fn discard_incomplete(&mut self) -> bool {
- if self.len > 0 {
- self.len = 0;
+ if self.char_len > 0 {
+ self.char_len = 0;
true
} else {
false