From 19fe643e0f9f97877fd0bc701fce158f9b281ff0 Mon Sep 17 00:00:00 2001 From: Daniel Falbel Date: Wed, 27 Nov 2024 10:12:11 -0300 Subject: [PATCH 1/2] Refactor `FormattedVector` for faster formatting of S3 objects --- crates/ark/src/variables/r_variables.rs | 4 +- crates/ark/src/variables/variable.rs | 181 +++++----- crates/harp/src/modules/utils.R | 4 + crates/harp/src/utils.rs | 10 + crates/harp/src/vector/formatted_vector.rs | 369 +++++++++++---------- 5 files changed, 292 insertions(+), 276 deletions(-) diff --git a/crates/ark/src/variables/r_variables.rs b/crates/ark/src/variables/r_variables.rs index 64a255bb4..5b05c6054 100644 --- a/crates/ark/src/variables/r_variables.rs +++ b/crates/ark/src/variables/r_variables.rs @@ -301,14 +301,14 @@ impl RVariables { &mut self, path: &Vec, format: ClipboardFormatFormat, - ) -> Result { + ) -> anyhow::Result { r_task(|| { let env = self.env.get().clone(); PositronVariable::clip(env, &path, &format) }) } - fn inspect(&mut self, path: &Vec) -> Result, harp::error::Error> { + fn inspect(&mut self, path: &Vec) -> anyhow::Result> { r_task(|| { let env = self.env.get().clone(); PositronVariable::inspect(env, &path) diff --git a/crates/ark/src/variables/variable.rs b/crates/ark/src/variables/variable.rs index 84d7c9e5a..c6a31f9b5 100644 --- a/crates/ark/src/variables/variable.rs +++ b/crates/ark/src/variables/variable.rs @@ -25,6 +25,7 @@ use harp::object::RObject; use harp::r_null; use harp::r_symbol; use harp::symbol::RSymbol; +use harp::table_info; use harp::utils::pairlist_size; use harp::utils::r_altrep_class; use harp::utils::r_assert_type; @@ -77,12 +78,16 @@ fn plural(text: &str, n: i32) -> String { impl WorkspaceVariableDisplayValue { pub fn from(value: SEXP) -> Self { + Self::try_from(value).unwrap_or_else(|err| Self::from_error(harp::Error::Anyhow(err))) + } + + pub fn try_from(value: SEXP) -> anyhow::Result { // Try to use the display method if there's one available\ if let Some(display_value) = Self::try_from_method(value) { - return display_value; + return Ok(display_value); } - match r_typeof(value) { + let out = match r_typeof(value) { NILSXP => Self::new(String::from("NULL"), false), VECSXP if r_inherits(value, "data.frame") => Self::from_data_frame(value), VECSXP if !r_inherits(value, "POSIXlt") => Self::from_list(value), @@ -93,13 +98,15 @@ impl WorkspaceVariableDisplayValue { CLOSXP => Self::from_closure(value), ENVSXP => Self::from_env(value), LANGSXP => Self::from_language(value), - _ if r_is_matrix(value) => Self::from_matrix(value), - RAWSXP | LGLSXP | INTSXP | REALSXP | STRSXP | CPLXSXP => Self::from_default(value), + _ if r_is_matrix(value) => Self::from_matrix(value)?, + RAWSXP | LGLSXP | INTSXP | REALSXP | STRSXP | CPLXSXP => Self::from_default(value)?, _ => Self::from_error(Error::Anyhow(anyhow!( "Unexpected type {}", r_type2char(r_typeof(value)) ))), - } + }; + + Ok(out) } fn new(display_value: String, is_truncated: bool) -> Self { @@ -292,10 +299,8 @@ impl WorkspaceVariableDisplayValue { // TODO: handle higher dimensional arrays, i.e. expand // recursively from the higher dimension - fn from_matrix(value: SEXP) -> Self { - let formatted = unwrap!(FormattedVector::new(value), Err(err) => { - return Self::from_error(err); - }); + fn from_matrix(value: SEXP) -> anyhow::Result { + let formatted = FormattedVector::new(RObject::from(value))?; let mut display_value = String::from(""); @@ -314,9 +319,13 @@ impl WorkspaceVariableDisplayValue { } display_value.push('['); - for char in formatted.column_iter(i as isize).join(" ").chars() { + for char in formatted + .column_iter_n(i as isize, MAX_DISPLAY_VALUE_LENGTH)? + .join(" ") + .chars() + { if display_value.len() >= MAX_DISPLAY_VALUE_LENGTH { - return Self::new(display_value, true); + return Ok(Self::new(display_value, true)); } display_value.push(char); } @@ -324,13 +333,11 @@ impl WorkspaceVariableDisplayValue { } display_value.push(']'); - Self::new(display_value, false) + Ok(Self::new(display_value, false)) } - fn from_default(value: SEXP) -> Self { - let formatted = unwrap!(FormattedVector::new(value), Err(err) => { - return Self::from_error(err); - }); + fn from_default(value: SEXP) -> anyhow::Result { + let formatted = FormattedVector::new(RObject::from(value))?; let mut display_value = String::with_capacity(MAX_DISPLAY_VALUE_LENGTH); let mut is_truncated = false; @@ -338,7 +345,7 @@ impl WorkspaceVariableDisplayValue { // Performance: value is potentially a very large vector, so we need to be careful // to not format every element of value. Instead only format the necessary elements // to display the first MAX_DISPLAY_VALUE_LENGTH characters. - 'outer: for (i, elt) in formatted.iter().enumerate() { + 'outer: for (i, elt) in formatted.iter_n(MAX_DISPLAY_VALUE_LENGTH)?.enumerate() { if i > 0 { display_value.push_str(" "); } @@ -351,7 +358,7 @@ impl WorkspaceVariableDisplayValue { } } - Self::new(display_value, is_truncated) + Ok(Self::new(display_value, is_truncated)) } fn from_error(err: Error) -> Self { @@ -469,11 +476,13 @@ impl WorkspaceVariableDisplayType { let dfclass = classes.get_unchecked(0).unwrap(); match include_length { true => { - let dim = RFunction::new("base", "dim.data.frame") - .add(value) - .call() - .unwrap(); - let shape = FormattedVector::new(dim.sexp).unwrap().iter().join(", "); + let dim = table_info(value); + let shape = match dim { + Some(info) => { + format!("{}, {}", info.dims.num_rows, info.dims.num_cols) + }, + None => String::from("?, ?"), + }; let display_type = format!("{} [{}]", dfclass, shape); Self::simple(display_type) }, @@ -852,7 +861,7 @@ impl PositronVariable { } } - pub fn inspect(env: RObject, path: &Vec) -> Result, harp::error::Error> { + pub fn inspect(env: RObject, path: &Vec) -> anyhow::Result> { let node = Self::resolve_object_from_path(env, &path)?; match node { @@ -862,12 +871,12 @@ impl PositronVariable { let enclos = Environment::new(RObject::new(env.find(".__enclos_env__")?)); let private = RObject::new(enclos.find("private")?); - Self::inspect_environment(private) + Ok(Self::inspect_environment(private)?) }, - "" => Self::inspect_r6_methods(object), + "" => Ok(Self::inspect_r6_methods(object)?), - _ => Err(harp::error::Error::InspectError { path: path.clone() }), + _ => Err(anyhow!("Unexpected path {:?}", path)), }, EnvironmentVariableNode::Concrete { object } => { @@ -883,23 +892,23 @@ impl PositronVariable { } if object.is_s4() { - Self::inspect_s4(object.sexp) + Ok(Self::inspect_s4(object.sexp)?) } else { match r_typeof(object.sexp) { - VECSXP | EXPRSXP => Self::inspect_list(object.sexp), - LISTSXP => Self::inspect_pairlist(object.sexp), + VECSXP | EXPRSXP => Ok(Self::inspect_list(object.sexp)?), + LISTSXP => Ok(Self::inspect_pairlist(object.sexp)?), ENVSXP => { if r_inherits(object.sexp, "R6") { - Self::inspect_r6(object) + Ok(Self::inspect_r6(object)?) } else { - Self::inspect_environment(object) + Ok(Self::inspect_environment(object)?) } }, LGLSXP | RAWSXP | STRSXP | INTSXP | REALSXP | CPLXSXP => { if r_is_matrix(object.sexp) { Self::inspect_matrix(object.sexp) } else { - Self::inspect_vector(object.sexp) + Ok(Self::inspect_vector(object.sexp)?) } }, _ => Ok(vec![]), @@ -908,7 +917,7 @@ impl PositronVariable { }, EnvironmentVariableNode::Matrixcolumn { object, index } => { - Self::inspect_matrix_column(object.sexp, index) + Ok(Self::inspect_matrix_column(object.sexp, index)?) }, EnvironmentVariableNode::AtomicVectorElement { .. } => Ok(vec![]), } @@ -918,7 +927,7 @@ impl PositronVariable { env: RObject, path: &Vec, _format: &ClipboardFormatFormat, - ) -> Result { + ) -> anyhow::Result { let node = Self::resolve_object_from_path(env, &path)?; match node { @@ -928,7 +937,7 @@ impl PositronVariable { .add(object) .call()?; - Ok(FormattedVector::new(formatted.sexp)?.iter().join("\n")) + Ok(FormattedVector::new(formatted)?.iter()?.join("\n")) } else if r_typeof(object.sexp) == CLOSXP { let deparsed: Vec = RFunction::from("deparse") .add(object.sexp) @@ -937,23 +946,16 @@ impl PositronVariable { Ok(deparsed.join("\n")) } else { - Ok(FormattedVector::new(object.sexp)?.iter().join(" ")) + Ok(FormattedVector::new(object)?.iter()?.join(" ")) } }, EnvironmentVariableNode::R6Node { .. } => Ok(String::from("")), EnvironmentVariableNode::AtomicVectorElement { object, index } => { - let formatted = FormattedVector::new(object.sexp)?; - Ok(formatted.get_unchecked(index)) + let formatted = FormattedVector::new(object)?; + Ok(formatted.format_elt(index)?) }, - EnvironmentVariableNode::Matrixcolumn { object, index } => unsafe { - let dim = IntegerVector::new(Rf_getAttrib(object.sexp, R_DimSymbol))?; - let n_row = dim.get_unchecked(0).unwrap() as usize; - - let clipped = FormattedVector::new(object.sexp)? - .iter() - .skip(index as usize * n_row) - .take(n_row) - .join(" "); + EnvironmentVariableNode::Matrixcolumn { object, index } => { + let clipped = FormattedVector::new(object)?.column_iter(index)?.join(" "); Ok(clipped) }, } @@ -1177,7 +1179,7 @@ impl PositronVariable { Ok(variables) } - fn inspect_matrix(matrix: SEXP) -> harp::error::Result> { + fn inspect_matrix(matrix: SEXP) -> anyhow::Result> { let matrix = RObject::new(matrix); let n_col = match harp::table_info(matrix.sexp) { @@ -1203,50 +1205,50 @@ impl PositronVariable { updated_time: Self::update_timestamp(), }; - let formatted = FormattedVector::new(matrix.sexp)?; - - let variables: Vec = (0..n_col) - .into_iter() - .take(MAX_DISPLAY_VALUE_ENTRIES) - .map(|i| { - // The display value of columns concatenates the column vector values into a - // single string with maximum length of MAX_DISPLAY_VALUE_LENGTH.\ - let mut is_truncated = false; - let mut display_value = String::with_capacity(MAX_DISPLAY_VALUE_LENGTH); - - 'outer: for (i, elt) in formatted.column_iter(i as isize).enumerate() { - if i > 0 { - display_value.push_str(" "); - } - for char in elt.chars() { - if display_value.len() >= MAX_DISPLAY_VALUE_LENGTH { - is_truncated = true; - // We break the outer loop to avoid adding more characters to the - // display value. - break 'outer; - } - display_value.push(char); + let formatted = FormattedVector::new(matrix)?; + let mut variables = Vec::with_capacity(n_col as usize); + + for col in (0..n_col).take(MAX_DISPLAY_VALUE_ENTRIES) { + // The display value of columns concatenates the column vector values into a + // single string with maximum length of MAX_DISPLAY_VALUE_LENGTH. + let mut is_truncated = false; + let mut display_value = String::with_capacity(MAX_DISPLAY_VALUE_LENGTH); + + let iter = formatted + // Even if each column element takes 0 characters, `MAX_DISPLAY_VALUE_LENGTH` + // is enough to fill the display value because we need to account for the space + // between elements. + .column_iter_n(col as isize, MAX_DISPLAY_VALUE_LENGTH)? + .enumerate(); + + 'outer: for (i, elt) in iter { + if i > 0 { + display_value.push_str(" "); + } + for char in elt.chars() { + if display_value.len() >= MAX_DISPLAY_VALUE_LENGTH { + is_truncated = true; + // We break the outer loop to avoid adding more characters to the + // display value. + break 'outer; } + display_value.push(char); } + } - make_variable( - format!("{}", i), - format!("[, {}]", i + 1), - display_value, - is_truncated, - ) - }) - .collect(); + variables.push(make_variable( + format!("{}", col), + format!("[, {}]", col + 1), + display_value, + is_truncated, + )); + } Ok(variables) } - fn inspect_matrix_column(matrix: SEXP, index: isize) -> harp::error::Result> { - let column = - match harp::table::tbl_get_column(matrix, index as i32, harp::TableKind::Matrix) { - Ok(column) => column, - Err(err) => return Err(Error::Anyhow(err)), - }; + fn inspect_matrix_column(matrix: SEXP, index: isize) -> anyhow::Result> { + let column = harp::table::tbl_get_column(matrix, index as i32, harp::TableKind::Matrix)?; let variables: Vec = Self::inspect_vector(column.sexp)? .into_iter() @@ -1260,7 +1262,7 @@ impl PositronVariable { Ok(variables) } - fn inspect_vector(vector: SEXP) -> harp::error::Result> { + fn inspect_vector(vector: SEXP) -> anyhow::Result> { let vector = RObject::new(vector); let r_type = r_typeof(vector.sexp); @@ -1291,13 +1293,12 @@ impl PositronVariable { updated_time: Self::update_timestamp(), }; - let formatted = FormattedVector::new(vector.sexp)?; + let formatted = FormattedVector::new(vector.clone())?; let names = Names::new(vector.sexp, |i| format!("[{}]", i + 1)); let variables: Vec = formatted - .iter() + .iter_n(MAX_DISPLAY_VALUE_ENTRIES)? .enumerate() - .take(MAX_DISPLAY_VALUE_ENTRIES) .map(|(i, value)| { let (is_truncated, display_value) = truncate_chars(value, MAX_DISPLAY_VALUE_LENGTH); // Names are arbitrarily set by users, so we add a safeguard to truncate them diff --git a/crates/harp/src/modules/utils.R b/crates/harp/src/modules/utils.R index be81d8299..b61b47772 100644 --- a/crates/harp/src/modules/utils.R +++ b/crates/harp/src/modules/utils.R @@ -24,3 +24,7 @@ is_matrix <- function(x) { class_collapsed <- function(x) { paste0(class(x), collapse = "/") } + +harp_subset_vec <- function(x, indices) { + x[indices] +} diff --git a/crates/harp/src/utils.rs b/crates/harp/src/utils.rs index cd0e00f87..2a4e1b924 100644 --- a/crates/harp/src/utils.rs +++ b/crates/harp/src/utils.rs @@ -735,6 +735,16 @@ pub fn r_format_vec(x: SEXP) -> Result { } } +pub fn r_subset_vec(x: SEXP, indices: Vec) -> Result { + let env = unsafe { HARP_ENV.unwrap() }; + let indices: Vec = indices.into_iter().map(|i| i + 1).collect(); + let out = RFunction::new("", "harp_subset_vec") + .add(x) + .add(&indices) + .call_in(env)?; + Ok(out.sexp) +} + #[cfg(test)] mod tests { use libr::STRING_ELT; diff --git a/crates/harp/src/vector/formatted_vector.rs b/crates/harp/src/vector/formatted_vector.rs index 01c495279..276f7a2a6 100644 --- a/crates/harp/src/vector/formatted_vector.rs +++ b/crates/harp/src/vector/formatted_vector.rs @@ -4,71 +4,31 @@ // Copyright (C) 2023 Posit Software, PBC. All rights reserved. // // -use libr::R_ClassSymbol; -use libr::R_DimSymbol; -use libr::Rf_getAttrib; -use libr::Rf_xlength; + +use anyhow::anyhow; use libr::CPLXSXP; use libr::INTSXP; use libr::LGLSXP; use libr::RAWSXP; use libr::REALSXP; -use libr::SEXP; use libr::STRSXP; -use crate::error::Error; -use crate::error::Result; use crate::r_format_vec; +use crate::r_is_object; +use crate::r_length; +use crate::r_subset_vec; +use crate::table_info; use crate::utils::r_assert_type; -use crate::utils::r_inherits; -use crate::utils::r_is_null; use crate::utils::r_typeof; use crate::vector::CharacterVector; use crate::vector::ComplexVector; -use crate::vector::Factor; use crate::vector::FormatOptions; use crate::vector::IntegerVector; use crate::vector::LogicalVector; use crate::vector::NumericVector; use crate::vector::RawVector; use crate::vector::Vector; -pub enum FormattedVector { - // simple - Raw { - vector: RawVector, - }, - Logical { - vector: LogicalVector, - }, - Integer { - vector: IntegerVector, - }, - Numeric { - vector: NumericVector, - }, - Character { - vector: CharacterVector, - options: FormatOptions, - }, - Complex { - vector: ComplexVector, - }, - // special - Factor { - vector: Factor, - }, - FormattedVector { - vector: CharacterVector, - options: FormatOptions, - }, -} - -// Formatting options for vectors -#[derive(Default)] -pub struct FormattedVectorOptions { - // Formatting options for character vectors - pub character: FormatOptions, -} +use crate::RObject; impl Default for FormatOptions { fn default() -> Self { @@ -76,144 +36,197 @@ impl Default for FormatOptions { } } +pub struct FormattedVector { + vector: RObject, +} + impl FormattedVector { - pub fn new(vector: SEXP) -> Result { - Self::new_with_options(vector, FormattedVectorOptions::default()) + pub fn new(vector: RObject) -> anyhow::Result { + r_assert_type(vector.sexp, &[ + RAWSXP, LGLSXP, INTSXP, REALSXP, STRSXP, CPLXSXP, + ])?; + Ok(Self { vector }) } - pub fn new_with_options( - vector: SEXP, - formatting_options: FormattedVectorOptions, - ) -> Result { - unsafe { - let class = Rf_getAttrib(vector, R_ClassSymbol); - if r_is_null(class) { - match r_typeof(vector) { - RAWSXP => Ok(Self::Raw { - vector: RawVector::new_unchecked(vector), - }), - LGLSXP => Ok(Self::Logical { - vector: LogicalVector::new_unchecked(vector), - }), - INTSXP => Ok(Self::Integer { - vector: IntegerVector::new_unchecked(vector), - }), - REALSXP => Ok(Self::Numeric { - vector: NumericVector::new_unchecked(vector), - }), - STRSXP => Ok(Self::Character { - vector: CharacterVector::new_unchecked(vector), - options: formatting_options.character, - }), - CPLXSXP => Ok(Self::Complex { - vector: ComplexVector::new_unchecked(vector), - }), - - _ => Err(Error::UnexpectedType(r_typeof(vector), vec![ - RAWSXP, LGLSXP, INTSXP, REALSXP, STRSXP, CPLXSXP, - ])), - } - } else { - if r_inherits(vector, "factor") { - Ok(Self::Factor { - vector: Factor::new_unchecked(vector), - }) - } else { - let formatted = r_format_vec(vector)?; - - r_assert_type(formatted, &[STRSXP])?; - Ok(Self::FormattedVector { - vector: CharacterVector::new_unchecked(formatted), - options: formatting_options.character, - }) - } - } - } + /// Returns an iterator for the vector. + /// Performance: for S3 objects this will cause the iterator to + /// format the entire vector before starting the iteration. + pub fn iter(&self) -> anyhow::Result { + FormattedVectorIter::new_unchecked(self.vector.clone(), None) } - pub fn get_unchecked(&self, index: isize) -> String { - match self { - FormattedVector::Raw { vector } => vector.format_elt_unchecked(index, None), - FormattedVector::Logical { vector } => vector.format_elt_unchecked(index, None), - FormattedVector::Integer { vector } => vector.format_elt_unchecked(index, None), - FormattedVector::Numeric { vector } => vector.format_elt_unchecked(index, None), - FormattedVector::Character { vector, options } => { - vector.format_elt_unchecked(index, Some(options)) - }, - FormattedVector::Complex { vector } => vector.format_elt_unchecked(index, None), - FormattedVector::Factor { vector } => vector.format_elt_unchecked(index, None), - FormattedVector::FormattedVector { vector, options } => { - vector.format_elt_unchecked(index, Some(options)) - }, + /// Returns an iterator over the first `n` elements of a vector. + /// Should be used when the vector is potentially large and you won't need to + /// iterate over the entire vector. + pub fn iter_n(&self, n: usize) -> anyhow::Result { + // The iterators for atomic values and factors are lazy and don't need any special + // treatment. + let length = r_length(self.vector.sexp); + let n = n.min(length as usize); + + FormattedVectorIter::new_unchecked(self.vector.clone(), Some(Box::new(0..n as i64))) + } + + /// Formats a single element of a vector + pub fn format_elt(&self, index: isize) -> anyhow::Result { + // Check if the index is allowed + let length = r_length(self.vector.sexp); + + if index < 0 || index >= length as isize { + return Err(anyhow!("Index out of bounds")); + } + + let indices = vec![index as i64].into_iter(); + let result: Vec = + FormattedVectorIter::new_unchecked(self.vector.clone(), Some(Box::new(indices)))? + .collect(); + + if result.len() != 1 { + return Err(anyhow!("Unexpected error")); } + + Ok(result[0].clone()) } - pub fn len(&self) -> isize { - unsafe { Rf_xlength(self.data()) } + /// Returns an iterator over a column of a matrix. + /// Subset a vector and return an iterator for the selected column. + pub fn column_iter(&self, column: isize) -> anyhow::Result { + let indices = self.column_iter_indices(column)?; + FormattedVectorIter::new_unchecked(self.vector.clone(), Some(Box::new(indices))) } - pub fn data(&self) -> SEXP { - match self { - FormattedVector::Raw { vector } => vector.data(), - FormattedVector::Logical { vector } => vector.data(), - FormattedVector::Integer { vector } => vector.data(), - FormattedVector::Numeric { vector } => vector.data(), - FormattedVector::Character { vector, options: _ } => vector.data(), - FormattedVector::Complex { vector } => vector.data(), - FormattedVector::Factor { vector } => vector.data(), - FormattedVector::FormattedVector { vector, options: _ } => vector.data(), - } + /// Returns an iterator over the first `n` elements of a column of a matrix. + pub fn column_iter_n(&self, column: isize, n: usize) -> anyhow::Result { + let indices = self.column_iter_indices(column)?.take(n); + FormattedVectorIter::new_unchecked(self.vector.clone(), Some(Box::new(indices))) + } + + fn column_iter_indices(&self, column: isize) -> anyhow::Result> { + let dim = table_info(self.vector.sexp) + .ok_or(anyhow!("Not a mtrix"))? + .dims; + + let start = column as i64 * dim.num_rows as i64; + let end = start + dim.num_rows as i64; + Ok(start..end) } } -pub struct FormattedVectorIter<'a> { - formatted: &'a FormattedVector, - index: isize, - size: isize, +enum AtomicVector { + Raw(RawVector), + Logical(LogicalVector), + Integer(IntegerVector), + Numeric(NumericVector), + Character(CharacterVector), + Complex(ComplexVector), } -impl<'a> FormattedVectorIter<'a> { - pub fn new(formatted: &'a FormattedVector) -> Self { - Self { - formatted, - index: 0, - size: formatted.len(), - } +impl AtomicVector { + fn new(vector: RObject) -> anyhow::Result { + let vector = match r_typeof(vector.sexp) { + RAWSXP => AtomicVector::Raw(unsafe { RawVector::new_unchecked(vector.sexp) }), + LGLSXP => AtomicVector::Logical(unsafe { LogicalVector::new_unchecked(vector.sexp) }), + INTSXP => AtomicVector::Integer(unsafe { IntegerVector::new_unchecked(vector.sexp) }), + REALSXP => AtomicVector::Numeric(unsafe { NumericVector::new_unchecked(vector.sexp) }), + STRSXP => { + AtomicVector::Character(unsafe { CharacterVector::new_unchecked(vector.sexp) }) + }, + CPLXSXP => AtomicVector::Complex(unsafe { ComplexVector::new_unchecked(vector.sexp) }), + _ => { + return Err(anyhow!("Unsupported type")); + }, + }; + Ok(vector) } -} -impl<'a> Iterator for FormattedVectorIter<'a> { - type Item = String; + fn format_element(&self, index: isize) -> String { + // We always use the default options for now as this is only used for the variables pane, + // we might want to change that in the future. + let options = FormatOptions::default(); + match self { + AtomicVector::Raw(v) => v.format_elt_unchecked(index, Some(&options)), + AtomicVector::Logical(v) => v.format_elt_unchecked(index, Some(&options)), + AtomicVector::Integer(v) => v.format_elt_unchecked(index, Some(&options)), + AtomicVector::Numeric(v) => v.format_elt_unchecked(index, Some(&options)), + AtomicVector::Character(v) => v.format_elt_unchecked(index, Some(&options)), + AtomicVector::Complex(v) => v.format_elt_unchecked(index, Some(&options)), + } + } - fn next(&mut self) -> Option { - if self.index == self.size { - None - } else { - let out = Some(self.formatted.get_unchecked(self.index)); - self.index = self.index + 1; - out + fn len(&self) -> usize { + unsafe { + match self { + AtomicVector::Raw(v) => v.len(), + AtomicVector::Logical(v) => v.len(), + AtomicVector::Integer(v) => v.len(), + AtomicVector::Numeric(v) => v.len(), + AtomicVector::Character(v) => v.len(), + AtomicVector::Complex(v) => v.len(), + } } } } -impl FormattedVector { - pub fn iter(&self) -> FormattedVectorIter { - FormattedVectorIter::new(self) +pub struct FormattedVectorIter { + vector: AtomicVector, + indices: Box>, +} + +impl FormattedVectorIter { + /// Creates a new iterator over the formatted elements of a vector. + /// If `indices` is `None`, the iterator will iterate over all elements of the vector. + /// If `indices` is `Some`, the iterator will iterate over the elements at the specified indices. + /// The indices must be valid and in bounds as we do no bounds checking. + fn new_unchecked( + vector: RObject, + indices: Option>>, + ) -> anyhow::Result { + // For atomic vectors we just create the iterator directly + if !r_is_object(vector.sexp) { + return Self::from_atomic(AtomicVector::new(vector)?, indices); + } + + // For objects we need to format the vector before iterating. However, we can't + // format the entire vector at once because it might be too large. Instead, we + // subset the vector prior to formatting. + let subset = match indices { + None => vector, + Some(indices) => { + let indices = indices.collect::>(); + RObject::from(r_subset_vec(vector.sexp, indices)?) + }, + }; + let formatted = RObject::from(r_format_vec(subset.sexp)?); + + // We already formatted the selected subset, so we can create an iterator over `None` + // indices, ie, over all elements. + Self::from_atomic(AtomicVector::new(formatted)?, None) } - pub fn column_iter(&self, column: isize) -> FormattedVectorIter { - unsafe { - let object = self.data(); - let dim = IntegerVector::new(Rf_getAttrib(object, R_DimSymbol)).unwrap(); - let n_row = dim.get_unchecked(0).unwrap() as isize; + fn from_atomic( + vector: AtomicVector, + indices: Option>>, + ) -> anyhow::Result { + let indices = match indices { + Some(indices) => indices, + None => { + let len = vector.len(); + Box::new(0..len as i64) + }, + }; - let index = column * n_row; + return Ok(Self { vector, indices }); + } +} - FormattedVectorIter { - formatted: self, - index, - size: index + n_row, - } +impl Iterator for FormattedVectorIter { + type Item = String; + + fn next(&mut self) -> Option { + if let Some(index) = self.indices.next() { + Some(self.vector.format_element(index as isize)) + } else { + None } } } @@ -221,16 +234,13 @@ impl FormattedVector { #[cfg(test)] mod tests { use itertools::Itertools; - use libr::STRSXP; use crate::environment::Environment; use crate::eval::parse_eval0; use crate::fixtures::r_task; use crate::modules::HARP_ENV; - use crate::r_assert_type; use crate::vector::formatted_vector::FormattedVector; - use crate::vector::formatted_vector::FormattedVectorOptions; - use crate::vector::FormatOptions; + use crate::RObject; #[test] fn test_unconforming_format_method() { @@ -243,43 +253,34 @@ mod tests { Environment::new(parse_eval0("init_test_format()", HARP_ENV.unwrap()).unwrap()); // Unconforming dims (posit-dev/positron#1862) - let x = FormattedVector::new(objs.find("unconforming_dims").unwrap()).unwrap(); - let out = x.column_iter(0).join(" "); + let x = FormattedVector::new(RObject::from(objs.find("unconforming_dims").unwrap())) + .unwrap(); + let out = x.column_iter(0).unwrap().join(" "); assert_eq!(out, exp); // Unconforming length - let x = FormattedVector::new(objs.find("unconforming_length").unwrap()).unwrap(); - let out = x.iter().join(" "); + let x = FormattedVector::new(RObject::from(objs.find("unconforming_length").unwrap())) + .unwrap(); + let out = x.iter().unwrap().join(" "); assert_eq!(out, exp); // Unconforming type - let x = FormattedVector::new(objs.find("unconforming_type").unwrap()).unwrap(); - let out = x.iter().join(" "); + let x = FormattedVector::new(RObject::from(objs.find("unconforming_type").unwrap())) + .unwrap(); + let out = x.iter().unwrap().join(" "); assert_eq!(out, exp); }) } #[test] - fn test_formatting_option() { + fn test_na_not_quoted() { r_task(|| { let x = harp::parse_eval_base(r#"c("1", "2", '"a"', "NA", NA_character_)"#).unwrap(); - r_assert_type(x.sexp, &[STRSXP]).unwrap(); - - let formatted = FormattedVector::new_with_options(x.sexp, FormattedVectorOptions { - character: FormatOptions { quote: false }, - }) - .unwrap(); - - let out = formatted.iter().join(" "); - assert_eq!(out, String::from(r#"1 2 "a" NA NA"#)); - let formatted = FormattedVector::new_with_options(x.sexp, FormattedVectorOptions { - character: FormatOptions { quote: true }, - }) - .unwrap(); + let formatted = FormattedVector::new(x).unwrap(); // NA is always unquoted regardless of the quote option - let out = formatted.iter().join(" "); + let out = formatted.iter().unwrap().join(" "); assert_eq!(out, String::from(r#""1" "2" "\"a\"" "NA" NA"#)); }) } From e862651607dbc04090722b4f691f8f99fbcf6a08 Mon Sep 17 00:00:00 2001 From: Daniel Falbel Date: Mon, 9 Dec 2024 10:57:47 -0300 Subject: [PATCH 2/2] Rename to `iter_take` and add safety notice. --- crates/ark/src/variables/variable.rs | 4 ++-- crates/harp/src/vector/formatted_vector.rs | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/ark/src/variables/variable.rs b/crates/ark/src/variables/variable.rs index c6a31f9b5..6bfe665d4 100644 --- a/crates/ark/src/variables/variable.rs +++ b/crates/ark/src/variables/variable.rs @@ -345,7 +345,7 @@ impl WorkspaceVariableDisplayValue { // Performance: value is potentially a very large vector, so we need to be careful // to not format every element of value. Instead only format the necessary elements // to display the first MAX_DISPLAY_VALUE_LENGTH characters. - 'outer: for (i, elt) in formatted.iter_n(MAX_DISPLAY_VALUE_LENGTH)?.enumerate() { + 'outer: for (i, elt) in formatted.iter_take(MAX_DISPLAY_VALUE_LENGTH)?.enumerate() { if i > 0 { display_value.push_str(" "); } @@ -1297,7 +1297,7 @@ impl PositronVariable { let names = Names::new(vector.sexp, |i| format!("[{}]", i + 1)); let variables: Vec = formatted - .iter_n(MAX_DISPLAY_VALUE_ENTRIES)? + .iter_take(MAX_DISPLAY_VALUE_ENTRIES)? .enumerate() .map(|(i, value)| { let (is_truncated, display_value) = truncate_chars(value, MAX_DISPLAY_VALUE_LENGTH); diff --git a/crates/harp/src/vector/formatted_vector.rs b/crates/harp/src/vector/formatted_vector.rs index 276f7a2a6..8e6ec3903 100644 --- a/crates/harp/src/vector/formatted_vector.rs +++ b/crates/harp/src/vector/formatted_vector.rs @@ -58,7 +58,7 @@ impl FormattedVector { /// Returns an iterator over the first `n` elements of a vector. /// Should be used when the vector is potentially large and you won't need to /// iterate over the entire vector. - pub fn iter_n(&self, n: usize) -> anyhow::Result { + pub fn iter_take(&self, n: usize) -> anyhow::Result { // The iterators for atomic values and factors are lazy and don't need any special // treatment. let length = r_length(self.vector.sexp); @@ -176,7 +176,8 @@ impl FormattedVectorIter { /// Creates a new iterator over the formatted elements of a vector. /// If `indices` is `None`, the iterator will iterate over all elements of the vector. /// If `indices` is `Some`, the iterator will iterate over the elements at the specified indices. - /// The indices must be valid and in bounds as we do no bounds checking. + /// SAFETY: The caller must make sure that indices are valid and in bounds. Iteration may panic + /// if the indices are out of bounds. fn new_unchecked( vector: RObject, indices: Option>>,