Skip to content

Commit

Permalink
datatype mod cleanly separates default type mapping vs. is compatible
Browse files Browse the repository at this point in the history
  • Loading branch information
rroelke committed Nov 27, 2024
1 parent be27400 commit 74a75d4
Show file tree
Hide file tree
Showing 4 changed files with 482 additions and 514 deletions.
28 changes: 16 additions & 12 deletions tiledb/query-core/src/buffers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ use tiledb_api::query::read::aggregate::AggregateFunctionHandle;
use tiledb_api::{Context, ContextBound};
use tiledb_common::array::CellValNum;

use super::datatype::ToArrowConverter;
use super::field::QueryField;
use super::fields::{QueryField as RequestField, QueryFields};
use super::QueryType;
Expand All @@ -26,8 +25,6 @@ const AVERAGE_STRING_LENGTH: usize = 64;

#[derive(Debug, Error)]
pub enum Error {
#[error("Error converting to Arrow for field '{0}': {1}")]
ArrowConversionError(String, crate::datatype::Error),
#[error("Failed to convert Arrow Array for field '{0}': {1}")]
FailedConversionFromArrow(String, Box<Error>),
#[error("Failed to add field '{0}' to query: {1}")]
Expand Down Expand Up @@ -65,8 +62,10 @@ type Result<T> = std::result::Result<T, Error>;
pub enum FieldError {
#[error("Error reading query field: {0}")]
QueryField(#[from] crate::field::Error),
#[error("Type mismatch for requested field: {0}")]
TypeMismatch(crate::datatype::Error),
#[error("Type mismatch: arrow type '{0}' is not compatible with tiledb type '({1}, {2})'")]
TypeMismatch(adt::DataType, tiledb_common::Datatype, CellValNum),
#[error("No default arrow type: {0}")]
TargetTypeRequired(crate::datatype::DefaultArrowTypeError),
#[error("Failed to allocate buffer: {0}")]
BufferAllocation(ArrowError),
#[error("Unsupported arrow array: {0}")]
Expand Down Expand Up @@ -1436,8 +1435,6 @@ fn request_to_buffers(
field: RequestField,
tiledb_field: &QueryField,
) -> FieldResult<Arc<dyn aa::Array>> {
let conv = ToArrowConverter::strict();

let tdb_dtype = tiledb_field.datatype()?;
let tdb_cvn = tiledb_field.cell_val_num()?;
let tdb_nullable = tiledb_field.nullable()?;
Expand All @@ -1447,13 +1444,20 @@ fn request_to_buffers(
return Ok(array);
}

let arrow_type = if let Some(dtype) = field.target_type() {
conv.convert_datatype_to(&tdb_dtype, &tdb_cvn, tdb_nullable, dtype)
let arrow_type = if let Some(atype) = field.target_type() {
if !crate::datatype::is_physically_compatible(
&atype, tdb_dtype, tdb_cvn,
) {
return Err(FieldError::TypeMismatch(atype, tdb_dtype, tdb_cvn));
}
atype
} else {
conv.convert_datatype(&tdb_dtype, &tdb_cvn, tdb_nullable)
}
.map_err(FieldError::TypeMismatch)?;
crate::datatype::default_arrow_type(tdb_dtype, tdb_cvn)
.map_err(FieldError::TargetTypeRequired)?
.into_inner()
};

// SAFETY: I dunno this unwrap looks sketchy
alloc_array(arrow_type, tdb_nullable, field.capacity().unwrap())
}

Expand Down
251 changes: 251 additions & 0 deletions tiledb/query-core/src/datatype/compatibility.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
use arrow::datatypes::{DataType as ArrowLogicalType, TimeUnit};
use tiledb_common::array::CellValNum;
use tiledb_common::Datatype;

/// Returns whether an arrow [DataType] can be used
/// to query fields with a particular [Datatype] and [CellValNum].
///
/// Both arrow's [DataType] and tiledb's [Datatype] are "logical"
/// types, i.e. they prescribe both the physical shape of the data
/// as well as how it should be interpreted. The variants of each
/// do not perfectly overlap, meaning that there are some logical
/// types which have a natural representation in arrow but not in
/// tiledb, and vice verse.
///
/// To enable using arrow as a means of querying tiledb data,
/// the tiledb logical types which do not have a corresponding
/// arrow type may be queried using the arrow [DataType] which
/// matches the _physical_ type of the tiledb logical type.
/// ```
/// use arrow::datatypes::DataType as ArrowLogicalType;
/// use tiledb_common::Datatype as TileDBLogicalType;
///
/// // `DateTimeFemtosecond` has no corresponding arrow type
/// let tiledb = TileDBLogicalType::DateTimeFemtosecond;
///
/// assert!(is_physically_compatible(&ArrowLogicalType::Int64,
/// tiledb, CellValNum::single()));
/// ```
///
/// If an application uses tiledb as the storage engine for data
/// which is described by an arrow schema, then it may need
/// to query an arrow logical type which does not have a corresponding
/// tiledb logical type. As above, the tiledb [Datatype] which
/// matches the corresponding [ArrowNativeType] can be used.
/// ```
/// use arrow::datatypes::DataType as ArrowLogicalType;
/// use tiledb_common::Datatype as TileDBLogicalType;
///
/// // `Date32` has no corresponding tiledb type
/// let arrow = ArrowLogicalType::Date32;
///
/// assert!(is_physically_compatible(&arrow,
/// TileDBLogicalType::Int32, CellValNum::single()));
/// ```
pub fn is_physically_compatible(
arrow_datatype: &ArrowLogicalType,
tiledb_datatype: Datatype,
tiledb_cell_val_num: CellValNum,
) -> bool {
let is_single = tiledb_cell_val_num == CellValNum::single();
let is_var = tiledb_cell_val_num == CellValNum::Var;

match arrow_datatype {
ArrowLogicalType::Null => false,
ArrowLogicalType::Boolean => {
matches!(tiledb_datatype, Datatype::Boolean) && is_single
}

ArrowLogicalType::Int8 => {
matches!(tiledb_datatype, Datatype::Int8 | Datatype::Char)
&& is_single
}
ArrowLogicalType::Int16 => {
matches!(tiledb_datatype, Datatype::Int16) && is_single
}
ArrowLogicalType::Int32 => {
matches!(tiledb_datatype, Datatype::Int32) && is_single
}
ArrowLogicalType::Int64 => {
matches!(
tiledb_datatype,
Datatype::Int64
| Datatype::DateTimeYear
| Datatype::DateTimeMonth
| Datatype::DateTimeWeek
| Datatype::DateTimeDay
| Datatype::DateTimeHour
| Datatype::DateTimeMinute
| Datatype::DateTimeSecond
| Datatype::DateTimeMillisecond
| Datatype::DateTimeMicrosecond
| Datatype::DateTimeNanosecond
| Datatype::DateTimePicosecond
| Datatype::DateTimeFemtosecond
| Datatype::DateTimeAttosecond
| Datatype::TimeHour
| Datatype::TimeMinute
| Datatype::TimeSecond
| Datatype::TimeMillisecond
| Datatype::TimeMicrosecond
| Datatype::TimeNanosecond
| Datatype::TimePicosecond
| Datatype::TimeFemtosecond
| Datatype::TimeAttosecond
) && is_single
}
ArrowLogicalType::UInt8 => {
matches!(
tiledb_datatype,
Datatype::UInt8
| Datatype::StringAscii
| Datatype::StringUtf8
| Datatype::Any
| Datatype::Blob
| Datatype::Boolean
| Datatype::GeometryWkb
| Datatype::GeometryWkt
) && is_single
}
ArrowLogicalType::UInt16 => {
matches!(
tiledb_datatype,
Datatype::UInt16 | Datatype::StringUtf16 | Datatype::StringUcs2
) && is_single
}
ArrowLogicalType::UInt32 => {
matches!(
tiledb_datatype,
Datatype::UInt32 | Datatype::StringUtf32 | Datatype::StringUcs4
) && is_single
}
ArrowLogicalType::UInt64 => {
matches!(tiledb_datatype, Datatype::UInt64) && is_single
}
ArrowLogicalType::Float32 => {
matches!(tiledb_datatype, Datatype::Float32) && is_single
}
ArrowLogicalType::Float64 => {
matches!(tiledb_datatype, Datatype::Float64) && is_single
}
ArrowLogicalType::Timestamp(TimeUnit::Second, None) => {
matches!(tiledb_datatype, Datatype::DateTimeSecond) && is_single
}
ArrowLogicalType::Timestamp(TimeUnit::Millisecond, None) => {
matches!(tiledb_datatype, Datatype::DateTimeMillisecond)
&& is_single
}
ArrowLogicalType::Timestamp(TimeUnit::Microsecond, None) => {
matches!(tiledb_datatype, Datatype::DateTimeMicrosecond)
&& is_single
}
ArrowLogicalType::Timestamp(TimeUnit::Nanosecond, None) => {
matches!(tiledb_datatype, Datatype::DateTimeNanosecond) && is_single
}
ArrowLogicalType::Timestamp(_, Some(_)) => false,
ArrowLogicalType::Time64(TimeUnit::Second) => {
matches!(tiledb_datatype, Datatype::TimeSecond) && is_single
}
ArrowLogicalType::Time64(TimeUnit::Millisecond) => {
matches!(tiledb_datatype, Datatype::TimeMillisecond) && is_single
}
ArrowLogicalType::Time64(TimeUnit::Microsecond) => {
matches!(tiledb_datatype, Datatype::TimeMicrosecond) && is_single
}
ArrowLogicalType::Time64(TimeUnit::Nanosecond) => {
matches!(tiledb_datatype, Datatype::TimeNanosecond) && is_single
}

ArrowLogicalType::Utf8 | ArrowLogicalType::LargeUtf8 => {
matches!(
tiledb_datatype,
Datatype::StringAscii | Datatype::StringUtf8
) && is_var
}
ArrowLogicalType::Binary | ArrowLogicalType::LargeBinary => {
is_physically_compatible(
&ArrowLogicalType::UInt8,
tiledb_datatype,
CellValNum::single(),
) && is_var
}
ArrowLogicalType::FixedSizeBinary(cvn) => {
if tiledb_cell_val_num != *cvn as u32 {
false
} else {
is_physically_compatible(
&ArrowLogicalType::UInt8,
tiledb_datatype,
CellValNum::single(),
)
}
}

ArrowLogicalType::List(field) | ArrowLogicalType::LargeList(field) => {
// NB: any cell val num is allowed
is_physically_compatible(
field.data_type(),
tiledb_datatype,
CellValNum::single(),
)
}

ArrowLogicalType::FixedSizeList(field, cvn) => {
tiledb_cell_val_num == *cvn as u32
&& is_physically_compatible(
field.data_type(),
tiledb_datatype,
CellValNum::single(),
)
}
ArrowLogicalType::Date32 => {
matches!(tiledb_datatype, Datatype::Int32) && is_single
}
ArrowLogicalType::Date64 => {
matches!(tiledb_datatype, Datatype::Int64) && is_single
}
ArrowLogicalType::Time32(_) => {
matches!(tiledb_datatype, Datatype::Int32) && is_single
}

// TODO: Duration and Interval can be represented
// Decimal128 and Decimal256 could be blobs...

// Notes on other possible relaxed conversions:
//
// Duration and some intervals are likely supportable, but
// leaving them off for now as the docs aren't clear.
//
// Views are also likely supportable, but will likely require
// separate buffer allocations since individual values are not
// contiguous.
//
// Struct and Union are never supportable (given current core)
//
// Dictionary is, but they should be handled higher up the stack
// to ensure that things line up with enumerations.
//
// Decimal128 and Decimal256 might be supportable using Float64
// and 2 or 4 fixed length cell val num. Though it'd be fairly
// hacky.
//
// Map isn't supported in TileDB (given current core)
//
// RunEndEncoded is probably supportable, but like views will
// require separate buffer allocations so leaving for now.
ArrowLogicalType::Float16
| ArrowLogicalType::Duration(_)
| ArrowLogicalType::Interval(_)
| ArrowLogicalType::BinaryView
| ArrowLogicalType::Utf8View
| ArrowLogicalType::ListView(_)
| ArrowLogicalType::LargeListView(_)
| ArrowLogicalType::Struct(_)
| ArrowLogicalType::Union(_, _)
| ArrowLogicalType::Dictionary(_, _)
| ArrowLogicalType::Decimal128(_, _)
| ArrowLogicalType::Decimal256(_, _)
| ArrowLogicalType::Map(_, _)
| ArrowLogicalType::RunEndEncoded(_, _) => false,
}
}
Loading

0 comments on commit 74a75d4

Please sign in to comment.