Hdf5 export (#17)
* first skeleton

* added conversion to ndarray

* first hdf5 export

* fixed unique naming for hdf5 group

* fixed warnings when building with no default features

* fixed bug with string parsing

* adapted tests for hdf5 parsing

* added metadata in hdf5 file

* more file metadata

* introduced the most used compressions for hdf5

* added more inlining for performance

* put back C API for parquet and hdf5 export

* added possibility to compile for parallel HDF5

* some clippy improvements

---------

Co-authored-by: «ratal» <«[email protected]»>
ratal and «ratal» authored May 8, 2024
1 parent f893cb3 commit da3b6e0
Showing 15 changed files with 1,379 additions and 234 deletions.
755 changes: 573 additions & 182 deletions Cargo.lock

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "mdfr"
version = "0.6.1"
version = "0.6.2"
description = "A package for reading and writing MDF files"
authors = ["ratal <[email protected]>"]
edition = "2021"
@@ -12,10 +12,13 @@ readme = "README.md"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[features]
default = ["numpy", "parquet", "polars"]
default = ["numpy", "parquet", "polars", "hdf5"]
numpy = ["dep:numpy", "dep:pyo3"]
polars = ["dep:polars", "dep:numpy", "dep:pyo3"]
parquet = ["dep:parquet"]
hdf5 = ["dep:hdf5", "ndarray"]
ndarray = ["dep:ndarray"]
hdf5-mpio = ["hdf5/mpio"]

[dependencies]
clap = "4" # for input arguments
@@ -54,6 +57,10 @@ polars = { version = "0.39", features = [
"fmt",
], optional = true } # for python dataframe
parquet = { version = "51.0.0", optional = true } # to write parquet file
hdf5 = { version = "0.8", optional = true, features = [
"lzf",
] } # to export into hdf5 file
ndarray = { version = "0.15", optional = true } # to convert arrow data into ndarray, needed for hdf5

[dependencies.pyo3]
version = "0.20"
4 changes: 3 additions & 1 deletion README.md
@@ -37,6 +37,8 @@ obj.plot('channel_name')

# Export to parquet:
obj.export_to_parquet('file_name', compression_option)
# Export to hdf5:
obj.export_to_hdf5('file_name')
# write to mdf4 file, compressed or not
obj.write('file_name', compression_flag)
obj.write('file_name', compression_algo)
```
118 changes: 79 additions & 39 deletions src/c_api.rs
@@ -191,42 +191,82 @@ pub unsafe extern "C" fn get_channel_array(
}
}

// / export to Parquet file
// / Compression can be one of the following strings
// / "snappy", "gzip", "lzo", "brotli", "lz4", "lz4raw"
// / or null pointer if no compression wanted
// #[no_mangle]
// pub unsafe extern "C" fn export_to_parquet(
// mdf: *const Mdf,
// file_name: *const c_char,
// compression: *const c_char,
// ) {
// // # Safety
// //
// // It is the caller's guarantee to ensure `file_name`:
// //
// // - is not a null pointer
// // - points to valid, initialized data
// // - points to memory ending in a null byte
// // - won't be mutated for the duration of this function call
// let name = CStr::from_ptr(file_name)
// .to_str()
// .expect("Could not convert into utf8 the file name string");
// let comp = if compression.is_null() {
// None
// } else {
// Some(
// CStr::from_ptr(compression)
// .to_str()
// .expect("Could not convert into utf8 the compression string"),
// )
// };
// if let Some(mdf) = mdf.as_ref() {
// match mdf.export_to_parquet(name, comp) {
// Ok(_) => {}
// Err(e) => panic!("{}", e),
// }
// } else {
// panic!("Null pointer given for Mdf Rust object")
// }
// }
// export to Parquet file
// Compression can be one of the following strings
// "snappy", "gzip", "lzo", "brotli", "lz4", "lz4raw"
// or null pointer if no compression wanted
#[no_mangle]
pub unsafe extern "C" fn export_to_parquet(
mdf: *const Mdf,
file_name: *const c_char,
compression: *const c_char,
) {
// # Safety
//
// It is the caller's guarantee to ensure `file_name`:
//
// - is not a null pointer
// - points to valid, initialized data
// - points to memory ending in a null byte
// - won't be mutated for the duration of this function call
let name = CStr::from_ptr(file_name)
.to_str()
.expect("Could not convert into utf8 the file name string");
let comp = if compression.is_null() {
None
} else {
Some(
CStr::from_ptr(compression)
.to_str()
.expect("Could not convert into utf8 the compression string"),
)
};
if let Some(mdf) = mdf.as_ref() {
match mdf.export_to_parquet(name, comp) {
Ok(_) => {}
Err(e) => panic!("{}", e),
}
} else {
panic!("Null pointer given for Mdf Rust object")
}
}
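
For reference, here is a caller-side sketch of this C ABI, written in Rust for consistency with the rest of the crate. The opaque `Mdf` stand-in is an assumption for illustration; only the `export_to_parquet` signature itself is confirmed by the diff above, and the handle must come from whatever constructor the C API exposes elsewhere:

```rust
use std::ffi::CString;
use std::os::raw::c_char;

// Opaque stand-in for the crate's `Mdf` type as seen across the C ABI.
#[repr(C)]
pub struct Mdf {
    _private: [u8; 0],
}

extern "C" {
    fn export_to_parquet(mdf: *const Mdf, file_name: *const c_char, compression: *const c_char);
}

/// Drives the export above; `mdf` must be a valid handle obtained from
/// the C API's constructor (not part of this diff).
unsafe fn export_snappy(mdf: *const Mdf) {
    let name = CString::new("out.parquet").expect("file name contains no interior NUL");
    let comp = CString::new("snappy").expect("compression contains no interior NUL");
    export_to_parquet(mdf, name.as_ptr(), comp.as_ptr());
    // Passing std::ptr::null() for `compression` writes without compression.
}
```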

// export to hdf5 file
// Compression can be one of the following strings
// "deflate", "lzf"
// or null pointer if no compression wanted
#[no_mangle]
pub unsafe extern "C" fn export_to_hdf5(
mdf: *const Mdf,
file_name: *const c_char,
compression: *const c_char,
) {
// # Safety
//
// It is the caller's guarantee to ensure `file_name`:
//
// - is not a null pointer
// - points to valid, initialized data
// - points to memory ending in a null byte
// - won't be mutated for the duration of this function call
let name = CStr::from_ptr(file_name)
.to_str()
.expect("Could not convert into utf8 the file name string");
let comp = if compression.is_null() {
None
} else {
Some(
CStr::from_ptr(compression)
.to_str()
.expect("Could not convert into utf8 the compression string"),
)
};
if let Some(mdf) = mdf.as_ref() {
match mdf.export_to_hdf5(name, comp) {
Ok(_) => {}
Err(e) => panic!("{}", e),
}
} else {
panic!("Null pointer given for Mdf Rust object")
}
}
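
On the Rust side, both wrappers delegate to methods on `Mdf`. A minimal sketch of direct usage follows; the module path and constructor name are assumptions, while the `export_to_parquet` and `export_to_hdf5` calls with an `Option<&str>` codec mirror exactly what the wrappers above invoke:

```rust
use anyhow::Result;
use mdfr::mdfreader::Mdf; // module path assumed for illustration

fn convert(file: &str) -> Result<()> {
    // Constructor name is an assumption; the export methods are those
    // called by the C wrappers above.
    let mdf = Mdf::new(file);
    mdf.export_to_parquet("out.parquet", Some("snappy"))?;
    mdf.export_to_hdf5("out.h5", Some("lzf"))?;
    Ok(())
}
```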
1 change: 1 addition & 0 deletions src/data_holder/channel_data.rs
@@ -587,6 +587,7 @@ impl ChannelData {
}
}
}
/// returns the arrow DataType equivalent to the ChannelData
pub fn arrow_data_type(&self) -> DataType {
match self {
ChannelData::Int8(_) => DataType::Int8,
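
One plausible use of the newly documented accessor is assembling an arrow schema for export; a sketch, assuming `ChannelData` is reachable at the path implied by the file layout:

```rust
use arrow::datatypes::Field;
use mdfr::data_holder::channel_data::ChannelData; // path assumed public

// Hypothetical helper: build a nullable Field for a channel, using the
// accessor above to pick the arrow DataType.
fn channel_field(name: &str, data: &ChannelData) -> Field {
    Field::new(name, data.arrow_data_type(), true)
}
```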
24 changes: 24 additions & 0 deletions src/data_holder/complex_arrow.rs
@@ -1,9 +1,13 @@
//! complex number stored in primitive builders, fixedsizearraybuilder being too restricted
#[cfg(feature = "ndarray")]
use anyhow::{Context, Error, Result};
use arrow::{
array::{ArrayBuilder, BooleanBufferBuilder, PrimitiveArray, PrimitiveBuilder},
buffer::{BooleanBuffer, MutableBuffer},
datatypes::{ArrowPrimitiveType, Float32Type, Float64Type},
};
#[cfg(feature = "ndarray")]
use ndarray::{Array, Ix2};

/// Complex struct
#[derive(Debug)]
@@ -94,6 +98,26 @@ impl<T: ArrowPrimitiveType> ComplexArrow<T> {
}
}

#[cfg(feature = "ndarray")]
impl ComplexArrow<Float32Type> {
/// to convert ComplexArrow into ndarray
pub fn to_ndarray(&self) -> Result<Array<f32, Ix2>, Error> {
let vector: Vec<f32> = self.values_builder.values_slice().to_vec();
Array::from_shape_vec((self.len(), 2), vector)
.context("Failed reshaping f32 complex arrow into ndarray")
}
}

#[cfg(feature = "ndarray")]
impl ComplexArrow<Float64Type> {
/// to convert ComplexArrow into ndarray
pub fn to_ndarray(&self) -> Result<Array<f64, Ix2>, Error> {
let vector: Vec<f64> = self.values_builder.values_slice().to_vec();
Array::from_shape_vec((vector.len() / 2, 2), vector)
.context("Failed reshaping f64 complex arrow into ndarray")
}
}

impl<T: ArrowPrimitiveType> Default for ComplexArrow<T> {
fn default() -> Self {
Self::new()
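
The interleaved layout means each row of the returned ndarray holds one (re, im) pair; a usage sketch, assuming `ComplexArrow` is importable at the path implied by the file layout:

```rust
use anyhow::Result;
use arrow::datatypes::Float64Type;
use mdfr::data_holder::complex_arrow::ComplexArrow; // path assumed public
use ndarray::Axis;

// Computes |z| per sample from the (n_samples, 2) array produced above,
// where column 0 is the real part and column 1 the imaginary part.
fn magnitudes(arr: &ComplexArrow<Float64Type>) -> Result<Vec<f64>> {
    let nd = arr.to_ndarray()?;
    Ok(nd
        .axis_iter(Axis(0))
        .map(|row| (row[0] * row[0] + row[1] * row[1]).sqrt())
        .collect())
}
```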
35 changes: 34 additions & 1 deletion src/data_holder/tensor_arrow.rs
@@ -1,4 +1,6 @@
//! tensor arrow array adapted to mdf4 specificities (samples of tensors)
#[cfg(feature = "ndarray")]
use anyhow::{Context, Error, Result};
use arrow::{
array::{ArrayBuilder, BooleanBufferBuilder, PrimitiveArray, PrimitiveBuilder},
buffer::{BooleanBuffer, MutableBuffer},
@@ -7,6 +9,8 @@ use arrow::{
UInt16Type, UInt32Type, UInt64Type, UInt8Type,
},
};
#[cfg(feature = "ndarray")]
use ndarray::{Array, IxDyn};

/// Tensor with inner arrow primitive builder
#[derive(Debug)]
@@ -19,7 +23,7 @@ pub struct TensorArrow<T: ArrowPrimitiveType> {
len: usize,
/// shape of tensor
shape: Vec<usize>,
/// order of tesnor, row or column major
/// order of tensor, row or column major
order: Order,
}

@@ -205,3 +209,32 @@ tensor_arrow_clone!(Int64Type);
tensor_arrow_clone!(UInt64Type);
tensor_arrow_clone!(Float32Type);
tensor_arrow_clone!(Float64Type);

#[macro_export]
macro_rules! tensor_arrow_to_ndarray {
($arrow_type:ty, $rust_type:ty) => {
#[cfg(feature = "ndarray")]
impl TensorArrow<$arrow_type> {
/// to convert TensorArrow into ndarray
pub fn to_ndarray(&self) -> Result<Array<$rust_type, IxDyn>, Error> {
let vector: Vec<$rust_type> =
self.values_builder.values_slice().iter().copied().collect();
let mut shape = self.shape().clone();
shape.push(self.len());
Array::from_shape_vec(IxDyn(&shape), vector)
.context("Failed reshaping tensor arrow into ndarray")
}
}
};
}

tensor_arrow_to_ndarray!(UInt8Type, u8);
tensor_arrow_to_ndarray!(Int8Type, i8);
tensor_arrow_to_ndarray!(Int16Type, i16);
tensor_arrow_to_ndarray!(UInt16Type, u16);
tensor_arrow_to_ndarray!(Int32Type, i32);
tensor_arrow_to_ndarray!(UInt32Type, u32);
tensor_arrow_to_ndarray!(Int64Type, i64);
tensor_arrow_to_ndarray!(UInt64Type, u64);
tensor_arrow_to_ndarray!(Float32Type, f32);
tensor_arrow_to_ndarray!(Float64Type, f64);
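
As written, the macro appends the sample count as the trailing axis (`shape.push(self.len())`), so per-sample tensor dimensions come first and the sample axis last. A sketch under that reading, with the `TensorArrow` import path assumed:

```rust
use anyhow::Result;
use arrow::datatypes::Float32Type;
use mdfr::data_holder::tensor_arrow::TensorArrow; // path assumed public

fn print_shape(t: &TensorArrow<Float32Type>) -> Result<()> {
    // For N samples of a 2x3 tensor, expect shape [2, 3, N]:
    // per-sample dimensions first, sample axis last.
    let nd = t.to_ndarray()?;
    println!("ndarray shape: {:?}", nd.shape());
    Ok(())
}
```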
