From 51c3eb7ae4f84f8bc33811b5f1842187ca81c6ce Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 4 Nov 2024 23:12:02 -0800 Subject: [PATCH] R & W accessor changes, minimize code bloat --- lib/kernels/include/kernels/accessor.h | 154 ++++++++---------- lib/kernels/include/kernels/cast_kernels.h | 8 +- .../include/kernels/cast_kernels_cpu.h | 8 +- .../include/kernels/datatype_dispatch.h | 10 +- .../kernels/managed_per_device_ff_handle.h | 5 +- lib/kernels/src/accessor.cc | 107 +++--------- lib/kernels/src/cpu/cast_kernels.cc | 13 +- lib/kernels/src/cpu/replicate_kernels.cc | 9 +- lib/kernels/src/cpu/reverse_kernels.cc | 24 +-- lib/kernels/src/cuda/ops/cast_kernels.cu | 12 +- lib/kernels/src/cuda/ops/linear_kernels.cu | 42 ++--- .../src/managed_per_device_ff_handle.cc | 8 +- lib/kernels/test/src/test_attention_kernel.cc | 2 +- .../test/src/test_batch_matmul_kernel.cc | 2 +- .../test/src/test_batch_norm_kernel.cc | 6 +- lib/kernels/test/src/test_cast_kernel.cc | 25 +-- lib/kernels/test/src/test_combine_kernel.cc | 9 +- lib/kernels/test/src/test_concat_kernel.cc | 2 +- lib/kernels/test/src/test_dropout.cc | 2 +- lib/kernels/test/src/test_flat_kernel.cc | 12 +- lib/kernels/test/src/test_gather_kernels.cc | 2 +- .../test/src/test_layer_norm_kernels.cc | 8 +- .../test/src/test_managed_ff_stream.cc | 12 +- .../src/test_managed_per_device_ff_handle.cc | 14 +- lib/kernels/test/src/test_partition_kernel.cc | 10 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 6 +- lib/kernels/test/src/test_reduction_kernel.cc | 6 +- lib/kernels/test/src/test_replicate_kernel.cc | 11 +- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 17 +- lib/kernels/test/src/test_softmax_kernel.cc | 2 +- lib/kernels/test/src/test_split_kernel.cc | 4 +- lib/kernels/test/src/test_transpose_kernel.cc | 2 +- lib/kernels/test/src/test_utils.cc | 114 ++++++++++--- lib/kernels/test/src/test_utils.h | 92 ++--------- lib/local-execution/src/ops/cast.cc | 8 +- lib/local-execution/src/ops/linear.cc | 14 +- .../test/src/test_local_cost_estimator.cc | 2 +- 38 files changed, 330 insertions(+), 456 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 0a134db695..653c8db42d 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -13,54 +13,36 @@ namespace FlexFlow { struct Allocator; -class GenericTensorAccessorW { +class GenericTensorAccessorR { public: template - typename data_type_enum_to_class
<DT>::type *get() const {
+  typename data_type_enum_to_class<DT>
::type const *get() const { if (this->data_type == DT) { - return static_cast *>(this->ptr); + return static_cast const *>(this->ptr); } else { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } } - int32_t *get_int32_ptr() const; - int64_t *get_int64_ptr() const; - float *get_float_ptr() const; - double *get_double_ptr() const; - half *get_half_ptr() const; + int32_t const *get_int32_ptr() const; + int64_t const *get_int64_ptr() const; + float const *get_float_ptr() const; + double const *get_double_ptr() const; + half const *get_half_ptr() const; - GenericTensorAccessorW() = delete; + GenericTensorAccessorR() = delete; - GenericTensorAccessorW(DataType data_type, + GenericTensorAccessorR(DataType data_type, ArrayShape const &shape, - void *ptr, + void const *ptr, DeviceType device_type); - bool operator==(GenericTensorAccessorW const &) const; - bool operator!=(GenericTensorAccessorW const &) const; - - template - real_type_t
<DT> &at(Indices... indices) {
-    if (this->device_type != DeviceType::CPU) {
-      throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
-    }
-    if (this->data_type != DT) {
-      throw mk_runtime_error(fmt::format(
-          "Invalid access data type ({} != {})", this->data_type, DT));
-    }
-
-    using T = real_type_t
<DT>;
-
-    T *data_ptr = static_cast<T *>(this->ptr);
-    size_t offset = calculate_index_offset({static_cast<size_t>(indices)...});
-
-    return data_ptr[offset];
-  }
+  bool operator==(GenericTensorAccessorR const &) const;
+  bool operator!=(GenericTensorAccessorR const &) const;
-  template <DataType DT, typename... Indices>
-  real_type_t
<DT> const &at(Indices... indices) const {
+  template <DataType DT>
+  real_type_t<DT>
const &at(std::vector<size_t> const &indices) const {
     if (this->device_type != DeviceType::CPU) {
       throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
     }
@@ -72,7 +54,7 @@ class GenericTensorAccessorW {
     using T = real_type_t
; T const *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset({static_cast(indices)...}); + size_t offset = calculate_index_offset(indices); return data_ptr[offset]; } @@ -80,7 +62,7 @@ class GenericTensorAccessorW { public: DataType data_type; ArrayShape shape; - void *ptr; + void const *ptr; DeviceType device_type; private: @@ -90,43 +72,62 @@ class GenericTensorAccessorW { decltype(device_type) const &> tie() const; - size_t calculate_index_offset( - std::initializer_list const &indices) const; + size_t calculate_index_offset(std::vector const &indices) const; }; -std::string format_as(GenericTensorAccessorW const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); +std::string format_as(GenericTensorAccessorR const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); -class GenericTensorAccessorR { +class GenericTensorAccessorW { public: template - typename data_type_enum_to_class
<DT>::type const *get() const {
+  typename data_type_enum_to_class<DT>
::type *get() const { if (this->data_type == DT) { - return static_cast const *>(this->ptr); + return static_cast *>(this->ptr); } else { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } } - int32_t const *get_int32_ptr() const; - int64_t const *get_int64_ptr() const; - float const *get_float_ptr() const; - double const *get_double_ptr() const; - half const *get_half_ptr() const; + int32_t *get_int32_ptr() const; + int64_t *get_int64_ptr() const; + float *get_float_ptr() const; + double *get_double_ptr() const; + half *get_half_ptr() const; - GenericTensorAccessorR() = delete; + GenericTensorAccessorW() = delete; - GenericTensorAccessorR(DataType data_type, + GenericTensorAccessorW(DataType data_type, ArrayShape const &shape, - void const *ptr, + void *ptr, DeviceType device_type); - bool operator==(GenericTensorAccessorR const &) const; - bool operator!=(GenericTensorAccessorR const &) const; + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + operator GenericTensorAccessorR() const; + + template + real_type_t
&at(std::vector<size_t> const &indices) {
+    if (this->device_type != DeviceType::CPU) {
+      throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
+    }
+    if (this->data_type != DT) {
+      throw mk_runtime_error(fmt::format(
+          "Invalid access data type ({} != {})", this->data_type, DT));
+    }
+
+    using T = real_type_t
<DT>;
+
+    T *data_ptr = static_cast<T *>(this->ptr);
+    size_t offset = calculate_index_offset(indices);
+
+    return data_ptr[offset];
+  }
-  template <DataType DT, typename... Indices>
-  real_type_t
<DT> const &at(Indices... indices) const {
+  template <DataType DT>
+  real_type_t<DT> const
&at(std::vector<size_t> const &indices) const {
     if (this->device_type != DeviceType::CPU) {
       throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
     }
@@ -138,7 +139,7 @@ class GenericTensorAccessorR {
     using T = real_type_t
; T const *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset({static_cast(indices)...}); + size_t offset = calculate_index_offset(indices); return data_ptr[offset]; } @@ -146,7 +147,7 @@ class GenericTensorAccessorR { public: DataType data_type; ArrayShape shape; - void const *ptr; + void *ptr; DeviceType device_type; private: @@ -156,27 +157,11 @@ class GenericTensorAccessorR { decltype(device_type) const &> tie() const; - size_t calculate_index_offset( - std::initializer_list const &indices) const; + size_t calculate_index_offset(std::vector const &indices) const; }; -std::string format_as(GenericTensorAccessorR const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); - -int32_t *get_int32_ptr(GenericTensorAccessorW const &); -int64_t *get_int64_ptr(GenericTensorAccessorW const &); -float *get_float_ptr(GenericTensorAccessorW const &); -double *get_double_ptr(GenericTensorAccessorW const &); -half *get_half_ptr(GenericTensorAccessorW const &); -std::vector - get_int32_ptrs(std::vector const &); -std::vector - get_int64_ptrs(std::vector const &); -std::vector - get_float_ptrs(std::vector const &); -std::vector - get_double_ptrs(std::vector const &); -std::vector get_half_ptrs(std::vector const &); +std::string format_as(GenericTensorAccessorW const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); static_assert(is_fmtable const &>::value, ""); @@ -241,12 +226,8 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -254,16 +235,9 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); - -void transfer_data_between_accessors( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorR const &src_accessor); -void transfer_data_between_accessors( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorW const &src_accessor); +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); GenericTensorAccessorR copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index f67613cec6..21e76fed1d 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -8,15 +8,11 @@ namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); } // namespace FlexFlow::Kernels::Cast diff --git 
a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index 959617dcae..275476b4e6 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -7,14 +7,10 @@ namespace FlexFlow::Kernels::Cast { void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index 0986d99791..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ b/lib/kernels/include/kernels/datatype_dispatch.h @@ -34,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -42,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -55,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -69,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..f9f944c6ff 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 9332dd6703..4cb5bd83a2 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -4,7 +4,7 @@ namespace FlexFlow { -void transfer_data_between_accessors( +void copy_accessor_data_to_l_from_r( GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor) { size_t num_bytes = dst_accessor.shape.get_volume() * @@ -25,6 +25,8 @@ void transfer_data_between_accessors( checkCUDA(cudaMemcpy( dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); } else { + assert(src_device_type == DeviceType::GPU); + assert(src_device_type == DeviceType::CPU); checkCUDA(cudaMemcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes, @@ -32,12 +34,8 @@ void transfer_data_between_accessors( } } -void transfer_data_between_accessors( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorW const &src_accessor) { - GenericTensorAccessorR r_src_accessor = - read_only_accessor_from_write_accessor(src_accessor); - transfer_data_between_accessors(dst_accessor, r_src_accessor); +GenericTensorAccessorW::operator GenericTensorAccessorR() const { + return read_only_accessor_from_write_accessor(*this); } GenericTensorAccessorW::GenericTensorAccessorW( @@ -56,7 +54,7 @@ std::tuple const &indices) const { + std::vector const &indices) const { if (indices.size() != this->shape.num_dims()) { throw mk_runtime_error(fmt::format( @@ -67,22 +65,18 @@ size_t GenericTensorAccessorW::calculate_index_offset( size_t offset = 0; size_t multiplier = 1; - size_t cur_idx; - auto it = indices.begin(); for (size_t i = 0; i < this->shape.num_dims(); i++) { - cur_idx = *it++; - - if (cur_idx >= this->shape.at(legion_dim_t(i))) { + if (indices[i] >= this->shape.at(legion_dim_t(i))) { throw mk_runtime_error( fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, - cur_idx, + indices[i], this->shape.at(legion_dim_t(i)))); } - offset += cur_idx * multiplier; + offset += indices[i] 
* multiplier; multiplier *= this->shape.at(legion_dim_t(i)); } @@ -146,7 +140,7 @@ std::tuple const &indices) const { + std::vector const &indices) const { if (indices.size() != this->shape.num_dims()) { throw mk_runtime_error(fmt::format( @@ -155,24 +149,20 @@ size_t GenericTensorAccessorR::calculate_index_offset( this->shape.num_dims())); } - size_t offset = 0; + ssize_t offset = 0; size_t multiplier = 1; - size_t cur_idx; - auto it = indices.begin(); for (size_t i = 0; i < this->shape.num_dims(); i++) { - cur_idx = *it++; - - if (cur_idx >= this->shape.at(legion_dim_t(i))) { + if (indices[i] >= this->shape.at(legion_dim_t(i))) { throw mk_runtime_error( fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, - cur_idx, + indices[i], this->shape.at(legion_dim_t(i)))); } - offset += cur_idx * multiplier; + offset += indices[i] * multiplier; multiplier *= this->shape.at(legion_dim_t(i)); } @@ -220,51 +210,6 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { return (s << fmt::to_string(a)); } -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { return get(a); } @@ -318,18 +263,11 @@ GenericTensorAccessorR read_only_accessor_from_write_accessor( writable.device_type}; } -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; } -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, DataType const &expected_dtype) { @@ -342,11 +280,6 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - template struct CopyTensorAccessorW { GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, @@ -355,7 +288,7 @@ struct CopyTensorAccessorW { get_tensor_shape(src_accessor.shape, src_accessor.data_type); GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - transfer_data_between_accessors(dst_accessor, src_accessor); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); return dst_accessor; } @@ -365,7 +298,7 @@ GenericTensorAccessorW copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, Allocator &allocator) { return DataTypeDispatch1{}( - 
src_accessor.data_type, src_accessor, std::ref(allocator)); + src_accessor.data_type, src_accessor, allocator); } template @@ -376,7 +309,7 @@ struct CopyTensorAccessorR { get_tensor_shape(src_accessor.shape, src_accessor.data_type); GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - transfer_data_between_accessors(dst_accessor, src_accessor); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); return read_only_accessor_from_write_accessor(dst_accessor); } @@ -386,7 +319,7 @@ GenericTensorAccessorR copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, Allocator &allocator) { return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, std::ref(allocator)); + src_accessor.data_type, src_accessor, allocator); } } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index 2d3f440c75..5a00503fe4 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -37,18 +37,15 @@ struct CPUBackwardKernel { }; void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { - DataTypeDispatch2{}(input_type, output_type, input, output); + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); } void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, input, output); + input.data_type, output.data_type, input, output); } } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 683739b91e..25693b374d 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -22,24 +22,23 @@ struct CPUBackwardKernel { for (size_t i = 0; i < input.shape.num_elements(); i++) { T cur_sum = 0; for (size_t j = 0; j < num_replicas; j++) { - cur_sum += output.at
<DT>(i, j);
+        cur_sum += output.at<DT>
({i, j});
       }
-      input.at<DT>
(i) = cur_sum;
+      input.at<DT>
({i}) = cur_sum; } } }; void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW &output) { - DataTypeDispatch1{}( - input.data_type, input, std::ref(output)); + DataTypeDispatch1{}(input.data_type, input, output); } void cpu_backward_kernel(GenericTensorAccessorR const &output, GenericTensorAccessorW &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, output, std::ref(input), num_replicas); + input.data_type, output, input, num_replicas); } } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index bc114c4e60..e5b3719d74 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -11,17 +11,17 @@ struct CPUReverseForwardKernel { GenericTensorAccessorW &output) { assert(input.data_type == DT && output.data_type == DT); - coord_t num_out_blocks = input.shape.at(legion_dim_t(0)); - coord_t reverse_dim_size = input.shape.at(legion_dim_t(1)); - coord_t in_block_size = input.shape.at(legion_dim_t(2)); + size_t num_out_blocks = input.shape.at(legion_dim_t(0)); + size_t reverse_dim_size = input.shape.at(legion_dim_t(1)); + size_t in_block_size = input.shape.at(legion_dim_t(2)); - for (coord_t block_idx = 0; block_idx < num_out_blocks; block_idx++) { - for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { - for (coord_t i = 0; i < in_block_size; i++) { - output.at
<DT>(block_idx, rev_idx, i) =
-              input.at<DT>
(num_out_blocks - 1 - block_idx, reverse_dim_size - 1 - rev_idx, in_block_size - 1 - i);
+          output.at<DT>
({block_idx, rev_idx, i}) =
+              input.at<DT>
({num_out_blocks - 1 - block_idx, + reverse_dim_size - 1 - rev_idx, + in_block_size - 1 - i}); } } } @@ -31,13 +31,13 @@ struct CPUReverseForwardKernel { void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, GenericTensorAccessorW &output_accessor) { DataTypeDispatch1{}( - input_accessor.data_type, input_accessor, std::ref(output_accessor)); + input_accessor.data_type, input_accessor, output_accessor); } void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, GenericTensorAccessorW &input_accessor) { DataTypeDispatch1{}( - output_accessor.data_type, output_accessor, std::ref(input_accessor)); + output_accessor.data_type, output_accessor, input_accessor); } } // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index b895ffb68f..dc342fd0e0 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -60,20 +60,16 @@ struct BackwardKernel { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index 29b77fd9d9..f13ebee67e 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - (void *)weight_ptr, + reinterpret_cast(weight_ptr), weight_type, in_dim, - (void *)input_ptr, + reinterpret_cast(input_ptr), input_type, in_dim, &beta, - (void *)output_ptr, + reinterpret_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - (void *)bias_ptr, + reinterpret_cast(bias_ptr), weight_type, 1, - (void *)m.one_ptr, + reinterpret_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - (void *)output_ptr, + reinterpret_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - (void *)output_ptr, + reinterpret_cast(output_ptr), &beta, m.outputTensor, - (void *)output_ptr)); + reinterpret_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -217,14 +217,14 @@ void backward_kernel(cudaStream_t stream, if (m.activation.has_value()) { if (m.activation == Activation::RELU) { relu_backward_kernel(m.output_type, - (void *)output_grad_ptr, - (void *)output_ptr, + reinterpret_cast(output_grad_ptr), + reinterpret_cast(output_ptr), output_size, stream); } else if (m.activation == Activation::SIGMOID) { sigmoid_backward_kernel(m.output_type, - (void *)output_grad_ptr, - (void *)output_ptr, + reinterpret_cast(output_grad_ptr), + reinterpret_cast(output_ptr), output_size, 
stream); } else { @@ -241,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - (void *)input_ptr, + reinterpret_cast(input_ptr), input_type, in_dim, - (void *)output_grad_ptr, + reinterpret_cast(output_grad_ptr), output_type, out_dim, &alpha, - (void *)kernel_grad_ptr, + reinterpret_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -290,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - (void *)m.one_ptr, + reinterpret_cast(m.one_ptr), CUDA_R_32F, 1, - (void *)output_grad_ptr, + reinterpret_cast(output_grad_ptr), output_type, out_dim, &alpha, - (void *)bias_grad_ptr, + reinterpret_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -313,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - (void *)kernel_ptr, + reinterpret_cast(kernel_ptr), weight_type, in_dim, - (void *)output_grad_ptr, + reinterpret_cast(output_grad_ptr), output_type, out_dim, &alpha, - (void *)input_grad_ptr, + reinterpret_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index ca105f9bc9..5bd49dc26f 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -3,10 +3,11 @@ namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { this->handle = new PerDeviceFFHandle; - this->handle->workSpaceSize = 1024 * 1024; - this->handle->allowTensorOpMathConversion = true; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; checkCUDNN(cudnnCreate(&this->handle->dnn)); checkCUBLAS(cublasCreate(&this->handle->blas)); @@ -37,7 +38,6 @@ ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { checkCUBLAS(cublasDestroy(this->handle->blas)); checkCUDA(cudaFree(this->handle->workSpace)); delete this->handle; - this->handle = nullptr; } } diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 5245fab915..aae3676107 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { size_t qoSeqLength = 20, kvSeqLength = 20; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index c08e08fd08..b87f3978b5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { size_t seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index a8a26b8eaf..a258a27a34 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -9,7 +9,7 @@ TEST_SUITE(FF_TEST_SUITE) { size_t output_n = 1, output_c = 10, output_h = 10, output_w = 10; ManagedFFStream 
managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); Allocator allocator = create_local_cuda_memory_allocator(); @@ -37,11 +37,11 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + create_filled_accessor_w(scale_shape, allocator, DataTypeValue(1.0f)); SUBCASE("forward_kernel") { GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + create_filled_accessor_w(bias_shape, allocator, DataTypeValue(0.0f)); Kernels::BatchNorm::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index c5b1d98bb1..1be5839a9c 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -21,11 +21,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); CHECK(contains_non_zero(output_accessor)); } @@ -38,9 +35,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Cast::backward_kernel(managed_stream.raw_stream(), grad_output_accessor, - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); + grad_input_accessor); CHECK(contains_non_zero(grad_input_accessor)); } @@ -65,11 +60,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW output_accessor_gpu = create_zero_filled_accessor_w(output_shape, gpu_allocator); - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor_gpu, - output_accessor_gpu, - DataType::FLOAT, - DataType::DOUBLE); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); // Run CPU Forward Kernel GenericTensorAccessorR input_accessor_cpu = @@ -78,12 +70,9 @@ TEST_SUITE(FF_TEST_SUITE) { create_zero_filled_accessor_w(output_shape, cpu_allocator); Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu, - DataType::FLOAT, - DataType::DOUBLE); + output_accessor_cpu); - CHECK(w_accessors_are_equal(output_accessor_gpu, - output_accessor_cpu)); + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 89d06dff96..60179ee75b 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Call Combine Forward and Backward Kernels") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -70,8 +70,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - CHECK(w_accessors_are_equal(output_accessor_gpu, - output_accessor_cpu)); + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } SUBCASE("backward_kernel") { @@ -94,8 +93,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, 
input_grad_accessor_cpu); - CHECK(w_accessors_are_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index b30995cf15..841d53133c 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { size_t size_per_input = 10; ff_dim_t concat_axis = ff_dim_t(1); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; TensorShape input_shape = diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 2c7e2657f7..bee00d990d 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 3a3e3b28b7..9febf4bcc4 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -7,7 +7,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; TensorShape input_shape = @@ -15,8 +15,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, DataTypeValue(2.0f))); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -30,10 +30,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 0.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, DataTypeValue(0.0f)); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + create_filled_accessor_w(input_shape, allocator, DataTypeValue(1.0f)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index fd7a8ab47a..4f9fa02a1a 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index b667716181..87fc88f081 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -17,7 +17,7 @@ 
TEST_SUITE(FF_TEST_SUITE) { TensorShape feature_shape = make_tensor_shape_from_legion_dims({feature_size}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -33,13 +33,13 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_filled_accessor_w(feature_shape, allocator, DataTypeValue(1.0f)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, DataTypeValue(0.0f)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 1dedb0c41d..ce8a808454 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -6,24 +6,24 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed FF Stream") { ManagedFFStream base_stream{}; + ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); SUBCASE("Test ManagedFFStream Move Constructor") { - ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - ManagedFFStream new_stream(std::move(base_stream)); - CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } SUBCASE("Test ManagedFFStream Assignment Operator") { - ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - ManagedFFStream new_stream{}; new_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } + + SUBCASE("Test Self-Assignment") { + base_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == base_stream_ptr); + } } } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index e85cfd61c7..d39da03ba9 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -5,7 +5,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed Per Device FF Handle") { - ManagedPerDeviceFFHandle base_handle{}; + ManagedPerDeviceFFHandle base_handle{1024 * 1024, true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); @@ -13,8 +14,6 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { - PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); CHECK(&base_handle.raw_handle() == nullptr); @@ -22,13 +21,16 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Test ManagedPerDeviceFFHandle Assignment Operator") { - PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - - ManagedPerDeviceFFHandle new_handle{}; + ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; new_handle = std::move(base_handle); CHECK(&base_handle.raw_handle() 
== nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } + + SUBCASE("Test Self-Assignment") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } } } diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 7110128885..079af64a8c 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - create_filled_accessor_r(input_shape, allocator, 1.0f); + create_filled_accessor_r(input_shape, allocator, DataTypeValue(1.0f)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -31,10 +31,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - create_filled_accessor_r(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, DataTypeValue(1.0f)); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + create_filled_accessor_w(input_shape, allocator, DataTypeValue(2.0f)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 52a177dd72..76b966ea15 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -56,8 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, DataTypeValue(1.0f)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 8706c5d877..ddbe826c70 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_tensor_shape_from_legion_dims( {10, 10, 10, 10, 10}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -35,8 +35,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { TensorShape output_shape = input_shape; - GenericTensorAccessorR output_grad_accessor = - create_filled_accessor_r(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, DataTypeValue(1.0f)); GenericTensorAccessorW 
input_grad_accessor = allocator.allocate_tensor(input_shape); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 77f4001328..1d9e0677b7 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -53,7 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); @@ -78,8 +78,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - CHECK(w_accessors_are_equal(output_accessor_gpu, - output_accessor_cpu)); + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } SUBCASE("backward_kernel") { @@ -103,8 +102,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Replicate::cpu_backward_kernel( output_grad_accessor_cpu, input_grad_accessor_cpu, num_replicas); - CHECK(w_accessors_are_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 92a61524a3..41aaac9c3e 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 4e98ea701b..436b788a99 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -14,15 +14,15 @@ TEST_SUITE(FF_TEST_SUITE) { {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, DataTypeValue(1.0f))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -57,7 +57,7 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { - std::size_t num_out_blks = 1; + std::size_t num_out_blks = 4; std::size_t reverse_dim_size = 3; std::size_t in_blk_size = 2; @@ -65,7 +65,7 @@ TEST_SUITE(FF_TEST_SUITE) { {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; 
- ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); @@ -99,8 +99,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - CHECK(w_accessors_are_equal(output_accessor_cpu, - output_accessor_cpu)); + CHECK(accessors_are_equal(output_accessor_cpu, output_accessor_cpu)); } SUBCASE("backward_kernel") { @@ -128,8 +127,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, input_grad_accessor_cpu); - CHECK(w_accessors_are_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index f723a9ca46..b293d1ce75 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -8,7 +8,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Softmax Kernel Operations") { int input_n = 1, input_c = 1, input_h = 1, input_w = 100, channels = 100; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index a3cf215dff..114077d6ec 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { } GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + create_filled_accessor_w(input_shape, allocator, DataTypeValue(0.0f)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index d5d0b00576..5c5e9b31f8 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -9,7 +9,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector perm = {ff_dim_t(0), ff_dim_t(1)}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index 103c866c10..a59747b376 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -53,7 +53,7 @@ struct CreateRandomFilledAccessorW { } GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - transfer_data_between_accessors(dst_accessor, src_accessor); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); return dst_accessor; } @@ -62,7 +62,7 @@ struct CreateRandomFilledAccessorW { GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, Allocator &allocator) { return DataTypeDispatch1{}( - shape.data_type, shape, std::ref(allocator)); + shape.data_type, shape, 
allocator); } GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, @@ -111,20 +111,14 @@ struct CPUAccessorRContainsNonZero { bool contains_non_zero(GenericTensorAccessorR const &accessor) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = - create_cpu_compatible_accessor_r(accessor, cpu_allocator); + copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); return DataTypeDispatch1{}( cpu_accessor.data_type, cpu_accessor); } -bool contains_non_zero(GenericTensorAccessorW const &accessor) { - GenericTensorAccessorR r_accessor = - read_only_accessor_from_write_accessor(accessor); - return contains_non_zero(r_accessor); -} - GenericTensorAccessorR - create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, - Allocator &cpu_allocator) { + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &cpu_allocator) { GenericTensorAccessorR cpu_accessor = accessor; if (accessor.device_type == DeviceType::GPU) { cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); @@ -133,8 +127,8 @@ GenericTensorAccessorR } GenericTensorAccessorW - create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, - Allocator &cpu_allocator) { + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &cpu_allocator) { GenericTensorAccessorW cpu_accessor = accessor; if (accessor.device_type == DeviceType::GPU) { cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); @@ -144,28 +138,102 @@ GenericTensorAccessorW template struct PrintCPUAccessorR { - void operator()(GenericTensorAccessorR const &accessor) { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { using T = real_type_t
<DT>;
     T const *data_ptr = accessor.get<DT>
(); for (size_t i = 0; i < accessor.shape.num_elements(); i++) { - std::cout << data_ptr[i] << " "; + stream << data_ptr[i] << " "; } - std::cout << "\n"; + stream << "\n"; } }; -void print_accessor(GenericTensorAccessorR const &accessor) { +void print_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = - create_cpu_compatible_accessor_r(accessor, cpu_allocator); - DataTypeDispatch1{}(accessor.data_type, accessor); + copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + DataTypeDispatch1{}(accessor.data_type, accessor, stream); } -void print_accessor(GenericTensorAccessorW const &accessor) { - GenericTensorAccessorR r_accessor = - read_only_accessor_from_write_accessor(accessor); - print_accessor(r_accessor); +template +struct AccessorsAreEqual { + bool operator()(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor_a = + copy_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator); + GenericTensorAccessorR cpu_accessor_b = + copy_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator); + + using T = real_type_t
<DT>;
+    T const *a_data_ptr = cpu_accessor_a.get<DT>
();
+    T const *b_data_ptr = cpu_accessor_b.get<DT>
(); + + for (size_t i = 0; i < accessor_a.shape.num_elements(); i++) { + if (a_data_ptr[i] != b_data_ptr[i]) { + return false; + } + } + + return true; + } +}; + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + if (accessor_a.shape != accessor_b.shape) { + throw mk_runtime_error( + fmt::format("accessors_are_equal expected accessors to have the same " + "shape, but received: {} != {}", + accessor_a.shape, + accessor_b.shape)); + } + return DataTypeDispatch1{}( + accessor_a.data_type, accessor_a, accessor_b); +} + +template +struct CreateFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + using T = real_type_t
<DT>;
+    if (!val.template has<T>()) {
+      throw mk_runtime_error("create_filled_accessor expected data type of "
+                             "shape and passed-in value to match");
+    }
+
+    auto unwrapped_value = val.get<T>();
+    GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape);
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+    GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape);
+
+    T *data_ptr = src_accessor.get<DT>
(); + for (size_t i = 0; i < dst_accessor.shape.num_elements(); i++) { + data_ptr[i] = unwrapped_value; + } + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + return dst_accessor; + } +}; + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + + return DataTypeDispatch1{}( + shape.data_type, shape, allocator, val); } +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + GenericTensorAccessorW w_accessor = + create_filled_accessor_w(shape, allocator, val); + return read_only_accessor_from_write_accessor(w_accessor); +} } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 4de114bd48..efbbc90e08 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -8,6 +8,7 @@ #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "op-attrs/datatype.h" +#include "op-attrs/datatype_value.dtg.h" namespace FlexFlow { @@ -24,103 +25,30 @@ TensorShape make_tensor_shape_from_legion_dims(LegionOrdered const &dims, DataType DT); -bool contains_non_zero(GenericTensorAccessorW const &accessor); - bool contains_non_zero(GenericTensorAccessorR const &accessor); void fill_with_zeros(GenericTensorAccessorW const &accessor); GenericTensorAccessorW - create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, - Allocator &allocator); + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &allocator); GenericTensorAccessorR - create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, - Allocator &allocator); - -void print_accessor(GenericTensorAccessorR const &accessor); - -void print_accessor(GenericTensorAccessorW const &accessor); - -template -struct CreateFilledAccessorW { - GenericTensorAccessorW operator()(TensorShape const &shape, - Allocator &allocator, - real_type_t
<DT> val) {
-    using T = real_type_t<DT>
; + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &allocator); - GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); +void print_tensor_accessor_contents(GenericTensorAccessorR const &accessor); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); - T *data_ptr = src_accessor.get
(); - for (size_t i = 0; i < dst_accessor.shape.num_elements(); i++) { - data_ptr[i] = val; - } - - transfer_data_between_accessors(dst_accessor, src_accessor); - return dst_accessor; - } -}; - -template GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, Allocator &allocator, - T val) { - return DataTypeDispatch1{}( - shape.data_type, shape, std::ref(allocator), val); -} + DataTypeValue val); -template GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, Allocator &allocator, - T val) { - GenericTensorAccessorW w_accessor = - create_filled_accessor_w(shape, allocator, val); - return read_only_accessor_from_write_accessor(w_accessor); -} - -template -bool w_accessors_are_equal(GenericTensorAccessorW const &accessor_a, - GenericTensorAccessorW const &accessor_b) { - if (accessor_a.shape.num_dims() != accessor_b.shape.num_dims()) { - throw mk_runtime_error( - "Comparing equivalence for two accessors of differing dimensions"); - } - for (size_t i = 0; i < accessor_a.shape.num_dims(); i++) { - if (accessor_a.shape[legion_dim_t(i)] != - accessor_b.shape[legion_dim_t(i)]) { - throw mk_runtime_error( - "Comparing equivalence for two accessors of differing shape"); - } - } - - if (accessor_a.data_type != accessor_b.data_type) { - return false; - } - - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW cpu_accessor_a = - create_cpu_compatible_accessor_w(accessor_a, cpu_allocator); - GenericTensorAccessorW cpu_accessor_b = - create_cpu_compatible_accessor_w(accessor_b, cpu_allocator); - - using T = real_type_t
<DT>;
-  T *a_data_ptr = cpu_accessor_a.get<DT>
();
-  T *b_data_ptr = cpu_accessor_b.get<DT>
(); - - for (size_t i = 0; i < accessor_a.shape.num_elements(); i++) { - if (a_data_ptr[i] != b_data_ptr[i]) { - print_accessor(cpu_accessor_a); - print_accessor(cpu_accessor_b); - return false; - } - } - - return true; -} - + DataTypeValue val); } // namespace FlexFlow #endif diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 860eedaa1c..1c5d5136cd 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -125,17 +125,17 @@ static std::optional auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto bias = acc.get_tensor(BIAS); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_ptr = NULL; if (attrs.use_bias) { bias_ptr = bias.get_float_ptr(); } @@ -149,12 +149,12 @@ static std::optional "[Linear] backward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), - (float *)input_grad.get_float_ptr(), + input_grad.get_float_ptr(), output.get_float_ptr(), - (float *)output_grad.get_float_ptr(), + output_grad.get_float_ptr(), weight.get_float_ptr(), - (float *)weight_grad.get_float_ptr(), - (float *)bias_ptr, + weight_grad.get_float_ptr(), + bias_ptr, in_dim, out_dim, batch_size); diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..788ab52a7a 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,7 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); // RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ // DeviceSpecific::create(managed_handle.raw_handle()),