diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 1ef121fb2a..14aa7bb010 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -29,15 +29,20 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; + GenericTensorAccessorW(DataType dt, + ArrayShape sh, + req p, + bool on_dev = true) + : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + public: DataType data_type; ArrayShape shape; req ptr; + bool on_device; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, - data_type, - shape, - ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( + GenericTensorAccessorW, data_type, shape, ptr, on_device); class GenericTensorAccessorR { public: @@ -57,15 +62,20 @@ class GenericTensorAccessorR { double const *get_double_ptr() const; half const *get_half_ptr() const; + GenericTensorAccessorR(DataType dt, + ArrayShape sh, + req p, + bool on_dev = true) + : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + public: DataType data_type; ArrayShape shape; req ptr; + bool on_device; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, - data_type, - shape, - ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( + GenericTensorAccessorR, data_type, shape, ptr, on_device); int32_t *get_int32_ptr(GenericTensorAccessorW const &); int64_t *get_int64_ptr(GenericTensorAccessorW const &); diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..452ccc47b0 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -5,10 +5,13 @@ #include #include +enum class AllocLocation { HOST, DEVICE }; + namespace FlexFlow { struct IAllocator { virtual void *allocate(size_t) = 0; + virtual void *allocate_and_zero(size_t) = 0; virtual void deallocate(void *) = 0; virtual ~IAllocator() = default; @@ -18,7 +21,11 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + GenericTensorAccessorW + allocate_tensor_and_zero(TensorShape const &tensor_shape); + void *allocate(size_t mem_size); + void *allocate_and_zero(size_t mem_size); void deallocate(void *ptr); template @@ -30,6 +37,8 @@ struct Allocator { Allocator(std::shared_ptr ptr) : i_allocator(ptr){}; + AllocLocation alloc_location; + private: std::shared_ptr i_allocator; }; diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index df4ef22b93..cae0c9da8d 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -7,19 +7,17 @@ namespace FlexFlow { namespace Kernels { namespace Cast { -namespace CPU { -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); -void backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); +void cpu_backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); -} // namespace CPU } // namespace Cast } // namespace Kernels } // namespace FlexFlow diff --git 
a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h index 1d30297af1..66c22ddbf8 100644 --- a/lib/kernels/include/kernels/combine_kernels_cpu.h +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -7,15 +7,13 @@ namespace FlexFlow { namespace Kernels { namespace Combine { -namespace CPU { -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); -void backward_kernel(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad); +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // namespace CPU } // namespace Combine } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h index 27dcc9d854..121ed184e9 100644 --- a/lib/kernels/include/kernels/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -10,6 +10,7 @@ struct LocalCPUAllocator : public IAllocator { ~LocalCPUAllocator() override; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; private: diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..16f60daead 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -10,6 +10,7 @@ struct LocalCudaAllocator : public IAllocator { ~LocalCudaAllocator() override; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; private: diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h index 4bc97f00ef..11d2f1bf5c 100644 --- a/lib/kernels/include/kernels/replicate_kernels_cpu.h +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -7,16 +7,14 @@ namespace FlexFlow { namespace Kernels { namespace Replicate { -namespace CPU { -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); -void backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, - size_t num_replicas); +void cpu_backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas); -} // namespace CPU } // namespace Replicate } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h index 89ed6ffdb4..bb17aa9400 100644 --- a/lib/kernels/include/kernels/reverse_kernels_cpu.h +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -6,22 +6,20 @@ namespace FlexFlow { namespace Kernels { namespace Reverse { -namespace CPU { -void forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size); +void cpu_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size); -void backward_kernel(float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t 
in_blk_size, - coord_t input_size); -} // namespace CPU +void cpu_backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size); } // namespace Reverse } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 56002718b1..01514ab679 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -134,8 +134,10 @@ std::vector GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; + return GenericTensorAccessorR{writable.data_type, + writable.shape, + req(writable.ptr), + writable.on_device}; } } // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index a892e14a54..c1c272fbbe 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -6,6 +6,10 @@ void *Allocator::allocate(size_t mem_size) { return this->i_allocator->allocate(mem_size); } +void *Allocator::allocate_and_zero(size_t mem_size) { + return this->i_allocator->allocate_and_zero(mem_size); +} + void Allocator::deallocate(void *ptr) { this->i_allocator->deallocate(ptr); } @@ -13,7 +17,15 @@ void Allocator::deallocate(void *ptr) { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - return {tensor_shape.data_type, tensor_shape, ptr}; + bool on_device = this->alloc_location == AllocLocation::DEVICE; + return {tensor_shape.data_type, tensor_shape, ptr, on_device}; +} + +GenericTensorAccessorW + Allocator::allocate_tensor_and_zero(TensorShape const &tensor_shape) { + void *ptr = this->allocate_and_zero(get_size_in_bytes(tensor_shape)); + bool on_device = this->alloc_location == AllocLocation::DEVICE; + return {tensor_shape.data_type, tensor_shape, ptr, on_device}; } } // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 5410726e0a..0aae2a8ddd 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -60,4 +60,10 @@ size_t get_volume(ArrayShape const &shape) { return shape.get_volume(); } +TensorShape get_tensor_shape(ArrayShape const &shape, DataType DT) { + FFOrdered ff_dims(shape.dims.begin(), shape.dims.end()); + TensorDims tensor_shape_dims(ff_dims); + return TensorShape(tensor_shape_dims, DT); +} + } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index cf73a84b93..5888d9a96a 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -4,56 +4,55 @@ namespace FlexFlow { namespace Kernels { namespace Cast { -namespace CPU { template -void cast_forward(IDT const *input, ODT *output, size_t volume) { +void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { for (size_t i = 0; i < volume; ++i) { output[i] = static_cast(input[i]); } } template -void cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { for (size_t i = 0; i < volume; i++) { output[i] = static_cast(input[i]) + beta * output[i]; } } template -struct ForwardKernel { +struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { size_t volume = 
input.shape.get_volume(); - cast_forward(input.get(), output.get(), volume); + cpu_cast_forward(input.get(), output.get(), volume); } }; template -struct BackwardKernel { +struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { size_t volume = input.shape.get_volume(); - cast_backward( + cpu_cast_backward( input.get(), output.get(), volume, cast_to(1.0f)); } }; -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { - DataTypeDispatch2{}(input_type, output_type, input, output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}(input_type, output_type, input, output); } -void backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { - DataTypeDispatch2{}(input_type, output_type, input, output); +void cpu_backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}( + input_type, output_type, input, output); } -} // namespace CPU } // namespace Cast } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc index f1950a56d2..e48f4c3e01 100644 --- a/lib/kernels/src/cpu/combine_kernels.cc +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -4,10 +4,9 @@ namespace FlexFlow { namespace Kernels { namespace Combine { -namespace CPU { template -struct ForwardKernel { +struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { memcpy(output.get
(), @@ -17,7 +16,7 @@ struct ForwardKernel { }; template -struct BackwardKernel { +struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { size_t num_elements = output_grad.shape.get_volume(); @@ -27,18 +26,17 @@ struct BackwardKernel { } }; -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, input, output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); } -void backward_kernel(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - DataTypeDispatch1{}( +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}( input_grad.data_type, output_grad, input_grad); } -} // namespace CPU } // namespace Combine } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index a26d2054d1..239baf4041 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -4,13 +4,12 @@ namespace FlexFlow { namespace Kernels { namespace Replicate { -namespace CPU { template -void replicate_backward_kernel(T *input, - T const *output, - size_t num_elements, - size_t num_replicas) { +void cpu_replicate_backward_kernel(T *input, + T const *output, + size_t num_elements, + size_t num_replicas) { for (size_t i = 0; i < num_elements; ++i) { T sum = 0; for (size_t j = 0; j < num_replicas; ++j) { @@ -23,7 +22,7 @@ void replicate_backward_kernel(T *input, // Why does replicate forward seem to only transfer memory? Shouldn't it also // handle the replication? 
template -struct ForwardKernel { +struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { memcpy(output.get(), @@ -33,29 +32,28 @@ struct ForwardKernel { }; template -struct BackwardKernel { +struct CPUBackwardKernel { void operator()(GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, size_t num_replicas) { size_t total_elements = input.shape.num_elements() * num_replicas; - replicate_backward_kernel( + cpu_replicate_backward_kernel( input.get(), output.get(), total_elements, num_replicas); } }; -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, input, output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); } -void backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, - size_t num_replicas) { - DataTypeDispatch1{}( +void cpu_backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas) { + DataTypeDispatch1{}( input.data_type, input, output, num_replicas); } -} // namespace CPU } // namespace Replicate } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index b035f03721..350dad03e9 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,48 +1,78 @@ #include "kernels/reverse_kernels_cpu.h" +#include +#include namespace FlexFlow { namespace Kernels { namespace Reverse { -namespace CPU { -void reverse_forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { +void cpu_reverse_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size) { coord_t total_elements = num_out_blks * reverse_dim_size * in_blk_size; - for (coord_t i = 0; i < total_elements; ++i) { - coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - coord_t offset = i - blk_idx * (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = offset / in_blk_size; - coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + - (offset % in_blk_size); - out_ptr[i] = in_ptr[in_idx]; + + std::vector> in_blocks(num_out_blks * reverse_dim_size, + std::vector(in_blk_size)); + + // For each output block, copy the input block into in_blocks + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { + // Each output block has reverse_dim_size input blocks + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { + coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size; + + // Copy elements from in_ptr to the current block in in_blocks + std::vector ¤t_block = + in_blocks[blk_idx * reverse_dim_size + rev_idx]; + for (coord_t i = 0; i < in_blk_size; ++i) { + current_block[i] = in_ptr[start_idx + i]; + } + } + } + + // Reverse the in_blocks within each output block + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { + auto block_start = in_blocks.begin() + blk_idx * reverse_dim_size; + auto block_end = block_start + reverse_dim_size; + std::reverse(block_start, block_end); + } + + // Copy the reversed blocks to the output array + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) 
{ + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { + coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size; + + // Copy elements from the current block in in_blocks to out_ptr + std::vector const ¤t_block = + in_blocks[blk_idx * reverse_dim_size + rev_idx]; + for (coord_t i = 0; i < in_blk_size; ++i) { + out_ptr[start_idx + i] = current_block[i]; + } + } } } -void forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size) { - reverse_forward_kernel( +void cpu_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { + cpu_reverse_forward_kernel( in_ptr, out_ptr, num_out_blks, reverse_dim_size, in_blk_size); } -void backward_kernel(float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size) { - reverse_forward_kernel( +void cpu_backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size) { + cpu_reverse_forward_kernel( out_grad_ptr, in_grad_ptr, num_out_blks, reverse_dim_size, in_blk_size); } -} // namespace CPU } // namespace Reverse } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc index 9cc86c44ca..ced707edcc 100644 --- a/lib/kernels/src/local_cpu_allocator.cc +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -3,6 +3,18 @@ namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { + void *ptr = malloc(requested_memory_size); + + if (ptr != nullptr) { + this->ptrs.insert(ptr); + } else { + throw std::bad_alloc(); + } + + return ptr; +} + +void *LocalCPUAllocator::allocate_and_zero(size_t requested_memory_size) { void *ptr = calloc(1, requested_memory_size); if (ptr != nullptr) { @@ -25,13 +37,15 @@ void LocalCPUAllocator::deallocate(void *ptr) { } LocalCPUAllocator::~LocalCPUAllocator() { - for (auto ptr : ptrs) { + for (void *ptr : this->ptrs) { free(ptr); } } Allocator create_local_cpu_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + allocator.alloc_location = AllocLocation::HOST; + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index c82abc765d..c93e32734c 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -3,6 +3,13 @@ namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { + void *ptr; + checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + this->ptrs.insert(ptr); + return ptr; +} + +void *LocalCudaAllocator::allocate_and_zero(size_t requested_memory_size) { void *ptr; checkCUDA(cudaMalloc(&ptr, requested_memory_size)); checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); @@ -27,7 +34,9 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + allocator.alloc_location = AllocLocation::DEVICE; + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index c37b83fa24..bbb3c62a85 100644 --- 
a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -33,28 +33,28 @@ TEST_SUITE(FF_TEST_SUITE) { kvSeqLength, false); - TensorShape query_shape = - make_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = - make_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = - make_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); + TensorShape query_shape = make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, qSize}, DataType::FLOAT); + TensorShape key_shape = make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, kSize}, DataType::FLOAT); + TensorShape value_shape = make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, vSize}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, oProjSize}, DataType::FLOAT); TensorShape weight_shape = - make_tensor_shape_from_legion_dims({state.weightSize}); + make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, + allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, + allocator); GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -69,22 +69,27 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output)); } SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, + allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, + allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, + allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, + allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 51a50e6cf2..e64941b574 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -20,18 +20,21 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = 
create_local_cuda_memory_allocator(); TensorShape input_shape_a = - make_tensor_shape_from_legion_dims({m, k, batch}); + make_tensor_shape_from_legion_dims({m, k, batch}, DataType::FLOAT); TensorShape input_shape_b = - make_tensor_shape_from_legion_dims({k, n, batch}); + make_tensor_shape_from_legion_dims({k, n, batch}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({m, n, batch}); + make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, allocator); + create_random_filled_accessor_w(input_shape_a, + allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, allocator); + create_random_filled_accessor_w(input_shape_b, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -50,7 +53,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 0d4682996a..5135d703fd 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -23,23 +23,21 @@ TEST_SUITE(FF_TEST_SUITE) { output_w, true); - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = - make_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = - make_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape bias_shape = - make_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape scale_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape bias_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW scale_accessor = create_filled_accessor_w(scale_shape, allocator, 1.0f); @@ -54,20 +52,24 @@ TEST_SUITE(FF_TEST_SUITE) { scale_accessor.get_float_ptr(), bias_accessor.get_float_ptr()); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - 
create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, allocator); + create_random_filled_accessor_w(scale_shape, + allocator); GenericTensorAccessorW bias_grad_accessor = - create_random_filled_accessor_w(bias_shape, allocator); + create_random_filled_accessor_w(bias_shape, + allocator); Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, @@ -81,14 +83,11 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.shape.num_elements()); std::vector host_input_grad_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); std::vector host_scale_grad_data = - load_accessor_data( - read_only_accessor_from_write_accessor(scale_grad_accessor)); + load_accessor_data(scale_grad_accessor); std::vector host_bias_grad_data = - load_accessor_data( - read_only_accessor_from_write_accessor(bias_grad_accessor)); + load_accessor_data(bias_grad_accessor); CHECK(contains_non_zero(host_input_grad_data)); CHECK(contains_non_zero(host_scale_grad_data)); diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index e7da356564..4e54aa2e1c 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -12,17 +12,16 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}); - - GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + make_tensor_shape_from_legion_dims({100, 100}, DataType::DOUBLE); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); Kernels::Cast::forward_kernel(managed_stream.raw_stream(), input_accessor, @@ -31,26 +30,26 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::DOUBLE); std::vector host_double_data = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_double_data)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, + allocator); GenericTensorAccessorW grad_input_accessor = allocator.allocate_tensor(input_shape); - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor, + DataType::DOUBLE, + DataType::FLOAT); std::vector host_grad_float_data = - load_accessor_data( - read_only_accessor_from_write_accessor(grad_input_accessor)); + load_accessor_data(grad_input_accessor); CHECK(contains_non_zero(host_grad_float_data)); } } @@ -62,9 +61,9 @@ 
TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::INT32); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); @@ -81,31 +80,34 @@ TEST_SUITE(FF_TEST_SUITE) { // Run GPU Forward Kernel GenericTensorAccessorW input_accessor_gpu = create_transformed_accessor_w( - input_shape, gpu_allocator, transform, false); + input_shape, gpu_allocator, transform); Kernels::Cast::forward_kernel( managed_stream.raw_stream(), read_only_accessor_from_write_accessor(input_accessor_gpu), output_accessor_gpu, DataType::FLOAT, DataType::INT32); + std::cout << "Before GPU load" << std::endl; std::vector result_data_gpu = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), - false); + load_accessor_data(output_accessor_gpu); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = create_transformed_accessor_w( - input_shape, cpu_allocator, transform, true); - Kernels::Cast::CPU::forward_kernel( + input_shape, cpu_allocator, transform); + Kernels::Cast::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu, DataType::FLOAT, DataType::INT32); + std::cout << "Before CPU load" << std::endl; + if (output_accessor_cpu.on_device) { + std::cout << "CPU data is on device" << std::endl; + } else { + std::cout << "CPU data is on host" << std::endl; + } std::vector result_data_cpu = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), - true); + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 60c55ca062..aeceb1ef4d 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -12,28 +12,28 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,8 +41,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = load_accessor_data( - 
read_only_accessor_from_write_accessor(input_grad_accessor)); + std::vector host_input_grad = + load_accessor_data(input_grad_accessor); CHECK(contains_non_zero(host_input_grad)); } } @@ -54,36 +54,36 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({5, 5}); + make_tensor_shape_from_legion_dims({5, 5}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { // Run GPU Combine Forward Kernel GenericTensorAccessorR input_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, gpu_allocator)); + create_random_filled_accessor_r(input_shape, + gpu_allocator); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Combine Forward Kernel GenericTensorAccessorW input_accessor_cpu = - copy_tensor_between_memories( - input_accessor_gpu, input_shape, cpu_allocator); + copy_tensor_between_memories(input_accessor_gpu, + cpu_allocator); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(output_shape); - Kernels::Combine::CPU::forward_kernel( + Kernels::Combine::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -91,33 +91,31 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Combine Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, gpu_allocator)); + create_random_filled_accessor_r(output_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); + gpu_allocator.allocate_tensor_and_zero(input_shape); Kernels::Combine::backward_kernel(managed_stream.raw_stream(), output_grad_accessor_gpu, input_grad_accessor_gpu); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Combine Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( - output_grad_accessor_gpu, output_shape, cpu_allocator); + output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); + cpu_allocator.allocate_tensor_and_zero(input_shape); - Kernels::Combine::CPU::backward_kernel( + Kernels::Combine::cpu_backward_kernel( read_only_accessor_from_write_accessor(output_grad_accessor_cpu), input_grad_accessor_cpu); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_concat_kernel.cc 
b/lib/kernels/test/src/test_concat_kernel.cc index 04bd4b5929..8754381850 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -13,18 +13,17 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedFFStream managed_stream{}; TensorShape input_shape = - make_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_tensor_shape_from_legion_dims( - {size_per_input, num_inputs}); + make_tensor_shape_from_legion_dims({size_per_input}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {size_per_input, num_inputs}, DataType::FLOAT); Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { std::vector input_accessors = repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + return create_random_filled_accessor_r(input_shape, + allocator); }); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -34,16 +33,16 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessors, concat_axis); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, + allocator); std::vector input_grad_accessors = repeat( num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index c944a80b02..8237e61729 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; @@ -31,8 +31,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -42,17 +42,18 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.get_float_ptr()); std::vector host_output_accessor = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 3f8ef38f0b..5c88110fde 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedFFStream managed_stream{}; 
TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = @@ -27,8 +27,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.get_float_ptr()); std::vector check_output_data = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + load_accessor_data(output_accessor); std::vector expected_output_data( input_accessor.shape.num_elements(), 2.0f); @@ -47,8 +46,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor.get_float_ptr()); std::vector backward_output_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); std::vector expected_output_data( input_accessor.shape.num_elements(), 1.0f); diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index cfabef7ab2..b8c4da0df2 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -13,18 +13,18 @@ TEST_SUITE(FF_TEST_SUITE) { GatherPerDeviceState state = {managed_handle.raw_handle(), legion_dim_t(2)}; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({50}); + make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -34,17 +34,18 @@ TEST_SUITE(FF_TEST_SUITE) { index_accessor, output_accessor); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, @@ -53,8 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor); std::vector host_input_grad_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); CHECK(contains_non_zero(host_input_grad_data)); } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5bb589607b..651959d171 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -11,12 +11,11 @@ TEST_SUITE(FF_TEST_SUITE) { float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {batch_size, feature_size}); + TensorShape input_shape = 
make_tensor_shape_from_legion_dims( + {batch_size, feature_size}, DataType::FLOAT); TensorShape output_shape = input_shape; TensorShape feature_shape = - make_tensor_shape_from_legion_dims({feature_size}); + make_tensor_shape_from_legion_dims({feature_size}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -32,8 +31,8 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); GenericTensorAccessorW gamma_accessor = create_filled_accessor_w(feature_shape, allocator, 1.0f); @@ -53,10 +52,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 1e009b205a..d34101d349 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { @@ -29,8 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), state, input_accessor, output_accessor); std::vector check_output_data = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + load_accessor_data(output_accessor); std::vector expected_output_data( input_accessor.shape.num_elements(), 1.0f); @@ -50,8 +49,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor); std::vector host_grad_input_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); std::vector expected_grad_input_data( input_grad_accessor.shape.num_elements(), 3.0f); diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index d6df1daa4a..e014accfd3 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -36,17 +36,17 @@ TEST_SUITE(FF_TEST_SUITE) { stride_w, pool_type); - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = - make_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {input_w, input_h, input_c, input_n}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + 
create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), @@ -54,8 +54,8 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.ptr, output_accessor.ptr); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } @@ -72,8 +72,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.ptr, output_grad_accessor.ptr); - std::vector host_input_grad = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + std::vector host_input_grad = + load_accessor_data(input_grad_accessor); CHECK(contains_non_zero(host_input_grad)); } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 5dcf85e39d..989ffde163 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -7,9 +7,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {10, 10, 10, 10, 10}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {10, 10, 10, 10, 10}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -18,11 +17,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { TensorShape output_shape = - make_tensor_shape_from_legion_dims({10}); + make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -31,8 +30,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor, num_replicas); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } @@ -51,8 +50,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector expected_grad_input_data( input_grad_accessor.shape.num_elements(), 1.0f); - std::vector host_grad_data = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + std::vector host_grad_data = + load_accessor_data(input_grad_accessor); CHECK(host_grad_data == expected_grad_input_data); } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 49807355e1..315a1c3489 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -9,9 +9,9 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t num_replicas = 10; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -29,8 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), input_accessor, output_accessor); std::vector check_output_data = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + 
load_accessor_data(output_accessor); std::vector expected_output_data( input_accessor.shape.num_elements(), 1.0f); @@ -50,8 +49,7 @@ TEST_SUITE(FF_TEST_SUITE) { num_replicas); std::vector check_aggregated_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); CHECK(contains_non_zero(check_aggregated_data)); } } @@ -63,11 +61,11 @@ TEST_SUITE(FF_TEST_SUITE) { // reduced shape, but things are weird cause doesn't seem to be replicating // anything (ie. input shape should be same as reduced shape) TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}); + make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); TensorShape replicated_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}); + make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); TensorShape reduced_shape = - make_tensor_shape_from_legion_dims({10}); + make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -78,30 +76,30 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { // Run GPU Replicate Forward Kernel GenericTensorAccessorR input_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, gpu_allocator)); + create_random_filled_accessor_r(input_shape, + gpu_allocator); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(replicated_shape); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Replicate Forward Kernel GenericTensorAccessorW input_accessor_cpu = - copy_tensor_between_memories( - input_accessor_gpu, input_shape, cpu_allocator); + copy_tensor_between_memories(input_accessor_gpu, + cpu_allocator); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(replicated_shape); - Kernels::Replicate::CPU::forward_kernel( + Kernels::Replicate::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -109,35 +107,33 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(replicated_shape, gpu_allocator)); + create_random_filled_accessor_r(replicated_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(reduced_shape); + gpu_allocator.allocate_tensor_and_zero(reduced_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), input_grad_accessor_gpu, output_grad_accessor_gpu, num_replicas); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Replicate Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( - 
output_grad_accessor_gpu, replicated_shape, cpu_allocator); + output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(reduced_shape); + cpu_allocator.allocate_tensor_and_zero(reduced_shape); - Kernels::Replicate::CPU::backward_kernel( + Kernels::Replicate::cpu_backward_kernel( input_grad_accessor_cpu, read_only_accessor_from_write_accessor(output_grad_accessor_cpu), num_replicas); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index e1a8ccc4b7..e8b3d9d2f5 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -28,8 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), state, input_accessor, output_accessor); std::vector check_output_data = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + load_accessor_data(output_accessor); std::vector expected_output_data( input_accessor.shape.num_elements(), 1.0f); @@ -49,8 +48,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor); std::vector host_grad_input_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); std::vector expected_grad_input_data( input_grad_accessor.shape.num_elements(), 3.0f); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index fc7acc99cd..be1d946902 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -10,9 +10,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t in_blk_size = 10; std::size_t num_out_blks = 1; - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {num_out_blks, reverse_dim_size, in_blk_size}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle{}; @@ -36,15 +35,15 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.shape.num_elements()); std::vector check_output_data = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + load_accessor_data(output_accessor); CHECK(contains_non_zero(check_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -58,8 +57,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor.shape.num_elements()); std::vector host_grad_input_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); CHECK(contains_non_zero(host_grad_input_data)); } @@ -70,9 +68,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t reverse_dim_size = 3; 
    std::size_t in_blk_size = 5;
 
-    TensorShape input_shape =
-        make_tensor_shape_from_legion_dims<DataType::FLOAT>(
-            {num_out_blks, reverse_dim_size, in_blk_size});
+    TensorShape input_shape = make_tensor_shape_from_legion_dims(
+        {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT);
    TensorShape output_shape = input_shape;
 
    ManagedPerDeviceFFHandle managed_handle{};
@@ -89,7 +86,7 @@ TEST_SUITE(FF_TEST_SUITE) {
      // Run GPU Cast Forward Kernel
      GenericTensorAccessorW input_accessor_gpu =
          create_transformed_accessor_w<float>(
-              input_shape, gpu_allocator, transform, false);
+              input_shape, gpu_allocator, transform);
      GenericTensorAccessorW output_accessor_gpu =
          gpu_allocator.allocate_tensor(output_shape);
 
@@ -101,17 +98,17 @@
          in_blk_size,
          input_accessor_gpu.shape.num_elements());
 
-      std::vector<float> result_data_gpu = load_accessor_data<DataType::FLOAT>(
-          read_only_accessor_from_write_accessor(output_accessor_gpu), false);
+      std::vector<float> result_data_gpu =
+          load_accessor_data<DataType::FLOAT>(output_accessor_gpu);
 
      // Run CPU Cast Forward Kernel
      GenericTensorAccessorW input_accessor_cpu =
          create_transformed_accessor_w<float>(
-              input_shape, cpu_allocator, transform, true);
+              input_shape, cpu_allocator, transform);
      GenericTensorAccessorW output_accessor_cpu =
          cpu_allocator.allocate_tensor(output_shape);
 
-      Kernels::Reverse::CPU::forward_kernel(
+      Kernels::Reverse::cpu_forward_kernel(
          input_accessor_cpu.get_float_ptr(),
          output_accessor_cpu.get_float_ptr(),
          num_out_blks,
@@ -119,8 +116,8 @@ TEST_SUITE(FF_TEST_SUITE) {
          in_blk_size,
          input_accessor_cpu.shape.num_elements());
 
-      std::vector<float> result_data_cpu = load_accessor_data<DataType::FLOAT>(
-          read_only_accessor_from_write_accessor(output_accessor_cpu), true);
+      std::vector<float> result_data_cpu =
+          load_accessor_data<DataType::FLOAT>(output_accessor_cpu);
 
      CHECK(result_data_gpu == result_data_cpu);
    }
@@ -128,7 +125,8 @@ TEST_SUITE(FF_TEST_SUITE) {
    SUBCASE("backward_kernel") {
      // Run GPU Cast Backward Kernel
      GenericTensorAccessorW output_grad_accessor_gpu =
-          create_random_filled_accessor_w(output_shape, gpu_allocator);
+          create_random_filled_accessor_w<DataType::FLOAT>(output_shape,
+                                                           gpu_allocator);
      GenericTensorAccessorW input_grad_accessor_gpu =
          gpu_allocator.allocate_tensor(input_shape);
 
@@ -141,20 +139,18 @@ TEST_SUITE(FF_TEST_SUITE) {
          in_blk_size,
          input_grad_accessor_gpu.shape.num_elements());
 
-      std::vector<float> result_data_gpu = load_accessor_data<DataType::FLOAT>(
-          read_only_accessor_from_write_accessor(input_grad_accessor_gpu),
-          false);
+      std::vector<float> result_data_gpu =
+          load_accessor_data<DataType::FLOAT>(input_grad_accessor_gpu);
 
      // Run CPU Cast Backward Kernel
      GenericTensorAccessorW output_grad_accessor_cpu =
          copy_tensor_between_memories<DataType::FLOAT>(
              read_only_accessor_from_write_accessor(output_grad_accessor_gpu),
-              output_shape,
              cpu_allocator);
      GenericTensorAccessorW input_grad_accessor_cpu =
          cpu_allocator.allocate_tensor(input_shape);
 
-      Kernels::Reverse::CPU::backward_kernel(
+      Kernels::Reverse::cpu_backward_kernel(
          output_grad_accessor_cpu.get_float_ptr(),
          input_grad_accessor_cpu.get_float_ptr(),
          num_out_blks,
@@ -162,9 +158,8 @@ TEST_SUITE(FF_TEST_SUITE) {
          in_blk_size,
          input_grad_accessor_cpu.shape.num_elements());
 
-      std::vector<float> result_data_cpu = load_accessor_data<DataType::FLOAT>(
-          read_only_accessor_from_write_accessor(input_grad_accessor_cpu),
-          true);
+      std::vector<float> result_data_cpu =
+          load_accessor_data<DataType::FLOAT>(input_grad_accessor_cpu);
 
      CHECK(result_data_gpu == result_data_cpu);
    }
diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc
index a9f7fa8bc0..c25c2f91d3 100644
--- a/lib/kernels/test/src/test_softmax_kernel.cc
+++ b/lib/kernels/test/src/test_softmax_kernel.cc
@@ -14,26 +14,28 @@ TEST_SUITE(FF_TEST_SUITE) {
    Allocator allocator = create_local_cuda_memory_allocator();
 
    TensorShape input_shape =
-        make_tensor_shape_from_legion_dims<DataType::FLOAT>({100});
+        make_tensor_shape_from_legion_dims({100}, DataType::FLOAT);
    TensorShape output_shape = input_shape;
 
    SoftmaxPerDeviceState state = Kernels::Softmax::init_kernel(
        managed_handle.raw_handle(), 0, input_n, channels, input_h, input_w);
 
    GenericTensorAccessorW output_accessor =
-        create_random_filled_accessor_w(output_shape, allocator);
+        create_random_filled_accessor_w<DataType::FLOAT>(output_shape,
+                                                         allocator);
 
    SUBCASE("forward_kernel") {
      GenericTensorAccessorW input_accessor =
-          create_random_filled_accessor_w(input_shape, allocator);
+          create_random_filled_accessor_w<DataType::FLOAT>(input_shape,
+                                                           allocator);
 
      Kernels::Softmax::forward_kernel(managed_stream.raw_stream(),
                                       state,
                                       input_accessor.get_float_ptr(),
                                       output_accessor.get_float_ptr());
 
-      std::vector<float> host_output_data = load_accessor_data<DataType::FLOAT>(
-          read_only_accessor_from_write_accessor(output_accessor));
+      std::vector<float> host_output_data =
+          load_accessor_data<DataType::FLOAT>(output_accessor);
 
      CHECK(contains_non_zero(host_output_data));
    }
@@ -52,8 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) {
      std::vector<float> expected_input_grad_data =
          std::vector<float>(input_grad_accessor.shape.num_elements(), 1.0f);
      std::vector<float> host_input_grad_data =
-          load_accessor_data<DataType::FLOAT>(
-              read_only_accessor_from_write_accessor(input_grad_accessor));
+          load_accessor_data<DataType::FLOAT>(input_grad_accessor);
      CHECK(host_input_grad_data == expected_input_grad_data);
    }
  }
diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc
index 304a7ba121..26acbee33c 100644
--- a/lib/kernels/test/src/test_split_kernel.cc
+++ b/lib/kernels/test/src/test_split_kernel.cc
@@ -17,13 +17,14 @@ TEST_SUITE(FF_TEST_SUITE) {
    Allocator allocator = create_local_cuda_memory_allocator();
 
    TensorShape input_shape =
-        make_tensor_shape_from_legion_dims<DataType::FLOAT>({100});
+        make_tensor_shape_from_legion_dims({100}, DataType::FLOAT);
    TensorShape output_shape =
-        make_tensor_shape_from_legion_dims<DataType::FLOAT>({50});
+        make_tensor_shape_from_legion_dims({50}, DataType::FLOAT);
 
    SUBCASE("forward_kernel") {
      GenericTensorAccessorW input_accessor =
-          create_random_filled_accessor_w(input_shape, allocator);
+          create_random_filled_accessor_w<DataType::FLOAT>(input_shape,
+                                                           allocator);
 
      std::vector<float *> output_ptrs = repeat(num_outputs, [&]() {
        GenericTensorAccessorW output_accessor =
@@ -44,7 +45,8 @@ TEST_SUITE(FF_TEST_SUITE) {
      std::vector<float *> output_grad_ptrs(num_outputs);
      for (int i = 0; i < num_outputs; i++) {
        GenericTensorAccessorW output_grad_accessor =
-            create_random_filled_accessor_w(output_shape, allocator);
+            create_random_filled_accessor_w<DataType::FLOAT>(output_shape,
+                                                             allocator);
        output_grad_ptrs[i] = output_grad_accessor.get_float_ptr();
      }
 
diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc
index a4cbf37c4b..2abbd66c8f 100644
--- a/lib/kernels/test/src/test_transpose_kernel.cc
+++ b/lib/kernels/test/src/test_transpose_kernel.cc
@@ -18,30 +18,31 @@ TEST_SUITE(FF_TEST_SUITE) {
        Kernels::Transpose::init_kernel(num_dims, perm);
 
    TensorShape input_shape =
-        make_tensor_shape_from_legion_dims<DataType::FLOAT>({10, 10});
+        make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT);
    TensorShape output_shape = input_shape;
 
    SUBCASE("forward_kernel") {
      GenericTensorAccessorR input_accessor =
-          read_only_accessor_from_write_accessor(
-              create_random_filled_accessor_w(input_shape, allocator));
+          create_random_filled_accessor_r<DataType::FLOAT>(input_shape,
+                                                           allocator);
      GenericTensorAccessorW output_accessor =
          allocator.allocate_tensor(output_shape);
 
      Kernels::Transpose::forward_kernel(
          managed_stream.raw_stream(), state, input_accessor, output_accessor);
 
-      std::vector<float> host_output_data = load_accessor_data<DataType::FLOAT>(
-          read_only_accessor_from_write_accessor(output_accessor));
+      std::vector<float> host_output_data =
+          load_accessor_data<DataType::FLOAT>(output_accessor);
 
      CHECK(contains_non_zero(host_output_data));
    }
 
    SUBCASE("backward_kernel") {
      GenericTensorAccessorR output_grad_accessor =
-          read_only_accessor_from_write_accessor(
-              create_random_filled_accessor_w(output_shape, allocator));
+          create_random_filled_accessor_r<DataType::FLOAT>(output_shape,
+                                                           allocator);
      GenericTensorAccessorW input_grad_accessor =
-          create_random_filled_accessor_w(input_shape, allocator);
+          create_random_filled_accessor_w<DataType::FLOAT>(input_shape,
+                                                           allocator);
 
      Kernels::Transpose::backward_kernel(managed_stream.raw_stream(),
                                          state,
@@ -49,8 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) {
                                          output_grad_accessor);
 
      std::vector<float> host_grad_input_data =
-          load_accessor_data<DataType::FLOAT>(
-              read_only_accessor_from_write_accessor(input_grad_accessor));
+          load_accessor_data<DataType::FLOAT>(input_grad_accessor);
      CHECK(contains_non_zero(host_grad_input_data));
    }
  }
diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc
index c9d2bf0a7c..b147523604 100644
--- a/lib/kernels/test/src/test_utils.cc
+++ b/lib/kernels/test/src/test_utils.cc
@@ -1,24 +1,19 @@
 #include "test_utils.h"
 
-GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
-                                                       Allocator &allocator,
-                                                       bool on_host) {
-  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  size_t volume = accessor.shape.num_elements();
-  std::vector<float> host_data(volume);
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape,
+                                                       Allocator &allocator) {
+  GenericTensorAccessorW accessor =
+      create_random_filled_accessor_w(shape, allocator);
 
-  for (auto &val : host_data) {
-    val = dist(gen);
-  }
-
-  transfer_memory(static_cast<float *>(accessor.ptr),
-                  host_data.data(),
-                  volume,
-                  GpuDirection::HostToDevice,
-                  on_host);
+  return read_only_accessor_from_write_accessor(accessor);
+}
 
-  return accessor;
+TensorShape make_tensor_shape_from_legion_dims(FFOrdered<size_t> dims,
+                                               DataType DT) {
+  return TensorShape{
+      TensorDims{
+          dims,
+      },
+      DT,
+  };
 }
diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h
index 1ce9e7a3d7..4426ba2df8 100644
--- a/lib/kernels/test/src/test_utils.h
+++ b/lib/kernels/test/src/test_utils.h
@@ -8,55 +8,74 @@
 #include "kernels/managed_per_device_ff_handle.h"
 #include <random>
 
-enum class GpuDirection {
-  HostToDevice = 0,
-  DeviceToHost = 1,
-  DeviceToDevice = 2
-};
-
 template <typename DT>
-void transfer_memory(DT *dst,
+void transfer_memory(GenericTensorAccessorW dst_accessor,
                      const DT *src,
-                     size_t num_elements,
-                     GpuDirection gpu_dir,
-                     bool cpu_memory) {
-  size_t bytes = num_elements * sizeof(DT);
-
-  if (cpu_memory) {
-    memcpy(dst, src, bytes);
+                     AllocLocation src_loc) {
+  size_t bytes = dst_accessor.shape.get_volume() * sizeof(DT);
+  AllocLocation dst_loc =
+      dst_accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST;
+
+  if (src_loc == AllocLocation::HOST && dst_loc == AllocLocation::HOST) {
+    memcpy(dst_accessor.ptr, src, bytes);
+  } else if (src_loc == AllocLocation::HOST &&
+             dst_loc == AllocLocation::DEVICE) {
+    checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyHostToDevice));
+  } else if (src_loc == AllocLocation::DEVICE &&
+             dst_loc == AllocLocation::HOST) {
+    checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToHost));
   } else {
-    switch (gpu_dir) {
-      case GpuDirection::HostToDevice:
-        checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
-        break;
-      case GpuDirection::DeviceToHost:
-        checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));
-        break;
-      case GpuDirection::DeviceToDevice:
-        checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
-        break;
-    }
+    checkCUDA(
+        cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToDevice));
  }
 }
 
+template <DataType DT>
 GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
-                                                       Allocator &allocator,
-                                                       bool on_host = false);
+                                                       Allocator &allocator) {
+  assert(shape.data_type == DataType::FLOAT ||
+         shape.data_type == DataType::DOUBLE);
+  using T = real_type<DT>;
+
+  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+  accessor.on_device =
+      (allocator.alloc_location == AllocLocation::DEVICE) ? true : false;
+
+  std::vector<T> host_data(accessor.shape.num_elements());
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<T> dist(-1.0, 1.0);
+
+  for (auto &val : host_data) {
+    val = dist(gen);
+  }
+
+  transfer_memory(accessor, host_data.data(), AllocLocation::HOST);
+
+  return accessor;
+}
+
+template <DataType DT>
+GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape,
+                                                       Allocator &allocator) {
+  GenericTensorAccessorW accessor =
+      create_random_filled_accessor_w<DT>(shape, allocator);
+
+  return read_only_accessor_from_write_accessor(accessor);
+}
 
 template <typename DT>
 GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
                                                 Allocator &allocator,
-                                                DT val,
-                                                bool on_host = false) {
+                                                DT val) {
  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  size_t volume = accessor.shape.num_elements();
+  accessor.on_device =
+      (allocator.alloc_location == AllocLocation::DEVICE) ? true : false;
+
+  size_t volume = accessor.shape.get_volume();
  std::vector<DT> host_data(volume, val);
 
-  transfer_memory(static_cast<DT *>(accessor.ptr),
-                  host_data.data(),
-                  volume,
-                  GpuDirection::HostToDevice,
-                  on_host);
+  transfer_memory(accessor, host_data.data(), AllocLocation::HOST);
 
  return accessor;
 }
@@ -64,9 +83,11 @@ GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
 template <typename DT, typename F>
 GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape,
                                                      Allocator &allocator,
-                                                     F transform,
-                                                     bool on_host = false) {
+                                                     F transform) {
  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+  accessor.on_device =
+      (allocator.alloc_location == AllocLocation::DEVICE) ? true : false;
+
  size_t volume = accessor.shape.get_volume();
  std::vector<DT> input_data(volume);
  std::vector<DT> output_data(volume);
@@ -74,11 +95,7 @@ GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape,
  std::transform(
      input_data.begin(), input_data.end(), output_data.begin(), transform);
 
-  transfer_memory(static_cast<DT *>(accessor.ptr),
-                  output_data.data(),
-                  volume,
-                  GpuDirection::HostToDevice,
-                  on_host);
+  transfer_memory(accessor, output_data.data(), AllocLocation::HOST);
 
  return accessor;
 }
@@ -86,42 +103,59 @@ GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape,
 template <DataType DT>
 GenericTensorAccessorW
    copy_tensor_between_memories(GenericTensorAccessorR accessor,
-                                 TensorShape const &shape,
-                                 Allocator &allocator,
-                                 bool src_on_host = false) {
+                                 Allocator &allocator) {
+  TensorShape shape = get_tensor_shape(accessor.shape, accessor.data_type);
  GenericTensorAccessorW copied_accessor = allocator.allocate_tensor(shape);
+  copied_accessor.on_device =
+      (allocator.alloc_location == AllocLocation::DEVICE) ? true : false;
 
-  size_t volume = accessor.shape.get_volume();
-  GpuDirection gpu_dir =
-      src_on_host ? GpuDirection::HostToDevice : GpuDirection::DeviceToHost;
+  AllocLocation src_loc =
+      accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST;
 
-  transfer_memory(
-      copied_accessor.get<DT>(), accessor.get<DT>(), volume, gpu_dir, false);
+  transfer_memory(copied_accessor, accessor.get<DT>(), src_loc);
 
  return copied_accessor;
 }
 
-template <DataType DT>
-TensorShape make_tensor_shape_from_legion_dims(FFOrdered<size_t> dims) {
-  return TensorShape{
-      TensorDims{
-          dims,
-      },
-      DT,
-  };
-}
+TensorShape make_tensor_shape_from_legion_dims(FFOrdered<size_t> dims,
+                                               DataType DT);
 
 template <DataType DT>
-std::vector<real_type<DT>> load_accessor_data(GenericTensorAccessorR accessor,
-                                              bool on_host = false) {
+std::vector<real_type<DT>> load_accessor_data(GenericTensorAccessorR accessor) {
+  using T = real_type<DT>;
+  int volume = accessor.shape.get_volume();
+  std::vector<T> local_data(volume);
+  T const *src_ptr = accessor.get<DT>();
+  if (accessor.on_device) {
+    checkCUDA(cudaMemcpy(local_data.data(),
+                         src_ptr,
+                         volume * sizeof(T),
+                         cudaMemcpyDeviceToHost));
+  } else {
+    memcpy(local_data.data(), src_ptr, volume * sizeof(T));
+  }
+
+  return local_data;
+}
+
+template <DataType DT>
+std::vector<real_type<DT>> load_accessor_data(GenericTensorAccessorW accessor) {
+  using T = real_type<DT>;
+
+  int volume = accessor.shape.get_volume();
+  std::vector<T> local_data(volume);
+  T const *src_ptr = accessor.get<DT>();
-  transfer_memory(
-      local_data.data(), src_ptr, volume, GpuDirection::DeviceToHost, on_host);
+  if (accessor.on_device) {
+    checkCUDA(cudaMemcpy(local_data.data(),
+                         src_ptr,
+                         volume * sizeof(T),
+                         cudaMemcpyDeviceToHost));
+  } else {
+    memcpy(local_data.data(), src_ptr, volume * sizeof(T));
+  }
 
  return local_data;
 }
diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h
index ae7bd076ce..56d3b5550f 100644
--- a/lib/local-execution/include/local-execution/tracked_allocator.h
+++ b/lib/local-execution/include/local-execution/tracked_allocator.h
@@ -12,6 +12,7 @@ struct TrackedAllocator : public IAllocator {
  ~TrackedAllocator() = default;
 
  void *allocate(size_t) override;
+  void *allocate_and_zero(size_t) override;
  void deallocate(void *) override;
 
  size_t get_current_mem_usage();
diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc
index 68636906c3..18546ad54b 100644
--- a/lib/local-execution/src/tracked_allocator.cc
+++ b/lib/local-execution/src/tracked_allocator.cc
@@ -11,6 +11,12 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) {
  return ptr;
 }
 
+void *TrackedAllocator::allocate_and_zero(size_t requested_memory_size) {
+  void *ptr = this->allocator.allocate_and_zero(requested_memory_size);
+  this->current_mem_usage += requested_memory_size;
+  return ptr;
+}
+
 void TrackedAllocator::deallocate(void *ptr) {
  size_t psize;
  checkCUDA(cudaGetSymbolSize(&psize, ptr));
@@ -23,7 +29,9 @@ size_t TrackedAllocator::get_current_mem_usage() {
 }
 
 Allocator get_tracked_memory_allocator(Allocator const &base_allocator) {
-  return Allocator::create<TrackedAllocator>(base_allocator);
+  Allocator allocator = Allocator::create<TrackedAllocator>(base_allocator);
+  allocator.alloc_location = base_allocator.alloc_location;
+  return allocator;
 }
 
 } // namespace FlexFlow
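
For a sense of how the reworked test utilities are intended to compose, the sketch below mirrors the GPU-vs-CPU comparison pattern used in the replicate test. It is illustrative only and not part of the patch: the include paths, the wrapper function, and the way the allocators and managed stream are obtained are assumptions; the helper calls themselves (make_tensor_shape_from_legion_dims, create_random_filled_accessor_r, copy_tensor_between_memories, load_accessor_data, cpu_forward_kernel) are the ones introduced above.

// Sketch only -- assumes the test_utils.h helpers from this patch; header
// paths and the caller-provided allocators/stream are assumptions.
#include "kernels/replicate_kernels.h"     // assumed location of forward_kernel
#include "kernels/replicate_kernels_cpu.h" // assumed location of cpu_forward_kernel
#include "test_utils.h"
#include <cassert>
#include <vector>

using namespace FlexFlow;

void replicate_forward_gpu_vs_cpu_sketch(Allocator &gpu_allocator,
                                         Allocator &cpu_allocator,
                                         ManagedFFStream &managed_stream) {
  TensorShape shape =
      make_tensor_shape_from_legion_dims({10, 3}, DataType::FLOAT);

  // Random input lives in GPU memory; on_device is derived from the allocator.
  GenericTensorAccessorR input_gpu =
      create_random_filled_accessor_r<DataType::FLOAT>(shape, gpu_allocator);
  GenericTensorAccessorW output_gpu = gpu_allocator.allocate_tensor(shape);
  Kernels::Replicate::forward_kernel(
      managed_stream.raw_stream(), input_gpu, output_gpu);

  // Mirror the same input into host memory and run the CPU kernel on it.
  GenericTensorAccessorW input_cpu =
      copy_tensor_between_memories<DataType::FLOAT>(input_gpu, cpu_allocator);
  GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(shape);
  Kernels::Replicate::cpu_forward_kernel(
      read_only_accessor_from_write_accessor(input_cpu), output_cpu);

  // load_accessor_data chooses cudaMemcpy vs. memcpy based on on_device, so
  // both results come back as host-side vectors and can be compared directly.
  std::vector<float> gpu_result =
      load_accessor_data<DataType::FLOAT>(output_gpu);
  std::vector<float> cpu_result =
      load_accessor_data<DataType::FLOAT>(output_cpu);
  assert(gpu_result == cpu_result);
}

The design point the sketch relies on is that each accessor now carries an on_device flag set from Allocator::alloc_location, so the copy direction no longer has to be threaded through every helper as a boolean parameter.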