test utils logic cleanup, reverse cpu_kernel pedagogical implementation, other minor fixes
flexflow · Jul 31, 2024 · e6e2161 · e6e2161
1 parent ba586ae
commit e6e2161
Show file tree

Hide file tree

Showing 40 changed files with 590 additions and 476 deletions.
diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h
@@ -29,15 +29,20 @@ class GenericTensorAccessorW {
   double *get_double_ptr() const;
   half *get_half_ptr() const;
 
+  GenericTensorAccessorW(DataType dt,
+                         ArrayShape sh,
+                         req<void *> p,
+                         bool on_dev = true)
+      : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {}
+
 public:
   DataType data_type;
   ArrayShape shape;
   req<void *> ptr;
+  bool on_device;
 };
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW,
-                                             data_type,
-                                             shape,
-                                             ptr);
+FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(
+    GenericTensorAccessorW, data_type, shape, ptr, on_device);
 
 class GenericTensorAccessorR {
 public:
@@ -57,15 +62,20 @@ class GenericTensorAccessorR {
   double const *get_double_ptr() const;
   half const *get_half_ptr() const;
 
+  GenericTensorAccessorR(DataType dt,
+                         ArrayShape sh,
+                         req<void const *> p,
+                         bool on_dev = true)
+      : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {}
+
 public:
   DataType data_type;
   ArrayShape shape;
   req<void const *> ptr;
+  bool on_device;
 };
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR,
-                                             data_type,
-                                             shape,
-                                             ptr);
+FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(
+    GenericTensorAccessorR, data_type, shape, ptr, on_device);
 
 int32_t *get_int32_ptr(GenericTensorAccessorW const &);
 int64_t *get_int64_ptr(GenericTensorAccessorW const &);

diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h
@@ -5,10 +5,13 @@
 #include <cstddef>
 #include <memory>
 
+enum class AllocLocation { HOST, DEVICE };
+
 namespace FlexFlow {
 
 struct IAllocator {
   virtual void *allocate(size_t) = 0;
+  virtual void *allocate_and_zero(size_t) = 0;
   virtual void deallocate(void *) = 0;
 
   virtual ~IAllocator() = default;
@@ -18,7 +21,11 @@ struct Allocator {
   Allocator() = delete;
 
   GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape);
+  GenericTensorAccessorW
+      allocate_tensor_and_zero(TensorShape const &tensor_shape);
+
   void *allocate(size_t mem_size);
+  void *allocate_and_zero(size_t mem_size);
   void deallocate(void *ptr);
 
   template <typename T, typename... Args>
@@ -30,6 +37,8 @@ struct Allocator {
 
   Allocator(std::shared_ptr<IAllocator> ptr) : i_allocator(ptr){};
 
+  AllocLocation alloc_location;
+
 private:
   std::shared_ptr<IAllocator> i_allocator;
 };

diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h
@@ -7,19 +7,17 @@
 namespace FlexFlow {
 namespace Kernels {
 namespace Cast {
-namespace CPU {
 
-void forward_kernel(GenericTensorAccessorR const &input,
-                    GenericTensorAccessorW const &output,
-                    DataType input_type,
-                    DataType output_type);
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output,
+                        DataType input_type,
+                        DataType output_type);
 
-void backward_kernel(GenericTensorAccessorR const &input,
-                     GenericTensorAccessorW const &output,
-                     DataType input_type,
-                     DataType output_type);
+void cpu_backward_kernel(GenericTensorAccessorR const &input,
+                         GenericTensorAccessorW const &output,
+                         DataType input_type,
+                         DataType output_type);
 
-} // namespace CPU
 } // namespace Cast
 } // namespace Kernels
 } // namespace FlexFlow

diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h
@@ -7,15 +7,13 @@
 namespace FlexFlow {
 namespace Kernels {
 namespace Combine {
-namespace CPU {
 
-void forward_kernel(GenericTensorAccessorR const &input,
-                    GenericTensorAccessorW const &output);
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output);
 
-void backward_kernel(GenericTensorAccessorR const &output_grad,
-                     GenericTensorAccessorW const &input_grad);
+void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorW const &input_grad);
 
-} // namespace CPU
 } // namespace Combine
 } // namespace Kernels
 } // namespace FlexFlow

diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h
@@ -10,6 +10,7 @@ struct LocalCPUAllocator : public IAllocator {
   ~LocalCPUAllocator() override;
 
   void *allocate(size_t) override;
+  void *allocate_and_zero(size_t) override;
   void deallocate(void *) override;
 
 private:

diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h
@@ -10,6 +10,7 @@ struct LocalCudaAllocator : public IAllocator {
   ~LocalCudaAllocator() override;
 
   void *allocate(size_t) override;
+  void *allocate_and_zero(size_t) override;
   void deallocate(void *) override;
 
 private:

diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h
@@ -7,16 +7,14 @@
 namespace FlexFlow {
 namespace Kernels {
 namespace Replicate {
-namespace CPU {
 
-void forward_kernel(GenericTensorAccessorR const &input,
-                    GenericTensorAccessorW const &output);
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output);
 
-void backward_kernel(GenericTensorAccessorW const &input,
-                     GenericTensorAccessorR const &output,
-                     size_t num_replicas);
+void cpu_backward_kernel(GenericTensorAccessorW const &input,
+                         GenericTensorAccessorR const &output,
+                         size_t num_replicas);
 
-} // namespace CPU
 } // namespace Replicate
 } // namespace Kernels
 } // namespace FlexFlow

diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h
@@ -6,22 +6,20 @@
 namespace FlexFlow {
 namespace Kernels {
 namespace Reverse {
-namespace CPU {
 
-void forward_kernel(float const *in_ptr,
-                    float *out_ptr,
-                    coord_t num_out_blks,
-                    coord_t reverse_dim_size,
-                    coord_t in_blk_size,
-                    coord_t output_size);
+void cpu_forward_kernel(float const *in_ptr,
+                        float *out_ptr,
+                        coord_t num_out_blks,
+                        coord_t reverse_dim_size,
+                        coord_t in_blk_size,
+                        coord_t output_size);
 
-void backward_kernel(float const *out_grad_ptr,
-                     float *in_grad_ptr,
-                     coord_t num_out_blks,
-                     coord_t reverse_dim_size,
-                     coord_t in_blk_size,
-                     coord_t input_size);
-} // namespace CPU
+void cpu_backward_kernel(float const *out_grad_ptr,
+                         float *in_grad_ptr,
+                         coord_t num_out_blks,
+                         coord_t reverse_dim_size,
+                         coord_t in_blk_size,
+                         coord_t input_size);
 } // namespace Reverse
 } // namespace Kernels
 } // namespace FlexFlow

diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc
@@ -134,8 +134,10 @@ std::vector<half const *>
 
 GenericTensorAccessorR read_only_accessor_from_write_accessor(
     GenericTensorAccessorW const &writable) {
-  return GenericTensorAccessorR{
-      writable.data_type, writable.shape, req<void const *>(writable.ptr)};
+  return GenericTensorAccessorR{writable.data_type,
+                                writable.shape,
+                                req<void const *>(writable.ptr),
+                                writable.on_device};
 }
 
 } // namespace FlexFlow
diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc
@@ -6,14 +6,26 @@ void *Allocator::allocate(size_t mem_size) {
   return this->i_allocator->allocate(mem_size);
 }
 
+void *Allocator::allocate_and_zero(size_t mem_size) {
+  return this->i_allocator->allocate_and_zero(mem_size);
+}
+
 void Allocator::deallocate(void *ptr) {
   this->i_allocator->deallocate(ptr);
 }
 
 GenericTensorAccessorW
     Allocator::allocate_tensor(TensorShape const &tensor_shape) {
   void *ptr = this->allocate(get_size_in_bytes(tensor_shape));
-  return {tensor_shape.data_type, tensor_shape, ptr};
+  bool on_device = this->alloc_location == AllocLocation::DEVICE;
+  return {tensor_shape.data_type, tensor_shape, ptr, on_device};
+}
+
+GenericTensorAccessorW
+    Allocator::allocate_tensor_and_zero(TensorShape const &tensor_shape) {
+  void *ptr = this->allocate_and_zero(get_size_in_bytes(tensor_shape));
+  bool on_device = this->alloc_location == AllocLocation::DEVICE;
+  return {tensor_shape.data_type, tensor_shape, ptr, on_device};
 }
 
 } // namespace FlexFlow
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
@@ -60,4 +60,10 @@ size_t get_volume(ArrayShape const &shape) {
   return shape.get_volume();
 }
 
+TensorShape get_tensor_shape(ArrayShape const &shape, DataType DT) {
+  FFOrdered<size_t> ff_dims(shape.dims.begin(), shape.dims.end());
+  TensorDims tensor_shape_dims(ff_dims);
+  return TensorShape(tensor_shape_dims, DT);
+}
+
 } // namespace FlexFlow
diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc
@@ -4,56 +4,55 @@
 namespace FlexFlow {
 namespace Kernels {
 namespace Cast {
-namespace CPU {
 
 template <typename IDT, typename ODT>
-void cast_forward(IDT const *input, ODT *output, size_t volume) {
+void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) {
   for (size_t i = 0; i < volume; ++i) {
     output[i] = static_cast<ODT>(input[i]);
   }
 }
 
 template <typename IDT, typename ODT>
-void cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) {
+void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) {
   for (size_t i = 0; i < volume; i++) {
     output[i] = static_cast<ODT>(input[i]) + beta * output[i];
   }
 }
 
 template <DataType IDT, DataType ODT>
-struct ForwardKernel {
+struct CPUForwardKernel {
   void operator()(GenericTensorAccessorR const &input,
                   GenericTensorAccessorW const &output) {
     size_t volume = input.shape.get_volume();
-    cast_forward(input.get<IDT>(), output.get<ODT>(), volume);
+    cpu_cast_forward(input.get<IDT>(), output.get<ODT>(), volume);
   }
 };
 
 template <DataType IDT, DataType ODT>
-struct BackwardKernel {
+struct CPUBackwardKernel {
   void operator()(GenericTensorAccessorR const &input,
                   GenericTensorAccessorW const &output) {
     size_t volume = input.shape.get_volume();
-    cast_backward(
+    cpu_cast_backward(
         input.get<IDT>(), output.get<ODT>(), volume, cast_to<ODT>(1.0f));
   }
 };
 
-void forward_kernel(GenericTensorAccessorR const &input,
-                    GenericTensorAccessorW const &output,
-                    DataType input_type,
-                    DataType output_type) {
-  DataTypeDispatch2<ForwardKernel>{}(input_type, output_type, input, output);
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output,
+                        DataType input_type,
+                        DataType output_type) {
+  DataTypeDispatch2<CPUForwardKernel>{}(input_type, output_type, input, output);
 }
 
-void backward_kernel(GenericTensorAccessorR const &input,
-                     GenericTensorAccessorW const &output,
-                     DataType input_type,
-                     DataType output_type) {
-  DataTypeDispatch2<BackwardKernel>{}(input_type, output_type, input, output);
+void cpu_backward_kernel(GenericTensorAccessorR const &input,
+                         GenericTensorAccessorW const &output,
+                         DataType input_type,
+                         DataType output_type) {
+  DataTypeDispatch2<CPUBackwardKernel>{}(
+      input_type, output_type, input, output);
 }
 
-} // namespace CPU
 } // namespace Cast
 } // namespace Kernels
 } // namespace FlexFlow
diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc
@@ -4,10 +4,9 @@
 namespace FlexFlow {
 namespace Kernels {
 namespace Combine {
-namespace CPU {
 
 template <DataType DT>
-struct ForwardKernel {
+struct CPUForwardKernel {
   void operator()(GenericTensorAccessorR const &input,
                   GenericTensorAccessorW const &output) {
     memcpy(output.get<DT>(),
@@ -17,7 +16,7 @@ struct ForwardKernel {
 };
 
 template <DataType DT>
-struct BackwardKernel {
+struct CPUBackwardKernel {
   void operator()(GenericTensorAccessorR const &output_grad,
                   GenericTensorAccessorW const &input_grad) {
     size_t num_elements = output_grad.shape.get_volume();
@@ -27,18 +26,17 @@ struct BackwardKernel {
   }
 };
 
-void forward_kernel(GenericTensorAccessorR const &input,
-                    GenericTensorAccessorW const &output) {
-  DataTypeDispatch1<ForwardKernel>{}(input.data_type, input, output);
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output) {
+  DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output);
 }
 
-void backward_kernel(GenericTensorAccessorR const &output_grad,
-                     GenericTensorAccessorW const &input_grad) {
-  DataTypeDispatch1<BackwardKernel>{}(
+void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorW const &input_grad) {
+  DataTypeDispatch1<CPUBackwardKernel>{}(
       input_grad.data_type, output_grad, input_grad);
 }
 
-} // namespace CPU
 } // namespace Combine
 } // namespace Kernels
 } // namespace FlexFlow