flexflow · oOTigger · Jul 11, 2024 · Jul 12, 2024 · Jul 14, 2024 · Jul 14, 2024
diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h
@@ -29,15 +29,20 @@
   double *get_double_ptr() const;
   half *get_half_ptr() const;
 
+  GenericTensorAccessorW(DataType dt, // Member-wise constructor: dt initialises data_type,
+                         ArrayShape sh, // sh initialises shape,
+                         req<void *> p, // p initialises ptr,
+                         bool on_dev = true) // on_dev initialises on_device (true when omitted).
+      : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} // No checks or allocation happen here.
+
 public:
   DataType data_type;
   ArrayShape shape;
   req<void *> ptr;
+  bool on_device; // Set from the constructor's on_dev; Allocator::allocate_tensor passes true only when its alloc_location is AllocLocation::DEVICE.
 };
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW,
-                                             data_type,
-                                             shape,
-                                             ptr);
+FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(
+    GenericTensorAccessorW, data_type, shape, ptr, on_device); // Lists all four data members, on_device included.
 
 class GenericTensorAccessorR {
 public:
@@ -57,15 +62,20 @@
   double const *get_double_ptr() const;
   half const *get_half_ptr() const;
 
+  GenericTensorAccessorR(DataType dt, // Member-wise constructor: dt initialises data_type,
+                         ArrayShape sh, // sh initialises shape,
+                         req<void const *> p, // p initialises ptr (pointer to const data),
+                         bool on_dev = true) // on_dev initialises on_device (true when omitted).
+      : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} // No checks or allocation happen here.
+
 public:
   DataType data_type;
   ArrayShape shape;
   req<void const *> ptr;
+  bool on_device; // Set from the constructor's on_dev; read_only_accessor_from_write_accessor copies it from the writable accessor.
 };
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR,
-                                             data_type,
-                                             shape,
-                                             ptr);
+FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(
+    GenericTensorAccessorR, data_type, shape, ptr, on_device); // Lists all four data members, on_device included.
 
 int32_t *get_int32_ptr(GenericTensorAccessorW const &);
 int64_t *get_int64_ptr(GenericTensorAccessorW const &);

diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h
@@ -5,10 +5,13 @@
 #include <cstddef>
 #include <memory>
 
+enum class AllocLocation { HOST, DEVICE }; // Tag stored in Allocator::alloc_location; allocate_tensor sets on_device = true only for DEVICE. NOTE(review): declared at global scope, before namespace FlexFlow opens -- confirm this is intended.
+
 namespace FlexFlow {
 
 struct IAllocator {
   virtual void *allocate(size_t) = 0;
+  virtual void *allocate_and_zero(size_t) = 0; // Pure virtual, so every concrete IAllocator must override it; presumably returns zero-filled memory (inferred from the name -- confirm in the implementations).
   virtual void deallocate(void *) = 0;
 
   virtual ~IAllocator() = default;
@@ -18,7 +21,11 @@ struct Allocator {
   Allocator() = delete;
 
   GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape);
+  GenericTensorAccessorW
+      allocate_tensor_and_zero(TensorShape const &tensor_shape); // Like allocate_tensor, but the buffer is obtained through allocate_and_zero.
+
   void *allocate(size_t mem_size);
+  void *allocate_and_zero(size_t mem_size); // Forwards mem_size to the wrapped IAllocator's allocate_and_zero and returns its result.
   void deallocate(void *ptr);
 
   template <typename T, typename... Args>
@@ -30,6 +37,8 @@ struct Allocator {
 
   Allocator(std::shared_ptr<IAllocator> ptr) : i_allocator(ptr){};
 
+  AllocLocation alloc_location = AllocLocation::DEVICE; // Read by allocate_tensor / allocate_tensor_and_zero to fill on_device. Given a default because the constructor does not set it; DEVICE matches the accessors' on_dev = true default.
+
 private:
   std::shared_ptr<IAllocator> i_allocator;
 };

diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h
@@ -3,8 +3,6 @@
 
 #include "device.h"
 #include "kernels/accessor.h"
-#include "kernels/ff_handle.h"
-#include "op-attrs/activation.dtg.h"
 
 namespace FlexFlow {
 namespace Kernels {

diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h
@@ -0,0 +1,25 @@
+#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H
+#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H
+
+#include "device.h"
+#include "kernels/accessor.h"
+
+namespace FlexFlow {
+namespace Kernels {
+namespace Cast {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input, // CPU cast, forward: converts input.shape.get_volume() elements of input
+                        GenericTensorAccessorW const &output, // into output, one static_cast per element.
+                        DataType input_type, // Dispatch key selecting the element type used to read input.
+                        DataType output_type); // Dispatch key selecting the element type used to write output.
+
+void cpu_backward_kernel(GenericTensorAccessorR const &input, // CPU cast, backward: for each of input.shape.get_volume() elements,
+                         GenericTensorAccessorW const &output, // output[i] = converted input[i] + 1 * output[i], so output accumulates.
+                         DataType input_type, // Dispatch key selecting the element type used to read input.
+                         DataType output_type); // Dispatch key selecting the element type of output (and of the factor 1).
+
+} // namespace Cast
+} // namespace Kernels
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h
@@ -0,0 +1,21 @@
+#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H
+#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H
+
+#include "device.h"
+#include "kernels/accessor.h"
+
+namespace FlexFlow {
+namespace Kernels {
+namespace Combine {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input, // CPU combine, forward: memcpy of the whole input buffer
+                        GenericTensorAccessorW const &output); // into output (get_volume() elements of input.data_type).
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, // CPU combine, backward: adds output_grad element-wise
+                         GenericTensorAccessorW const &input_grad); // into input_grad (+=); dispatches on input_grad.data_type.
+
+} // namespace Combine
+} // namespace Kernels
+} // namespace FlexFlow
+
+#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H
diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h
@@ -0,0 +1,38 @@
+#ifndef _FLEXFLOW_KERNELS_LOCAL_CPU_ALLOCATOR_H
+#define _FLEXFLOW_KERNELS_LOCAL_CPU_ALLOCATOR_H
+
+#include "kernels/allocation.h"
+#include <unordered_set>
+
+namespace FlexFlow {
+
+/**
+ * @brief IAllocator implementation for the local CPU (per its name).
+ *
+ * Non-copyable and non-movable. Only declarations live here; the destructor
+ * and the three overrides are defined in another file.
+ */
+struct LocalCPUAllocator : public IAllocator {
+  LocalCPUAllocator() = default;
+  LocalCPUAllocator(LocalCPUAllocator const &) = delete;
+  LocalCPUAllocator(LocalCPUAllocator &&) = delete;
+  ~LocalCPUAllocator() override;
+
+  void *allocate(size_t) override;
+  void *allocate_and_zero(size_t) override;
+  void deallocate(void *) override;
+
+private:
+  // NOTE(review): presumably the pointers handed out and not yet deallocated;
+  // confirm against the member definitions.
+  std::unordered_set<void *> ptrs;
+};
+CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator);
+
+/// Factory returning an Allocator; presumably one backed by a
+/// LocalCPUAllocator (definition not visible here -- confirm).
+Allocator create_local_cpu_memory_allocator();
+
+} // namespace FlexFlow
+
+#endif // _FLEXFLOW_KERNELS_LOCAL_CPU_ALLOCATOR_H
diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h
@@ -10,6 +10,7 @@ struct LocalCudaAllocator : public IAllocator {
   ~LocalCudaAllocator() override;
 
   void *allocate(size_t) override;
+  void *allocate_and_zero(size_t) override; // Overrides IAllocator::allocate_and_zero; the definition is not in this header.
   void deallocate(void *) override;
 
 private:

diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h
@@ -0,0 +1,22 @@
+#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H
+#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H
+
+#include "device.h"
+#include "kernels/accessor.h"
+
+namespace FlexFlow {
+namespace Kernels {
+namespace Replicate {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input, // CPU replicate, forward. Definition not visible here;
+                        GenericTensorAccessorW const &output); // presumably copies input into output -- TODO confirm.
+
+void cpu_backward_kernel(GenericTensorAccessorW const &input, // CPU replicate, backward. Note: input is the writable accessor,
+                         GenericTensorAccessorR const &output, // output the read-only one.
+                         size_t num_replicas); // Presumably the number of replicas of output combined into input -- TODO confirm against the definition.
+
+} // namespace Replicate
+} // namespace Kernels
+} // namespace FlexFlow
+
+#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H
diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h
@@ -0,0 +1,27 @@
+#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H
+#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H
+
+#include "device.h"
+
+namespace FlexFlow {
+namespace Kernels {
+namespace Reverse {
+
+void cpu_forward_kernel(float const *in_ptr, // CPU reverse, forward. Takes raw float pointers: in_ptr is read-only,
+                        float *out_ptr, // out_ptr is writable.
+                        coord_t num_out_blks, // NOTE(review): the meaning of the four coord_t arguments is not
+                        coord_t reverse_dim_size, // visible in this header; judging by the names they describe the
+                        coord_t in_blk_size, // block layout around the reversed dimension -- confirm in the definition.
+                        coord_t output_size); // coord_t is expected to come from device.h, the only include here.
+
+void cpu_backward_kernel(float const *out_grad_ptr, // CPU reverse, backward. Takes raw float pointers: out_grad_ptr is read-only,
+                         float *in_grad_ptr, // in_grad_ptr is writable.
+                         coord_t num_out_blks, // NOTE(review): the meaning of the four coord_t arguments is not
+                         coord_t reverse_dim_size, // visible in this header; judging by the names they describe the
+                         coord_t in_blk_size, // block layout around the reversed dimension -- confirm in the definition.
+                         coord_t input_size); // coord_t is expected to come from device.h, the only include here.
+} // namespace Reverse
+} // namespace Kernels
+} // namespace FlexFlow
+
+#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H
diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc
@@ -134,8 +134,10 @@
 
 GenericTensorAccessorR read_only_accessor_from_write_accessor(
     GenericTensorAccessorW const &writable) {
-  return GenericTensorAccessorR{
-      writable.data_type, writable.shape, req<void const *>(writable.ptr)};
+  // Same data type, shape and location flag; only the pointer becomes const.
+  req<void const *> const_ptr(writable.ptr);
+  return GenericTensorAccessorR{
+      writable.data_type, writable.shape, const_ptr, writable.on_device};
 }
 
 } // namespace FlexFlow
diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc
@@ -6,14 +6,26 @@
   return this->i_allocator->allocate(mem_size);
 }
 
+void *Allocator::allocate_and_zero(size_t mem_size) { // Thin forwarder to the wrapped IAllocator.
+  return i_allocator->allocate_and_zero(mem_size);
+}
+
 void Allocator::deallocate(void *ptr) {
   this->i_allocator->deallocate(ptr);
 }
 
 GenericTensorAccessorW
     Allocator::allocate_tensor(TensorShape const &tensor_shape) {
   void *ptr = this->allocate(get_size_in_bytes(tensor_shape));
-  return {tensor_shape.data_type, tensor_shape, ptr};
+  return {tensor_shape.data_type, tensor_shape, ptr,
+          AllocLocation::DEVICE == this->alloc_location}; // Last field is on_device: true only for a DEVICE allocator.
 }
+
+GenericTensorAccessorW
+    Allocator::allocate_tensor_and_zero(TensorShape const &tensor_shape) {
+  void *zeroed = this->allocate_and_zero(get_size_in_bytes(tensor_shape)); // Buffer comes from allocate_and_zero.
+  return {tensor_shape.data_type, tensor_shape, zeroed,
+          AllocLocation::DEVICE == this->alloc_location}; // Last field is on_device: true only for a DEVICE allocator.
+}
 
 } // namespace FlexFlow
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
@@ -60,4 +60,10 @@
   return shape.get_volume();
 }
 
+// Builds a TensorShape of element type DT from the entries of shape.dims.
+TensorShape get_tensor_shape(ArrayShape const &shape, DataType DT) {
+  auto ordered = FFOrdered<size_t>(shape.dims.begin(), shape.dims.end());
+  return TensorShape(TensorDims(ordered), DT); // NOTE(review): entries keep shape.dims' iteration order -- confirm no reordering is required.
+}
+
 } // namespace FlexFlow
diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc
@@ -0,0 +1,109 @@
+#include "kernels/cast_kernels_cpu.h"
+#include "kernels/datatype_dispatch.h"
+
+namespace FlexFlow {
+namespace Kernels {
+namespace Cast {
+
+/**
+ * @brief Converts `volume` consecutive elements from IDT to ODT.
+ *
+ * Each output element is overwritten with static_cast<ODT> of the input
+ * element at the same position.
+ */
+template <typename IDT, typename ODT>
+void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) {
+  IDT const *const input_end = input + volume;
+  for (IDT const *src = input; src != input_end; ++src, ++output) {
+    *output = static_cast<ODT>(*src);
+  }
+}
+
+/**
+ * @brief Backward counterpart of cpu_cast_forward.
+ *
+ * For every index below `volume`, output[i] is replaced by the converted
+ * input[i] plus beta times the value output[i] held before.
+ */
+template <typename IDT, typename ODT>
+void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) {
+  size_t idx = 0;
+  while (idx < volume) {
+    auto const converted = static_cast<ODT>(input[idx]);
+    output[idx] = converted + beta * output[idx];
+    ++idx;
+  }
+}
+
+/**
+ * @brief Functor for one (IDT, ODT) pair, selected via DataTypeDispatch2.
+ *
+ * Runs cpu_cast_forward over input.shape.get_volume() elements.
+ */
+template <DataType IDT, DataType ODT>
+struct CPUForwardKernel {
+  void operator()(GenericTensorAccessorR const &input,
+                  GenericTensorAccessorW const &output) {
+    size_t const num_elements = input.shape.get_volume();
+    auto const *src = input.get<IDT>();
+    auto *dst = output.get<ODT>();
+    cpu_cast_forward(src, dst, num_elements);
+  }
+};
+
+/**
+ * @brief Functor for one (IDT, ODT) pair, selected via DataTypeDispatch2.
+ *
+ * Runs cpu_cast_backward over input.shape.get_volume() elements with
+ * beta = cast_to<ODT>(1.0f).
+ */
+template <DataType IDT, DataType ODT>
+struct CPUBackwardKernel {
+  void operator()(GenericTensorAccessorR const &input,
+                  GenericTensorAccessorW const &output) {
+    size_t const num_elements = input.shape.get_volume();
+    auto const *src = input.get<IDT>();
+    auto *dst = output.get<ODT>();
+    cpu_cast_backward(src, dst, num_elements, cast_to<ODT>(1.0f));
+  }
+};
+
+/**
+ * @brief CPU forward pass of the cast operator.
+ *
+ * Dispatches on (input_type, output_type) to CPUForwardKernel.
+ *
+ * @param input Read-only source accessor.
+ * @param output Writable destination accessor.
+ * @param input_type First dispatch key (IDT of the functor).
+ * @param output_type Second dispatch key (ODT of the functor).
+ */
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output,
+                        DataType input_type,
+                        DataType output_type) {
+  DataTypeDispatch2<CPUForwardKernel> dispatch{};
+  dispatch(input_type, output_type, input, output);
+}
+
+/**
+ * @brief CPU backward pass of the cast operator.
+ *
+ * Dispatches on (input_type, output_type) to CPUBackwardKernel.
+ *
+ * @param input Read-only accessor whose converted values are added.
+ * @param output Writable accessor that accumulates the result.
+ * @param input_type First dispatch key (IDT of the functor).
+ * @param output_type Second dispatch key (ODT of the functor).
+ */
+void cpu_backward_kernel(GenericTensorAccessorR const &input,
+                         GenericTensorAccessorW const &output,
+                         DataType input_type,
+                         DataType output_type) {
+  DataTypeDispatch2<CPUBackwardKernel> dispatch{};
+  dispatch(input_type, output_type, input, output);
+}
+
+} // namespace Cast
+} // namespace Kernels
+} // namespace FlexFlow
diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc
@@ -0,0 +1,73 @@
+#include "kernels/combine_kernels_cpu.h"
+#include "kernels/datatype_dispatch.h"
+#include <cstring>
+
+namespace FlexFlow {
+namespace Kernels {
+namespace Combine {
+
+/**
+ * @brief Forward functor for one data type, selected via DataTypeDispatch1.
+ *
+ * Copies input.shape.get_volume() * size_of_datatype(DT) bytes from the
+ * input buffer to the output buffer.
+ */
+template <DataType DT>
+struct CPUForwardKernel {
+  void operator()(GenericTensorAccessorR const &input,
+                  GenericTensorAccessorW const &output) {
+    size_t const num_bytes = input.shape.get_volume() * size_of_datatype(DT);
+    std::memcpy(output.get<DT>(), input.get<DT>(), num_bytes);
+  }
+};
+
+/**
+ * @brief Backward functor for one data type, selected via DataTypeDispatch1.
+ *
+ * Adds each element of output_grad to the element of input_grad at the same
+ * index; input_grad is accumulated into, not overwritten.
+ */
+template <DataType DT>
+struct CPUBackwardKernel {
+  void operator()(GenericTensorAccessorR const &output_grad,
+                  GenericTensorAccessorW const &input_grad) {
+    // Resolve the typed pointers once rather than on every iteration.
+    auto const *src = output_grad.get<DT>();
+    auto *dst = input_grad.get<DT>();
+    size_t const num_elements = output_grad.shape.get_volume();
+    // The index is size_t like num_elements: an int counter would compare
+    // signed with unsigned and overflow once the volume exceeds INT_MAX.
+    for (size_t i = 0; i < num_elements; ++i) {
+      dst[i] += src[i];
+    }
+  }
+};
+
+/**
+ * @brief CPU forward pass of the combine operator: output becomes a copy of
+ *        input.
+ *
+ * @param input Read-only source; its data_type selects the functor.
+ * @param output Writable destination.
+ */
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output) {
+  DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output);
+}
+
+/**
+ * @brief CPU backward pass of the combine operator: input_grad += output_grad.
+ *
+ * @param output_grad Read-only gradient that is added.
+ * @param input_grad Writable gradient that accumulates; its data_type selects
+ *                   the functor.
+ */
+void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorW const &input_grad) {
+  DataTypeDispatch1<CPUBackwardKernel>{}(
+      input_grad.data_type, output_grad, input_grad);
+}
+
+} // namespace Combine
+} // namespace Kernels
+} // namespace FlexFlow