Hip Refactor #1359

Merged 59 commits on May 24, 2024 (diff shown from 46 commits)

Commits
d269b40
compiler build
wmdi Oct 16, 2023
f46fd11
Merge branch 'test-substitution' into test-compiler
wmdi Oct 18, 2023
af67e9e
Merge branch 'test-substitution' into test-compiler
wmdi Nov 8, 2023
c015efb
unity dp works
wmdi Nov 15, 2023
6211b84
format
wmdi Nov 15, 2023
d9f1302
Merge remote-tracking branch 'upstream/repo-refactor' into test-compiler
wmdi Jan 24, 2024
fb58a99
fmt
wmdi Jan 24, 2024
02937e1
fix
wmdi Jan 24, 2024
6402ed0
add substitutions, compiler, and their unit tests to CI
wmdi Jan 25, 2024
0c45f61
disable runtime unit test
wmdi Jan 25, 2024
95fa427
minor fix
wmdi Feb 15, 2024
1f7e2b6
(not compilable) visitable issue for OptimalCostState
wmdi Feb 18, 2024
ffa7f79
first try on docs
Bob-Chen222 Feb 23, 2024
a9a6402
fix machine mapping hash & refactor dp algorithm
wmdi Feb 27, 2024
d8bbcb8
minor fix
wmdi Feb 27, 2024
09d3152
fix variant issue
wmdi Feb 28, 2024
a150d3a
fmt
wmdi Feb 28, 2024
2eb3fdf
fix
wmdi Mar 11, 2024
7598a92
fmt
wmdi Mar 11, 2024
05c8336
fix
wmdi Mar 14, 2024
71aeddb
Merge remote-tracking branch 'upstream/repo-refactor' into test-compiler
wmdi Mar 14, 2024
502b41f
Merge branch 'repo-refactor' into repo-refactor
Bob-Chen222 Mar 14, 2024
6962bc8
additional doc
Bob-Chen222 Mar 14, 2024
73d72d2
Merge branch 'repo-refactor' of https://github.com/Bob-Chen222/FlexFl…
Bob-Chen222 Mar 14, 2024
9345400
add more unit tests
wmdi Mar 18, 2024
c0015df
fmt
wmdi Mar 18, 2024
6d28697
Merge remote-tracking branch 'origin/repo-refactor' into compiler
lockshaw Mar 22, 2024
102f5fb
Fix post-merge
lockshaw Mar 22, 2024
d6e10bb
Add shell hook for sapling development
lockshaw Mar 23, 2024
95fb4cc
changed from nullopt to std::nullopt
Mar 23, 2024
c091479
fix cast issue
wmdi Mar 23, 2024
57bd35f
Merge branch 'test-compiler' of github.com:wmdi/FlexFlow into test-co…
wmdi Mar 23, 2024
54c604a
Fix spdlog cmake issue
lockshaw Mar 24, 2024
a09e528
Merge remote-tracking branch 'refs/remotes/wmdi/test-compiler' into c…
lockshaw Mar 24, 2024
8b914cf
Re-remove submodules
lockshaw Mar 24, 2024
189f323
minor fix & fmt
wmdi Mar 24, 2024
d2eb505
upd tests name to match ci
wmdi Mar 24, 2024
371324a
Add TEST_SUITE declaration to make tests findable by ctest
lockshaw Mar 26, 2024
da74817
Remove unnecessary nix files, add utils test to ci
lockshaw Mar 26, 2024
0db60db
Fix utils tests name, format
lockshaw Mar 26, 2024
6e520bb
Merge pull request #1229 from wmdi/test-compiler
wmdi Mar 26, 2024
bf9c0c0
resolved merge conflict
Bob-Chen222 Mar 28, 2024
c21d66e
add tutorial
Bob-Chen222 Apr 1, 2024
817065b
Merge branch 'repo-refactor' into repo-refactor
Bob-Chen222 Apr 4, 2024
fe164cc
Merge branch 'flexflow:repo-refactor' into repo-refactor
Bob-Chen222 Apr 6, 2024
7a9aeaf
align linear and layer_norm
Bob-Chen222 Apr 6, 2024
1369b35
fixed style
Apr 12, 2024
274d497
fixed layer_norm and linear
Apr 12, 2024
6e979e6
Revert "add tutorial"
Apr 12, 2024
080ed6a
Revert "first try on docs"
Bob-Chen222 Apr 12, 2024
03451e6
Revert "additional doc"
Bob-Chen222 Apr 12, 2024
de551ce
Update operator_pattern.h
Bob-Chen222 Apr 12, 2024
2fe8015
"revert substitution"
Bob-Chen222 Apr 12, 2024
4893ad4
align with the newest changes of linear and layer_norm
Bob-Chen222 Apr 15, 2024
943fabf
Merge remote-tracking branch 'upstream/repo-refactor' into bob-hip-re…
Bob-Chen222 Apr 19, 2024
86df8cc
format
Bob-Chen222 Apr 19, 2024
fe2ea11
deleted redundant function
Bob-Chen222 Apr 19, 2024
a304c7e
Merge branch 'repo-refactor' into bob-hip-refactor
reyna-abhyankar May 17, 2024
d18f3ff
Merge branch 'repo-refactor' into bob-hip-refactor
reyna-abhyankar May 24, 2024
1 change: 1 addition & 0 deletions docs/doxygen/Doxyfile
@@ -881,6 +881,7 @@ INPUT += $(FF_HOME)/include
INPUT += $(FF_HOME)/nmt
INPUT += $(FF_HOME)/python
INPUT += $(FF_HOME)/src
INPUT += $(FF_HOME)/lib/substitutions/include

# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
458 changes: 244 additions & 214 deletions lib/kernels/src/hip/layer_norm_kernels.cpp

Large diffs are not rendered by default.

130 changes: 78 additions & 52 deletions lib/kernels/src/hip/linear_kernels.cpp
@@ -19,65 +19,71 @@

namespace FlexFlow {

LinearPerDeviceState::LinearPerDeviceState(FFHandler handler, int batch_size)
    : PerDeviceOpState(handler) {
  // Allocate an all-ones vector
  float *dram_one_ptr = (float *)malloc(sizeof(float) * batch_size);
  for (int i = 0; i < batch_size; i++) {
    dram_one_ptr[i] = 1.0f;
  }
  float *fb_one_ptr;
  checkCUDA(hipMalloc(&fb_one_ptr, sizeof(float) * batch_size));
  checkCUDA(hipMemcpy(fb_one_ptr,
                      dram_one_ptr,
                      sizeof(float) * batch_size,
                      hipMemcpyHostToDevice));
  one_ptr = (float const *)fb_one_ptr;
  // Allocate descriptors
  checkCUDNN(miopenCreateActivationDescriptor(&actiDesc));
  checkCUDNN(miopenCreateTensorDescriptor(&outputTensor));
}

namespace Kernels {
namespace Linear {

bool use_activation(ActiMode mode) {
  switch (mode) {
    case AC_MODE_RELU:
    case AC_MODE_SIGMOID:
    case AC_MODE_TANH:
      return true;
    case AC_MODE_NONE:
      return false;
    default:
      assert(0);
      break;
  }
  return false;
}

void init_kernel(LinearPerDeviceState *m, int batch_size, int channel) {
  if (use_activation(m->activation)) {
    miopenActivationMode_t mode;
    switch (m->activation) {
      case AC_MODE_RELU:
        mode = miopenActivationRELU;
        break;
      case AC_MODE_SIGMOID:
        mode = miopenActivationLOGISTIC;
        break;
      default:
        // Unsupported activation mode
        assert(false);
    }
    checkCUDNN(miopenSetActivationDescriptor(m->actiDesc, mode, 0.0, 0.0, 0.0));
    checkCUDNN(miopenSet4dTensorDescriptor(m->outputTensor,
                                           ff_to_cudnn_datatype(m->output_type),
                                           batch_size,
                                           channel,
                                           1,
                                           1));
  }
}

// TODO: what is float *one_ptr for?
LinearPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                 Allocator allocator,
                                 float *one_ptr,
                                 ActiMode activation,
                                 Regularizer regularizer,
                                 bool use_bias,
                                 DataType input_type,
                                 DataType weight_type,
                                 DataType output_type,
                                 int batch_size,
                                 int channel) {
  ffTensorDescriptor_t outputTensor;
  ffActivationDescriptor_t actiDesc;
  checkCUDNN(miopenCreateTensorDescriptor(&outputTensor));
  checkCUDNN(miopenCreateActivationDescriptor(&actiDesc));
  checkCUDNN(miopenSet4dTensorDescriptor(outputTensor,
                                         ff_to_cudnn_datatype(output_type),
                                         batch_size,
                                         channel,
                                         1,
                                         1));

  miopenActivationMode_t mode;
  switch (activation) {
    case RELU:
      mode = MIOPEN_ACTIVATION_RELU;
      break;
    case SIGMOID:
      mode = MIOPEN_ACTIVATION_SIGMOID;
      break;
    case TANH:
      mode = MIOPEN_ACTIVATION_TANH;
      break;
    case GELU:
      mode = MIOPEN_ACTIVATION_GELU;
      break;
    default:
      // Unsupported activation mode
      assert(false);
  }
  checkCUDNN(miopenSetActivationDescriptor(actiDesc, mode, 0.0, 0.0, 0.0));

  // todo: how to use the allocator to allocate memory for float *one_ptr, and
  // how many bytes to allocate?
  checkCUDA(hipMalloc(&one_ptr, sizeof(float) * batch_size));
  LinearPerDeviceState per_device_state = {handle,
                                           outputTensor,
                                           actiDesc,
                                           one_ptr,
                                           activation,
                                           regularizer,
                                           use_bias,
                                           input_type,
                                           weight_type,
                                           output_type};
  return per_device_state;
}
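
// A possible answer to the allocator TODO above, sketched here as a comment
// (not part of this PR); it assumes Allocator exposes an allocate(size_t)
// method that returns device memory. It builds the same all-ones vector the
// old constructor created with malloc + hipMemcpy:
//
//   one_ptr = static_cast<float *>(allocator.allocate(sizeof(float) * batch_size));
//   std::vector<float> host_ones(batch_size, 1.0f);
//   checkCUDA(hipMemcpy(one_ptr,
//                       host_ones.data(),
//                       sizeof(float) * batch_size,
//                       hipMemcpyHostToDevice));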

void forward_kernel(hipStream_t stream,
@@ -230,7 +236,27 @@ void backward_kernel(hipStream_t stream,
in_dim,
compute_type,
HIPBLAS_GEMM_DEFAULT));

  if (m.kernel_reg_type == REG_MODE_NONE) {
    // do nothing
  } else if (m.kernel_reg_type == REG_MODE_L2) {
    checkCUDA(hipblasSgeam(m.handle.blas,
                           HIPBLAS_OP_N,
                           HIPBLAS_OP_N,
                           in_dim,
                           out_dim,
                           &alpha,
                           (float *)kernel_grad_ptr,
                           in_dim,
                           &(m.kernel_reg_lambda),
                           (float *)kernel_ptr,
                           in_dim,
                           (float *)kernel_grad_ptr,
                           in_dim));
  } else {
    assert(false && "Only L2 regularization is supported");
  }
  // compute bias gradient
  // NOTE: we use alpha = 1 for bias_grad to accumulate gradients
  // (this branch runs only when use_bias == true)
if (bias_grad_ptr != NULL) {
206 changes: 206 additions & 0 deletions lib/substitutions/TUTORIAL.md
@@ -0,0 +1,206 @@
## Tutorial of the substitution lib with a simple example

#### Create a pattern

```c++
// we need to specify both the node pattern and the edge pattern when
// defining a GraphPattern

// first, define an operator pattern; for example, constrain the node to hold
// a Linear operator
OperatorPattern operator_pattern_n0{
    std::vector<OperatorAttributeConstraint>{OperatorAttributeConstraint{
        ConstraintType::EQUAL, OperatorAttributeKey::OP_TYPE, Op::LINEAR}}};

// then define a tensor pattern that restricts the edges of the PCG; for
// example, require that the first dimension (indexed by 0) of a tensor is 2
ParallelTensorPattern tensor_pattern_e0{
    std::vector<TensorAttributeConstraint>{
        TensorAttributeConstraint{ConstraintType::EQUAL,
                                  ListIndexAccess<TensorAttributeKey>{
                                      TensorAttributeKey::DIM_SIZES, 0},
                                  2}}};
/*
remember that both operator_pattern and tensor_pattern hold a std::vector,
meaning that you can define more than one constraint depending on the context
*/
```
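
Since the constraint container is a `std::vector`, a node pattern can stack several constraints. For instance, a sketch reusing only the API shown above (the `USE_BIAS` value type is an assumption):

```cpp
// match only Linear operators that use a bias term
OperatorPattern linear_with_bias{std::vector<OperatorAttributeConstraint>{
    OperatorAttributeConstraint{
        ConstraintType::EQUAL, OperatorAttributeKey::OP_TYPE, Op::LINEAR},
    OperatorAttributeConstraint{
        ConstraintType::EQUAL, OperatorAttributeKey::USE_BIAS, true}}};
```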


#### Pack into GraphPattern
```c++
// create a graph whose node label is OperatorPattern and whose edge label is
// ParallelTensorPattern
auto ig =
    OutputLabelledOpenMultiDiGraph<OperatorPattern, ParallelTensorPattern>::
        create<UnorderedOutputLabelledOpenMultiDiGraph<
            OperatorPattern,
            ParallelTensorPattern>>();
// create a node carrying the operator constraints defined above
Node n0 = ig.add_node(operator_pattern_n0);
// add a node port to distinguish different edges entering the same node
NodePort p0 = ig.add_node_port();
// create an input edge
InputMultiDiEdge e0{n0, p0, std::make_pair(p0.value(), p0.value())};
ig.add_edge(e0);
// attach the edge constraints above to the edge e0
ig.add_label(e0, tensor_pattern_e0);

// the pattern graph is one input edge pointing to a single node
/*
n0 (Linear)
*/
RC_ASSERT(get_nodes(ig).size() == 1);
RC_ASSERT(get_edges(ig).size() == 1);
```
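
Later steps refer to an `input_graph`; presumably the pattern graph `ig` gets wrapped into the `GraphPattern` that the matcher consumes, roughly as follows (a sketch; the constructor shape is assumed):

```cpp
GraphPattern input_graph{ig};
```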

#### Define OutputGraph
```cpp

// define the 3-node PCG fragment that will replace a match of the pattern

// a Repartition node that partitions the input into two parts
OperatorAttrAssignment op_ass_n1{
    {{OperatorAttributeKey::OP_TYPE, AttrConstant{Op::REPARTITION}},
     {OperatorAttributeKey::PARALLEL_DIM, AttrConstant{ff_dim_t{0}}},
     {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}};

// a Linear node that copies its attributes from the matched node n0
OperatorAttrAssignment op_ass_n2{
    {{OperatorAttributeKey::OP_TYPE, AttrConstant{Op::LINEAR}},
     {OperatorAttributeKey::OUT_CHANNELS,
      OperatorAttrAccess{n0, OperatorAttributeKey::OUT_CHANNELS}},
     {OperatorAttributeKey::USE_BIAS,
      OperatorAttrAccess{n0, OperatorAttributeKey::USE_BIAS}},
     {OperatorAttributeKey::DATA_TYPE,
      OperatorAttrAccess{n0, OperatorAttributeKey::DATA_TYPE}},
     {OperatorAttributeKey::ACTIVATION,
      OperatorAttrAccess{n0, OperatorAttributeKey::ACTIVATION}},
     {OperatorAttributeKey::REGULARIZER,
      OperatorAttrAccess{n0, OperatorAttributeKey::REGULARIZER}}}};

// a Reduction node that combines the results of the two partitions
OperatorAttrAssignment op_ass_n3{
    {{OperatorAttributeKey::OP_TYPE, AttrConstant{Op::REDUCTION}},
     {OperatorAttributeKey::PARALLEL_DIM, AttrConstant{ff_dim_t{0}}},
     {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}};

// note that these assignments are evaluated into new operators inside
// apply_substitution and inserted into the new PCG

// create the output graph with 3 nodes and 3 edges
auto og = NodeLabelledOpenMultiDiGraph<OperatorAttrAssignment>::create<
    UnorderedNodeLabelledOpenMultiDiGraph<OperatorAttrAssignment>>();
Node n1 = og.add_node(op_ass_n1);
Node n2 = og.add_node(op_ass_n2);
Node n3 = og.add_node(op_ass_n3);
NodePort p1 = og.add_node_port();
NodePort p2 = og.add_node_port();
NodePort p3 = og.add_node_port();

InputMultiDiEdge e1{n1, p1, {p1.value(), p1.value()}};
MultiDiEdge e2{n2, p2, n1, p1};
MultiDiEdge e3{n3, p3, n2, p2};
og.add_edge(e1);
og.add_edge(e2);
og.add_edge(e3);
OutputGraphExpr output_graph_expr{og};

/*
The output graph looks like this (data flows upward, n1 -> n2 -> n3):
n3 (Reduce)
n2 (Linear)
n1 (Partition)
*/
RC_ASSERT(get_nodes(og).size() == 3);
RC_ASSERT(get_edges(og).size() == 3);
```

#### Define substitution
```cpp
// define two bidicts that specify how the input and output edges of the
// pattern map to those of the output graph; output_mapping stays empty here
// because the pattern has no output edges
bidict<InputMultiDiEdge, InputMultiDiEdge> input_mapping;
input_mapping.equate(e0, e1);
bidict<OutputMultiDiEdge, OutputMultiDiEdge> output_mapping;

Substitution substitution{
    input_graph, output_graph_expr, input_mapping, output_mapping};
```

#### Apply substitution
```cpp

// create the target PCG to which we want to apply the substitution
SubParallelComputationGraph pcg =
    OutputLabelledOpenMultiDiGraph<Operator, ParallelTensor>::create<
        UnorderedOutputLabelledOpenMultiDiGraph<Operator,
                                                ParallelTensor>>();

Node n4 = pcg.add_node(Operator{InputAttrs{}, "input"});
Node n5 = pcg.add_node(Operator{
    LinearAttrs{1, false, DataType::FLOAT, Activation::RELU, std::nullopt},
    "linear"});
NodePort p4 = pcg.add_node_port();
NodePort p5 = pcg.add_node_port();

MultiDiEdge e4{n5, p5, n4, p4};
pcg.add_edge(e4);
pcg.add_label(e4,
              ParallelTensor(ParallelTensorDims({2, 1}),
                             DataType::FLOAT,
                             CreateGrad::YES));

/* Our target PCG looks like this
n5 (Linear)
n4 (Input)
*/

// create the criterion functions that test the predefined node and edge
// constraints
MatchAdditionalCriterion criterion{
    [&](Node const &pattern_node, Node const &graph_node) {
      return operator_satisfies(pcg.at(graph_node),
                                input_graph.value().at(pattern_node));
    },
    [&](OpenMultiDiEdge const &pattern_edge,
        OpenMultiDiEdge const &graph_edge) {
      return parallel_tensor_satisfies(
          pcg.at(graph_edge), input_graph.value().at(pattern_edge));
    }};

RC_ASSERT(criterion.node_criterion(n0, n5));

// find the match points where the substitution can be applied in the target
// PCG
std::vector<MultiDiGraphPatternMatch> matches =
    find_pattern_matches(input_graph, pcg, criterion);

// there is only one match point in the PCG we defined
RC_ASSERT(matches.size() == 1);

// apply the substitution; across a search, the number of new PCGs generated
// is bounded by O(2^(sn)), where s is the number of different substitutions
// and n is the number of nodes
SubParallelComputationGraph new_pcg =
    apply_substitution(pcg, substitution, matches[0]);

// the new PCG now looks like this
/*
n3 (Reduce)
n2 (Linear)
n1 (Partition)
n4 (Input)
*/
RC_ASSERT(get_nodes(new_pcg).size() == 4);
RC_ASSERT(get_edges(new_pcg).size() == 3);
```
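
If a pattern matches in several places, each match can be applied independently. A small sketch using the same API as above (names are illustrative) that enumerates every candidate PCG:

```cpp
// apply the substitution at every match point found above
std::vector<SubParallelComputationGraph> candidates;
for (MultiDiGraphPatternMatch const &match : matches) {
  candidates.push_back(apply_substitution(pcg, substitution, match));
}
```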



