Hip Refactor #1359

Merged 59 commits on May 24, 2024 (diff shown from 46 commits)

Commits
d269b40
compiler build
wmdi Oct 16, 2023
f46fd11
Merge branch 'test-substitution' into test-compiler
wmdi Oct 18, 2023
af67e9e
Merge branch 'test-substitution' into test-compiler
wmdi Nov 8, 2023
c015efb
unity dp works
wmdi Nov 15, 2023
6211b84
format
wmdi Nov 15, 2023
d9f1302
Merge remote-tracking branch 'upstream/repo-refactor' into test-compiler
wmdi Jan 24, 2024
fb58a99
fmt
wmdi Jan 24, 2024
02937e1
fix
wmdi Jan 24, 2024
6402ed0
add substitutions, compiler, and their unit tests to CI
wmdi Jan 25, 2024
0c45f61
disable runtime unit test
wmdi Jan 25, 2024
95fa427
minor fix
wmdi Feb 15, 2024
1f7e2b6
(not compilable) visitable issue for OptimalCostState
wmdi Feb 18, 2024
ffa7f79
first try on docs
Bob-Chen222 Feb 23, 2024
a9a6402
fix machine mapping hash & refactor dp algorithm
wmdi Feb 27, 2024
d8bbcb8
minor fix
wmdi Feb 27, 2024
09d3152
fix variant issue
wmdi Feb 28, 2024
a150d3a
fmt
wmdi Feb 28, 2024
2eb3fdf
fix
wmdi Mar 11, 2024
7598a92
fmt
wmdi Mar 11, 2024
05c8336
fix
wmdi Mar 14, 2024
71aeddb
Merge remote-tracking branch 'upstream/repo-refactor' into test-compiler
wmdi Mar 14, 2024
502b41f
Merge branch 'repo-refactor' into repo-refactor
Bob-Chen222 Mar 14, 2024
6962bc8
additional doc
Bob-Chen222 Mar 14, 2024
73d72d2
Merge branch 'repo-refactor' of https://github.com/Bob-Chen222/FlexFl…
Bob-Chen222 Mar 14, 2024
9345400
add more unit tests
wmdi Mar 18, 2024
c0015df
fmt
wmdi Mar 18, 2024
6d28697
Merge remote-tracking branch 'origin/repo-refactor' into compiler
lockshaw Mar 22, 2024
102f5fb
Fix post-merge
lockshaw Mar 22, 2024
d6e10bb
Add shell hook for sapling development
lockshaw Mar 23, 2024
95fb4cc
changed from nullopt to std::nullopt
Mar 23, 2024
c091479
fix cast issue
wmdi Mar 23, 2024
57bd35f
Merge branch 'test-compiler' of github.com:wmdi/FlexFlow into test-co…
wmdi Mar 23, 2024
54c604a
Fix spdlog cmake issue
lockshaw Mar 24, 2024
a09e528
Merge remote-tracking branch 'refs/remotes/wmdi/test-compiler' into c…
lockshaw Mar 24, 2024
8b914cf
Re-remove submodules
lockshaw Mar 24, 2024
189f323
minor fix & fmt
wmdi Mar 24, 2024
d2eb505
upd tests name to match ci
wmdi Mar 24, 2024
371324a
Add TEST_SUITE declaration to make tests findable by ctest
lockshaw Mar 26, 2024
da74817
Remove unnecessary nix files, add utils test to ci
lockshaw Mar 26, 2024
0db60db
Fix utils tests name, format
lockshaw Mar 26, 2024
6e520bb
Merge pull request #1229 from wmdi/test-compiler
wmdi Mar 26, 2024
bf9c0c0
resolved merge conflict
Bob-Chen222 Mar 28, 2024
c21d66e
add tutorial
Bob-Chen222 Apr 1, 2024
817065b
Merge branch 'repo-refactor' into repo-refactor
Bob-Chen222 Apr 4, 2024
fe164cc
Merge branch 'flexflow:repo-refactor' into repo-refactor
Bob-Chen222 Apr 6, 2024
7a9aeaf
align linear and layer_norm
Bob-Chen222 Apr 6, 2024
1369b35
fixed style
Apr 12, 2024
274d497
fixed layer_norm and linear
Apr 12, 2024
6e979e6
Revert "add tutorial"
Apr 12, 2024
080ed6a
Revert "first try on docs"
Bob-Chen222 Apr 12, 2024
03451e6
Revert "additional doc"
Bob-Chen222 Apr 12, 2024
de551ce
Update operator_pattern.h
Bob-Chen222 Apr 12, 2024
2fe8015
"revert substitution"
Bob-Chen222 Apr 12, 2024
4893ad4
align with the newest changes of linear and layer_norm
Bob-Chen222 Apr 15, 2024
943fabf
Merge remote-tracking branch 'upstream/repo-refactor' into bob-hip-re…
Bob-Chen222 Apr 19, 2024
86df8cc
format
Bob-Chen222 Apr 19, 2024
fe2ea11
deleted redundant function
Bob-Chen222 Apr 19, 2024
a304c7e
Merge branch 'repo-refactor' into bob-hip-refactor
reyna-abhyankar May 17, 2024
d18f3ff
Merge branch 'repo-refactor' into bob-hip-refactor
reyna-abhyankar May 24, 2024
1 change: 1 addition & 0 deletions docs/doxygen/Doxyfile
@@ -881,6 +881,7 @@ INPUT += $(FF_HOME)/include
INPUT += $(FF_HOME)/nmt
INPUT += $(FF_HOME)/python
INPUT += $(FF_HOME)/src
INPUT += $(FF_HOME)/lib/substitutions/include

# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
458 changes: 244 additions & 214 deletions lib/kernels/src/hip/layer_norm_kernels.cpp

Large diffs are not rendered by default.

130 changes: 78 additions & 52 deletions lib/kernels/src/hip/linear_kernels.cpp
@@ -19,65 +19,71 @@

namespace FlexFlow {

LinearPerDeviceState::LinearPerDeviceState(FFHandler handler, int batch_size)
    : PerDeviceOpState(handler) {
  // Allocate an all-ones vector
  float *dram_one_ptr = (float *)malloc(sizeof(float) * batch_size);
  for (int i = 0; i < batch_size; i++) {
    dram_one_ptr[i] = 1.0f;
  }
  float *fb_one_ptr;
  checkCUDA(hipMalloc(&fb_one_ptr, sizeof(float) * batch_size));
  checkCUDA(hipMemcpy(fb_one_ptr,
                      dram_one_ptr,
                      sizeof(float) * batch_size,
                      hipMemcpyHostToDevice));
  one_ptr = (float const *)fb_one_ptr;
  // Allocate descriptors
  checkCUDNN(miopenCreateActivationDescriptor(&actiDesc));
  checkCUDNN(miopenCreateTensorDescriptor(&outputTensor));
}

namespace Kernels {
namespace Linear {

bool use_activation(ActiMode mode) {
  switch (mode) {
    case AC_MODE_RELU:
    case AC_MODE_SIGMOID:
    case AC_MODE_TANH:
      return true;
    case AC_MODE_NONE:
      return false;
    default:
      assert(0);
      break;
  }
  return false;
}

void init_kernel(LinearPerDeviceState *m, int batch_size, int channel) {
  if (use_activation(m->activation)) {
    miopenActivationMode_t mode;
    switch (m->activation) {
      case AC_MODE_RELU:
        mode = miopenActivationRELU;
        break;
      case AC_MODE_SIGMOID:
        mode = miopenActivationLOGISTIC;
        break;
      default:
        // Unsupported activation mode
        assert(false);
    }
    checkCUDNN(miopenSetActivationDescriptor(m->actiDesc, mode, 0.0, 0.0, 0.0));
    checkCUDNN(miopenSet4dTensorDescriptor(m->outputTensor,
                                           ff_to_cudnn_datatype(m->output_type),
                                           batch_size,
                                           channel,
                                           1,
                                           1));
  }
}

// TODO: what is float *one_ptr for?
LinearPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                 Allocator allocator,
                                 float *one_ptr,
                                 ActiMode activation,
                                 Regularizer regularizer,
                                 bool use_bias,
                                 DataType input_type,
                                 DataType weight_type,
                                 DataType output_type,
                                 int batch_size,
                                 int channel) {
  ffTensorDescriptor_t outputTensor;
  ffActivationDescriptor_t actiDesc;
  checkCUDNN(miopenCreateTensorDescriptor(&outputTensor));
  checkCUDNN(miopenCreateActivationDescriptor(&actiDesc));
  checkCUDNN(miopenSet4dTensorDescriptor(outputTensor,
                                         ff_to_cudnn_datatype(output_type),
                                         batch_size,
                                         channel,
                                         1,
                                         1));

  miopenActivationMode_t mode;
  switch (activation) {
    case RELU:
      mode = MIOPEN_ACTIVATION_RELU;
      break;
    case SIGMOID:
      mode = MIOPEN_ACTIVATION_SIGMOID;
      break;
    case TANH:
      mode = MIOPEN_ACTIVATION_TANH;
      break;
    case GELU:
      mode = MIOPEN_ACTIVATION_GELU;
      break;
    default:
      // Unsupported activation mode
      assert(false);
  }
  checkCUDNN(miopenSetActivationDescriptor(actiDesc, mode, 0.0, 0.0, 0.0));

  // todo: how to use the allocator to allocate memory for float *one_ptr, and
  // how many bytes to allocate?
  checkCUDA(hipMalloc(&one_ptr, sizeof(float) * batch_size));
  LinearPerDeviceState per_device_state = {handle,
                                           outputTensor,
                                           actiDesc,
                                           one_ptr,
                                           activation,
                                           regularizer,
                                           use_bias,
                                           input_type,
                                           weight_type,
                                           output_type};
  return per_device_state;
}
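
// A possible answer to the allocator TODO above, sketched here as a comment
// (not part of this PR); it assumes Allocator exposes an allocate(size_t)
// method that returns device memory. It builds the same all-ones vector the
// old constructor created with malloc + hipMemcpy:
//
//   one_ptr = static_cast<float *>(allocator.allocate(sizeof(float) * batch_size));
//   std::vector<float> host_ones(batch_size, 1.0f);
//   checkCUDA(hipMemcpy(one_ptr,
//                       host_ones.data(),
//                       sizeof(float) * batch_size,
//                       hipMemcpyHostToDevice));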

void forward_kernel(hipStream_t stream,
@@ -230,7 +236,27 @@ void backward_kernel(hipStream_t stream,
in_dim,
compute_type,
HIPBLAS_GEMM_DEFAULT));

  if (m.kernel_reg_type == REG_MODE_NONE) {
    // do nothing
  } else if (m.kernel_reg_type == REG_MODE_L2) {
    checkCUDA(hipblasSgeam(m.handle.blas,
                           HIPBLAS_OP_N,
                           HIPBLAS_OP_N,
                           in_dim,
                           out_dim,
                           &alpha,
                           (float *)kernel_grad_ptr,
                           in_dim,
                           &(m.kernel_reg_lambda),
                           (float *)kernel_ptr,
                           in_dim,
                           (float *)kernel_grad_ptr,
                           in_dim));
  } else {
    assert(false && "Only L2 regularization is supported");
  }
  // compute bias gradient
  // NOTE: we use alpha = 1 for bias_grad to accumulate gradients
  // (this branch runs only when use_bias == true)
if (bias_grad_ptr != NULL) {
206 changes: 206 additions & 0 deletions lib/substitutions/TUTORIAL.md
@@ -0,0 +1,206 @@
## Tutorial of the substitution lib with a simple example

#### Create a pattern

```c++
// we need to specify both the node pattern and the edge pattern when
// defining a GraphPattern

// first, define an operator pattern; for example, constrain the node to hold
// a Linear operator
OperatorPattern operator_pattern_n0{
    std::vector<OperatorAttributeConstraint>{OperatorAttributeConstraint{
        ConstraintType::EQUAL, OperatorAttributeKey::OP_TYPE, Op::LINEAR}}};

// then define a tensor pattern that restricts the edges of the PCG; for
// example, require that the first dimension (indexed by 0) of a tensor is 2
ParallelTensorPattern tensor_pattern_e0{
    std::vector<TensorAttributeConstraint>{
        TensorAttributeConstraint{ConstraintType::EQUAL,
                                  ListIndexAccess<TensorAttributeKey>{
                                      TensorAttributeKey::DIM_SIZES, 0},
                                  2}}};
/*
remember that both operator_pattern and tensor_pattern hold a std::vector,
meaning that you can define more than one constraint depending on the context
*/
```
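
Since the constraint container is a `std::vector`, a node pattern can stack several constraints. For instance, a sketch reusing only the API shown above (the `USE_BIAS` value type is an assumption):

```cpp
// match only Linear operators that use a bias term
OperatorPattern linear_with_bias{std::vector<OperatorAttributeConstraint>{
    OperatorAttributeConstraint{
        ConstraintType::EQUAL, OperatorAttributeKey::OP_TYPE, Op::LINEAR},
    OperatorAttributeConstraint{
        ConstraintType::EQUAL, OperatorAttributeKey::USE_BIAS, true}}};
```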


#### Pack into GraphPattern
```c++
// create a graph whose node label is OperatorPattern and whose edge label is
// ParallelTensorPattern
auto ig =
    OutputLabelledOpenMultiDiGraph<OperatorPattern, ParallelTensorPattern>::
        create<UnorderedOutputLabelledOpenMultiDiGraph<
            OperatorPattern,
            ParallelTensorPattern>>();
// create a node carrying the operator constraints defined above
Node n0 = ig.add_node(operator_pattern_n0);
// add a node port to distinguish different edges entering the same node
NodePort p0 = ig.add_node_port();
// create an input edge
InputMultiDiEdge e0{n0, p0, std::make_pair(p0.value(), p0.value())};
ig.add_edge(e0);
// attach the edge constraints above to the edge e0
ig.add_label(e0, tensor_pattern_e0);

// the pattern graph is one input edge pointing to a single node
/*
n0 (Linear)
*/
RC_ASSERT(get_nodes(ig).size() == 1);
RC_ASSERT(get_edges(ig).size() == 1);
```
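
Later steps refer to an `input_graph`; presumably the pattern graph `ig` gets wrapped into the `GraphPattern` that the matcher consumes, roughly as follows (a sketch; the constructor shape is assumed):

```cpp
GraphPattern input_graph{ig};
```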

#### Define OutputGraph
```cpp

// define the 3-node PCG fragment that will replace a match of the pattern

// a Repartition node that partitions the input into two parts
OperatorAttrAssignment op_ass_n1{
    {{OperatorAttributeKey::OP_TYPE, AttrConstant{Op::REPARTITION}},
     {OperatorAttributeKey::PARALLEL_DIM, AttrConstant{ff_dim_t{0}}},
     {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}};

// a Linear node that copies its attributes from the matched node n0
OperatorAttrAssignment op_ass_n2{
    {{OperatorAttributeKey::OP_TYPE, AttrConstant{Op::LINEAR}},
     {OperatorAttributeKey::OUT_CHANNELS,
      OperatorAttrAccess{n0, OperatorAttributeKey::OUT_CHANNELS}},
     {OperatorAttributeKey::USE_BIAS,
      OperatorAttrAccess{n0, OperatorAttributeKey::USE_BIAS}},
     {OperatorAttributeKey::DATA_TYPE,
      OperatorAttrAccess{n0, OperatorAttributeKey::DATA_TYPE}},
     {OperatorAttributeKey::ACTIVATION,
      OperatorAttrAccess{n0, OperatorAttributeKey::ACTIVATION}},
     {OperatorAttributeKey::REGULARIZER,
      OperatorAttrAccess{n0, OperatorAttributeKey::REGULARIZER}}}};

// a Reduction node that combines the results of the two partitions
OperatorAttrAssignment op_ass_n3{
    {{OperatorAttributeKey::OP_TYPE, AttrConstant{Op::REDUCTION}},
     {OperatorAttributeKey::PARALLEL_DIM, AttrConstant{ff_dim_t{0}}},
     {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}};

// note that these assignments are evaluated into new operators inside
// apply_substitution and inserted into the new PCG

// create the output graph with 3 nodes and 3 edges
auto og = NodeLabelledOpenMultiDiGraph<OperatorAttrAssignment>::create<
    UnorderedNodeLabelledOpenMultiDiGraph<OperatorAttrAssignment>>();
Node n1 = og.add_node(op_ass_n1);
Node n2 = og.add_node(op_ass_n2);
Node n3 = og.add_node(op_ass_n3);
NodePort p1 = og.add_node_port();
NodePort p2 = og.add_node_port();
NodePort p3 = og.add_node_port();

InputMultiDiEdge e1{n1, p1, {p1.value(), p1.value()}};
MultiDiEdge e2{n2, p2, n1, p1};
MultiDiEdge e3{n3, p3, n2, p2};
og.add_edge(e1);
og.add_edge(e2);
og.add_edge(e3);
OutputGraphExpr output_graph_expr{og};

/*
The output graph looks like this (data flows upward, n1 -> n2 -> n3):
n3 (Reduce)
n2 (Linear)
n1 (Partition)
*/
RC_ASSERT(get_nodes(og).size() == 3);
RC_ASSERT(get_edges(og).size() == 3);
```

#### Define substitution
```cpp
// define two bidicts that specify how the input and output edges of the
// pattern map to those of the output graph; output_mapping stays empty here
// because the pattern has no output edges
bidict<InputMultiDiEdge, InputMultiDiEdge> input_mapping;
input_mapping.equate(e0, e1);
bidict<OutputMultiDiEdge, OutputMultiDiEdge> output_mapping;

Substitution substitution{
    input_graph, output_graph_expr, input_mapping, output_mapping};
```

#### Apply substitution
```cpp

// create the target PCG to which we want to apply the substitution
SubParallelComputationGraph pcg =
    OutputLabelledOpenMultiDiGraph<Operator, ParallelTensor>::create<
        UnorderedOutputLabelledOpenMultiDiGraph<Operator,
                                                ParallelTensor>>();

Node n4 = pcg.add_node(Operator{InputAttrs{}, "input"});
Node n5 = pcg.add_node(Operator{
    LinearAttrs{1, false, DataType::FLOAT, Activation::RELU, std::nullopt},
    "linear"});
NodePort p4 = pcg.add_node_port();
NodePort p5 = pcg.add_node_port();

MultiDiEdge e4{n5, p5, n4, p4};
pcg.add_edge(e4);
pcg.add_label(e4,
              ParallelTensor(ParallelTensorDims({2, 1}),
                             DataType::FLOAT,
                             CreateGrad::YES));

/* Our target PCG looks like this
n5 (Linear)
n4 (Input)
*/

// create the criterion functions that test the predefined node and edge
// constraints
MatchAdditionalCriterion criterion{
    [&](Node const &pattern_node, Node const &graph_node) {
      return operator_satisfies(pcg.at(graph_node),
                                input_graph.value().at(pattern_node));
    },
    [&](OpenMultiDiEdge const &pattern_edge,
        OpenMultiDiEdge const &graph_edge) {
      return parallel_tensor_satisfies(
          pcg.at(graph_edge), input_graph.value().at(pattern_edge));
    }};

RC_ASSERT(criterion.node_criterion(n0, n5));

// find the match points where the substitution can be applied in the target
// PCG
std::vector<MultiDiGraphPatternMatch> matches =
    find_pattern_matches(input_graph, pcg, criterion);

// there is only one match point in the PCG we defined
RC_ASSERT(matches.size() == 1);

// apply the substitution; across a search, the number of new PCGs generated
// is bounded by O(2^(sn)), where s is the number of different substitutions
// and n is the number of nodes
SubParallelComputationGraph new_pcg =
    apply_substitution(pcg, substitution, matches[0]);

// the new PCG now looks like this
/*
n3 (Reduce)
n2 (Linear)
n1 (Partition)
n4 (Input)
*/
RC_ASSERT(get_nodes(new_pcg).size() == 4);
RC_ASSERT(get_edges(new_pcg).size() == 3);
```
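
If a pattern matches in several places, each match can be applied independently. A small sketch using the same API as above (names are illustrative) that enumerates every candidate PCG:

```cpp
// apply the substitution at every match point found above
std::vector<SubParallelComputationGraph> candidates;
for (MultiDiGraphPatternMatch const &match : matches) {
  candidates.push_back(apply_substitution(pcg, substitution, match));
}
```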



