-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
cbd2648
commit 11b82f7
Showing
4 changed files
with
408 additions
and
0 deletions.
There are no files selected for viewing
76 changes: 76 additions & 0 deletions
76
apps/hardware_benchmarks/apps/depthwise_conv_stream_fp/Makefile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
# Usage: | ||
# make all: compiles all code without running | ||
# generator: create Halide generator | ||
# design: create cpu design | ||
# design-clockwork: create clockwork design | ||
# image: create an image with random data | ||
# run: run cpu design with image | ||
# run-clockwork: run clockwork design with image | ||
# compare: compare two output images | ||
# test: run and compare to cpu output | ||
# eval: evaluate runtime | ||
# clean: remove bin directory | ||
|
||
############################################################################### | ||
# MODIFICATION: NONE | ||
#------------------------------------------------------------------------------ | ||
# Makefile parameter/variable declarations | ||
# | ||
############################################################################### | ||
|
||
include ../../hw_support/Makefile.inc | ||
|
||
|
||
|
||
############################################################################### | ||
# MODIFICATION: OPTIONAL | ||
#------------------------------------------------------------------------------ | ||
# App-specific info | ||
# | ||
# TESTNAME : name of the app | ||
# USE_COREIR_VALID : whether to generate valid signal for coreir codegen | ||
# HL_TARGET : Halide target | ||
# RDAI_PLATFORM_RUNTIME : RDAI platform runtime to use | ||
############################################################################### | ||
|
||
TESTNAME = depthwise_conv | ||
USE_COREIR_VALID = 1 | ||
HL_TARGET = host-x86-64-enable_ponds-bfloat_hardware | ||
# HL_TARGET = host-x86-64 | ||
RDAI_PLATFORM_RUNTIME = clockwork_sim | ||
EXT = mat | ||
|
||
# Set an environment variable to set these: | ||
# HALIDE_GEN_ARGS="ksize=3 stride=2 k_ic=10 k_oc=3 in_img=64" | ||
|
||
# mobilenet layers include: | ||
# conv1: HALIDE_GEN_ARGS="in_img=112 pad=1 ksize=3 stride=1 k_ic=32 k_oc=64" | ||
# conv2: HALIDE_GEN_ARGS="in_img=112 pad=1 ksize=3 stride=2 k_ic=64 k_oc=128" | ||
# conv3: HALIDE_GEN_ARGS="in_img=56 pad=1 ksize=3 stride=1 k_ic=128 k_oc=128" | ||
# conv4: HALIDE_GEN_ARGS="in_img=56 pad=1 ksize=3 stride=2 k_ic=128 k_oc=256" | ||
# conv5: HALIDE_GEN_ARGS="in_img=28 pad=1 ksize=3 stride=1 k_ic=256 k_oc=256" | ||
# conv6: HALIDE_GEN_ARGS="in_img=28 pad=1 ksize=3 stride=2 k_ic=256 k_oc=512" | ||
# conv7: HALIDE_GEN_ARGS="in_img=14 pad=1 ksize=3 stride=1 k_ic=512 k_oc=512" | ||
# conv8: HALIDE_GEN_ARGS="in_img=14 pad=1 ksize=3 stride=2 k_ic=512 k_oc=1024" | ||
# conv9: HALIDE_GEN_ARGS="in_img=7 pad=4 ksize=3 stride=2 k_ic=1024 k_oc=1024" | ||
|
||
|
||
# conv1: HALIDE_GEN_ARGS="in_img=112 pad=1 ksize=3 stride=1 k_ic=4 k_oc=8" | ||
# conv2: HALIDE_GEN_ARGS="in_img=112 pad=1 ksize=3 stride=2 k_ic=4 k_oc=8" | ||
# conv3: HALIDE_GEN_ARGS="in_img=56 pad=1 ksize=3 stride=1 k_ic=4 k_oc=16" | ||
# conv4: HALIDE_GEN_ARGS="in_img=56 pad=1 ksize=3 stride=2 k_ic=4 k_oc=8" | ||
# conv5: HALIDE_GEN_ARGS="in_img=28 pad=1 ksize=3 stride=1 k_ic=4 k_oc=8" | ||
# conv6: HALIDE_GEN_ARGS="in_img=28 pad=1 ksize=3 stride=2 k_ic=4 k_oc=8" | ||
# conv7: HALIDE_GEN_ARGS="in_img=14 pad=1 ksize=3 stride=1 k_ic=4 k_oc=8" | ||
# conv8: HALIDE_GEN_ARGS="in_img=14 pad=1 ksize=3 stride=2 k_ic=4 k_oc=8" | ||
# conv9: HALIDE_GEN_ARGS="in_img=7 pad=4 ksize=3 stride=2 k_ic=4 k_oc=8" | ||
|
||
|
||
############################################################################### | ||
# MODIFICATION : NONE | ||
#------------------------------------------------------------------------------ | ||
# Include hardware build targets | ||
# | ||
############################################################################### | ||
|
||
include ../../hw_support/hardware_targets.mk |
29 changes: 29 additions & 0 deletions
29
apps/hardware_benchmarks/apps/depthwise_conv_stream_fp/cgra_config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
{ | ||
"IOs": { | ||
"inputs": [{ | ||
"name": "input", | ||
"bitwidth": 16, | ||
"shape": [4, 9, 9], | ||
"pixels_per_cycle": 1, | ||
"datafile": "bin/input_host_stencil.raw" | ||
}, | ||
{ | ||
"name": "kernel", | ||
"bitwidth": 16, | ||
"shape": [4, 3, 3], | ||
"pixels_per_cycle": 1, | ||
"datafile": "bin/kernel_host_stencil.raw" | ||
} | ||
], | ||
"output": { | ||
"name": "hw_output", | ||
"bitwidth": 16, | ||
"shape": [4, 7, 7], | ||
"pixels_per_cycle": 1, | ||
"datafile": "bin/hw_output.raw" | ||
} | ||
}, | ||
"testing": { | ||
"total_cycles": 40000 | ||
} | ||
} |
104 changes: 104 additions & 0 deletions
104
apps/hardware_benchmarks/apps/depthwise_conv_stream_fp/depthwise_conv_generator.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
#include "Halide.h" | ||
|
||
namespace { | ||
|
||
using namespace Halide; | ||
using namespace Halide::ConciseCasts; | ||
|
||
// Halide generator for a depthwise (per-channel) convolution. | ||
// For each channel c, output(c, x, y) accumulates | ||
// kernel(c, r.x, r.y) * input(c, stride * x + r.x, stride * y + r.y) | ||
// over a ksize x ksize reduction domain. Buffers are uint16_t at the | ||
// interface and are cast to bfloat16_t for the arithmetic. | ||
class DepthwiseConv : public Halide::Generator<DepthwiseConv> { | ||
public: | ||
// 3-D buffers; generate() indexes all of them as (c, x, y). | ||
Input<Buffer<uint16_t>> input{"input", 3}; | ||
Input<Buffer<uint16_t>> kernel{"kernel", 3}; | ||
Output<Buffer<uint16_t>> output{"output", 3}; | ||
|
||
// NOTE(review): the trailing "default" comments below previously read | ||
// 114 / 3 / 1 / 16 / 8, which did not match the initializers. They now | ||
// state the values actually coded; confirm which set is intended. | ||
// in_img determines the input image size (used to derive the output extent) | ||
GeneratorParam<int> in_img{"in_img", 40}; // default: 40 | ||
|
||
// ksize is the extent of the square reduction domain (kernel width and height) | ||
GeneratorParam<int> ksize{"ksize", 5}; // default: 5 | ||
|
||
// stride is the step between output samples: the input is read at stride * x + r.x | ||
GeneratorParam<int> stride{"stride", 2}; // default: 2 | ||
|
||
// n_ic determines the number of input channels (bound and split factor for c) | ||
GeneratorParam<int> n_ic{"n_ic", 24}; // default: 24 | ||
|
||
// unroll is the factor by which the channel dimension is unrolled in the schedule | ||
GeneratorParam<int> unroll{"unroll", 6}; // default: 6 | ||
|
||
|
||
// Defines the algorithm and, for Clockwork targets only, its schedule. | ||
void generate() { | ||
|
||
/* THE ALGORITHM */ | ||
// Define algorithm variables | ||
Var x("x"), y("y"), c("c"); | ||
Func input_host("input_host"), kernel_host("kernel_host"); | ||
Func hw_output("hw_output"); | ||
|
||
// Define hardware input and reduction domain | ||
input_host(c, x, y) = cast<bfloat16_t>(input(c, x, y)); | ||
kernel_host(c, x, y) = cast<bfloat16_t>(kernel(c, x, y)); | ||
RDom r(0, ksize, 0, ksize); | ||
|
||
// DepthwiseConv Expression | ||
// The pure definition zero-initializes; the update accumulates over r. | ||
Func depthwise_conv; | ||
depthwise_conv(c, x, y) = cast<bfloat16_t>(0); | ||
depthwise_conv(c, x, y) += cast<bfloat16_t>(kernel_host(c, r.x, r.y) * input_host(c, stride * x + r.x, stride * y + r.y)); | ||
|
||
// Send the Output | ||
hw_output(c, x, y) = cast<bfloat16_t>(depthwise_conv(c, x, y)); | ||
output(c, x, y) = cast<uint16_t>(hw_output(c, x, y)); | ||
|
||
/* THE SCHEDULE */ | ||
// No schedule is applied unless the target has the Clockwork feature. | ||
if (get_target().has_feature(Target::Clockwork)) { | ||
|
||
// Define scheduling variables | ||
// out_img is the output extent per spatial dimension: (in_img - ksize) / stride + 1. | ||
// NOTE(review): both division operands look integral, so floor() appears to be a no-op — confirm. | ||
int out_img = floor((int(in_img) - int(ksize)) / stride) + 1; | ||
int gbsize_x = out_img; | ||
int gbsize_y = out_img; | ||
int gbsize_c = n_ic; | ||
int tile_size_x = gbsize_x; | ||
int tile_size_y = gbsize_y; | ||
Var x_host, x_glb, x_cgra; | ||
Var y_host, y_glb, y_cgra; | ||
Var c_glb, c_cgra; | ||
Var c_accel, c_host; | ||
|
||
// Create output boundaries | ||
output.bound(x, 0, out_img); | ||
output.bound(y, 0, out_img); | ||
output.bound(c, 0, n_ic); | ||
// kernel_host.bound(c, 0, n_ic); | ||
|
||
// Tile the image at host level, by default the tile size is the whole output image | ||
// Reorder channel dimension to be the innermost to get the clockwork pass | ||
hw_output.compute_root(); | ||
hw_output.tile(x, y, x_host, y_host, x_glb, y_glb, gbsize_x, gbsize_y) | ||
.split(c, c_host, c_glb, gbsize_c) | ||
.reorder(c_glb, x_glb, y_glb, c_host, x_host, y_host) | ||
.hw_accelerate(c_glb, c_host); | ||
hw_output.unroll(c_glb, unroll); | ||
|
||
// Fully unroll the reduction domain to create line buffer scheduling | ||
// Unroll channel dimension for hardware compute unrolling | ||
// The first unroll schedules the pure (zero-initialization) definition; | ||
// .update() schedules the accumulation (+=) definition. | ||
depthwise_conv.compute_at(hw_output, c_host); | ||
depthwise_conv.unroll(c, unroll); | ||
depthwise_conv.update().unroll(r.x).unroll(r.y).unroll(c, unroll); | ||
|
||
// Unroll input channels along glb; by default using all GLB tiles | ||
input_host.in().compute_at(hw_output, c_host); | ||
// input_host.in().store_in(MemoryType::GLB); | ||
input_host.in().unroll(c, unroll); | ||
input_host.compute_root().accelerator_input(); | ||
|
||
// Unroll kernel channels along glb; by default using all GLB tiles | ||
kernel_host.in().compute_at(hw_output, c_host); | ||
kernel_host.in().unroll(c, unroll); | ||
kernel_host.compute_root().accelerator_input(); | ||
|
||
} | ||
} | ||
}; | ||
} // namespace | ||
|
||
// Register the DepthwiseConv generator class under the registry name depthwise_conv. | ||
HALIDE_REGISTER_GENERATOR(DepthwiseConv, depthwise_conv) |
Oops, something went wrong.