depthwise_conv stream

StanfordAHA · May 27, 2024 · 11b82f7 · 11b82f7
1 parent cbd2648
commit 11b82f7
Show file tree

Hide file tree

Showing 4 changed files with 408 additions and 0 deletions.
diff --git a/apps/hardware_benchmarks/apps/depthwise_conv_stream_fp/Makefile b/apps/hardware_benchmarks/apps/depthwise_conv_stream_fp/Makefile
@@ -0,0 +1,76 @@
+# Usage:
+#  make all:                    compiles all code without running
+#       generator:              create Halide generator
+#       design:                 create cpu design
+#       design-clockwork:       create clockwork design
+#       image:                  create an image with random data
+#       run:                    run cpu design with image
+#       run-clockwork:          run clockwork design with image
+#       compare:                compare two output images
+#       test:                   run and compare to cpu output
+#       eval:                   evaluate runtime
+#       clean:                  remove bin directory
+
+###############################################################################
+# MODIFICATION: NONE
+#------------------------------------------------------------------------------
+# Makefile parameter/variable declarations
+#
+###############################################################################
+
+include ../../hw_support/Makefile.inc
+
+
+
+###############################################################################
+# MODIFICATION: OPTIONAL
+#------------------------------------------------------------------------------
+# App-specific info
+#
+# TESTNAME                      : name of the app
+# USE_CORE_IR_VALID             : whether to generate valid signal for coreir codegen
+# HL_TARGET                     : Halide target
+# RDAI_PLATFORM_RUNTIME         : RDAI platform runtime to use 
+###############################################################################
+
+TESTNAME                        = depthwise_conv
+USE_COREIR_VALID                = 1
+HL_TARGET                       = host-x86-64-enable_ponds-bfloat_hardware
+# HL_TARGET                       = host-x86-64
+RDAI_PLATFORM_RUNTIME           = clockwork_sim
+EXT                             = mat
+
+# Set enviroment variable to set these:
+#  HALIDE_GEN_ARGS="ksize=3 stride=2 k_ic=10 k_oc=3 in_img=64"
+
+# mobilenet layers include:
+#  conv1:   HALIDE_GEN_ARGS="in_img=112 pad=1 ksize=3 stride=1 k_ic=32   k_oc=64"
+#  conv2:   HALIDE_GEN_ARGS="in_img=112 pad=1 ksize=3 stride=2 k_ic=64   k_oc=128"
+#  conv3:   HALIDE_GEN_ARGS="in_img=56  pad=1 ksize=3 stride=1 k_ic=128  k_oc=128"
+#  conv4:   HALIDE_GEN_ARGS="in_img=56  pad=1 ksize=3 stride=2 k_ic=128  k_oc=256"
+#  conv5:   HALIDE_GEN_ARGS="in_img=28  pad=1 ksize=3 stride=1 k_ic=256  k_oc=256"
+#  conv6:   HALIDE_GEN_ARGS="in_img=28  pad=1 ksize=3 stride=2 k_ic=256  k_oc=512"
+#  conv7:   HALIDE_GEN_ARGS="in_img=14  pad=1 ksize=3 stride=1 k_ic=512  k_oc=512"
+#  conv8:   HALIDE_GEN_ARGS="in_img=14  pad=1 ksize=3 stride=2 k_ic=512  k_oc=1024"
+#  conv9:   HALIDE_GEN_ARGS="in_img=7   pad=4 ksize=3 stride=2 k_ic=1024 k_oc=1024"
+
+
+#  conv1:   HALIDE_GEN_ARGS="in_img=112 pad=1 ksize=3 stride=1 k_ic=4 k_oc=8"
+#  conv2:   HALIDE_GEN_ARGS="in_img=112 pad=1 ksize=3 stride=2 k_ic=4 k_oc=8"
+#  conv3:   HALIDE_GEN_ARGS="in_img=56  pad=1 ksize=3 stride=1 k_ic=4 k_oc=16"
+#  conv4:   HALIDE_GEN_ARGS="in_img=56  pad=1 ksize=3 stride=2 k_ic=4 k_oc=8"
+#  conv5:   HALIDE_GEN_ARGS="in_img=28  pad=1 ksize=3 stride=1 k_ic=4 k_oc=8"
+#  conv6:   HALIDE_GEN_ARGS="in_img=28  pad=1 ksize=3 stride=2 k_ic=4 k_oc=8"
+#  conv7:   HALIDE_GEN_ARGS="in_img=14  pad=1 ksize=3 stride=1 k_ic=4 k_oc=8"
+#  conv8:   HALIDE_GEN_ARGS="in_img=14  pad=1 ksize=3 stride=2 k_ic=4 k_oc=8"
+#  conv9:   HALIDE_GEN_ARGS="in_img=7   pad=4 ksize=3 stride=2 k_ic=4 k_oc=8"
+
+
+###############################################################################
+# MODIFICATION : NONE
+#------------------------------------------------------------------------------
+# Include hardwrae build targets
+#
+###############################################################################
+
+include ../../hw_support/hardware_targets.mk
diff --git a/apps/hardware_benchmarks/apps/depthwise_conv_stream_fp/cgra_config.json b/apps/hardware_benchmarks/apps/depthwise_conv_stream_fp/cgra_config.json
@@ -0,0 +1,29 @@
+{
+	"IOs": {
+		"inputs": [{
+				"name": "input",
+				"bitwidth": 16,
+  			"shape": [4, 9, 9],
+				"pixels_per_cycle": 1,
+				"datafile": "bin/input_host_stencil.raw"
+			},
+			{
+				"name": "kernel",
+				"bitwidth": 16,
+				"shape": [4, 3, 3],
+				"pixels_per_cycle": 1,
+				"datafile": "bin/kernel_host_stencil.raw"
+			}
+    ],
+		"output": {
+			"name": "hw_output",
+			"bitwidth": 16,
+	    "shape": [4, 7, 7],
+			"pixels_per_cycle": 1,
+      "datafile": "bin/hw_output.raw"
+		}
+	},
+	"testing": {
+		"total_cycles": 40000
+	}
+}
diff --git a/apps/hardware_benchmarks/apps/depthwise_conv_stream_fp/depthwise_conv_generator.cpp b/apps/hardware_benchmarks/apps/depthwise_conv_stream_fp/depthwise_conv_generator.cpp
@@ -0,0 +1,104 @@
+#include "Halide.h"
+
+namespace {
+
+using namespace Halide;
+using namespace Halide::ConciseCasts;
+
+class DepthwiseConv : public Halide::Generator<DepthwiseConv> {
+public:
+    Input<Buffer<uint16_t>> input{"input", 3};
+    Input<Buffer<uint16_t>> kernel{"kernel", 3};
+    Output<Buffer<uint16_t>> output{"output", 3};
+
+    // in_img determines the input image size
+    GeneratorParam<int> in_img{"in_img", 40};    // default: 114
+
+    // ksize determines the output stencil size
+    GeneratorParam<int> ksize{"ksize", 5};    // default: 3
+
+    // Stride determines the sampling rate for the down sample
+    GeneratorParam<int>  stride{"stride", 2};  // default: 1
+
+    // n_ic determines the number of input channels
+    GeneratorParam<int> n_ic{"n_ic", 24};    // default: 16
+
+    GeneratorParam<int> unroll{"unroll", 6};    // default: 8
+
+
+    void generate() {
+
+        /* THE ALGORITHM */
+        // Define algorithm variables
+        Var x("x"), y("y"), c("c");
+        Func input_host("input_host"), kernel_host("kernel_host");
+        Func hw_output("hw_output");
+
+        // Define hardware input and reduction domain
+        input_host(c, x, y) = cast<bfloat16_t>(input(c, x, y));
+        kernel_host(c, x, y) = cast<bfloat16_t>(kernel(c, x, y));
+        RDom r(0, ksize, 0, ksize);
+
+        // DepthwiseConv Expression
+        Func depthwise_conv;
+        depthwise_conv(c, x, y) = cast<bfloat16_t>(0);
+        depthwise_conv(c, x, y) += cast<bfloat16_t>(kernel_host(c, r.x, r.y) * input_host(c, stride * x + r.x, stride * y + r.y));
+
+        // Send the Output
+        hw_output(c, x, y) = cast<bfloat16_t>(depthwise_conv(c, x, y));
+        output(c, x, y) = cast<uint16_t>(hw_output(c, x, y));
+
+        /* THE SCHEDULE */
+        if (get_target().has_feature(Target::Clockwork)) {
+
+            // Define scheduling variables
+            int out_img = floor((int(in_img) - int(ksize)) / stride) + 1;
+            int gbsize_x = out_img;
+            int gbsize_y = out_img;
+            int gbsize_c = n_ic;
+            int tile_size_x = gbsize_x;
+            int tile_size_y = gbsize_y;
+            Var x_host, x_glb, x_cgra;
+            Var y_host, y_glb, y_cgra;
+            Var c_glb, c_cgra;
+            Var c_accel, c_host;
+
+            // Create output boundaries
+            output.bound(x, 0, out_img);
+            output.bound(y, 0, out_img);
+            output.bound(c, 0, n_ic);
+            // kernel_host.bound(c, 0, n_ic);
+
+            // Tile the image at host level, by default the tile size is the whole output image
+            // Reorder channel dimension to be the innermost to get the clockwork pass
+            hw_output.compute_root();
+            hw_output.tile(x, y, x_host, y_host, x_glb, y_glb, gbsize_x, gbsize_y)
+                .split(c, c_host, c_glb, gbsize_c)
+                .reorder(c_glb, x_glb, y_glb, c_host, x_host, y_host)
+                .hw_accelerate(c_glb, c_host);
+            hw_output.unroll(c_glb, unroll);
+
+            // Fully unroll the reduction domain to create line buffer scheduling
+            // Unroll channel dimension for hardware compute unrolling
+            // .update() is for initialization
+            depthwise_conv.compute_at(hw_output, c_host);
+            depthwise_conv.unroll(c, unroll);
+            depthwise_conv.update().unroll(r.x).unroll(r.y).unroll(c, unroll);
+
+            // Unroll input channels along glb; by default using all GLB tile
+            input_host.in().compute_at(hw_output, c_host);
+            // input_host.in().store_in(MemoryType::GLB);
+            input_host.in().unroll(c, unroll);
+            input_host.compute_root().accelerator_input();
+
+            // Unroll kernel channels along glb; by default using all GLB tiles
+            kernel_host.in().compute_at(hw_output, c_host);
+            kernel_host.in().unroll(c, unroll);
+            kernel_host.compute_root().accelerator_input();
+
+        }
+    }
+};
+}  // namespace
+
+HALIDE_REGISTER_GENERATOR(DepthwiseConv, depthwise_conv)