From ddc928bfd5cbe74847d407cde6e1fd8476bcddb6 Mon Sep 17 00:00:00 2001 From: Mohammed Junaid <88209527+jkottiku@users.noreply.github.com> Date: Mon, 25 Mar 2024 12:21:51 -0700 Subject: [PATCH] Merge pull request #730 from jkottiku/master Port babel to AMD internal HIPStream version --- babel.so/CMakeLists.txt | 6 +- babel.so/include/HIPStream.h | 62 +++- babel.so/include/Stream.h | 17 +- babel.so/src/rvs_memworker.cpp | 4 +- babel.so/src/rvs_stream.cpp | 535 ++++++++++++++++++++++++++------- babel.so/src/rvs_stress.cpp | 10 +- rvs/conf/MI300X/babel.conf | 49 +++ 7 files changed, 538 insertions(+), 145 deletions(-) create mode 100644 rvs/conf/MI300X/babel.conf diff --git a/babel.so/CMakeLists.txt b/babel.so/CMakeLists.txt index f163daec..54a0e3a1 100644 --- a/babel.so/CMakeLists.txt +++ b/babel.so/CMakeLists.txt @@ -61,11 +61,13 @@ set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSIO set(HIP_HCC_BUILD_FLAGS) set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -fPIC ${HCC_CXX_FLAGS} -I${HSA_PATH}/include ${ASAN_CXX_FLAGS}") +set(HIP_STREAM_BUILD_FLAGS "-DNONTEMPORAL=1 -DDWORDS_PER_LANE=4 -DTBSIZE=1024 -DCHUNKS_PER_BLOCK=2 -O3 -std=c++17") + # Set compiler and compiler flags set(CMAKE_CXX_COMPILER "${HIPCC_PATH}/bin/hipcc") set(CMAKE_C_COMPILER "${HIPCC_PATH}/bin/hipcc") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HCC_BUILD_FLAGS}") -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HCC_BUILD_FLAGS} ${HIP_STREAM_BUILD_FLAGS}") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS} ${HIP_STREAM_BUILD_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_LD_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${ASAN_LD_FLAGS}") diff --git a/babel.so/include/HIPStream.h b/babel.so/include/HIPStream.h index 5278aec0..461e4dd6 100644 --- a/babel.so/include/HIPStream.h +++ b/babel.so/include/HIPStream.h @@ -5,23 +5,62 @@ // For full license terms please 
see the LICENSE file distributed with this // source code -#ifndef MEM_SO_INCLUDE_HIP_STREAM_H_ -#define MEM_SO_INCLUDE_HIP_STREAM_H_ +#pragma once +#include #include #include #include #include "Stream.h" +#include "hip/hip_runtime.h" +#ifndef __HIP_PLATFORM_NVCC__ +#include "hip/hip_ext.h" +#endif #define IMPLEMENTATION_STRING "HIP" template class HIPStream : public Stream { +#ifdef __HIP_PLATFORM_NVCC__ + #ifndef DWORDS_PER_LANE + #define DWORDS_PER_LANE 1 + #endif + #ifndef CHUNKS_PER_BLOCK + #define CHUNKS_PER_BLOCK 8 + #endif +#else + #ifndef DWORDS_PER_LANE + #define DWORDS_PER_LANE 4 + #endif + #ifndef CHUNKS_PER_BLOCK + #define CHUNKS_PER_BLOCK 1 + #endif +#endif + // make sure that either: + // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element + // or + // DWORDS_PER_LANE is divisible by sizeof(T) + static_assert((DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || + (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), + "DWORDS_PER_LANE not divisible by sizeof(element_type)"); + + static constexpr unsigned int chunks_per_block{CHUNKS_PER_BLOCK}; + // take into account the datatype size + // that is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements + // and 4 FP32 elements + static constexpr unsigned int elements_per_lane{ + (DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 
1 : ( + DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; protected: // Size of arrays - unsigned int array_size; + const unsigned int array_size; + const unsigned int block_cnt; + const bool evt_timing; + hipEvent_t start_ev; + hipEvent_t stop_ev; + hipEvent_t coherent_ev; // Host array for partial sums for dot kernel T *sums; @@ -30,22 +69,19 @@ class HIPStream : public Stream T *d_a; T *d_b; T *d_c; - T *d_sum; - public: - - HIPStream(const unsigned int, const int); + HIPStream(const unsigned int, const bool, const int); ~HIPStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; + virtual float read() override; + virtual float write() override; + virtual float copy() override; + virtual float add() override; + virtual float mul() override; + virtual float triad() override; virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - }; -#endif diff --git a/babel.so/include/Stream.h b/babel.so/include/Stream.h index c7a8816f..408a80cb 100644 --- a/babel.so/include/Stream.h +++ b/babel.so/include/Stream.h @@ -5,9 +5,7 @@ // For full license terms please see the LICENSE file distributed with this // source code - -#ifndef RVS_INCLUDE_STREAM_H_ -#define RVS_INCLUDE_STREAM_H_ +#pragma once #include #include @@ -27,10 +25,12 @@ class Stream // Kernels // These must be blocking calls - virtual void copy() = 0; - virtual void mul() = 0; - virtual void add() = 0; - virtual void triad() = 0; + virtual float read() = 0; + virtual float write() = 0; + virtual float copy() = 0; + virtual float mul() = 0; + virtual float add() = 0; + virtual float triad() = 0; virtual T dot() = 0; // Copy memory between host and device @@ -44,6 +44,3 @@ class Stream void listDevices(void); std::string getDeviceName(const int); std::string getDeviceDriver(const int); - -#endif - diff --git 
a/babel.so/src/rvs_memworker.cpp b/babel.so/src/rvs_memworker.cpp index f6c1f818..ec2ea21d 100644 --- a/babel.so/src/rvs_memworker.cpp +++ b/babel.so/src/rvs_memworker.cpp @@ -63,8 +63,8 @@ void MemWorker::run() { std::pair device; // log MEM stress test - start message - msg = "[" + action_name + "] " + MODULE_NAME + " " + - std::to_string(gpu_id) + " " + " Starting the Memory stress test "; + msg = "[" + action_name + "] " + "[GPU:: " + + std::to_string(gpu_id) + "] " + "Starting the Babel memory stress test"; rvs::lp::Log(msg, rvs::logresults); /* Device Index */ diff --git a/babel.so/src/rvs_stream.cpp b/babel.so/src/rvs_stream.cpp index 2cefb3e5..5f40fe53 100644 --- a/babel.so/src/rvs_stream.cpp +++ b/babel.so/src/rvs_stream.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// Copyright (c) 2014-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this @@ -8,87 +8,158 @@ #include "include/HIPStream.h" #include "hip/hip_runtime.h" +#include + +#ifndef TBSIZE #define TBSIZE 1024 -#define DOT_NUM_BLOCKS 256 +#endif -void check_error(void) -{ - hipError_t err = hipGetLastError(); - if (err != hipSuccess) - { - std::cerr << "Error: " << hipGetErrorString(err) << std::endl; - exit(err); +#ifdef __HCC__ +__device__ uint32_t grid_size() { + return hc_get_grid_size(0); +} +__device__ uint32_t localid() { + return hc_get_workitem_absolute_id(0); +} +#elif defined(__HIP__) +extern "C" __device__ size_t __ockl_get_global_size(uint); +extern "C" __device__ size_t __ockl_get_global_id(uint); +__device__ uint32_t grid_size() { + return __ockl_get_global_size(0); +} +__device__ uint32_t localid() { + return __ockl_get_global_id(0); +} +#else +__device__ uint32_t grid_size() { + return blockDim.x * gridDim.x; +} +__device__ uint32_t localid() { + return threadIdx.x + blockIdx.x * blockDim.x; +} +#endif + + +template +__device__ __forceinline__ constexpr T 
scalar(const T scalar) { + if constexpr (sizeof(T) == sizeof(float)) { + return static_cast(scalar); + } else { + return static_cast(scalar); } } +#define check_error(status) \ + do { \ + hipError_t err = status; \ + if (err != hipSuccess) { \ + std::cerr << "Error: " << hipGetErrorString(err) << std::endl; \ + exit(err); \ + } \ + } while(0) + +template +static void hipLaunchKernelWithEvents(F kernel, const dim3& numBlocks, + const dim3& dimBlocks, hipStream_t stream, + hipEvent_t startEvent, hipEvent_t stopEvent, + Args... args) +{ + check_error(hipEventRecord(startEvent)); + hipLaunchKernelGGL(kernel, numBlocks, dimBlocks, + 0, stream, args...); + check_error(hipGetLastError()); + check_error(hipEventRecord(stopEvent)); +} + +template +static void hipLaunchKernelSynchronous(F kernel, const dim3& numBlocks, + const dim3& dimBlocks, hipStream_t stream, + hipEvent_t event, Args... args) +{ +#ifdef __HIP_PLATFORM_NVCC__ + hipLaunchKernelGGL(kernel, numBlocks, dimBlocks, + 0, stream, args...); + check_error(hipGetLastError()); + check_error(hipDeviceSynchronize()); +#else + hipLaunchKernelGGL(kernel, numBlocks, dimBlocks, + 0, stream, args...); + check_error(hipGetLastError()); + check_error(hipEventRecord(event)); + check_error(hipEventSynchronize(event)); +#endif +} + template -HIPStream::HIPStream(const unsigned int ARRAY_SIZE, const int device_index) +HIPStream::HIPStream(const unsigned int ARRAY_SIZE, const bool event_timing, + const int device_index) + : array_size{ARRAY_SIZE}, evt_timing(event_timing), + block_cnt(array_size / (TBSIZE * elements_per_lane * chunks_per_block)) { + std::cerr << "elements per lane " << elements_per_lane << std::endl; + std::cerr << "chunks per block " << chunks_per_block << std::endl; - // The array size must be divisible by TBSIZE for kernel launches - if (ARRAY_SIZE % TBSIZE != 0) + // The array size must be divisible by total number of elements + // moved per block for kernel launches + if (ARRAY_SIZE % (TBSIZE * 
elements_per_lane * chunks_per_block) != 0) { std::stringstream ss; - ss << "Array size must be a multiple of " << TBSIZE; + ss << "Array size must be a multiple of elements operated on per block (" << + TBSIZE * elements_per_lane * chunks_per_block << ")."; throw std::runtime_error(ss.str()); } + std::cerr << "block count " << block_cnt << std::endl; + // Set device int count; - hipGetDeviceCount(&count); - check_error(); + check_error(hipGetDeviceCount(&count)); if (device_index >= count) throw std::runtime_error("Invalid device index"); - hipSetDevice(device_index); - check_error(); + check_error(hipSetDevice(device_index)); // Print out device information std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; - array_size = ARRAY_SIZE; - // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + check_error(hipHostMalloc(&sums, sizeof(T) * block_cnt, hipHostMallocNonCoherent)); // Check buffers fit on the device hipDeviceProp_t props; - hipGetDeviceProperties(&props, 0); + check_error(hipGetDeviceProperties(&props, 0)); if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + std::cout << "pciBusID: " << props.pciBusID << std::endl; // Create device buffers - hipMalloc(&d_a, ARRAY_SIZE*sizeof(T)); - check_error(); - hipMalloc(&d_b, ARRAY_SIZE*sizeof(T)); - check_error(); - hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); - check_error(); - hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); - check_error(); + check_error(hipMalloc(&d_a, ARRAY_SIZE * sizeof(T))); + check_error(hipMalloc(&d_b, ARRAY_SIZE * sizeof(T))); + check_error(hipMalloc(&d_c, ARRAY_SIZE * sizeof(T))); + + check_error(hipEventCreate(&start_ev)); + check_error(hipEventCreate(&stop_ev)); + check_error(hipEventCreateWithFlags(&coherent_ev, hipEventReleaseToSystem)); } template 
HIPStream::~HIPStream() { - free(sums); - - hipFree(d_a); - check_error(); - hipFree(d_b); - check_error(); - hipFree(d_c); - check_error(); - hipFree(d_sum); - check_error(); + check_error(hipHostFree(sums)); + check_error(hipFree(d_a)); + check_error(hipFree(d_b)); + check_error(hipFree(d_c)); + check_error(hipEventDestroy(start_ev)); + check_error(hipEventDestroy(stop_ev)); + check_error(hipEventDestroy(coherent_ev)); } template __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const int i = localid(); a[i] = initA; b[i] = initB; c[i] = initC; @@ -97,128 +168,364 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) template void HIPStream::init_arrays(T initA, T initB, T initC) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC); - check_error(); - hipDeviceSynchronize(); - check_error(); + hipLaunchKernelGGL(init_kernel, dim3(array_size/TBSIZE), dim3(TBSIZE), 0, + nullptr, d_a, d_b, d_c, initA, initB, initC); + check_error(hipGetLastError()); + check_error(hipDeviceSynchronize()); } template void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { + check_error(hipDeviceSynchronize()); // Copy device memory to host - hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); + check_error(hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost)); + check_error(hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost)); + check_error(hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost)); } -template -__global__ void copy_kernel(const T * a, T * c) +// turn on non-temporal by default +#ifndef NONTEMPORAL 
+#define NONTEMPORAL 1 +#endif + +#if NONTEMPORAL == 0 +template +__device__ __forceinline__ T load(const T& ref) { + return ref; +} + +template +__device__ __forceinline__ void store(const T& value, T& ref) { + ref = value; +} +#else +template +__device__ __forceinline__ T load(const T& ref) { + return __builtin_nontemporal_load(&ref); +} + +template +__device__ __forceinline__ void store(const T& value, T& ref) { + __builtin_nontemporal_store(value, &ref); +} +#endif + +template +__launch_bounds__(TBSIZE) +__global__ +void read_kernel(const T * __restrict a, T * __restrict c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - c[i] = a[i]; + const auto dx = grid_size() * elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; + + T tmp{0}; + for (auto i = 0u; i != chunks_per_block; ++i) + { + for (auto j = 0u; j != elements_per_lane; ++j) + { + tmp += load(a[gidx + i * dx + j]); + } + } + + // Prevent side-effect free loop from being optimised away. + if (tmp == FLT_MIN) + { + c[gidx] = tmp; + } } template -void HIPStream::copy() +float HIPStream::read() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c); - check_error(); - hipDeviceSynchronize(); - check_error(); + float kernel_time = 0.; + if (evt_timing) + { + hipLaunchKernelWithEvents(read_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, start_ev, + stop_ev, d_a, d_c); + check_error(hipEventSynchronize(stop_ev)); + check_error(hipEventElapsedTime(&kernel_time, start_ev, stop_ev)); + } + else + { + hipLaunchKernelSynchronous(read_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, stop_ev, + d_a, d_c); + } + return kernel_time; } -template -__global__ void mul_kernel(T * b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void write_kernel(T * __restrict c) { - const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - b[i] = scalar * c[i]; + const auto dx = grid_size() * 
elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; + + for (auto i = 0u; i != chunks_per_block; ++i) + { + for (auto j = 0u; j != elements_per_lane; ++j) + { + store(scalar(startC), c[gidx + i * dx + j]); + } + } } template -void HIPStream::mul() +float HIPStream::write() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c); - check_error(); - hipDeviceSynchronize(); - check_error(); + float kernel_time = 0.; + if (evt_timing) + { + hipLaunchKernelWithEvents(write_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, start_ev, + stop_ev, d_c); + check_error(hipEventSynchronize(stop_ev)); + check_error(hipEventElapsedTime(&kernel_time, start_ev, stop_ev)); + } + else + { + hipLaunchKernelSynchronous(write_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, stop_ev, + d_c); + } + return kernel_time; } -template -__global__ void add_kernel(const T * a, const T * b, T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void copy_kernel(const T * __restrict a, T * __restrict c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - c[i] = a[i] + b[i]; + const auto dx = grid_size() * elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; + + for (auto i = 0u; i != chunks_per_block; ++i) + { + for (auto j = 0u; j != elements_per_lane; ++j) + { + store(load(a[gidx + i * dx + j]), c[gidx + i * dx + j]); + } + } } template -void HIPStream::add() +float HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); - check_error(); - hipDeviceSynchronize(); - check_error(); + float kernel_time = 0.; + if (evt_timing) + { + hipLaunchKernelWithEvents(copy_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, start_ev, + stop_ev, d_a, d_c); + check_error(hipEventSynchronize(stop_ev)); + check_error(hipEventElapsedTime(&kernel_time, start_ev, stop_ev)); + } + else + { + hipLaunchKernelSynchronous(copy_kernel, + 
dim3(block_cnt), dim3(TBSIZE), nullptr, stop_ev, + d_a, d_c); + } + return kernel_time; } -template -__global__ void triad_kernel(T * a, const T * b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void mul_kernel(T * __restrict b, const T * __restrict c) { - const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - a[i] = b[i] + scalar * c[i]; + const auto dx = grid_size() * elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; + + for (auto i = 0u; i != chunks_per_block; ++i) + { + for (auto j = 0u; j != elements_per_lane; ++j) + { + store(scalar(startScalar) * load(c[gidx + i * dx + j]), b[gidx + i * dx + j]); + } + } } template -void HIPStream::triad() +float HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); - check_error(); - hipDeviceSynchronize(); - check_error(); + float kernel_time = 0.; + if (evt_timing) + { + hipLaunchKernelWithEvents(mul_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, start_ev, + stop_ev, d_b, d_c); + check_error(hipEventSynchronize(stop_ev)); + check_error(hipEventElapsedTime(&kernel_time, start_ev, stop_ev)); + } + else + { + hipLaunchKernelSynchronous(mul_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, stop_ev, + d_b, d_c); + } + return kernel_time; +} + +template +__launch_bounds__(TBSIZE) +__global__ +void add_kernel(const T * __restrict a, const T * __restrict b, + T * __restrict c) +{ + const auto dx = grid_size() * elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; + + for (auto i = 0u; i != chunks_per_block; ++i) + { + for (auto j = 0u; j != elements_per_lane; ++j) + { + store(load(a[gidx + i * dx + j]) + load(b[gidx + i * dx + j]), c[gidx + i * dx + j]); + } + } } template -__global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array_size) +float HIPStream::add() { - __shared__ T tb_sum[TBSIZE]; + float kernel_time = 0.; + if 
(evt_timing) + { + hipLaunchKernelWithEvents(add_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, start_ev, + stop_ev, d_a, d_b, d_c); + check_error(hipEventSynchronize(stop_ev)); + check_error(hipEventElapsedTime(&kernel_time, start_ev, stop_ev)); + } + else + { + hipLaunchKernelSynchronous(add_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, stop_ev, + d_a, d_b, d_c); + } + return kernel_time; +} - int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - const size_t local_i = hipThreadIdx_x; +template +__launch_bounds__(TBSIZE) +__global__ +void triad_kernel(T * __restrict a, const T * __restrict b, + const T * __restrict c) +{ + const auto dx = grid_size() * elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; - tb_sum[local_i] = 0.0; - for (; i < array_size; i += hipBlockDim_x*hipGridDim_x) - tb_sum[local_i] += a[i] * b[i]; + for (auto i = 0u; i != chunks_per_block; ++i) + { + for (auto j = 0u; j != elements_per_lane; ++j) + { + store(load(b[gidx + i * dx + j]) + scalar(startScalar) * load(c[gidx + i * dx + j]), + a[gidx + i * dx + j]); + } + } +} + +template +float HIPStream::triad() +{ + float kernel_time = 0.; + if (evt_timing) + { + hipLaunchKernelWithEvents(triad_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, start_ev, + stop_ev, d_a, d_b, d_c); + check_error(hipEventSynchronize(stop_ev)); + check_error(hipEventElapsedTime(&kernel_time, start_ev, stop_ev)); + } + else + { + hipLaunchKernelSynchronous(triad_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, stop_ev, + d_a, d_b, d_c); + } + return kernel_time; +} + +template +struct Reducer { + template + __device__ + static + void reduce(I it) noexcept + { + if (n == 1) return; - for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2) +#if defined(__HIP_PLATFORM_NVCC__) + constexpr unsigned int warpSize = 32; +#endif + constexpr bool is_same_warp{n <= warpSize * 2}; + if (static_cast(threadIdx.x) < n / 2) + { + it[threadIdx.x] += it[threadIdx.x + n / 2]; + } + is_same_warp ? 
__threadfence_block() : __syncthreads(); + + Reducer::reduce(it); + } +}; + +template<> +struct Reducer<1u> { + template + __device__ + static + void reduce(I) noexcept + {} +}; + +template +__launch_bounds__(TBSIZE) +__global__ +void dot_kernel(const T * __restrict a, const T * __restrict b, + T * __restrict sum) +{ + const auto dx = grid_size() * elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; + + T tmp{0}; + for (auto i = 0u; i != chunks_per_block; ++i) { - __syncthreads(); - if (local_i < offset) + for (auto j = 0u; j != elements_per_lane; ++j) { - tb_sum[local_i] += tb_sum[local_i+offset]; + tmp += load(a[gidx + i * dx + j]) * load(b[gidx + i * dx + j]); } } - if (local_i == 0) - sum[hipBlockIdx_x] = tb_sum[local_i]; + __shared__ T tb_sum[TBSIZE]; + tb_sum[threadIdx.x] = tmp; + + __syncthreads(); + + Reducer<>::reduce(tb_sum); + + if (threadIdx.x) + { + return; + } + store(tb_sum[0], sum[blockIdx.x]); } template T HIPStream::dot() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size); - check_error(); + hipLaunchKernelSynchronous(dot_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, coherent_ev, + d_a, d_b, sums); - hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - - T sum = 0.0; - for (int i = 0; i < DOT_NUM_BLOCKS; i++) + T sum{0}; + for (auto i = 0u; i != block_cnt; ++i) + { sum += sums[i]; + } return sum; } @@ -227,8 +534,7 @@ void listDevices(void) { // Get number of devices int count; - hipGetDeviceCount(&count); - check_error(); + check_error(hipGetDeviceCount(&count)); // Print device names if (count == 0) @@ -251,19 +557,16 @@ void listDevices(void) std::string getDeviceName(const int device) { hipDeviceProp_t props; - hipGetDeviceProperties(&props, device); - check_error(); + check_error(hipGetDeviceProperties(&props, device)); return std::string(props.name); } std::string getDeviceDriver(const int device) { - 
hipSetDevice(device); - check_error(); + check_error(hipSetDevice(device)); int driver; - hipDriverGetVersion(&driver); - check_error(); + check_error(hipDriverGetVersion(&driver)); return std::to_string(driver); } diff --git a/babel.so/src/rvs_stress.cpp b/babel.so/src/rvs_stress.cpp index 9d728154..5e726999 100644 --- a/babel.so/src/rvs_stress.cpp +++ b/babel.so/src/rvs_stress.cpp @@ -26,6 +26,9 @@ std::string csv_separator = ","; static bool triad_only = false; + bool event_timing = false; + + template void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c, T& sum, uint64_t); @@ -111,7 +114,7 @@ void run_stress(std::pair device, int num_times, int ARRAY_SIZE, Stream *stream; // Use the HIP implementation - stream = new HIPStream(ARRAY_SIZE, device.first); + stream = new HIPStream(ARRAY_SIZE, event_timing, device.first); stream->init_arrays(startA, startB, startC); @@ -184,6 +187,7 @@ void run_stress(std::pair device, int num_times, int ARRAY_SIZE, << std::left << std::setw(12) << "Max" << std::left << std::setw(12) << "Average" << std::endl + << "------------------------------------------------------------------------" << std::endl << std::fixed; } @@ -232,6 +236,8 @@ void run_stress(std::pair device, int num_times, int ARRAY_SIZE, << std::endl; } } + std::cout + << "------------------------------------------------------------------------" << std::endl; delete stream; @@ -282,7 +288,7 @@ void run_triad(std::pair device, int num_times, int ARRAY_SIZE, b Stream *stream; // Use the HIP implementation - stream = new HIPStream(ARRAY_SIZE, device.first); + stream = new HIPStream(ARRAY_SIZE, event_timing, device.first); stream->init_arrays(startA, startB, startC); diff --git a/rvs/conf/MI300X/babel.conf b/rvs/conf/MI300X/babel.conf new file mode 100644 index 00000000..22d2c6a4 --- /dev/null +++ b/rvs/conf/MI300X/babel.conf @@ -0,0 +1,49 @@ +# ################################################################################ +# # +# 
# Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. +# # +# # MIT LICENSE: +# # Permission is hereby granted, free of charge, to any person obtaining a copy of +# # this software and associated documentation files (the "Software"), to deal in +# # the Software without restriction, including without limitation the rights to +# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +# # of the Software, and to permit persons to whom the Software is furnished to do +# # so, subject to the following conditions: +# # +# # The above copyright notice and this permission notice shall be included in all +# # copies or substantial portions of the Software. +# # +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# # SOFTWARE. +# # +# ############################################################################### + +# BABEL test +# +# Preconditions: +# Set device to all. 
If you need to run the rvs only on a subset of GPUs, please run rvs with -g +option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify +all the GPUs IDs separated by white space (e.g.: device: 50599 3245) +# Set parallel execution to false +# Set buffer size to reflect the buffer you want to test +# Set run count to 1 (test will run once) +# + +actions: +- name: babel-float-256MiB + device: all + module: babel # Name of the module + parallel: false # Parallel true or false + count: 1 # Number of times you want to repeat the test from the beginning (a clean start every time) + num_iter: 5000 # Number of iterations, this many kernels are launched simultaneously and stress the system + array_size: 268435456 # Buffer size the test operates on, this is 256 MiB + test_type: 1 # type of test, 1: Float, 2: Double, 3: Triad float, 4: Triad double + mibibytes: true # mibibytes (MiB) or megabytes (MB), true for MiB + o/p_csv: false # o/p as csv file + subtest: 5 # 1: copy 2: copy+mul 3: copy+mul+add 4: copy+mul+add+triad 5: copy+mul+add+triad+dot +