From ddc928bfd5cbe74847d407cde6e1fd8476bcddb6 Mon Sep 17 00:00:00 2001 From: Mohammed Junaid <88209527+jkottiku@users.noreply.github.com> Date: Mon, 25 Mar 2024 12:21:51 -0700 Subject: [PATCH] Merge pull request #730 from jkottiku/master Port babel to AMD internal HIPStream version --- babel.so/CMakeLists.txt | 6 +- babel.so/include/HIPStream.h | 62 +++- babel.so/include/Stream.h | 17 +- babel.so/src/rvs_memworker.cpp | 4 +- babel.so/src/rvs_stream.cpp | 535 ++++++++++++++++++++++++++------- babel.so/src/rvs_stress.cpp | 10 +- rvs/conf/MI300X/babel.conf | 49 +++ 7 files changed, 538 insertions(+), 145 deletions(-) create mode 100644 rvs/conf/MI300X/babel.conf diff --git a/babel.so/CMakeLists.txt b/babel.so/CMakeLists.txt index f163daec..54a0e3a1 100644 --- a/babel.so/CMakeLists.txt +++ b/babel.so/CMakeLists.txt @@ -61,11 +61,13 @@ set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSIO set(HIP_HCC_BUILD_FLAGS) set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -fPIC ${HCC_CXX_FLAGS} -I${HSA_PATH}/include ${ASAN_CXX_FLAGS}") +set(HIP_STREAM_BUILD_FLAGS "-DNONTEMPORAL=1 -DDWORDS_PER_LANE=4 -DTBSIZE=1024 -DCHUNKS_PER_BLOCK=2 -O3 -std=c++17") + # Set compiler and compiler flags set(CMAKE_CXX_COMPILER "${HIPCC_PATH}/bin/hipcc") set(CMAKE_C_COMPILER "${HIPCC_PATH}/bin/hipcc") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HCC_BUILD_FLAGS}") -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HCC_BUILD_FLAGS} ${HIP_STREAM_BUILD_FLAGS}") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS} ${HIP_STREAM_BUILD_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_LD_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${ASAN_LD_FLAGS}") diff --git a/babel.so/include/HIPStream.h b/babel.so/include/HIPStream.h index 5278aec0..461e4dd6 100644 --- a/babel.so/include/HIPStream.h +++ b/babel.so/include/HIPStream.h @@ -5,23 +5,62 @@ // For full license terms please 
see the LICENSE file distributed with this // source code -#ifndef MEM_SO_INCLUDE_HIP_STREAM_H_ -#define MEM_SO_INCLUDE_HIP_STREAM_H_ +#pragma once +#include #include #include #include #include "Stream.h" +#include "hip/hip_runtime.h" +#ifndef __HIP_PLATFORM_NVCC__ +#include "hip/hip_ext.h" +#endif #define IMPLEMENTATION_STRING "HIP" template class HIPStream : public Stream { +#ifdef __HIP_PLATFORM_NVCC__ + #ifndef DWORDS_PER_LANE + #define DWORDS_PER_LANE 1 + #endif + #ifndef CHUNKS_PER_BLOCK + #define CHUNKS_PER_BLOCK 8 + #endif +#else + #ifndef DWORDS_PER_LANE + #define DWORDS_PER_LANE 4 + #endif + #ifndef CHUNKS_PER_BLOCK + #define CHUNKS_PER_BLOCK 1 + #endif +#endif + // make sure that either: + // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element + // or + // DWORDS_PER_LANE is divisible by sizeof(T) + static_assert((DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || + (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), + "DWORDS_PER_LANE not divisible by sizeof(element_type)"); + + static constexpr unsigned int chunks_per_block{CHUNKS_PER_BLOCK}; + // take into account the datatype size + // that is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements + // and 4 FP32 elements + static constexpr unsigned int elements_per_lane{ + (DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 
1 : ( + DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; protected: // Size of arrays - unsigned int array_size; + const unsigned int array_size; + const unsigned int block_cnt; + const bool evt_timing; + hipEvent_t start_ev; + hipEvent_t stop_ev; + hipEvent_t coherent_ev; // Host array for partial sums for dot kernel T *sums; @@ -30,22 +69,19 @@ class HIPStream : public Stream T *d_a; T *d_b; T *d_c; - T *d_sum; - public: - - HIPStream(const unsigned int, const int); + HIPStream(const unsigned int, const bool, const int); ~HIPStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; + virtual float read() override; + virtual float write() override; + virtual float copy() override; + virtual float add() override; + virtual float mul() override; + virtual float triad() override; virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - }; -#endif diff --git a/babel.so/include/Stream.h b/babel.so/include/Stream.h index c7a8816f..408a80cb 100644 --- a/babel.so/include/Stream.h +++ b/babel.so/include/Stream.h @@ -5,9 +5,7 @@ // For full license terms please see the LICENSE file distributed with this // source code - -#ifndef RVS_INCLUDE_STREAM_H_ -#define RVS_INCLUDE_STREAM_H_ +#pragma once #include #include @@ -27,10 +25,12 @@ class Stream // Kernels // These must be blocking calls - virtual void copy() = 0; - virtual void mul() = 0; - virtual void add() = 0; - virtual void triad() = 0; + virtual float read() = 0; + virtual float write() = 0; + virtual float copy() = 0; + virtual float mul() = 0; + virtual float add() = 0; + virtual float triad() = 0; virtual T dot() = 0; // Copy memory between host and device @@ -44,6 +44,3 @@ class Stream void listDevices(void); std::string getDeviceName(const int); std::string getDeviceDriver(const int); - -#endif - diff --git 
a/babel.so/src/rvs_memworker.cpp b/babel.so/src/rvs_memworker.cpp index f6c1f818..ec2ea21d 100644 --- a/babel.so/src/rvs_memworker.cpp +++ b/babel.so/src/rvs_memworker.cpp @@ -63,8 +63,8 @@ void MemWorker::run() { std::pair device; // log MEM stress test - start message - msg = "[" + action_name + "] " + MODULE_NAME + " " + - std::to_string(gpu_id) + " " + " Starting the Memory stress test "; + msg = "[" + action_name + "] " + "[GPU:: " + + std::to_string(gpu_id) + "] " + "Starting the Babel memory stress test"; rvs::lp::Log(msg, rvs::logresults); /* Device Index */ diff --git a/babel.so/src/rvs_stream.cpp b/babel.so/src/rvs_stream.cpp index 2cefb3e5..5f40fe53 100644 --- a/babel.so/src/rvs_stream.cpp +++ b/babel.so/src/rvs_stream.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// Copyright (c) 2014-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this @@ -8,87 +8,158 @@ #include "include/HIPStream.h" #include "hip/hip_runtime.h" +#include + +#ifndef TBSIZE #define TBSIZE 1024 -#define DOT_NUM_BLOCKS 256 +#endif -void check_error(void) -{ - hipError_t err = hipGetLastError(); - if (err != hipSuccess) - { - std::cerr << "Error: " << hipGetErrorString(err) << std::endl; - exit(err); +#ifdef __HCC__ +__device__ uint32_t grid_size() { + return hc_get_grid_size(0); +} +__device__ uint32_t localid() { + return hc_get_workitem_absolute_id(0); +} +#elif defined(__HIP__) +extern "C" __device__ size_t __ockl_get_global_size(uint); +extern "C" __device__ size_t __ockl_get_global_id(uint); +__device__ uint32_t grid_size() { + return __ockl_get_global_size(0); +} +__device__ uint32_t localid() { + return __ockl_get_global_id(0); +} +#else +__device__ uint32_t grid_size() { + return blockDim.x * gridDim.x; +} +__device__ uint32_t localid() { + return threadIdx.x + blockIdx.x * blockDim.x; +} +#endif + + +template +__device__ __forceinline__ constexpr T 
scalar(const T scalar) { + if constexpr (sizeof(T) == sizeof(float)) { + return static_cast(scalar); + } else { + return static_cast(scalar); } } +#define check_error(status) \ + do { \ + hipError_t err = status; \ + if (err != hipSuccess) { \ + std::cerr << "Error: " << hipGetErrorString(err) << std::endl; \ + exit(err); \ + } \ + } while(0) + +template +static void hipLaunchKernelWithEvents(F kernel, const dim3& numBlocks, + const dim3& dimBlocks, hipStream_t stream, + hipEvent_t startEvent, hipEvent_t stopEvent, + Args... args) +{ + check_error(hipEventRecord(startEvent)); + hipLaunchKernelGGL(kernel, numBlocks, dimBlocks, + 0, stream, args...); + check_error(hipGetLastError()); + check_error(hipEventRecord(stopEvent)); +} + +template +static void hipLaunchKernelSynchronous(F kernel, const dim3& numBlocks, + const dim3& dimBlocks, hipStream_t stream, + hipEvent_t event, Args... args) +{ +#ifdef __HIP_PLATFORM_NVCC__ + hipLaunchKernelGGL(kernel, numBlocks, dimBlocks, + 0, stream, args...); + check_error(hipGetLastError()); + check_error(hipDeviceSynchronize()); +#else + hipLaunchKernelGGL(kernel, numBlocks, dimBlocks, + 0, stream, args...); + check_error(hipGetLastError()); + check_error(hipEventRecord(event)); + check_error(hipEventSynchronize(event)); +#endif +} + template -HIPStream::HIPStream(const unsigned int ARRAY_SIZE, const int device_index) +HIPStream::HIPStream(const unsigned int ARRAY_SIZE, const bool event_timing, + const int device_index) + : array_size{ARRAY_SIZE}, evt_timing(event_timing), + block_cnt(array_size / (TBSIZE * elements_per_lane * chunks_per_block)) { + std::cerr << "elements per lane " << elements_per_lane << std::endl; + std::cerr << "chunks per block " << chunks_per_block << std::endl; - // The array size must be divisible by TBSIZE for kernel launches - if (ARRAY_SIZE % TBSIZE != 0) + // The array size must be divisible by total number of elements + // moved per block for kernel launches + if (ARRAY_SIZE % (TBSIZE * 
elements_per_lane * chunks_per_block) != 0) { std::stringstream ss; - ss << "Array size must be a multiple of " << TBSIZE; + ss << "Array size must be a multiple of elements operated on per block (" << + TBSIZE * elements_per_lane * chunks_per_block << ")."; throw std::runtime_error(ss.str()); } + std::cerr << "block count " << block_cnt << std::endl; + // Set device int count; - hipGetDeviceCount(&count); - check_error(); + check_error(hipGetDeviceCount(&count)); if (device_index >= count) throw std::runtime_error("Invalid device index"); - hipSetDevice(device_index); - check_error(); + check_error(hipSetDevice(device_index)); // Print out device information std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; - array_size = ARRAY_SIZE; - // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + check_error(hipHostMalloc(&sums, sizeof(T) * block_cnt, hipHostMallocNonCoherent)); // Check buffers fit on the device hipDeviceProp_t props; - hipGetDeviceProperties(&props, 0); + check_error(hipGetDeviceProperties(&props, 0)); if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + std::cout << "pciBusID: " << props.pciBusID << std::endl; // Create device buffers - hipMalloc(&d_a, ARRAY_SIZE*sizeof(T)); - check_error(); - hipMalloc(&d_b, ARRAY_SIZE*sizeof(T)); - check_error(); - hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); - check_error(); - hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); - check_error(); + check_error(hipMalloc(&d_a, ARRAY_SIZE * sizeof(T))); + check_error(hipMalloc(&d_b, ARRAY_SIZE * sizeof(T))); + check_error(hipMalloc(&d_c, ARRAY_SIZE * sizeof(T))); + + check_error(hipEventCreate(&start_ev)); + check_error(hipEventCreate(&stop_ev)); + check_error(hipEventCreateWithFlags(&coherent_ev, hipEventReleaseToSystem)); } template 
HIPStream::~HIPStream() { - free(sums); - - hipFree(d_a); - check_error(); - hipFree(d_b); - check_error(); - hipFree(d_c); - check_error(); - hipFree(d_sum); - check_error(); + check_error(hipHostFree(sums)); + check_error(hipFree(d_a)); + check_error(hipFree(d_b)); + check_error(hipFree(d_c)); + check_error(hipEventDestroy(start_ev)); + check_error(hipEventDestroy(stop_ev)); + check_error(hipEventDestroy(coherent_ev)); } template __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const int i = localid(); a[i] = initA; b[i] = initB; c[i] = initC; @@ -97,128 +168,364 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) template void HIPStream::init_arrays(T initA, T initB, T initC) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC); - check_error(); - hipDeviceSynchronize(); - check_error(); + hipLaunchKernelGGL(init_kernel, dim3(array_size/TBSIZE), dim3(TBSIZE), 0, + nullptr, d_a, d_b, d_c, initA, initB, initC); + check_error(hipGetLastError()); + check_error(hipDeviceSynchronize()); } template void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { + check_error(hipDeviceSynchronize()); // Copy device memory to host - hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); + check_error(hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost)); + check_error(hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost)); + check_error(hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost)); } -template -__global__ void copy_kernel(const T * a, T * c) +// turn on non-temporal by default +#ifndef NONTEMPORAL 
+#define NONTEMPORAL 1 +#endif + +#if NONTEMPORAL == 0 +template +__device__ __forceinline__ T load(const T& ref) { + return ref; +} + +template +__device__ __forceinline__ void store(const T& value, T& ref) { + ref = value; +} +#else +template +__device__ __forceinline__ T load(const T& ref) { + return __builtin_nontemporal_load(&ref); +} + +template +__device__ __forceinline__ void store(const T& value, T& ref) { + __builtin_nontemporal_store(value, &ref); +} +#endif + +template +__launch_bounds__(TBSIZE) +__global__ +void read_kernel(const T * __restrict a, T * __restrict c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - c[i] = a[i]; + const auto dx = grid_size() * elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; + + T tmp{0}; + for (auto i = 0u; i != chunks_per_block; ++i) + { + for (auto j = 0u; j != elements_per_lane; ++j) + { + tmp += load(a[gidx + i * dx + j]); + } + } + + // Prevent side-effect free loop from being optimised away. + if (tmp == FLT_MIN) + { + c[gidx] = tmp; + } } template -void HIPStream::copy() +float HIPStream::read() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c); - check_error(); - hipDeviceSynchronize(); - check_error(); + float kernel_time = 0.; + if (evt_timing) + { + hipLaunchKernelWithEvents(read_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, start_ev, + stop_ev, d_a, d_c); + check_error(hipEventSynchronize(stop_ev)); + check_error(hipEventElapsedTime(&kernel_time, start_ev, stop_ev)); + } + else + { + hipLaunchKernelSynchronous(read_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, stop_ev, + d_a, d_c); + } + return kernel_time; } -template -__global__ void mul_kernel(T * b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void write_kernel(T * __restrict c) { - const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - b[i] = scalar * c[i]; + const auto dx = grid_size() * 
elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; + + for (auto i = 0u; i != chunks_per_block; ++i) + { + for (auto j = 0u; j != elements_per_lane; ++j) + { + store(scalar(startC), c[gidx + i * dx + j]); + } + } } template -void HIPStream::mul() +float HIPStream::write() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c); - check_error(); - hipDeviceSynchronize(); - check_error(); + float kernel_time = 0.; + if (evt_timing) + { + hipLaunchKernelWithEvents(write_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, start_ev, + stop_ev, d_c); + check_error(hipEventSynchronize(stop_ev)); + check_error(hipEventElapsedTime(&kernel_time, start_ev, stop_ev)); + } + else + { + hipLaunchKernelSynchronous(write_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, stop_ev, + d_c); + } + return kernel_time; } -template -__global__ void add_kernel(const T * a, const T * b, T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void copy_kernel(const T * __restrict a, T * __restrict c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - c[i] = a[i] + b[i]; + const auto dx = grid_size() * elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; + + for (auto i = 0u; i != chunks_per_block; ++i) + { + for (auto j = 0u; j != elements_per_lane; ++j) + { + store(load(a[gidx + i * dx + j]), c[gidx + i * dx + j]); + } + } } template -void HIPStream::add() +float HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); - check_error(); - hipDeviceSynchronize(); - check_error(); + float kernel_time = 0.; + if (evt_timing) + { + hipLaunchKernelWithEvents(copy_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, start_ev, + stop_ev, d_a, d_c); + check_error(hipEventSynchronize(stop_ev)); + check_error(hipEventElapsedTime(&kernel_time, start_ev, stop_ev)); + } + else + { + hipLaunchKernelSynchronous(copy_kernel, + 
dim3(block_cnt), dim3(TBSIZE), nullptr, stop_ev, + d_a, d_c); + } + return kernel_time; } -template -__global__ void triad_kernel(T * a, const T * b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void mul_kernel(T * __restrict b, const T * __restrict c) { - const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - a[i] = b[i] + scalar * c[i]; + const auto dx = grid_size() * elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; + + for (auto i = 0u; i != chunks_per_block; ++i) + { + for (auto j = 0u; j != elements_per_lane; ++j) + { + store(scalar(startScalar) * load(c[gidx + i * dx + j]), b[gidx + i * dx + j]); + } + } } template -void HIPStream::triad() +float HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); - check_error(); - hipDeviceSynchronize(); - check_error(); + float kernel_time = 0.; + if (evt_timing) + { + hipLaunchKernelWithEvents(mul_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, start_ev, + stop_ev, d_b, d_c); + check_error(hipEventSynchronize(stop_ev)); + check_error(hipEventElapsedTime(&kernel_time, start_ev, stop_ev)); + } + else + { + hipLaunchKernelSynchronous(mul_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, stop_ev, + d_b, d_c); + } + return kernel_time; +} + +template +__launch_bounds__(TBSIZE) +__global__ +void add_kernel(const T * __restrict a, const T * __restrict b, + T * __restrict c) +{ + const auto dx = grid_size() * elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; + + for (auto i = 0u; i != chunks_per_block; ++i) + { + for (auto j = 0u; j != elements_per_lane; ++j) + { + store(load(a[gidx + i * dx + j]) + load(b[gidx + i * dx + j]), c[gidx + i * dx + j]); + } + } } template -__global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array_size) +float HIPStream::add() { - __shared__ T tb_sum[TBSIZE]; + float kernel_time = 0.; + if 
(evt_timing) + { + hipLaunchKernelWithEvents(add_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, start_ev, + stop_ev, d_a, d_b, d_c); + check_error(hipEventSynchronize(stop_ev)); + check_error(hipEventElapsedTime(&kernel_time, start_ev, stop_ev)); + } + else + { + hipLaunchKernelSynchronous(add_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, stop_ev, + d_a, d_b, d_c); + } + return kernel_time; +} - int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - const size_t local_i = hipThreadIdx_x; +template +__launch_bounds__(TBSIZE) +__global__ +void triad_kernel(T * __restrict a, const T * __restrict b, + const T * __restrict c) +{ + const auto dx = grid_size() * elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; - tb_sum[local_i] = 0.0; - for (; i < array_size; i += hipBlockDim_x*hipGridDim_x) - tb_sum[local_i] += a[i] * b[i]; + for (auto i = 0u; i != chunks_per_block; ++i) + { + for (auto j = 0u; j != elements_per_lane; ++j) + { + store(load(b[gidx + i * dx + j]) + scalar(startScalar) * load(c[gidx + i * dx + j]), + a[gidx + i * dx + j]); + } + } +} + +template +float HIPStream::triad() +{ + float kernel_time = 0.; + if (evt_timing) + { + hipLaunchKernelWithEvents(triad_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, start_ev, + stop_ev, d_a, d_b, d_c); + check_error(hipEventSynchronize(stop_ev)); + check_error(hipEventElapsedTime(&kernel_time, start_ev, stop_ev)); + } + else + { + hipLaunchKernelSynchronous(triad_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, stop_ev, + d_a, d_b, d_c); + } + return kernel_time; +} + +template +struct Reducer { + template + __device__ + static + void reduce(I it) noexcept + { + if (n == 1) return; - for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2) +#if defined(__HIP_PLATFORM_NVCC__) + constexpr unsigned int warpSize = 32; +#endif + constexpr bool is_same_warp{n <= warpSize * 2}; + if (static_cast(threadIdx.x) < n / 2) + { + it[threadIdx.x] += it[threadIdx.x + n / 2]; + } + is_same_warp ? 
__threadfence_block() : __syncthreads(); + + Reducer::reduce(it); + } +}; + +template<> +struct Reducer<1u> { + template + __device__ + static + void reduce(I) noexcept + {} +}; + +template +__launch_bounds__(TBSIZE) +__global__ +void dot_kernel(const T * __restrict a, const T * __restrict b, + T * __restrict sum) +{ + const auto dx = grid_size() * elements_per_lane; + const auto gidx = (localid()) * elements_per_lane; + + T tmp{0}; + for (auto i = 0u; i != chunks_per_block; ++i) { - __syncthreads(); - if (local_i < offset) + for (auto j = 0u; j != elements_per_lane; ++j) { - tb_sum[local_i] += tb_sum[local_i+offset]; + tmp += load(a[gidx + i * dx + j]) * load(b[gidx + i * dx + j]); } } - if (local_i == 0) - sum[hipBlockIdx_x] = tb_sum[local_i]; + __shared__ T tb_sum[TBSIZE]; + tb_sum[threadIdx.x] = tmp; + + __syncthreads(); + + Reducer<>::reduce(tb_sum); + + if (threadIdx.x) + { + return; + } + store(tb_sum[0], sum[blockIdx.x]); } template T HIPStream::dot() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size); - check_error(); + hipLaunchKernelSynchronous(dot_kernel, + dim3(block_cnt), dim3(TBSIZE), nullptr, coherent_ev, + d_a, d_b, sums); - hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - - T sum = 0.0; - for (int i = 0; i < DOT_NUM_BLOCKS; i++) + T sum{0}; + for (auto i = 0u; i != block_cnt; ++i) + { sum += sums[i]; + } return sum; } @@ -227,8 +534,7 @@ void listDevices(void) { // Get number of devices int count; - hipGetDeviceCount(&count); - check_error(); + check_error(hipGetDeviceCount(&count)); // Print device names if (count == 0) @@ -251,19 +557,16 @@ void listDevices(void) std::string getDeviceName(const int device) { hipDeviceProp_t props; - hipGetDeviceProperties(&props, device); - check_error(); + check_error(hipGetDeviceProperties(&props, device)); return std::string(props.name); } std::string getDeviceDriver(const int device) { - 
hipSetDevice(device); - check_error(); + check_error(hipSetDevice(device)); int driver; - hipDriverGetVersion(&driver); - check_error(); + check_error(hipDriverGetVersion(&driver)); return std::to_string(driver); } diff --git a/babel.so/src/rvs_stress.cpp b/babel.so/src/rvs_stress.cpp index 9d728154..5e726999 100644 --- a/babel.so/src/rvs_stress.cpp +++ b/babel.so/src/rvs_stress.cpp @@ -26,6 +26,9 @@ std::string csv_separator = ","; static bool triad_only = false; + bool event_timing = false; + + template void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c, T& sum, uint64_t); @@ -111,7 +114,7 @@ void run_stress(std::pair device, int num_times, int ARRAY_SIZE, Stream *stream; // Use the HIP implementation - stream = new HIPStream(ARRAY_SIZE, device.first); + stream = new HIPStream(ARRAY_SIZE, event_timing, device.first); stream->init_arrays(startA, startB, startC); @@ -184,6 +187,7 @@ void run_stress(std::pair device, int num_times, int ARRAY_SIZE, << std::left << std::setw(12) << "Max" << std::left << std::setw(12) << "Average" << std::endl + << "------------------------------------------------------------------------" << std::endl << std::fixed; } @@ -232,6 +236,8 @@ void run_stress(std::pair device, int num_times, int ARRAY_SIZE, << std::endl; } } + std::cout + << "------------------------------------------------------------------------" << std::endl; delete stream; @@ -282,7 +288,7 @@ void run_triad(std::pair device, int num_times, int ARRAY_SIZE, b Stream *stream; // Use the HIP implementation - stream = new HIPStream(ARRAY_SIZE, device.first); + stream = new HIPStream(ARRAY_SIZE, event_timing, device.first); stream->init_arrays(startA, startB, startC); diff --git a/rvs/conf/MI300X/babel.conf b/rvs/conf/MI300X/babel.conf new file mode 100644 index 00000000..22d2c6a4 --- /dev/null +++ b/rvs/conf/MI300X/babel.conf @@ -0,0 +1,49 @@ +# ################################################################################ +# # +# 
# Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. +# # +# # MIT LICENSE: +# # Permission is hereby granted, free of charge, to any person obtaining a copy of +# # this software and associated documentation files (the "Software"), to deal in +# # the Software without restriction, including without limitation the rights to +# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +# # of the Software, and to permit persons to whom the Software is furnished to do +# # so, subject to the following conditions: +# # +# # The above copyright notice and this permission notice shall be included in all +# # copies or substantial portions of the Software. +# # +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# # SOFTWARE. +# # +# ############################################################################### + +# BABEL test +# +# Preconditions: +# Set device to all. 
If you need to run the rvs only on a subset of GPUs, please run rvs with -g +option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify +all the GPUs IDs separated by white space (e.g.: device: 50599 3245) +# Set parallel execution to false +# Set buffer size to reflect the buffer you want to test +# Set run count to 1 (test will run once) +# + +actions: +- name: babel-float-256MiB + device: all + module: babel # Name of the module + parallel: false # Parallel true or false + count: 1 # Number of times you want to repeat the test from the beginning (a clean start every time) + num_iter: 5000 # Number of iterations, this many kernels are launched simultaneously and stress the system + array_size: 268435456 # Buffer size the test operates on, this is 256 MiB + test_type: 1 # type of test, 1: Float, 2: Double, 3: Triad float, 4: Triad double + mibibytes: true # mibibytes (MiB) or megabytes (MB), true for MiB + o/p_csv: false # o/p as csv file + subtest: 5 # 1: copy 2: copy+mul 3: copy+mul+add 4: copy+mul+add+triad 5: copy+mul+add+triad+dot +