From af36b99fa3394878795a7314fce1cce994093a0f Mon Sep 17 00:00:00 2001 From: Mikhail Kiselyov Date: Fri, 5 Apr 2024 20:43:38 +0300 Subject: [PATCH 1/3] parallelize final mask computation among threads of a given block --- octreelib/ransac/cuda_ransac.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/octreelib/ransac/cuda_ransac.py b/octreelib/ransac/cuda_ransac.py index f803559..8705870 100644 --- a/octreelib/ransac/cuda_ransac.py +++ b/octreelib/ransac/cuda_ransac.py @@ -135,19 +135,25 @@ def kernel( # replace the maximum number of inliers if the current number is greater cuda.atomic.max(max_inliers_number, block_id, inliers_number_local) cuda.syncthreads() - # set the best mask index for this block + # if this thread has the maximum number of inliers + # write this thread's plane to the shared memory + best_plane = cuda.shared.array(shape=4, dtype=nb.float32) if ( inliers_number_local == max_inliers_number[block_id] and cuda.atomic.cas(mask_mutex, block_id, 0, 1) == 0 ): - for i in range(block_sizes[block_id]): - if ( - measure_distance( - plane, point_cloud[block_start_indices[block_id] + i] - ) - < threshold - ): - result_mask[block_start_indices[block_id] + i] = True + for i in range(4): + best_plane[i] = plane[i] + cuda.syncthreads() + + # parallelize mask computation among threads in the block + for i in range( + block_start_indices[block_id] + thread_id, + block_start_indices[block_id] + block_sizes[block_id], + CUDA_THREADS, + ): + if measure_distance(best_plane, point_cloud[i]) < threshold: + result_mask[i] = True return kernel From 0f094cb69980d60447cfb5c7979ec57bd5db63ac Mon Sep 17 00:00:00 2001 From: Mikhail Kiselyov Date: Fri, 5 Apr 2024 21:17:37 +0300 Subject: [PATCH 2/3] use shared variables for max_inliers_number and mutex --- octreelib/ransac/cuda_ransac.py | 34 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/octreelib/ransac/cuda_ransac.py b/octreelib/ransac/cuda_ransac.py index 8705870..0ebb76c 100644 --- a/octreelib/ransac/cuda_ransac.py +++ b/octreelib/ransac/cuda_ransac.py @@ -56,11 +56,6 @@ def evaluate( # create result mask and copy it to the device result_mask_cuda = cuda.to_device(np.zeros((len(point_cloud)), dtype=np.bool_)) - # create arrays to store the maximum number of inliers and the best mask indices - max_inliers_number_cuda = cuda.to_device( - np.zeros(blocks_number, dtype=np.int32) - ) - # copy point_cloud, block_sizes and block_start_indices to the device point_cloud_cuda = cuda.to_device(point_cloud) block_sizes_cuda = cuda.to_device(block_sizes) @@ -71,9 +66,6 @@ def evaluate( np.cumsum(np.concatenate(([0], block_sizes[:-1]))) ) - # this mutex is needed to make sure that only one thread writes to the mask - mask_mutex = cuda.to_device(np.zeros(blocks_number, dtype=np.int32)) - # call the kernel self.__kernel[blocks_number, self.__threads_per_block]( point_cloud_cuda, @@ -82,8 +74,6 @@ def evaluate( self.__random_hypotheses_cuda, self.__threshold, result_mask_cuda, - max_inliers_number_cuda, - mask_mutex, ) # copy result mask back to the host @@ -100,8 +90,6 @@ def kernel( random_hypotheses: npt.NDArray, threshold: float, result_mask: npt.NDArray, - max_inliers_number: npt.NDArray, - mask_mutex: npt.NDArray, ): thread_id, block_id = cuda.threadIdx.x, cuda.blockIdx.x @@ -132,22 +120,32 @@ def kernel( if distance < threshold: inliers_number_local += 1 - # replace the maximum number of inliers if the current number is greater - cuda.atomic.max(max_inliers_number, block_id, inliers_number_local) + # shared memory to store the best plane and the maximum number of inliers + # for all hypotheses + best_plane = cuda.shared.array(shape=4, dtype=nb.float32) + max_inliers_number = cuda.shared.array(shape=1, dtype=nb.int32) + # this mutex is needed to make sure that only one thread writes to the mask + mutex = cuda.shared.array(shape=1, dtype=nb.int32) + if thread_id == 0: + max_inliers_number[0] = 0 + mutex[0] = 0 cuda.syncthreads() + # replace the maximum number of inliers if the current number is greater + cuda.atomic.max(max_inliers_number, 0, inliers_number_local) + # if this thread has the maximum number of inliers # write this thread's plane to the shared memory - best_plane = cuda.shared.array(shape=4, dtype=nb.float32) + cuda.syncthreads() if ( - inliers_number_local == max_inliers_number[block_id] - and cuda.atomic.cas(mask_mutex, block_id, 0, 1) == 0 + inliers_number_local == max_inliers_number[0] + and cuda.atomic.compare_and_swap(mutex, 0, 1) == 0 ): for i in range(4): best_plane[i] = plane[i] cuda.syncthreads() - # parallelize mask computation among threads in the block + # parallelize final mask computation among threads in the block for i in range( block_start_indices[block_id] + thread_id, block_start_indices[block_id] + block_sizes[block_id], From 99ddc5ae60226b41db1a6674ef9ac1faaa6c642f Mon Sep 17 00:00:00 2001 From: Mikhail Kiselyov Date: Fri, 5 Apr 2024 21:18:36 +0300 Subject: [PATCH 3/3] fix comment --- octreelib/ransac/cuda_ransac.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/octreelib/ransac/cuda_ransac.py b/octreelib/ransac/cuda_ransac.py index 0ebb76c..9293b2f 100644 --- a/octreelib/ransac/cuda_ransac.py +++ b/octreelib/ransac/cuda_ransac.py @@ -124,7 +124,7 @@ def kernel( # for all hypotheses best_plane = cuda.shared.array(shape=4, dtype=nb.float32) max_inliers_number = cuda.shared.array(shape=1, dtype=nb.int32) - # this mutex is needed to make sure that only one thread writes to the mask + # this mutex is needed to make sure that only one thread writes the best plane mutex = cuda.shared.array(shape=1, dtype=nb.int32) if thread_id == 0: max_inliers_number[0] = 0