[feat CudaRansac] Parallelize final mask computation. Use shared variables instead of global variables (#30)

* parallelize final mask computation among threads of a given block

* use shared variables for max_inliers_number and mutex

* fix comment
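The two changes above follow a common per-block reduction pattern in Numba CUDA: every thread scores its own hypothesis, the block keeps the running maximum in a shared-memory cell, and a shared mutex claimed with compare_and_swap guarantees that exactly one of the threads holding the maximum publishes its result. A minimal, self-contained sketch of that pattern is shown below; the kernel, array names, and sizes are illustrative and are not taken from octreelib.

import numpy as np
import numba as nb
from numba import cuda

THREADS_PER_BLOCK = 128


@cuda.jit
def best_score_kernel(scores, best_thread_per_block):
    thread_id, block_id = cuda.threadIdx.x, cuda.blockIdx.x

    # shared variables: best score in this block, index of a winning thread,
    # and a mutex so that only one thread records itself as the winner
    best_score = cuda.shared.array(shape=1, dtype=nb.int32)
    winner = cuda.shared.array(shape=1, dtype=nb.int32)
    mutex = cuda.shared.array(shape=1, dtype=nb.int32)
    if thread_id == 0:
        best_score[0] = 0  # scores are assumed non-negative
        winner[0] = -1
        mutex[0] = 0
    cuda.syncthreads()

    # each thread scores its own candidate (one score per thread here)
    my_score = scores[block_id * THREADS_PER_BLOCK + thread_id]

    # block-wide maximum over all threads' scores
    cuda.atomic.max(best_score, 0, my_score)
    cuda.syncthreads()

    # exactly one thread holding the maximum wins the mutex and publishes itself
    if my_score == best_score[0] and cuda.atomic.compare_and_swap(mutex, 0, 1) == 0:
        winner[0] = thread_id
    cuda.syncthreads()

    if thread_id == 0:
        best_thread_per_block[block_id] = winner[0]


blocks_number = 4
scores = cuda.to_device(
    np.random.randint(0, 1000, blocks_number * THREADS_PER_BLOCK).astype(np.int32)
)
best = cuda.to_device(np.zeros(blocks_number, dtype=np.int32))
best_score_kernel[blocks_number, THREADS_PER_BLOCK](scores, best)
print(best.copy_to_host())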
true-real-michael authored Apr 6, 2024
1 parent 5d4309d commit 37d554a
Showing 1 changed file: octreelib/ransac/cuda_ransac.py (29 additions, 25 deletions)
@@ -56,11 +56,6 @@ def evaluate(
         # create result mask and copy it to the device
         result_mask_cuda = cuda.to_device(np.zeros((len(point_cloud)), dtype=np.bool_))
 
-        # create arrays to store the maximum number of inliers and the best mask indices
-        max_inliers_number_cuda = cuda.to_device(
-            np.zeros(blocks_number, dtype=np.int32)
-        )
-
         # copy point_cloud, block_sizes and block_start_indices to the device
         point_cloud_cuda = cuda.to_device(point_cloud)
         block_sizes_cuda = cuda.to_device(block_sizes)
@@ -71,9 +66,6 @@
             np.cumsum(np.concatenate(([0], block_sizes[:-1])))
         )
 
-        # this mutex is needed to make sure that only one thread writes to the mask
-        mask_mutex = cuda.to_device(np.zeros(blocks_number, dtype=np.int32))
-
         # call the kernel
         self.__kernel[blocks_number, self.__threads_per_block](
             point_cloud_cuda,
@@ -82,8 +74,6 @@
             self.__random_hypotheses_cuda,
             self.__threshold,
             result_mask_cuda,
-            max_inliers_number_cuda,
-            mask_mutex,
         )
 
         # copy result mask back to the host
@@ -100,8 +90,6 @@ def kernel(
             random_hypotheses: npt.NDArray,
             threshold: float,
             result_mask: npt.NDArray,
-            max_inliers_number: npt.NDArray,
-            mask_mutex: npt.NDArray,
         ):
             thread_id, block_id = cuda.threadIdx.x, cuda.blockIdx.x
 
@@ -132,22 +120,38 @@ def kernel(
                 if distance < threshold:
                     inliers_number_local += 1
 
-            # replace the maximum number of inliers if the current number is greater
-            cuda.atomic.max(max_inliers_number, block_id, inliers_number_local)
+            # shared memory to store the best plane and the maximum number of inliers
+            # for all hypotheses
+            best_plane = cuda.shared.array(shape=4, dtype=nb.float32)
+            max_inliers_number = cuda.shared.array(shape=1, dtype=nb.int32)
+            # this mutex is needed to make sure that only one thread writes the best plane
+            mutex = cuda.shared.array(shape=1, dtype=nb.int32)
+            if thread_id == 0:
+                max_inliers_number[0] = 0
+                mutex[0] = 0
+            cuda.syncthreads()
 
-            # set the best mask index for this block
+            # replace the maximum number of inliers if the current number is greater
+            cuda.atomic.max(max_inliers_number, 0, inliers_number_local)
+
+            # if this thread has the maximum number of inliers
+            # write this thread's plane to the shared memory
+            cuda.syncthreads()
             if (
-                inliers_number_local == max_inliers_number[block_id]
-                and cuda.atomic.cas(mask_mutex, block_id, 0, 1) == 0
+                inliers_number_local == max_inliers_number[0]
+                and cuda.atomic.compare_and_swap(mutex, 0, 1) == 0
             ):
-                for i in range(block_sizes[block_id]):
-                    if (
-                        measure_distance(
-                            plane, point_cloud[block_start_indices[block_id] + i]
-                        )
-                        < threshold
-                    ):
-                        result_mask[block_start_indices[block_id] + i] = True
+                for i in range(4):
+                    best_plane[i] = plane[i]
+            cuda.syncthreads()
+
+            # parallelize final mask computation among threads in the block
+            for i in range(
+                block_start_indices[block_id] + thread_id,
+                block_start_indices[block_id] + block_sizes[block_id],
+                CUDA_THREADS,
+            ):
+                if measure_distance(best_plane, point_cloud[i]) < threshold:
+                    result_mask[i] = True
 
         return kernel
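For the "parallelize final mask computation" part, the new kernel uses a block-stride loop: each thread starts at its own offset inside the block's slice of the point cloud and steps by the number of threads per block, so the slice is covered exactly once and no two threads visit the same point. A standalone sketch of the same idea follows; fill_mask_kernel, CUDA_THREADS, and the toy data are illustrative names and values, not octreelib's code.

import numpy as np
from numba import cuda

CUDA_THREADS = 128


@cuda.jit
def fill_mask_kernel(values, block_start_indices, block_sizes, threshold, mask):
    thread_id, block_id = cuda.threadIdx.x, cuda.blockIdx.x
    start = block_start_indices[block_id]
    end = start + block_sizes[block_id]
    # block-stride loop: thread 0 visits start, start + CUDA_THREADS, ...,
    # thread 1 visits start + 1, start + 1 + CUDA_THREADS, ..., and so on
    for i in range(start + thread_id, end, CUDA_THREADS):
        if values[i] < threshold:
            mask[i] = True


values = np.random.rand(1000).astype(np.float32)
block_sizes = np.array([400, 600], dtype=np.int32)
block_start_indices = np.cumsum(np.concatenate(([0], block_sizes[:-1]))).astype(np.int32)
mask_cuda = cuda.to_device(np.zeros(len(values), dtype=np.bool_))
fill_mask_kernel[len(block_sizes), CUDA_THREADS](
    cuda.to_device(values),
    cuda.to_device(block_start_indices),
    cuda.to_device(block_sizes),
    0.5,
    mask_cuda,
)
assert np.array_equal(mask_cuda.copy_to_host(), values < 0.5)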
