[feat CudaRansac] Parallelize final mask computation. Use shared variables instead of global variables #30

Merged · 3 commits · Apr 6, 2024
54 changes: 29 additions & 25 deletions octreelib/ransac/cuda_ransac.py
@@ -56,11 +56,6 @@ def evaluate(
         # create result mask and copy it to the device
         result_mask_cuda = cuda.to_device(np.zeros((len(point_cloud)), dtype=np.bool_))
 
-        # create arrays to store the maximum number of inliers and the best mask indices
-        max_inliers_number_cuda = cuda.to_device(
-            np.zeros(blocks_number, dtype=np.int32)
-        )
-
         # copy point_cloud, block_sizes and block_start_indices to the device
         point_cloud_cuda = cuda.to_device(point_cloud)
         block_sizes_cuda = cuda.to_device(block_sizes)
@@ -71,9 +66,6 @@ def evaluate(
             np.cumsum(np.concatenate(([0], block_sizes[:-1])))
         )
 
-        # this mutex is needed to make sure that only one thread writes to the mask
-        mask_mutex = cuda.to_device(np.zeros(blocks_number, dtype=np.int32))
-
         # call the kernel
         self.__kernel[blocks_number, self.__threads_per_block](
             point_cloud_cuda,
@@ -82,8 +74,6 @@ def evaluate(
             self.__random_hypotheses_cuda,
             self.__threshold,
             result_mask_cuda,
-            max_inliers_number_cuda,
-            mask_mutex,
         )
 
         # copy result mask back to the host
@@ -100,8 +90,6 @@ def kernel(
             random_hypotheses: npt.NDArray,
             threshold: float,
             result_mask: npt.NDArray,
-            max_inliers_number: npt.NDArray,
-            mask_mutex: npt.NDArray,
         ):
             thread_id, block_id = cuda.threadIdx.x, cuda.blockIdx.x
 
@@ -132,22 +120,38 @@ def kernel(
                 if distance < threshold:
                     inliers_number_local += 1
 
-            # replace the maximum number of inliers if the current number is greater
-            cuda.atomic.max(max_inliers_number, block_id, inliers_number_local)
+            # shared memory to store the best plane and the maximum number of inliers
+            # for all hypotheses
+            best_plane = cuda.shared.array(shape=4, dtype=nb.float32)
+            max_inliers_number = cuda.shared.array(shape=1, dtype=nb.int32)
+            # this mutex is needed to make sure that only one thread writes the best plane
+            mutex = cuda.shared.array(shape=1, dtype=nb.int32)
+            if thread_id == 0:
+                max_inliers_number[0] = 0
+                mutex[0] = 0
+            cuda.syncthreads()
 
-            # set the best mask index for this block
+            # replace the maximum number of inliers if the current number is greater
+            cuda.atomic.max(max_inliers_number, 0, inliers_number_local)
+
+            # if this thread has the maximum number of inliers
+            # write this thread's plane to the shared memory
             cuda.syncthreads()
             if (
-                inliers_number_local == max_inliers_number[block_id]
-                and cuda.atomic.cas(mask_mutex, block_id, 0, 1) == 0
+                inliers_number_local == max_inliers_number[0]
+                and cuda.atomic.compare_and_swap(mutex, 0, 1) == 0
             ):
-                for i in range(block_sizes[block_id]):
-                    if (
-                        measure_distance(
-                            plane, point_cloud[block_start_indices[block_id] + i]
-                        )
-                        < threshold
-                    ):
-                        result_mask[block_start_indices[block_id] + i] = True
+                for i in range(4):
+                    best_plane[i] = plane[i]
+            cuda.syncthreads()
+
+            # parallelize final mask computation among threads in the block
+            for i in range(
+                block_start_indices[block_id] + thread_id,
+                block_start_indices[block_id] + block_sizes[block_id],
+                CUDA_THREADS,
+            ):
+                if measure_distance(best_plane, point_cloud[i]) < threshold:
+                    result_mask[i] = True
 
         return kernel
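For readers less familiar with the CUDA side, here is a minimal, self-contained sketch of the pattern the kernel diff above adopts: per-block shared memory holds the best inlier count and the winning plane, cuda.atomic.max reduces the per-thread counts, a compare_and_swap acts as a one-shot mutex so exactly one winning thread publishes its plane, and the final mask loop is strided across the block's threads. This is an illustration only, not the project's code; the kernel name, the single-block layout, point_plane_distance and the sizes in the usage lines are made up for the example.

import numpy as np
import numba as nb
from numba import cuda


@cuda.jit(device=True)
def point_plane_distance(plane, point):
    # |a*x + b*y + c*z + d| for a plane stored as (a, b, c, d)
    return abs(
        plane[0] * point[0] + plane[1] * point[1] + plane[2] * point[2] + plane[3]
    )


@cuda.jit
def best_plane_mask_kernel(points, planes, threshold, result_mask):
    thread_id = cuda.threadIdx.x

    # each thread scores one candidate plane over the whole point set
    plane = planes[thread_id]
    inliers = 0
    for i in range(points.shape[0]):
        if point_plane_distance(plane, points[i]) < threshold:
            inliers += 1

    # shared memory: block-wide best count, winning plane, and a one-shot mutex
    best_plane = cuda.shared.array(shape=4, dtype=nb.float32)
    max_inliers = cuda.shared.array(shape=1, dtype=nb.int32)
    mutex = cuda.shared.array(shape=1, dtype=nb.int32)
    if thread_id == 0:
        max_inliers[0] = 0
        mutex[0] = 0
    cuda.syncthreads()

    # reduce the per-thread inlier counts to the block-wide maximum
    cuda.atomic.max(max_inliers, 0, inliers)
    cuda.syncthreads()

    # exactly one thread holding the maximum publishes its plane
    if inliers == max_inliers[0] and cuda.atomic.compare_and_swap(mutex, 0, 1) == 0:
        for i in range(4):
            best_plane[i] = plane[i]
    cuda.syncthreads()

    # final mask computation, strided across the block:
    # thread t handles points t, t + blockDim.x, t + 2 * blockDim.x, ...
    for i in range(thread_id, points.shape[0], cuda.blockDim.x):
        if point_plane_distance(best_plane, points[i]) < threshold:
            result_mask[i] = True


if __name__ == "__main__":
    # usage sketch: one block, one thread per hypothesis
    n_points, n_hypotheses = 10_000, 64
    points = cuda.to_device(np.random.rand(n_points, 3).astype(np.float32))
    planes = cuda.to_device(np.random.rand(n_hypotheses, 4).astype(np.float32))
    mask = cuda.to_device(np.zeros(n_points, dtype=np.bool_))
    best_plane_mask_kernel[1, n_hypotheses](points, planes, 0.05, mask)
    result = mask.copy_to_host()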
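One host-side detail that the context lines in evaluate() only hint at: the leaf point clouds are flattened into one array, and each block's start offset is derived from the per-block sizes with a cumulative sum, so block b owns the slice [start[b], start[b] + size[b]). A tiny worked example of that computation (the sizes here are made up):

import numpy as np

# made-up point counts for four octree leaves
block_sizes = np.array([5, 3, 7, 2])

# drop the last count, prepend 0, then take the cumulative sum
block_start_indices = np.cumsum(np.concatenate(([0], block_sizes[:-1])))

print(block_start_indices)  # [ 0  5  8 15]
# leaf 0 owns points 0..4, leaf 1 owns 5..7, leaf 2 owns 8..14, leaf 3 owns 15..16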