Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feat CudaRansac] Do not use CUDA RNGs. Make number of initial points configurable #28

Merged
merged 2 commits into from
Apr 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 4 additions & 9 deletions octreelib/grid/grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,14 @@ def map_leaf_points_cuda_ransac(
poses_per_batch: int = 10,
threshold: float = 0.01,
hypotheses_number: int = 1024,
initial_points_number: int = 6,
):
"""
transform point cloud in the node using the function
:param poses_per_batch: Number of poses per batch.
:param threshold: Distance threshold.
:param hypotheses_number: Number of RANSAC iterations (<= 1024).
:param initial_points_number: Number of initial points to use in RANSAC.
"""
if threshold <= 0:
raise ValueError("Threshold must be positive")
Expand All @@ -154,18 +156,11 @@ def map_leaf_points_cuda_ransac(
for i in range(0, len(self.__pose_voxel_coordinates), poses_per_batch)
]

# find the maximum number of leaf voxels across all batches
# this is needed to initialize the random number generators on the GPU
max_leaf_voxels = max(
[
sum([self.n_leaves(pose_number) for pose_number in batch_pose_numbers])
for batch_pose_numbers in batches
]
)
ransac = CudaRansac(
max_blocks_number=max_leaf_voxels,
hypotheses_number=hypotheses_number,
threshold=threshold,
hypotheses_number=hypotheses_number,
initial_points_number=initial_points_number,
)

# process each batch
Expand Down
129 changes: 62 additions & 67 deletions octreelib/ransac/cuda_ransac.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,11 @@
import numpy.typing as npt
import numba as nb
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states

from octreelib.internal import PointCloud
from octreelib.ransac.util import (
generate_random_indices,
get_plane_from_points,
measure_distance,
INITIAL_POINTS_NUMBER,
)

__all__ = ["CudaRansac"]
Expand All @@ -25,22 +22,22 @@ class CudaRansac:

def __init__(
self,
max_blocks_number: int,
threshold: float = 0.01,
hypotheses_number: int = CUDA_THREADS,
initial_points_number: int = 6,
):
"""
Initialize the RANSAC parameters.
:param threshold: Distance threshold.
:param hypotheses_number: Number of RANSAC hypotheses. (<= 1024)
:param max_blocks_number: Maximum number of blocks.
:param initial_points_number: Number of initial points to use in RANSAC.
"""

self.__threshold: float = threshold
self.__threads_per_block = min(hypotheses_number, CUDA_THREADS)
# create random number generator states
self.__rng_states = create_xoroshiro128p_states(
self.__threads_per_block * max_blocks_number, seed=0
self.__kernel = self.__get_kernel(initial_points_number)
self.__random_hypotheses_cuda = cuda.to_device(
np.random.random((self.__threads_per_block, initial_points_number))
)

def evaluate(
Expand Down Expand Up @@ -78,12 +75,12 @@ def evaluate(
mask_mutex = cuda.to_device(np.zeros(blocks_number, dtype=np.int32))

# call the kernel
self.__ransac_kernel[blocks_number, self.__threads_per_block](
self.__kernel[blocks_number, self.__threads_per_block](
point_cloud_cuda,
block_sizes_cuda,
block_start_indices_cuda,
self.__random_hypotheses_cuda,
self.__threshold,
self.__rng_states,
result_mask_cuda,
max_inliers_number_cuda,
mask_mutex,
Expand All @@ -94,65 +91,63 @@ def evaluate(
return result_mask

@staticmethod
@cuda.jit
def __ransac_kernel(
point_cloud: PointCloud,
block_sizes: npt.NDArray,
block_start_indices: npt.NDArray,
threshold: float,
rng_states,
result_mask: npt.NDArray,
max_inliers_number: npt.NDArray,
mask_mutex: npt.NDArray,
):
thread_id, block_id = cuda.threadIdx.x, cuda.blockIdx.x
def __get_kernel(initial_points_number):
@cuda.jit
def kernel(
point_cloud: PointCloud,
block_sizes: npt.NDArray,
block_start_indices: npt.NDArray,
random_hypotheses: npt.NDArray,
threshold: float,
result_mask: npt.NDArray,
max_inliers_number: npt.NDArray,
mask_mutex: npt.NDArray,
):
thread_id, block_id = cuda.threadIdx.x, cuda.blockIdx.x

if block_sizes[block_id] < INITIAL_POINTS_NUMBER:
return
if block_sizes[block_id] < initial_points_number:
return

# select random points as inliers
initial_point_indices = cuda.local.array(
shape=INITIAL_POINTS_NUMBER, dtype=nb.size_t
)
initial_point_indices = generate_random_indices(
initial_point_indices,
rng_states,
block_sizes[block_id],
INITIAL_POINTS_NUMBER,
)
for i in range(INITIAL_POINTS_NUMBER):
initial_point_indices[i] = (
block_start_indices[block_id] + initial_point_indices[i]
# select random points as inliers
initial_point_indices = cuda.local.array(
shape=initial_points_number, dtype=nb.size_t
)
for i in range(initial_points_number):
initial_point_indices[i] = nb.int32(
random_hypotheses[thread_id][i] * block_sizes[block_id]
+ block_start_indices[block_id]
)

# calculate the plane coefficients
plane = cuda.local.array(shape=4, dtype=nb.float32)
plane[0], plane[1], plane[2], plane[3] = get_plane_from_points(
point_cloud, initial_point_indices
)

# calculate the plane coefficients
plane = cuda.local.array(shape=4, dtype=nb.float32)
plane[0], plane[1], plane[2], plane[3] = get_plane_from_points(
point_cloud, initial_point_indices
)

# for each point in the block check if it is an inlier
inliers_number_local = 0
for i in range(block_sizes[block_id]):
point = point_cloud[block_start_indices[block_id] + i]
distance = measure_distance(plane, point)
if distance < threshold:
inliers_number_local += 1

# replace the maximum number of inliers if the current number is greater
cuda.atomic.max(max_inliers_number, block_id, inliers_number_local)
cuda.syncthreads()
# set the best mask index for this block
# if this thread has the maximum number of inliers
if (
inliers_number_local == max_inliers_number[block_id]
and cuda.atomic.cas(mask_mutex, block_id, 0, 1) == 0
):
# for each point in the block check if it is an inlier
inliers_number_local = 0
for i in range(block_sizes[block_id]):
if (
measure_distance(
plane, point_cloud[block_start_indices[block_id] + i]
)
< threshold
):
result_mask[block_start_indices[block_id] + i] = True
point = point_cloud[block_start_indices[block_id] + i]
distance = measure_distance(plane, point)
if distance < threshold:
inliers_number_local += 1

# replace the maximum number of inliers if the current number is greater
cuda.atomic.max(max_inliers_number, block_id, inliers_number_local)
cuda.syncthreads()
# set the best mask index for this block
# if this thread has the maximum number of inliers
if (
inliers_number_local == max_inliers_number[block_id]
and cuda.atomic.cas(mask_mutex, block_id, 0, 1) == 0
):
for i in range(block_sizes[block_id]):
if (
measure_distance(
plane, point_cloud[block_start_indices[block_id] + i]
)
< threshold
):
result_mask[block_start_indices[block_id] + i] = True

return kernel
43 changes: 3 additions & 40 deletions octreelib/ransac/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,11 @@
These functions are auxiliary functions for the RANSAC algorithm.
They cannot be defined inside the `CudaRansac` class because
`CudaRansac.__ransac_kernel` would not be able to access them.
This file also contains the `INITIAL_POINTS_NUMBER` constant which
can be used to configure the number of initial points to be used in the RANSAC algorithm.
"""

import math
import numba as nb

from numba import cuda
from numba.cuda.random import xoroshiro128p_uniform_float32

# This constant configures the number of initial points to be used in the RANSAC algorithm.
INITIAL_POINTS_NUMBER = 6


@cuda.jit(
Expand All @@ -34,36 +27,6 @@ def measure_distance(plane, point):
)


@cuda.jit(device=True, inline=True)
def generate_random_int(rng_states, lower_bound, upper_bound):
"""
Generate a random number between a and b.
:param rng_states: Random number generator states.
:param lower_bound: Lower bound.
:param upper_bound: Upper bound.
"""
thread_id = cuda.grid(1)
x = xoroshiro128p_uniform_float32(rng_states, thread_id)
return nb.int32(x * (upper_bound - lower_bound) + lower_bound)


@cuda.jit(device=True, inline=True)
def generate_random_indices(
initial_point_indices, rng_states, block_size, points_number
):
"""
Generate random points from the given block.
:param initial_point_indices: Array to store the initial point indices.
:param rng_states: Random number generator states.
:param block_size: Size of the block.
:param points_number: Number of points to generate.
"""

for i in range(points_number):
initial_point_indices[i] = generate_random_int(rng_states, 0, block_size)
return initial_point_indices


@cuda.jit(device=True, inline=True)
def get_plane_from_points(points, initial_point_indices):
"""
Expand All @@ -79,9 +42,9 @@ def get_plane_from_points(points, initial_point_indices):
centroid_y += points[idx][1]
centroid_z += points[idx][2]

centroid_x /= INITIAL_POINTS_NUMBER
centroid_y /= INITIAL_POINTS_NUMBER
centroid_z /= INITIAL_POINTS_NUMBER
centroid_x /= initial_point_indices.shape[0]
centroid_y /= initial_point_indices.shape[0]
centroid_z /= initial_point_indices.shape[0]

xx, xy, xz, yy, yz, zz = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

Expand Down
Loading