From e2abf722e3be8f6c63f53864ff2c1f5663df03a0 Mon Sep 17 00:00:00 2001
From: LyceanEM <lyceanem@gmail.com>
Date: Wed, 27 Sep 2023 18:03:34 +0100
Subject: [PATCH] Changing chunking logic to better support larger ray numbers.

---
 lyceanem/base_classes.py            |  49 ++++-
 lyceanem/raycasting/rayfunctions.py | 272 ++++++++++++++++------------
 2 files changed, 209 insertions(+), 112 deletions(-)

diff --git a/lyceanem/base_classes.py b/lyceanem/base_classes.py
index f54b25b..9a84380 100644
--- a/lyceanem/base_classes.py
+++ b/lyceanem/base_classes.py
@@ -4,6 +4,7 @@
 import open3d as o3d
 from scipy import interpolate as sp
 from scipy.spatial.transform import Rotation as R
+import pyvista as pv
 
 from . import base_types as base_types
 from .electromagnetics import beamforming as BM
@@ -1132,9 +1133,22 @@ def export_global_weights(self):
         )
         weights = np.matmul(weights, rotation_matrix)
         return points, weights
+    def pattern_mesh(self):
+        points = self.cartesian_points()
+        mesh = pv.StructuredGrid(points[:, 0], points[:, 1], points[:, 2])
+        mesh.dimensions = [self.azimuth_resolution, self.elevation_resolution, 1]
+        if self.arbitary_pattern_format == "Etheta/Ephi":
+            mesh.point_data['Etheta'] = self.pattern[:,:,0]
+            mesh.point_data['Ephi'] = self.pattern[:, :, 1]
+        elif self.arbitary_pattern_format == "ExEyEz":
+            mesh.point_data['Ex'] = self.pattern[:, :, 0]
+            mesh.point_data['Ey'] = self.pattern[:, :, 1]
+            mesh.point_data['Ez'] = self.pattern[:, :, 2]
+
+        return mesh
 
     def display_pattern(
-        self, plottype="Polar", desired_pattern="both", pattern_min=-40, plot_max=0
+        self, plottype="Polar", desired_pattern="both", pattern_min=-40, plot_max=0,plotengine="matplotlib"
     ):
         """
         Displays the Antenna Pattern using :func:`lyceanem.electromagnetics.beamforming.PatternPlot`
@@ -1152,7 +1166,40 @@ def display_pattern(
         -------
         None
         """
+        if plotengine=="pyvista":
+            def PatternPlot(
+                    data,
+                    az,
+                    elev,
+                    pattern_min=-40,
+                    plot_max=0.0,
+                    plottype="Polar",
+                    logtype="amplitude",
+                    ticknum=6,
+                    title_text=None,
+                    shell_radius=1.0
+            ):
+                # points = spherical_mesh(az, elev,
+                # (data / np.nanmax(data) * shell_radius))
+                # mesh = pv.StructuredGrid(points[:, 0], points[:, 1], points[:, 2])
+                # mesh.dimensions = [az.size, elev.size, 1]
+                # mesh.point_data['Beamformed Directivity (dBi)'] = 10 * np.log10(data.ravel())
+                mesh = self.pattern_mesh(shell_radius)
+                sargs = dict(
+                    title_font_size=20,
+                    label_font_size=16,
+                    shadow=True,
+                    n_labels=ticknum,
+                    italic=True,
+                    fmt="%.1f",
+                    font_family="arial",
+                )
+                pl = pv.Plotter()
+                pl.add_mesh(mesh, scalar_bar_args=sargs, clim=[pattern_min, plot_max])
+                pl.show_axes()
+                pl.show()
 
+                return pl
         if self.arbitary_pattern_format == "Etheta/Ephi":
             if desired_pattern == "both":
                 BM.PatternPlot(
diff --git a/lyceanem/raycasting/rayfunctions.py b/lyceanem/raycasting/rayfunctions.py
index b1803ef..f8bafca 100644
--- a/lyceanem/raycasting/rayfunctions.py
+++ b/lyceanem/raycasting/rayfunctions.py
@@ -2319,11 +2319,109 @@ def chunkingRaycaster1Dv3(
         target_indexing = create_model_index(
             filtered_index, sink_index, scattering_point_index
         )
+        second_ray_payload = charge_rays_environment1Dv2(
+            sources, sinks, scattering_points, target_indexing
+        )
+        # print('Chunking Raycaster Triangles ', len(environment_local))
+        # max_tris=2**18
+        # triangle_chunk=np.linspace(0,tri_num,math.ceil(tri_num/max_tris)+1,dtype=np.int32)
+
+        # chunk_size=2**11
+        threads_in_block = 256
+        # ray_chunks=np.linspace(0,ray_num,math.ceil(ray_num/chunk_size)+1,dtype=np.int32)
+        # for idx in range(len(triangle_chunk)-1):
+        d_environment = cuda.device_array(
+            len(environment_local), dtype=base_types.triangle_t
+        )
+        cuda.to_device(environment_local, to=d_environment)
+        d_chunk_payload = cuda.device_array(
+            [second_ray_payload.shape[0]], dtype=base_types.ray_t
+        )
+        cuda.to_device(second_ray_payload, to=d_chunk_payload)
+        #
+        # ray_temp=np.empty((chunk_ray_size),dtype=np.bool)
+        # ray_temp[:]=first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['intersect']
+        # distance_temp=np.empty((chunk_ray_size),dtype=np.float32)
+        # distance_temp[:]=math.inf
+        # dist_list=cuda.to_device(distance_temp)
+        # d_ray_flag=cuda.to_device(ray_temp)
+        # Here, we choose the granularity of the threading on our device. We want
+        # to try to cover the entire workload of rays and targets with simulatenous threads, so we'll
+        # choose a grid of (source_num/16. target_num/16) blocks, each with (16, 16) threads
+        grids = math.ceil(second_ray_payload.shape[0] / threads_in_block)
+        threads = threads_in_block
+        # Execute the kernel
+        # cuda.profile_start()
+        # kernel1Dv2[grids, threads](d_chunk_payload,d_environment,d_ray_flag)
+        kernel1Dv3[grids, threads](d_chunk_payload, d_environment)
+        # cuda.profile_stop()
+        # distmap[source_chunks[n]:source_chunks[n+1],target_chunks[m]:target_chunks[m+1]] = d_distmap_chunked.copy_to_host()
+        second_ray_payload = d_chunk_payload.copy_to_host()
+        # first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['intersect']=d_chunk_payload['intersect'].copy_to_host()
+        # first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['dist']=d_chunk_payload['dist'].copy_to_host()
+
+        # cuda.close()
+        kernel_dt = timer() - raystart
+        start = timer()
+        RAYS_CAST += second_ray_payload.shape[0]
+        filtered_index2, final_index2 = rayHits1Dv2(
+            second_ray_payload, target_indexing, sink_index
+        )
+        mem_dt = timer() - start
     else:
         if (filtered_index.shape[0] * sink_index.shape[0]) < ray_limit:
             target_indexing = create_model_index(
                 filtered_index, sink_index, np.empty((0, 0), dtype=np.int32)
             )  # only target rays at sinks
+            second_ray_payload = charge_rays_environment1Dv2(
+                sources, sinks, scattering_points, target_indexing
+            )
+            # print('Chunking Raycaster Triangles ', len(environment_local))
+            # max_tris=2**18
+            # triangle_chunk=np.linspace(0,tri_num,math.ceil(tri_num/max_tris)+1,dtype=np.int32)
+
+            # chunk_size=2**11
+            threads_in_block = 256
+            # ray_chunks=np.linspace(0,ray_num,math.ceil(ray_num/chunk_size)+1,dtype=np.int32)
+            # for idx in range(len(triangle_chunk)-1):
+            d_environment = cuda.device_array(
+                len(environment_local), dtype=base_types.triangle_t
+            )
+            cuda.to_device(environment_local, to=d_environment)
+            d_chunk_payload = cuda.device_array(
+                [second_ray_payload.shape[0]], dtype=base_types.ray_t
+            )
+            cuda.to_device(second_ray_payload, to=d_chunk_payload)
+            #
+            # ray_temp=np.empty((chunk_ray_size),dtype=np.bool)
+            # ray_temp[:]=first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['intersect']
+            # distance_temp=np.empty((chunk_ray_size),dtype=np.float32)
+            # distance_temp[:]=math.inf
+            # dist_list=cuda.to_device(distance_temp)
+            # d_ray_flag=cuda.to_device(ray_temp)
+            # Here, we choose the granularity of the threading on our device. We want
+            # to try to cover the entire workload of rays and targets with simulatenous threads, so we'll
+            # choose a grid of (source_num/16. target_num/16) blocks, each with (16, 16) threads
+            grids = math.ceil(second_ray_payload.shape[0] / threads_in_block)
+            threads = threads_in_block
+            # Execute the kernel
+            # cuda.profile_start()
+            # kernel1Dv2[grids, threads](d_chunk_payload,d_environment,d_ray_flag)
+            kernel1Dv3[grids, threads](d_chunk_payload, d_environment)
+            # cuda.profile_stop()
+            # distmap[source_chunks[n]:source_chunks[n+1],target_chunks[m]:target_chunks[m+1]] = d_distmap_chunked.copy_to_host()
+            second_ray_payload = d_chunk_payload.copy_to_host()
+            # first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['intersect']=d_chunk_payload['intersect'].copy_to_host()
+            # first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['dist']=d_chunk_payload['dist'].copy_to_host()
+
+            # cuda.close()
+            kernel_dt = timer() - raystart
+            start = timer()
+            RAYS_CAST += second_ray_payload.shape[0]
+            filtered_index2, final_index2 = rayHits1Dv2(
+                second_ray_payload, target_indexing, sink_index
+            )
+            mem_dt = timer() - start
         else:
             # the rays must fit in GPU memory, aim for no more than 80% utilisation
             # establish memory limits
@@ -2396,117 +2494,69 @@ def chunkingRaycaster1Dv3(
                 # ctx = cuda.current_context()
                 # ctx.reset()
 
-    if target_indexing.shape[0] >= ray_limit:
-        # need to split the array and process seperatly
-
-        sub_target = np.array_split(
-            target_indexing, np.ceil(target_indexing.shape[0] / ray_limit).astype(int)
-        )
-        chunknum = len(sub_target)
-        filtered_index2 = np.empty((0, target_indexing.shape[1]), dtype=np.int32)
-        final_index2 = np.empty((0, target_indexing.shape[1]), dtype=np.int32)
-        # print('chunking total of ',target_indexing.shape[0],' rays in ',chunknum,' batches')
-        for chunkindex in range(chunknum):
-            # cycle the raycaster over the sub arrays
-            threads_in_block = 256
-            second_ray_payload = charge_rays_environment1Dv2(
-                sources, sinks, scattering_points, sub_target[chunkindex]
-            )
-            # ray_chunks=np.linspace(0,ray_num,math.ceil(ray_num/chunk_size)+1,dtype=np.int32)
-            # for idx in range(len(triangle_chunk)-1):
-            d_environment = cuda.device_array(
-                len(environment_local), dtype=base_types.triangle_t
-            )
-            cuda.to_device(environment_local, to=d_environment)
-            d_chunk_payload = cuda.device_array(
-                [second_ray_payload.shape[0]], dtype=base_types.ray_t
-            )
-            cuda.to_device(second_ray_payload, to=d_chunk_payload)
-            #
-            # ray_temp=np.empty((chunk_ray_size),dtype=np.bool)
-            # ray_temp[:]=first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['intersect']
-            # distance_temp=np.empty((chunk_ray_size),dtype=np.float32)
-            # distance_temp[:]=math.inf
-            # dist_list=cuda.to_device(distance_temp)
-            # d_ray_flag=cuda.to_device(ray_temp)
-            # Here, we choose the granularity of the threading on our device. We want
-            # to try to cover the entire workload of rays and targets with simulatenous threads, so we'll
-            # choose a grid of (source_num/16. target_num/16) blocks, each with (16, 16) threads
-            grids = math.ceil(second_ray_payload.shape[0] / threads_in_block)
-            threads = threads_in_block
-            # Execute the kernel
-            # cuda.profile_start()
-            # kernel1Dv2[grids, threads](d_chunk_payload,d_environment,d_ray_flag)
-            kernel1Dv3[grids, threads](d_chunk_payload, d_environment)
-            # cuda.profile_stop()
-            # distmap[source_chunks[n]:source_chunks[n+1],target_chunks[m]:target_chunks[m+1]] = d_distmap_chunked.copy_to_host()
-            second_ray_payload = d_chunk_payload.copy_to_host()
-            # first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['intersect']=d_chunk_payload['intersect'].copy_to_host()
-            # first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['dist']=d_chunk_payload['dist'].copy_to_host()
-
-            # cuda.close()
-            kernel_dt = timer() - raystart
-            start = timer()
-            RAYS_CAST += second_ray_payload.shape[0]
-            temp_filtered_index2, temp_final_index2 = rayHits1Dv2(
-                second_ray_payload, sub_target[chunkindex], sink_index
-            )
-            mem_dt = timer() - start
-            filtered_index2 = np.append(filtered_index2, temp_filtered_index2, axis=0)
-            final_index2 = np.append(final_index2, temp_final_index2, axis=0)
-            # deallocate memory on gpu
-            # ctx = cuda.current_context()
-            # ctx.reset()
-    else:
-        second_ray_payload = charge_rays_environment1Dv2(
-            sources, sinks, scattering_points, target_indexing
-        )
-        # print('Chunking Raycaster Triangles ', len(environment_local))
-        # max_tris=2**18
-        # triangle_chunk=np.linspace(0,tri_num,math.ceil(tri_num/max_tris)+1,dtype=np.int32)
-
-        # chunk_size=2**11
-        threads_in_block = 256
-        # ray_chunks=np.linspace(0,ray_num,math.ceil(ray_num/chunk_size)+1,dtype=np.int32)
-        # for idx in range(len(triangle_chunk)-1):
-        d_environment = cuda.device_array(
-            len(environment_local), dtype=base_types.triangle_t
-        )
-        cuda.to_device(environment_local, to=d_environment)
-        d_chunk_payload = cuda.device_array(
-            [second_ray_payload.shape[0]], dtype=base_types.ray_t
-        )
-        cuda.to_device(second_ray_payload, to=d_chunk_payload)
-        #
-        # ray_temp=np.empty((chunk_ray_size),dtype=np.bool)
-        # ray_temp[:]=first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['intersect']
-        # distance_temp=np.empty((chunk_ray_size),dtype=np.float32)
-        # distance_temp[:]=math.inf
-        # dist_list=cuda.to_device(distance_temp)
-        # d_ray_flag=cuda.to_device(ray_temp)
-        # Here, we choose the granularity of the threading on our device. We want
-        # to try to cover the entire workload of rays and targets with simulatenous threads, so we'll
-        # choose a grid of (source_num/16. target_num/16) blocks, each with (16, 16) threads
-        grids = math.ceil(second_ray_payload.shape[0] / threads_in_block)
-        threads = threads_in_block
-        # Execute the kernel
-        # cuda.profile_start()
-        # kernel1Dv2[grids, threads](d_chunk_payload,d_environment,d_ray_flag)
-        kernel1Dv3[grids, threads](d_chunk_payload, d_environment)
-        # cuda.profile_stop()
-        # distmap[source_chunks[n]:source_chunks[n+1],target_chunks[m]:target_chunks[m+1]] = d_distmap_chunked.copy_to_host()
-        second_ray_payload = d_chunk_payload.copy_to_host()
-        # first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['intersect']=d_chunk_payload['intersect'].copy_to_host()
-        # first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['dist']=d_chunk_payload['dist'].copy_to_host()
-
-        # cuda.close()
-        kernel_dt = timer() - raystart
-        start = timer()
-        RAYS_CAST += second_ray_payload.shape[0]
-        filtered_index2, final_index2 = rayHits1Dv2(
-            second_ray_payload, target_indexing, sink_index
-        )
-        mem_dt = timer() - start
+    # if target_indexing.shape[0] >= ray_limit:
+    #     # need to split the array and process seperatly
+
+    #     sub_target = np.array_split(
+    #         target_indexing, np.ceil(target_indexing.shape[0] / ray_limit).astype(int)
+    #     )
+    #     chunknum = len(sub_target)
+    #     filtered_index2 = np.empty((0, target_indexing.shape[1]), dtype=np.int32)
+    #     final_index2 = np.empty((0, target_indexing.shape[1]), dtype=np.int32)
+    #     # print('chunking total of ',target_indexing.shape[0],' rays in ',chunknum,' batches')
+    #     for chunkindex in range(chunknum):
+    #         # cycle the raycaster over the sub arrays
+    #         threads_in_block = 256
+    #         second_ray_payload = charge_rays_environment1Dv2(
+    #             sources, sinks, scattering_points, sub_target[chunkindex]
+    #         )
+    #         # ray_chunks=np.linspace(0,ray_num,math.ceil(ray_num/chunk_size)+1,dtype=np.int32)
+    #         # for idx in range(len(triangle_chunk)-1):
+    #         d_environment = cuda.device_array(
+    #             len(environment_local), dtype=base_types.triangle_t
+    #         )
+    #         cuda.to_device(environment_local, to=d_environment)
+    #         d_chunk_payload = cuda.device_array(
+    #             [second_ray_payload.shape[0]], dtype=base_types.ray_t
+    #         )
+    #         cuda.to_device(second_ray_payload, to=d_chunk_payload)
+    #         #
+    #         # ray_temp=np.empty((chunk_ray_size),dtype=np.bool)
+    #         # ray_temp[:]=first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['intersect']
+    #         # distance_temp=np.empty((chunk_ray_size),dtype=np.float32)
+    #         # distance_temp[:]=math.inf
+    #         # dist_list=cuda.to_device(distance_temp)
+    #         # d_ray_flag=cuda.to_device(ray_temp)
+    #         # Here, we choose the granularity of the threading on our device. We want
+    #         # to try to cover the entire workload of rays and targets with simulatenous threads, so we'll
+    #         # choose a grid of (source_num/16. target_num/16) blocks, each with (16, 16) threads
+    #         grids = math.ceil(second_ray_payload.shape[0] / threads_in_block)
+    #         threads = threads_in_block
+    #         # Execute the kernel
+    #         # cuda.profile_start()
+    #         # kernel1Dv2[grids, threads](d_chunk_payload,d_environment,d_ray_flag)
+    #         kernel1Dv3[grids, threads](d_chunk_payload, d_environment)
+    #         # cuda.profile_stop()
+    #         # distmap[source_chunks[n]:source_chunks[n+1],target_chunks[m]:target_chunks[m+1]] = d_distmap_chunked.copy_to_host()
+    #         second_ray_payload = d_chunk_payload.copy_to_host()
+    #         # first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['intersect']=d_chunk_payload['intersect'].copy_to_host()
+    #         # first_ray_payload[ray_chunks[n]:ray_chunks[n+1]]['dist']=d_chunk_payload['dist'].copy_to_host()
+
+    #         # cuda.close()
+    #         kernel_dt = timer() - raystart
+    #         start = timer()
+    #         RAYS_CAST += second_ray_payload.shape[0]
+    #         temp_filtered_index2, temp_final_index2 = rayHits1Dv2(
+    #             second_ray_payload, sub_target[chunkindex], sink_index
+    #         )
+    #         mem_dt = timer() - start
+    #         filtered_index2 = np.append(filtered_index2, temp_filtered_index2, axis=0)
+    #         final_index2 = np.append(final_index2, temp_final_index2, axis=0)
+    #         # deallocate memory on gpu
+    #         # ctx = cuda.current_context()
+    #         # ctx.reset()
+    # else:
+        
         # deallocate memory on gpu
         # ctx = cuda.current_context()
         # deallocs = ctx.deallocations