From b9886d138a45312f6eb2df7499817d7431d0ca47 Mon Sep 17 00:00:00 2001
From: Quentin Bourgerie
Date: Tue, 10 Dec 2024 11:16:53 +0100
Subject: [PATCH] try std bootstrap

---
 .../compiler/lib/Runtime/GPUDFG.cpp   | 73 +++++++++++--------
 .../compiler/lib/Runtime/wrappers.cpp | 16 ++--
 2 files changed, 50 insertions(+), 39 deletions(-)

diff --git a/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp b/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
index 143178e7a8..698cb59bc4 100644
--- a/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
+++ b/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
@@ -117,22 +117,26 @@ struct Dependence;
 // is required.
 struct PBS_buffer {
   PBS_buffer(void *stream, uint32_t gpu_idx, uint32_t glwe_dimension,
-             uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count)
+             uint32_t polynomial_size, uint32_t level_count,
+             uint32_t input_lwe_ciphertext_count)
       : max_pbs_buffer_samples(input_lwe_ciphertext_count),
-        glwe_dim(glwe_dimension), poly_size(polynomial_size),
-        gpu_stream(stream), gpu_index(gpu_idx) {
-    scratch_cuda_programmable_bootstrap_amortized_64(
+        glwe_dim(glwe_dimension), _level_count(level_count),
+        poly_size(polynomial_size), gpu_stream(stream), gpu_index(gpu_idx) {
+    scratch_cuda_programmable_bootstrap_64(
         gpu_stream, gpu_index, &pbs_buffer, glwe_dim, poly_size,
-        max_pbs_buffer_samples, true);
+        _level_count, max_pbs_buffer_samples, true);
   }
   ~PBS_buffer() {
-    cleanup_cuda_programmable_bootstrap_amortized(gpu_stream, gpu_index, &pbs_buffer);
+    cleanup_cuda_programmable_bootstrap(gpu_stream, gpu_index,
+                                        &pbs_buffer);
   }
   int8_t *get_pbs_buffer(void *stream, uint32_t gpu_idx,
                          uint32_t glwe_dimension, uint32_t polynomial_size,
+                         uint32_t level_count,
                          uint32_t input_lwe_ciphertext_count) {
     assert(glwe_dimension <= glwe_dim);
     assert(polynomial_size <= poly_size);
+    assert(level_count <= _level_count);
     assert(input_lwe_ciphertext_count <= max_pbs_buffer_samples);
     assert(stream == gpu_stream);
     assert(gpu_idx == gpu_index);
@@ -144,6 +148,7 @@ struct PBS_buffer {
   uint32_t max_pbs_buffer_samples;
   uint32_t glwe_dim;
   uint32_t poly_size;
+  uint32_t _level_count;
   void *gpu_stream;
   uint32_t gpu_index;
 };
@@ -163,10 +168,11 @@ struct GPU_state {
     cuda_destroy_stream((cudaStream_t)gpu_stream, gpu_idx);
   }
   inline int8_t *get_pbs_buffer(uint32_t glwe_dimension,
-                                uint32_t polynomial_size,
+                                uint32_t polynomial_size, uint32_t level_count,
                                 uint32_t input_lwe_ciphertext_count) {
     if (pbs_buffer != nullptr &&
         (pbs_buffer->glwe_dim != glwe_dimension ||
          pbs_buffer->poly_size != polynomial_size ||
+         pbs_buffer->_level_count != level_count ||
          pbs_buffer->get_max_pbs_buffer_samples() <
              input_lwe_ciphertext_count)) {
       delete pbs_buffer;
@@ -174,9 +180,10 @@ struct GPU_state {
     }
     if (pbs_buffer == nullptr)
       pbs_buffer = new PBS_buffer(get_gpu_stream(), gpu_idx, glwe_dimension,
-                                  polynomial_size, input_lwe_ciphertext_count);
+                                  polynomial_size, level_count,
+                                  input_lwe_ciphertext_count);
     return pbs_buffer->get_pbs_buffer(get_gpu_stream(), gpu_idx, glwe_dimension,
-                                      polynomial_size,
+                                      polynomial_size, level_count,
                                       input_lwe_ciphertext_count);
   }
   inline void *get_gpu_stream() {
@@ -216,16 +223,17 @@ struct GPU_DFG {
     to_free_list.clear();
   }
   inline int8_t *get_pbs_buffer(uint32_t glwe_dimension,
-                                uint32_t polynomial_size,
+                                uint32_t polynomial_size, uint32_t level_count,
                                 uint32_t input_lwe_ciphertext_count) {
     if (pbs_buffer == nullptr) {
-      int8_t *ret = gpus[gpu_idx].get_pbs_buffer(
-          glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
+      int8_t *ret =
+          gpus[gpu_idx].get_pbs_buffer(glwe_dimension, polynomial_size,
+                                       level_count, input_lwe_ciphertext_count);
       pbs_buffer = gpus[gpu_idx].pbs_buffer;
       return ret;
     }
     return pbs_buffer->get_pbs_buffer(gpu_stream, gpu_idx, glwe_dimension,
-                                      polynomial_size,
+                                      polynomial_size, level_count,
                                       input_lwe_ciphertext_count);
   }
   inline void *get_gpu_stream(int32_t loc) {
@@ -422,8 +430,8 @@ struct Dependence {
   }
   inline void free_data(GPU_DFG *dfg, bool immediate = false) {
     if (device_data != nullptr) {
-      cuda_drop_async(device_data,
-                      (cudaStream_t)dfg->get_gpu_stream(location), location);
+      cuda_drop_async(device_data, (cudaStream_t)dfg->get_gpu_stream(location),
+                      location);
     }
     if (onHostReady && host_data.allocated != nullptr && hostAllocated) {
       // As streams are not synchronized aside from the GET operation,
@@ -1080,16 +1088,18 @@ void memref_keyswitch_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
   void *ksk_gpu = p->ctx.val->get_ksk_gpu(
       p->level.val, p->input_lwe_dim.val, p->output_lwe_dim.val, loc, s,
       p->sk_index.val);
-  // Initialize indexes
-  uint64_t *indexes = (uint64_t*) malloc(num_samples * sizeof(uint64_t));
+  // Initialize indexes
+  uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
   for (uint32_t i = 0; i < num_samples; i++) {
-      indexes[i] = i;
+    indexes[i] = i;
   }
-  void *indexes_gpu = alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);
+  void *indexes_gpu =
+      alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);
   cuda_keyswitch_lwe_ciphertext_vector_64(
-      s, loc, out_gpu, indexes_gpu, ct0_gpu, indexes_gpu, ksk_gpu, p->input_lwe_dim.val,
-      p->output_lwe_dim.val, p->base_log.val, p->level.val, num_samples);
+      s, loc, out_gpu, indexes_gpu, ct0_gpu, indexes_gpu, ksk_gpu,
+      p->input_lwe_dim.val, p->output_lwe_dim.val, p->base_log.val,
+      p->level.val, num_samples);
   cuda_drop_async(indexes_gpu, s, loc);
   Dependence *dep =
       new Dependence(loc, out, out_gpu, false, false, d->chunk_id);
@@ -1188,23 +1198,25 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
   cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, (void *)test_vector_idxes,
                            test_vector_idxes_size, s, loc);
   // Initialize indexes
-  uint64_t *indexes = (uint64_t*) malloc(num_samples * sizeof(uint64_t));
+  uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
   for (uint32_t i = 0; i < num_samples; i++) {
-      indexes[i] = i;
+    indexes[i] = i;
   }
-  void *indexes_gpu = alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);
+  void *indexes_gpu =
+      alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);
   int8_t *pbs_buffer = p->dfg->gpus[loc].get_pbs_buffer(
-      p->glwe_dim.val, p->poly_size.val, num_samples);
+      p->glwe_dim.val, p->poly_size.val, p->level.val, num_samples);
   void *ct0_gpu = d0->device_data;
   void *out_gpu = cuda_malloc_async(data_size, s, loc);
   void *fbsk_gpu = p->ctx.val->get_bsk_gpu(
       p->input_lwe_dim.val, p->poly_size.val, p->level.val, p->glwe_dim.val,
       loc, s, p->sk_index.val);
-  cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
-      s, loc, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu, indexes_gpu,
-      fbsk_gpu, (int8_t *)pbs_buffer, p->input_lwe_dim.val, p->glwe_dim.val,
-      p->poly_size.val, p->base_log.val, p->level.val, num_samples);
+  cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+      s, loc, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
+      ct0_gpu, indexes_gpu, fbsk_gpu, (int8_t *)pbs_buffer,
+      p->input_lwe_dim.val, p->glwe_dim.val, p->poly_size.val,
+      p->base_log.val, p->level.val, num_samples);
   cuda_drop_async(test_vector_idxes_gpu, s, loc);
   cuda_drop_async(glwe_ct_gpu, s, loc);
   cuda_drop_async(indexes_gpu, s, loc);
@@ -1442,8 +1454,7 @@ void memref_negate_lwe_ciphertext_u64_process(Process *p, int32_t loc,
   Dependence *idep0 = p->input_streams[0]->get(loc, chunk_id);
   if (p->output_streams[0]->need_new_gen(chunk_id))
     p->output_streams[0]->put(
-        sched(idep0, (cudaStream_t)p->dfg->get_gpu_stream(loc), loc),
-        chunk_id);
+        sched(idep0, (cudaStream_t)p->dfg->get_gpu_stream(loc), loc), chunk_id);
 }
 } // namespace
diff --git a/compilers/concrete-compiler/compiler/lib/Runtime/wrappers.cpp b/compilers/concrete-compiler/compiler/lib/Runtime/wrappers.cpp
index 02433969e5..0cfb021e3d 100644
--- a/compilers/concrete-compiler/compiler/lib/Runtime/wrappers.cpp
+++ b/compilers/concrete-compiler/compiler/lib/Runtime/wrappers.cpp
@@ -225,15 +225,15 @@ void memref_batched_bootstrap_lwe_cuda_u64(
   }
   void *indexes_gpu = alloc_and_memcpy_async_to_gpu(
       indexes, 0, num_samples * sizeof(uint64_t), gpu_idx, (cudaStream_t)stream);
   // Allocate PBS buffer on GPU
-  scratch_cuda_programmable_bootstrap_amortized_64(
-      stream, gpu_idx, &pbs_buffer, glwe_dim, poly_size, num_samples,
+  scratch_cuda_programmable_bootstrap_64(
+      stream, gpu_idx, &pbs_buffer, glwe_dim, poly_size, level, num_samples,
       true);
   // Run the bootstrap kernel on the GPU
-  cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
+  cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
       stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu,
       test_vector_idxes_gpu, ct0_gpu, indexes_gpu, fbsk_gpu, pbs_buffer,
       input_lwe_dim, glwe_dim, poly_size, base_log, level, num_samples);
-  cleanup_cuda_programmable_bootstrap_amortized(stream, gpu_idx, &pbs_buffer);
+  cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &pbs_buffer);
   // Copy the output batch of ciphertext back to CPU
   memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu,
                       gpu_idx, stream);
@@ -331,15 +331,15 @@ void memref_batched_mapped_bootstrap_lwe_cuda_u64(
   void *indexes_gpu = alloc_and_memcpy_async_to_gpu(
       indexes, 0, num_samples * sizeof(uint64_t), gpu_idx, (cudaStream_t)stream);
   // Allocate PBS buffer on GPU
-  scratch_cuda_programmable_bootstrap_amortized_64(
-      stream, gpu_idx, &pbs_buffer, glwe_dim, poly_size, num_samples,
+  scratch_cuda_programmable_bootstrap_64(
+      stream, gpu_idx, &pbs_buffer, glwe_dim, poly_size, level, num_samples,
       true);
   // Run the bootstrap kernel on the GPU
-  cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
+  cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
       stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu,
       test_vector_idxes_gpu, ct0_gpu, indexes_gpu, fbsk_gpu, pbs_buffer,
       input_lwe_dim, glwe_dim, poly_size, base_log, level, num_samples);
-  cleanup_cuda_programmable_bootstrap_amortized(stream, gpu_idx, &pbs_buffer);
+  cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &pbs_buffer);
   // Copy the output batch of ciphertext back to CPU
   memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu,
                       gpu_idx, stream);
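
For reference, the sequence the patch switches to is: allocate the scratch buffer for the standard (non-amortized) programmable bootstrap, which is now sized with the decomposition level count, run the bootstrap kernel, then release the buffer. The sketch below only mirrors the calls as they appear in wrappers.cpp above; the helper name and every parameter are placeholders for values the caller already holds, and the backend declarations are assumed to come from the CUDA backend header these runtime files already include.

// Sketch of the standard-PBS call sequence used above (placeholder helper,
// not part of the patch). All pointers are device buffers the caller has
// already allocated and populated on this stream and GPU.
static void run_std_pbs_sketch(void *stream, uint32_t gpu_idx, void *out_gpu,
                               void *indexes_gpu, void *glwe_ct_gpu,
                               void *test_vector_idxes_gpu, void *ct0_gpu,
                               void *fbsk_gpu, uint32_t input_lwe_dim,
                               uint32_t glwe_dim, uint32_t poly_size,
                               uint32_t base_log, uint32_t level,
                               uint32_t num_samples) {
  int8_t *pbs_buffer = nullptr;
  // Unlike the amortized variant, the standard bootstrap takes the level
  // count when the scratch buffer is allocated.
  scratch_cuda_programmable_bootstrap_64(stream, gpu_idx, &pbs_buffer, glwe_dim,
                                         poly_size, level, num_samples, true);
  cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
      stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
      ct0_gpu, indexes_gpu, fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim,
      poly_size, base_log, level, num_samples);
  cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &pbs_buffer);
}

In GPUDFG.cpp the scratch/cleanup pair is not issued per call: PBS_buffer caches the buffer and, with this patch, also keys the cache on level_count, so the buffer is only reallocated when the bootstrap parameters actually change.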