From 2ffe60a108b3d7bc1309bce01fd52c1b49873880 Mon Sep 17 00:00:00 2001
From: Antoniu Pop
Date: Mon, 17 Jun 2024 15:33:21 +0100
Subject: [PATCH] feat(compiler): [GPU runtime] reduce copies in merge operations by merging SDFG batch outputs in place.

---
 .../compiler/lib/Runtime/GPUDFG.cpp           | 374 ++++++++++--------
 1 file changed, 217 insertions(+), 157 deletions(-)

diff --git a/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp b/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
index 23fe23b398..eca23952a3 100644
--- a/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
+++ b/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
@@ -212,17 +212,12 @@ struct GPU_DFG {
                         polynomial_size, input_lwe_ciphertext_count);
   }
-  void free_streams() {
-    streams.sort();
-    streams.unique();
-    for (auto s : streams)
-      delete s;
-  }
   inline void *get_gpu_stream(int32_t loc) {
     if (loc < 0)
       return nullptr;
     return gpus[loc].get_gpu_stream();
   }
+  void free_streams();

 private:
   std::list to_free_list;
@@ -249,6 +244,7 @@ struct Dependence {
   int32_t chunk_id;
   size_t stream_generation;
   std::vector chunks;
+  std::vector chunking_schedule;
   Dependence(int32_t l, MemRef2 hd, void *dd, bool ohr, bool alloc = false,
              int32_t chunk_id = single_chunk, size_t gen = 0)
       : location(l), host_data(hd), device_data(dd), onHostReady(ohr),
@@ -262,7 +258,8 @@ struct Dependence {
   // Split a dependence into a number of chunks either to run on
   // multiple GPUs or execute concurrently on the host.
   void split_dependence(size_t num_chunks, size_t num_gpu_chunks,
-                        size_t chunk_dim, bool constant) {
+                        size_t chunk_dim, bool constant,
+                        size_t gpu_chunk_factor) {
     // If this dependence is already split, check that the split
     // matches the new request
     if (chunk_id == split_chunks) {
@@ -297,8 +294,8 @@ struct Dependence {
       return;
     }
     size_t chunk_size =
-        num_samples / (num_chunks + num_gpu_chunks * device_compute_factor);
-    size_t gpu_chunk_size = chunk_size * device_compute_factor;
+        num_samples / (num_chunks + num_gpu_chunks * gpu_chunk_factor);
+    size_t gpu_chunk_size = chunk_size * gpu_chunk_factor;
     chunk_size = (num_samples - gpu_chunk_size * num_gpu_chunks) / num_chunks;
     size_t chunk_remainder =
         (num_samples - gpu_chunk_size * num_gpu_chunks) % num_chunks;
@@ -327,63 +324,42 @@ struct Dependence {
     chunk_id = split_chunks;
     location = split_location;
   }
-  void merge_dependence(GPU_DFG *dfg) {
+  void finalize_merged_dependence(GPU_DFG *dfg) {
     assert(!chunks.empty() && "Cannot merge dependence with no chunks");
-    size_t data_size = 0;
-    size_t num_samples = 0;
-    for (auto c : chunks) {
-      data_size += memref_get_data_size(c->host_data);
-      num_samples += c->host_data.sizes[0];
-    }
-    uint64_t *data = (uint64_t *)malloc(data_size);
-    MemRef2 output = {data,
-                      data,
-                      0,
-                      {num_samples, chunks.front()->host_data.sizes[1]},
-                      {chunks.front()->host_data.sizes[1], 1}};
-
-    std::list custreams_used;
-    for (auto c : chunks) {
-      // Write out the piece in the final target dependence
-      size_t csize = memref_get_data_size(c->host_data);
-      if (c->onHostReady) {
-        memcpy(((char *)output.aligned) + output.offset, c->host_data.aligned,
-               csize);
-      } else {
-        assert(c->location > host_location);
-        cudaStream_t *s = (cudaStream_t *)dfg->get_gpu_stream(c->location);
-        cuda_memcpy_async_to_cpu(((char *)output.aligned) + output.offset,
-                                 c->device_data, csize, s, c->location);
-        custreams_used.push_back(s);
-      }
-      output.offset += csize;
-    }
-    output.offset = 0;
-    for (auto c : chunks)
-      c->free_data(dfg, true);
+    assert(host_data.allocated != nullptr);
     chunks.clear();
-
-    custreams_used.sort();
-    custreams_used.unique();
-    for (auto s : custreams_used)
-      cudaStreamSynchronize(*s);
-
     location = host_location;
     onHostReady = true;
-    assert(host_data.allocated == nullptr);
-    host_data = output;
     assert(device_data == nullptr);
-    hostAllocated = true;
     chunk_id = single_chunk;
   }
+  void copy_chunk_off_device(int32_t chunk_id, GPU_DFG *dfg) {
+    if (chunks[chunk_id]->onHostReady)
+      return;
+    chunks[chunk_id]->copy(host_location, dfg, false);
+  }
   void move_chunk_off_device(int32_t chunk_id, GPU_DFG *dfg) {
-    chunks[chunk_id]->copy(host_location, dfg);
+    copy_chunk_off_device(chunk_id, dfg);
     cuda_drop_async(
         chunks[chunk_id]->device_data,
         (cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location),
         chunks[chunk_id]->location);
+    chunks[chunk_id]->device_data = nullptr;
     chunks[chunk_id]->location = host_location;
   }
+  void merge_output_off_device(int32_t chunk_id, GPU_DFG *dfg) {
+    assert(chunks[chunk_id]->location > host_location);
+    size_t data_offset = 0;
+    for (int32_t c = 0; c < chunk_id; ++c)
+      data_offset +=
+          chunking_schedule[c] * host_data.sizes[1] * sizeof(uint64_t);
+    size_t csize = memref_get_data_size(chunks[chunk_id]->host_data);
+    cudaStream_t *s =
+        (cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location);
+    cuda_memcpy_async_to_cpu(((char *)host_data.aligned) + data_offset,
+                             chunks[chunk_id]->device_data, csize, s,
+                             chunks[chunk_id]->location);
+  }
   void free_chunk_host_data(int32_t chunk_id, GPU_DFG *dfg) {
     assert(chunks[chunk_id]->location == host_location &&
            chunks[chunk_id]->onHostReady && chunks[chunk_id]->hostAllocated);
@@ -393,8 +369,8 @@ struct Dependence {
     chunks[chunk_id]->onHostReady = false;
   }
   void free_chunk_device_data(int32_t chunk_id, GPU_DFG *dfg) {
-    assert(chunks[chunk_id]->location > host_location &&
-           chunks[chunk_id]->device_data != nullptr);
+    if (chunks[chunk_id]->device_data == nullptr)
+      return;
     cuda_drop_async(
         chunks[chunk_id]->device_data,
         (cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location),
@@ -402,7 +378,7 @@ struct Dependence {
     chunks[chunk_id]->device_data = nullptr;
   }
   inline void free_data(GPU_DFG *dfg, bool immediate = false) {
-    if (location >= 0 && device_data != nullptr) {
+    if (device_data != nullptr) {
       cuda_drop_async(device_data,
                       (cudaStream_t *)dfg->get_gpu_stream(location), location);
     }
@@ -410,17 +386,19 @@ struct Dependence {
       // As streams are not synchronized aside from the GET operation,
      // we cannot free host-side data until after the synchronization
      // point as it could still be used by an asynchronous operation.
-      if (immediate)
+      if (immediate) {
         free(host_data.allocated);
-      else
+        host_data.allocated = nullptr;
+      } else {
         dfg->register_stream_order_dependent_allocation(host_data.allocated);
+      }
     }
     for (auto c : chunks)
       c->free_data(dfg, immediate);
     chunks.clear();
     delete (this);
   }
-  inline void copy(int32_t loc, GPU_DFG *dfg) {
+  inline void copy(int32_t loc, GPU_DFG *dfg, bool synchronize = true) {
     size_t data_size = memref_get_data_size(host_data);
     if (loc == location)
       return;
@@ -434,7 +412,8 @@ struct Dependence {
       cudaStream_t *s = (cudaStream_t *)dfg->get_gpu_stream(location);
       cuda_memcpy_async_to_cpu(host_data.aligned, device_data, data_size, s,
                                location);
-      cudaStreamSynchronize(*s);
+      if (synchronize)
+        cudaStreamSynchronize(*s);
       onHostReady = true;
     } else {
       assert(onHostReady &&
@@ -520,7 +499,7 @@ struct Stream {
   }
   ~Stream() {
     if (dep != nullptr)
-      dep->free_data(dfg);
+      dep->free_data(dfg, true);
     if (producer != nullptr)
       delete producer;
   }
@@ -575,18 +554,22 @@ struct Stream {
     size_t subgraph_bootstraps = 0;
     for (auto p : queue) {
       is_batched_subgraph |= p->batched_process;
-      subgraph_bootstraps +=
-          (p->fun == memref_bootstrap_lwe_u64_process) ? 1 : 0;
+      subgraph_bootstraps += (p->fun == memref_bootstrap_lwe_u64_process ||
+                              p->fun == memref_keyswitch_lwe_u64_process)
+                                 ? 1
+                                 : 0;
     }
     // If this subgraph is not batched, then use this DFG's allocated
     // GPU to offload to. If this does not bootstrap, just execute on
     // the host.
     if (!is_batched_subgraph) {
-      for (auto p : queue) {
-        schedule_kernel(
-            p, (subgraph_bootstraps > 0) ? dfg->gpu_idx : host_location,
-            single_chunk, nullptr);
-      }
+      int32_t loc = (subgraph_bootstraps > 0) ? dfg->gpu_idx : host_location;
+      for (auto p : queue)
+        schedule_kernel(p, loc, single_chunk,
+                        (p == producer) ? out.aligned : nullptr);
+      if (loc != host_location)
+        dep->copy(host_location, dfg, true);
+      dep->onHostReady = true;
       return;
     }
@@ -641,8 +624,6 @@ struct Stream {
         num_real_inputs++;
         if (s->dep->host_data.sizes[0] > num_samples)
          num_samples = s->dep->host_data.sizes[0];
-        if (!s->dep->chunks.empty())
-          num_samples = s->dep->chunks.size();
       } else {
        mem_per_sample += sizeof(uint64_t);
      }
@@ -653,7 +634,7 @@ struct Stream {
                                 (num_real_inputs ? num_real_inputs : 1);
     size_t num_chunks = 1;
     size_t num_gpu_chunks = 0;
-    int32_t num_devices_to_use = 0;
+    size_t gpu_chunk_factor = device_compute_factor;
     // If the subgraph does not have sufficient computational
     // intensity (which we approximate by whether it bootstraps), then
     // we assume (TODO: confirm with profiling) that it is not
@@ -677,16 +658,20 @@ struct Stream {
         (available_mem - const_mem_per_sample) /
        ((mem_per_sample ? mem_per_sample : 1) * gpu_memory_inflation_factor);
-    if (num_samples < num_cores + device_compute_factor * num_devices) {
-      num_devices_to_use = 0;
+    while (gpu_chunk_factor > 4) {
+      if (num_samples < num_cores + gpu_chunk_factor * num_devices)
+        gpu_chunk_factor >>= 1;
+      else
+        break;
+    }
+
+    if (num_samples < num_cores + gpu_chunk_factor * num_devices) {
       num_chunks = std::min(num_cores, num_samples);
     } else {
-      num_devices_to_use = num_devices;
-      size_t compute_resources =
-          num_cores + num_devices * device_compute_factor;
+      size_t compute_resources = num_cores + num_devices * gpu_chunk_factor;
       size_t gpu_chunk_size =
           std::ceil((double)num_samples / compute_resources) *
-          device_compute_factor;
+          gpu_chunk_factor;
       size_t scale_factor =
          std::ceil((double)gpu_chunk_size / max_samples_per_chunk);
      num_chunks = num_cores * scale_factor;
@@ -698,7 +683,8 @@ struct Stream {
     for (auto i : inputs)
       i->dep->split_dependence(num_chunks, num_gpu_chunks,
-                               (i->ct_stream) ? 0 : 1, i->const_stream);
+                               (i->ct_stream) ? 0 : 1, i->const_stream,
+                               gpu_chunk_factor);
     for (auto iv : intermediate_values) {
       if (iv->need_new_gen()) {
         iv->put(new Dependence(split_location,
@@ -708,21 +694,67 @@ struct Stream {
       }
     }
     for (auto o : outputs) {
-      if (o->need_new_gen()) {
-        o->put(new Dependence(split_location,
-                              {nullptr, nullptr, 0, {0, 0}, {0, 0}}, nullptr,
-                              false, false, split_chunks));
-        o->dep->chunks.resize(num_chunks + num_gpu_chunks, nullptr);
+      if (!o->need_new_gen())
+        continue;
+      std::function get_output_size =
+          [&](Stream *s) -> uint64_t {
+        uint64_t res = 0;
+        // If this stream is not produced within SDFG, we could use
+        // the input size. For now return 0.
+        if (s->producer == nullptr)
+          return 0;
+        // If the producer process has an output size registered,
+        // return it.
+        if (s->producer->output_size.val > 0)
+          return s->producer->output_size.val;
+        // Finally we look for sizes from inputs to the producer if
+        // we don't have it registered as poly size does not change
+        // in operators that do not register size.
+        for (auto p : s->producer->input_streams) {
+          uint64_t p_size = get_output_size(p);
+          if (p_size == 0)
+            continue;
+          if (res == 0)
+            res = get_output_size(p);
+          else
+            assert(res == p_size);
+        }
+        return res;
+      };
+      MemRef2 out_mref;
+      bool allocated = false;
+      if (o == this) {
+        out_mref = out;
+      } else {
+        uint64_t output_size = get_output_size(o);
+        out_mref = {0, 0, 0, {num_samples, output_size}, {output_size, 1}};
+        size_t data_size = memref_get_data_size(out_mref);
+        out_mref.allocated = out_mref.aligned = (uint64_t *)malloc(data_size);
+        allocated = true;
+      }
+
+      o->put(new Dependence(split_location, out_mref, nullptr, false, allocated,
+                            split_chunks));
+      o->dep->chunks.resize(num_chunks + num_gpu_chunks, nullptr);
+    }
+    for (auto o : outputs) {
+      o->dep->chunking_schedule.clear();
+      for (auto i : inputs) {
+        size_t cdim = (i->ct_stream) ? 0 : 1;
+        if (i->dep->host_data.sizes[cdim] == num_samples) {
+          for (auto c : i->dep->chunks)
+            o->dep->chunking_schedule.push_back(c->host_data.sizes[cdim]);
+          break;
+        }
       }
     }
-
     // Execute graph
     std::list workers;
     std::list gpu_schedulers;
     std::vector> gpu_chunk_list;
     gpu_chunk_list.resize(num_devices);
     int32_t dev = 0;
-    for (size_t c = 0; c < num_chunks + num_gpu_chunks; ++c) {
+    for (int c = num_chunks + num_gpu_chunks - 1; c >= 0; --c) {
       if (!subgraph_bootstraps) {
         workers.push_back(std::thread(
             [&](std::list queue, size_t c, int32_t host_location) {
@@ -738,7 +770,21 @@ struct Stream {
       workers.push_back(std::thread(
           [&](std::list queue, size_t c, int32_t host_location) {
             for (auto p : queue) {
-              schedule_kernel(p, host_location, c, nullptr);
+              Stream *os = p->output_streams[0];
+              auto it = std::find(outputs.begin(), outputs.end(), os);
+              if (it == outputs.end()) {
+                schedule_kernel(p, host_location, c, nullptr);
+              } else {
+                size_t data_offset = 0;
+                for (int32_t ch = 0; ch < c; ++ch)
+                  data_offset +=
+                      outputs.front()->dep->chunking_schedule[ch] *
+                      os->dep->host_data.sizes[1] * sizeof(uint64_t);
+                schedule_kernel(
+                    p, host_location, c,
+                    (uint64_t *)(((char *)os->dep->host_data.aligned) +
+                                 data_offset));
+              }
             }
             for (auto iv : intermediate_values)
               if (iv->consumers.size() == 1)
@@ -761,15 +807,34 @@ struct Stream {
               assert(status == cudaSuccess);
               cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem);
              assert(status == cudaSuccess);
-              for (auto p : queue)
+              for (auto p : queue) {
                 schedule_kernel(p, dev, c, nullptr);
+                for (auto out_str : p->output_streams) {
+                  // For all output streams, if this is an output,
+                  // schedule copy out of the data produced by this
+                  // process.
+                  if (auto it =
+                          std::find(outputs.begin(), outputs.end(), out_str);
+                      it != outputs.end()) {
+                    out_str->dep->merge_output_off_device(c, dfg);
+                    continue;
+                  }
+                  // If this is not an output, but some process is not
+                  // part of this subgraph, we need to copy the data
+                  // out.
+                  for (auto cons_proc : out_str->consumers)
+                    if (auto it =
+                            std::find(queue.begin(), queue.end(), cons_proc);
+                        it == queue.end())
+                      out_str->dep->copy_chunk_off_device(c, dfg);
+                }
+              }
+              for (auto i : inputs)
+                i->dep->free_chunk_device_data(c, dfg);
               for (auto iv : intermediate_values)
-                if (iv->consumers.size() > 1)
-                  iv->dep->move_chunk_off_device(c, dfg);
-                else
-                  iv->dep->free_chunk_device_data(c, dfg);
+                iv->dep->free_chunk_device_data(c, dfg);
               for (auto o : outputs)
-                o->dep->move_chunk_off_device(c, dfg);
+                o->dep->free_chunk_device_data(c, dfg);
               cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(dev));
             }
           },
@@ -782,65 +847,43 @@ struct Stream {
       gs.join();
     gpu_schedulers.clear();
     // Build output out of the separate chunks processed
-    for (auto o : outputs) {
-      assert(o->batched_stream && o->ct_stream &&
-             "Only operations with ciphertext output supported.");
-      o->dep->merge_dependence(dfg);
-    }
+    for (auto o : outputs)
+      o->dep->finalize_merged_dependence(dfg);
+    for (dev = 0; dev < num_devices; ++dev)
+      cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(dev));
     // We will assume that only one subgraph is being processed per
    // DFG at a time, so we can safely free these here.
     dfg->free_stream_order_dependent_data();
     return;
   }
-  Dependence *get_on_host(MemRef2 &out) {
+  void get_on_host(MemRef2 &out) {
+    // Schedule the execution of the SDFG subgraph required to compute
+    // the value requested
     schedule_work(out);
-    assert(dep != nullptr && "GET on empty stream not allowed.");
-    // If this was already copied to host, copy out
-    if (dep->onHostReady) {
-      memref_copy_contiguous(out, dep->host_data);
-      return dep;
-    } else if (dep->location == split_location) {
-      char *pos = (char *)(out.aligned + out.offset);
-      std::list devices_used;
-      for (auto c : dep->chunks) {
-        size_t data_size = memref_get_data_size(c->host_data);
-        cuda_memcpy_async_to_cpu(
-            pos, c->device_data, data_size,
-            (cudaStream_t *)dfg->get_gpu_stream(c->location), c->location);
-        pos += data_size;
-        devices_used.push_back(c->location);
-      }
-      // We should only synchronize devices that had data chunks
-      devices_used.sort();
-      devices_used.unique();
-      for (auto i : devices_used)
-        cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(i));
-    } else {
-      size_t data_size = memref_get_data_size(dep->host_data);
-      cuda_memcpy_async_to_cpu(out.aligned + out.offset, dep->device_data,
-                               data_size, (cudaStream_t *)dfg->gpu_stream,
-                               dep->location);
-      cudaStreamSynchronize(*(cudaStream_t *)dfg->gpu_stream);
-    }
-    // After this synchronization point, all of the host-side
-    // allocated memory can be freed as we know all asynchronous
-    // operations have finished.
-    dfg->free_stream_order_dependent_data();
+
+    // The result should already have been copied to host, nothing to
+    // do (synchronization of transfers to host are pre-synchronized
+    // in schedule_work).
+    assert(dep != nullptr && dep->onHostReady);
+
+    // For now we make a copy of this dependence for future use as we
+    // can't assume that the output location will remain live until
+    // the next use.
+    // TODO: eliminate this copy.
     if (!dep->hostAllocated)
       dep->host_data = memref_copy_alloc(out);
     dep->onHostReady = true;
     dep->hostAllocated = true;
-    return dep;
   }
   Dependence *get(int32_t location, int32_t chunk_id = single_chunk) {
     assert(dep != nullptr && "Dependence could not be computed.");
     assert(chunk_id != split_chunks);
     if (chunk_id != single_chunk) {
       Dependence *d = dep->chunks[chunk_id];
-      d->copy(location, dfg);
+      d->copy(location, dfg, false);
       return d;
     }
-    dep->copy(location, dfg);
+    dep->copy(location, dfg, false);
     return dep;
   }
   inline bool need_new_gen(int32_t chunk_id = single_chunk) {
@@ -859,6 +902,13 @@ struct Stream {
   }
 };

+void GPU_DFG::free_streams() {
+  streams.sort();
+  streams.unique();
+  for (auto s : streams)
+    delete s;
+}
+
 static inline mlir::concretelang::gpu_dfg::Process *
 make_process_1_1(void *dfg, void *sin1, void *sout,
                  void (*fun)(Process *, int32_t, int32_t, uint64_t *)) {
@@ -878,6 +928,7 @@ make_process_1_1(void *dfg, void *sin1, void *sout,
   p->dfg->register_stream(s1);
   p->dfg->register_stream(so);
   p->batched_process = s1->batched_stream;
+  p->output_size.val = 0;
   return p;
 }

@@ -905,6 +956,7 @@ make_process_2_1(void *dfg, void *sin1, void *sin2, void *sout,
   p->dfg->register_stream(s2);
   p->dfg->register_stream(so);
   p->batched_process = s1->batched_stream;
+  p->output_size.val = 0;
   return p;
 }

@@ -953,8 +1005,11 @@ void memref_keyswitch_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
                                       uint64_t *out_ptr) {
   auto sched = [&](Dependence *d) {
     uint64_t num_samples = d->host_data.sizes[0];
-    MemRef2 out = {
-        0, 0, 0, {num_samples, p->output_size.val}, {p->output_size.val, 1}};
+    MemRef2 out = {out_ptr,
+                   out_ptr,
+                   0,
+                   {num_samples, p->output_size.val},
+                   {p->output_size.val, 1}};
     size_t data_size = memref_get_data_size(out);
     if (loc == host_location) {
       // If it is not profitable to offload, schedule kernel on CPU
@@ -968,8 +1023,8 @@ void memref_keyswitch_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
           d->host_data.strides[1], p->level.val, p->base_log.val,
           p->input_lwe_dim.val, p->output_lwe_dim.val, p->sk_index.val,
           p->ctx.val);
-      Dependence *dep =
-          new Dependence(loc, out, nullptr, true, true, d->chunk_id);
+      Dependence *dep = new Dependence(loc, out, nullptr, true,
+                                       (out_ptr == nullptr), d->chunk_id);
       return dep;
     } else {
       // Schedule the keyswitch kernel on the GPU
@@ -995,7 +1050,8 @@ void memref_keyswitch_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
 void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
                                       uint64_t *out_ptr) {
   assert(p->output_size.val == p->glwe_dim.val * p->poly_size.val + 1);
-
+  if (!p->output_streams[0]->need_new_gen(chunk_id))
+    return;
   Dependence *idep1 = p->input_streams[1]->get(host_location, chunk_id);
   MemRef2 &mtlu = idep1->host_data;
   uint32_t num_lut_vectors = mtlu.sizes[0];
@@ -1019,8 +1075,11 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
                    std::vector &lut_indexes, cudaStream_t *s, int32_t loc) {
     uint64_t num_samples = d0->host_data.sizes[0];
-    MemRef2 out = {
-        0, 0, 0, {num_samples, p->output_size.val}, {p->output_size.val, 1}};
+    MemRef2 out = {out_ptr,
+                   out_ptr,
+                   0,
+                   {num_samples, p->output_size.val},
+                   {p->output_size.val, 1}};
     size_t data_size = memref_get_data_size(out);

     // Move test vector indexes to the GPU, the test vector indexes is set of 0
@@ -1061,8 +1120,8 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
           d1->host_data.strides[1], p->input_lwe_dim.val, p->poly_size.val,
           p->level.val, p->base_log.val,
           p->glwe_dim.val, p->sk_index.val, p->ctx.val);
-      Dependence *dep =
-          new Dependence(loc, out, nullptr, true, true, d0->chunk_id);
+      Dependence *dep = new Dependence(loc, out, nullptr, true,
+                                       (out_ptr == nullptr), d0->chunk_id);
       free(glwe_ct);
       return dep;
     } else {
@@ -1125,8 +1184,8 @@ void memref_add_lwe_ciphertexts_u64_process(Process *p, int32_t loc,
     assert(d0->host_data.sizes[1] == d1->host_data.sizes[1]);
     assert(d0->chunk_id == d1->chunk_id);
     uint64_t num_samples = d0->host_data.sizes[0];
-    MemRef2 out = {0,
-                   0,
+    MemRef2 out = {out_ptr,
+                   out_ptr,
                    0,
                    {num_samples, d0->host_data.sizes[1]},
                    {d0->host_data.sizes[1], 1}};
@@ -1144,8 +1203,8 @@ void memref_add_lwe_ciphertexts_u64_process(Process *p, int32_t loc,
           d1->host_data.aligned, d1->host_data.offset, d1->host_data.sizes[0],
           d1->host_data.sizes[1], d1->host_data.strides[0],
           d1->host_data.strides[1]);
-      Dependence *dep =
-          new Dependence(loc, out, nullptr, true, true, d0->chunk_id);
+      Dependence *dep = new Dependence(loc, out, nullptr, true,
+                                       (out_ptr == nullptr), d0->chunk_id);
       return dep;
     } else {
       // Schedule the kernel on the GPU
@@ -1175,8 +1234,8 @@ void memref_add_plaintext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
            d1->host_data.sizes[1] == 1);
     assert(d0->chunk_id == d1->chunk_id);
     uint64_t num_samples = d0->host_data.sizes[0];
-    MemRef2 out = {0,
-                   0,
+    MemRef2 out = {out_ptr,
+                   out_ptr,
                    0,
                    {num_samples, d0->host_data.sizes[1]},
                    {d0->host_data.sizes[1], 1}};
@@ -1202,8 +1261,8 @@ void memref_add_plaintext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
           d0->host_data.strides[1], d1->host_data.allocated,
           d1->host_data.aligned, d1->host_data.offset, d1->host_data.sizes[1],
           d1->host_data.strides[1]);
-      Dependence *dep =
-          new Dependence(loc, out, nullptr, true, true, d0->chunk_id);
+      Dependence *dep = new Dependence(loc, out, nullptr, true,
+                                       (out_ptr == nullptr), d0->chunk_id);
       return dep;
     } else {
       // Schedule the kernel on the GPU
@@ -1233,8 +1292,8 @@ void memref_mul_cleartext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
            d1->host_data.sizes[1] == 1);
     assert(d0->chunk_id == d1->chunk_id);
     uint64_t num_samples = d0->host_data.sizes[0];
-    MemRef2 out = {0,
-                   0,
+    MemRef2 out = {out_ptr,
+                   out_ptr,
                    0,
                    {num_samples, d0->host_data.sizes[1]},
                    {d0->host_data.sizes[1], 1}};
@@ -1260,8 +1319,8 @@ void memref_mul_cleartext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
           d0->host_data.strides[1], d1->host_data.allocated,
           d1->host_data.aligned, d1->host_data.offset, d1->host_data.sizes[1],
           d1->host_data.strides[1]);
-      Dependence *dep =
-          new Dependence(loc, out, nullptr, true, true, d0->chunk_id);
+      Dependence *dep = new Dependence(loc, out, nullptr, true,
+                                       (out_ptr == nullptr), d0->chunk_id);
       return dep;
     } else {
       // Schedule the keyswitch kernel on the GPU
@@ -1287,8 +1346,8 @@ void memref_negate_lwe_ciphertext_u64_process(Process *p, int32_t loc,
                                               uint64_t *out_ptr) {
   auto sched = [&](Dependence *d0, cudaStream_t *s, int32_t loc) {
     uint64_t num_samples = d0->host_data.sizes[0];
-    MemRef2 out = {0,
-                   0,
+    MemRef2 out = {out_ptr,
+                   out_ptr,
                    0,
                    {num_samples, d0->host_data.sizes[1]},
                    {d0->host_data.sizes[1], 1}};
@@ -1303,8 +1362,8 @@ void memref_negate_lwe_ciphertext_u64_process(Process *p, int32_t loc,
           d0->host_data.aligned, d0->host_data.offset, d0->host_data.sizes[0],
           d0->host_data.sizes[1], d0->host_data.strides[0],
           d0->host_data.strides[1]);
-      Dependence *dep =
-          new Dependence(loc, out, nullptr, true, true, d0->chunk_id);
+      Dependence *dep = new Dependence(loc, out, nullptr, true,
+                                       (out_ptr == nullptr), d0->chunk_id);
       return dep;
     } else {
       // Schedule the kernel on the GPU
@@ -1587,6 +1646,7 @@ void stream_emulator_get_memref_batch(void *stream, uint64_t *out_allocated,
                                       uint64_t out_offset, uint64_t out_size0,
                                       uint64_t out_size1, uint64_t out_stride0,
                                       uint64_t out_stride1) {
+  static size_t count = 0;
   assert(out_stride1 == 1 && "Strided memrefs not supported");
   MemRef2 mref = {out_allocated, out_aligned,