diff --git a/ChASE-MPI/mpi_wrapper.hpp b/ChASE-MPI/mpi_wrapper.hpp index fc77569..0cfca27 100644 --- a/ChASE-MPI/mpi_wrapper.hpp +++ b/ChASE-MPI/mpi_wrapper.hpp @@ -150,6 +150,7 @@ void Memcpy(int mode, void* dst, const void* src, std::size_t count) #if defined(CUDA_AWARE) case CPY_D2D: cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice); + cudaDeviceSynchronize(); break; case CPY_D2H: cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost);