diff --git a/Project.toml b/Project.toml
index 28149ba6..b797ed0d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "FastIce"
 uuid = "e0de9f13-a007-490e-b696-b07d031015ca"
-authors = ["Ludovic Raess , Ivan Utkin and contributors"]
+authors = ["Ludovic Raess , Ivan Utkin , Samuel Omlin and contributors"]
 version = "0.2.0"
 
 [deps]
@@ -8,6 +8,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 ElasticArrays = "fdbdab4c-e67f-52f5-8c3f-e7b388dad3d4"
 GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326"
 HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
+ImplicitGlobalGrid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -32,6 +33,7 @@ CUDA = "5"
 ElasticArrays = "1"
 GeometryBasics = "0.4"
 HDF5 = "0.17"
+ImplicitGlobalGrid = "0.15"
 KernelAbstractions = "0.9"
 LightXML = "0.9"
 MPI = "0.20"
diff --git a/src/Distributed/Distributed.jl b/src/Distributed/Distributed.jl
index 1afc2ef2..a6d42307 100644
--- a/src/Distributed/Distributed.jl
+++ b/src/Distributed/Distributed.jl
@@ -26,6 +26,7 @@ import FastIce.BoundaryConditions: apply_boundary_conditions!
 
 using MPI
 using KernelAbstractions
+import ImplicitGlobalGrid
 
 "Trait structure used as a type parameter to indicate that the Architecture is a distributed MPI Architecture."
 struct DistributedMPI end
diff --git a/src/Distributed/gather.jl b/src/Distributed/gather.jl
index b52ed47c..b9b8150e 100644
--- a/src/Distributed/gather.jl
+++ b/src/Distributed/gather.jl
@@ -7,29 +7,7 @@ The array will be gathered on the process with id `root` (`root=0` by default).
 Note that the memory for a global array should be allocated only on the process with id `root`, on other processes `dst` can be set to `nothing`.
 """
 function gather!(dst::Union{AbstractArray{T,N},Nothing}, src::AbstractArray{T,N}, comm::MPI.Comm; root=0) where {T,N}
-    dims, _, _ = MPI.Cart_get(comm)
-    dims = Tuple(dims)
-    if MPI.Comm_rank(comm) == root
-        # make subtype for gather
-        offset = Tuple(0 for _ in 1:N)
-        subtype = MPI.Types.create_subarray(size(dst), size(src), offset, MPI.Datatype(eltype(dst)))
-        subtype = MPI.Types.create_resized(subtype, 0, size(src, 1) * Base.elsize(dst))
-        MPI.Types.commit!(subtype)
-        # make VBuffer for collective communication
-        counts = fill(Cint(1), reverse(dims)) # gather one subarray from each MPI rank
-        displs = zeros(Cint, reverse(dims))   # reverse dims since MPI Cart comm is row-major
-        csizes = cumprod(size(src)[2:end] .* dims[1:end-1])
-        strides = (1, csizes...)
-        for I in CartesianIndices(displs)
-            offset = reverse(Tuple(I - oneunit(I)))
-            displs[I] = sum(offset .* strides)
-        end
-        recvbuf = MPI.VBuffer(dst, vec(counts), vec(displs), subtype)
-        MPI.Gatherv!(src, recvbuf, comm; root)
-    else
-        MPI.Gatherv!(src, nothing, comm; root)
-    end
-    return
+    ImplicitGlobalGrid.gather!(src, dst, comm; root=root)
 end
 
 """
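
For reference, a minimal usage sketch of the delegating `gather!` above (not part of the patch). The 2x2 process grid, the 4x4 local array size, and the `FastIce.Distributed` import path are assumptions for illustration; the script would be launched with 4 MPI ranks (e.g. `mpiexecjl -n 4 julia gather_demo.jl`).

```julia
using MPI
using FastIce.Distributed: gather!   # assumed import path for the gather! shown above

MPI.Init()
# 2x2 Cartesian communicator; the global array size is the local size times the process grid.
comm = MPI.Cart_create(MPI.COMM_WORLD, (2, 2))
me   = MPI.Comm_rank(comm)

nx, ny = 4, 4                     # local array size on every rank
src = fill(float(me), nx, ny)     # local data, tagged with the rank id

# Memory for the global array is allocated only on the root; other ranks pass `nothing`.
dst = me == 0 ? zeros(2nx, 2ny) : nothing

gather!(dst, src, comm)           # forwards to ImplicitGlobalGrid.gather!

me == 0 && @show dst
MPI.Finalize()
```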