diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl
new file mode 100644
index 00000000..5bb5f5a7
--- /dev/null
+++ b/ext/AMDGPUExt/AMDGPUExt.jl
@@ -0,0 +1,14 @@
+module AMDGPUExt
+
+using AMDGPU
+using KernelAbstractions
+
+import FastIce.Architecture: set_device!, heuristic_groupsize
+
+set_device!(dev::HIPDevice) = device!(dev)
+
+heuristic_groupsize(::ROCBackend, ::Val{1}) = (256, )
+heuristic_groupsize(::ROCBackend, ::Val{2}) = (128, 2, )
+heuristic_groupsize(::ROCBackend, ::Val{3}) = (128, 2, 1, )
+
+end
\ No newline at end of file
diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl
new file mode 100644
index 00000000..27f8841f
--- /dev/null
+++ b/ext/CUDAExt/CUDAExt.jl
@@ -0,0 +1,14 @@
+module CUDAExt
+
+using CUDA
+using KernelAbstractions
+
+import FastIce.Architecture: set_device!, heuristic_groupsize
+
+set_device!(dev::CuDevice) = device!(dev)
+
+heuristic_groupsize(::CUDABackend, ::Val{1}) = (256, )
+heuristic_groupsize(::CUDABackend, ::Val{2}) = (32, 8, )
+heuristic_groupsize(::CUDABackend, ::Val{3}) = (32, 8, 1, )
+
+end
\ No newline at end of file
diff --git a/scripts_future_API/tm_stokes_wip.jl b/scripts_future_API/tm_stokes_wip.jl
index f66f80ea..d532e2ec 100644
--- a/scripts_future_API/tm_stokes_wip.jl
+++ b/scripts_future_API/tm_stokes_wip.jl
@@ -94,7 +94,7 @@ Isothermal._apply_bcs!(model.backend, model.grid, model.fields, model.boundary_c
 set!(model.fields.η, other_fields.A)
 extrapolate!(model.fields.η)
 
-for it in 1:10
+for it in 1:100
     advance_iteration!(model, 0.0, 1.0; async = false)
     if it % 10 == 0
         plt.Pr[3][] = interior(model.fields.Pr)[:, :, size(grid,3)÷2]
diff --git a/src/Distributed/distributed.jl b/src/Distributed/distributed.jl
new file mode 100644
index 00000000..ac045a45
--- /dev/null
+++ b/src/Distributed/distributed.jl
@@ -0,0 +1,47 @@
+module Distributed
+
+using FastIce.Architecture
+using FastIce.Grids
+
+export CartesianTopology
+
+export global_rank, shared_rank, node_name, cartesian_communicator, shared_communicator
+
+export dimensions, global_size, node_size
+
+export global_grid_size, local_grid
+
+export split_ndrange
+
+using FastIce.Grids
+
+using MPI
+
+include("topology.jl")
+
+include("split_ndrange.jl")
+
+struct DistributedArchitecture{C,T,R} <: AbstractArchitecture
+    child_arch::C
+    topology::T
+    ranges::R
+end
+
+device(arch::DistributedArchitecture) = device(arch.child_arch)
+
+function launch!(arch::DistributedArchitecture, grid::CartesianGrid, kernel::Pair{Kernel,Args}; boundary_conditions=nothing, async=true) where {Args}
+    fun, args = kernel
+
+    worksize = size(grid, Vertex())
+    groupsize = heuristic_groupsize(arch.child_arch)
+
+    fun(arch.backend, groupsize)(args...; ndrange=size(arch.ranges[end]), offset=first(arch.ranges[end]))
+
+
+    isnothing(boundary_conditions) || apply_boundary_conditions!(boundary_conditions)
+
+    async || synchronize(arch.backend)
+    return
+end
+
+end
\ No newline at end of file
diff --git a/src/Distributed/exchanger.jl b/src/Distributed/exchanger.jl
new file mode 100644
index 00000000..2659ed04
--- /dev/null
+++ b/src/Distributed/exchanger.jl
@@ -0,0 +1,72 @@
+mutable struct Exchanger
+    @atomic done::Bool
+    top::Base.Event
+    bottom::Base.Event
+    @atomic err
+    task::Task
+
+    function Exchanger(f::F, arch::AbstractArchitecture, comm, rank, halo, border) where F
+        top = Base.Event(true)
+        bottom = Base.Event(true)
+
+        send_buf = similar(border)
+        recv_buf = similar(halo)
+        this = new(false, top, bottom, nothing)
+
+        has_neighbor = rank != MPI.PROC_NULL
+        compute_bc = !has_neighbor
+
+        this.task = Threads.@spawn begin
+            set_device!(device(arch))
+            KernelAbstractions.priority!(backend(arch), :high)
+            try
+                while !(@atomic this.done)
+                    wait(top)
+                    if has_neighbor
+                        recv = MPI.Irecv!(recv_buf, comm; source=rank)
+                    end
+                    f(compute_bc)
+                    if has_neighbor
+                        copyto!(send_buf, border)
+                        send = MPI.Isend(send_buf, comm; dest=rank)
+                        cooperative_test!(recv)
+                        copyto!(halo, recv_buf)
+                        cooperative_test!(send)
+                    end
+                    notify(bottom)
+                end
+            catch err
+                @show err
+                @atomic this.done = true
+                @atomic this.err = err
+            end
+        end
+        errormonitor(this.task)
+        return this
+    end
+end
+
+setdone!(exc::Exchanger) = @atomic exc.done = true
+
+Base.isdone(exc::Exchanger) = @atomic exc.done
+
+function Base.notify(exc::Exchanger)
+    if !(@atomic exc.done)
+        notify(exc.top)
+    else
+        error("notify: Exchanger is not running")
+    end
+end
+function Base.wait(exc::Exchanger)
+    if !(@atomic exc.done)
+        wait(exc.bottom)
+    else
+        error("wait: Exchanger is not running")
+    end
+end
+
+get_recv_view(::Val{1}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? 1 : Colon(), Val(ndims(A)))...)
+get_recv_view(::Val{2}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? size(A, D) : Colon(), Val(ndims(A)))...)
+
+get_send_view(::Val{1}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? 2 : Colon(), Val(ndims(A)))...)
+get_send_view(::Val{2}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? size(A, D) - 1 : Colon(), Val(ndims(A)))...)
diff --git a/src/Distributed/split_ndrange.jl b/src/Distributed/split_ndrange.jl
new file mode 100644
index 00000000..37c20128
--- /dev/null
+++ b/src/Distributed/split_ndrange.jl
@@ -0,0 +1,27 @@
+@inline subrange(nr,bw,I,::Val{1}) = 1:bw[I]
+@inline subrange(nr,bw,I,::Val{2}) = (size(nr,I)-bw[I]+1):size(nr,I)
+@inline subrange(nr,bw,I,::Val{3}) = (bw[I]+1):(size(nr,I)-bw[I])
+
+@inline split_ndrange(ndrange,ndwidth) = split_ndrange(CartesianIndices(ndrange),ndwidth)
+
+function split_ndrange(ndrange::CartesianIndices{N},ndwidth::NTuple{N,<:Integer}) where N
+    @assert all(size(ndrange) .> ndwidth.*2)
+    @inline ndsubrange(I,::Val{J}) where J = ntuple(Val(N)) do idim
+        if idim < I
+            1:size(ndrange,idim)
+        elseif idim == I
+            subrange(ndrange,ndwidth,idim,Val(J))
+        else
+            subrange(ndrange,ndwidth,idim,Val(3))
+        end
+    end
+    ndinner = ntuple(idim -> subrange(ndrange,ndwidth,idim,Val(3)), Val(N))
+    return ntuple(Val(2N+1)) do i
+        if i == 2N+1
+            ndrange[ndinner...]
+        else
+            idim,idir = divrem(i-1,2) .+ 1
+            ndrange[ndsubrange(idim,Val(idir))...]
+        end
+    end
+end
\ No newline at end of file
diff --git a/src/distributed.jl b/src/Distributed/topology.jl
similarity index 83%
rename from src/distributed.jl
rename to src/Distributed/topology.jl
index f8548383..e5a0c81c 100644
--- a/src/distributed.jl
+++ b/src/Distributed/topology.jl
@@ -1,23 +1,10 @@
-module Distributed
-
-export CartesianTopology
-
-export global_rank, shared_rank, node_name, cartesian_communicator, shared_communicator
-
-export dimensions, global_size, node_size
-
-export global_grid_size, local_grid
-
-using FastIce.Grids
-
-using MPI
-
 struct CartesianTopology{N}
     nprocs::Int
     dims::NTuple{N,Int}
     global_rank::Int
     shared_rank::Int
     cart_coords::NTuple{N,Int}
+    neighbors::NTuple{N,NTuple{2,Int}}
     comm::MPI.Comm
     cart_comm::MPI.Comm
     shared_comm::MPI.Comm
@@ -34,7 +21,11 @@ function CartesianTopology(dims::NTuple{N,Int}; comm = MPI.COMM_WORLD) where {N}
     node_name = MPI.Get_processor_name()
     cart_coords = Tuple(MPI.Cart_coords(cart_comm))
 
-    return CartesianTopology{N}(nprocs, dims, global_rank, shared_rank, cart_coords, comm, cart_comm, shared_comm, node_name)
+    neighbors = ntuple(Val(N)) do dim
+        MPI.Cart_shift(cart_comm, dim-1, 1)
+    end
+
+    return CartesianTopology{N}(nprocs, dims, global_rank, shared_rank, cart_coords, neighbors, comm, cart_comm, shared_comm, node_name)
 end
 
 global_rank(t::CartesianTopology) = t.global_rank
@@ -49,6 +40,10 @@ shared_communicator(t::CartesianTopology) = t.shared_comm
 
 dimensions(t::CartesianTopology) = t.dims
 
+coordinates(t::CartesianTopology) = t.cart_coords
+
+neighbors(t::CartesianTopology) = t.neighbors
+
 global_size(t::CartesianTopology) = MPI.Comm_size(t.cart_comm)
 
 node_size(t::CartesianTopology) = MPI.Comm_size(t.shared_comm)
@@ -60,6 +55,4 @@ function local_grid(g::CartesianGrid, t::CartesianTopology)
     local_origin = origin(g) .+ local_extent .* t.cart_coords
 
     return CartesianGrid(local_origin, local_extent, local_size)
-end
-
 end
\ No newline at end of file
diff --git a/src/Models/full_stokes/isothermal/isothermal.jl b/src/Models/full_stokes/isothermal/isothermal.jl
index d510e2f7..20ead3e2 100644
--- a/src/Models/full_stokes/isothermal/isothermal.jl
+++ b/src/Models/full_stokes/isothermal/isothermal.jl
@@ -85,9 +85,14 @@ function advance_iteration!(model::IsothermalFullStokesModel, t, Δt; async = tr
     set_bcs!(bcs) = _apply_bcs!(model.backend, model.grid, model.fields, bcs)
 
     # stress
+
+    # launch!(arch, grid, update_σ!, Pr, τ, V, η, Δτ, Δ)
+
     update_σ!(backend, 256, (nx+1, ny+1, nz+1))(Pr, τ, V, η, Δτ, Δ)
     set_bcs!(model.boundary_conditions.stress)
     # velocity
+
+    # launch!(arch, grid, (update_res_V! => (rV, V, Pr, τ, η, Δτ, Δ), update_V! => (V, rV, dt)); exchangers = exchangers.velocity, boundary_conditions = boundary_conditions.velocity)
     update_V!(backend, 256, (nx+1, ny+1, nz+1))(V, Pr, τ, η, Δτ, Δ)
     set_bcs!(model.boundary_conditions.velocity)
     # rheology
diff --git a/src/architecture.jl b/src/architecture.jl
new file mode 100644
index 00000000..963ad350
--- /dev/null
+++ b/src/architecture.jl
@@ -0,0 +1,48 @@
+module Architecture
+
+export AbstractArchitecture
+
+export SingleDeviceArchitecture
+
+export launch!, set_device!, heuristic_groupsize
+
+using FastIce.Grids
+
+using KernelAbstractions
+import KernelAbstractions.Kernel
+
+abstract type AbstractArchitecture end
+
+set_device!(arch::AbstractArchitecture) = set_device!(device(arch))
+
+heuristic_groupsize(arch::AbstractArchitecture) = heuristic_groupsize(device(arch))
+
+struct SingleDeviceArchitecture{B,D} <: AbstractArchitecture
+    backend::B
+    device::D
+end
+
+set_device!(::SingleDeviceArchitecture{CPU}) = nothing
+
+heuristic_groupsize(::SingleDeviceArchitecture{CPU}) = 256
+
+device(arch::SingleDeviceArchitecture) = arch.device
+
+function launch!(arch::SingleDeviceArchitecture, grid::CartesianGrid, kernel::Pair{Kernel,Args}; kwargs...) where {Args}
+    worksize = size(grid, Vertex())
+    launch!(arch, worksize, kernel; kwargs...)
+end
+
+function launch!(arch::SingleDeviceArchitecture, worksize::NTuple{N,Int}, kernel::Pair{Kernel,Args}; boundary_conditions=nothing, async=true) where {N,Args}
+    fun, args = kernel
+
+    groupsize = heuristic_groupsize(device(arch))
+
+    fun(arch.backend, groupsize, worksize)(args...)
+    isnothing(boundary_conditions) || apply_boundary_conditions!(boundary_conditions)
+
+    async || synchronize(arch.backend)
+    return
+end
+
+end
\ No newline at end of file
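Usage sketch (reviewer note, not part of the patch): what split_ndrange from src/Distributed/split_ndrange.jl returns for a small 3D range. The concrete sizes below are illustrative assumptions, and the definitions from that file are assumed to be in scope.

ndrange = (8, 8, 8)   # local domain size (assumed for illustration)
ndwidth = (2, 2, 2)   # boundary width per dimension (assumed for illustration)

ranges = split_ndrange(ndrange, ndwidth)

# `ranges` holds 2N+1 = 7 disjoint blocks of CartesianIndices that tile the full range:
# ranges[1:2]: lower/upper boundary slabs along dim 1 (restricted to the interior of dims 2 and 3),
# ranges[3:4]: boundary slabs along dim 2 (full along dim 1, interior along dim 3),
# ranges[5:6]: boundary slabs along dim 3 (full along dims 1 and 2),
# ranges[end]: the inner block, which can be computed while halo exchange is in flight.
@assert length(ranges) == 7
@assert size(ranges[end]) == (4, 4, 4)   # (8 - 2*2) in every dimension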