diff --git a/.gitignore b/.gitignore
index 06175108..66c68edb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,8 +27,18 @@ docs/site/
 Manifest.toml
 LocalPreferences.toml
 GeoData/Manifest.toml
+
+# OSX-specific files
 .DS_Store
+
+# VSCode-specific files
 .vscode
+
+# Project-specific files
 out_visu/
 gitignore/
 data/
+*.h5
+*.xdmf3
+*.png
+*.mp4
diff --git a/Project.toml b/Project.toml
index 557aea98..df111601 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,23 +4,29 @@ authors = ["Ludovic Raess <ludovic.rass@gmail.com>, Ivan Utkin and contributors"
 version = "0.1.0"
 
 [deps]
-AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-CellArrays = "d35fcfd7-7af4-4c67-b1aa-d78070614af4"
+CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 ElasticArrays = "fdbdab4c-e67f-52f5-8c3f-e7b388dad3d4"
+FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
 GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326"
 HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
 ImplicitGlobalGrid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
+JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 NetCDF = "30363a11-5582-574a-97bb-aa9a979735b9"
-ParallelStencil = "94395366-693c-11ea-3b26-d9b7aac5d958"
-Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+TinyKernels = "f7cbc414-f748-44bf-86e6-e44e9a55e39d"
+UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
diff --git a/region.png b/region.png
new file mode 100644
index 00000000..585c034a
Binary files /dev/null and b/region.png differ
diff --git a/scripts2D_variational/app_inclusion2D.jl b/scripts2D_variational/app_inclusion2D.jl
new file mode 100644
index 00000000..63868090
--- /dev/null
+++ b/scripts2D_variational/app_inclusion2D.jl
@@ -0,0 +1,164 @@
+using FastIce
+using TinyKernels
+using CairoMakie
+using ElasticArrays
+using Printf
+
+include("bcs.jl")
+include("helpers_tmp.jl")
+include("level_sets.jl")
+include("stokes.jl")
+include("volume_fractions.jl")
+
+@views av1(A) = 0.5 .* (A[1:end-1] .+ A[2:end])  # midpoints of consecutive entries of a 1D array
+@views inn_x(A) = A[2:end-1,:]  # view of A without its first and last index along dim 1
+@views inn_y(A) = A[:,2:end-1]  # view of A without its first and last index along dim 2
+@views inn(A) = A[2:end-1,2:end-1]  # view of A without its outermost layer in both dims
+
+@views function runsim(::Type{DAT}; nx=127) where {DAT}  # 2D Stokes test with a circular inclusion; DAT = element type of all fields
+    # physics
+    # lx, ly   = 2.0, 1.0
+    lx, ly   = 1.0, 1.0
+    ox, oy   = -0.5lx, -0.5ly
+    # xb1, yb1 = ox + 0.5lx, oy + 0.0ly
+    # xb2, yb2 = ox + 0.5lx, oy + 3.0ly
+    xb1, yb1 = ox + 0.5lx, oy + 0.5ly  # inclusion centre = domain centre
+    rinc     = 0.14ly                  # inclusion radius
+    # rair     = 2.3ly
+    ηs0      = 1.0
+    ebg      = 1.0
+    ρg0      = 0.0
+    α        = 0.0
+    npow     = 1
+    # numerics
+    ny       = ceil(Int, (nx + 1) * ly / lx) - 1
+    maxiter  = 40nx
+    ncheck   = 2nx  # residuals and plots are refreshed every `ncheck` iterations
+    ϵtol     = (1e-6, 1e-6, 1e-6)  # tolerances for (Vx, Vy, Pr) residuals, in that order
+    nt       = 1
+    χ        = 1.0
+    # preprocessing
+    dx, dy   = lx / nx, ly / ny
+    xv, yv   = LinRange(ox, ox + lx, nx + 1), LinRange(oy, oy + ly, ny + 1)
+    xc, yc   = av1(xv), av1(yv)
+    mc1      = to_device(make_marker_chain_circle(Point(xb1, yb1), rinc, min(dx, dy)))
+    # mc2      = to_device(make_marker_chain_circle(Point(xb2, yb2), rair, min(dx, dy)))
+    ρg       = (x=ρg0 .* sin(α), y=ρg0 .* cos(α))
+    mpow     = -(1 - 1 / npow)  # exponent applied to εII in the viscosity update
+    # PT parameters
+    r        = 0.7
+    re_mech  = 6π
+    lτ       = min(lx, ly)
+    vdτ      = min(dx, dy) / sqrt(2.1)
+    θ_dτ     = lτ * (r + 4 / 3) / (re_mech * vdτ)
+    nudτ     = vdτ * lτ / re_mech
+    dτ_r     = 1.0 / (θ_dτ + 1.0)
+    # level set
+    Ψ  = (
+        not_air = field_array(DAT, nx + 1, ny + 1),  # liquid
+    )
+    wt = (
+        not_solid = volfrac_field(DAT, nx, ny), # fluid
+        not_air   = volfrac_field(DAT, nx, ny), # liquid
+    )
+    # mechanics
+    Pr = scalar_field(DAT, nx, ny)
+    τ  = tensor_field(DAT, nx, ny)
+    ε  = tensor_field(DAT, nx, ny)
+    V  = vector_field(DAT, nx, ny)
+    ηs = scalar_field(DAT, nx, ny)
+    τII = scalar_field(DAT, nx, ny)
+    εII = scalar_field(DAT, nx, ny)
+    # residuals
+    Res = (
+        Pr = scalar_field(DAT, nx    , ny    ),
+        V  = vector_field(DAT, nx - 2, ny - 2),
+    )
+    # visualisation
+    Vmag = field_array(DAT, nx - 2, ny - 2)
+    Ψav = (
+        not_solid = field_array(DAT, nx - 2, ny - 2),
+        not_air   = field_array(DAT, nx - 2, ny - 2),
+    )
+    # initial and boundary conditions
+    @info "computing the level set for the inclusion"
+    for comp in eachindex(Ψ) fill!(Ψ[comp], 1.0) end
+    init!(Pr, τ, V, ηs, ebg, ηs0, xv, yv)
+    fill!(τII, 0.0)
+    fill!(εII, 0.0)
+    Ψ.not_air .= Inf # needs init now: compute_levelset! combines with the existing values via `min`
+    compute_levelset!(Ψ.not_air, xv, yv, mc1)
+    # compute_levelset!(Ψ.not_air, xv, yv, mc2)
+    Ψ.not_air .= .-Ψ.not_air  # flip the sign of the level set
+
+    @info "computing volume fractions from level sets"
+    compute_volume_fractions_from_level_set!(wt.not_air, Ψ.not_air, dx, dy)
+    for comp in eachindex(wt.not_solid) fill!(wt.not_solid[comp], 1.0) end  # no solid phase in this setup
+
+    update_vis!(Vmag, Ψav, V, Ψ)
+    # convergence history
+    iter_evo = Float64[]
+    errs_evo = ElasticArray{Float64}(undef, length(ϵtol), 0)
+    # figures
+    fig = Figure(resolution=(2500, 1600), fontsize=32)
+    ax = (
+        Pr  =Axis(fig[1, 1][1, 1]; aspect=DataAspect(), title="p"),
+        τII =Axis(fig[1, 2][1, 1]; aspect=DataAspect(), title="τII"),
+        Vmag=Axis(fig[2, 1][1, 1]; aspect=DataAspect(), title="|v|"),
+        εII =Axis(fig[2, 2][1, 1]; aspect=DataAspect(), title="εII"),
+        wt  =Axis(fig[1, 3][1, 1]; aspect=DataAspect(), title="Volume fraction"),
+        errs=Axis(fig[2, 3]      ; yscale=log10, title="Convergence", xlabel="#iter/ny", ylabel="error"),
+    )
+    plt = (
+        fields=(
+            Pr  =heatmap!(ax.Pr  , xc, yc, to_host(Pr  ); colormap=:turbo),
+            τII =heatmap!(ax.τII , xc, yc, to_host(τII ); colormap=:turbo),
+            Vmag=heatmap!(ax.Vmag, xc, yc, to_host(Vmag); colormap=:turbo),
+            εII =heatmap!(ax.εII , xc, yc, to_host(εII ); colormap=:turbo),
+            wt  =heatmap!(ax.wt  , xc, yc, to_host(wt.not_air.c); colormap=Reverse(:grays)),
+        ),
+        errs=[scatterlines!(ax.errs, Point2.(iter_evo, errs_evo[ir, :])) for ir in eachindex(ϵtol)],
+    )
+    Colorbar(fig[1, 1][1, 2], plt.fields.Pr  )
+    Colorbar(fig[1, 2][1, 2], plt.fields.τII )
+    Colorbar(fig[2, 1][1, 2], plt.fields.Vmag)
+    Colorbar(fig[2, 2][1, 2], plt.fields.εII )
+    Colorbar(fig[1, 3][1, 2], plt.fields.wt  )
+    display(fig)
+
+    @info "running simulation 🚀"
+    for it in 1:nt
+        @printf "it # %d\n" it
+        # iteration loop
+        empty!(iter_evo); resize!(errs_evo, length(ϵtol), 0)
+        iter = 0; errs = 2.0 .* ϵtol  # start above tolerance so the loop is entered
+        while any(errs .>= ϵtol) && (iter += 1) <= maxiter
+            update_σ!(Pr, ε, τ, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy)
+            compute_invariants!(εII, τII, ε, τ, ηs, χ, mpow)
+            update_V!(V, Pr, τ, ηs, wt, nudτ, ρg, dx, dy)
+            if iter % ncheck == 0
+                compute_residual!(Res, Pr, V, τ, wt, ρg, dx, dy)
+                errs = (maximum(abs.(Res.V.x)), maximum(abs.(Res.V.y)), maximum(abs.(Res.Pr)))
+                @printf "  iter/nx # %2.1f, errs: [ Vx = %1.3e, Vy = %1.3e, Pr = %1.3e ]\n" iter / nx errs...
+                push!(iter_evo, iter / nx); append!(errs_evo, errs)
+                # visu
+                for ir in eachindex(plt.errs)
+                    plt.errs[ir][1] = Point2.(iter_evo, errs_evo[ir, :])
+                end
+                autolimits!(ax.errs)
+                update_vis!(Vmag, Ψav, V, Ψ)
+                plt.fields[1][3] = to_host(Pr)    # a single host transfer per field is enough
+                plt.fields[2][3] = to_host(τII)
+                plt.fields[3][3] = to_host(Vmag)
+                # plt.fields[4][3] = to_host(εII)
+                plt.fields[4][3] = to_host(log10.(ηs))  # NOTE: the panel titled "εII" currently shows log10(ηs)
+                plt.fields[5][3] = to_host(wt.not_air.c)
+                # plt.fields[4][3] = to_host(Ψ.not_air)
+                display(fig)
+            end
+        end
+    end
+    return
+end
+
+runsim(Float64, nx=127)
\ No newline at end of file
diff --git a/scripts2D_variational/bc_kernels.jl b/scripts2D_variational/bc_kernels.jl
new file mode 100644
index 00000000..2e926a98
--- /dev/null
+++ b/scripts2D_variational/bc_kernels.jl
@@ -0,0 +1,43 @@
+@tiny function _kernel_bc_x_dirichlet!(val,arrays...)  # write `val` to the first and last dim-1 index of every array
+    iy, = @indices
+    for A in arrays
+        if iy ∈ axes(A,2)  # arrays may differ in size; skip indices outside this one
+            @inbounds A[1  ,iy] = val
+            @inbounds A[end,iy] = val
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_y_dirichlet!(val, arrays...)  # write `val` to the first and last dim-2 index of every array
+    ix, = @indices
+    for A in arrays
+        if ix ∈ axes(A,1)  # arrays may differ in size; skip indices outside this one
+            @inbounds A[ix,1  ] = val
+            @inbounds A[ix,end] = val
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_x_neumann!(val, arrays...)  # boundary entry along dim 1 = adjacent interior entry + `val`
+    iy, = @indices
+    for A in arrays
+        if iy ∈ axes(A,2)  # arrays may differ in size; skip indices outside this one
+            @inbounds A[1  ,iy] = A[2    ,iy] + val
+            @inbounds A[end,iy] = A[end-1,iy] + val  # `val` is added as-is (no grid-spacing factor) on both sides
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_y_neumann!(val, arrays...)  # boundary entry along dim 2 = adjacent interior entry + `val`
+    ix, = @indices
+    for A in arrays
+        if ix ∈ axes(A,1)  # arrays may differ in size; skip indices outside this one
+            @inbounds A[ix,1  ] = A[ix,2    ] + val
+            @inbounds A[ix,end] = A[ix,end-1] + val  # `val` is added as-is (no grid-spacing factor) on both sides
+        end
+    end
+    return
+end
\ No newline at end of file
diff --git a/scripts2D_variational/bcs.jl b/scripts2D_variational/bcs.jl
new file mode 100644
index 00000000..72f48ecf
--- /dev/null
+++ b/scripts2D_variational/bcs.jl
@@ -0,0 +1,33 @@
+include("bc_kernels.jl")
+
+const _bc_x_dirichlet! = _kernel_bc_x_dirichlet!(get_device())
+const _bc_y_dirichlet! = _kernel_bc_y_dirichlet!(get_device())
+
+const _bc_x_neumann! = _kernel_bc_x_neumann!(get_device())
+const _bc_y_neumann! = _kernel_bc_y_neumann!(get_device())
+
+for fname in (:bc_x_dirichlet!, :bc_x_neumann!)  # generate host wrappers that launch `_<fname>` (constants defined above)
+    @eval begin
+        function $fname(val, arrays...)
+            ax = axes(arrays[1], 2)  # launch range must cover the longest dim-2 axis among `arrays`
+            for A in arrays[2:end]
+                ax = union(ax, axes(A, 2))  # plain `union`: merge the ranges, do not broadcast over their elements
+            end
+            wait($(Symbol(:_, fname))(val, arrays...; ndrange=ax))
+            return
+        end
+    end
+end
+
+for fname in (:bc_y_dirichlet!, :bc_y_neumann!)  # generate host wrappers that launch `_<fname>` (constants defined above)
+    @eval begin
+        function $fname(val, arrays...)
+            ax = axes(arrays[1], 1)  # launch range must cover the longest dim-1 axis among `arrays`
+            for A in arrays[2:end]
+                ax = union(ax, axes(A, 1))  # plain `union`: merge the ranges, do not broadcast over their elements
+            end
+            wait($(Symbol(:_, fname))(val, arrays...; ndrange=ax))
+            return
+        end
+    end
+end
\ No newline at end of file
diff --git a/scripts2D_variational/geometry.jl b/scripts2D_variational/geometry.jl
new file mode 100644
index 00000000..32f0eee7
--- /dev/null
+++ b/scripts2D_variational/geometry.jl
@@ -0,0 +1,24 @@
+using LinearAlgebra, GeometryBasics
+
+function make_marker_chain_circle(rc, rad, hmax)  # points on the circle (centre rc, radius rad), spacing at most hmax
+    nmarkers = ceil(Int, 2π * rad / hmax)
+    return map(k -> rc + rad .* Point2(reverse(sincospi(2 * (k - 1) / nmarkers))...), 1:nmarkers)
+end
+
+function signed_distance(p::Point2{T}, poly::AbstractVector{Point2{T}}) where {T}  # signed distance from p to closed polygon `poly`
+    d = dot(p - poly[1], p - poly[1])  # running minimum of the squared distance to the polygon
+    s = one(T)                         # sign; flipped for every edge that satisfies the crossing test below
+    j = length(poly)                   # index of the previous vertex; starts at the last one to close the loop
+    for i in eachindex(poly)
+        e = poly[j] - poly[i]
+        w = p - poly[i]
+        b = w - e .* clamp(dot(w, e) / dot(e, e), zero(T), one(T))  # typed bounds keep Float32 input in Float32
+        d = min(d, dot(b, b))
+        c = p[2] >= poly[i][2], p[2] < poly[j][2], e[1] * w[2] > e[2] * w[1]
+        if all(c) || all(.!c)
+            s = -s
+        end
+        j = i
+    end
+    return s * sqrt(d)
+end
\ No newline at end of file
diff --git a/scripts2D_variational/helpers_tmp.jl b/scripts2D_variational/helpers_tmp.jl
new file mode 100644
index 00000000..aaaf4f1a
--- /dev/null
+++ b/scripts2D_variational/helpers_tmp.jl
@@ -0,0 +1,67 @@
+@inline scalar_field(::Type{T}, nx, ny) where {T}  = field_array(T, nx, ny)  # one nx × ny array
+@inline vector_field(::Type{T}, nx, ny) where {T}  =  (x = field_array(T, nx + 1, ny    ),  # x component: one extra entry along dim 1
+                                                       y = field_array(T, nx    , ny + 1))  # y component: one extra entry along dim 2
+@inline tensor_field(::Type{T}, nx, ny) where {T}  = (xx = field_array(T, nx    , ny    ),  # normal components share the scalar-field size
+                                                      yy = field_array(T, nx    , ny    ),
+                                                      xy = field_array(T, nx - 1, ny - 1))  # shear component: one entry fewer in each dim
+@inline volfrac_field(::Type{T}, nx, ny) where {T} = (c  = field_array(T, nx    , ny    ),  # same four layouts as scalar / vector.x / vector.y / tensor.xy
+                                                      x  = field_array(T, nx + 1, ny    ),
+                                                      y  = field_array(T, nx    , ny + 1),
+                                                      xy = field_array(T, nx - 1, ny - 1))
+
+@tiny function _kernel_init!(Pr, τ, V, ηs, ebg, ηs0, xv, yv)  # initial state: zero pressure and stress, viscosity ηs0, linear velocity
+    ix, iy = @indices()
+    @inbounds if ix ∈ axes(Pr, 1) && iy ∈ axes(Pr, 2)  # each field has its own size, hence one guard per field
+        Pr[ix, iy]   = 0.0
+        τ.xx[ix, iy] = 0.0
+        τ.yy[ix, iy] = 0.0
+        ηs[ix, iy]   = ηs0
+    end
+    if ix ∈ axes(τ.xy, 1) && iy ∈ axes(τ.xy, 2)
+        @inbounds τ.xy[ix, iy] = 0.0
+    end
+    if ix ∈ axes(V.x, 1) && iy ∈ axes(V.x, 2)
+        @inbounds V.x[ix, iy] = -xv[ix] * ebg  # Vx = -ebg * x
+    end
+    if ix ∈ axes(V.y, 1) && iy ∈ axes(V.y, 2)
+        @inbounds V.y[ix, iy] = yv[iy] * ebg  # Vy = +ebg * y
+    end
+end  # NOTE(review): no explicit `return`, unlike most kernels in this directory — confirm `@tiny` does not require one
+
+@tiny function _kernel_update_vis_fields!(Vmag, Ψav, V, Ψ)  # fill the plotting fields Ψav.not_air and Vmag
+    ix, iy = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy)
+    @inbounds if isin(Ψav.not_air)  # guard on the array that is written (the smaller one)
+        pav = 0.0
+        for idy = 1:2, idx = 1:2
+            pav += Ψ.not_air[ix+idx, iy+idy]
+        end
+        Ψav.not_air[ix, iy] = pav / 4  # mean of the 2×2 = 4 values summed above
+    end
+    # @inbounds if isin(Ψav.not_solid)
+    #     pav = 0.0
+    #     for idy = 1:2, idx = 1:2
+    #         pav += Ψ.not_solid[ix+idx, iy+idy]
+    #     end
+    #     Ψav.not_solid[ix, iy] = pav / 4
+    # end
+    @inbounds if isin(Vmag)
+        vxc = 0.5 * (V.x[ix+1, iy+1] + V.x[ix+2, iy+1])  # average the two Vx values neighbouring along dim 1
+        vyc = 0.5 * (V.y[ix+1, iy+1] + V.y[ix+1, iy+2])  # average the two Vy values neighbouring along dim 2
+        Vmag[ix, iy] = sqrt(vxc^2 + vyc^2)
+    end
+    return
+end
+
+const _init! = _kernel_init!(get_device())
+const _update_vis! = _kernel_update_vis_fields!(get_device())
+
+function init!(Pr, τ, V, ηs, ebg, ηs0, xv, yv)  # run the init kernel over size(Pr) .+ 1 so the larger V arrays are covered
+    _init!(Pr, τ, V, ηs, ebg, ηs0, xv, yv; ndrange=size(Pr) .+ 1) |> wait
+    return nothing
+end
+
+function update_vis!(Vmag, Ψav, V, Ψ)  # refresh the plotting fields; blocks until the kernel has finished
+    _update_vis!(Vmag, Ψav, V, Ψ; ndrange=axes(Vmag)) |> wait
+    return nothing
+end
\ No newline at end of file
diff --git a/scripts2D_variational/level_set_kernels.jl b/scripts2D_variational/level_set_kernels.jl
new file mode 100644
index 00000000..adba3347
--- /dev/null
+++ b/scripts2D_variational/level_set_kernels.jl
@@ -0,0 +1,23 @@
+include("geometry.jl")
+
+@tiny function _kernel_compute_levelset_from_polygon!(op, ψ, xv, yv, mc)  # ψ ← op(ψ, signed distance from (xv[ix], yv[iy]) to polygon mc)
+    ix, iy = @indices
+    @inbounds ψ[ix, iy] = op(ψ[ix, iy], signed_distance(Point(xv[ix], yv[iy]), mc))
+end  # NOTE(review): no explicit `return`, unlike most kernels in this directory — confirm `@tiny` does not require one
+
+@tiny function _kernel_extrapolate_with_levelset!(∂A_∂τ,A,Ψ,Δx,Δy)  # upwinded rate of change of A along the normalised gradient of Ψ
+    @inline S(x) = x/sqrt(x^2+max(Δx,Δy)^2)  # smoothed sign function, regularised by the larger grid spacing
+    @inline Ψ_c(_ix,_iy) = 0.25*(Ψ[_ix,_iy] + Ψ[_ix+1,_iy] + Ψ[_ix,_iy+1] + Ψ[_ix+1,_iy+1])  # 4-point average
+    @inline Ψ_x(_ix,_iy) = 0.5*(Ψ[_ix,_iy] + Ψ[_ix+1,_iy])  # average of two neighbours along dim 1
+    @inline Ψ_y(_ix,_iy) = 0.5*(Ψ[_ix,_iy] + Ψ[_ix,_iy+1])  # average of two neighbours along dim 2
+    ix,iy = @indices
+    if Ψ[ix,iy] > 0 && Ψ[ix+1,iy] > 0 && Ψ[ix,iy+1] > 0 && Ψ[ix+1,iy+1] > 0  # only where all four surrounding Ψ values are positive
+        s     = S(Ψ_c(ix,iy))
+        ∇Ψx   = (Ψ_y(ix+1,iy) - Ψ_y(ix,iy))/Δx
+        ∇Ψy   = (Ψ_x(ix,iy+1) - Ψ_x(ix,iy))/Δy
+        nx,ny = ∇Ψx/sqrt(∇Ψx^2 + ∇Ψy^2), ∇Ψy/sqrt(∇Ψx^2 + ∇Ψy^2)  # NOTE(review): divides by zero where the gradient vanishes
+        Fx    = max(s*nx,0)*(A[ix,iy]-A[ix-1,iy])/Δx + min(s*nx,0)*(A[ix+1,iy]-A[ix,iy])/Δx  # backward difference if s*nx > 0, forward otherwise
+        Fy    = max(s*ny,0)*(A[ix,iy]-A[ix,iy-1])/Δy + min(s*ny,0)*(A[ix,iy+1]-A[ix,iy])/Δy
+        ∂A_∂τ[ix-1,iy-1] = -(Fx + Fy)  # output is indexed with an offset of one relative to A
+    end
+end  # NOTE(review): no explicit `return`, unlike most kernels in this directory — confirm `@tiny` does not require one
\ No newline at end of file
diff --git a/scripts2D_variational/level_sets.jl b/scripts2D_variational/level_sets.jl
new file mode 100644
index 00000000..dac746b5
--- /dev/null
+++ b/scripts2D_variational/level_sets.jl
@@ -0,0 +1,18 @@
+include("level_set_kernels.jl")
+
+const _compute_levelset_from_polygon! = _kernel_compute_levelset_from_polygon!(get_device())
+
+function compute_levelset!(op, Ψ, xv, yv, mc)  # Ψ ← op.(Ψ, signed distance to polygon mc), evaluated at every entry of Ψ
+    _compute_levelset_from_polygon!(op, Ψ, xv, yv, mc; ndrange=axes(Ψ)) |> wait
+    return nothing
+end
+
+# default: combine the new signed distance with the values already in Ψ using `min` (union of the two level sets)
+compute_levelset!(Ψ, xv, yv, mc) = compute_levelset!(min, Ψ, xv, yv, mc)
+
+const _extrapolate_with_levelset! = _kernel_extrapolate_with_levelset!(get_device())
+
+function extrapolate_with_levelset!(∂A_∂τ,A,Ψ,Δx,Δy)  # launch the extrapolation kernel and block until it has finished
+    wait(_extrapolate_with_levelset!(∂A_∂τ,A,Ψ,Δx,Δy; ndrange=(axes(A,1)[2:end-1],axes(A,2)[2:end-1])))  # interior of A only: the kernel reads A[ix±1, iy±1]
+    return
+end
diff --git a/src/level_sets/signed_distances.jl b/scripts2D_variational/signed_distances.jl
similarity index 90%
rename from src/level_sets/signed_distances.jl
rename to scripts2D_variational/signed_distances.jl
index a95f537b..6bc7b26e 100644
--- a/src/level_sets/signed_distances.jl
+++ b/scripts2D_variational/signed_distances.jl
@@ -35,8 +35,8 @@ end
 
 @inline function triangle_pair(Iv,dem,rc)
     @inline function sample_dem(I)
-        x,y = rc[1][I[1]],rc[2][I[2]]
-        Point3(x,y,dem[I])
+        @inbounds x,y = rc[1][I[1]],rc[2][I[2]]
+        @inbounds Point3(x,y,dem[I])
     end
     T_BL = Triangle(sample_dem(Iv)       ,sample_dem(inc(Iv,1)),sample_dem(inc(Iv,2)))
     T_TR = Triangle(sample_dem(inc(Iv,2)),sample_dem(inc(Iv,1)),sample_dem(inc(Iv)))
@@ -50,8 +50,8 @@ end
 end
 
 function sd_dem(P,cutoff,dem,rc)
-    Pp = clamp.(Point(P[1],P[2]),first.(rc),last.(rc))
-    P  = Point(Pp[1],Pp[2],P[3])
+    @inbounds Pp = clamp.(Point(P[1],P[2]),first.(rc),last.(rc))
+    @inbounds P  = Point(Pp[1],Pp[2],P[3])
     BL = closest_vertex_index(Pp.-cutoff,rc)
     TR = closest_vertex_index(Pp.+cutoff,rc)
     Ic = closest_vertex_index(Pp,rc)
diff --git a/scripts2D_variational/stokes.jl b/scripts2D_variational/stokes.jl
new file mode 100644
index 00000000..eafdba44
--- /dev/null
+++ b/scripts2D_variational/stokes.jl
@@ -0,0 +1,43 @@
+include("stokes_kernels.jl")
+
+const _update_σ! = _kernel_update_σ!(get_device())
+# NOTE: stokes_kernels.jl defines no `_kernel_compute_invariants!`; invariants and viscosity are computed by the kernel below
+const _compute_xII_η! = _kernel_compute_xII_η!(get_device())
+const _update_V! = _kernel_update_V!(get_device())
+const _compute_residual_P! = _kernel_compute_residual_P!(get_device())
+const _compute_residual_V! = _kernel_compute_residual_V!(get_device())
+
+function update_σ!(Pr, ε, τ, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy)  # one pressure/stress update over all entries of Pr
+    _update_σ!(Pr, ε, τ, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy; ndrange=axes(Pr)) |> wait
+    return nothing
+end
+
+function compute_invariants!(εII, τII, ε, τ, ηs, χ, mpow)
+    interior = (εII=inn(εII), τII=inn(τII), ηs=inn(ηs))  # the kernel only writes away from the array borders
+    launch = _compute_xII_η!(
+        interior.εII, interior.τII, ε, τ, interior.ηs, χ, mpow; ndrange=axes(interior.εII)
+    )
+    wait(launch)
+    # fill the border entries by copying the adjacent interior values (increment 0.0)
+    for field in (εII, τII, ηs)
+        bc_x_neumann!(0.0, field)
+        bc_y_neumann!(0.0, field)
+    end
+    return
+end
+
+function update_V!(V, Pr, τ, ηs, wt, nudτ, ρg, dx, dy)  # update interior velocities, then refresh the border entries
+    interior = map(inn, (x=V.x, y=V.y))
+    _update_V!(interior, Pr, τ, ηs, wt, nudτ, ρg, dx, dy; ndrange=axes(Pr)) |> wait
+    bc_x_neumann!(0.0, V.y)
+    bc_y_neumann!(0.0, V.x)
+    return nothing
+end
+
+function compute_residual!(Res, Pr, V, τ, wt, ρg, dx, dy)
+    interior = map(inn, (x=V.x, y=V.y))
+    # both kernels are launched before either one is waited on
+    pending = (_compute_residual_P!(Res, V, wt, dx, dy; ndrange=axes(Pr)),
+               _compute_residual_V!(Res, Pr, interior, τ, wt, ρg, dx, dy; ndrange=axes(Pr)))
+    foreach(wait, pending)
+    return
+end
\ No newline at end of file
diff --git a/scripts2D_variational/stokes_kernels.jl b/scripts2D_variational/stokes_kernels.jl
new file mode 100644
index 00000000..3b81722d
--- /dev/null
+++ b/scripts2D_variational/stokes_kernels.jl
@@ -0,0 +1,138 @@
+@tiny function _kernel_update_σ!(Pr, ε, τ, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy)  # update strain rates ε, pressure Pr and deviatoric stress τ
+    ix, iy = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy)
+    # detect and eliminate null spaces
+    isnull = (wt.not_air.x[ix, iy] ≈ 0.0) || (wt.not_air.x[ix+1, iy] ≈ 0.0) ||
+             (wt.not_air.y[ix, iy] ≈ 0.0) || (wt.not_air.y[ix, iy+1] ≈ 0.0)  # any of the four surrounding face fractions is zero
+    if !isnull && (wt.not_air.c[ix, iy] > 0.0)
+        ε.xx[ix, iy] = (V.x[ix+1, iy] * wt.not_solid.x[ix+1, iy] - V.x[ix, iy] * wt.not_solid.x[ix, iy]) / dx  # velocities weighted by the not_solid face fractions
+        ε.yy[ix, iy] = (V.y[ix, iy+1] * wt.not_solid.y[ix, iy+1] - V.y[ix, iy] * wt.not_solid.y[ix, iy]) / dy
+        ∇V = ε.xx[ix, iy] + ε.yy[ix, iy]
+        Pr[ix, iy] -= ∇V * ηs[ix, iy] * r / θ_dτ
+        τ.xx[ix, iy] += (-τ.xx[ix, iy] + 2.0 * wt.not_air.c[ix, iy] * ηs[ix, iy] * (ε.xx[ix, iy] - ∇V / 3.0)) * dτ_r  # relax τ towards 2·wt·ηs·(deviatoric ε) with factor dτ_r
+        τ.yy[ix, iy] += (-τ.yy[ix, iy] + 2.0 * wt.not_air.c[ix, iy] * ηs[ix, iy] * (ε.yy[ix, iy] - ∇V / 3.0)) * dτ_r
+    else
+        ε.xx[ix, iy] = 0.0  # excluded cells: reset all centre quantities
+        ε.yy[ix, iy] = 0.0
+        Pr[ix, iy] = 0.0
+        τ.xx[ix, iy] = 0.0
+        τ.yy[ix, iy] = 0.0
+    end
+    @inbounds if isin(τ.xy)  # shear components live on a smaller array, hence the separate guard
+        # detect and eliminate null spaces
+        isnull = (wt.not_air.x[ix+1, iy+1] ≈ 0.0) || (wt.not_air.x[ix+1, iy] ≈ 0.0) ||
+                 (wt.not_air.y[ix+1, iy+1] ≈ 0.0) || (wt.not_air.y[ix, iy+1] ≈ 0.0)
+        if !isnull && (wt.not_air.xy[ix, iy] > 0.0)
+            ε.xy[ix, iy] =
+                0.5 * (
+                    (V.x[ix+1, iy+1] * wt.not_solid.x[ix+1, iy+1] - V.x[ix+1, iy] * wt.not_solid.x[ix+1, iy]) / dy +
+                    (V.y[ix+1, iy+1] * wt.not_solid.y[ix+1, iy+1] - V.y[ix, iy+1] * wt.not_solid.y[ix, iy+1]) / dx
+                )
+            ηs_av = 0.25 * (ηs[ix, iy] + ηs[ix+1, iy] + ηs[ix, iy+1] + ηs[ix+1, iy+1])  # viscosity averaged over the four surrounding entries
+            τ.xy[ix, iy] += (-τ.xy[ix, iy] + 2.0 * wt.not_air.xy[ix, iy] * ηs_av * ε.xy[ix, iy]) * dτ_r
+        else
+            ε.xy[ix, iy] = 0.0
+            τ.xy[ix, iy] = 0.0
+        end
+    end
+    return
+end
+
+@tiny function _kernel_compute_xII_η!(εII, τII, ε, τ, ηs, χ, mpow)  # second invariants of ε and τ, followed by the viscosity update
+    ix, iy = @indices
+    εxyc = 0.25 * (ε.xy[ix, iy] + ε.xy[ix+1, iy] + ε.xy[ix, iy+1] + ε.xy[ix+1, iy+1])  # shear component averaged over four neighbours
+    τxyc = 0.25 * (τ.xy[ix, iy] + τ.xy[ix+1, iy] + τ.xy[ix, iy+1] + τ.xy[ix+1, iy+1])
+    εII[ix, iy] = sqrt(0.5 * (ε.xx[ix+1, iy+1]^2 + ε.yy[ix+1, iy+1]^2) + εxyc^2)  # +1 offsets: εII/τII/ηs are passed as interior views by the host wrapper
+    τII[ix, iy] = sqrt(0.5 * (τ.xx[ix+1, iy+1]^2 + τ.yy[ix+1, iy+1]^2) + τxyc^2)
+    # nonlinear viscosity
+    ηs_τ = εII[ix, iy]^mpow
+    ηs[ix, iy] = min((1 - χ) * ηs[ix, iy] + χ * ηs_τ, 1e2)  # blend old and new value with weight χ, capped at 1e2
+    return
+end
+
+@tiny function _kernel_update_V!(V, Pr, τ, ηs, wt, nudτ, ρg, dx, dy)  # velocity update from the momentum residual; V is passed as interior views
+    ix, iy = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy)
+    # TODO: check which volume fraction (non-air or non-solid) really determines the null spaces
+    @inbounds if isin(V.x)
+        # detect and eliminate null spaces
+        isnull = (wt.not_solid.c[ix+1, iy+1] ≈ 0) || (wt.not_solid.c[ix, iy+1] ≈ 0) ||
+                 (wt.not_solid.xy[ix, iy+1] ≈ 0) || (wt.not_solid.xy[ix, iy] ≈ 0)
+        if !isnull && (wt.not_air.x[ix+1, iy+1] > 0) && (wt.not_solid.x[ix+1, iy+1] > 0)
+            # TODO: check which cells contribute to the momentum balance to verify ηs_x is computed correctly
+            ηs_x = max(ηs[ix, iy+1], ηs[ix+1, iy+1])  # larger of the two neighbouring viscosities
+            ∂σxx_∂x = ((-Pr[ix+1, iy+1] + τ.xx[ix+1, iy+1]) * wt.not_air.c[ix+1, iy+1] -
+                       (-Pr[ix  , iy+1] + τ.xx[ix  , iy+1]) * wt.not_air.c[ix  , iy+1]) / dx  # stresses weighted by the not_air fractions
+            ∂τxy_∂y = (τ.xy[ix, iy+1] * wt.not_air.xy[ix, iy+1] - τ.xy[ix, iy] * wt.not_air.xy[ix, iy]) / dy
+            V.x[ix, iy] += (∂σxx_∂x + ∂τxy_∂y - ρg.x * wt.not_air.x[ix+1, iy+1]) * nudτ / ηs_x
+        else
+            V.x[ix, iy] = 0.0  # excluded faces carry zero velocity
+        end
+    end
+    @inbounds if isin(V.y)
+        # detect and eliminate null spaces
+        isnull = (wt.not_solid.c[ix+1, iy+1] ≈ 0) || (wt.not_solid.c[ix+1, iy] ≈ 0) ||
+                 (wt.not_solid.xy[ix+1, iy] ≈ 0) || (wt.not_solid.xy[ix, iy] ≈ 0)
+        if !isnull && (wt.not_air.y[ix+1, iy+1] > 0) && (wt.not_solid.y[ix+1, iy+1] > 0)
+            # TODO: check which cells contribute to the momentum balance to verify ηs_y is computed correctly
+            ηs_y = max(ηs[ix+1, iy], ηs[ix+1, iy+1])  # larger of the two neighbouring viscosities
+            ∂σyy_∂y = ((-Pr[ix+1, iy+1] + τ.yy[ix+1, iy+1]) * wt.not_air.c[ix+1, iy+1] -
+                       (-Pr[ix+1, iy  ] + τ.yy[ix+1, iy  ]) * wt.not_air.c[ix+1, iy  ]) / dy
+            ∂τxy_∂x = (τ.xy[ix+1, iy] * wt.not_air.xy[ix+1, iy] - τ.xy[ix, iy] * wt.not_air.xy[ix, iy]) / dx
+            V.y[ix, iy] += (∂σyy_∂y + ∂τxy_∂x - ρg.y * wt.not_air.y[ix+1, iy+1]) * nudτ / ηs_y
+        else
+            V.y[ix, iy] = 0.0  # excluded faces carry zero velocity
+        end
+    end
+    return
+end
+
+@tiny function _kernel_compute_residual_P!(Res, V, wt, dx, dy)  # Res.Pr ← weighted velocity divergence (0 in excluded cells)
+    ix, iy = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy)  # NOTE(review): defined but not used in this kernel
+    # detect and eliminate null spaces
+    isnull = (wt.not_air.x[ix, iy] ≈ 0.0) || (wt.not_air.x[ix+1, iy] ≈ 0.0) ||
+             (wt.not_air.y[ix, iy] ≈ 0.0) || (wt.not_air.y[ix, iy+1] ≈ 0.0)  # same exclusion test as the pressure update in _kernel_update_σ!
+    if !isnull && (wt.not_air.c[ix, iy] > 0.0)
+        exx = (V.x[ix+1, iy] * wt.not_solid.x[ix+1, iy] - V.x[ix, iy] * wt.not_solid.x[ix, iy]) / dx
+        eyy = (V.y[ix, iy+1] * wt.not_solid.y[ix, iy+1] - V.y[ix, iy] * wt.not_solid.y[ix, iy]) / dy
+        ∇V  = exx + eyy
+        Res.Pr[ix, iy] = ∇V
+    else
+        Res.Pr[ix, iy] = 0.0
+    end
+    return
+end
+
+@tiny function _kernel_compute_residual_V!(Res, Pr, V, τ, wt, ρg, dx, dy)  # Res.V ← momentum residual; same stencil as _kernel_update_V! without the update
+    ix, iy = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy)
+    # TODO: check which volume fraction (non-air or non-solid) really determines the null spaces
+    @inbounds if isin(V.x)  # V is only used for its bounds here; the residual is written to Res.V
+        # detect and eliminate null spaces
+        isnull = (wt.not_solid.c[ix+1, iy+1] ≈ 0) || (wt.not_solid.c[ix, iy+1] ≈ 0) ||
+                 (wt.not_solid.xy[ix, iy+1] ≈ 0) || (wt.not_solid.xy[ix, iy] ≈ 0)
+        if !isnull && (wt.not_air.x[ix+1, iy+1] > 0) && (wt.not_solid.x[ix+1, iy+1] > 0)
+            ∂σxx_∂x = ((-Pr[ix+1, iy+1] + τ.xx[ix+1, iy+1]) * wt.not_air.c[ix+1, iy+1] -
+                       (-Pr[ix  , iy+1] + τ.xx[ix  , iy+1]) * wt.not_air.c[ix  , iy+1]) / dx
+            ∂τxy_∂y = (τ.xy[ix, iy+1] * wt.not_air.xy[ix, iy+1] - τ.xy[ix, iy] * wt.not_air.xy[ix, iy]) / dy
+            Res.V.x[ix, iy] = ∂σxx_∂x + ∂τxy_∂y - ρg.x * wt.not_air.x[ix+1, iy+1]
+        else
+            Res.V.x[ix, iy] = 0.0  # excluded faces do not contribute to the error norm
+        end
+    end
+    @inbounds if isin(V.y)
+        # detect and eliminate null spaces
+        isnull = (wt.not_solid.c[ix+1, iy+1] ≈ 0) || (wt.not_solid.c[ix+1, iy] ≈ 0) ||
+                 (wt.not_solid.xy[ix+1, iy] ≈ 0) || (wt.not_solid.xy[ix, iy] ≈ 0)
+        if !isnull && (wt.not_air.y[ix+1, iy+1] > 0) && (wt.not_solid.y[ix+1, iy+1] > 0)
+            ∂σyy_∂y = ((-Pr[ix+1, iy+1] + τ.yy[ix+1, iy+1]) * wt.not_air.c[ix+1, iy+1] -
+                       (-Pr[ix+1, iy  ] + τ.yy[ix+1, iy  ]) * wt.not_air.c[ix+1, iy  ]) / dy
+            ∂τxy_∂x = (τ.xy[ix+1, iy] * wt.not_air.xy[ix+1, iy] - τ.xy[ix, iy] * wt.not_air.xy[ix, iy]) / dx
+            Res.V.y[ix, iy] = ∂σyy_∂y + ∂τxy_∂x - ρg.y * wt.not_air.y[ix+1, iy+1]
+        else
+            Res.V.y[ix, iy] = 0.0  # excluded faces do not contribute to the error norm
+        end
+    end
+    return
+end
\ No newline at end of file
diff --git a/scripts2D_variational/test_volume_fractions2D.jl b/scripts2D_variational/test_volume_fractions2D.jl
new file mode 100644
index 00000000..e3bb7448
--- /dev/null
+++ b/scripts2D_variational/test_volume_fractions2D.jl
@@ -0,0 +1,222 @@
+using FastIce
+using TinyKernels
+using HDF5
+using LightXML
+
+include("load_dem.jl")
+include("signed_distances.jl")
+include("level_sets.jl")
+include("volume_fractions.jl")
+include("bcs.jl")
+include("stokes.jl")
+include("data_io.jl")
+include("hide_communication.jl")
+
+@views av1(A) = 0.5.*(A[1:end-1].+A[2:end])
+@views inn_x(A) = A[2:end-1,:]
+@views inn_y(A) = A[:,2:end-1]
+@views inn(A)   = A[2:end-1,2:end-1]
+
+const DAT = Float32
+
+@views function main(grid_dims)  # load and rescale the DEM, then hand it to run_simulation
+    # NOTE(review): me, dims, coords, comm_cart, greenland_path and global_region are not defined in this file — confirm where they come from
+    # load DEM
+    (me==0) && @info "loading DEM data from the file '$greenland_path'"
+    (;x,y,bed,surface) = load_dem(greenland_path,global_region)
+    (me==0) && @info "DEM resolution: $(size(bed,1)) × $(size(bed,2))"
+
+    # compute origin and size of the domain (required for scaling and computing the grid size)
+    ox,oy,oz = x[1], y[1], minimum(bed)
+    lx = x[end] - ox
+    ly = y[end] - oy
+    lz = maximum(surface) - oz
+
+    # shift and scale the domain before computation (center of the domain is (0,0) in x-y plane)
+    δx, δy = ox + 0.5lx,oy + 0.5ly # required to avoid conversion to Vector
+    x = @. (x - δx)/lz
+    y = @. (y - δy)/lz
+    @. bed     = (bed     - oz)/lz
+    @. surface = (surface - oz)/lz
+
+    # run simulation
+    dem_data = (;x,y,bed,surface)
+    @info "running the simulation"
+    run_simulation(dem_data,grid_dims,me,dims,coords,comm_cart)
+
+    return
+end
+
+@views function run_simulation(dem_data,grid_dims,me,dims,coords,comm_cart)
+    # physics
+    # global domain origin and size
+    ox_g, oy_g, oz_g = dem_data.x[1], dem_data.y[1], 0.0
+    lx_g = dem_data.x[end] - ox_g
+    ly_g = dem_data.y[end] - oy_g
+    lz_g = 1.0
+
+    ρg  = (x=0.0,y=0.0,z=1.0)
+
+    # local domain size and origin
+    lx_l,ly_l,lz_l = (lx_g,ly_g,lz_g)./dims
+    ox_l,oy_l,oz_l = (ox_g,oy_g,oz_g) .+ coords.*(lx_l,ly_l,lz_l)
+
+    # numerics
+    nx,ny,nz = grid_dims
+    bwidth   = (8,4,4)
+
+    # preprocessing
+    dx,dy,dz = lx_g/nx_g(), ly_g/ny_g(), lz_g/nz_g()
+    (me==0) && @info "grid spacing: dx = $dx, dy = $dy, dz = $dz"
+
+    xv_l = LinRange(ox_l,ox_l+lx_l,nx+1)
+    yv_l = LinRange(oy_l,oy_l+ly_l,ny+1)
+    zv_l = LinRange(oz_l,oz_l+lz_l,nz+1)
+    xc_l,yc_l,zc_l = av1.((xv_l,yv_l,zv_l))
+
+    # PT params
+    r          = 0.7
+    lτ_re_mech = 0.5min(lx_g,ly_g,lz_g)/π
+    vdτ        = min(dx,dy,dz)/sqrt(10.1)
+    θ_dτ       = lτ_re_mech*(r+4/3)/vdτ
+    nudτ       = vdτ*lτ_re_mech
+    dτ_r       = 1.0/(θ_dτ+1.0)
+
+    # fields allocation
+    # level set
+    Ψ = (
+        not_solid = field_array(DAT,nx+1,ny+1), # fluid
+        not_air   = field_array(DAT,nx+1,ny+1), # liquid
+    )
+    wt = (
+        not_solid = (
+            c  = field_array(DAT,nx  ,ny  ),
+            x  = field_array(DAT,nx+1,ny  ),
+            y  = field_array(DAT,nx  ,ny+1),
+            xy = field_array(DAT,nx-1,ny-1),
+        ),
+        not_air = (
+            c  = field_array(DAT,nx  ,ny  ),
+            x  = field_array(DAT,nx+1,ny  ),
+            y  = field_array(DAT,nx  ,ny+1),
+            xy = field_array(DAT,nx-1,ny-1),
+        )
+    )
+    # mechanics
+    Pr = field_array(DAT,nx,ny)
+    τ  = (
+        xx = field_array(DAT,nx  ,ny  ),
+        yy = field_array(DAT,nx  ,ny  ),
+        xy = field_array(DAT,nx-1,ny-1),
+    )
+    V = (
+        x = field_array(DAT,nx+1,ny),
+        y = field_array(DAT,nx,ny+1),
+    )
+    ηs = field_array(DAT,nx,ny)
+    # residuals
+    Res = (
+        Pr = field_array(DAT,nx,ny),
+        V = (
+            x = field_array(DAT,nx-1,ny-2),
+            y = field_array(DAT,nx-2,ny-1),
+        )
+    )
+    # visualisation
+    Vmag = field_array(DAT,nx-2,ny-2)
+    τII  = field_array(DAT,nx-2,ny-2)
+    Ψav  = (
+        not_solid = field_array(DAT,nx-2,ny-2),
+        not_air   = field_array(DAT,nx-2,ny-2),
+    )
+
+    # initialisation
+    for comp in eachindex(V) fill!(V[comp],0.0) end
+    for comp in eachindex(τ) fill!(τ[comp],0.0) end
+    fill!(Pr,0.0)
+    fill!(ηs,1.0)
+
+    # compute level sets from DEM data
+    dem_grid = (dem_data.x,dem_data.y)
+    Ψ_grid   = (xv_l,yv_l,zv_l)
+
+    (me==0) && @info "computing the level set for the ice surface"
+    compute_level_set_from_dem!(Ψ.not_air,to_device(dem_data.surface),dem_grid,Ψ_grid)
+
+    (me==0) && @info "computing the level set for the bedrock surface"
+    compute_level_set_from_dem!(Ψ.not_solid,to_device(dem_data.bed),dem_grid,Ψ_grid)
+    TinyKernels.device_synchronize(get_device())
+    @. Ψ.not_solid*= -1.0
+    TinyKernels.device_synchronize(get_device())
+
+    (me==0) && @info "computing volume fractions from level sets"
+    for phase in eachindex(Ψ)
+        compute_volume_fractions_from_level_set!(wt[phase],Ψ[phase],dx,dy,dz)
+    end
+
+    # NOTE(review): the calls below use 3D argument lists (dz, bwidth) that differ from the 2D wrappers in stokes.jl — confirm
+    (me==0) && @info "iteration loop"
+    for iter in 1:1000
+        (me==0) && println("  iter: $iter")
+        update_σ!(Pr,τ,V,ηs,wt,r,θ_dτ,dτ_r,dx,dy,dz)
+        update_V!(V,Pr,τ,ηs,wt,nudτ,ρg,dx,dy,dz;bwidth)
+    end
+
+    (me==0) && @info "saving results on disk"
+    dim_g = (nx_g()-2, ny_g()-2, nz_g()-2)
+    update_vis_fields!(Vmag,τII,Ψav,V,τ,Ψ)
+    out_h5 = "results.h5"
+    ndrange = CartesianIndices(( (coords[1]*(nx-2) + 1):(coords[1]+1)*(nx-2),
+                                 (coords[2]*(ny-2) + 1):(coords[2]+1)*(ny-2),
+                                 (coords[3]*(nz-2) + 1):(coords[3]+1)*(nz-2) ))
+    fields = Dict("LS_ice"=>Ψav.not_air,"LS_bed"=>Ψav.not_solid,"Vmag"=>Vmag,"TII"=>τII,"Pr"=>inn(Pr))
+    (me==0) && @info "saving HDF5 file"
+    write_h5(out_h5,fields,dim_g,ndrange,comm_cart,MPI.Info())
+
+    if me==0
+        @info "saving XDMF file..."
+        write_xdmf("results.xdmf3",out_h5,fields,(xc_l[2],yc_l[2],zc_l[2]),(dx,dy,dz),dim_g)
+    end
+
+    return
+end
+
+@tiny function _kernel_update_vis_fields!(Vmag, τII, Ψav, V, τ, Ψ)  # 3D-indexed plotting fields: averaged level sets, |V| and τII
+    ix,iy,iz = @indices
+    @inline isin(A) = checkbounds(Bool,A,ix,iy,iz)
+    @inbounds if isin(Ψ.not_air)
+        pav = 0.0
+        for idz = 1:2, idy=1:2, idx = 1:2
+            pav += Ψ.not_air[ix+idx,iy+idy,iz+idz]
+        end
+        Ψav.not_air[ix,iy,iz] = pav/8  # mean of the 2×2×2 = 8 values summed above
+    end
+    @inbounds if isin(Ψ.not_solid)
+        pav = 0.0
+        for idz = 1:2, idy=1:2, idx = 1:2
+            pav += Ψ.not_solid[ix+idx,iy+idy,iz+idz]
+        end
+        Ψav.not_solid[ix,iy,iz] = pav/8
+    end
+    @inbounds if isin(Vmag)
+        vxc = 0.5*(V.x[ix+1,iy+1,iz+1] + V.x[ix+2,iy+1,iz+1])
+        vyc = 0.5*(V.y[ix+1,iy+1,iz+1] + V.y[ix+1,iy+2,iz+1])
+        vzc = 0.5*(V.z[ix+1,iy+1,iz+1] + V.z[ix+1,iy+1,iz+2])
+        Vmag[ix,iy,iz] = sqrt(vxc^2 + vyc^2 + vzc^2)
+    end
+    @inbounds if isin(τII)
+        τxyc = 0.25*(τ.xy[ix,iy,iz]+τ.xy[ix+1,iy,iz]+τ.xy[ix,iy+1,iz]+τ.xy[ix+1,iy+1,iz])
+        τxzc = 0.25*(τ.xz[ix,iy,iz]+τ.xz[ix+1,iy,iz]+τ.xz[ix,iy,iz+1]+τ.xz[ix+1,iy,iz+1])
+        τyzc = 0.25*(τ.yz[ix,iy,iz]+τ.yz[ix,iy+1,iz]+τ.yz[ix,iy,iz+1]+τ.yz[ix,iy+1,iz+1])
+        τII[ix,iy,iz] = sqrt(0.5*(τ.xx[ix+1,iy+1,iz+1]^2 + τ.yy[ix+1,iy+1,iz+1]^2 + τ.zz[ix+1,iy+1,iz+1]^2) + τxyc^2 + τxzc^2 + τyzc^2)
+    end
+    return
+end
+
+const _update_vis_fields! = Kernel(_kernel_update_vis_fields!,get_device())
+
+function update_vis_fields!(Vmag, τII, Ψav, V, τ, Ψ)  # refresh the plotting fields; blocks until the kernel has finished
+    _update_vis_fields!(Vmag, τII, Ψav, V, τ, Ψ; ndrange=axes(Vmag)) |> wait
+    return nothing
+end
+
+main((1024,1024,64))
\ No newline at end of file
diff --git a/scripts2D_variational/volume_fraction_kernels.jl b/scripts2D_variational/volume_fraction_kernels.jl
new file mode 100644
index 00000000..5adb7009
--- /dev/null
+++ b/scripts2D_variational/volume_fraction_kernels.jl
@@ -0,0 +1,31 @@
+@tiny function _kernel_compute_volume_fractions_from_level_set!(wt, Ψ, dx, dy)  # fills wt.c, wt.x, wt.y, wt.xy from Ψ
+    ix, iy = @indices
+    cell = Rect(Vec(0.0, 0.0), Vec(dx, dy))  # reference cell with widths (dx, dy)
+    ω = GeometryBasics.volume(cell)          # its volume, used to normalise the results below
+    @inline Ψ_ax(dix, diy) = 0.5 * (Ψ[ix+dix, iy+diy] + Ψ[ix+dix+1, iy+diy  ])  # mean of two x-neighbours
+    @inline Ψ_ay(dix, diy) = 0.5 * (Ψ[ix+dix, iy+diy] + Ψ[ix+dix  , iy+diy+1])  # mean of two y-neighbours
+    @inline Ψ_axy(dix, diy) = 0.25 * (Ψ[ix+dix, iy+diy  ] + Ψ[ix+dix+1, iy+diy  ] +
+                                      Ψ[ix+dix, iy+diy+1] + Ψ[ix+dix+1, iy+diy+1])  # mean of the 2×2 block
+    @inline isin(A) = checkbounds(Bool, A, ix, iy)  # true if (ix, iy) is a valid index of A
+    # cell centers: the four corner values are the Ψ entries around (ix, iy)
+    @inbounds if isin(wt.c)
+        Ψs = Vec{4}(Ψ[ix, iy], Ψ[ix+1, iy], Ψ[ix+1, iy+1], Ψ[ix, iy+1])
+        wt.c[ix, iy] = volfrac(cell, Ψs) / ω
+    end
+    # x faces: corner values are x-averaged Ψ
+    @inbounds if isin(wt.x)
+        Ψs = Vec{4}(Ψ_ax(0, 0), Ψ_ax(1, 0), Ψ_ax(1, 1), Ψ_ax(0, 1))
+        wt.x[ix, iy] = volfrac(cell, Ψs) / ω
+    end
+    # y faces: corner values are y-averaged Ψ
+    @inbounds if isin(wt.y)
+        Ψs = Vec{4}(Ψ_ay(0, 0), Ψ_ay(1, 0), Ψ_ay(1, 1), Ψ_ay(0, 1))
+        wt.y[ix, iy] = volfrac(cell, Ψs) / ω
+    end
+    # xy edges: corner values are 4-point averaged Ψ
+    @inbounds if isin(wt.xy)
+        Ψs = Vec{4}(Ψ_axy(0, 0), Ψ_axy(1, 0), Ψ_axy(1, 1), Ψ_axy(0, 1))
+        wt.xy[ix, iy] = volfrac(cell, Ψs) / ω
+    end
+    return
+end
\ No newline at end of file
diff --git a/scripts2D_variational/volume_fractions.jl b/scripts2D_variational/volume_fractions.jl
new file mode 100644
index 00000000..7ab5a9e3
--- /dev/null
+++ b/scripts2D_variational/volume_fractions.jl
@@ -0,0 +1,55 @@
+@inline perturb(ϕ) = ifelse(abs(ϕ) > 1e-20, ϕ, ifelse(ϕ > 0, 1e-20, -1e-20))  # push |ϕ| ≤ 1e-20 to ±1e-20 (zero maps to -1e-20)
+
+@inline trivol(v1, v2, v3) = abs(cross(v3 - v1, v2 - v1)) * 0.5  # half the absolute cross product of two edge vectors
+
+function volfrac(tri, ϕ::Vec3{T})::T where {T}
+    # Returns the size (via `trivol`) of the part of triangle `tri` selected by the
+    # signs of the three vertex values ϕ: the whole triangle when all values are
+    # negative, zero when all are positive, otherwise the piece cut off along the
+    # points where ϕ, interpolated linearly along each edge, changes sign.
+    a, b, c = tri
+    neg = (ϕ[1] < 0, ϕ[2] < 0, ϕ[3] < 0)
+    all(neg) && return trivol(a, b, c)                    # ---
+    (ϕ[1] > 0 && ϕ[2] > 0 && ϕ[3] > 0) && return 0.0      # +++
+    # point on the edge between vertices i and j where the interpolated ϕ vanishes;
+    # all three are evaluated eagerly, only those of sign-changing edges are used
+    @inline function cut(i, j)
+        Δ = ϕ[i] - ϕ[j]
+        return tri[j] * (ϕ[i] / Δ) - tri[i] * (ϕ[j] / Δ)
+    end
+    ab, ac, bc = cut(1, 2), cut(1, 3), cut(2, 3)
+    # the sign pattern of (ϕ[1], ϕ[2], ϕ[3]) is given next to each branch
+    if neg[1] && neg[2]                                    # --+
+        return trivol(a, bc, ac) + trivol(a, b, bc)
+    elseif neg[1] && neg[3]                                # -+-
+        return trivol(c, ab, bc) + trivol(c, a, ab)
+    elseif neg[1]                                          # -++
+        return trivol(a, ab, ac)
+    elseif neg[2] && neg[3]                                # +--
+        return trivol(b, ac, ab) + trivol(b, c, ac)
+    elseif neg[2]                                          # +-+
+        return trivol(ab, b, bc)
+    else                                                   # ++-
+        return trivol(ac, bc, c)
+    end
+end
+
+function volfrac(rect::Rect2{T}, ϕ::Vec4{T}) where {T}
+    o, w = origin(rect), widths(rect)
+    ψ1, ψ2, ψ3, ψ4 = perturb.(ϕ)  # move (near-)zero corner values off zero first
+    c1, c2, c3, c4 = o, o + Vec(w[1], 0.0), o + w, o + Vec(0.0, w[2])  # corners: origin, +x, +x+y, +y
+    first_half = volfrac(Vec(c1, c2, c3), Vec3{T}(ψ1, ψ2, ψ3))  # triangle 1-2-3
+    return first_half + volfrac(Vec(c1, c3, c4), Vec3{T}(ψ1, ψ3, ψ4))  # plus triangle 1-3-4
+end
+
+include("volume_fraction_kernels.jl")
+
+const _compute_volume_fractions_from_level_set! = _kernel_compute_volume_fractions_from_level_set!(get_device())  # launcher bound to the device from get_device()
+
+function compute_volume_fractions_from_level_set!(wt, Ψ, dx, dy)
+    # the kernel receives wt.x / wt.y without their first and last slices (inn_x / inn_y)
+    interior = (; c=wt.c, x=inn_x(wt.x), y=inn_y(wt.y), xy=wt.xy)
+    wait(_compute_volume_fractions_from_level_set!(interior, Ψ, dx, dy; ndrange=axes(wt.c)))
+    bc_x_neumann!(0.0, wt.x); bc_y_neumann!(0.0, wt.y)  # then the outer slices are set by the boundary conditions
+    return
+end
\ No newline at end of file
diff --git a/scripts2D_variational_TM/bc_kernels.jl b/scripts2D_variational_TM/bc_kernels.jl
new file mode 100644
index 00000000..2e926a98
--- /dev/null
+++ b/scripts2D_variational_TM/bc_kernels.jl
@@ -0,0 +1,43 @@
+@tiny function _kernel_bc_x_dirichlet!(val,arrays...)
+    # writes `val` into the first and last entry along dimension 1 of every array
+    iy, = @indices
+    for A in arrays
+        iy ∈ axes(A,2) || continue  # this array has no index iy along dimension 2
+        @inbounds A[1  ,iy] = val
+        @inbounds A[end,iy] = val
+    end
+    return
+end
+
+@tiny function _kernel_bc_y_dirichlet!(val, arrays...)
+    # writes `val` into the first and last entry along dimension 2 of every array
+    ix, = @indices
+    for A in arrays
+        ix ∈ axes(A,1) || continue  # this array has no index ix along dimension 1
+        @inbounds A[ix,1  ] = val
+        @inbounds A[ix,end] = val
+    end
+    return
+end
+
+@tiny function _kernel_bc_x_neumann!(val, arrays...)
+    # sets the first/last entry along dimension 1 to its inner neighbour plus `val`
+    iy, = @indices
+    for A in arrays
+        iy ∈ axes(A,2) || continue  # this array has no index iy along dimension 2
+        @inbounds A[1  ,iy] = A[2    ,iy] + val
+        @inbounds A[end,iy] = A[end-1,iy] + val
+    end
+    return
+end
+
+@tiny function _kernel_bc_y_neumann!(val, arrays...)
+    # sets the first/last entry along dimension 2 to its inner neighbour plus `val`
+    ix, = @indices
+    for A in arrays
+        ix ∈ axes(A,1) || continue  # this array has no index ix along dimension 1
+        @inbounds A[ix,1  ] = A[ix,2    ] + val
+        @inbounds A[ix,end] = A[ix,end-1] + val
+    end
+    return
+end
\ No newline at end of file
diff --git a/scripts2D_variational_TM/bcs.jl b/scripts2D_variational_TM/bcs.jl
new file mode 100644
index 00000000..72f48ecf
--- /dev/null
+++ b/scripts2D_variational_TM/bcs.jl
@@ -0,0 +1,33 @@
+include("bc_kernels.jl")
+
+const _bc_x_dirichlet! = _kernel_bc_x_dirichlet!(get_device())  # launchers bound to the device from get_device()
+const _bc_y_dirichlet! = _kernel_bc_y_dirichlet!(get_device())
+
+const _bc_x_neumann! = _kernel_bc_x_neumann!(get_device())
+const _bc_y_neumann! = _kernel_bc_y_neumann!(get_device())
+
+# Defines `bc_x_dirichlet!(val, arrays...)` and `bc_x_neumann!(val, arrays...)`; each
+# launches the launcher of the same name prefixed with `_` and waits for it to finish.
+for fname in (:bc_x_dirichlet!, :bc_x_neumann!)
+    @eval begin
+        function $fname(val, arrays...)
+            # launch range: union of the arrays' index ranges along dimension 2
+            ax = mapreduce(A -> axes(A, 2), union, arrays)
+            wait($(Symbol(:_, fname))(val, arrays...; ndrange=ax))
+            return
+        end
+    end
+end
+
+# Defines `bc_y_dirichlet!(val, arrays...)` and `bc_y_neumann!(val, arrays...)`; each
+# launches the launcher of the same name prefixed with `_` and waits for it to finish.
+for fname in (:bc_y_dirichlet!, :bc_y_neumann!)
+    @eval begin
+        function $fname(val, arrays...)
+            # launch range: union of the arrays' index ranges along dimension 1
+            ax = mapreduce(A -> axes(A, 1), union, arrays)
+            wait($(Symbol(:_, fname))(val, arrays...; ndrange=ax))
+            return
+        end
+    end
+end
\ No newline at end of file
diff --git a/scripts2D_variational_TM/data_io.jl b/scripts2D_variational_TM/data_io.jl
new file mode 100644
index 00000000..73a32553
--- /dev/null
+++ b/scripts2D_variational_TM/data_io.jl
@@ -0,0 +1,105 @@
+function write_h5(path,fields,dim,I,args...)  # writes every (name, field) pair of `fields` into the HDF5 file at `path`
+    if !HDF5.has_parallel() && (length(args)>0)  # extra args are forwarded to h5open; warn if HDF5 lacks parallel support
+        @warn("HDF5 has no parallel support.")
+    end
+    h5open(path, "w", args...) do io
+        for (name,field) ∈ fields
+            dset               = create_dataset(io, "/$name", datatype(eltype(field)), dataspace(dim))  # dataset of size `dim`
+            dset[I.indices...] = Array(field)  # store the field, copied to a host Array, at the index ranges held by I
+        end
+    end
+    return
+end
+
+function write_xdmf(path,h5_names,fields,origin,spacing,dim,timesteps)  # XDMF file describing a time series of HDF5 outputs
+    xdoc = XMLDocument()
+    xroot = create_root(xdoc, "Xdmf")
+    set_attribute(xroot, "Version","3.0")
+
+    xdomain     = new_child(xroot, "Domain")
+    xcollection = new_child(xdomain, "Grid")  # temporal collection holding one uniform grid per time step
+    set_attribute(xcollection, "GridType","Collection")
+    set_attribute(xcollection, "CollectionType","Temporal")
+
+    for (it,tt) ∈ enumerate(timesteps)  # it: position in h5_names, tt: time value written to the file
+        xgrid   = new_child(xcollection, "Grid")
+        set_attribute(xgrid, "GridType","Uniform")
+        xtopo = new_child(xgrid, "Topology")
+        set_attribute(xtopo, "TopologyType", "2DCoRectMesh")
+        set_attribute(xtopo, "Dimensions", join(reverse(dim).+1,' '))  # dim in reversed order, one more than dim per direction
+
+        xtime = new_child(xgrid, "Time")
+        set_attribute(xtime, "Value", "$tt")
+
+        xgeom = new_child(xgrid, "Geometry")
+        set_attribute(xgeom, "GeometryType", "ORIGIN_DXDY")
+
+        xorig = new_child(xgeom, "DataItem")  # first data item: grid origin, reversed order
+        set_attribute(xorig, "Format", "XML")
+        set_attribute(xorig, "NumberType", "Float")
+        set_attribute(xorig, "Dimensions", "$(length(dim)) ")
+        add_text(xorig, join(reverse(origin), ' '))
+
+        xdr = new_child(xgeom, "DataItem")  # second data item: grid spacing, reversed order
+        set_attribute(xdr, "Format", "XML")
+        set_attribute(xdr, "NumberType", "Float")
+        set_attribute(xdr, "Dimensions", "$(length(dim))")
+        add_text(xdr, join(reverse(spacing), ' '))
+
+        h5_path = h5_names[it]  # HDF5 file belonging to this time step
+        for (name,_) ∈ fields
+            create_xdmf_attribute(xgrid,h5_path,name,dim)
+        end
+    end
+
+    save_file(xdoc, path)
+    return
+end
+
+function write_xdmf(path,h5_path,fields,origin,spacing,dim)  # XDMF file describing a single HDF5 output
+    xdoc = XMLDocument()
+    xroot = create_root(xdoc, "Xdmf")
+    set_attribute(xroot, "Version","3.0")
+
+    xdomain = new_child(xroot, "Domain")
+    xgrid   = new_child(xdomain, "Grid")  # one uniform grid, no time information
+    set_attribute(xgrid, "GridType","Uniform")
+    xtopo = new_child(xgrid, "Topology")
+    set_attribute(xtopo, "TopologyType", "2DCoRectMesh")
+    set_attribute(xtopo, "Dimensions", join(reverse(dim).+1,' '))  # dim in reversed order, one more than dim per direction
+
+    xgeom = new_child(xgrid, "Geometry")
+    set_attribute(xgeom, "GeometryType", "ORIGIN_DXDY")
+
+    xorig = new_child(xgeom, "DataItem")  # first data item: grid origin, reversed order
+    set_attribute(xorig, "Format", "XML")
+    set_attribute(xorig, "NumberType", "Float")
+    set_attribute(xorig, "Dimensions", "$(length(dim)) ")
+    add_text(xorig, join(reverse(origin), ' '))
+
+    xdr = new_child(xgeom, "DataItem")  # second data item: grid spacing, reversed order
+    set_attribute(xdr, "Format", "XML")
+    set_attribute(xdr, "NumberType", "Float")
+    set_attribute(xdr, "Dimensions", "$(length(dim))")
+    add_text(xdr, join(reverse(spacing), ' '))
+
+    for (name,_) ∈ fields  # one attribute per field, all read from the same HDF5 file
+        create_xdmf_attribute(xgrid,h5_path,name,dim)
+    end
+    save_file(xdoc, path)
+    return
+end
+
+function create_xdmf_attribute(xgrid,h5_path,name,dim_g)  # adds a cell-centred attribute `name` to xgrid and returns it
+    # TODO: solve type and precision (both are hard-coded below as Float / 8)
+    xattr = new_child(xgrid, "Attribute")
+    set_attribute(xattr, "Name", name)
+    set_attribute(xattr, "Center", "Cell")
+    xdata = new_child(xattr, "DataItem")
+    set_attribute(xdata, "Format", "HDF")
+    set_attribute(xdata, "NumberType", "Float")
+    set_attribute(xdata, "Precision", "8")
+    set_attribute(xdata, "Dimensions", join(reverse(dim_g), ' '))  # dim_g in reversed order
+    add_text(xdata, "$(h5_path):/$name")  # data reference: dataset "/name" inside the HDF5 file
+    return xattr
+end
diff --git a/scripts2D_variational_TM/geometry.jl b/scripts2D_variational_TM/geometry.jl
new file mode 100644
index 00000000..4ca7fbde
--- /dev/null
+++ b/scripts2D_variational_TM/geometry.jl
@@ -0,0 +1,22 @@
+function make_marker_chain_circle(rc, rad, hmax)
+    np = ceil(Int, 2π * rad / hmax)  # circumference divided by hmax, rounded up
+    return map(i -> rc + rad .* Point2(reverse(sincospi(2 * (i - 1) / np))...), 1:np)  # np points on the circle around rc
+end
+
+function signed_distance(p::Point2{T}, poly::AbstractVector{Point2{T}}) where {T}  # signed distance from p to the closed polygon poly
+    d = dot(p - poly[1], p - poly[1])  # running minimum of the squared distance
+    s = 1.0                            # sign of the result, flipped inside the loop
+    j = length(poly)                   # previous vertex; starting at the last one closes the polygon
+    for i in eachindex(poly)
+        e = poly[j] - poly[i]
+        w = p - poly[i]
+        b = w - e .* clamp(dot(w, e) / dot(e, e), 0.0, 1.0)  # vector from the closest point on segment i–j to p
+        d = min(d, dot(b, b))
+        c = p[2] >= poly[i][2], p[2] < poly[j][2], e[1] * w[2] > e[2] * w[1]
+        if all(c) || all(.!c)  # presumably an even–odd crossing test, making the result negative inside — TODO confirm
+            s = -s
+        end
+        j = i
+    end
+    return s * sqrt(d)
+end
\ No newline at end of file
diff --git a/scripts2D_variational_TM/level_set_kernels.jl b/scripts2D_variational_TM/level_set_kernels.jl
new file mode 100644
index 00000000..6bef91d4
--- /dev/null
+++ b/scripts2D_variational_TM/level_set_kernels.jl
@@ -0,0 +1,21 @@
+@tiny function _kernel_compute_levelset_from_polygon!(op, ψ, xv, yv, mc)  # combines ψ with the signed distance to polygon mc
+    ix, iy = @indices
+    @inbounds ψ[ix, iy] = op(ψ[ix, iy], signed_distance(Point(xv[ix], yv[iy]), mc))  # op(old value, distance at (xv[ix], yv[iy]))
+end
+
+@tiny function _kernel_extrapolate_with_levelset!(∂A_∂τ,A,Ψ,Δx,Δy)  # writes a one-sided-difference update for A where Ψ > 0
+    @inline S(x) = x/sqrt(x^2+max(Δx,Δy)^2)  # smoothed sign of x, regularised with the larger grid spacing
+    @inline Ψ_c(_ix,_iy) = 0.25*(Ψ[_ix,_iy] + Ψ[_ix+1,_iy] + Ψ[_ix,_iy+1] + Ψ[_ix+1,_iy+1])  # 4-point mean
+    @inline Ψ_x(_ix,_iy) = 0.5*(Ψ[_ix,_iy] + Ψ[_ix+1,_iy])  # mean of two x-neighbours
+    @inline Ψ_y(_ix,_iy) = 0.5*(Ψ[_ix,_iy] + Ψ[_ix,_iy+1])  # mean of two y-neighbours
+    ix,iy = @indices
+    if Ψ[ix,iy] > 0 && Ψ[ix+1,iy] > 0 && Ψ[ix,iy+1] > 0 && Ψ[ix+1,iy+1] > 0  # only where all four surrounding Ψ are positive
+        s     = S(Ψ_c(ix,iy))
+        ∇Ψx   = (Ψ_y(ix+1,iy) - Ψ_y(ix,iy))/Δx
+        ∇Ψy   = (Ψ_x(ix,iy+1) - Ψ_x(ix,iy))/Δy
+        nx,ny = ∇Ψx/sqrt(∇Ψx^2 + ∇Ψy^2), ∇Ψy/sqrt(∇Ψx^2 + ∇Ψy^2)  # unit vector along ∇Ψ
+        Fx    = max(s*nx,0)*(A[ix,iy]-A[ix-1,iy])/Δx + min(s*nx,0)*(A[ix+1,iy]-A[ix,iy])/Δx  # backward difference for s*nx > 0, forward otherwise
+        Fy    = max(s*ny,0)*(A[ix,iy]-A[ix,iy-1])/Δy + min(s*ny,0)*(A[ix,iy+1]-A[ix,iy])/Δy
+        ∂A_∂τ[ix-1,iy-1] = -(Fx + Fy)  # output is indexed one lower than A in both dimensions
+    end
+end
\ No newline at end of file
diff --git a/scripts2D_variational_TM/level_sets.jl b/scripts2D_variational_TM/level_sets.jl
new file mode 100644
index 00000000..dac746b5
--- /dev/null
+++ b/scripts2D_variational_TM/level_sets.jl
@@ -0,0 +1,18 @@
+include("level_set_kernels.jl")
+
+const _compute_levelset_from_polygon! = _kernel_compute_levelset_from_polygon!(get_device())  # launcher bound to the device from get_device()
+
+function compute_levelset!(op, Ψ, xv, yv, mc)
+    launched = _compute_levelset_from_polygon!(op, Ψ, xv, yv, mc; ndrange=axes(Ψ))  # one index per entry of Ψ
+    wait(launched); return nothing  # block until the kernel has finished
+end
+
+# by default, combine the new and the current level set with `min` (union of the two level sets)
+compute_levelset!(Ψ, xv, yv, mc) = compute_levelset!(min, Ψ, xv, yv, mc)
+
+const _extrapolate_with_levelset! = _kernel_extrapolate_with_levelset!(get_device())  # launcher bound to the device from get_device()
+
+function extrapolate_with_levelset!(∂A_∂τ,A,Ψ,Δx,Δy)
+    interior = (axes(A,1)[2:end-1],axes(A,2)[2:end-1])  # all indices of A except the outermost layer
+    wait(_extrapolate_with_levelset!(∂A_∂τ,A,Ψ,Δx,Δy; ndrange=interior)); return nothing
+end
diff --git a/scripts2D_variational_TM/runme.sh b/scripts2D_variational_TM/runme.sh
new file mode 100755
index 00000000..8a535606
--- /dev/null
+++ b/scripts2D_variational_TM/runme.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+module purge
+module load cuda openmpi hdf5
+
+export JULIA_HDF5_PATH=/scratch-1/soft/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.4.0/hdf5-1.12.1-cualmplov32dcc2bdnucqywutca437vp/
+
+# julia --project scripts2D_variational_TM/test_volume_fractions2D.jl
\ No newline at end of file
diff --git a/scripts2D_variational_TM/signed_distances.jl b/scripts2D_variational_TM/signed_distances.jl
new file mode 100644
index 00000000..6bc7b26e
--- /dev/null
+++ b/scripts2D_variational_TM/signed_distances.jl
@@ -0,0 +1,65 @@
+using LinearAlgebra,GeometryBasics
+
+@inline S(x) = iszero(x) ? oneunit(x) : sign(x)  # sign of x, with zero counted as positive
+@inline sign_triangle(p,a,b,c) = S(dot(p - a, cross(b - a, c - a)))  # side of the plane through a, b, c on which p lies
+
+@inline function ud_triangle(p,a,b,c)
+    # unsigned distance from point p to the triangle with vertices a, b, c
+    dot2(v) = dot(v,v)
+    ba, cb, ac = b - a, c - b, a - c   # edge vectors
+    pa, pb, pc = p - a, p - b, p - c   # p relative to each vertex
+    nor = cross(ba,ac)
+    side = sign(dot(cross(ba,nor),pa)) + sign(dot(cross(cb,nor),pb)) + sign(dot(cross(ac,nor),pc))
+    d2 = if side < 2
+        # smallest squared distance from p to the three edge segments
+        min(dot2(ba*clamp(dot(ba,pa)/dot2(ba),0,1)-pa),
+            dot2(cb*clamp(dot(cb,pb)/dot2(cb),0,1)-pb),
+            dot2(ac*clamp(dot(ac,pc)/dot2(ac),0,1)-pc))
+    else
+        # squared distance from p to the plane of the triangle
+        dot(nor,pa)*dot(nor,pa)/dot2(nor)
+    end
+    return sqrt(d2)
+end
+
+@inline function closest_vertex_index(P,rc)  # per dimension: index of the grid interval of rc that holds P, clamped
+    lims = map(x->x[1:end-1],axes.(rc,1))  # valid indices of each range without the last one
+    Δ = step.(rc)
+    O = first.(rc)
+    I = @. clamp(Int(fld(P-O,Δ))+1,lims)  # floored offset from the first grid value, converted to a 1-based index
+    return CartesianIndex(I...)
+end
+
+@inline inc(I,dim) = Base.setindex(I,I[dim]+1,dim)  # copy of I with component `dim` increased by one
+@inline inc(I) = I + oneunit(I)                     # I increased by one in every component
+
+@inline function triangle_pair(Iv,dem,rc)  # the two triangles spanned by the four dem samples at Iv and its +1 neighbours
+    @inline function sample_dem(I)  # 3-D point: grid coordinates from rc, third component from dem
+        @inbounds x,y = rc[1][I[1]],rc[2][I[2]]
+        @inbounds Point3(x,y,dem[I])
+    end
+    T_BL = Triangle(sample_dem(Iv)       ,sample_dem(inc(Iv,1)),sample_dem(inc(Iv,2)))  # contains the sample at Iv
+    T_TR = Triangle(sample_dem(inc(Iv,2)),sample_dem(inc(Iv,1)),sample_dem(inc(Iv)))    # contains the sample at Iv+1 in both dims
+    return T_BL,T_TR
+end
+
+@inline function distance_to_triangle_pair(P,Iv,dem,rc)  # unsigned distance from P to the triangle pair at Iv, plus a sign
+    T_BL,T_TR = triangle_pair(Iv,dem,rc)
+    ud = min(ud_triangle(P,T_BL...),ud_triangle(P,T_TR...))
+    return ud,sign_triangle(P,T_BL...)  # the sign is evaluated against T_BL only
+end
+
+function sd_dem(P,cutoff,dem,rc)  # unsigned distance and sign of P relative to the triangulated surface given by dem on grid rc
+    @inbounds Pp = clamp.(Point(P[1],P[2]),first.(rc),last.(rc))  # first two components of P clamped to the grid extent
+    @inbounds P  = Point(Pp[1],Pp[2],P[3])
+    BL = closest_vertex_index(Pp.-cutoff,rc)  # corners of the index window reaching ±cutoff around Pp
+    TR = closest_vertex_index(Pp.+cutoff,rc)
+    Ic = closest_vertex_index(Pp,rc)
+    ud,sgn = distance_to_triangle_pair(P,Ic,dem,rc)  # the sign comes from the triangle pair at Ic only
+    for Iv in BL:TR  # the distance is minimised over every triangle pair in the window
+        if Iv == Ic continue end
+        ud_pair,_ = distance_to_triangle_pair(P,Iv,dem,rc)
+        ud = min(ud,ud_pair)
+    end
+    return ud,sgn
+end
\ No newline at end of file
diff --git a/scripts2D_variational_TM/stokes.jl b/scripts2D_variational_TM/stokes.jl
new file mode 100644
index 00000000..fbf1d9b0
--- /dev/null
+++ b/scripts2D_variational_TM/stokes.jl
@@ -0,0 +1,33 @@
+include("stokes_kernels.jl")
+
+const _update_ηs! = _kernel_update_ηs!(get_device())  # launchers bound to the device from get_device()
+const _update_σ! = _kernel_update_σ!(get_device())
+const _update_V! = _kernel_update_V!(get_device())
+const _compute_residual! = _kernel_compute_residual!(get_device())
+
+function update_ηs!(ηs,ε̇,T,wt,K,n,Q_R,T_mlt,ηreg,χ)
+    launched = _update_ηs!(ηs,ε̇,T,wt,K,n,Q_R,T_mlt,ηreg,χ;ndrange=axes(ηs))  # one index per entry of ηs
+    wait(launched); return nothing  # block until the kernel has finished
+end
+
+function update_σ!(Pr, τ, ε̇, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy)
+    launched = _update_σ!(Pr, τ, ε̇, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy; ndrange=axes(Pr))  # one index per entry of Pr
+    wait(launched); return nothing  # block until the kernel has finished
+end
+
+function update_V!(V, Pr, τ, ηs, wt, nudτ, ρg, dx, dy)  # velocity update on the interior, then boundary treatment
+    V_inn = (x=inn(V.x), y=inn(V.y))  # views without the outermost layer; only these are written by the kernel
+    wait(_update_V!(V_inn, Pr, τ, ηs, wt, nudτ, ρg, dx, dy; ndrange=axes(Pr)))
+    bc_x_neumann!(0.0, V.y)  # first/last x-entries of V.y copied from their inner neighbours (val = 0)
+    bc_y_neumann!(0.0, V.x)  # first/last y-entries of V.x copied from their inner neighbours (val = 0)
+    TinyKernels.device_synchronize(FastIce.get_device())
+    @. V.x[end,:] = V.x[end-1,:]*wt.not_solid.x[end-1,:]  # outer x-entries of V.x: inner neighbour times its not_solid weight
+    @. V.x[1  ,:] = V.x[2    ,:]*wt.not_solid.x[2    ,:]
+    TinyKernels.device_synchronize(FastIce.get_device())
+    return
+end
+
+function compute_residual!(Res, Pr, V, τ, wt, ρg, dx, dy)
+    launched = _compute_residual!(Res, Pr, V, τ, wt, ρg, dx, dy; ndrange=axes(Pr))  # one index per entry of Pr
+    wait(launched); return nothing  # block until the kernel has finished
+end
\ No newline at end of file
diff --git a/scripts2D_variational_TM/stokes_kernels.jl b/scripts2D_variational_TM/stokes_kernels.jl
new file mode 100644
index 00000000..d1707121
--- /dev/null
+++ b/scripts2D_variational_TM/stokes_kernels.jl
@@ -0,0 +1,136 @@
+@tiny function _kernel_update_σ!(Pr, τ, ε̇, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy)  # updates ε̇, Pr and τ at index (ix, iy)
+    ns,na = wt.not_solid, wt.not_air
+    ix, iy = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy)
+    # detect and eliminate null spaces: any of the four surrounding not_air face weights is ≈ 0
+    isnull = (na.x[ix, iy] ≈ 0.0) || (na.x[ix+1, iy] ≈ 0.0) ||
+             (na.y[ix, iy] ≈ 0.0) || (na.y[ix, iy+1] ≈ 0.0)
+    if !isnull && (na.c[ix, iy] > 0.0)  # NOTE(review): this branch has no bounds guard and no @inbounds; relies on ndrange
+        ε̇.xx[ix, iy] = (V.x[ix+1, iy] * ns.x[ix+1, iy] - V.x[ix, iy] * ns.x[ix, iy]) / dx  # differences of not_solid-weighted velocities
+        ε̇.yy[ix, iy] = (V.y[ix, iy+1] * ns.y[ix, iy+1] - V.y[ix, iy] * ns.y[ix, iy]) / dy
+        ∇V = ε̇.xx[ix, iy] + ε̇.yy[ix, iy]
+        Pr[ix, iy] -= ∇V * ηs[ix, iy] * r / θ_dτ
+        τ.xx[ix, iy] += (-τ.xx[ix, iy] + 2.0 * ηs[ix, iy] * (ε̇.xx[ix,iy] - ∇V / 3.0)) * dτ_r  # relax τ towards 2ηs·(ε̇ − ∇V/3)
+        τ.yy[ix, iy] += (-τ.yy[ix, iy] + 2.0 * ηs[ix, iy] * (ε̇.yy[ix,iy] - ∇V / 3.0)) * dτ_r
+    else
+        Pr[ix, iy] = 0.0
+        τ.xx[ix, iy] = 0.0
+        τ.yy[ix, iy] = 0.0
+    end
+    @inbounds if isin(τ.xy)
+        # detect and eliminate null spaces
+        isnull = (na.x[ix+1, iy+1] ≈ 0.0) || (na.x[ix+1, iy] ≈ 0.0) ||
+                 (na.y[ix+1, iy+1] ≈ 0.0) || (na.y[ix, iy+1] ≈ 0.0)
+        if !isnull && (na.xy[ix, iy] > 0.0)
+            ε̇.xy[ix, iy] =
+                0.5 * (
+                    (V.x[ix+1,iy+1]*ns.x[ix+1,iy+1] - V.x[ix+1, iy]*ns.x[ix+1, iy]) / dy +
+                    (V.y[ix+1,iy+1]*ns.y[ix+1,iy+1] - V.y[ix, iy+1]*ns.y[ix, iy+1]) / dx
+                )
+            ηs_av = 0.25 * (ηs[ix, iy] + ηs[ix+1, iy] + ηs[ix, iy+1] + ηs[ix+1, iy+1])  # 4-point mean of ηs
+            τ.xy[ix, iy] += (-τ.xy[ix, iy] + 2.0 * ηs_av * ε̇.xy[ix, iy]) * dτ_r
+        else
+            τ.xy[ix, iy] = 0.0
+        end
+    end
+    return
+end
+
+@tiny function _kernel_update_V!(V, Pr, τ, ηs, wt, nudτ, ρg, dx, dy)  # updates V.x and V.y at index (ix, iy)
+    ns,na = wt.not_solid, wt.not_air
+    ix, iy = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy)
+    # TODO: check which volume fraction (non-air or non-solid) really determines the null spaces
+    @inbounds if isin(V.x)
+        # detect and eliminate null spaces
+        isnull = (ns.c[ix+1, iy+1] ≈ 0) || (ns.c[ix, iy+1] ≈ 0) ||
+                 (ns.xy[ix, iy+1] ≈ 0) || (ns.xy[ix, iy] ≈ 0)
+        if !isnull && (ns.x[ix+1, iy+1] > 0) && (na.x[ix+1, iy+1] > 0)
+            ηs_x = max(ηs[ix, iy+1], ηs[ix+1, iy+1])  # larger ηs of the two x-neighbours
+            ∂σxx_∂x = ((-Pr[ix+1, iy+1] + τ.xx[ix+1, iy+1]) * na.c[ix+1, iy+1] -
+                       (-Pr[ix  , iy+1] + τ.xx[ix  , iy+1]) * na.c[ix  , iy+1]) / dx  # difference of not_air-weighted stress
+            ∂τxy_∂y = (τ.xy[ix, iy+1] * na.xy[ix, iy+1] - τ.xy[ix, iy] * na.xy[ix, iy]) / dy
+            V.x[ix, iy] += (∂σxx_∂x + ∂τxy_∂y - ρg.x) * nudτ / ηs_x
+        else
+            V.x[ix, iy] = 0.0
+        end
+    end
+    @inbounds if isin(V.y)
+        # detect and eliminate null spaces
+        isnull = (ns.c[ix+1, iy+1] ≈ 0) || (ns.c[ix+1, iy] ≈ 0) ||
+                 (ns.xy[ix+1, iy] ≈ 0) || (ns.xy[ix, iy] ≈ 0)
+        if !isnull && (ns.y[ix+1, iy+1] > 0) && (na.y[ix+1, iy+1] > 0)
+            # TODO: check which cells contribute to the momentum balance to verify ηs_y is computed correctly
+            ηs_y = max(ηs[ix+1, iy], ηs[ix+1, iy+1])  # larger ηs of the two y-neighbours
+            ∂σyy_∂y = ((-Pr[ix+1, iy+1] + τ.yy[ix+1, iy+1]) * na.c[ix+1, iy+1] -
+                       (-Pr[ix+1, iy  ] + τ.yy[ix+1, iy  ]) * na.c[ix+1, iy  ]) / dy
+            ∂τxy_∂x = (τ.xy[ix+1, iy] * na.xy[ix+1, iy] - τ.xy[ix, iy] * na.xy[ix, iy]) / dx
+            V.y[ix, iy] += (∂σyy_∂y + ∂τxy_∂x - ρg.y) * nudτ / ηs_y
+        else
+            V.y[ix, iy] = 0.0
+        end
+    end
+end
+
+@tiny function _kernel_update_ηs!(ηs,ε̇,T,wt,K,n,Q_R,T_mlt,ηreg,χ)  # relaxes ηs towards a strain-rate and temperature dependent value
+    ix, iy = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy)
+    @inbounds if isin(ηs)
+        ε̇xyc = 0.0
+        for idy = -1:0, idx = -1:0  # sum of the four ε̇.xy entries around (ix, iy), indices clamped to the array
+            ix2,iy2 = clamp(ix+idx,1,size(ε̇.xy,1)),clamp(iy+idy,1,size(ε̇.xy,2))
+            ε̇xyc += ε̇.xy[ix2,iy2]
+        end
+        ε̇xyc *= 0.25
+        ε̇II  = sqrt(0.5*(ε̇.xx[ix,iy]^2 + ε̇.yy[ix,iy]^2) + ε̇xyc^2)
+        ηs_t = 0.5*K*exp(-1/n*Q_R*(1/T_mlt - 1/T[ix,iy]))*ε̇II^(1/n-1)  # power law in ε̇II with an exponential temperature factor
+        ηs_t = wt.not_air.c[ix,iy]/(1/ηs_t + 1/ηreg)  # harmonic combination with ηreg, scaled by the not_air weight
+        ηs[ix,iy] = exp(log(ηs[ix,iy])*(1-χ) + log(ηs_t)*χ)  # blend of old and new value in log space with weight χ
+    end
+end
+
+@tiny function _kernel_compute_residual!(Res, Pr, V, τ, wt, ρg, dx, dy)  # fills Res.Pr, Res.V.x and Res.V.y at index (ix, iy)
+    ns,na = wt.not_solid, wt.not_air
+    ix, iy = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy)
+    @inbounds if isin(Pr)  # NOTE(review): guard tests Pr while the write goes to Res.Pr — confirm both have the same size
+        # detect and eliminate null spaces
+        isnull = (na.x[ix, iy] ≈ 0.0) || (na.x[ix+1, iy] ≈ 0.0) ||
+        (na.y[ix, iy] ≈ 0.0) || (na.y[ix, iy+1] ≈ 0.0)
+        if !isnull && (na.c[ix, iy] > 0.0)
+            exx = (V.x[ix+1, iy] * ns.x[ix+1, iy] - V.x[ix, iy] * ns.x[ix, iy]) / dx
+            eyy = (V.y[ix, iy+1] * ns.y[ix, iy+1] - V.y[ix, iy] * ns.y[ix, iy]) / dy
+            ∇V  = exx + eyy
+            Res.Pr[ix, iy] = ∇V  # residual: divergence of the not_solid-weighted velocity
+        else
+            Res.Pr[ix, iy] = 0.0
+        end
+    end
+    @inbounds if isin(Res.V.x)
+        # detect and eliminate null spaces
+        isnull = (ns.c[ix+1, iy+1] ≈ 0) || (ns.c[ix, iy+1] ≈ 0) ||
+                 (ns.xy[ix, iy+1] ≈ 0) || (ns.xy[ix, iy] ≈ 0)
+        if !isnull && (na.x[ix+1, iy+1] > 0) && (ns.x[ix+1, iy+1] > 0)
+            ∂σxx_∂x = ((-Pr[ix+1, iy+1] + τ.xx[ix+1, iy+1]) * na.c[ix+1, iy+1] -
+                       (-Pr[ix  , iy+1] + τ.xx[ix  , iy+1]) * na.c[ix  , iy+1]) / dx
+            ∂τxy_∂y = (τ.xy[ix, iy+1] * na.xy[ix, iy+1] - τ.xy[ix, iy] * na.xy[ix, iy]) / dy
+            Res.V.x[ix, iy] = ∂σxx_∂x + ∂τxy_∂y - ρg.x  # residual: stress-difference terms minus ρg.x
+        else
+            Res.V.x[ix, iy] = 0.0
+        end
+    end
+    @inbounds if isin(Res.V.y)
+        # detect and eliminate null spaces
+        isnull = (ns.c[ix+1, iy+1] ≈ 0) || (ns.c[ix+1, iy] ≈ 0) ||
+                 (ns.xy[ix+1, iy] ≈ 0) || (ns.xy[ix, iy] ≈ 0)
+        if !isnull && (na.y[ix+1, iy+1] > 0) && (ns.y[ix+1, iy+1] > 0)
+            ∂σyy_∂y = ((-Pr[ix+1, iy+1] + τ.yy[ix+1, iy+1]) * na.c[ix+1, iy+1] -
+                       (-Pr[ix+1, iy  ] + τ.yy[ix+1, iy  ]) * na.c[ix+1, iy  ]) / dy
+            ∂τxy_∂x = (τ.xy[ix+1, iy] * na.xy[ix+1, iy] - τ.xy[ix, iy] * na.xy[ix, iy]) / dx
+            Res.V.y[ix, iy] = ∂σyy_∂y + ∂τxy_∂x - ρg.y  # residual: stress-difference terms minus ρg.y
+        else
+            Res.V.y[ix, iy] = 0.0
+        end
+    end
+    return
+end
\ No newline at end of file
diff --git a/scripts2D_variational_TM/test_volume_fractions2D.jl b/scripts2D_variational_TM/test_volume_fractions2D.jl
new file mode 100644
index 00000000..7dccf5f4
--- /dev/null
+++ b/scripts2D_variational_TM/test_volume_fractions2D.jl
@@ -0,0 +1,409 @@
+using FastIce
+using TinyKernels
+using HDF5
+using LightXML
+using UnicodePlots
+using LinearAlgebra
+using GeometryBasics
+using CairoMakie
+using ElasticArrays
+using Printf
+using JLD2
+
+using CUDA
+CUDA.device!(1)
+
+include("geometry.jl")
+include("signed_distances.jl")
+include("level_sets.jl")
+include("volume_fractions.jl")
+include("bcs.jl")
+include("stokes.jl")
+include("thermo.jl")
+include("data_io.jl")
+
+@views inn_x(A) = A[2:end-1,:]         # view of A without its first and last index along dimension 1
+@views inn_y(A) = A[:,2:end-1]         # view of A without its first and last index along dimension 2
+@views inn(A)   = A[2:end-1,2:end-1]   # view of A without its outermost layer in both dimensions
+@views av1(A)   = 0.5.*(A[1:end-1].+A[2:end])  # means of consecutive entries of a 1-D array (new array)
+@views av4(A)   = 0.25.*(A[1:end-1,1:end-1].+A[1:end-1,2:end].+A[2:end,1:end-1].+A[2:end,2:end])  # means over 2×2 blocks (new array)
+
+# generate synthetic sinusoidal geometry with constant slope
+# `ox` - domain origin in x
+# `oz` - domain origin in z
+# `lx` - domain extent
+# `Δz` - offset of the bed from origin
+# `amp` - amplitude of the bumps
+# `ω` - frequency of the bumps
+# `α` - slope
+# `nx` - number of grid points
+function generate_sinusoidal(ox,oz,lx,Δz,amp,ω,α,nx)
+    xv = LinRange(0,lx,nx)
+    zv = @. amp*cos((2π*ω/lx)*xv) + tan(α)*xv  # ω cosine periods over lx on top of a line with slope tan(α)
+    zv .= zv .- minimum(zv) .+ Δz              # shift so that the lowest point sits at Δz
+    return Point2.(xv.+ox,zv.+oz)              # translate by the domain origin
+end
+
+# generate synthetic circle (arc) shape
+# `ox`, `oz` - centre of the circle in x and z
+# `r` - circle radius
+# `θs`, `θe` - start and end angle of the arc
+# `nθ` - number of points
+function generate_circle(ox,oz,r,θs,θe,nθ)
+    θ  = LinRange(θs,θe*(1-1/nθ),nθ)  # NOTE(review): last angle is θe - θe/nθ, one step short of θe only if θs == 0 — confirm intent
+    zv,xv = r.*sin.(θ), r.*cos.(θ)
+    return Point2.(xv.+ox,zv.+oz)
+end
+
+@views function run_simulation(nz)
+    ## physics =========================================================================================================
+    # non-dimensional numbers
+    α       = deg2rad(-20)   # slope
+    nglen   = 3              # Glen's law power exponent
+    ρr      = 0.92           # density ratio of ice to water
+    cpr     = 0.5            # heat capacity ratio of ice to water
+    U_P     = 60.0           # ratio of sensible heat to gravitational potential energy
+    L_P     = 37.0           # ratio of latent heat to gravitational potential energy
+    Pr      = 2e-9           # Prandtl number - ratio of thermal diffusivity to momentum diffusivity
+    A_L     = 5e-2           # ratio of bump amplitude to length scale
+    lx_lz   = 3e0            # ratio of horizontal to vertical domain extents
+    nbump   = 13             # number of bumps
+    Q_RT    = 2*26.0         # ratio of activation temperature to melting temperature
+    # dimensionally independent parameters
+    lz      = 1.0            # domain size in z-direction    [m         ]
+    K       = 1.0            # consistency                   [Pa*s^(1/n)]
+    ρg      = 1.0            # ice gravity pressure gradient [Pa/m      ]
+    T_mlt   = 1.0            # ice melting temperature       [K         ]
+    # scales
+    l̄       = lz             # length scale                  [m         ]
+    σ̄       = ρg*cos(α)*l̄    # stress scale                  [Pa        ]
+    t̄       = (K/σ̄)^nglen    # time scale                    [s         ]
+    T̄       = T_mlt          # temperature scale             [K         ]
+    # dimensionally dependent
+    lx      = lx_lz*lz       # domain length                 [m         ]
+    λ_i     = Pr*σ̄*l̄^2/(T̄*t̄) # thermal conductivity          [W/m/K     ]
+    ρcp     = U_P*σ̄/T̄        # ice heat capacity             [Pa/K      ]
+    ρL      = L_P*σ̄          # latent heat of melting        [Pa        ]
+    Q_R     = Q_RT*T_mlt     # activational temperature      [K         ]
+    T_atm   = 0.9*T_mlt      # atmospheric temperature       [K         ]
+    T_ini   = 0.9*T_mlt      # initial surface temperature   [K         ]
+    amp     = A_L*l̄          # bump amplitude                [m         ]
+    ox,oz   = -0.5lx,0.0lz   # domain origin                 [m         ]
+    rgl     = 1.2lz          # glacier radius                [m         ]
+    ogx,ogz = 0.0lx,-0.3rgl  # glacier origin                [m         ]
+    ηreg    = 0.5*K*(1e-6/t̄)^(1/nglen-1)
+    # not important (cancels in the equations)
+    ρ_w     = 1.0            # density of water              [kg/m^3    ]
+    ρ_i     = ρr*ρ_w         # density of ice                [kg/m^3    ]
+    cp_i    = ρcp/ρ_i        # heat capacity of ice          [J/kg/K    ]
+    cp_w    = cp_i/cpr       # heat capacity of ice          [J/kg/K    ]
+    L       = ρL/ρ_w         # latent heat of melting        [J/kg      ]
+    # phase data
+    ρ  = (ice = ρ_i , wat = ρ_w )
+    cp = (ice = cp_i, wat = cp_w)
+    λ  = (ice = λ_i , wat = λ_i )
+    # body force
+    f  = (x = ρg*sin(α), y = ρg*cos(α))
+    # thermodynamics
+    @inline u_ice(T)  = cp.ice*(T-T_mlt)
+    @inline u_wat(T)  = L + cp.wat*(T-T_mlt)
+    @inline T_lt(u_t) = (u_t < u_ice(T_mlt)) ? T_mlt + u_t/cp.ice :
+                        (u_t > u_wat(T_mlt)) ? T_mlt + (u_t - L)/cp.wat : T_mlt
+    @inline ω_lt(u_t) = (u_t < u_ice(T_mlt)) ? 0.0 :
+                        (u_t > u_wat(T_mlt)) ? 1.0 : ρ.ice*(u_ice(T_mlt) - u_t)/(ρ.ice*(u_ice(T_mlt)-u_t) - ρ.wat*(u_wat(T_mlt)-u_t))
+    ## numerics ========================================================================================================
+    nx      = ceil(Int,nz*lx/lz)
+    ϵtol    = (1e-4,1e-4,1e-4)
+    maxiter = 50max(nx,nz)
+    ncheck  = ceil(Int,0.5max(nx,nz))
+    nviz    = 5
+    nsave   = 5
+    nt      = 500
+    # nviz    = 1
+    # nsave   = 1
+    # nt      = 1
+    χ       = 5e-3
+    ## preprocessing ===================================================================================================
+    # grid spacing
+    dx,dz = lx/nx,lz/nz
+    @info "grid resolution: $nx × $nz"
+    @info "grid spacing   : dx = $dx, dz = $dz"
+    @info "generating DEM data"
+    dem = (
+        bed = generate_sinusoidal(ox       ,oz,lx       ,0.05lz,amp,nbump            ,0,nx),
+        ice = generate_sinusoidal(ox-0.25lx,oz,lx+0.25lx,0.70lz,0  ,1    ,deg2rad(-3),nx),
+        # ice = generate_circle(ogx,ogz,rgl,0,π,ceil(Int,π*rgl/dx))
+    )
+    TinyKernels.device_synchronize(FastIce.get_device())
+    @info "computing marker chains"
+    mc = (
+        bed = to_device(push!(pushfirst!(copy(dem.bed),Point2(ox,0.0)),Point2(ox+lx,0.0))),
+        ice = to_device(push!(pushfirst!(copy(dem.ice),Point2(ox-0.25lx,0.0)),Point2(ox+lx+0.25lx,0.0))),
+        # ice = to_device(dem.ice)
+    )
+    # grid locations
+    xv = LinRange(ox,ox+lx,nx+1)
+    zv = LinRange(oz,oz+lz,nz+1)
+    xc,zc = av1.((xv,zv))
+    # PT params
+    r          = 0.7
+    lτ_re_mech = 1.5min(lx,lz)/π
+    vdτ_mech   = min(dx,dz)/sqrt(5.1)
+    θ_dτ       = lτ_re_mech*(r+4/3)/vdτ_mech
+    nudτ       = vdτ_mech*lτ_re_mech
+    dτ_r       = 1.0/(θ_dτ+1.0)
+    ## fields allocation ===============================================================================================
+    # level set
+    Ψ = (
+        not_solid = scalar_field(Float64,nx+1,nz+1),
+        not_air   = scalar_field(Float64,nx+1,nz+1),
+    )
+    wt = (
+        not_solid = volfrac_field(Float64,nx,nz),
+        not_air   = volfrac_field(Float64,nx,nz),
+    )
+    # mechanics
+    Pr = scalar_field(Float64,nx,nz)
+    τ  = tensor_field(Float64,nx,nz)
+    ε̇  = tensor_field(Float64,nx,nz)
+    V  = vector_field(Float64,nx,nz)
+    ηs = scalar_field(Float64,nx,nz)
+    # thermal
+    ρU = scalar_field(Float64,nx,nz)
+    T  = scalar_field(Float64,nx,nz)
+    qT = vector_field(Float64,nx,nz)
+    # hydro
+    ω  = scalar_field(Float64,nx,nz)
+    # residuals
+    Res = (
+        Pr = scalar_field(Float64,nx  ,nz  ),
+        V  = vector_field(Float64,nx-2,nz-2)
+    )
+    # visualisation
+    Vmag = scalar_field(Float64,nx-2,nz-2)
+    ε̇II  = scalar_field(Float64,nx-2,nz-2)
+    Ψav  = (
+        not_solid = scalar_field(Float64,nx-2,nz-2),
+        not_air   = scalar_field(Float64,nx-2,nz-2),
+    )
+    ## initialisation ==================================================================================================
+    # level set
+    TinyKernels.device_synchronize(FastIce.get_device())
+    for comp in eachindex(Ψ) fill!(Ψ[comp],Inf) end
+    @info "computing the level set for the ice surface"
+    compute_levelset!(Ψ.not_air,xv,zv,mc.ice)
+    @info "computing the level set for the bedrock surface"
+    compute_levelset!(Ψ.not_solid,xv,zv,mc.bed)
+    TinyKernels.device_synchronize(FastIce.get_device())
+    @. Ψ.not_solid *= -1.0
+    TinyKernels.device_synchronize(FastIce.get_device())
+    @info "computing volume fractions from level sets"
+    for phase in eachindex(Ψ)
+        compute_volume_fractions_from_level_set!(wt[phase],Ψ[phase],dx,dz)
+    end
+    TinyKernels.device_synchronize(FastIce.get_device())
+    # mechanics
+    for comp in eachindex(V) fill!(V[comp],0.0) end
+    for comp in eachindex(τ) fill!(τ[comp],0.0) end
+    for comp in eachindex(τ) fill!(ε̇[comp],0.0) end
+    fill!(Pr,0.0)
+    fill!(ηs,0.5*K*(1e-1/t̄)^(1/nglen-1)*exp(-1/nglen*Q_R*(1/T_mlt-1/T_ini)))    
+    # fill!(ηs,1.0)
+    TinyKernels.device_synchronize(FastIce.get_device())
+    # thermo
+    for comp in eachindex(qT) fill!(qT[comp],0.0) end
+    @. T  = lerp(T_atm,T_ini,wt.not_air.c) 
+    @. ρU = ρ.ice*u_ice(T)
+    @. ω  = ω_lt(ρU/ρ.ice)
+    TinyKernels.device_synchronize(FastIce.get_device())
+    # convergence tracking
+    iter_evo = Float64[]
+    errs_evo = ElasticArray{Float64}(undef, length(ϵtol), 0)
+    # figures
+    fig = Figure(resolution=(3000,1200),fontsize=32)
+    axs = (
+        hmaps = (
+            Pr   = Axis(fig[1,1][1,1];aspect=DataAspect(),title="p"  ),
+            ε̇II  = Axis(fig[1,2][1,1];aspect=DataAspect(),title="ε̇II"),
+            Vmag = Axis(fig[1,3][1,1];aspect=DataAspect(),title="|V|"),
+            T    = Axis(fig[2,1][1,1];aspect=DataAspect(),title="T"  ),
+            ω    = Axis(fig[2,2][1,1];aspect=DataAspect(),title="ω"  ),
+            ηs   = Axis(fig[3,1][1,1];aspect=DataAspect(),title="ηs" ),
+        ),
+        errs = Axis(fig[2,3];yscale=log10, title="Convergence", xlabel="#iter/ny", ylabel="ϵ"),
+    )
+    for axname in eachindex(axs.hmaps)
+        xlims!(axs.hmaps[axname],ox,ox+lx)
+        ylims!(axs.hmaps[axname],oz,oz+lz)
+    end
+    plt = (
+        hmaps = (
+            Pr   = heatmap!(axs.hmaps.Pr  ,xv,zv,to_host(Pr        );colormap=:turbo),
+            ε̇II  = heatmap!(axs.hmaps.ε̇II ,xv,zv,to_host(ε̇II       );colormap=:turbo),
+            Vmag = heatmap!(axs.hmaps.Vmag,xv,zv,to_host(Vmag      );colormap=:turbo),
+            T    = heatmap!(axs.hmaps.T   ,xc,zc,to_host(T         );colormap=:turbo),
+            ω    = heatmap!(axs.hmaps.ω   ,xc,zc,to_host(ω         );colormap=:turbo),
+            ηs   = heatmap!(axs.hmaps.ηs  ,xc,zc,to_host(log10.(ηs));colormap=:turbo,colorrange=(1,4)),
+        ),
+        errs=[scatterlines!(axs.errs, Point2.(iter_evo, errs_evo[ir, :])) for ir in eachindex(ϵtol)],
+    )
+    plt_bed = [
+        (
+            bed =  poly!(axs.hmaps[f],to_host(mc.bed);strokewidth=2,color=:black),
+            ice = lines!(axs.hmaps[f],to_host(mc.ice);strokewidth=2,color=:black),
+        ) for f in eachindex(axs.hmaps)
+    ]
+    Colorbar(fig[1,1][1,2],plt.hmaps.Pr)
+    Colorbar(fig[1,2][1,2],plt.hmaps.ε̇II)
+    Colorbar(fig[1,3][1,2],plt.hmaps.Vmag)
+    Colorbar(fig[2,1][1,2],plt.hmaps.T)
+    Colorbar(fig[2,2][1,2],plt.hmaps.ω)
+    Colorbar(fig[3,1][1,2],plt.hmaps.ηs)
+    TinyKernels.device_synchronize(FastIce.get_device())
+    update_vis_fields!(Vmag,ε̇II,Ψav,V,ε̇,Ψ)
+    plt.hmaps.Pr[3][]   = to_host(Pr)
+    plt.hmaps.ε̇II[3][]  = to_host(ε̇II)
+    plt.hmaps.Vmag[3][] = to_host(Vmag)
+    plt.hmaps.T[3][]    = to_host(T)
+    plt.hmaps.ω[3][]    = to_host(ω)
+    plt.hmaps.ηs[3][]   = to_host(ηs)
+    TinyKernels.device_synchronize(FastIce.get_device())
+    display(fig)
+    ## simulation run ==================================================================================================
+    @info "time loop"
+    # save static data
+    outdir = joinpath("out_visu","egu2023",@sprintf("nbump_%d_slope_%.1f",nbump,rad2deg(α)))
+    mkpath(outdir)
+    jldsave(joinpath(outdir,"static.h5");xc,xv,zc,zv,Ψ,wt,dem,mc)
+    tcur = 0.0; isave = 1
+    for it in 1:nt
+        @info @sprintf("time step #%d, time = %g",it,tcur)
+        empty!(iter_evo); resize!(errs_evo,(length(ϵtol),0))
+        TinyKernels.device_synchronize(FastIce.get_device())
+        # mechanics
+        for iter in 1:maxiter
+            update_σ!(Pr,τ,ε̇,V,ηs,wt,r,θ_dτ,dτ_r,dx,dz)
+            TinyKernels.device_synchronize(FastIce.get_device())
+            update_V!(V,Pr,τ,ηs,wt,nudτ,f,dx,dz)
+            TinyKernels.device_synchronize(FastIce.get_device())
+            update_ηs!(ηs,ε̇,T,wt,K,nglen,Q_R,T_mlt,ηreg,χ)
+            TinyKernels.device_synchronize(FastIce.get_device())
+            if iter % ncheck == 0
+                compute_residual!(Res,Pr,V,τ,wt,f,dx,dz)
+                TinyKernels.device_synchronize(FastIce.get_device())
+                errs = (maximum(abs.(Res.V.x))*l̄/σ̄,
+                        maximum(abs.(Res.V.y))*l̄/σ̄,
+                        maximum(abs.(inn(Res.Pr)))*t̄)
+                TinyKernels.device_synchronize(FastIce.get_device())
+                @printf("  iter/nz # %2.1f, errs: [ Vx = %1.3e, Vy = %1.3e, Pr = %1.3e ]\n", iter/nz, errs...)
+                push!(iter_evo, iter/nz); append!(errs_evo, errs)
+
+                # debug viz
+                # for ir in eachindex(plt.errs)
+                #     plt.errs[ir][1] = Point2.(iter_evo, errs_evo[ir, :])
+                # end
+                # autolimits!(axs.errs)
+                # update_vis_fields!(Vmag,ε̇II,Ψav,V,ε̇,Ψ)
+                # TinyKernels.device_synchronize(FastIce.get_device())
+                # plt.hmaps.Pr[3][]   = to_host(Pr)
+                # plt.hmaps.ε̇II[3][]  = to_host(ε̇II)
+                # plt.hmaps.Vmag[3][] = to_host(Vmag)
+                # plt.hmaps.T[3][]    = to_host(T)
+                # plt.hmaps.ω[3][]    = to_host(ω)
+                # plt.hmaps.ηs[3][]   = to_host(log10.(ηs))
+                # yield()
+
+                # check convergence
+                if any(.!isfinite.(errs)) error("simulation failed") end
+                if all(errs .< ϵtol) break end
+            end
+        end
+        TinyKernels.device_synchronize(FastIce.get_device())
+        dt = min(dx,dz)^2/max(λ.ice*ρ.ice*cp.ice,λ.wat*ρ.wat*cp.wat)/4.1
+        # thermal
+        update_qT!(qT,T,wt,λ,T_atm,dx,dz)
+        TinyKernels.device_synchronize(FastIce.get_device())
+        update_ρU!(ρU,qT,τ,ε̇,wt,ρ.ice*u_ice(T_atm),dt,dx,dz)
+        TinyKernels.device_synchronize(FastIce.get_device())
+        @. T = T_lt(ρU/(ρ.ice*(1-ω) + ρ.wat*ω))
+        @. ω = ω_lt(ρU/(ρ.ice*(1-ω) + ρ.wat*ω))
+        TinyKernels.device_synchronize(FastIce.get_device())
+        tcur += dt
+        # update figures
+        if it % nviz == 0
+            for ir in eachindex(plt.errs)
+                plt.errs[ir][1] = Point2.(iter_evo, errs_evo[ir, :])
+            end
+            autolimits!(axs.errs)
+            update_vis_fields!(Vmag,ε̇II,Ψav,V,ε̇,Ψ)
+            TinyKernels.device_synchronize(FastIce.get_device())
+            plt.hmaps.Pr[3][]   = to_host(Pr)
+            plt.hmaps.ε̇II[3][]  = to_host(ε̇II)
+            plt.hmaps.Vmag[3][] = to_host(Vmag)
+            plt.hmaps.T[3][]    = to_host(T)
+            plt.hmaps.ω[3][]    = to_host(ω)
+            plt.hmaps.ηs[3][]   = to_host(log10.(ηs))
+            display(fig)
+            yield()
+        end
+        # save timestep
+        if it % nsave == 0
+            @info "saving timestep"
+            update_vis_fields!(Vmag,ε̇II,Ψav,V,ε̇,Ψ)
+            TinyKernels.device_synchronize(FastIce.get_device())
+            jldsave(joinpath(outdir,@sprintf("%04d.jld2",isave));Pr,τ,ε̇,ε̇II,V,T,ω,ηs)
+            isave += 1
+        end
+    end
+
+    TinyKernels.device_synchronize(FastIce.get_device())
+    @info "saving results on disk"
+    out_h5 = "results.h5"
+    ndrange = CartesianIndices((nx-2,nz-2))
+    fields = Dict("LS_ice"=>Ψav.not_air,"LS_bed"=>Ψav.not_solid,"Vmag"=>Vmag,"TII"=>ε̇II,"Pr"=>inn(Pr))
+    @info "saving HDF5 file"
+    write_h5(out_h5,fields,(nx,nz),ndrange)
+
+    @info "saving XDMF file..."
+    write_xdmf("results.xdmf3",out_h5,fields,(xc[2],zc[2]),(dx,dz),(nx-2,nz,2))
+
+    return
+end
+
+@tiny function _kernel_update_vis_fields!(Vmag, ε̇II, Ψav, V, ε̇, Ψ) # fills the visualisation fields Ψav, Vmag and ε̇II from Ψ, V and ε̇
+    ix,iz = @indices
+    @inline isin(A) = checkbounds(Bool,A,ix,iz) # true when (ix,iz) is a valid index of A
+    @inbounds if isin(Ψav.not_air)
+        pav = 0.0
+        for idz = 0:1, idx = 0:1
+            pav += Ψ.not_air[ix+idx,iz+idz]
+        end
+        Ψav.not_air[ix,iz] = pav/4 # mean of the 2x2 block of Ψ.not_air starting at (ix,iz); NOTE(review): no +1 offset here, unlike Vmag/ε̇II below - confirm
+    end
+    @inbounds if isin(Ψav.not_solid)
+        pav = 0.0
+        for idz = 0:1, idx = 0:1
+            pav += Ψ.not_solid[ix+idx,iz+idz]
+        end
+        Ψav.not_solid[ix,iz] = pav/4 # same 2x2 mean for the not_solid level set
+    end
+    @inbounds if isin(Vmag)
+        vxc = 0.5*(V.x[ix+1,iz+1] + V.x[ix+2,iz+1]) # mean of two x-adjacent V.x entries
+        vzc = 0.5*(V.y[ix+1,iz+1] + V.y[ix+1,iz+2]) # mean of two z-adjacent V.y entries
+        Vmag[ix,iz] = sqrt(vxc^2 + vzc^2) # magnitude of the averaged velocity
+    end
+    @inbounds if isin(ε̇II)
+        ε̇xzc = 0.25*(ε̇.xy[ix,iz]+ε̇.xy[ix+1,iz]+ε̇.xy[ix,iz+1]+ε̇.xy[ix+1,iz+1]) # mean of a 2x2 block of ε̇.xy
+        ε̇II[ix,iz] = sqrt(0.5*(ε̇.xx[ix+1,iz+1]^2 + ε̇.yy[ix+1,iz+1]^2) + ε̇xzc^2) # sqrt(0.5*(ε̇xx^2 + ε̇yy^2) + ε̇xy^2)
+    end
+    return
+end
+
+const _update_vis_fields! = _kernel_update_vis_fields!(get_device()) # callable obtained by passing get_device() to the kernel; invoked by update_vis_fields!
+
+function update_vis_fields!(Vmag, ε̇II, Ψav, V, ε̇, Ψ)
+    launch = _update_vis_fields!(Vmag, ε̇II, Ψav, V, ε̇, Ψ; ndrange=axes(Vmag)) # one work item per entry of Vmag
+    wait(launch); return nothing
+end
+
+run_simulation(100) # script entry point; the meaning of 100 is set by run_simulation's signature (not visible here) - presumably a grid resolution, confirm
\ No newline at end of file
diff --git a/scripts2D_variational_TM/thermo.jl b/scripts2D_variational_TM/thermo.jl
new file mode 100644
index 00000000..1b302859
--- /dev/null
+++ b/scripts2D_variational_TM/thermo.jl
@@ -0,0 +1,24 @@
+include("thermo_kernels.jl")
+
+const _update_qT! = _kernel_update_qT!(get_device()) # callable obtained by passing get_device() to the kernel; invoked by update_qT!
+const _update_ρU! = _kernel_update_ρU!(get_device()) # same for the ρU kernel; invoked by update_ρU!
+
+function update_qT!(qT,T,wt,λ,T_atm,dx,dy)
+    # The kernel receives the inn_x/inn_y restrictions of the face arrays
+    # (presumably the interior faces -- confirm against inn_x/inn_y); centre values pass through.
+    inner_faces(vf) = (c=vf.c, x=inn_x(vf.x), y=inn_y(vf.y))
+    flux_inner = (x=inn_x(qT.x), y=inn_y(qT.y))
+    frac_inner = (
+        not_air   = inner_faces(wt.not_air),
+        not_solid = inner_faces(wt.not_solid),
+    )
+    # one work item per entry of T; wait on the launch before returning
+    launch = _update_qT!(flux_inner,T,frac_inner,λ,T_atm,dx,dy; ndrange=axes(T))
+    wait(launch)
+    return
+end
+
+function update_ρU!(ρU,qT,τ,ε̇,wt,ρU_atm,dt,dx,dy)
+    launch = _update_ρU!(ρU,qT,τ,ε̇,wt,ρU_atm,dt,dx,dy; ndrange=axes(ρU)) # one work item per entry of ρU
+    wait(launch); return nothing
+end
\ No newline at end of file
diff --git a/scripts2D_variational_TM/thermo_kernels.jl b/scripts2D_variational_TM/thermo_kernels.jl
new file mode 100644
index 00000000..dd1f41cb
--- /dev/null
+++ b/scripts2D_variational_TM/thermo_kernels.jl
@@ -0,0 +1,56 @@
+@inline lerp(a,b,t) = (w = 1-t; a*w + b*t) # linear blend: returns a at t == 0 and b at t == 1
+
+@tiny function _kernel_update_qT!(qT,T,wt,λ,T_atm,dx,dy) # writes the conductive flux qT from T; faces failing the volume-fraction tests get zero flux
+    ns,na = wt.not_solid, wt.not_air
+    ix, iy = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy) # true when (ix,iy) is a valid index of A
+    @inbounds if isin(qT.x)
+        # detect and eliminate null spaces
+        isnull = (ns.c[ix+1,iy] ≈ 0) || (ns.c[ix,iy] ≈ 0) # with isapprox's default tolerances `≈ 0` holds only for an exact zero
+        if !isnull && (na.x[ix,iy] > 0) && (ns.x[ix,iy] > 0)
+            T_w = lerp(T_atm,T[ix+1,iy],na.c[ix+1,iy]) # equals T_atm when na.c is 0 and T when na.c is 1
+            T_e = lerp(T_atm,T[ix  ,iy],na.c[ix  ,iy])
+            qT.x[ix,iy] = -λ.ice*(T_w - T_e)/dx # T_w is read at ix+1 and T_e at ix, i.e. -λ.ice times the forward difference in x
+        else
+            qT.x[ix,iy] = 0.0
+        end
+    end
+    @inbounds if isin(qT.y)
+        # detect and eliminate null spaces
+        isnull = (ns.c[ix,iy+1] ≈ 0) || (ns.c[ix,iy] ≈ 0) # same exact-zero caveat as for the x faces
+        if !isnull && (na.y[ix,iy] > 0) && (ns.y[ix,iy] > 0)
+            T_n = lerp(T_atm,T[ix,iy+1],na.c[ix,iy+1])
+            T_s = lerp(T_atm,T[ix,iy  ],na.c[ix,iy  ])
+            qT.y[ix,iy] = -λ.ice*(T_n - T_s)/dy # -λ.ice times the forward difference in y
+        else
+            qT.y[ix,iy] = 0.0
+        end
+    end
+end
+
+@tiny function _kernel_update_ρU!(ρU,qT,τ,ε̇,wt,ρU_atm,dt,dx,dy) # explicit update of ρU by dt*(-div(qT) + shear heating); other cells are reset to ρU_atm
+    ns,na = wt.not_solid, wt.not_air
+    ix, iy = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy) # true when (ix,iy) is a valid index of A
+    @inbounds if isin(ρU)
+        isnull = (na.x[ix,iy] ≈ 0.0) || (na.x[ix+1,iy] ≈ 0.0) || # with isapprox's default tolerances `≈ 0.0` holds only for an exact zero
+                 (na.y[ix,iy] ≈ 0.0) || (na.y[ix,iy+1] ≈ 0.0)
+        if !isnull && (na.c[ix, iy] > 0.0 && ns.c[ix, iy] > 0.0)
+            ∇qx = (qT.x[ix+1,iy]*ns.x[ix+1,iy] - qT.x[ix, iy]*ns.x[ix,iy])/dx # face fluxes weighted by the not_solid face fractions
+            ∇qy = (qT.y[ix,iy+1]*ns.y[ix,iy+1] - qT.y[ix, iy]*ns.y[ix,iy])/dy
+            ∇qT = ∇qx + ∇qy
+            # average the four neighbouring xy entries of τ and ε̇ (indices clamped to the array bounds) for the shear heating term
+            τxyc,ε̇xyc = 0.0,0.0
+            for idy = -1:0, idx = -1:0
+                ix2,iy2 = clamp(ix+idx,1,size(τ.xy,1)),clamp(iy+idy,1,size(τ.xy,2))
+                τxyc += τ.xy[ix2,iy2]
+                ε̇xyc += ε̇.xy[ix2,iy2]
+            end
+            τxyc *= 0.25; ε̇xyc *= 0.25
+            SH = τ.xx[ix,iy]*ε̇.xx[ix,iy] + τ.yy[ix,iy]*ε̇.yy[ix,iy] + 2.0*τxyc*ε̇xyc # τ:ε̇ with the xy term counted twice
+            ρU[ix,iy] += dt*(-∇qT + SH)
+        else
+            ρU[ix,iy] = ρU_atm # cells failing the fraction tests are held at the ρU_atm value
+        end
+    end
+end
\ No newline at end of file
diff --git a/scripts2D_variational_TM/visme.jl b/scripts2D_variational_TM/visme.jl
new file mode 100644
index 00000000..76ef5f57
--- /dev/null
+++ b/scripts2D_variational_TM/visme.jl
@@ -0,0 +1,104 @@
+using CairoMakie
+using JLD2
+using Printf
+
+nbump = 10 # run selector: must match the nbump used when the simulation wrote its output directory
+slope = -15.0 # interpolated as "-15.0" into the directory name below
+
+# ist    = 1
+
+simdir = "out_visu/egu2023/nbump_$(nbump)_slope_$(slope)"
+xc,xv,zc,zv,Ψ,wt,dem,mc = load("$simdir/static.jld2","xc","xv","zc","zv","Ψ","wt","dem","mc")    # NOTE(review): run_simulation in this changeset saves the static data as "static.h5", not "static.jld2" - confirm the file name
+Pr,τ,ε̇,ε̇II,V,T,ω,ηs     = load(@sprintf("%s/%04d.jld2",simdir,1),"Pr","τ","ε̇","ε̇II","V","T","ω","ηs") # first saved time step
+
+
+fig = Figure(resolution=(800,500),fontsize=36)
+# fig = Figure(resolution=(800,950),fontsize=36)
+axs = (
+    hmaps = (
+        ε̇II = Axis(fig[1,1][1,1];aspect=DataAspect(),title=L"\log_{10}(\dot{\varepsilon}_{II})",xlabel=L"x",ylabel=L"z"),
+        # T   = Axis(fig[2,1][1,1];aspect=DataAspect(),title=L"T",xlabel=L"x",ylabel=L"z"),
+        # ω   = Axis(fig[3,1][1,1];aspect=DataAspect(),title=L"\omega",xlabel=L"x",ylabel=L"z"),
+    ),
+)
+
+plts = (
+    hmaps = (
+        ε̇II = heatmap!(axs.hmaps.ε̇II,xc,zc,log10.(ε̇II);colormap=:turbo,colorrange=(-6,-2)), # log10 of ε̇II on a fixed colour range
+        # T   = heatmap!(axs.hmaps.T  ,xc,zc,T  ;colormap=:magma,colorrange=(0.9,1)),
+        # ω   = heatmap!(axs.hmaps.ω  ,xc,zc,ω  ;colormap=Reverse(:grays),colorrange=(0,0.06)),
+    ),
+)
+
+for axname in eachindex(axs.hmaps)
+    xlims!(axs.hmaps[axname],-0.5,0.5) # hard-coded view window in x
+    ylims!(axs.hmaps[axname],zc[1],0.6)
+end
+
+mc_air = mc.ice[2:end-1] # drop the first and last ice markers (indexing makes a copy, so mc.ice is not modified)
+push!(mc_air,Point(mc.ice[end][1],zv[end]+0.2)) # close the polygon 0.2 above the top grid line ...
+push!(mc_air,Point(mc.ice[1  ][1],zv[end]+0.2)) # ... so the region above the ice surface can be drawn as a filled polygon
+
+plt_bed = [
+    (
+        bed = poly!(axs.hmaps[f],mc.bed;strokewidth=2,color=:gray),
+        ice = poly!(axs.hmaps[f],mc_air;strokewidth=4,color=:white), # white polygon drawn over the heatmap above the ice surface
+    ) for f in eachindex(axs.hmaps)
+]
+
+Colorbar(fig[1,1][1,2],plts.hmaps.ε̇II)
+# Colorbar(fig[2,1][1,2],plts.hmaps.T  )
+# Colorbar(fig[3,1][1,2],plts.hmaps.ω  )
+
+display(fig)
+
+record(fig,"video_$(nbump)_$(slope).mp4",1:50;framerate=5) do it # one frame per saved step; the 1:50 range is hard-coded
+    if it == 1 return end # step 1 is already on screen from the initial load above
+    local Pr,τ,ε̇,ε̇II,V,T,ω,ηs = load(@sprintf("%s/%04d.jld2",simdir,it),"Pr","τ","ε̇","ε̇II","V","T","ω","ηs")
+    plts.hmaps.ε̇II[3] = log10.(ε̇II) # replace the heatmap data (third plot argument) in place
+    # plts.hmaps.T[3]   = T
+    # plts.hmaps.ω[3]   = ω
+end
+
+display(fig)
+
+
+# fig = Figure(resolution=(1000,800),fontsize=32)
+# axs = (
+#     Vx  = Axis(fig[1,1];ylabel=L"z",xlabel=L"v_x"),
+#     ε̇xy = Axis(fig[1,2];ylabel=L"z",xlabel=L"\dot{\varepsilon}_{xy}"),
+# )
+
+# for axname in eachindex(axs)
+#     # xlims!(axs[axname],xc[1],xc[end])
+#     ylims!(axs[axname],0.07,0.75)
+# end
+
+# xlims!(axs.ε̇xy,(-5e-3,nothing))
+
+# exy = (V.x[ix,2:end] .- V.x[ix,1:end-1])./(zc[2]-zc[1])
+
+# plts = (
+#     Vx  = lines!(axs.Vx,V.x[ix,:].*100,zc;linewidth=2),
+#     ε̇xy = lines!(axs.ε̇xy,exy,zv[2:end-1];linewidth=2),
+# )
+
+# display(fig)
+
+# it = 50
+
+# for slope in (-20.0,-15.0,-10.0,-5.0)
+#     simdir = "out_visu/egu2023/nbump_$(nbump)_slope_$(slope)"
+#     xc,xv,zc,zv,Ψ,wt,dem,mc = load("$simdir/static.jld2","xc","xv","zc","zv","Ψ","wt","dem","mc")    
+#     Pr,τ,ε̇,ε̇II,V,T,ω,ηs     = load(@sprintf("%s/%04d.jld2",simdir,it),"Pr","τ","ε̇","ε̇II","V","T","ω","ηs")
+
+#     nx,nz = length(xc),length(zc)
+#     ix = nx ÷ 2
+#     exy = (V.x[ix,2:end] .- V.x[ix,1:end-1])./(zc[2]-zc[1])
+#     plts = (
+#         Vx  = lines!(axs.Vx,V.x[ix,:],zc;linewidth=2),
+#         ε̇xy = lines!(axs.ε̇xy,exy,zv[2:end-1];linewidth=2),
+#     )
+# end
+
+# display(fig)
\ No newline at end of file
diff --git a/scripts2D_variational_TM/visme_2.jl b/scripts2D_variational_TM/visme_2.jl
new file mode 100644
index 00000000..70dfcdfd
--- /dev/null
+++ b/scripts2D_variational_TM/visme_2.jl
@@ -0,0 +1,41 @@
+using CairoMakie
+using JLD2
+using Printf
+
+nbump = 20 # run selector used to build the directory names below
+slope = -20.0 # not read by this script: the loop below introduces its own `slope` variable
+
+fig = Figure(resolution=(1000,800),fontsize=38)
+axs = (
+    Vx  = Axis(fig[1,1];ylabel=L"z",xlabel=L"v_x\times 10^{4}"),
+    ε̇xy = Axis(fig[1,2];ylabel=L"z",xlabel=L"\dot{\varepsilon}_{xy}"),
+)
+
+for axname in eachindex(axs)
+    # xlims!(axs[axname],xc[1],xc[end])
+    ylims!(axs[axname],0.07,0.75) # hard-coded z window shared by both panels
+end
+
+xlims!(axs.ε̇xy,(-5e-3,nothing)) # fix only the lower x limit
+
+it = 50 # index of the saved time step to compare
+
+for slope in (-10.0,-19.0,-20.0) # one pair of curves per slope; each needs its own output directory
+    simdir = "out_visu/egu2023/nbump_$(nbump)_slope_$(slope)"
+    xc,xv,zc,zv,Ψ,wt,dem,mc = load("$simdir/static.jld2","xc","xv","zc","zv","Ψ","wt","dem","mc")    # NOTE(review): run_simulation in this changeset saves the static data as "static.h5" - confirm the file name
+    Pr,τ,ε̇,ε̇II,V,T,ω,ηs     = load(@sprintf("%s/%04d.jld2",simdir,it),"Pr","τ","ε̇","ε̇II","V","T","ω","ηs")
+
+    nx,nz = length(xc),length(zc)
+    ix = nx ÷ 2 # profile taken at the middle x index
+    exy = (V.x[ix,2:end] .- V.x[ix,1:end-1])./(zc[2]-zc[1]) # finite difference of V.x in z along that column
+    plts = (
+        Vx  = lines!(axs.Vx,V.x[ix,:].*10000,zc;linewidth=3), # scaled by 1e4 to match the axis label
+        ε̇xy = lines!(axs.ε̇xy,exy,zv[2:end-1];linewidth=3,label=L"\theta = %$(slope)^\circ"),
+    )
+end
+
+axislegend(axs.ε̇xy)
+
+display(fig)
+
+save("comparison.png",fig)
\ No newline at end of file
diff --git a/scripts2D_variational_TM/volume_fraction_kernels.jl b/scripts2D_variational_TM/volume_fraction_kernels.jl
new file mode 100644
index 00000000..5adb7009
--- /dev/null
+++ b/scripts2D_variational_TM/volume_fraction_kernels.jl
@@ -0,0 +1,31 @@
+@tiny function _kernel_compute_volume_fractions_from_level_set!(wt, Ψ, dx, dy) # fills wt.c, wt.x, wt.y and wt.xy with volfrac(cell, Ψ samples) divided by the cell volume
+    ix, iy = @indices
+    cell = Rect(Vec(0.0, 0.0), Vec(dx, dy)) # reference dx-by-dy rectangle reused for every location
+    ω = GeometryBasics.volume(cell) # normalisation so that the stored value is a fraction
+    @inline Ψ_ax(dix, diy) = 0.5 * (Ψ[ix+dix, iy+diy] + Ψ[ix+dix+1, iy+diy  ]) # mean of two x-adjacent Ψ values
+    @inline Ψ_ay(dix, diy) = 0.5 * (Ψ[ix+dix, iy+diy] + Ψ[ix+dix  , iy+diy+1]) # mean of two y-adjacent Ψ values
+    @inline Ψ_axy(dix, diy) = 0.25 * (Ψ[ix+dix, iy+diy  ] + Ψ[ix+dix+1, iy+diy  ] + # mean of a 2x2 block of Ψ values
+                                      Ψ[ix+dix, iy+diy+1] + Ψ[ix+dix+1, iy+diy+1])
+    @inline isin(A) = checkbounds(Bool, A, ix, iy) # true when (ix,iy) is a valid index of A
+    # cell centers
+    @inbounds if isin(wt.c)
+        Ψs = Vec{4}(Ψ[ix, iy], Ψ[ix+1, iy], Ψ[ix+1, iy+1], Ψ[ix, iy+1]) # the 2x2 block of Ψ, ordered around the rectangle
+        wt.c[ix, iy] = volfrac(cell, Ψs) / ω
+    end
+    # x faces
+    @inbounds if isin(wt.x)
+        Ψs = Vec{4}(Ψ_ax(0, 0), Ψ_ax(1, 0), Ψ_ax(1, 1), Ψ_ax(0, 1)) # same ordering, using x-averaged Ψ
+        wt.x[ix, iy] = volfrac(cell, Ψs) / ω
+    end
+    # y faces
+    @inbounds if isin(wt.y)
+        Ψs = Vec{4}(Ψ_ay(0, 0), Ψ_ay(1, 0), Ψ_ay(1, 1), Ψ_ay(0, 1)) # same ordering, using y-averaged Ψ
+        wt.y[ix, iy] = volfrac(cell, Ψs) / ω
+    end
+    # xy edges
+    @inbounds if isin(wt.xy)
+        Ψs = Vec{4}(Ψ_axy(0, 0), Ψ_axy(1, 0), Ψ_axy(1, 1), Ψ_axy(0, 1)) # same ordering, using 2x2-averaged Ψ
+        wt.xy[ix, iy] = volfrac(cell, Ψs) / ω
+    end
+    return
+end
\ No newline at end of file
diff --git a/scripts2D_variational_TM/volume_fractions.jl b/scripts2D_variational_TM/volume_fractions.jl
new file mode 100644
index 00000000..7ab5a9e3
--- /dev/null
+++ b/scripts2D_variational_TM/volume_fractions.jl
@@ -0,0 +1,55 @@
+@inline perturb(ϕ) = !(abs(ϕ) > 1e-20) ? (ϕ > 0 ? 1e-20 : -1e-20) : ϕ # push |ϕ| <= 1e-20 away from zero (exact zero becomes -1e-20)
+
+@inline trivol(v1, v2, v3) = abs(cross(v3 - v1, v2 - v1)) / 2 # triangle area: half the magnitude of the edge cross product
+
+function volfrac(tri, ϕ::Vec3{T})::T where {T} # area of the part of triangle `tri` where ϕ, given at the three vertices, is negative
+    v1, v2, v3 = tri
+    if ϕ[1] < 0 && ϕ[2] < 0 && ϕ[3] < 0 # ---
+        return trivol(v1, v2, v3) # whole triangle
+    elseif ϕ[1] > 0 && ϕ[2] > 0 && ϕ[3] > 0 # +++
+        return 0.0 # converted to T by the return-type annotation
+    end
+    @inline vij(i, j) = tri[j] * (ϕ[i] / (ϕ[i] - ϕ[j])) - tri[i] * (ϕ[j] / (ϕ[i] - ϕ[j])) # point on edge i-j where the linear interpolant of ϕ vanishes
+    v12, v13, v23 = vij(1, 2), vij(1, 3), vij(2, 3) # all three are evaluated; each branch below uses only the crossings on sign-changing edges
+    if ϕ[1] < 0
+        if ϕ[2] < 0
+            trivol(v1, v23, v13) + trivol(v1, v2, v23)  # --+
+        else
+            if ϕ[3] < 0
+                trivol(v3, v12, v23) + trivol(v3, v1, v12) # -+-
+            else
+                trivol(v1, v12, v13) # -++
+            end
+        end
+    else
+        if ϕ[2] < 0
+            if ϕ[3] < 0
+                trivol(v2, v13, v12) + trivol(v2, v3, v13) # +--
+            else
+                trivol(v12, v2, v23) # +-+
+            end
+        else
+            trivol(v13, v23, v3) # ++-
+        end
+    end
+end
+
+function volfrac(rect::Rect2{T}, ϕ::Vec4{T}) where {T} # negative-ϕ area of a rectangle, computed as the sum over two triangles
+    or, ws = origin(rect), widths(rect)
+    v1, v2, v3, v4 = or, or + Vec(ws[1], 0.0), or + ws, or + Vec(0.0, ws[2]) # corners: origin, +x, +x+y, +y
+    ϕ1, ϕ2, ϕ3, ϕ4 = perturb.(ϕ) # nudge near-zero values so every corner has a definite sign
+    return volfrac(Vec(v1, v2, v3), Vec3{T}(ϕ1, ϕ2, ϕ3)) + # split along the v1-v3 diagonal
+           volfrac(Vec(v1, v3, v4), Vec3{T}(ϕ1, ϕ3, ϕ4))
+end
+
+include("volume_fraction_kernels.jl")
+
+const _compute_volume_fractions_from_level_set! = _kernel_compute_volume_fractions_from_level_set!(get_device()) # callable obtained by passing get_device() to the kernel
+
+function compute_volume_fractions_from_level_set!(wt, Ψ, dx, dy)
+    interior = (c=wt.c, x=inn_x(wt.x), y=inn_y(wt.y), xy=wt.xy) # kernel writes the inn_x/inn_y restrictions of the face arrays
+    launch = _compute_volume_fractions_from_level_set!(interior, Ψ, dx, dy; ndrange=axes(wt.c))
+    wait(launch)
+    bc_x_neumann!(0.0, wt.x); bc_y_neumann!(0.0, wt.y) # then apply the Neumann helpers (value 0.0) to the full face arrays
+    return
+end
\ No newline at end of file
diff --git a/scripts3D_variational/bc_kernels.jl b/scripts3D_variational/bc_kernels.jl
new file mode 100644
index 00000000..5d805bee
--- /dev/null
+++ b/scripts3D_variational/bc_kernels.jl
@@ -0,0 +1,65 @@
+@tiny function _kernel_bc_x_dirichlet!(val,arrays...) # sets the first and last x-entries of every array to `val`
+    iy,iz = @indices
+    for A in arrays
+        if iy ∈ axes(A,2) && iz ∈ axes(A,3) # skip arrays for which (iy,iz) is out of range
+            @inbounds A[1  ,iy,iz] = val
+            @inbounds A[end,iy,iz] = val
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_y_dirichlet!(val, arrays...) # sets the first and last y-entries of every array to `val`
+    ix,iz = @indices
+    for A in arrays
+        if ix ∈ axes(A,1) && iz ∈ axes(A,3) # skip arrays for which (ix,iz) is out of range
+            @inbounds A[ix,1  ,iz] = val
+            @inbounds A[ix,end,iz] = val
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_z_dirichlet!(val, arrays...) # sets the first and last z-entries of every array to `val`
+    ix,iy = @indices
+    for A in arrays
+        if ix ∈ axes(A,1) && iy ∈ axes(A,2) # skip arrays for which (ix,iy) is out of range
+            @inbounds A[ix,iy,1  ] = val
+            @inbounds A[ix,iy,end] = val
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_x_neumann!(val, arrays...) # copies the neighbouring x-entry plus `val` into the first and last x-entries
+    iy,iz = @indices
+    for A in arrays
+        if iy ∈ axes(A,2) && iz ∈ axes(A,3) # skip arrays for which (iy,iz) is out of range
+            @inbounds A[1  ,iy,iz] = A[2    ,iy,iz] + val
+            @inbounds A[end,iy,iz] = A[end-1,iy,iz] + val # same sign of `val` at both ends
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_y_neumann!(val, arrays...) # copies the neighbouring y-entry plus `val` into the first and last y-entries
+    ix,iz = @indices
+    for A in arrays
+        if ix ∈ axes(A,1) && iz ∈ axes(A,3) # skip arrays for which (ix,iz) is out of range
+            @inbounds A[ix,1  ,iz] = A[ix,2    ,iz] + val
+            @inbounds A[ix,end,iz] = A[ix,end-1,iz] + val # same sign of `val` at both ends
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_z_neumann!(val, arrays...) # copies the neighbouring z-entry plus `val` into the first and last z-entries
+    ix,iy = @indices
+    for A in arrays
+        if ix ∈ axes(A,1) && iy ∈ axes(A,2) # skip arrays for which (ix,iy) is out of range
+            @inbounds A[ix,iy,1  ] = A[ix,iy,2    ] + val
+            @inbounds A[ix,iy,end] = A[ix,iy,end-1] + val # same sign of `val` at both ends
+        end
+    end
+    return
+end
\ No newline at end of file
diff --git a/scripts3D_variational/bcs.jl b/scripts3D_variational/bcs.jl
new file mode 100644
index 00000000..a592d30a
--- /dev/null
+++ b/scripts3D_variational/bcs.jl
@@ -0,0 +1,48 @@
+include("bc_kernels.jl")
+
+const _bc_x_dirichlet! = _kernel_bc_x_dirichlet!(get_device()) # callables obtained by passing get_device() to each kernel;
+const _bc_y_dirichlet! = _kernel_bc_y_dirichlet!(get_device()) # the wrapper loops below look them up as Symbol(:_, fname),
+const _bc_z_dirichlet! = _kernel_bc_z_dirichlet!(get_device()) # so each wrapper name must equal the launcher name without the leading underscore
+
+const _bc_x_neumann! = _kernel_bc_x_neumann!(get_device())
+const _bc_y_neumann! = _kernel_bc_y_neumann!(get_device())
+const _bc_z_neumann! = _kernel_bc_z_neumann!(get_device())
+
+for fname in (:bc_x_dirichlet!,:bc_x_neumann!) # generate the x-boundary wrappers; was misspelled `bx_x_dirichlet!`, which resolved to the undefined `_bx_x_dirichlet!`
+    @eval begin
+        function $fname(val,arrays...) # apply the boundary condition with value `val` to every array in `arrays`
+            ax = axes(arrays[1])[[2,3]] # y and z axes of the first array
+            for A in arrays[2:end]
+                ax = union.(ax,axes(A)[[2,3]]) # grow the launch range so it covers every array
+            end
+            wait($(Symbol(:_,fname))(val,arrays...;ndrange=ax)) # call the `_`-prefixed launcher of the same name and wait on it
+            return
+        end
+    end
+end
+
+for fname in (:bc_y_dirichlet!,:bc_y_neumann!) # generate the y-boundary wrappers; was misspelled `bx_y_dirichlet!`, which resolved to the undefined `_bx_y_dirichlet!`
+    @eval begin
+        function $fname(val,arrays...) # apply the boundary condition with value `val` to every array in `arrays`
+            ax = axes(arrays[1])[[1,3]] # x and z axes of the first array
+            for A in arrays[2:end]
+                ax = union.(ax,axes(A)[[1,3]]) # grow the launch range so it covers every array
+            end
+            wait($(Symbol(:_,fname))(val,arrays...;ndrange=ax)) # call the `_`-prefixed launcher of the same name and wait on it
+            return
+        end
+    end
+end
+
+for fname in (:bc_z_dirichlet!,:bc_z_neumann!) # generate the z-boundary wrappers; was misspelled `bx_z_dirichlet!`, which resolved to the undefined `_bx_z_dirichlet!`
+    @eval begin
+        function $fname(val,arrays...) # apply the boundary condition with value `val` to every array in `arrays`
+            ax = axes(arrays[1])[[1,2]] # x and y axes of the first array
+            for A in arrays[2:end]
+                ax = union.(ax,axes(A)[[1,2]]) # grow the launch range so it covers every array
+            end
+            wait($(Symbol(:_,fname))(val,arrays...;ndrange=ax)) # call the `_`-prefixed launcher of the same name and wait on it
+            return
+        end
+    end
+end
\ No newline at end of file
diff --git a/scripts3D_variational/data_io.jl b/scripts3D_variational/data_io.jl
new file mode 100644
index 00000000..7e6ef484
--- /dev/null
+++ b/scripts3D_variational/data_io.jl
@@ -0,0 +1,105 @@
+function write_h5(path,fields,dim_g,I,args...) # writes every (name, field) pair of `fields` into a new HDF5 file at `path`
+    if !HDF5.has_parallel() && (length(args)>0) # extra args are forwarded to h5open; warn when given without parallel HDF5
+        @warn("HDF5 has no parallel support.")
+    end
+    h5open(path, "w", args...) do io
+        for (name,field) ∈ fields
+            dset               = create_dataset(io, "/$name", datatype(eltype(field)), dataspace(dim_g)) # dataset of global size dim_g
+            dset[I.indices...] = Array(field) # copy the field to a host Array and store it at index block I of the dataset
+        end
+    end
+    return
+end
+
+function write_xdmf(path,h5_names,fields,origin,spacing,dim_g,timesteps) # writes an XDMF 3.0 temporal collection: one uniform grid per entry of `timesteps`
+    xdoc = XMLDocument()
+    xroot = create_root(xdoc, "Xdmf")
+    set_attribute(xroot, "Version","3.0")
+
+    xdomain     = new_child(xroot, "Domain")
+    xcollection = new_child(xdomain, "Grid")
+    set_attribute(xcollection, "GridType","Collection")
+    set_attribute(xcollection, "CollectionType","Temporal")
+
+    for (it,tt) ∈ enumerate(timesteps) # `it` indexes h5_names, `tt` is written as the time value
+        xgrid   = new_child(xcollection, "Grid")
+        set_attribute(xgrid, "GridType","Uniform")
+        xtopo = new_child(xgrid, "Topology")
+        set_attribute(xtopo, "TopologyType", "3DCoRectMesh") # topology type is hard-coded to 3D
+        set_attribute(xtopo, "Dimensions", join(reverse(dim_g).+1,' ')) # dim_g reversed, plus one per direction
+
+        xtime = new_child(xgrid, "Time")
+        set_attribute(xtime, "Value", "$tt")
+
+        xgeom = new_child(xgrid, "Geometry")
+        set_attribute(xgeom, "GeometryType", "ORIGIN_DXDYDZ")
+
+        xorig = new_child(xgeom, "DataItem")
+        set_attribute(xorig, "Format", "XML")
+        set_attribute(xorig, "NumberType", "Float")
+        set_attribute(xorig, "Dimensions", "$(length(dim_g)) ") # NOTE(review): trailing space, unlike the spacing item below - confirm it is harmless
+        add_text(xorig, join(reverse(origin), ' ')) # origin is written in reversed component order
+
+        xdr = new_child(xgeom, "DataItem")
+        set_attribute(xdr, "Format", "XML")
+        set_attribute(xdr, "NumberType", "Float")
+        set_attribute(xdr, "Dimensions", "$(length(dim_g))")
+        add_text(xdr, join(reverse(spacing), ' ')) # spacing is written in reversed component order
+
+        h5_path = h5_names[it] # HDF5 file that holds this time step
+        for (name,_) ∈ fields # only the field names are used
+            create_xdmf_attribute(xgrid,h5_path,name,dim_g)
+        end
+    end
+
+    save_file(xdoc, path)
+    return
+end
+
+function write_xdmf(path,h5_path,fields,origin,spacing,dim_g) # writes an XDMF 3.0 file with a single uniform grid whose data live in `h5_path`
+    xdoc = XMLDocument()
+    xroot = create_root(xdoc, "Xdmf")
+    set_attribute(xroot, "Version","3.0")
+
+    xdomain = new_child(xroot, "Domain")
+    xgrid   = new_child(xdomain, "Grid")
+    set_attribute(xgrid, "GridType","Uniform")
+    xtopo = new_child(xgrid, "Topology")
+    set_attribute(xtopo, "TopologyType", "3DCoRectMesh") # topology type is hard-coded to 3D
+    set_attribute(xtopo, "Dimensions", join(reverse(dim_g).+1,' ')) # dim_g reversed, plus one per direction
+
+    xgeom = new_child(xgrid, "Geometry")
+    set_attribute(xgeom, "GeometryType", "ORIGIN_DXDYDZ")
+
+    xorig = new_child(xgeom, "DataItem")
+    set_attribute(xorig, "Format", "XML")
+    set_attribute(xorig, "NumberType", "Float")
+    set_attribute(xorig, "Dimensions", "$(length(dim_g)) ") # NOTE(review): trailing space, unlike the spacing item below - confirm it is harmless
+    add_text(xorig, join(reverse(origin), ' ')) # origin is written in reversed component order
+
+    xdr = new_child(xgeom, "DataItem")
+    set_attribute(xdr, "Format", "XML")
+    set_attribute(xdr, "NumberType", "Float")
+    set_attribute(xdr, "Dimensions", "$(length(dim_g))")
+    add_text(xdr, join(reverse(spacing), ' ')) # spacing is written in reversed component order
+
+    for (name,_) ∈ fields # only the field names are used
+        create_xdmf_attribute(xgrid,h5_path,name,dim_g)
+    end
+    save_file(xdoc, path)
+    return
+end
+
+function create_xdmf_attribute(xgrid,h5_path,name,dim_g)
+    # TODO: solve type and precision (the data item is always declared as an 8-byte Float)
+    xattr = new_child(xgrid, "Attribute")
+    foreach(kv -> set_attribute(xattr, kv...), ("Name" => name, "Center" => "Cell"))
+    xdata = new_child(xattr, "DataItem")
+    data_attrs = (
+        "Format" => "HDF", "NumberType" => "Float", "Precision" => "8",
+        "Dimensions" => join(reverse(dim_g), ' '),
+    )
+    foreach(kv -> set_attribute(xdata, kv...), data_attrs)
+    add_text(xdata, "$(h5_path):/$name")
+    return xattr
+end
diff --git a/scripts3D_variational/dual_contouring.jl b/scripts3D_variational/dual_contouring.jl
new file mode 100644
index 00000000..cba484c0
--- /dev/null
+++ b/scripts3D_variational/dual_contouring.jl
@@ -0,0 +1,139 @@
+using GLMakie
+using GeometryBasics
+using StaticArrays
+using LinearAlgebra
+
+function dual_contour(Ψ::AbstractArray{T,3},xc,yc,zc) where T # returns (vertices, tris): one vertex per sign-changing cell, two triangles per sign-changing interior edge
+    vertices  = Point{3,Float64}[]
+    tris = TriangleFace{Int}[]
+    vert_idx  = Array{Int,3}(undef,size(Ψ).-1) # cell -> vertex number; only entries of sign-changing cells are written and read
+    # insert vertices
+    for iz in 1:size(Ψ,3)-1, iy in 1:size(Ψ,2)-1, ix in 1:size(Ψ,1)-1
+        S = MArray{NTuple{3,2},T}(undef)
+        for idz in 0:1,idy in 0:1,idx in 0:1
+            S[idx+1,idy+1,idz+1] = Ψ[ix+idx,iy+idy,iz+idz]
+        end
+        change_sign = !(all(S .> 0) || all(S .< 0)) # the 8 corner values are not all strictly of one sign
+        if change_sign
+            push!(vertices,Point(xc[ix],yc[iy],zc[iz])) # vertex placed at the cell centre coordinates
+            vert_idx[ix,iy,iz] = length(vertices)
+        end
+    end
+    # insert triangles
+    for iz in 1:size(Ψ,3)-1, iy in 1:size(Ψ,2)-1, ix in 1:size(Ψ,1)-1
+        if Ψ[ix,iy,iz]*Ψ[ix+1,iy,iz] <= 0
+            # x edge: boundary edges lack four surrounding cells; skip this edge only, the y/z edges below are still checked
+            if iy >= 2 && iz >= 2
+                i1 = vert_idx[ix,iy-1,iz-1]
+                i2 = vert_idx[ix,iy  ,iz-1]
+                i3 = vert_idx[ix,iy  ,iz  ]
+                i4 = vert_idx[ix,iy-1,iz  ]
+                push!(tris,TriangleFace(i1,i2,i3),TriangleFace(i1,i3,i4))
+            end
+        end
+        if Ψ[ix,iy,iz]*Ψ[ix,iy+1,iz] <= 0
+            # y edge: same rule, skip only this edge when it lies on the boundary
+            if ix >= 2 && iz >= 2
+                i1 = vert_idx[ix-1,iy,iz-1]
+                i2 = vert_idx[ix  ,iy,iz-1]
+                i3 = vert_idx[ix  ,iy,iz  ]
+                i4 = vert_idx[ix-1,iy,iz  ]
+                push!(tris,TriangleFace(i1,i2,i3),TriangleFace(i1,i3,i4))
+            end
+        end
+        if Ψ[ix,iy,iz]*Ψ[ix,iy,iz+1] <= 0
+            # z edge: same rule, skip only this edge when it lies on the boundary
+            if ix >= 2 && iy >= 2
+                i1 = vert_idx[ix-1,iy-1,iz]
+                i2 = vert_idx[ix  ,iy-1,iz]
+                i3 = vert_idx[ix  ,iy  ,iz]
+                i4 = vert_idx[ix-1,iy  ,iz]
+                push!(tris,TriangleFace(i1,i2,i3),TriangleFace(i1,i3,i4))
+            end
+        end
+    end
+    return vertices, tris
+end
+
+# Coefficients of a cubic Hermite spline in 1D
+function hspline_coeffs(p::StaticVector{2},∇p::StaticVector{2})
+    p0, p1 = p[1], p[2]   # end-point values
+    m0, m1 = ∇p[1], ∇p[2] # end-point slopes
+    c2 = 3*(p1 - p0) - 2*m0 - m1
+    c3 = 2*(p0 - p1) +   m0 + m1
+    return SVector(p0, m0, c2, c3) # coefficients of 1, t, t^2, t^3
+end
+
+function hspline_coeffs(p::StaticMatrix{2},∇px::StaticMatrix{2},∇py::StaticMatrix{2},∇pxy::StaticMatrix{2}) # unimplemented stub: empty body, returns nothing
+
+end
+
+@inline function eval_poly(a::StaticVector{4},x)
+    return ((a[4]*x + a[3])*x + a[2])*x + a[1] # Horner evaluation of a[1] + a[2]x + a[3]x^2 + a[4]x^3
+end
+
+@inline function eval_poly(a::StaticMatrix{4},x,y)
+    # evaluate each column of `a` as a cubic in x ...
+    c1 = eval_poly(a[:,1],x)
+    c2 = eval_poly(a[:,2],x)
+    c3 = eval_poly(a[:,3],x)
+    c4 = eval_poly(a[:,4],x)
+    # ... then use the four results as the coefficients of a cubic in y
+    return eval_poly(SVector(c1, c2, c3, c4), y)
+end
+
+function hspline_interp!(p_i,x_i,xs,ps;bcs=(nothing,nothing)) # fills p_i with the cubic Hermite interpolant of samples ps on the uniform range xs, evaluated at x_i
+    dx = step(xs)
+    for (ip,x) in enumerate(x_i)
+        xdiv = (x-xs[1])/dx
+        ix = clamp(floor(Int,xdiv) + 1, 1, length(xs)-1) # interval index; points outside xs use the first/last interval
+        t  = xdiv - (ix-1) # local coordinate, in [0,1] inside the interval
+        p  = SVector(ps[ix], ps[ix+1])
+        m1 = ix > firstindex(xs)    ? (ps[ix+1] - ps[ix-1]) / 2 : isnothing(bcs[1]) ? ps[2  ] - ps[1    ] : bcs[1] * dx # central slope, else one-sided or the prescribed derivative bcs[1]
+        m2 = ix < lastindex(xs) - 1 ? (ps[ix+2] - ps[ix  ]) / 2 : isnothing(bcs[2]) ? ps[end] - ps[end-1] : bcs[2] * dx # same at the right end with bcs[2]
+        m  = SVector(m1, m2) # slopes are expressed per unit of t, hence the dx scaling
+        p_i[ip] = eval_poly(hspline_coeffs(p, m), t) # was a call to `hspline_interp`, which this file never defines
+    end
+    return
+end
+
+function test_interp() # visual check: interpolates 11 samples of sin on [-π,π] and plots the result
+    xs = LinRange(-π,π,11)
+    qs = sin.(xs)
+    x_i = LinRange(-1.1π,1.1π,101) # evaluation points reach slightly beyond the sampled range
+    q_i = similar(x_i)
+    hspline_interp!(q_i,x_i,xs,qs;bcs=(-1,-1)) # end slopes set to -1, the value of cos at ±π
+    fig = Figure()
+    ax  = Axis(fig[1,1];aspect=DataAspect())
+    lines!(ax,x_i,q_i) # interpolated curve
+    scatter!(ax,xs,qs) # original samples
+    display(fig)
+    return
+end
+
+test_interp() # runs on load: this file is a script, not a library
+
+function main() # demo: contours a sphere level set on a 100^3 grid and shows the mesh
+    println("Hello world!")
+    Ψ = Array{Float64}(undef,100,100,100)
+    xv = LinRange(-2,2,size(Ψ,1))
+    yv = LinRange(-2,2,size(Ψ,2))
+    zv = LinRange(-2,2,size(Ψ,3))
+    xc = 0.5.*(xv[1:end-1].+xv[2:end]) # midpoints between consecutive grid points
+    yc = 0.5.*(yv[1:end-1].+yv[2:end])
+    zc = 0.5.*(zv[1:end-1].+zv[2:end])
+    for iz in axes(Ψ,3), iy in axes(Ψ,2), ix in axes(Ψ,1)
+        Ψ[ix,iy,iz] = sqrt(xv[ix]^2 + yv[iy]^2 + zv[iz]^2) - 1.5 # distance to the origin minus 1.5: zero on a sphere of radius 1.5
+    end
+    @time verts,tris = dual_contour(Ψ,xc,yc,zc)
+    fig = Figure()
+    ax  = Axis3(fig[1,1];aspect=:data,viewmode=:fitzoom)
+    limits!(ax,extrema(xv),extrema(yv),extrema(zv))
+    isosurface = GeometryBasics.Mesh(verts,tris)
+    mesh!(ax,isosurface)
+    # wireframe!(ax,isosurface;color=:black)
+    display(fig)
+    return
+end
+
+main() # runs on load: this file is a script, not a library
\ No newline at end of file
diff --git a/scripts3D_variational/hide_communication.jl b/scripts3D_variational/hide_communication.jl
new file mode 100644
index 00000000..0a1e85e0
--- /dev/null
+++ b/scripts3D_variational/hide_communication.jl
@@ -0,0 +1,33 @@
+@inline __subrange(nr,bw,I,::Val{1}) = (1:bw[I])                          # leading strip of width bw[I] along dimension I
+@inline __subrange(nr,bw,I,::Val{2}) = (n = size(nr,I); (n-bw[I]+1):n)    # trailing strip of width bw[I]
+@inline __subrange(nr,bw,I,::Val{3}) = (1+bw[I]):(size(nr,I)-bw[I])       # what remains between the two strips
+
+@inline split_ndrange(ndrange,ndwidth) = split_ndrange(CartesianIndices(ndrange),ndwidth) # normalise ndrange, then dispatch on the methods below
+# without this method, widths that are not a homogeneous tuple of N integers would re-enter the generic method forever
+split_ndrange(ndrange::CartesianIndices{N},ndwidth) where N = throw(ArgumentError("ndwidth must be a homogeneous tuple of $N integers, got $(repr(ndwidth))"))
+function split_ndrange(ndrange::CartesianIndices{N},ndwidth::NTuple{N,<:Integer}) where N # returns 2N boundary slabs followed by the interior
+    @assert all(size(ndrange) .> ndwidth.*2)
+    @inline ndsubrange(I,::Val{J}) where J = ntuple(Val(N)) do idim
+        if idim < I
+            1:size(ndrange,idim)                     # dimensions before I keep their full extent
+        elseif idim == I
+            __subrange(ndrange,ndwidth,idim,Val(J))  # J = 1: leading strip, J = 2: trailing strip
+        else
+            __subrange(ndrange,ndwidth,idim,Val(3))  # dimensions after I are restricted to the interior
+        end
+    end
+    return ntuple(Val(2N+1)) do i
+        if i == 2N+1
+            ndrange[ntuple(idim -> __subrange(ndrange,ndwidth,idim,Val(3)), Val(N))...] # the interior comes last
+        else
+            idim,idir = divrem(i-1,2) .+ 1           # entries 2k-1 and 2k are the two strips of dimension k
+            ndrange[ndsubrange(idim,Val(idir))...]
+        end
+    end
+end
+
+function hide_comm(f,ranges) # apply f to the last range first, then to the others in order
+    inner = f(last(ranges))
+    outer = ntuple(k -> f(ranges[k]), length(ranges)-1)
+    return inner, outer # (result for the last range, tuple of results for the preceding ranges)
+end
\ No newline at end of file
diff --git a/scripts3D_variational/level_set_kernels.jl b/scripts3D_variational/level_set_kernels.jl
new file mode 100644
index 00000000..cd5a68d2
--- /dev/null
+++ b/scripts3D_variational/level_set_kernels.jl
@@ -0,0 +1,62 @@
+@tiny function _kernel_init_level_set!(Ψ,dem,dem_grid,Ψ_grid,cutoff,R) # store distance*sign returned by sd_dem at every node of Ψ
+    ix,iy,iz = @indices
+    # node position taken from the three coordinate axes in Ψ_grid, then transformed by R
+    P = R*Point3(Ψ_grid[1][ix],Ψ_grid[2][iy],Ψ_grid[3][iz])
+    dist,side = sd_dem(P,cutoff,dem,dem_grid)
+    @inbounds Ψ[ix,iy,iz] = dist*side
+    return
+end
+
+@tiny function _kernel_compute_dΨ_dt!(dΨ_dt,Ψ,Ψ0,dx,dy,dz) # rate of change of Ψ; Ψ0 supplies the sign and locates the zero level
+    ix,iy,iz = @indices
+    @inline changes_sign_x(disp) = @inbounds Ψ0[ix,iy,iz]*Ψ0[ix+disp,iy,iz] < 0 # Ψ0 has opposite signs at the node and its neighbour
+    @inline changes_sign_y(disp) = @inbounds Ψ0[ix,iy,iz]*Ψ0[ix,iy+disp,iz] < 0
+    @inline changes_sign_z(disp) = @inbounds Ψ0[ix,iy,iz]*Ψ0[ix,iy,iz+disp] < 0
+    ch_x, ch_y, ch_z = false, false, false
+    ∂Ψ0_∂x, ∂Ψ0_∂y, ∂Ψ0_∂z = 0.0, 0.0, 0.0
+    if ix ∈ axes(Ψ0,1)[2:end-1] # both x-neighbours exist; on the domain boundary the defaults above are kept
+        ch_x = changes_sign_x(1) || changes_sign_x(-1)
+        @inbounds ∂Ψ0_∂x = (Ψ0[ix+1,iy,iz]-Ψ0[ix-1,iy,iz])/(2dx)
+    end
+    if iy ∈ axes(Ψ0,2)[2:end-1]
+        ch_y = changes_sign_y(1) || changes_sign_y(-1)
+        @inbounds ∂Ψ0_∂y = (Ψ0[ix,iy+1,iz]-Ψ0[ix,iy-1,iz])/(2dy)
+    end
+    if iz ∈ axes(Ψ0,3)[2:end-1]
+        ch_z = changes_sign_z(1) || changes_sign_z(-1)
+        @inbounds ∂Ψ0_∂z = (Ψ0[ix,iy,iz+1]-Ψ0[ix,iy,iz-1])/(2dz)
+    end
+    if (ch_x || ch_y || ch_z)
+        # local surface reconstruction: D is Ψ0 divided by the norm of its central-difference gradient
+        @inbounds D = Ψ0[ix,iy,iz]/sqrt(∂Ψ0_∂x^2 + ∂Ψ0_∂y^2 + ∂Ψ0_∂z^2)
+        @inbounds dΨ_dt[ix,iy,iz] = (D-sign(Ψ0[ix,iy,iz])*abs(Ψ[ix,iy,iz]))/dx
+    else
+        @inbounds begin
+            # Hamilton-Jacobi with Godunov flux
+            # direction '-' derivatives (zero where the neighbour would fall outside the array)
+            ∂Ψ_∂x⁻ = ix > 1 ? (Ψ[ix,iy,iz] - Ψ[ix-1,iy,iz])/dx : 0.0
+            ∂Ψ_∂y⁻ = iy > 1 ? (Ψ[ix,iy,iz] - Ψ[ix,iy-1,iz])/dy : 0.0
+            ∂Ψ_∂z⁻ = iz > 1 ? (Ψ[ix,iy,iz] - Ψ[ix,iy,iz-1])/dz : 0.0 # z spacing, matching the '+' derivative below
+            # direction '+' derivatives
+            ∂Ψ_∂x⁺ = ix < size(Ψ,1) ? (Ψ[ix+1,iy,iz] - Ψ[ix,iy,iz]) / dx : 0.0
+            ∂Ψ_∂y⁺ = iy < size(Ψ,2) ? (Ψ[ix,iy+1,iz] - Ψ[ix,iy,iz]) / dy : 0.0
+            ∂Ψ_∂z⁺ = iz < size(Ψ,3) ? (Ψ[ix,iy,iz+1] - Ψ[ix,iy,iz]) / dz : 0.0
+            # upwind fluxes: which one-sided derivatives count depends on the sign of Ψ0
+            ∂Ψ_∂x2 = Ψ0[ix,iy,iz] >= 0 ? max(max(∂Ψ_∂x⁻,0)^2, min(∂Ψ_∂x⁺,0)^2) :
+                                         max(min(∂Ψ_∂x⁻,0)^2, max(∂Ψ_∂x⁺,0)^2)
+            ∂Ψ_∂y2 = Ψ0[ix,iy,iz] >= 0 ? max(max(∂Ψ_∂y⁻,0)^2, min(∂Ψ_∂y⁺,0)^2) :
+                                         max(min(∂Ψ_∂y⁻,0)^2, max(∂Ψ_∂y⁺,0)^2)
+            ∂Ψ_∂z2 = Ψ0[ix,iy,iz] >= 0 ? max(max(∂Ψ_∂z⁻,0)^2, min(∂Ψ_∂z⁺,0)^2) :
+                                         max(min(∂Ψ_∂z⁻,0)^2, max(∂Ψ_∂z⁺,0)^2)
+            # compute update: vanishes where the upwinded gradient norm equals one
+            dΨ_dt[ix,iy,iz] = sign(Ψ0[ix,iy,iz])*(1.0-sqrt(∂Ψ_∂x2+∂Ψ_∂y2+∂Ψ_∂z2))
+        end
+    end
+    return
+end
+
+@tiny function _kernel_update_Ψ!(Ψ, dΨ_dt, dt) # explicit step: add dt*dΨ_dt to Ψ at every index
+    idx = @cartesianindex
+    @inbounds Ψ[idx] = Ψ[idx] + dt*dΨ_dt[idx]
+    return
+end
\ No newline at end of file
diff --git a/scripts3D_variational/level_sets.jl b/scripts3D_variational/level_sets.jl
new file mode 100644
index 00000000..13019b10
--- /dev/null
+++ b/scripts3D_variational/level_sets.jl
@@ -0,0 +1,15 @@
+include("level_set_kernels.jl")
+
+const _init_level_set! = _kernel_init_level_set!(get_device()) # each kernel is instantiated once, when this file is loaded,
+const _compute_dΨ_dt!  = _kernel_compute_dΨ_dt!(get_device())  # for whatever get_device() returns at that moment
+const _update_Ψ!       = _kernel_update_Ψ!(get_device())       # (get_device is defined outside this file)
+
+function compute_level_set_from_dem!(Ψ,dem,dem_grid,Ψ_grid) # fill Ψ via the init kernel and return once the launch has finished
+    TinyKernels.device_synchronize(get_device())
+    h      = step.(Ψ_grid)
+    cutoff = 4max(h[1],h[2],h[3])  # four times the coarsest grid spacing
+    launch = _init_level_set!(Ψ,dem,dem_grid,Ψ_grid,cutoff,LinearAlgebra.I;ndrange=axes(Ψ)) # identity: node coordinates are used unchanged
+    wait(launch)
+    return
+end
+
diff --git a/scripts3D_variational/load_dem.jl b/scripts3D_variational/load_dem.jl
new file mode 100644
index 00000000..0385198b
--- /dev/null
+++ b/scripts3D_variational/load_dem.jl
@@ -0,0 +1,15 @@
+using FileIO
+
+function filter_range(r,lims) # indices from the first entry > lims[1] to the last entry < lims[2]
+    ifirst = something(findfirst(>(lims[1]), r), length(r)) # no entry above the lower limit: fall back to the last index
+    ilast  = something(findlast(<(lims[2]), r), 1)          # no entry below the upper limit: fall back to index 1
+    return ifirst:ilast
+end
+
+function load_dem(path,(;xlims,ylims)) # load "x","y","bed","surface" from `path` and crop them to the requested window
+    x,y,bed,surface=load(path,"x","y","bed","surface")
+    # the limits are offsets measured from the first coordinate of each axis
+    keep_x = filter_range(x,xlims .+ x[1])
+    keep_y = filter_range(y,ylims .+ y[1])
+    return (; x = x[keep_x], y = y[keep_y], bed = bed[keep_x,keep_y], surface = surface[keep_x,keep_y])
+end
\ No newline at end of file
diff --git a/scripts3D_variational/prepare_dem.jl b/scripts3D_variational/prepare_dem.jl
new file mode 100644
index 00000000..22fec202
--- /dev/null
+++ b/scripts3D_variational/prepare_dem.jl
@@ -0,0 +1,20 @@
+using JLD2
+using NetCDF
+
+const GREENLAND_PATH = "data/BedMachine/BedMachineGreenland-v5.nc" # NetCDF input read by prepare_greenland; relative path
+
+function prepare_greenland() # re-save the NetCDF grid as JLD2 with the y axis and the fields reversed along y
+    read_var(name) = ncread(GREENLAND_PATH,name)
+    flip_y(name)   = reverse(read_var(name); dims=2)
+    xs = read_var("x")
+    ys = reverse(read_var("y"))
+    @assert issorted(xs)
+    @assert issorted(ys)
+    x = LinRange(xs[1],xs[end],length(xs)) # axes are stored as ranges built from their end points
+    y = LinRange(ys[1],ys[end],length(ys))
+    bed, surface, mask = flip_y("bed"), flip_y("surface"), flip_y("mask")
+    jldsave("data/BedMachine/greenland.jld2";x,y,bed,surface,mask)
+    return
+end
+
+prepare_greenland() # runs the conversion whenever this script is executed
\ No newline at end of file
diff --git a/scripts3D_variational/signed_distances.jl b/scripts3D_variational/signed_distances.jl
new file mode 100644
index 00000000..6bc7b26e
--- /dev/null
+++ b/scripts3D_variational/signed_distances.jl
@@ -0,0 +1,65 @@
+using LinearAlgebra,GeometryBasics
+
+@inline S(x) = iszero(x) ? oneunit(x) : sign(x) # sign that maps zero to +1, so the result is never 0
+@inline sign_triangle(p,a,b,c) = S((p-a) ⋅ ((b-a) × (c-a))) # -1 only when p is strictly on the side opposite to the normal (b-a)×(c-a)
+
+@inline function ud_triangle(p,a,b,c) # unsigned distance from point p to the triangle (a,b,c)
+    sq(v) = dot(v,v)
+    # edge vectors and the offsets of p from each edge's start vertex
+    e1 = b - a; d1 = p - a
+    e2 = c - b; d2 = p - b
+    e3 = a - c; d3 = p - c
+    n  = cross(e1,e3)
+    # one sign per edge, telling on which side of that edge p lies within the triangle's plane
+    side = sign(dot(cross(e1,n),d1)) +
+           sign(dot(cross(e2,n),d2)) +
+           sign(dot(cross(e3,n),d3))
+    if side < 2
+        # nearest point is on the boundary: clamp the projection onto each edge and keep the closest
+        edge_sq(e,d) = sq(e*clamp(dot(e,d)/sq(e),0,1)-d)
+        return sqrt(min(edge_sq(e1,d1),edge_sq(e2,d2),edge_sq(e3,d3)))
+    end
+    return sqrt(dot(n,d1)*dot(n,d1)/sq(n)) # otherwise: distance to the plane of the triangle
+end
+
+@inline function closest_vertex_index(P,rc) # index of the node at or just below P along each axis of rc, clamped to the grid
+    lo     = first.(rc)
+    h      = step.(rc)
+    # admissible indices per axis: every node except the last, so that index+1 is still a node
+    bounds = map(r -> axes(r,1)[1:end-1], rc)
+    return CartesianIndex(clamp.(Int.(fld.(P .- lo, h)) .+ 1, bounds)...)
+end
+
+@inline inc(I,dim) = Base.setindex(I,1+I[dim],dim) # copy of I with component `dim` advanced by one
+@inline inc(I) = oneunit(I) + I                     # I advanced by one along every dimension
+
+@inline function triangle_pair(Iv,dem,rc) # the two triangles that tile the DEM cell whose lowest-index corner is Iv
+    @inline function vertex(I)
+        @inbounds Point3(rc[1][I[1]],rc[2][I[2]],dem[I])
+    end
+    # cell corners: at Iv, one step along dimension 1, one step along dimension 2, and diagonally opposite
+    v00,v10,v01,v11 = vertex(Iv),vertex(inc(Iv,1)),vertex(inc(Iv,2)),vertex(inc(Iv))
+    # both triangles share the edge between v10 and v01
+    return Triangle(v00,v10,v01),Triangle(v01,v10,v11)
+end
+
+@inline function distance_to_triangle_pair(P,Iv,dem,rc) # (distance from P to the nearer triangle of cell Iv, side of P)
+    first_tri,second_tri = triangle_pair(Iv,dem,rc)
+    d_first,d_second = ud_triangle(P,first_tri...),ud_triangle(P,second_tri...)
+    return min(d_first,d_second),sign_triangle(P,first_tri...) # the side is evaluated against the first triangle only
+end
+
+function sd_dem(P,cutoff,dem,rc) # (smallest distance from P to the DEM triangles within the search window, side of P)
+    @inbounds Pxy = clamp.(Point(P[1],P[2]),first.(rc),last.(rc)) # horizontal position limited to the extent of rc
+    @inbounds Q   = Point(Pxy[1],Pxy[2],P[3])
+    # cell holding the query, and the two corners of the window reaching `cutoff` around it
+    Ic = closest_vertex_index(Pxy,rc)
+    lo = closest_vertex_index(Pxy.-cutoff,rc)
+    hi = closest_vertex_index(Pxy.+cutoff,rc)
+    ud,sgn = distance_to_triangle_pair(Q,Ic,dem,rc) # the sign comes from the query's own cell only
+    for Iv in lo:hi
+        Iv == Ic && continue # already accounted for above
+        ud = min(ud,first(distance_to_triangle_pair(Q,Iv,dem,rc)))
+    end
+    return ud,sgn
+end
\ No newline at end of file
diff --git a/scripts3D_variational/stokes.jl b/scripts3D_variational/stokes.jl
new file mode 100644
index 00000000..cae191dd
--- /dev/null
+++ b/scripts3D_variational/stokes.jl
@@ -0,0 +1,24 @@
+include("stokes_kernels.jl")
+
+const _update_σ! = _kernel_update_σ!(get_device()) # kernels instantiated once, when this file is loaded,
+const _update_V! = _kernel_update_V!(get_device()) # for whatever get_device() returns at that moment
+
+function update_σ!(Pr, τ, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy, dz) # launch the pressure/stress kernel over every index of Pr
+    wait(_update_σ!(Pr, τ, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy, dz; ndrange=axes(Pr))) # returns only after the launch has been waited on
+    return
+end
+
+function update_V!(V, Pr, τ, ηs, wt, nudτ, ρg, dx, dy, dz; bwidth) # velocity update; boundary conditions and halo exchange run before the interior launch is awaited
+    V_inn = (x = inn(V.x), y = inn(V.y), z = inn(V.z)) # the kernel receives `inn` of each component (`inn` is defined by the including script)
+    ranges = split_ndrange(axes(Pr),bwidth) # boundary slabs of width bwidth first, interior last
+    ie,oe  =  hide_comm(ranges) do ndrange
+        _update_V!(V_inn, Pr, τ, ηs, wt, nudτ, ρg, dx, dy, dz; ndrange)
+    end
+    wait.(oe) # the boundary-slab launches are awaited before boundary conditions and halo update
+    bc_x_neumann!(0.0,V.y,V.z)
+    bc_y_neumann!(0.0,V.x,V.z)
+    bc_z_neumann!(0.0,V.x,V.y)
+    update_halo!(V.x,V.y,V.z)
+    wait(ie) # the interior launch is awaited last
+    return
+end
\ No newline at end of file
diff --git a/scripts3D_variational/stokes_kernels.jl b/scripts3D_variational/stokes_kernels.jl
new file mode 100644
index 00000000..0a9aab4a
--- /dev/null
+++ b/scripts3D_variational/stokes_kernels.jl
@@ -0,0 +1,131 @@
+@tiny function _kernel_update_σ!(Pr, τ, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy, dz) # update pressure and all six stress components at index (ix,iy,iz)
+    ix,iy,iz = @indices
+    # na,ns    = wt.not_air, wt.not_solid
+    @inline isin(A) = checkbounds(Bool,A,ix,iy,iz) # true when (ix,iy,iz) is a valid index of A
+    # detect and eliminate null spaces: any surrounding face with (approximately) zero not_air weight disables the cell
+    isnull = (wt.not_air.x[ix,iy,iz] ≈ 0.0) || (wt.not_air.x[ix+1,iy  ,iz  ] ≈ 0.0) ||
+             (wt.not_air.y[ix,iy,iz] ≈ 0.0) || (wt.not_air.y[ix  ,iy+1,iz  ] ≈ 0.0) ||
+             (wt.not_air.z[ix,iy,iz] ≈ 0.0) || (wt.not_air.z[ix  ,iy  ,iz+1] ≈ 0.0)
+    if !isnull && (wt.not_air.c[ix,iy,iz] > 0.0)
+        # normal strain rates from velocities weighted by the not_solid face fractions
+        exx = (V.x[ix+1,iy  ,iz  ]*wt.not_solid.x[ix+1,iy  ,iz  ] - V.x[ix,iy,iz]*wt.not_solid.x[ix,iy,iz])/dx
+        eyy = (V.y[ix  ,iy+1,iz  ]*wt.not_solid.y[ix  ,iy+1,iz  ] - V.y[ix,iy,iz]*wt.not_solid.y[ix,iy,iz])/dy
+        ezz = (V.z[ix  ,iy  ,iz+1]*wt.not_solid.z[ix  ,iy  ,iz+1] - V.z[ix,iy,iz]*wt.not_solid.z[ix,iy,iz])/dz
+        ∇V = exx + eyy + ezz
+        Pr[ix,iy,iz] -= ∇V*ηs[ix,iy,iz]*r/θ_dτ
+        τ.xx[ix,iy,iz] += (-τ.xx[ix,iy,iz] + 2.0*ηs[ix,iy,iz]*(exx-∇V/3.0)) * dτ_r
+        τ.yy[ix,iy,iz] += (-τ.yy[ix,iy,iz] + 2.0*ηs[ix,iy,iz]*(eyy-∇V/3.0)) * dτ_r
+        τ.zz[ix,iy,iz] += (-τ.zz[ix,iy,iz] + 2.0*ηs[ix,iy,iz]*(ezz-∇V/3.0)) * dτ_r
+    else
+        # disabled cell: pressure and normal stresses are reset
+        Pr[ix,iy,iz] = 0.0
+        τ.xx[ix,iy,iz] = 0.0
+        τ.yy[ix,iy,iz] = 0.0
+        τ.zz[ix,iy,iz] = 0.0
+    end
+    @inbounds if isin(τ.xy) # the shear components are smaller arrays, hence the bounds guard
+        # detect and eliminate null spaces
+        isnull = (wt.not_air.x[ix+1,iy+1,iz+1] ≈ 0.0) || (wt.not_air.x[ix+1,iy  ,iz+1] ≈ 0.0) ||
+                 (wt.not_air.y[ix+1,iy+1,iz+1] ≈ 0.0) || (wt.not_air.y[ix  ,iy+1,iz+1] ≈ 0.0)
+        if !isnull && (wt.not_air.xy[ix,iy,iz] > 0.0)
+            exy =
+                0.5 * (
+                    (V.x[ix+1,iy+1,iz+1]*wt.not_solid.x[ix+1,iy+1,iz+1] - V.x[ix+1,iy  ,iz+1]*wt.not_solid.x[ix+1,iy  ,iz+1])/dy +
+                    (V.y[ix+1,iy+1,iz+1]*wt.not_solid.y[ix+1,iy+1,iz+1] - V.y[ix  ,iy+1,iz+1]*wt.not_solid.y[ix  ,iy+1,iz+1])/dx
+                )
+            ηs_av = 0.25*(ηs[ix,iy,iz+1] + ηs[ix+1,iy,iz+1] + ηs[ix,iy+1,iz+1] + ηs[ix+1,iy+1,iz+1]) # mean of the four cells around the xy edge
+            τ.xy[ix,iy,iz] += (-τ.xy[ix,iy,iz] + 2.0*ηs_av*exy)*dτ_r
+        else
+            τ.xy[ix,iy,iz] = 0.0
+        end
+    end
+    @inbounds if isin(τ.xz)
+        # detect and eliminate null spaces
+        isnull = (wt.not_air.x[ix+1,iy+1,iz+1] ≈ 0.0) || (wt.not_air.x[ix+1,iy+1,iz  ] ≈ 0.0) ||
+                 (wt.not_air.z[ix+1,iy+1,iz+1] ≈ 0.0) || (wt.not_air.z[ix  ,iy+1,iz+1] ≈ 0.0)
+        if !isnull && (wt.not_air.xz[ix,iy,iz] > 0.0)
+            exz =
+                0.5 * (
+                    (V.x[ix+1,iy+1,iz+1]*wt.not_solid.x[ix+1,iy+1,iz+1] - V.x[ix+1,iy+1,iz  ]*wt.not_solid.x[ix+1,iy+1,iz  ])/dz +
+                    (V.z[ix+1,iy+1,iz+1]*wt.not_solid.z[ix+1,iy+1,iz+1] - V.z[ix  ,iy+1,iz+1]*wt.not_solid.z[ix  ,iy+1,iz+1])/dx
+                )
+            ηs_av = 0.25*(ηs[ix,iy+1,iz] + ηs[ix+1,iy+1,iz] + ηs[ix,iy+1,iz+1] + ηs[ix+1,iy+1,iz+1]) # mean of the four cells around the xz edge
+            τ.xz[ix,iy,iz] += (-τ.xz[ix,iy,iz] + 2.0*ηs_av*exz)*dτ_r
+        else
+            τ.xz[ix,iy,iz] = 0.0
+        end
+    end
+    @inbounds if isin(τ.yz)
+        # detect and eliminate null spaces
+        isnull = (wt.not_air.y[ix+1,iy+1,iz+1] ≈ 0.0) || (wt.not_air.y[ix+1,iy+1,iz  ] ≈ 0.0) ||
+                 (wt.not_air.z[ix+1,iy+1,iz+1] ≈ 0.0) || (wt.not_air.z[ix+1,iy  ,iz+1] ≈ 0.0)
+        if !isnull && (wt.not_air.yz[ix,iy,iz] > 0.0)
+            eyz =
+                0.5 * (
+                    (V.y[ix+1,iy+1,iz+1]*wt.not_solid.y[ix+1,iy+1,iz+1] - V.y[ix+1,iy+1,iz  ]*wt.not_solid.y[ix+1,iy+1,iz  ])/dz +
+                    (V.z[ix+1,iy+1,iz+1]*wt.not_solid.z[ix+1,iy+1,iz+1] - V.z[ix+1,iy  ,iz+1]*wt.not_solid.z[ix+1,iy  ,iz+1])/dy
+                )
+            ηs_av = 0.25*(ηs[ix+1,iy,iz] + ηs[ix+1,iy+1,iz] + ηs[ix+1,iy,iz+1] + ηs[ix+1,iy+1,iz+1]) # mean of the four cells around the yz edge
+            τ.yz[ix,iy,iz] += (-τ.yz[ix,iy,iz] + 2.0*ηs_av*eyz)*dτ_r
+        else
+            τ.yz[ix,iy,iz] = 0.0
+        end
+    end
+    return
+end
+
+@tiny function _kernel_update_V!(V, Pr, τ, ηs, wt, nudτ, ρg, dx, dy, dz) # add (stress divergence - ρg)*nudτ/ηs to each velocity component, or zero it
+    ix,iy,iz = @indices
+    @inline isin(A) = checkbounds(Bool,A,ix,iy,iz) # the components of V differ in size, hence one bounds guard each
+    # TODO: check which volume fraction (non-air or non-solid) really determines the null spaces
+    @inbounds if isin(V.x)
+        # detect and eliminate null spaces: the two cells and four edges entering the x stencil below
+        isnull = ( wt.not_solid.c[ix+1,iy+1,iz+1] ≈ 0) || ( wt.not_solid.c[ix,iy+1,iz+1] ≈ 0) ||
+                 (wt.not_solid.xy[ix  ,iy+1,iz  ] ≈ 0) || (wt.not_solid.xy[ix,iy  ,iz  ] ≈ 0) ||
+                 (wt.not_solid.xz[ix  ,iy  ,iz+1] ≈ 0) || (wt.not_solid.xz[ix,iy  ,iz  ] ≈ 0)
+        if !isnull && (wt.not_air.x[ix+1,iy+1,iz+1] > 0) && (wt.not_solid.x[ix+1,iy+1,iz+1] > 0)
+            # TODO: check which cells contribute to the momentum balance to verify ηs_x is computed correctly
+            ηs_x = max(ηs[ix,iy+1,iz+1],ηs[ix+1,iy+1,iz+1])
+            ∂σxx_∂x = ((-Pr[ix+1,iy+1,iz+1]+τ.xx[ix+1,iy+1,iz+1])*wt.not_air.c[ix+1,iy+1,iz+1] -
+                       (-Pr[ix  ,iy+1,iz+1]+τ.xx[ix  ,iy+1,iz+1])*wt.not_air.c[ix  ,iy+1,iz+1])/dx
+            ∂τxy_∂y = (τ.xy[ix,iy+1,iz]*wt.not_air.xy[ix,iy+1,iz] - τ.xy[ix,iy,iz]*wt.not_air.xy[ix,iy,iz])/dy
+            ∂τxz_∂z = (τ.xz[ix,iy,iz+1]*wt.not_air.xz[ix,iy,iz+1] - τ.xz[ix,iy,iz]*wt.not_air.xz[ix,iy,iz])/dz
+            V.x[ix,iy,iz] += (∂σxx_∂x + ∂τxy_∂y + ∂τxz_∂z - ρg.x)*nudτ/ηs_x
+        else
+            V.x[ix,iy,iz] = 0.0
+        end
+    end
+    @inbounds if isin(V.y)
+        # detect and eliminate null spaces: the two cells and four edges entering the y stencil below
+        isnull = ( wt.not_solid.c[ix+1,iy+1,iz+1] ≈ 0) || ( wt.not_solid.c[ix+1,iy,iz+1] ≈ 0) ||
+                 (wt.not_solid.xy[ix+1,iy  ,iz  ] ≈ 0) || (wt.not_solid.xy[ix  ,iy,iz  ] ≈ 0) ||
+                 (wt.not_solid.yz[ix  ,iy  ,iz+1] ≈ 0) || (wt.not_solid.yz[ix  ,iy,iz  ] ≈ 0)
+        if !isnull && (wt.not_air.y[ix+1,iy+1,iz+1] > 0) && (wt.not_solid.y[ix+1,iy+1,iz+1] > 0)
+            # TODO: check which cells contribute to the momentum balance to verify ηs_y is computed correctly
+            ηs_y = max(ηs[ix+1,iy,iz+1],ηs[ix+1,iy+1,iz+1])
+            ∂σyy_∂y = ((-Pr[ix+1,iy+1,iz+1] + τ.yy[ix+1,iy+1,iz+1])*wt.not_air.c[ix+1,iy+1,iz+1] -
+                       (-Pr[ix+1,iy  ,iz+1] + τ.yy[ix+1,iy  ,iz+1])*wt.not_air.c[ix+1,iy  ,iz+1])/dy
+            ∂τxy_∂x = (τ.xy[ix+1,iy,iz  ]*wt.not_air.xy[ix+1,iy,iz] - τ.xy[ix,iy,iz]*wt.not_air.xy[ix,iy,iz])/dx
+            ∂τyz_∂z = (τ.yz[ix  ,iy,iz+1]*wt.not_air.yz[ix,iy,iz+1] - τ.yz[ix,iy,iz]*wt.not_air.yz[ix,iy,iz])/dz
+            V.y[ix,iy,iz] += (∂σyy_∂y + ∂τxy_∂x + ∂τyz_∂z - ρg.y)*nudτ/ηs_y
+        else
+            V.y[ix,iy,iz] = 0.0
+        end
+    end
+    @inbounds if isin(V.z)
+        # detect and eliminate null spaces: the xz and yz edges tested here are the ones entering the z stencil below
+        isnull = ( wt.not_solid.c[ix+1,iy+1,iz+1] ≈ 0) || ( wt.not_solid.c[ix+1,iy+1,iz  ] ≈ 0) ||
+                 (wt.not_solid.xz[ix+1,iy  ,iz  ] ≈ 0) || (wt.not_solid.xz[ix  ,iy  ,iz  ] ≈ 0) ||
+                 (wt.not_solid.yz[ix  ,iy+1,iz  ] ≈ 0) || (wt.not_solid.yz[ix  ,iy  ,iz  ] ≈ 0)
+        if !isnull && (wt.not_air.z[ix+1,iy+1,iz+1] > 0) && (wt.not_solid.z[ix+1,iy+1,iz+1] > 0) # z-face weights, as x and y use theirs
+            # TODO: check which cells contribute to the momentum balance to verify ηs_z is computed correctly
+            ηs_z = max(ηs[ix+1,iy+1,iz],ηs[ix+1,iy+1,iz+1])
+            ∂σzz_∂z = ((-Pr[ix+1,iy+1,iz+1] + τ.zz[ix+1,iy+1,iz+1])*wt.not_air.c[ix+1,iy+1,iz+1] -
+                       (-Pr[ix+1,iy+1,iz  ] + τ.zz[ix+1,iy+1,iz  ])*wt.not_air.c[ix+1,iy+1,iz  ])/dz
+            ∂τxz_∂x = (τ.xz[ix+1,iy,iz]*wt.not_air.xz[ix+1,iy,iz] - τ.xz[ix,iy,iz]*wt.not_air.xz[ix,iy,iz])/dx
+            ∂τyz_∂y = (τ.yz[ix,iy+1,iz]*wt.not_air.yz[ix,iy+1,iz] - τ.yz[ix,iy,iz]*wt.not_air.yz[ix,iy,iz])/dy
+            V.z[ix,iy,iz] += (∂σzz_∂z + ∂τxz_∂x + ∂τyz_∂y - ρg.z)*nudτ/ηs_z
+        else
+            V.z[ix,iy,iz] = 0.0
+        end
+    end
+    return
+end
diff --git a/scripts3D_variational/test_volume_fractions.jl b/scripts3D_variational/test_volume_fractions.jl
new file mode 100644
index 00000000..5a28f584
--- /dev/null
+++ b/scripts3D_variational/test_volume_fractions.jl
@@ -0,0 +1,251 @@
+using FastIce
+using Logging
+using MPI
+using ImplicitGlobalGrid
+using TinyKernels
+using HDF5
+using LightXML
+using CairoMakie
+
+include("load_dem.jl")
+include("signed_distances.jl")
+include("level_sets.jl")
+include("volume_fractions.jl")
+include("bcs.jl")
+include("stokes.jl")
+include("data_io.jl")
+include("hide_communication.jl")
+
+@views av1(A) = 0.5 .* (A[1:end-1] .+ A[2:end]) # mean of each pair of neighbouring entries of a 1-D collection
+@views inn_x(A) = A[2:end-1, :, :]              # drops the first and last index along dimension 1
+@views inn_y(A) = A[:, 2:end-1, :]              # drops the first and last index along dimension 2
+@views inn_z(A) = A[:, :, 2:end-1]              # drops the first and last index along dimension 3
+@views inn(A) = A[2:end-1, 2:end-1, 2:end-1]    # drops the first and last index along all three dimensions
+
+@views function main(grid_dims,grid) # load and plot the DEM, rescale it, then hand it to run_simulation
+    # unpack values (the tuple is assembled by the driver code at the bottom of this script)
+    me, dims, nprocs, coords, comm_cart = grid
+
+    # init logger: wrap the current global logger in FastIce's MPILogger
+    global_logger(FastIce.Logging.MPILogger(0, comm_cart, global_logger()))
+
+    # path to DEM data
+    greenland_path = "data/BedMachine/greenland.jld2"
+
+    # region to simulate; load_dem adds the first DEM coordinate to these limits, i.e. they are offsets
+    global_region = (xlims=(1100.0e3, 1200.0e3), ylims=(1000.0e3, 1100.0e3))
+
+    # load DEM
+    @info "loading DEM data from the file '$greenland_path'"
+    (; x, y, bed, surface) = load_dem(greenland_path, global_region)
+    @info "DEM resolution: $(size(bed,1)) × $(size(bed,2))"
+
+    @info "plot DEMs"
+    if me == 0 # only the process with me == 0 writes the figure
+        fig = Figure(resolution=(2000,700),fontsize=32)
+        ax  = (
+            bed = Axis(fig[1,1][1,1];aspect=DataAspect(),title="bedrock",xlabel="x",ylabel="y"),
+            ice = Axis(fig[1,2][1,1];aspect=DataAspect(),title="ice"    ,xlabel="x",ylabel="y"),
+        )
+        plt = (
+            bed = heatmap!(ax.bed,x,y,bed    ;colormap=:terrain),
+            ice = heatmap!(ax.ice,x,y,surface;colormap=:terrain),
+        )
+        Colorbar(fig[1,1][1,2],plt.bed)
+        Colorbar(fig[1,2][1,2],plt.ice)
+        save("region.png",fig)
+    end
+
+    # compute origin and size of the domain (required for scaling and computing the grid size)
+    ox, oy, oz = x[1], y[1], minimum(bed)
+    lx = x[end] - ox
+    ly = y[end] - oy
+    lz = maximum(surface) - oz
+
+    # shift and scale the domain before computation (center of the domain is (0,0) in x-y plane)
+    δx, δy = ox + 0.5lx, oy + 0.5ly # required to avoid conversion to Vector
+    x = @. (x - δx) / lz # every length is divided by lz, so elevations end up within [0,1]
+    y = @. (y - δy) / lz
+    @. bed = (bed - oz) / lz         # in place: the arrays returned by load_dem are overwritten
+    @. surface = (surface - oz) / lz
+
+    @. surface -= 0.05 # lowers the scaled ice surface by 0.05
+
+    # run simulation
+    dem_data = (; x, y, bed, surface)
+    @info "running the simulation"
+    run_simulation(dem_data, grid_dims, me, dims, coords, comm_cart)
+
+    return
+end
+
+@views function run_simulation(dem_data, grid_dims, me, dims, coords, comm_cart) # set up fields, iterate the σ/V updates, write the results
+    # physics
+    # global domain origin and size
+    ox_g, oy_g, oz_g = dem_data.x[1], dem_data.y[1], 0.0
+    lx_g = dem_data.x[end] - ox_g
+    ly_g = dem_data.y[end] - oy_g
+    lz_g = 1.0
+
+    ρg = (x=0.0, y=0.0, z=1.0) # body-force components subtracted in the velocity kernel
+
+    # local domain size and origin: the global box is divided evenly among `dims` processes per dimension
+    lx_l, ly_l, lz_l = (lx_g, ly_g, lz_g) ./ dims
+    ox_l, oy_l, oz_l = (ox_g, oy_g, oz_g) .+ coords .* (lx_l, ly_l, lz_l)
+
+    # numerics
+    nx, ny, nz       = grid_dims
+    nx_l, ny_l, nz_l = grid_dims .+ 2 # include ghost nodes
+    nx_g, ny_g, nz_g = grid_dims.*dims
+    bwidth = (8, 4, 4) # widths of the boundary slabs passed to update_V!
+
+    # preprocessing
+    dx, dy, dz = lx_g / nx_g, ly_g / ny_g, lz_g / nz_g
+    @info "grid spacing: dx = $dx, dy = $dy, dz = $dz"
+
+    # take into account ghost nodes to simplify model setup: one extra spacing on either side of the local box
+    xv_l = LinRange(ox_l - dx, ox_l + lx_l + dx, nx_l + 1)
+    yv_l = LinRange(oy_l - dy, oy_l + ly_l + dy, ny_l + 1)
+    zv_l = LinRange(oz_l - dz, oz_l + lz_l + dz, nz_l + 1)
+    xc_l, yc_l, zc_l = av1.((xv_l, yv_l, zv_l)) # midpoints between neighbouring vertex coordinates
+
+    # PT params
+    r = 0.7
+    lτ_re_mech = 0.5min(lx_g, ly_g, lz_g) / π
+    vdτ = min(dx, dy, dz) / sqrt(5.1)
+    θ_dτ = lτ_re_mech * (r + 4 / 3) / vdτ
+    nudτ = vdτ * lτ_re_mech
+    dτ_r = 1.0 / (θ_dτ + 1.0)
+
+    # fields allocation
+    # level set: one value per vertex of the local grid, ghost layer included
+    Ψ = (
+        not_solid=scalar_field(Float64, nx_l + 1, ny_l + 1, nz_l + 1),
+        not_air  =scalar_field(Float64, nx_l + 1, ny_l + 1, nz_l + 1),
+    )
+    wt = (
+        not_solid=volfrac_field(Float64, nx_l, ny_l, nz_l),
+        not_air  =volfrac_field(Float64, nx_l, ny_l, nz_l),
+    )
+    # mechanics (stress fields include ghost nodes due to redundant computations on distributed staggered grid)
+    Pr = scalar_field(Float64, nx_l, ny_l, nz_l)
+    τ  = tensor_field(Float64, nx_l, ny_l, nz_l)
+    V  = vector_field(Float64, nx_l, ny_l, nz_l)
+    ηs = scalar_field(Float64, nx_l, ny_l, nz_l)
+    # residuals (allocated here; not referenced again in this function)
+    Res = (
+        Pr=scalar_field(Float64, nx, ny, nz),
+        V =vector_field(Float64, nx, ny, nz)
+    )
+    # visualisation: sized without the ghost layer
+    Vmag = scalar_field(Float64, nx, ny, nz)
+    τII  = scalar_field(Float64, nx, ny, nz)
+    Ψav = (
+        not_air=scalar_field(Float64,nx,ny,nz),
+        not_solid=scalar_field(Float64,nx,ny,nz),
+    )
+
+    # initialisation: zero velocity, stress and pressure, unit viscosity
+    for comp in eachindex(V) fill!(V[comp], 0.0) end
+    for comp in eachindex(τ) fill!(τ[comp], 0.0) end
+    fill!(Pr, 0.0)
+    fill!(ηs, 1.0)
+
+    # compute level sets from DEM data
+    dem_grid = (dem_data.x, dem_data.y)
+    Ψ_grid = (xv_l, yv_l, zv_l)
+
+    @info "computing the level set for the ice surface"
+    compute_level_set_from_dem!(Ψ.not_air, to_device(dem_data.surface), dem_grid, Ψ_grid)
+
+    @info "computing the level set for the bedrock surface"
+    compute_level_set_from_dem!(Ψ.not_solid, to_device(dem_data.bed), dem_grid, Ψ_grid)
+    TinyKernels.device_synchronize(get_device())
+    # invert level set to set what's below the DEM surface as inside
+    @. Ψ.not_solid *= -1.0
+    TinyKernels.device_synchronize(get_device())
+
+    @info "computing volume fractions from level sets"
+    for phase in eachindex(Ψ) # Ψ and wt are indexed with the same keys
+        compute_volume_fractions_from_level_set!(wt[phase], Ψ[phase], dx, dy, dz)
+    end
+
+    @info "iteration loop"
+    for iter in 1:500 # fixed iteration count; no convergence check is performed here
+        @info "  iter: $iter"
+        update_σ!(Pr, τ, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy, dz)
+        update_V!(V, Pr, τ, ηs, wt, nudτ, ρg, dx, dy, dz; bwidth)
+    end
+
+    @info "saving results on disk"
+    dim_g = (nx_g, ny_g, nz_g)
+    update_vis_fields!(Vmag, τII, Ψav, V, τ, Ψ)
+    out_h5 = "results.h5"
+    # index block of this process within the global arrays of size dim_g
+    ndrange = CartesianIndices(((coords[1]*nx+1):(coords[1]+1)*nx,
+                                (coords[2]*ny+1):(coords[2]+1)*ny,
+                                (coords[3]*nz+1):(coords[3]+1)*nz))
+    fields = Dict("LS_ice" => Ψav.not_air, "LS_bed" => Ψav.not_solid, "Vmag" => Vmag, "TII" => τII, "Pr" => inn(Pr))
+    @info "saving HDF5 file"
+    write_h5(out_h5, fields, dim_g, ndrange, comm_cart, MPI.Info())
+
+    @info "saving XDMF file..."
+    (me == 0) && write_xdmf("results.xdmf3", out_h5, fields, (xc_l[2], yc_l[2], zc_l[2]), (dx, dy, dz), dim_g) # written by the process with me == 0 only
+
+    return
+end
+
+@tiny function _kernel_update_vis_fields!(Vmag, τII, Ψav, V, τ, Ψ) # fill the output fields Vmag, τII and Ψav from V, τ and Ψ
+    ix, iy, iz = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy, iz)
+    @inbounds if isin(Ψav.not_air) # guard on the array that is written; the reads below are offset by up to 2
+        pav = 0.0
+        for idz = 1:2, idy = 1:2, idx = 1:2
+            pav += Ψ.not_air[ix+idx, iy+idy, iz+idz]
+        end
+        Ψav.not_air[ix, iy, iz] = pav / 8 # mean of the eight surrounding Ψ values
+    end
+    @inbounds if isin(Ψav.not_solid) # guard on the array that is written
+        pav = 0.0
+        for idz = 1:2, idy = 1:2, idx = 1:2
+            pav += Ψ.not_solid[ix+idx, iy+idy, iz+idz]
+        end
+        Ψav.not_solid[ix, iy, iz] = pav / 8
+    end
+    @inbounds if isin(Vmag)
+        # each component is the mean of two neighbouring values along its own direction
+        vxc = 0.5 * (V.x[ix+1, iy+1, iz+1] + V.x[ix+2, iy+1, iz+1])
+        vyc = 0.5 * (V.y[ix+1, iy+1, iz+1] + V.y[ix+1, iy+2, iz+1])
+        vzc = 0.5 * (V.z[ix+1, iy+1, iz+1] + V.z[ix+1, iy+1, iz+2])
+        Vmag[ix, iy, iz] = sqrt(vxc^2 + vyc^2 + vzc^2)
+    end
+    @inbounds if isin(τII)
+        # each shear component is the mean of four neighbouring values
+        τxyc = 0.25 * (τ.xy[ix, iy, iz] + τ.xy[ix+1, iy, iz] + τ.xy[ix, iy+1, iz] + τ.xy[ix+1, iy+1, iz])
+        τxzc = 0.25 * (τ.xz[ix, iy, iz] + τ.xz[ix+1, iy, iz] + τ.xz[ix, iy, iz+1] + τ.xz[ix+1, iy, iz+1])
+        τyzc = 0.25 * (τ.yz[ix, iy, iz] + τ.yz[ix, iy+1, iz] + τ.yz[ix, iy, iz+1] + τ.yz[ix, iy+1, iz+1])
+        τII[ix, iy, iz] = sqrt(0.5 * (τ.xx[ix+1, iy+1, iz+1]^2 + τ.yy[ix+1, iy+1, iz+1]^2 + τ.zz[ix+1, iy+1, iz+1]^2) + τxyc^2 + τxzc^2 + τyzc^2)
+    end
+    return
+end
+
+const _update_vis_fields! = _kernel_update_vis_fields!(get_device()) # instantiated once for the device get_device() returns at load time
+
+function update_vis_fields!(Vmag, τII, Ψav, V, τ, Ψ) # launch the visualisation kernel over every index of Vmag
+    wait(_update_vis_fields!(Vmag, τII, Ψav, V, τ, Ψ; ndrange=axes(Vmag))) # returns only after the launch has been waited on
+    return
+end
+
+grid_dims = (1000, 1000, 100)    # local grid size without ghost nodes (run_simulation adds 2 per dimension)
+grid_dims_igg = grid_dims .+ 2   # size handed to init_global_grid
+
+# init MPI and IGG (MPI is initialised explicitly, hence init_MPI=false)
+MPI.Init()
+me, dims, nprocs, coords, comm_cart = init_global_grid(grid_dims_igg...; init_MPI=false)
+dims   = Tuple(dims)     # converted to tuples before being passed on
+coords = Tuple(coords)
+grid   = (me,dims,nprocs,coords,comm_cart) # unpacked in this order by main
+
+main(grid_dims,grid)
+
+MPI.Barrier(comm_cart)
+# finalize_global_grid(; finalize_MPI=false)
+# MPI.Barrier(comm_cart)
+MPI.Finalize()
\ No newline at end of file
diff --git a/scripts3D_variational/volume_fraction_kernels.jl b/scripts3D_variational/volume_fraction_kernels.jl
new file mode 100644
index 00000000..39df9f54
--- /dev/null
+++ b/scripts3D_variational/volume_fraction_kernels.jl
@@ -0,0 +1,58 @@
+@tiny function _kernel_compute_volume_fractions_from_level_set!(wt,Ψ,dx,dy,dz) # volfrac/ω of a dx×dy×dz cell at centres, faces and edges
+    ix,iy,iz = @indices
+    cell = Rect(Vec(0.0,0.0,0.0), Vec(dx,dy,dz))
+    ω = GeometryBasics.volume(cell)
+    @inline Ψ_ax(dix,diy,diz) = 0.5*(Ψ[ix+dix,iy+diy,iz+diz]+Ψ[ix+dix+1,iy+diy,iz+diz]) # mean of two Ψ values adjacent along x
+    @inline Ψ_ay(dix,diy,diz) = 0.5*(Ψ[ix+dix,iy+diy,iz+diz]+Ψ[ix+dix,iy+diy+1,iz+diz]) # ... along y
+    @inline Ψ_az(dix,diy,diz) = 0.5*(Ψ[ix+dix,iy+diy,iz+diz]+Ψ[ix+dix,iy+diy,iz+diz+1]) # ... along z
+    @inline Ψ_axy(dix,diy,diz) = 0.25*(Ψ[ix+dix  ,iy+diy  ,iz+diz+1]+Ψ[ix+dix+1,iy+diy  ,iz+diz+1]+
+                                       Ψ[ix+dix  ,iy+diy+1,iz+diz+1]+Ψ[ix+dix+1,iy+diy+1,iz+diz+1])
+    @inline Ψ_axz(dix,diy,diz) = 0.25*(Ψ[ix+dix  ,iy+diy+1,iz+diz  ]+Ψ[ix+dix+1,iy+diy+1,iz+diz  ]+
+                                       Ψ[ix+dix  ,iy+diy+1,iz+diz+1]+Ψ[ix+dix+1,iy+diy+1,iz+diz+1])
+    @inline Ψ_ayz(dix,diy,diz) = 0.25*(Ψ[ix+dix+1,iy+diy  ,iz+diz  ]+Ψ[ix+dix+1,iy+diy+1,iz+diz  ]+
+                                       Ψ[ix+dix+1,iy+diy  ,iz+diz+1]+Ψ[ix+dix+1,iy+diy+1,iz+diz+1])
+    @inline isin(A) = checkbounds(Bool,A,ix,iy,iz) # the weight arrays differ in size, hence one guard per location
+    # cell centers: the eight corner values are listed with x varying fastest, then y, then z
+    @inbounds if isin(wt.c)
+        Ψs = Vec{8}(Ψ[ix,iy,iz  ],Ψ[ix+1,iy,iz  ],Ψ[ix,iy+1,iz  ],Ψ[ix+1,iy+1,iz  ],
+                    Ψ[ix,iy,iz+1],Ψ[ix+1,iy,iz+1],Ψ[ix,iy+1,iz+1],Ψ[ix+1,iy+1,iz+1])
+        wt.c[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    # x faces
+    @inbounds if isin(wt.x)
+        Ψs = Vec{8}(Ψ_ax(0,0,0),Ψ_ax(1,0,0),Ψ_ax(0,1,0),Ψ_ax(1,1,0),
+                    Ψ_ax(0,0,1),Ψ_ax(1,0,1),Ψ_ax(0,1,1),Ψ_ax(1,1,1))
+        wt.x[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    # y faces
+    @inbounds if isin(wt.y)
+        Ψs = Vec{8}(Ψ_ay(0,0,0),Ψ_ay(1,0,0),Ψ_ay(0,1,0),Ψ_ay(1,1,0),
+                    Ψ_ay(0,0,1),Ψ_ay(1,0,1),Ψ_ay(0,1,1),Ψ_ay(1,1,1))
+        wt.y[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    # z faces
+    @inbounds if isin(wt.z)
+        Ψs = Vec{8}(Ψ_az(0,0,0),Ψ_az(1,0,0),Ψ_az(0,1,0),Ψ_az(1,1,0),
+                    Ψ_az(0,0,1),Ψ_az(1,0,1),Ψ_az(0,1,1),Ψ_az(1,1,1))
+        wt.z[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    # xy edges
+    @inbounds if isin(wt.xy)
+        Ψs = Vec{8}(Ψ_axy(0,0,0),Ψ_axy(1,0,0),Ψ_axy(0,1,0),Ψ_axy(1,1,0),
+                    Ψ_axy(0,0,1),Ψ_axy(1,0,1),Ψ_axy(0,1,1),Ψ_axy(1,1,1))
+        wt.xy[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    # xz edges: same corner ordering as every other location, second entry is (1,0,·)
+    @inbounds if isin(wt.xz)
+        Ψs = Vec{8}(Ψ_axz(0,0,0),Ψ_axz(1,0,0),Ψ_axz(0,1,0),Ψ_axz(1,1,0),
+                    Ψ_axz(0,0,1),Ψ_axz(1,0,1),Ψ_axz(0,1,1),Ψ_axz(1,1,1))
+        wt.xz[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    # yz edges: same corner ordering as every other location, second entry is (1,0,·)
+    @inbounds if isin(wt.yz)
+        Ψs = Vec{8}(Ψ_ayz(0,0,0),Ψ_ayz(1,0,0),Ψ_ayz(0,1,0),Ψ_ayz(1,1,0),
+                    Ψ_ayz(0,0,1),Ψ_ayz(1,0,1),Ψ_ayz(0,1,1),Ψ_ayz(1,1,1))
+        wt.yz[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    return
+end
\ No newline at end of file
diff --git a/scripts3D_variational/volume_fractions.jl b/scripts3D_variational/volume_fractions.jl
new file mode 100644
index 00000000..4ef6a96d
--- /dev/null
+++ b/scripts3D_variational/volume_fractions.jl
@@ -0,0 +1,128 @@
+@inline perturb(ϕ) = ifelse(abs(ϕ) > 1e-20, ϕ, ifelse(ϕ > 0, 1e-20, -1e-20)) # replace values with |ϕ| ≤ 1e-20 by ±1e-20 (zero maps to -1e-20)
+
+@inline trivol(v1,v2,v3) = (e13 = v3-v1; e12 = v2-v1; 0.5*abs(cross(e13,e12))) # half the magnitude of the cross product of two triangle edges
+
+function volfrac(tri,ϕ::Vec3{T})::T where T
+    # Measure of the part of the triangle `tri` on which the vertex values `ϕ`,
+    # interpolated linearly along the edges, are negative. The whole triangle is
+    # returned when all three values are negative and zero when all are positive.
+    # The result is converted to `T` by the return-type annotation.
+    v1,v2,v3 = tri
+    neg1,neg2,neg3 = ϕ[1] < 0, ϕ[2] < 0, ϕ[3] < 0
+    if neg1 && neg2 && neg3 # ---
+        return trivol(v1,v2,v3)
+    elseif ϕ[1] > 0 && ϕ[2] > 0 && ϕ[3] > 0 # +++
+        return 0.0
+    end
+    # point on edge (i,j) where the interpolated value vanishes
+    @inline cut(i,j) = tri[j]*(ϕ[i]/(ϕ[i]-ϕ[j])) - tri[i]*(ϕ[j]/(ϕ[i]-ϕ[j]))
+    v12,v13,v23 = cut(1,2),cut(1,3),cut(2,3)
+    # mixed sign patterns; any value that is not negative is treated as '+'
+    if neg1 && neg2       # --+
+        return trivol(v1,v23,v13) + trivol(v1,v2,v23)
+    elseif neg1 && neg3   # -+-
+        return trivol(v3,v12,v23) + trivol(v3,v1,v12)
+    elseif neg1           # -++
+        return trivol(v1,v12,v13)
+    elseif neg2 && neg3   # +--
+        return trivol(v2,v13,v12) + trivol(v2,v3,v13)
+    elseif neg2           # +-+
+        return trivol(v12,v2,v23)
+    else                  # ++-
+        return trivol(v13,v23,v3)
+    end
+    # (every branch above returns)
+end
+
+function volfrac(rect::Rect2{T},ϕ::Vec4{T}) where T
+    o,w = origin(rect), widths(rect)
+    c1,c2,c3,c4 = o,o+Vec(w[1],0.0),o+w,o+Vec(0.0,w[2]) # corners: origin, +x, +x+y, +y
+    ψ = perturb.(ϕ) # the triangle routine tests signs, so move tiny values off zero first
+    first_half = volfrac(Vec(c1,c2,c3),Vec3{T}(ψ[1],ψ[2],ψ[3])) # rectangle split along the c1-c3 diagonal
+    return first_half + volfrac(Vec(c1,c3,c4),Vec3{T}(ψ[1],ψ[3],ψ[4]))
+end
+
+@inline tetvol(v1,v2,v3,v4) = abs(det(hcat(v2-v1,v3-v1,v4-v1)))/6.0 # |determinant of the three edge vectors from v1| / 6
+
+function volfrac(tet,ϕ::Vec4) # volume of the part of tetrahedron `tet` where the vertex values ϕ, interpolated along edges, are negative
+    v1,v2,v3,v4 = tet
+    @inline vij(i,j) = tet[j]*(ϕ[i]/(ϕ[i]-ϕ[j])) - tet[i]*(ϕ[j]/(ϕ[i]-ϕ[j])) # point on edge (i,j) where the interpolated value is zero
+    nneg = count(ϕ.<0) # number of vertices with a negative value selects the case below
+    if nneg == 0     # ++++
+        return 0.0
+    elseif nneg == 1 # -+++
+        if ϕ[1] < 0 # small tet spanned by the negative vertex and its three edge crossings
+            return tetvol(v1,vij(1,2),vij(1,3),vij(1,4))
+        elseif ϕ[2] < 0
+            return tetvol(v2,vij(2,1),vij(2,3),vij(2,4))
+        elseif ϕ[3] < 0
+            return tetvol(v3,vij(3,1),vij(3,2),vij(3,4))
+        else # ϕ[4] < 0
+            return tetvol(v4,vij(4,1),vij(4,2),vij(4,3))
+        end
+    elseif nneg == 2 # --++
+        if ϕ[1] < 0 && ϕ[2] < 0 # sum of three tets built from the two negative vertices and the four edge crossings
+            return tetvol(v1      ,v2      ,vij(1,3),vij(2,4)) +
+                   tetvol(vij(2,3),v2      ,vij(1,3),vij(2,4)) +
+                   tetvol(v1      ,vij(1,4),vij(1,3),vij(2,4))
+        elseif ϕ[1] < 0 && ϕ[3] < 0 # same construction with the vertex roles permuted
+            return tetvol(v1      ,v3      ,vij(1,4),vij(3,2)) +
+                   tetvol(vij(3,4),v3      ,vij(1,4),vij(3,2)) +
+                   tetvol(v1      ,vij(1,2),vij(1,4),vij(3,2))
+        elseif ϕ[1] < 0 && ϕ[4] < 0
+            return tetvol(v1      ,v4      ,vij(1,2),vij(4,3)) +
+                   tetvol(vij(4,2),v4      ,vij(1,2),vij(4,3)) +
+                   tetvol(v1      ,vij(1,3),vij(1,2),vij(4,3))
+        elseif ϕ[2] < 0 && ϕ[3] < 0
+            return tetvol(v3      ,v2      ,vij(3,1),vij(2,4)) +
+                   tetvol(vij(2,1),v2      ,vij(3,1),vij(2,4)) +
+                   tetvol(v3      ,vij(3,4),vij(3,1),vij(2,4))
+        elseif ϕ[2] < 0 && ϕ[4] < 0
+            return tetvol(v4      ,v2      ,vij(4,1),vij(2,3)) +
+                   tetvol(vij(2,1),v2      ,vij(4,1),vij(2,3)) +
+                   tetvol(v4      ,vij(4,3),vij(4,1),vij(2,3))
+        else # ϕ[3] < 0 && ϕ[4] < 0
+            return tetvol(v3      ,v4      ,vij(3,1),vij(4,2)) +
+                   tetvol(vij(4,1),v4      ,vij(3,1),vij(4,2)) +
+                   tetvol(v3      ,vij(3,2),vij(3,1),vij(4,2))
+        end
+    elseif nneg == 3 # ---+
+        vol_tot = tetvol(v1,v2,v3,v4) # full volume; the small tet at the single non-negative vertex is subtracted
+        if ϕ[1] >= 0
+            return vol_tot - tetvol(v1,vij(1,2),vij(1,3),vij(1,4))
+        elseif ϕ[2] >= 0
+            return vol_tot - tetvol(v2,vij(2,1),vij(2,3),vij(2,4))
+        elseif ϕ[3] >= 0
+            return vol_tot - tetvol(v3,vij(3,1),vij(3,2),vij(3,4))
+        else # ϕ[4] >= 0
+            return vol_tot - tetvol(v4,vij(4,1),vij(4,2),vij(4,3))
+        end
+    else # ----
+        return tetvol(v1,v2,v3,v4)
+    end
+end
+
+function volfrac(rect::Rect3,ϕ::Vec{8}) # volume of the part of `rect` where the corner values ϕ are negative, summed over a split into five tetrahedra
+    or,ws = origin(rect), widths(rect)
+    v000,v001,v010,v011 = or                   ,or+Vec(ws[1],0.0,0.0  ),or+Vec(0.0,ws[2],0.0  ),or+Vec(ws[1],ws[2],0.0  ) # corners named vZYX (1 = far side); here Z = 0
+    v100,v101,v110,v111 = or+Vec(0.0,0.0,ws[3]),or+Vec(ws[1],0.0,ws[3]),or+Vec(0.0,ws[2],ws[3]),or+Vec(ws[1],ws[2],ws[3]) # Z = 1; vZYX pairs with ϕ[4Z+2Y+X+1] (assumes ϕ is packed x-fastest — confirm against callers)
+    ϕ = perturb.(ϕ) # the tet routine counts negative values, so move tiny values off zero first
+    return volfrac(Vec(v000,v100,v010,v001),Vec(ϕ[1],ϕ[5],ϕ[3],ϕ[2])) + # corner tet at v000
+           volfrac(Vec(v110,v100,v010,v111),Vec(ϕ[7],ϕ[5],ϕ[3],ϕ[8])) + # corner tet at v110; v111 takes ϕ[8] (ϕ[7] was used twice and ϕ[8] never)
+           volfrac(Vec(v101,v100,v111,v001),Vec(ϕ[6],ϕ[5],ϕ[8],ϕ[2])) + # corner tet at v101
+           volfrac(Vec(v011,v111,v010,v001),Vec(ϕ[4],ϕ[8],ϕ[3],ϕ[2])) + # corner tet at v011
+           volfrac(Vec(v111,v100,v010,v001),Vec(ϕ[8],ϕ[5],ϕ[3],ϕ[2]))   # central tet on the four remaining corners
+end
+
+include("volume_fraction_kernels.jl") # presumably defines `_kernel_compute_volume_fractions_from_level_set!`, which is used on the next statement
+
+const _compute_volume_fractions_from_level_set! = _kernel_compute_volume_fractions_from_level_set!(get_device()) # kernel object bound once to whatever get_device() returns at load time
+
+function compute_volume_fractions_from_level_set!(wt,Ψ,dx,dy,dz)
+    # the face arrays are handed to the kernel through inn_x/inn_y/inn_z, the others unchanged
+    targets = (;c=wt.c,x=inn_x(wt.x),y=inn_y(wt.y),z=inn_z(wt.z),xy=wt.xy,xz=wt.xz,yz=wt.yz)
+    event   = _compute_volume_fractions_from_level_set!(targets,Ψ,dx,dy,dz;ndrange=axes(Ψ))
+    wait(event) # block until the kernel has finished before touching the boundaries
+    bc_x_neumann!(0.0,wt.x); bc_y_neumann!(0.0,wt.y); bc_z_neumann!(0.0,wt.z)
+    return
+end
\ No newline at end of file
diff --git a/scripts3D_variational_TM/bc_kernels.jl b/scripts3D_variational_TM/bc_kernels.jl
new file mode 100644
index 00000000..5d805bee
--- /dev/null
+++ b/scripts3D_variational_TM/bc_kernels.jl
@@ -0,0 +1,65 @@
+@tiny function _kernel_bc_x_dirichlet!(val,arrays...) # set the first and last x-layer of every array to `val`
+    iy,iz = @indices # two launch indices, used as the (y,z) position
+    for A in arrays
+        if iy ∈ axes(A,2) && iz ∈ axes(A,3) # the arrays may differ in size: skip positions outside this one
+            @inbounds A[1  ,iy,iz] = val
+            @inbounds A[end,iy,iz] = val
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_y_dirichlet!(val, arrays...) # set the first and last y-layer of every array to `val`
+    ix,iz = @indices # two launch indices, used as the (x,z) position
+    for A in arrays
+        if ix ∈ axes(A,1) && iz ∈ axes(A,3) # the arrays may differ in size: skip positions outside this one
+            @inbounds A[ix,1  ,iz] = val
+            @inbounds A[ix,end,iz] = val
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_z_dirichlet!(val, arrays...) # set the first and last z-layer of every array to `val`
+    ix,iy = @indices # two launch indices, used as the (x,y) position
+    for A in arrays
+        if ix ∈ axes(A,1) && iy ∈ axes(A,2) # the arrays may differ in size: skip positions outside this one
+            @inbounds A[ix,iy,1  ] = val
+            @inbounds A[ix,iy,end] = val
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_x_neumann!(val, arrays...) # first/last x-layer := adjacent layer + `val`, for every array
+    iy,iz = @indices # two launch indices, used as the (y,z) position
+    for A in arrays
+        if iy ∈ axes(A,2) && iz ∈ axes(A,3) # the arrays may differ in size: skip positions outside this one
+            @inbounds A[1  ,iy,iz] = A[2    ,iy,iz] + val
+            @inbounds A[end,iy,iz] = A[end-1,iy,iz] + val
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_y_neumann!(val, arrays...) # first/last y-layer := adjacent layer + `val`, for every array
+    ix,iz = @indices # two launch indices, used as the (x,z) position
+    for A in arrays
+        if ix ∈ axes(A,1) && iz ∈ axes(A,3) # the arrays may differ in size: skip positions outside this one
+            @inbounds A[ix,1  ,iz] = A[ix,2    ,iz] + val
+            @inbounds A[ix,end,iz] = A[ix,end-1,iz] + val
+        end
+    end
+    return
+end
+
+@tiny function _kernel_bc_z_neumann!(val, arrays...) # first/last z-layer := adjacent layer + `val`, for every array
+    ix,iy = @indices # two launch indices, used as the (x,y) position
+    for A in arrays
+        if ix ∈ axes(A,1) && iy ∈ axes(A,2) # the arrays may differ in size: skip positions outside this one
+            @inbounds A[ix,iy,1  ] = A[ix,iy,2    ] + val
+            @inbounds A[ix,iy,end] = A[ix,iy,end-1] + val
+        end
+    end
+    return
+end
\ No newline at end of file
diff --git a/scripts3D_variational_TM/bcs.jl b/scripts3D_variational_TM/bcs.jl
new file mode 100644
index 00000000..a592d30a
--- /dev/null
+++ b/scripts3D_variational_TM/bcs.jl
@@ -0,0 +1,48 @@
+include("bc_kernels.jl") # presumably provides the `_kernel_bc_*` definitions referenced below
+
+const _bc_x_dirichlet! = _kernel_bc_x_dirichlet!(get_device()) # each constant binds one kernel to whatever get_device() returns at load time
+const _bc_y_dirichlet! = _kernel_bc_y_dirichlet!(get_device())
+const _bc_z_dirichlet! = _kernel_bc_z_dirichlet!(get_device())
+
+const _bc_x_neumann! = _kernel_bc_x_neumann!(get_device()) # the wrappers below look these up by prefixing their own name with `_`
+const _bc_y_neumann! = _kernel_bc_y_neumann!(get_device())
+const _bc_z_neumann! = _kernel_bc_z_neumann!(get_device())
+
+for fname in (:bc_x_dirichlet!,:bc_x_neumann!) # was `:bx_x_dirichlet!`, whose wrapper referenced the undefined `_bx_x_dirichlet!`
+    @eval begin
+        function $fname(val,arrays...) # host wrapper: run the kernel `_<fname>` over the (y,z) extent of `arrays` and wait for it
+            ax = axes(arrays[1])[[2,3]]
+            for A in arrays[2:end]
+                ax = union.(ax,axes(A)[[2,3]]) # widen the launch range so that it covers every array
+            end
+            wait($(Symbol(:_,fname))(val,arrays...;ndrange=ax)) # kernel constant is named after the wrapper with a leading underscore
+            return
+        end
+    end
+end
+
+for fname in (:bc_y_dirichlet!,:bc_y_neumann!) # was `:bx_y_dirichlet!`, whose wrapper referenced the undefined `_bx_y_dirichlet!`
+    @eval begin
+        function $fname(val,arrays...) # host wrapper: run the kernel `_<fname>` over the (x,z) extent of `arrays` and wait for it
+            ax = axes(arrays[1])[[1,3]]
+            for A in arrays[2:end]
+                ax = union.(ax,axes(A)[[1,3]]) # widen the launch range so that it covers every array
+            end
+            wait($(Symbol(:_,fname))(val,arrays...;ndrange=ax)) # kernel constant is named after the wrapper with a leading underscore
+            return
+        end
+    end
+end
+
+for fname in (:bc_z_dirichlet!,:bc_z_neumann!) # was `:bx_z_dirichlet!`, whose wrapper referenced the undefined `_bx_z_dirichlet!`
+    @eval begin
+        function $fname(val,arrays...) # host wrapper: run the kernel `_<fname>` over the (x,y) extent of `arrays` and wait for it
+            ax = axes(arrays[1])[[1,2]]
+            for A in arrays[2:end]
+                ax = union.(ax,axes(A)[[1,2]]) # widen the launch range so that it covers every array
+            end
+            wait($(Symbol(:_,fname))(val,arrays...;ndrange=ax)) # kernel constant is named after the wrapper with a leading underscore
+            return
+        end
+    end
+end
\ No newline at end of file
diff --git a/scripts3D_variational_TM/data_io.jl b/scripts3D_variational_TM/data_io.jl
new file mode 100644
index 00000000..7e6ef484
--- /dev/null
+++ b/scripts3D_variational_TM/data_io.jl
@@ -0,0 +1,105 @@
+function write_h5(path,fields,dim_g,I,args...)
+    # Write every (name, field) pair of `fields` into a new HDF5 file at `path`.
+    # Extra `args` go to `h5open`; warn if any are given without parallel HDF5 support.
+    (!HDF5.has_parallel() && (length(args)>0)) && @warn("HDF5 has no parallel support.")
+    h5open(path, "w", args...) do file
+        for (name,data) ∈ fields
+            target = create_dataset(file, "/$name", datatype(eltype(data)), dataspace(dim_g))
+            target[I.indices...] = Array(data) # dataset has extent dim_g; this field fills index range I
+        end
+    end
+    return
+end
+
+function write_xdmf(path,h5_names,fields,origin,spacing,dim_g,timesteps) # write one XDMF file describing a temporal collection of grids, one per entry of `timesteps`
+    xdoc = XMLDocument()
+    xroot = create_root(xdoc, "Xdmf")
+    set_attribute(xroot, "Version","3.0")
+
+    xdomain     = new_child(xroot, "Domain")
+    xcollection = new_child(xdomain, "Grid") # outer Grid node holding the per-timestep grids
+    set_attribute(xcollection, "GridType","Collection")
+    set_attribute(xcollection, "CollectionType","Temporal")
+
+    for (it,tt) ∈ enumerate(timesteps)
+        xgrid   = new_child(xcollection, "Grid")
+        set_attribute(xgrid, "GridType","Uniform")
+        xtopo = new_child(xgrid, "Topology")
+        set_attribute(xtopo, "TopologyType", "3DCoRectMesh")
+        set_attribute(xtopo, "Dimensions", join(reverse(dim_g).+1,' ')) # dim_g reversed and incremented by one per direction
+
+        xtime = new_child(xgrid, "Time")
+        set_attribute(xtime, "Value", "$tt")
+
+        xgeom = new_child(xgrid, "Geometry")
+        set_attribute(xgeom, "GeometryType", "ORIGIN_DXDYDZ")
+
+        xorig = new_child(xgeom, "DataItem") # first DataItem: grid origin, written in reversed order
+        set_attribute(xorig, "Format", "XML")
+        set_attribute(xorig, "NumberType", "Float")
+        set_attribute(xorig, "Dimensions", "$(length(dim_g)) ") # NOTE(review): trailing space in this string, unlike the spacing item below — confirm it is harmless
+        add_text(xorig, join(reverse(origin), ' '))
+
+        xdr = new_child(xgeom, "DataItem") # second DataItem: grid spacing, written in reversed order
+        set_attribute(xdr, "Format", "XML")
+        set_attribute(xdr, "NumberType", "Float")
+        set_attribute(xdr, "Dimensions", "$(length(dim_g))")
+        add_text(xdr, join(reverse(spacing), ' '))
+
+        h5_path = h5_names[it] # the it-th timestep reads its data from the it-th HDF5 file name
+        for (name,_) ∈ fields
+            create_xdmf_attribute(xgrid,h5_path,name,dim_g)
+        end
+    end
+
+    save_file(xdoc, path)
+    return
+end
+
+function write_xdmf(path,h5_path,fields,origin,spacing,dim_g) # write one XDMF file describing a single uniform grid whose data live in `h5_path`
+    xdoc = XMLDocument()
+    xroot = create_root(xdoc, "Xdmf")
+    set_attribute(xroot, "Version","3.0")
+
+    xdomain = new_child(xroot, "Domain")
+    xgrid   = new_child(xdomain, "Grid")
+    set_attribute(xgrid, "GridType","Uniform")
+    xtopo = new_child(xgrid, "Topology")
+    set_attribute(xtopo, "TopologyType", "3DCoRectMesh")
+    set_attribute(xtopo, "Dimensions", join(reverse(dim_g).+1,' ')) # dim_g reversed and incremented by one per direction
+
+    xgeom = new_child(xgrid, "Geometry")
+    set_attribute(xgeom, "GeometryType", "ORIGIN_DXDYDZ")
+
+    xorig = new_child(xgeom, "DataItem") # first DataItem: grid origin, written in reversed order
+    set_attribute(xorig, "Format", "XML")
+    set_attribute(xorig, "NumberType", "Float")
+    set_attribute(xorig, "Dimensions", "$(length(dim_g)) ") # NOTE(review): trailing space in this string, unlike the spacing item below — confirm it is harmless
+    add_text(xorig, join(reverse(origin), ' '))
+
+    xdr = new_child(xgeom, "DataItem") # second DataItem: grid spacing, written in reversed order
+    set_attribute(xdr, "Format", "XML")
+    set_attribute(xdr, "NumberType", "Float")
+    set_attribute(xdr, "Dimensions", "$(length(dim_g))")
+    add_text(xdr, join(reverse(spacing), ' '))
+
+    for (name,_) ∈ fields # one Attribute node per field name; the field values themselves are not read here
+        create_xdmf_attribute(xgrid,h5_path,name,dim_g)
+    end
+    save_file(xdoc, path)
+    return
+end
+
+function create_xdmf_attribute(xgrid,h5_path,name,dim_g)
+    # Append a cell-centred Attribute called `name` to `xgrid`; its DataItem points at
+    # dataset "/name" of the HDF5 file `h5_path`. Returns the new Attribute node.
+    # TODO: solve type and precision
+    attr = new_child(xgrid, "Attribute")
+    set_attribute(attr, "Name", name); set_attribute(attr, "Center", "Cell")
+    item = new_child(attr, "DataItem")
+    for (key,val) in ("Format"=>"HDF", "NumberType"=>"Float", "Precision"=>"8")
+        set_attribute(item, key, val)
+    end
+    set_attribute(item, "Dimensions", join(reverse(dim_g), ' ')); add_text(item, "$(h5_path):/$name")
+    return attr
+end
diff --git a/scripts3D_variational_TM/dual_contouring.jl b/scripts3D_variational_TM/dual_contouring.jl
new file mode 100644
index 00000000..cba484c0
--- /dev/null
+++ b/scripts3D_variational_TM/dual_contouring.jl
@@ -0,0 +1,139 @@
+using GLMakie
+using GeometryBasics
+using StaticArrays
+using LinearAlgebra
+
+function dual_contour(Ψ::AbstractArray{T,3},xc,yc,zc) where T # returns (vertices, tris): a triangle mesh around the sign changes of Ψ
+    vertices  = Point{3,Float64}[]
+    tris = TriangleFace{Int}[]
+    vert_idx  = Array{Int,3}(undef,size(Ψ).-1) # vertex number per cell; only written for cells that receive a vertex
+    # insert vertices
+    for iz in 1:size(Ψ,3)-1, iy in 1:size(Ψ,2)-1, ix in 1:size(Ψ,1)-1
+        S = MArray{NTuple{3,2},T}(undef) # the 2x2x2 corner values of cell (ix,iy,iz)
+        for idz in 0:1,idy in 0:1,idx in 0:1
+            S[idx+1,idy+1,idz+1] = Ψ[ix+idx,iy+idy,iz+idz]
+        end
+        change_sign = !(all(S .> 0) || all(S .< 0))
+        if change_sign
+            push!(vertices,Point(xc[ix],yc[iy],zc[iz])) # one vertex per mixed-sign cell, placed at (xc,yc,zc)
+            vert_idx[ix,iy,iz] = length(vertices)
+        end
+    end
+    # insert triangles
+    for iz in 1:size(Ψ,3)-1, iy in 1:size(Ψ,2)-1, ix in 1:size(Ψ,1)-1
+        if Ψ[ix,iy,iz]*Ψ[ix+1,iy,iz] <= 0 && iy >= 2 && iz >= 2
+            # x-edge with a sign change: connect the vertices of the four cells around it.
+            # Edges with iy or iz equal to 1 lack some of those cells and are skipped. The
+            # guard is part of the condition (not a `continue`) so the tests below still run.
+            i1 = vert_idx[ix,iy-1,iz-1]
+            i2 = vert_idx[ix,iy  ,iz-1]
+            i3 = vert_idx[ix,iy  ,iz  ]
+            i4 = vert_idx[ix,iy-1,iz  ]
+            push!(tris,TriangleFace(i1,i2,i3),TriangleFace(i1,i3,i4))
+        end
+        if Ψ[ix,iy,iz]*Ψ[ix,iy+1,iz] <= 0 && ix >= 2 && iz >= 2
+            # y-edge with a sign change: same construction with the four cells around the
+            # y-edge. As above, the boundary guard must not skip the z-edge test that
+            # follows, which a `continue` here would do.
+            i1 = vert_idx[ix-1,iy,iz-1]
+            i2 = vert_idx[ix  ,iy,iz-1]
+            i3 = vert_idx[ix  ,iy,iz  ]
+            i4 = vert_idx[ix-1,iy,iz  ]
+            push!(tris,TriangleFace(i1,i2,i3),TriangleFace(i1,i3,i4))
+        end
+        if Ψ[ix,iy,iz]*Ψ[ix,iy,iz+1] <= 0 && ix >= 2 && iy >= 2
+            # z-edge with a sign change: same construction with the four cells around the
+            # z-edge; edges with ix or iy equal to 1 are skipped.
+            # Each quad is emitted as two triangles sharing the i1-i3 diagonal.
+            i1 = vert_idx[ix-1,iy-1,iz]
+            i2 = vert_idx[ix  ,iy-1,iz]
+            i3 = vert_idx[ix  ,iy  ,iz]
+            i4 = vert_idx[ix-1,iy  ,iz]
+            push!(tris,TriangleFace(i1,i2,i3),TriangleFace(i1,i3,i4))
+        end
+    end
+    return vertices, tris
+end
+
+# Coefficients of a cubic Hermite spline in 1D
+function hspline_coeffs(p::StaticVector{2},∇p::StaticVector{2})
+    p0,p1 = p[1],p[2]   # values at the two ends of the interval
+    m0,m1 = ∇p[1],∇p[2] # slopes at the two ends
+    c2 = 3*(p1 - p0) - 2*m0 - m1
+    c3 = 2*(p0 - p1) +   m0 + m1
+    return SVector(p0, m0, c2, c3) # coefficients in ascending powers, as eval_poly expects
+end
+
+function hspline_coeffs(p::StaticMatrix{2},∇px::StaticMatrix{2},∇py::StaticMatrix{2},∇pxy::StaticMatrix{2})
+    # NOTE(review): empty body — this method currently returns `nothing`; presumably a placeholder for the 2D case
+end
+
+@inline function eval_poly(a::StaticVector{4},x)
+    return ((a[4]*x + a[3])*x + a[2])*x + a[1] # cubic with ascending coefficients a[1..4], nested evaluation
+end
+
+@inline function eval_poly(a::StaticMatrix{4},x,y)
+    # Bivariate cubic: evaluate each of the four columns of `a` as a cubic in x,
+    # then use the four results as the coefficients of a cubic in y.
+    cx1 = eval_poly(a[:,1],x)
+    cx2 = eval_poly(a[:,2],x)
+    cx3 = eval_poly(a[:,3],x)
+    cx4 = eval_poly(a[:,4],x)
+    return eval_poly(SVector(cx1,cx2,cx3,cx4), y)
+end
+
+function hspline_interp!(p_i,x_i,xs,ps;bcs=(nothing,nothing)) # fill p_i with a cubic Hermite interpolation of the samples (xs,ps) at the points x_i
+    dx = step(xs)
+    for (ip,x) in enumerate(x_i)
+        xdiv = (x-xs[1])/dx
+        ix = clamp(floor(Int,xdiv) + 1, 1, length(xs)-1) # interval index; points outside xs fall back to the first/last interval
+        t  = xdiv - (ix-1) # coordinate within the interval, in units of dx
+        p  = SVector(ps[ix], ps[ix+1])
+        m1 = ix > firstindex(xs)    ? (ps[ix+1] - ps[ix-1]) / 2 : isnothing(bcs[1]) ? ps[2  ] - ps[1    ] : bcs[1] * dx # left slope: central difference, else one-sided difference or bcs[1] scaled by dx
+        m2 = ix < lastindex(xs) - 1 ? (ps[ix+2] - ps[ix  ]) / 2 : isnothing(bcs[2]) ? ps[end] - ps[end-1] : bcs[2] * dx # right slope, same rule at the other end
+        m  = SVector(m1, m2)
+        p_i[ip] = eval_poly(hspline_coeffs(p, m), t) # previously called `hspline_interp`, which this file never defines
+    end
+    return
+end
+
+function test_interp()
+    # Visual check: interpolate sin sampled on 11 nodes and plot the curve together
+    # with the nodes; the sample points extend slightly beyond both ends of the nodes.
+    nodes   = LinRange(-π,π,11)
+    vals    = sin.(nodes)
+    samples = LinRange(-1.1π,1.1π,101)
+    interp  = similar(samples)
+    hspline_interp!(interp,samples,nodes,vals;bcs=(-1,-1))
+    fig = Figure(); ax = Axis(fig[1,1];aspect=DataAspect())
+    lines!(ax,samples,interp); scatter!(ax,nodes,vals)
+    display(fig)
+    return
+end
+
+test_interp()
+
+function main()
+    # Demo: sample the distance to a sphere of radius 1.5 on a 100^3 grid over
+    # [-2,2]^3, contour it with dual_contour and display the resulting mesh.
+    println("Hello world!")
+    Ψ = Array{Float64}(undef,100,100,100)
+    xv,yv,zv = LinRange(-2,2,size(Ψ,1)),LinRange(-2,2,size(Ψ,2)),LinRange(-2,2,size(Ψ,3))
+    midpoints(v) = 0.5.*(v[1:end-1].+v[2:end]) # points halfway between consecutive nodes
+    xc,yc,zc = midpoints(xv),midpoints(yv),midpoints(zv)
+    for I in CartesianIndices(Ψ)
+        ix,iy,iz = Tuple(I)
+        Ψ[I] = sqrt(xv[ix]^2 + yv[iy]^2 + zv[iz]^2) - 1.5
+    end
+    @time verts,tris = dual_contour(Ψ,xc,yc,zc)
+    fig = Figure()
+    ax  = Axis3(fig[1,1];aspect=:data,viewmode=:fitzoom)
+    limits!(ax,extrema(xv),extrema(yv),extrema(zv))
+    surface_mesh = GeometryBasics.Mesh(verts,tris)
+    mesh!(ax,surface_mesh)
+    # wireframe!(ax,surface_mesh;color=:black)
+    display(fig)
+    return
+end
+
+main()
\ No newline at end of file
diff --git a/scripts3D_variational_TM/hide_communication.jl b/scripts3D_variational_TM/hide_communication.jl
new file mode 100644
index 00000000..0a1e85e0
--- /dev/null
+++ b/scripts3D_variational_TM/hide_communication.jl
@@ -0,0 +1,33 @@
+@inline __subrange(nr,bw,d,::Val{1}) = 1:bw[d] # Val(1): the first bw[d] indices of dimension d
+@inline __subrange(nr,bw,d,::Val{2}) = (n = size(nr,d); (n-bw[d]+1):n) # Val(2): the last bw[d] indices
+@inline __subrange(nr,bw,d,::Val{3}) = (w = bw[d]; (w+1):(size(nr,d)-w)) # Val(3): everything in between
+
+@inline split_ndrange(ndrange,ndwidth) = split_ndrange(CartesianIndices(ndrange),ndwidth) # wrap the range in CartesianIndices and dispatch to the method below
+
+function split_ndrange(ndrange::CartesianIndices{N},ndwidth::NTuple{N,<:Integer}) where N
+    # Returns a tuple of 2N+1 index blocks: for each dimension its low and its high
+    # boundary slab (in that order), followed by the interior block as last element.
+    # Every dimension must be longer than twice its boundary width.
+    @assert all(size(ndrange) .> ndwidth.*2)
+    inner = ntuple(d -> __subrange(ndrange,ndwidth,d,Val(3)), Val(N))
+    # slab of dimension `dim`, side `side` (1 = low, 2 = high): dimensions before `dim`
+    # keep their full extent, dimensions after it keep only their interior part
+    @inline function slab(dim,side)
+        return ntuple(Val(N)) do d
+            d <  dim && return 1:size(ndrange,d)
+            d == dim && return __subrange(ndrange,ndwidth,d,Val(side))
+            return inner[d]
+        end
+    end
+    outer = ntuple(Val(2N)) do i
+        dim,side = divrem(i-1,2) .+ 1
+        ndrange[slab(dim,side)...]
+    end
+    return (outer...,ndrange[inner...])
+end
+
+function hide_comm(f,ranges)
+    inner = f(last(ranges)) # the last range is processed first
+    outer = ntuple(k->f(ranges[k]), length(ranges)-1) # then all the others, in order
+    return (inner, outer)
+end
\ No newline at end of file
diff --git a/scripts3D_variational_TM/level_set_kernels.jl b/scripts3D_variational_TM/level_set_kernels.jl
new file mode 100644
index 00000000..cd5a68d2
--- /dev/null
+++ b/scripts3D_variational_TM/level_set_kernels.jl
@@ -0,0 +1,62 @@
+@tiny function _kernel_init_level_set!(Ψ,dem,dem_grid,Ψ_grid,cutoff,R) # set Ψ[ix,iy,iz] to the signed value returned by sd_dem for that grid point
+    ix,iy,iz = @indices
+    x,y,z    = Ψ_grid[1][ix],Ψ_grid[2][iy],Ψ_grid[3][iz] # coordinates of this point, one 1D grid per direction
+    P        = R*Point3(x,y,z) # the point is multiplied by R before the query
+    ud,sgn   = sd_dem(P,cutoff,dem,dem_grid) # sd_dem returns a (value, sign) pair
+    @inbounds Ψ[ix,iy,iz] = ud*sgn
+    return
+end
+
+@tiny function _kernel_compute_dΨ_dt!(dΨ_dt,Ψ,Ψ0,dx,dy,dz) # rate of change of Ψ for one pseudo-time step; Ψ0 supplies the signs and the near-interface reconstruction
+    ix,iy,iz = @indices
+    @inline changes_sign_x(disp) = @inbounds Ψ0[ix,iy,iz]*Ψ0[ix+disp,iy,iz] < 0
+    @inline changes_sign_y(disp) = @inbounds Ψ0[ix,iy,iz]*Ψ0[ix,iy+disp,iz] < 0
+    @inline changes_sign_z(disp) = @inbounds Ψ0[ix,iy,iz]*Ψ0[ix,iy,iz+disp] < 0
+    ch_x, ch_y, ch_z = false, false, false
+    ∂Ψ0_∂x, ∂Ψ0_∂y, ∂Ψ0_∂z = 0.0, 0.0, 0.0
+    if ix ∈ axes(Ψ0,1)[2:end-1] # neighbours on both sides exist only away from the first/last index
+        ch_x = changes_sign_x(1) || changes_sign_x(-1)
+        @inbounds ∂Ψ0_∂x = (Ψ0[ix+1,iy,iz]-Ψ0[ix-1,iy,iz])/(2dx)
+    end
+    if iy ∈ axes(Ψ0,2)[2:end-1]
+        ch_y = changes_sign_y(1) || changes_sign_y(-1)
+        @inbounds ∂Ψ0_∂y = (Ψ0[ix,iy+1,iz]-Ψ0[ix,iy-1,iz])/(2dy)
+    end
+    if iz ∈ axes(Ψ0,3)[2:end-1]
+        ch_z = changes_sign_z(1) || changes_sign_z(-1)
+        @inbounds ∂Ψ0_∂z = (Ψ0[ix,iy,iz+1]-Ψ0[ix,iy,iz-1])/(2dz)
+    end
+    if (ch_x || ch_y || ch_z) # Ψ0 changes sign towards at least one direct neighbour
+        # local surface reconstruction
+        @inbounds D = Ψ0[ix,iy,iz]/sqrt(∂Ψ0_∂x^2 + ∂Ψ0_∂y^2 + ∂Ψ0_∂z^2)
+        @inbounds dΨ_dt[ix,iy,iz] = (D-sign(Ψ0[ix,iy,iz])*abs(Ψ[ix,iy,iz]))/dx
+    else
+        @inbounds begin
+            # Hamilton-Jacobi with Godunov flux
+            # direction '-' derivatives
+            ∂Ψ_∂x⁻ = ix > 1 ? (Ψ[ix,iy,iz] - Ψ[ix-1,iy,iz])/dx : 0.0
+            ∂Ψ_∂y⁻ = iy > 1 ? (Ψ[ix,iy,iz] - Ψ[ix,iy-1,iz])/dy : 0.0
+            ∂Ψ_∂z⁻ = iz > 1 ? (Ψ[ix,iy,iz] - Ψ[ix,iy,iz-1])/dz : 0.0 # z-difference divided by dz (was dy)
+            # direction '+' derivatives
+            ∂Ψ_∂x⁺ = ix < size(Ψ,1) ? (Ψ[ix+1,iy,iz] - Ψ[ix,iy,iz]) / dx : 0.0
+            ∂Ψ_∂y⁺ = iy < size(Ψ,2) ? (Ψ[ix,iy+1,iz] - Ψ[ix,iy,iz]) / dy : 0.0
+            ∂Ψ_∂z⁺ = iz < size(Ψ,3) ? (Ψ[ix,iy,iz+1] - Ψ[ix,iy,iz]) / dz : 0.0
+            # upwind fluxes
+            ∂Ψ_∂x2 = Ψ0[ix,iy,iz] >= 0 ? max(max(∂Ψ_∂x⁻,0)^2, min(∂Ψ_∂x⁺,0)^2) :
+                                         max(min(∂Ψ_∂x⁻,0)^2, max(∂Ψ_∂x⁺,0)^2)
+            ∂Ψ_∂y2 = Ψ0[ix,iy,iz] >= 0 ? max(max(∂Ψ_∂y⁻,0)^2, min(∂Ψ_∂y⁺,0)^2) :
+                                         max(min(∂Ψ_∂y⁻,0)^2, max(∂Ψ_∂y⁺,0)^2)
+            ∂Ψ_∂z2 = Ψ0[ix,iy,iz] >= 0 ? max(max(∂Ψ_∂z⁻,0)^2, min(∂Ψ_∂z⁺,0)^2) :
+                                         max(min(∂Ψ_∂z⁻,0)^2, max(∂Ψ_∂z⁺,0)^2)
+            # compute update
+            dΨ_dt[ix,iy,iz] = sign(Ψ0[ix,iy,iz])*(1.0-sqrt(∂Ψ_∂x2+∂Ψ_∂y2+∂Ψ_∂z2))
+        end
+    end
+    return
+end
+
+@tiny function _kernel_update_Ψ!(Ψ, dΨ_dt, dt) # explicit update: add dt times the stored rate to Ψ at every index
+    I = @cartesianindex
+    @inbounds Ψ[I] += dt*dΨ_dt[I]
+    return
+end
\ No newline at end of file
diff --git a/scripts3D_variational_TM/level_sets.jl b/scripts3D_variational_TM/level_sets.jl
new file mode 100644
index 00000000..13019b10
--- /dev/null
+++ b/scripts3D_variational_TM/level_sets.jl
@@ -0,0 +1,15 @@
+include("level_set_kernels.jl") # presumably provides the `_kernel_*` definitions referenced below
+
+const _init_level_set! = _kernel_init_level_set!(get_device()) # each constant binds one kernel to whatever get_device() returns at load time
+const _compute_dΨ_dt!  = _kernel_compute_dΨ_dt!(get_device())
+const _update_Ψ!       = _kernel_update_Ψ!(get_device())
+
+function compute_level_set_from_dem!(Ψ,dem,dem_grid,Ψ_grid)
+    # Launch the level-set initialisation kernel over every index of Ψ and wait for it.
+    TinyKernels.device_synchronize(get_device())
+    hx,hy,hz = step.(Ψ_grid)
+    # arguments after Ψ_grid: cutoff = four times the largest grid step, R = identity
+    launch = _init_level_set!(Ψ,dem,dem_grid,Ψ_grid,4max(hx,hy,hz),LinearAlgebra.I;ndrange=axes(Ψ))
+    wait(launch); return
+end
+
diff --git a/scripts3D_variational_TM/load_dem.jl b/scripts3D_variational_TM/load_dem.jl
new file mode 100644
index 00000000..0385198b
--- /dev/null
+++ b/scripts3D_variational_TM/load_dem.jl
@@ -0,0 +1,15 @@
+using FileIO
+
+function filter_range(r,lims) # index range of the entries of r lying strictly between lims[1] and lims[2]
+    first_above = findfirst(>(lims[1]), r) # nothing when no entry exceeds the lower limit
+    last_below  = findlast(<(lims[2]), r)  # nothing when no entry is below the upper limit
+    return something(first_above, length(r)):something(last_below, 1)
+end
+
+function load_dem(path,(;xlims,ylims))
+    # Load "x", "y", "bed" and "surface" from `path` and crop them to the window given
+    # by xlims/ylims, which are applied as offsets from the first x and y coordinate.
+    x,y,bed,surface = load(path,"x","y","bed","surface")
+    ix,iy = filter_range(x,xlims .+ x[1]),filter_range(y,ylims .+ y[1])
+    return (;x = x[ix], y = y[iy], bed = bed[ix,iy], surface = surface[ix,iy])
+end
\ No newline at end of file
diff --git a/scripts3D_variational_TM/prepare_dem.jl b/scripts3D_variational_TM/prepare_dem.jl
new file mode 100644
index 00000000..22fec202
--- /dev/null
+++ b/scripts3D_variational_TM/prepare_dem.jl
@@ -0,0 +1,20 @@
+using JLD2
+using NetCDF
+
+const GREENLAND_PATH = "data/BedMachine/BedMachineGreenland-v5.nc"
+
+function prepare_greenland()
+    read_flipped(name) = reverse(ncread(GREENLAND_PATH,name); dims=2) # read a 2D variable and reverse its second dimension
+    x = ncread(GREENLAND_PATH,"x")
+    y = reverse(ncread(GREENLAND_PATH,"y")) # y is reversed so that it passes the sortedness check below
+    @assert issorted(x)
+    @assert issorted(y)
+    x = LinRange(x[1],x[end],length(x)) # replace the stored coordinates by evenly spaced ranges with the same ends
+    y = LinRange(y[1],y[end],length(y))
+    bed,surface,mask = read_flipped("bed"),read_flipped("surface"),read_flipped("mask")
+    # store coordinates and fields together in one JLD2 file
+    jldsave("data/BedMachine/greenland.jld2";x,y,bed,surface,mask)
+    return
+end
+
+prepare_greenland()
\ No newline at end of file
diff --git a/scripts3D_variational_TM/signed_distances.jl b/scripts3D_variational_TM/signed_distances.jl
new file mode 100644
index 00000000..6bc7b26e
--- /dev/null
+++ b/scripts3D_variational_TM/signed_distances.jl
@@ -0,0 +1,65 @@
+using LinearAlgebra,GeometryBasics
+
+@inline S(x) = iszero(x) ? oneunit(x) : sign(x) # sign of x, with zero counted as positive
+@inline sign_triangle(p,a,b,c) = S(dot(cross(b-a,c-a),p-a)) # side of the plane through a,b,c on which p lies (+1 on the plane)
+
+@inline function ud_triangle(p,a,b,c)
+    sq(v) = dot(v,v) # squared length
+    ba  = b - a; pa = p - a
+    cb  = c - b; pb = p - b
+    ac  = a - c; pc = p - c
+    nor = cross(ba,ac)
+    # one sign per edge telling on which side of that edge p lies; the three are summed
+    side = sign(dot(cross(ba,nor),pa)) + sign(dot(cross(cb,nor),pb)) + sign(dot(cross(ac,nor),pc))
+    d2 = if side < 2
+        # squared distance to the nearest of the three edge segments
+        min(sq(ba*clamp(dot(ba,pa)/sq(ba),0,1)-pa),
+            sq(cb*clamp(dot(cb,pb)/sq(cb),0,1)-pb),
+            sq(ac*clamp(dot(ac,pc)/sq(ac),0,1)-pc))
+    else
+        dot(nor,pa)*dot(nor,pa)/sq(nor) # squared distance to the triangle's plane
+    end
+    return sqrt(d2)
+end
+
+@inline function closest_vertex_index(P,rc) # index of the grid cell of `rc` containing P, clamped to the valid cell range
+    lims = map(x->x[1:end-1],axes.(rc,1)) # per direction: all indices except the last one
+    Δ = step.(rc)
+    O = first.(rc)
+    I = @. clamp(Int(fld(P-O,Δ))+1,lims) # whole number of steps from the first coordinate, made 1-based and clamped into lims
+    return CartesianIndex(I...)
+end
+
+@inline inc(I,dim) = Base.setindex(I,I[dim]+1,dim) # copy of I with component `dim` increased by one
+@inline inc(I) = I + oneunit(I) # I advanced by one unit step (for an index: every component + 1)
+
+@inline function triangle_pair(Iv,dem,rc)
+    @inline function vertex(J) # 3D point of the surface at grid index J: (x, y, dem value)
+        @inbounds px,py = rc[1][J[1]],rc[2][J[2]]
+        @inbounds Point3(px,py,dem[J])
+    end
+    v00,v10,v01,v11 = vertex(Iv),vertex(inc(Iv,1)),vertex(inc(Iv,2)),vertex(inc(Iv))
+    lower,upper = Triangle(v00,v10,v01),Triangle(v01,v10,v11) # the cell split along the v10-v01 diagonal
+    return lower,upper
+end
+
+@inline function distance_to_triangle_pair(P,Iv,dem,rc)
+    lower,upper = triangle_pair(Iv,dem,rc)
+    d_lower,d_upper = ud_triangle(P,lower...),ud_triangle(P,upper...)
+    return min(d_lower,d_upper),sign_triangle(P,lower...) # the sign is taken from the first triangle only
+end
+
+function sd_dem(P,cutoff,dem,rc)
+    # Returns (distance, sign) of P relative to the triangulated surface `dem` on grid `rc`.
+    # The horizontal position is first clamped into the extent of the grid.
+    @inbounds Pxy = clamp.(Point(P[1],P[2]),first.(rc),last.(rc))
+    @inbounds Q   = Point(Pxy[1],Pxy[2],P[3])
+    Ic = closest_vertex_index(Pxy,rc)
+    ud,sgn = distance_to_triangle_pair(Q,Ic,dem,rc) # the sign comes from the cell containing the point
+    # cells within `cutoff` of the point in each horizontal direction
+    window = closest_vertex_index(Pxy.-cutoff,rc):closest_vertex_index(Pxy.+cutoff,rc)
+    for Iv in window
+        Iv == Ic && continue
+        ud = min(ud,first(distance_to_triangle_pair(Q,Iv,dem,rc)))
+    end
+    return ud,sgn
+end
\ No newline at end of file
diff --git a/scripts3D_variational_TM/stokes.jl b/scripts3D_variational_TM/stokes.jl
new file mode 100644
index 00000000..154ef2be
--- /dev/null
+++ b/scripts3D_variational_TM/stokes.jl
@@ -0,0 +1,44 @@
+include("stokes_kernels.jl") # presumably provides the `_kernel_*` definitions referenced below
+
+const _update_ηs! = _kernel_update_ηs!(get_device()) # each constant binds one kernel to whatever get_device() returns at load time
+const _update_σ!  = _kernel_update_σ!(get_device())
+const _update_V!  = _kernel_update_V!(get_device())
+const _compute_residual! = _kernel_compute_residual!(get_device())
+
+function update_ηs!(ηs,ε̇,T,wt,K,n,Q_R,T_mlt,ηreg,χ) # run the viscosity kernel over every index of ηs and wait for it
+    event = _update_ηs!(ηs,ε̇,T,wt,K,n,Q_R,T_mlt,ηreg,χ;ndrange=axes(ηs))
+    return (wait(event); nothing)
+end
+
+function update_σ!(Pr, τ, ε̇, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy, dz) # run the stress kernel over every index of Pr and wait for it
+    event = _update_σ!(Pr, τ, ε̇, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy, dz; ndrange=axes(Pr))
+    return (wait(event); nothing)
+end
+
+function update_V!(V, Pr, τ, ηs, wt, nudτ, ρg, dx, dy, dz; bwidth) # run the velocity kernel, then apply zero-increment Neumann conditions to the tangential components
+    V_inn = (x = inn(V.x), y = inn(V.y), z = inn(V.z)) # the kernel receives each component through `inn`
+    # ranges = split_ndrange(axes(Pr),bwidth)
+    # ie,oe  =  hide_comm(ranges) do ndrange
+        ndrange = axes(Pr) # with the split above disabled the kernel covers all of Pr at once; `bwidth` is currently unused
+        wait(_update_V!(V_inn, Pr, τ, ηs, wt, nudτ, ρg, dx, dy, dz; ndrange))
+    # end
+    # wait.(oe)
+    TinyKernels.device_synchronize(FastIce.get_device()) # NOTE(review): other call sites use the unqualified get_device() — confirm `FastIce` is in scope here
+    bc_x_neumann!(0.0,V.z)
+    bc_y_neumann!(0.0,V.x,V.z)
+    bc_z_neumann!(0.0,V.x)
+    # TinyKernels.device_synchronize(FastIce.get_device())
+    # @. V.x[end,:  ,:] = V.x[end-1,:,:]*wt.not_solid.x[end-1,:,:]
+    # @. V.x[1  ,:  ,:] = V.x[2    ,:,:]*wt.not_solid.x[2    ,:,:]
+    # @. V.y[:  ,end,:] = V.y[:,end-1,:]*wt.not_solid.y[:,end-1,:]
+    # @. V.y[:  ,1  ,:] = V.y[:,2    ,:]*wt.not_solid.y[:,2    ,:]
+    TinyKernels.device_synchronize(FastIce.get_device())
+    # update_halo!(V.x,V.y,V.z)
+    # wait(ie)
+    return
+end
+
+function compute_residual!(Res, Pr, V, τ, wt, ρg, dx, dy, dz) # run the residual kernel over every index of Pr and wait for it
+    event = _compute_residual!(Res, Pr, V, τ, wt, ρg, dx, dy, dz; ndrange=axes(Pr))
+    return (wait(event); nothing)
+end
\ No newline at end of file
diff --git a/scripts3D_variational_TM/stokes_kernels.jl b/scripts3D_variational_TM/stokes_kernels.jl
new file mode 100644
index 00000000..c3ee3b1d
--- /dev/null
+++ b/scripts3D_variational_TM/stokes_kernels.jl
@@ -0,0 +1,236 @@
+@tiny function _kernel_update_ηs!(ηs,ε̇,T,wt,K,n,Q_R,T_mlt,ηreg,χ)  # relaxes ηs toward a target computed from the local strain rate and temperature; `wt` is accepted but never read in this body
+    ix, iy, iz = @indices  # index triple handled by this invocation
+    @inline isin(A) = checkbounds(Bool, A, ix, iy, iz)  # true when (ix,iy,iz) is a valid index of A
+    @inbounds if isin(ηs)
+        ε̇xyc = 0.0  # mean of four ε̇.xy entries: offsets -1:0 in x and y, fixed offset -1 in z
+        for idz = -1:-1, idy = -1:0, idx = -1:0
+            ix2,iy2,iz2 = clamp(ix+idx,1,size(ε̇.xy,1)),clamp(iy+idy,1,size(ε̇.xy,2)),clamp(iz+idz,1,size(ε̇.xy,3))  # clamping reuses the nearest valid entry at the array edges
+            ε̇xyc += ε̇.xy[ix2,iy2,iz2]
+        end
+        ε̇xyc *= 0.25
+        ε̇xzc = 0.0  # same averaging for ε̇.xz: offsets -1:0 in x and z, fixed offset -1 in y
+        for idz = -1:0, idy = -1:-1, idx = -1:0
+            ix2,iy2,iz2 = clamp(ix+idx,1,size(ε̇.xz,1)),clamp(iy+idy,1,size(ε̇.xz,2)),clamp(iz+idz,1,size(ε̇.xz,3))
+            ε̇xzc += ε̇.xz[ix2,iy2,iz2]
+        end
+        ε̇xzc *= 0.25
+        ε̇yzc = 0.0  # same averaging for ε̇.yz: offsets -1:0 in y and z, fixed offset -1 in x
+        for idz = -1:0, idy = -1:0, idx = -1:-1
+            ix2,iy2,iz2 = clamp(ix+idx,1,size(ε̇.yz,1)),clamp(iy+idy,1,size(ε̇.yz,2)),clamp(iz+idz,1,size(ε̇.yz,3))
+            ε̇yzc += ε̇.yz[ix2,iy2,iz2]
+        end
+        ε̇yzc *= 0.25
+        ε̇II  = sqrt(0.5*(ε̇.xx[ix,iy,iz]^2 + ε̇.yy[ix,iy,iz]^2 + ε̇.zz[ix,iy,iz]^2) + ε̇xyc^2 + ε̇xzc^2 + ε̇yzc^2)  # invariant from the normal components at (ix,iy,iz) and the averaged shear components
+        ηs_t = 0.5*K*exp(-1/n*Q_R*(1/T_mlt - 1/T[ix,iy,iz]))*ε̇II^(1/n-1)  # target value: power law in ε̇II times an exponential factor in 1/T
+        ηs_t = 1.0/(1/ηs_t + 1/ηreg)  # harmonic combination with ηreg (for positive values the result stays below ηreg)
+        ηs[ix,iy,iz] = exp(log(ηs[ix,iy,iz])*(1-χ) + log(ηs_t)*χ)  # blend in log space: weight χ on the target, 1-χ on the current value
+    end
+end  # NOTE(review): no explicit `return` here, unlike _kernel_update_σ! and _kernel_update_V! in this file
+
+@tiny function _kernel_update_σ!(Pr, τ, ε̇, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy, dz)  # updates strain rates ε̇, pressure Pr and stresses τ from the velocity V, masked by the weights in wt
+    ix,iy,iz = @indices  # index triple handled by this invocation
+    na,ns    = wt.not_air, wt.not_solid
+    @inline isin(A) = checkbounds(Bool,A,ix,iy,iz)  # true when (ix,iy,iz) is a valid index of A
+    # detect and eliminate null spaces: the entry is skipped if any of the six not_air x/y/z weights read below is ≈ 0
+    isnull = (na.x[ix,iy,iz] ≈ 0.0) || (na.x[ix+1,iy  ,iz  ] ≈ 0.0) ||  # NOTE(review): this first section has no isin/@inbounds guard; it relies on the launch range matching Pr
+             (na.y[ix,iy,iz] ≈ 0.0) || (na.y[ix  ,iy+1,iz  ] ≈ 0.0) ||
+             (na.z[ix,iy,iz] ≈ 0.0) || (na.z[ix  ,iy  ,iz+1] ≈ 0.0)
+    if !isnull && (na.c[ix,iy,iz] > 0.0)
+        ε̇.xx[ix,iy,iz] = (V.x[ix+1,iy  ,iz  ]*ns.x[ix+1,iy  ,iz  ] - V.x[ix,iy,iz]*ns.x[ix,iy,iz])/dx  # difference of velocities, each scaled by its not_solid weight
+        ε̇.yy[ix,iy,iz] = (V.y[ix  ,iy+1,iz  ]*ns.y[ix  ,iy+1,iz  ] - V.y[ix,iy,iz]*ns.y[ix,iy,iz])/dy
+        ε̇.zz[ix,iy,iz] = (V.z[ix  ,iy  ,iz+1]*ns.z[ix  ,iy  ,iz+1] - V.z[ix,iy,iz]*ns.z[ix,iy,iz])/dz
+        ∇V = ε̇.xx[ix,iy,iz] + ε̇.yy[ix,iy,iz] + ε̇.zz[ix,iy,iz]  # sum of the three normal strain rates
+        Pr[ix,iy,iz] -= ∇V*ηs[ix,iy,iz]*r/θ_dτ  # pressure increment proportional to -∇V
+        τ.xx[ix,iy,iz] += (-τ.xx[ix,iy,iz] + 2.0*ηs[ix,iy,iz]*(ε̇.xx[ix,iy,iz]-∇V/3.0)) * dτ_r  # move τ a fraction dτ_r toward 2ηs*(ε̇ - ∇V/3)
+        τ.yy[ix,iy,iz] += (-τ.yy[ix,iy,iz] + 2.0*ηs[ix,iy,iz]*(ε̇.yy[ix,iy,iz]-∇V/3.0)) * dτ_r
+        τ.zz[ix,iy,iz] += (-τ.zz[ix,iy,iz] + 2.0*ηs[ix,iy,iz]*(ε̇.zz[ix,iy,iz]-∇V/3.0)) * dτ_r
+    else  # masked entry: zero pressure, normal stresses and normal strain rates
+        Pr[ix,iy,iz] = 0.0
+        τ.xx[ix,iy,iz] = 0.0
+        τ.yy[ix,iy,iz] = 0.0
+        τ.zz[ix,iy,iz] = 0.0
+        ε̇.xx[ix,iy,iz] = 0.0
+        ε̇.yy[ix,iy,iz] = 0.0
+        ε̇.zz[ix,iy,iz] = 0.0
+    end
+    @inbounds if isin(τ.xy)  # xy shear component; skipped for indices outside τ.xy
+        # detect and eliminate null spaces: checks the not_air weights of the four velocities used in ε̇.xy below
+        isnull = (na.x[ix+1,iy+1,iz+1] ≈ 0.0) || (na.x[ix+1,iy  ,iz+1] ≈ 0.0) ||
+                 (na.y[ix+1,iy+1,iz+1] ≈ 0.0) || (na.y[ix  ,iy+1,iz+1] ≈ 0.0)
+        if !isnull && (na.xy[ix,iy,iz] > 0.0)
+            ε̇.xy[ix,iy,iz] =
+                0.5 * (
+                    (V.x[ix+1,iy+1,iz+1]*ns.x[ix+1,iy+1,iz+1] - V.x[ix+1,iy  ,iz+1]*ns.x[ix+1,iy  ,iz+1])/dy +
+                    (V.y[ix+1,iy+1,iz+1]*ns.y[ix+1,iy+1,iz+1] - V.y[ix  ,iy+1,iz+1]*ns.y[ix  ,iy+1,iz+1])/dx
+                )
+            ηs_av = 0.25*(ηs[ix,iy,iz+1] + ηs[ix+1,iy,iz+1] + ηs[ix,iy+1,iz+1] + ηs[ix+1,iy+1,iz+1])  # arithmetic mean of the four ηs entries neighbouring in x and y
+            τ.xy[ix,iy,iz] += (-τ.xy[ix,iy,iz] + 2.0*ηs_av*ε̇.xy[ix,iy,iz])*dτ_r
+        else
+            ε̇.xy[ix,iy,iz] = 0.0
+            τ.xy[ix,iy,iz] = 0.0
+        end
+    end
+    @inbounds if isin(τ.xz)  # xz shear component, same pattern as xy with the roles of y and z exchanged
+        # detect and eliminate null spaces: checks the not_air weights of the four velocities used in ε̇.xz below
+        isnull = (na.x[ix+1,iy+1,iz+1] ≈ 0.0) || (na.x[ix+1,iy+1,iz  ] ≈ 0.0) ||
+                 (na.z[ix+1,iy+1,iz+1] ≈ 0.0) || (na.z[ix  ,iy+1,iz+1] ≈ 0.0)
+        if !isnull && (na.xz[ix,iy,iz] > 0.0)
+            ε̇.xz[ix,iy,iz] =
+                0.5 * (
+                    (V.x[ix+1,iy+1,iz+1]*ns.x[ix+1,iy+1,iz+1] - V.x[ix+1,iy+1,iz  ]*ns.x[ix+1,iy+1,iz  ])/dz +
+                    (V.z[ix+1,iy+1,iz+1]*ns.z[ix+1,iy+1,iz+1] - V.z[ix  ,iy+1,iz+1]*ns.z[ix  ,iy+1,iz+1])/dx
+                )
+            ηs_av = 0.25*(ηs[ix,iy+1,iz] + ηs[ix+1,iy+1,iz] + ηs[ix,iy+1,iz+1] + ηs[ix+1,iy+1,iz+1])  # mean of the four ηs entries neighbouring in x and z
+            τ.xz[ix,iy,iz] += (-τ.xz[ix,iy,iz] + 2.0*ηs_av*ε̇.xz[ix,iy,iz])*dτ_r
+        else
+            ε̇.xz[ix,iy,iz] = 0.0
+            τ.xz[ix,iy,iz] = 0.0
+        end
+    end
+    @inbounds if isin(τ.yz)  # yz shear component
+        # detect and eliminate null spaces: checks the not_air weights of the four velocities used in ε̇.yz below
+        isnull = (na.y[ix+1,iy+1,iz+1] ≈ 0.0) || (na.y[ix+1,iy+1,iz  ] ≈ 0.0) ||
+                 (na.z[ix+1,iy+1,iz+1] ≈ 0.0) || (na.z[ix+1,iy  ,iz+1] ≈ 0.0)
+        if !isnull && (na.yz[ix,iy,iz] > 0.0)
+            ε̇.yz[ix,iy,iz] =
+                0.5 * (
+                    (V.y[ix+1,iy+1,iz+1]*ns.y[ix+1,iy+1,iz+1] - V.y[ix+1,iy+1,iz  ]*ns.y[ix+1,iy+1,iz  ])/dz +
+                    (V.z[ix+1,iy+1,iz+1]*ns.z[ix+1,iy+1,iz+1] - V.z[ix+1,iy  ,iz+1]*ns.z[ix+1,iy  ,iz+1])/dy
+                )
+            ηs_av = 0.25*(ηs[ix+1,iy,iz] + ηs[ix+1,iy+1,iz] + ηs[ix+1,iy,iz+1] + ηs[ix+1,iy+1,iz+1])  # mean of the four ηs entries neighbouring in y and z
+            τ.yz[ix,iy,iz] += (-τ.yz[ix,iy,iz] + 2.0*ηs_av*ε̇.yz[ix,iy,iz])*dτ_r
+        else
+            ε̇.yz[ix,iy,iz] = 0.0
+            τ.yz[ix,iy,iz] = 0.0
+        end
+    end
+    return
+end
+
+@tiny function _kernel_update_V!(V, Pr, τ, ηs, wt, nudτ, ρg, dx, dy, dz)  # increments each velocity component by (stress divergence - body force)*nudτ/ηs, or zeroes it where masked
+    ix,iy,iz = @indices  # index triple handled by this invocation
+    na,ns    = wt.not_air, wt.not_solid
+    @inline isin(A) = checkbounds(Bool,A,ix,iy,iz)  # true when (ix,iy,iz) is a valid index of A
+    # TODO: check which volume fraction (non-air or non-solid) really determines the null spaces
+    @inbounds if isin(V.x)  # NOTE(review): Pr, ηs and the x/y/z weights are read at +1 offsets relative to V, consistent with update_V! passing interior views of V — confirm
+        # detect and eliminate null spaces: any of the six not_solid weights read below being ≈ 0 masks this entry
+        isnull = ( ns.c[ix+1,iy+1,iz+1] ≈ 0) || ( ns.c[ix,iy+1,iz+1] ≈ 0) ||
+                 (ns.xy[ix  ,iy+1,iz  ] ≈ 0) || (ns.xy[ix,iy  ,iz  ] ≈ 0) ||
+                 (ns.xz[ix  ,iy  ,iz+1] ≈ 0) || (ns.xz[ix,iy  ,iz  ] ≈ 0)
+        if !isnull && (na.x[ix+1,iy+1,iz+1] > 0) && (ns.x[ix+1,iy+1,iz+1] > 0)
+            # TODO: check which cells contribute to the momentum balance to verify ηs_x is computed correctly
+            ηs_x = max(ηs[ix,iy+1,iz+1],ηs[ix+1,iy+1,iz+1])  # larger of the two ηs values adjacent in x
+            ∂σxx_∂x = ((-Pr[ix+1,iy+1,iz+1]+τ.xx[ix+1,iy+1,iz+1])*na.c[ix+1,iy+1,iz+1] -  # difference of (-Pr + τ.xx), each side scaled by its not_air weight
+                       (-Pr[ix  ,iy+1,iz+1]+τ.xx[ix  ,iy+1,iz+1])*na.c[ix  ,iy+1,iz+1])/dx
+            ∂τxy_∂y = (τ.xy[ix,iy+1,iz]*na.xy[ix,iy+1,iz] - τ.xy[ix,iy,iz]*na.xy[ix,iy,iz])/dy
+            ∂τxz_∂z = (τ.xz[ix,iy,iz+1]*na.xz[ix,iy,iz+1] - τ.xz[ix,iy,iz]*na.xz[ix,iy,iz])/dz
+            V.x[ix,iy,iz] += (∂σxx_∂x + ∂τxy_∂y + ∂τxz_∂z - ρg.x)*nudτ/ηs_x
+        else
+            V.x[ix,iy,iz] = 0.0  # masked entry: velocity is reset to zero
+        end
+    end
+    @inbounds if isin(V.y)  # y component, same pattern as x
+        # detect and eliminate null spaces: any of the six not_solid weights read below being ≈ 0 masks this entry
+        isnull = ( ns.c[ix+1,iy+1,iz+1] ≈ 0) || ( ns.c[ix+1,iy,iz+1] ≈ 0) ||
+                 (ns.xy[ix+1,iy  ,iz  ] ≈ 0) || (ns.xy[ix  ,iy,iz  ] ≈ 0) ||
+                 (ns.yz[ix  ,iy  ,iz+1] ≈ 0) || (ns.yz[ix  ,iy,iz  ] ≈ 0)
+        if !isnull && (na.y[ix+1,iy+1,iz+1] > 0) && (ns.y[ix+1,iy+1,iz+1] > 0)
+            # TODO: check which cells contribute to the momentum balance to verify ηs_y is computed correctly
+            ηs_y = max(ηs[ix+1,iy,iz+1],ηs[ix+1,iy+1,iz+1])  # larger of the two ηs values adjacent in y
+            ∂σyy_∂y = ((-Pr[ix+1,iy+1,iz+1] + τ.yy[ix+1,iy+1,iz+1])*na.c[ix+1,iy+1,iz+1] - 
+                       (-Pr[ix+1,iy  ,iz+1] + τ.yy[ix+1,iy  ,iz+1])*na.c[ix+1,iy  ,iz+1])/dy
+            ∂τxy_∂x = (τ.xy[ix+1,iy,iz  ]*na.xy[ix+1,iy,iz] - τ.xy[ix,iy,iz]*na.xy[ix,iy,iz])/dx
+            ∂τyz_∂z = (τ.yz[ix  ,iy,iz+1]*na.yz[ix,iy,iz+1] - τ.yz[ix,iy,iz]*na.yz[ix,iy,iz])/dz
+            V.y[ix,iy,iz] += (∂σyy_∂y + ∂τxy_∂x + ∂τyz_∂z - ρg.y)*nudτ/ηs_y
+        else
+            V.y[ix,iy,iz] = 0.0
+        end
+    end
+    @inbounds if isin(V.z)  # z component, same pattern as x
+        # detect and eliminate null spaces: any of the six not_solid weights read below being ≈ 0 masks this entry
+        isnull = ( ns.c[ix+1,iy+1,iz+1] ≈ 0) || ( ns.c[ix+1,iy+1,iz  ] ≈ 0) ||
+                 (ns.xz[ix+1,iy  ,iz  ] ≈ 0) || (ns.xz[ix  ,iy  ,iz  ] ≈ 0) ||
+                 (ns.yz[ix  ,iy+1,iz  ] ≈ 0) || (ns.yz[ix  ,iy  ,iz  ] ≈ 0)
+        if !isnull && (na.z[ix+1,iy+1,iz+1] > 0) && (ns.z[ix+1,iy+1,iz+1] > 0)
+            # TODO: check which cells contribute to the momentum balance to verify ηs_z is computed correctly
+            ηs_z = max(ηs[ix+1,iy+1,iz],ηs[ix+1,iy+1,iz+1])  # larger of the two ηs values adjacent in z
+            ∂σzz_∂z = ((-Pr[ix+1,iy+1,iz+1] + τ.zz[ix+1,iy+1,iz+1])*na.c[ix+1,iy+1,iz+1] - 
+                       (-Pr[ix+1,iy+1,iz  ] + τ.zz[ix+1,iy+1,iz  ])*na.c[ix+1,iy+1,iz  ])/dz
+            ∂τxz_∂x = (τ.xz[ix+1,iy,iz]*na.xz[ix+1,iy,iz] - τ.xz[ix,iy,iz]*na.xz[ix,iy,iz])/dx
+            ∂τyz_∂y = (τ.yz[ix,iy+1,iz]*na.yz[ix,iy+1,iz] - τ.yz[ix,iy,iz]*na.yz[ix,iy,iz])/dy
+            V.z[ix,iy,iz] += (∂σzz_∂z + ∂τxz_∂x + ∂τyz_∂y - ρg.z)*nudτ/ηs_z
+        else
+            V.z[ix,iy,iz] = 0.0
+        end
+    end
+    return
+end
+
+@tiny function _kernel_compute_residual!(Res, Pr, V, τ, wt, ρg, dx, dy, dz)  # stores ∇V in Res.Pr and (stress divergence - body force) in Res.V, with zeros where masked
+    ns,na = wt.not_solid, wt.not_air
+    ix,iy,iz = @indices  # index triple handled by this invocation
+    @inline isin(A) = checkbounds(Bool,A,ix,iy,iz)  # true when (ix,iy,iz) is a valid index of A
+    if isin(Pr)  # NOTE(review): this section is not wrapped in @inbounds, unlike the three below
+        # detect and eliminate null spaces: any of the six not_air x/y/z weights read below being ≈ 0 masks this entry
+        isnull = (na.x[ix,iy,iz] ≈ 0.0) || (na.x[ix+1,iy  ,iz  ] ≈ 0.0) ||
+                 (na.y[ix,iy,iz] ≈ 0.0) || (na.y[ix  ,iy+1,iz  ] ≈ 0.0) ||
+                 (na.z[ix,iy,iz] ≈ 0.0) || (na.z[ix  ,iy  ,iz+1] ≈ 0.0)
+        if !isnull && (na.c[ix,iy,iz] > 0.0)
+            ε̇xx = (V.x[ix+1,iy  ,iz  ]*ns.x[ix+1,iy  ,iz  ] - V.x[ix,iy,iz]*ns.x[ix,iy,iz])/dx  # same weighted differences as in _kernel_update_σ!, kept in locals here
+            ε̇yy = (V.y[ix  ,iy+1,iz  ]*ns.y[ix  ,iy+1,iz  ] - V.y[ix,iy,iz]*ns.y[ix,iy,iz])/dy
+            ε̇zz = (V.z[ix  ,iy  ,iz+1]*ns.z[ix  ,iy  ,iz+1] - V.z[ix,iy,iz]*ns.z[ix,iy,iz])/dz
+            ∇V = ε̇xx + ε̇yy + ε̇zz
+            Res.Pr[ix,iy,iz] = ∇V  # pressure residual is the sum of the normal strain rates
+        else
+            Res.Pr[ix,iy,iz] = 0.0
+        end
+    end
+    @inbounds if isin(Res.V.x)  # x residual; same masks and stencil as the x branch of _kernel_update_V!
+        # detect and eliminate null spaces: any of the six not_solid weights read below being ≈ 0 masks this entry
+        isnull = ( ns.c[ix+1,iy+1,iz+1] ≈ 0) || ( ns.c[ix,iy+1,iz+1] ≈ 0) ||
+                 (ns.xy[ix  ,iy+1,iz  ] ≈ 0) || (ns.xy[ix,iy  ,iz  ] ≈ 0) ||
+                 (ns.xz[ix  ,iy  ,iz+1] ≈ 0) || (ns.xz[ix,iy  ,iz  ] ≈ 0)
+        if !isnull && (na.x[ix+1,iy+1,iz+1] > 0) && (ns.x[ix+1,iy+1,iz+1] > 0)
+            # TODO: check which cells contribute to the momentum balance to verify ηs_x is computed correctly
+            ∂σxx_∂x = ((-Pr[ix+1,iy+1,iz+1]+τ.xx[ix+1,iy+1,iz+1])*na.c[ix+1,iy+1,iz+1] -
+                       (-Pr[ix  ,iy+1,iz+1]+τ.xx[ix  ,iy+1,iz+1])*na.c[ix  ,iy+1,iz+1])/dx
+            ∂τxy_∂y = (τ.xy[ix,iy+1,iz]*na.xy[ix,iy+1,iz] - τ.xy[ix,iy,iz]*na.xy[ix,iy,iz])/dy
+            ∂τxz_∂z = (τ.xz[ix,iy,iz+1]*na.xz[ix,iy,iz+1] - τ.xz[ix,iy,iz]*na.xz[ix,iy,iz])/dz
+            Res.V.x[ix,iy,iz] = ∂σxx_∂x + ∂τxy_∂y + ∂τxz_∂z - ρg.x  # unscaled: no nudτ/ηs factor here
+        else
+            Res.V.x[ix,iy,iz] = 0.0
+        end
+    end
+    @inbounds if isin(Res.V.y)  # y residual
+        # detect and eliminate null spaces: any of the six not_solid weights read below being ≈ 0 masks this entry
+        isnull = ( ns.c[ix+1,iy+1,iz+1] ≈ 0) || ( ns.c[ix+1,iy,iz+1] ≈ 0) ||
+                 (ns.xy[ix+1,iy  ,iz  ] ≈ 0) || (ns.xy[ix  ,iy,iz  ] ≈ 0) ||
+                 (ns.yz[ix  ,iy  ,iz+1] ≈ 0) || (ns.yz[ix  ,iy,iz  ] ≈ 0)
+        if !isnull && (na.y[ix+1,iy+1,iz+1] > 0) && (ns.y[ix+1,iy+1,iz+1] > 0)
+            # TODO: check which cells contribute to the momentum balance to verify ηs_y is computed correctly
+            ∂σyy_∂y = ((-Pr[ix+1,iy+1,iz+1] + τ.yy[ix+1,iy+1,iz+1])*na.c[ix+1,iy+1,iz+1] - 
+                       (-Pr[ix+1,iy  ,iz+1] + τ.yy[ix+1,iy  ,iz+1])*na.c[ix+1,iy  ,iz+1])/dy
+            ∂τxy_∂x = (τ.xy[ix+1,iy,iz  ]*na.xy[ix+1,iy,iz] - τ.xy[ix,iy,iz]*na.xy[ix,iy,iz])/dx
+            ∂τyz_∂z = (τ.yz[ix  ,iy,iz+1]*na.yz[ix,iy,iz+1] - τ.yz[ix,iy,iz]*na.yz[ix,iy,iz])/dz
+            Res.V.y[ix,iy,iz] = ∂σyy_∂y + ∂τxy_∂x + ∂τyz_∂z - ρg.y
+        else
+            Res.V.y[ix,iy,iz] = 0.0
+        end
+    end
+    @inbounds if isin(Res.V.z)  # z residual
+        # detect and eliminate null spaces: any of the six not_solid weights read below being ≈ 0 masks this entry
+        isnull = ( ns.c[ix+1,iy+1,iz+1] ≈ 0) || ( ns.c[ix+1,iy+1,iz  ] ≈ 0) ||
+                 (ns.xz[ix+1,iy  ,iz  ] ≈ 0) || (ns.xz[ix  ,iy  ,iz  ] ≈ 0) ||
+                 (ns.yz[ix  ,iy+1,iz  ] ≈ 0) || (ns.yz[ix  ,iy  ,iz  ] ≈ 0)
+        if !isnull && (na.z[ix+1,iy+1,iz+1] > 0) && (ns.z[ix+1,iy+1,iz+1] > 0)
+            # TODO: check which cells contribute to the momentum balance to verify ηs_z is computed correctly
+            ∂σzz_∂z = ((-Pr[ix+1,iy+1,iz+1] + τ.zz[ix+1,iy+1,iz+1])*na.c[ix+1,iy+1,iz+1] - 
+                       (-Pr[ix+1,iy+1,iz  ] + τ.zz[ix+1,iy+1,iz  ])*na.c[ix+1,iy+1,iz  ])/dz
+            ∂τxz_∂x = (τ.xz[ix+1,iy,iz]*na.xz[ix+1,iy,iz] - τ.xz[ix,iy,iz]*na.xz[ix,iy,iz])/dx
+            ∂τyz_∂y = (τ.yz[ix,iy+1,iz]*na.yz[ix,iy+1,iz] - τ.yz[ix,iy,iz]*na.yz[ix,iy,iz])/dy
+            Res.V.z[ix,iy,iz] = ∂σzz_∂z + ∂τxz_∂x + ∂τyz_∂y - ρg.z
+        else
+            Res.V.z[ix,iy,iz] = 0.0
+        end
+    end
+end  # NOTE(review): no explicit `return` here, unlike _kernel_update_σ! and _kernel_update_V! in this file
\ No newline at end of file
diff --git a/scripts3D_variational_TM/test_volume_fractions.jl b/scripts3D_variational_TM/test_volume_fractions.jl
new file mode 100644
index 00000000..6cb1971d
--- /dev/null
+++ b/scripts3D_variational_TM/test_volume_fractions.jl
@@ -0,0 +1,429 @@
+using FastIce
+using Logging
+using MPI
+using ImplicitGlobalGrid
+using TinyKernels
+using HDF5
+using LightXML
+using CairoMakie
+using Printf
+using JLD2
+using LinearAlgebra
+using GeometryBasics
+using ElasticArrays
+
+include("load_dem.jl")
+include("signed_distances.jl")
+include("level_sets.jl")
+include("volume_fractions.jl")
+include("bcs.jl")
+include("stokes.jl")
+include("thermo.jl")
+include("data_io.jl")
+include("hide_communication.jl")
+
+av1(A)   = @views 0.5 .* (A[1:end-1] .+ A[2:end])  # midpoints of neighbouring entries; result is one element shorter than A
+inn_x(A) = @views A[2:end-1, :, :]  # view without the first and last index along dimension 1
+inn_y(A) = @views A[:, 2:end-1, :]  # view without the first and last index along dimension 2
+inn_z(A) = @views A[:, :, 2:end-1]  # view without the first and last index along dimension 3
+inn(A)   = @views A[2:end-1, 2:end-1, 2:end-1]  # view without the outermost layer in all three dimensions
+
+# @views function main(grid_dims,grid)
+#     # unpack values
+#     me, dims, nprocs, coords, comm_cart = grid
+
+#     # init logger
+#     global_logger(FastIce.Logging.MPILogger(0, comm_cart, global_logger()))
+
+#     # path to DEM data
+#     greenland_path = "data/BedMachine/greenland.jld2"
+
+#     # region to simulate
+#     global_region = (xlims=(1100.0e3, 1200.0e3), ylims=(1000.0e3, 1100.0e3))
+
+#     # load DEM
+#     @info "loading DEM data from the file '$greenland_path'"
+#     (; x, y, bed, surface) = load_dem(greenland_path, global_region)
+#     @info "DEM resolution: $(size(bed,1)) × $(size(bed,2))"
+
+#     @info "plot DEMs"
+#     if me == 0
+#         fig = Figure(resolution=(2000,700),fontsize=32)
+#         ax  = (
+#             bed = Axis(fig[1,1][1,1];aspect=DataAspect(),title="bedrock",xlabel="x",ylabel="y"),
+#             ice = Axis(fig[1,2][1,1];aspect=DataAspect(),title="ice"    ,xlabel="x",ylabel="y"),
+#         )
+#         plt = (
+#             bed = heatmap!(ax.bed,x,y,bed    ;colormap=:terrain),
+#             ice = heatmap!(ax.ice,x,y,surface;colormap=:terrain),
+#         )
+#         Colorbar(fig[1,1][1,2],plt.bed)
+#         Colorbar(fig[1,2][1,2],plt.ice)
+#         save("region.png",fig)
+#     end
+
+#     # compute origin and size of the domain (required for scaling and computing the grid size)
+#     ox, oy, oz = x[1], y[1], minimum(bed)
+#     lx = x[end] - ox
+#     ly = y[end] - oy
+#     lz = maximum(surface) - oz
+
+#     # shift and scale the domain before computation (center of the domain is (0,0) in x-y plane)
+#     δx, δy = ox + 0.5lx, oy + 0.5ly # required to avoid conversion to Vector  
+#     x = @. (x - δx) / lz
+#     y = @. (y - δy) / lz
+#     @. bed = (bed - oz) / lz
+#     @. surface = (surface - oz) / lz
+
+#     @. surface -= 0.05
+
+#     # run simulation
+#     dem_data = (; x, y, bed, surface)
+#     @info "running the simulation"
+#     run_simulation(dem_data, grid_dims, me, dims, coords)
+
+#     return
+# end
+
+function make_dem(ox,oy,lx,ly,Δz,rgl,ogx,ogy,ogz,nx,ny)  # synthetic DEM: sinusoidal bed plus spherical-cap surface on (nx+1)×(ny+1) nodes; returns (; x, y, bed, surface)
+    x       = LinRange(ox,ox+lx,nx+1)  # node coordinates spanning [ox, ox+lx]
+    y       = LinRange(oy,oy+ly,ny+1)  # node coordinates spanning [oy, oy+ly]; the y axis must start at oy, not ox
+    bed     = zeros(nx+1,ny+1)
+    for iy in axes(bed,2), ix in axes(bed,1)
+        ωx = 2π*10*(x[ix] - ox)/lx  # phase in x: 10 periods across lx
+        ωy = 2π*10*(y[iy] - oy)/lx  # NOTE(review): also normalised by lx (same wavelength as in x) — confirm ly was not intended
+        bed[ix,iy] = 0.025*rgl*sin(ωx)*cos(ωy)  # bump amplitude is 2.5 % of rgl
+    end
+    bed .= bed .- minimum(bed) .+ Δz  # shift so the lowest bed node sits at elevation Δz
+    surface = zeros(nx+1,ny+1)
+    for iy in axes(surface,2), ix in axes(surface,1)
+        δx  = x[ix] - ogx  # horizontal offsets from the sphere centre (ogx, ogy)
+        δy  = y[iy] - ogy
+        surface[ix,iy] = sqrt(max(rgl^2 - δx^2 - δy^2, 0.0)) + ogz  # upper half of a sphere of radius rgl centred at height ogz; clamped to ogz outside its footprint
+    end
+    return (;x,y,bed,surface)
+end
+
+@views function main(grid_dims,grid)
+    # rank, process topology, coordinates and communicator come packed in `grid`
+    me, dims, _, coords, comm_cart = grid
+
+    # replace the global logger with FastIce's MPILogger wrapping the current one
+    global_logger(FastIce.Logging.MPILogger(0, comm_cart, global_logger()))
+
+    extent = (x = 5.0, y = 2.5, z = 1.0)  # domain size
+    origin = (x = -extent.x/2, y = -extent.y/2)  # x-y origin that centres the domain on (0,0)
+    bed_offset = 0.1*extent.z  # elevation of the lowest bed node
+
+    cap_radius = 4*extent.z  # radius of the spherical ice surface
+    cap_centre = (x = 0.0*extent.x, y = 0.0*extent.y, z = -3.2*extent.z)  # centre of that sphere
+
+    dem_data = make_dem(origin.x,origin.y,extent.x,extent.y,bed_offset,cap_radius,cap_centre.x,cap_centre.y,cap_centre.z,grid_dims[1],grid_dims[2])
+    run_simulation(dem_data, grid_dims, me, dims, coords)
+
+    return
+end
+
+@views function run_simulation(dem_data, grid_dims, me, dims, coords)  # sets up, solves and saves one thermo-mechanical run on this rank's sub-domain; returns nothing
+    # physics
+    # global domain origin and size
+    ox_g, oy_g, oz_g = dem_data.x[1], dem_data.y[1], 0.0
+    lx_g = dem_data.x[end] - ox_g
+    ly_g = dem_data.y[end] - oy_g
+    lz_g = 1.0
+
+    # local domain size and origin
+    lx_l, ly_l, lz_l = (lx_g, ly_g, lz_g) ./ dims
+    ox_l, oy_l, oz_l = (ox_g, oy_g, oz_g) .+ coords .* (lx_l, ly_l, lz_l)
+
+    ####################################################################
+    # non-dimensional numbers
+    α       = deg2rad(-15)   # slope
+    nglen   = 3              # Glen's law power exponent
+    ρr      = 0.92           # density ratio of ice to water
+    cpr     = 0.5            # heat capacity ratio of ice to water
+    U_P     = 60.0           # ratio of sensible heat to gravitational potential energy
+    L_P     = 37.0           # ratio of latent heat to gravitational potential energy
+    Pr      = 2e-9           # Prandtl number - ratio of thermal diffusivity to momentum diffusivity
+    A_L     = 5e-2           # ratio of bump amplitude to length scale
+    nbump   = 10             # number of bumps
+    Q_RT    = 2*26.0         # ratio of activation temperature to melting temperature
+    # dimensionally independent parameters
+    K       = 1.0            # consistency                   [Pa*s^(1/n)]
+    ρg      = 1.0            # ice gravity pressure gradient [Pa/m      ]
+    T_mlt   = 1.0            # ice melting temperature       [K         ]
+    # scales
+    l̄       = lz_g           # length scale                  [m         ]
+    σ̄       = ρg*cos(α)*l̄    # stress scale                  [Pa        ]
+    t̄       = (K/σ̄)^nglen    # time scale                    [s         ]
+    T̄       = T_mlt          # temperature scale             [K         ]
+    # dimensionally dependent
+    λ_i     = Pr*σ̄*l̄^2/(T̄*t̄) # thermal conductivity          [W/m/K     ]
+    ρcp     = U_P*σ̄/T̄        # ice heat capacity             [Pa/K      ]
+    ρL      = L_P*σ̄          # latent heat of melting        [Pa        ]
+    Q_R     = Q_RT*T_mlt     # activational temperature      [K         ]
+    T_atm   = 0.9*T_mlt      # atmospheric temperature       [K         ]
+    T_ini   = 0.9*T_mlt      # initial surface temperature   [K         ]
+    amp     = A_L*l̄          # bump amplitude                [m         ]
+    rgl     = 1.2l̄           # glacier radius                [m         ]
+    ηreg    = 0.5*K*(1e-6/t̄)^(1/nglen-1)
+    # not important (cancels in the equations)
+    ρ_w     = 1.0            # density of water              [kg/m^3    ]
+    ρ_i     = ρr*ρ_w         # density of ice                [kg/m^3    ]
+    cp_i    = ρcp/ρ_i        # heat capacity of ice          [J/kg/K    ]
+    cp_w    = cp_i/cpr       # heat capacity of water        [J/kg/K    ]
+    L       = ρL/ρ_w         # latent heat of melting        [J/kg      ]
+    # phase data
+    ρ  = (ice = ρ_i , wat = ρ_w )
+    cp = (ice = cp_i, wat = cp_w)
+    λ  = (ice = λ_i , wat = λ_i )
+    # body force
+    f  = (x = ρg*sin(α), y = 0ρg, z = ρg*cos(α))
+    # thermodynamics
+    @inline u_ice(T)  = cp.ice*(T-T_mlt)
+    @inline u_wat(T)  = L + cp.wat*(T-T_mlt)
+    @inline T_lt(u_t) = (u_t < u_ice(T_mlt)) ? T_mlt + u_t/cp.ice :
+                        (u_t > u_wat(T_mlt)) ? T_mlt + (u_t - L)/cp.wat : T_mlt
+    @inline ω_lt(u_t) = (u_t < u_ice(T_mlt)) ? 0.0 :
+                        (u_t > u_wat(T_mlt)) ? 1.0 : ρ.ice*(u_ice(T_mlt) - u_t)/(ρ.ice*(u_ice(T_mlt)-u_t) - ρ.wat*(u_wat(T_mlt)-u_t))
+    ####################################################################
+
+    # numerics
+    nx, ny, nz       = grid_dims
+    nx_l, ny_l, nz_l = grid_dims .+ 2 # include ghost nodes
+    nx_g, ny_g, nz_g = grid_dims.*dims
+    bwidth = (8, 4, 4)
+
+    ϵtol   = (1e-4,1e-4,1e-4,1e-4)
+    maxiter = 50max(nx,ny,nz)
+    ncheck  = ceil(Int,0.5max(nx,ny,nz))
+    nviz    = 1
+    nsave   = 1
+    nt      = 1
+    χ       = 5e-3
+
+    # preprocessing
+    dx, dy, dz = lx_g / nx_g, ly_g / ny_g, lz_g / nz_g
+    @info "grid spacing: dx = $dx, dy = $dy, dz = $dz"
+
+    # take into account ghost nodes to simplify model setup
+    xv_l = LinRange(ox_l - dx, ox_l + lx_l + dx, nx_l + 1)
+    yv_l = LinRange(oy_l - dy, oy_l + ly_l + dy, ny_l + 1)
+    zv_l = LinRange(oz_l - dz, oz_l + lz_l + dz, nz_l + 1)
+    xc_l, yc_l, zc_l = av1.((xv_l, yv_l, zv_l))
+
+    # PT params
+    r = 0.9
+    lτ_re_mech = 0.25min(lx_g, ly_g, lz_g) / π
+    vdτ = min(dx, dy, dz) / sqrt(8.1)
+    θ_dτ = lτ_re_mech * (r + 4 / 3) / vdτ
+    nudτ = vdτ * lτ_re_mech
+    dτ_r = 1.0 / (θ_dτ + 1.0)
+
+    # fields allocation
+    # level set
+    Ψ = (
+        not_solid=scalar_field(Float64, nx_l + 1, ny_l + 1, nz_l + 1),
+        not_air  =scalar_field(Float64, nx_l + 1, ny_l + 1, nz_l + 1),
+    )
+    wt = (
+        not_solid=volfrac_field(Float64, nx_l, ny_l, nz_l),
+        not_air  =volfrac_field(Float64, nx_l, ny_l, nz_l),
+    )
+    # mechanics (stress fields include ghost nodes due to redundant computations on distributed staggered grid)
+    Pr = scalar_field(Float64, nx_l, ny_l, nz_l)
+    τ  = tensor_field(Float64, nx_l, ny_l, nz_l)
+    ε̇  = tensor_field(Float64, nx_l, ny_l, nz_l)
+    V  = vector_field(Float64, nx_l, ny_l, nz_l)
+    ηs = scalar_field(Float64, nx_l, ny_l, nz_l)
+    # thermal
+    ρU = scalar_field(Float64,nx_l,ny_l,nz_l)
+    T  = scalar_field(Float64,nx_l,ny_l,nz_l)
+    qT = vector_field(Float64,nx_l,ny_l,nz_l)
+    # hydro
+    ω  = scalar_field(Float64,nx_l,ny_l,nz_l)
+    # residuals
+    Res = (
+        Pr=scalar_field(Float64, nx_l, ny_l, nz_l),
+        V =vector_field(Float64, nx_l-2, ny_l-2, nz_l-2)
+    )
+    # visualisation
+    Vmag = scalar_field(Float64, nx, ny, nz)
+    ε̇II  = scalar_field(Float64, nx, ny, nz)
+    Ψav = (
+        not_air=scalar_field(Float64,nx,ny,nz),
+        not_solid=scalar_field(Float64,nx,ny,nz),
+    )
+
+    # initialisation
+    # compute level sets from DEM data
+    dem_grid = (dem_data.x, dem_data.y)
+    Ψ_grid = (xv_l, yv_l, zv_l)
+
+    @info "computing the level set for the ice surface"
+    compute_level_set_from_dem!(Ψ.not_air, to_device(dem_data.surface), dem_grid, Ψ_grid)
+
+    @info "computing the level set for the bedrock surface"
+    compute_level_set_from_dem!(Ψ.not_solid, to_device(dem_data.bed), dem_grid, Ψ_grid)
+    TinyKernels.device_synchronize(get_device())
+    # invert level set to set what's below the DEM surface as inside
+    @. Ψ.not_solid *= -1.0
+    TinyKernels.device_synchronize(get_device())
+
+    @info "computing volume fractions from level sets"
+    for phase in eachindex(Ψ)
+        compute_volume_fractions_from_level_set!(wt[phase], Ψ[phase], dx, dy, dz)
+    end
+    TinyKernels.device_synchronize(FastIce.get_device())
+
+    @info "initialize mechanics"
+    for comp in eachindex(V) fill!(V[comp], 0.0) end
+    for comp in eachindex(τ) fill!(τ[comp], 0.0) end
+    fill!(Pr, 0.0)
+    fill!(ηs,0.5*K*(1e-1/t̄)^(1/nglen-1)*exp(-1/nglen*Q_R*(1/T_mlt-1/T_ini)))
+    # fill!(ηs,1.0)
+    TinyKernels.device_synchronize(get_device())
+
+    @info "initialize thermo"
+    for comp in eachindex(qT) fill!(qT[comp],0.0) end
+    @. T  = lerp(T_atm,T_ini,wt.not_air.c)
+    @. ρU = ρ.ice*u_ice(T)
+    @. ω  = ω_lt(ρU/ρ.ice)
+    TinyKernels.device_synchronize(FastIce.get_device())
+
+    # convergence tracking
+    iter_evo = Float64[]
+    errs_evo = ElasticArray{Float64}(undef, length(ϵtol), 0)
+    ts       = Float64[]
+    h5names  = String[]
+
+    # save static data
+    outdir = joinpath("out_visu","egu2023/greenland")
+    mkpath(outdir)
+    jldsave(joinpath(outdir,"static.jld2");xc_l,xv_l,yc_l,yv_l,zc_l,zv_l,Ψ,wt,dem_data)
+    # output metadata; defined before the time loop because the XDMF call after the loop reads it too (loop-local variables would be undefined there)
+    dim_g = (nx_g, ny_g, nz_g)
+    fields = Dict("LS_ice" => Ψav.not_air,
+                  "LS_bed" => Ψav.not_solid,
+                  "Vmag" => Vmag,
+                  "EII" => ε̇II,
+                  "Pr" => inn(Pr),
+                  "T" => inn(T),
+                  "omega" => inn(ω),
+                  "etas" => inn(ηs),
+                  "wt_na" => inn(wt.not_air.c),
+                  "wt_ns" => inn(wt.not_solid.c),)  # entries reference the arrays themselves, which are only ever updated in place below
+    tcur = 0.0; isave = 1
+    for it in 1:nt
+        @info @sprintf("time step #%d, time = %g",it,tcur)
+        empty!(iter_evo); resize!(errs_evo,(length(ϵtol),0))
+        TinyKernels.device_synchronize(FastIce.get_device())
+        # mechanics
+        for iter in 1:maxiter
+            update_σ!(Pr, τ, ε̇, V, ηs, wt, r, θ_dτ, dτ_r, dx, dy, dz)
+            TinyKernels.device_synchronize(FastIce.get_device())
+            update_V!(V, Pr, τ, ηs, wt, nudτ, f, dx, dy, dz; bwidth)
+            TinyKernels.device_synchronize(FastIce.get_device())
+            update_ηs!(ηs,ε̇,T,wt,K,nglen,Q_R,T_mlt,ηreg,χ)
+            TinyKernels.device_synchronize(FastIce.get_device())
+            if iter % ncheck == 0
+                compute_residual!(Res,Pr,V,τ,wt,f,dx,dy,dz)
+                TinyKernels.device_synchronize(FastIce.get_device())
+                errs = (maximum(abs.(Res.V.x))*l̄/σ̄,
+                        maximum(abs.(Res.V.y))*l̄/σ̄,
+                        maximum(abs.(Res.V.z))*l̄/σ̄,
+                        maximum(abs.(inn(Res.Pr)))*t̄)
+                TinyKernels.device_synchronize(FastIce.get_device())
+                @printf("  iter/nz # %2.1f, errs: [ Vx = %1.3e, Vy = %1.3e, Vz = %1.3e, Pr = %1.3e ]\n", iter/nz, errs...)
+                push!(iter_evo, iter/nz); append!(errs_evo, errs)
+                # check convergence
+                if any(.!isfinite.(errs)) @error("simulation failed"); break; end
+                if all(errs .< ϵtol) break end
+            end
+        end
+        TinyKernels.device_synchronize(FastIce.get_device())
+        dt = min(dx,dy,dz)^2/max(λ.ice*ρ.ice*cp.ice,λ.wat*ρ.wat*cp.wat)/6.1
+        # thermal
+        update_qT!(qT,T,wt,λ,T_atm,dx,dy,dz)
+        TinyKernels.device_synchronize(FastIce.get_device())
+        update_ρU!(ρU,qT,τ,ε̇,wt,ρ.ice*u_ice(T_atm),dt,dx,dy,dz)
+        TinyKernels.device_synchronize(FastIce.get_device())
+        @. T = T_lt(ρU/(ρ.ice*(1-ω) + ρ.wat*ω))
+        @. ω = ω_lt(ρU/(ρ.ice*(1-ω) + ρ.wat*ω))
+        TinyKernels.device_synchronize(FastIce.get_device())
+        tcur += dt
+        # save timestep
+        if it % nsave == 0
+            @info "saving results on disk"
+            update_vis_fields!(Vmag, ε̇II, Ψav, V, ε̇, Ψ)
+            jldsave(joinpath(outdir,@sprintf("%04d.jld2",isave));Pr,τ,ε̇,ε̇II,V,T,ω,ηs)
+            # h5
+            out_h5 = joinpath(outdir,@sprintf("step_%04d.h5",isave))
+            ndrange = CartesianIndices(((coords[1]*nx+1):(coords[1]+1)*nx,
+                                        (coords[2]*ny+1):(coords[2]+1)*ny,
+                                        (coords[3]*nz+1):(coords[3]+1)*nz))
+            @info "saving HDF5 file"
+            write_h5(out_h5, fields, dim_g, ndrange)
+            push!(ts,tcur);push!(h5names,out_h5)
+            isave += 1
+        end
+    end
+
+    @info "saving XDMF file..."
+    (me == 0) && write_xdmf(joinpath(outdir,"results.xdmf3"), h5names, fields, (xc_l[2], yc_l[2], zc_l[2]), (dx, dy, dz), dim_g, ts)
+
+    return
+end
+
+@tiny function _kernel_update_vis_fields!(Vmag, τII, Ψav, V, τ, Ψ)  # fills the output fields Ψav, Vmag and τII with local averages of Ψ, V and the tensor τ
+    ix, iy, iz = @indices  # index triple handled by this invocation
+    @inline isin(A) = checkbounds(Bool, A, ix, iy, iz)  # true when (ix,iy,iz) is a valid index of A
+    @inbounds if isin(Ψ.not_air)  # NOTE(review): guard tests Ψ (the input) while the write goes to Ψav; safe only while the launch range stays within Ψav
+        pav = 0.0
+        for idz = 1:2, idy = 1:2, idx = 1:2
+            pav += Ψ.not_air[ix+idx, iy+idy, iz+idz]  # the 2×2×2 entries at offsets +1 and +2
+        end
+        Ψav.not_air[ix, iy, iz] = pav / 8  # mean of those eight entries
+    end
+    @inbounds if isin(Ψ.not_solid)  # same averaging for the second level set
+        pav = 0.0
+        for idz = 1:2, idy = 1:2, idx = 1:2
+            pav += Ψ.not_solid[ix+idx, iy+idy, iz+idz]
+        end
+        Ψav.not_solid[ix, iy, iz] = pav / 8
+    end
+    @inbounds if isin(Vmag)
+        vxc = 0.5 * (V.x[ix+1, iy+1, iz+1] + V.x[ix+2, iy+1, iz+1])  # mean of two consecutive entries along each component's own axis
+        vyc = 0.5 * (V.y[ix+1, iy+1, iz+1] + V.y[ix+1, iy+2, iz+1])
+        vzc = 0.5 * (V.z[ix+1, iy+1, iz+1] + V.z[ix+1, iy+1, iz+2])
+        Vmag[ix, iy, iz] = sqrt(vxc^2 + vyc^2 + vzc^2)  # Euclidean norm of the averaged components
+    end
+    @inbounds if isin(τII)
+        τxyc = 0.25 * (τ.xy[ix, iy, iz] + τ.xy[ix+1, iy, iz] + τ.xy[ix, iy+1, iz] + τ.xy[ix+1, iy+1, iz])  # mean of four shear entries (offsets 0:1 in x and y)
+        τxzc = 0.25 * (τ.xz[ix, iy, iz] + τ.xz[ix+1, iy, iz] + τ.xz[ix, iy, iz+1] + τ.xz[ix+1, iy, iz+1])  # offsets 0:1 in x and z
+        τyzc = 0.25 * (τ.yz[ix, iy, iz] + τ.yz[ix, iy+1, iz] + τ.yz[ix, iy, iz+1] + τ.yz[ix, iy+1, iz+1])  # offsets 0:1 in y and z
+        τII[ix, iy, iz] = sqrt(0.5 * (τ.xx[ix+1, iy+1, iz+1]^2 + τ.yy[ix+1, iy+1, iz+1]^2 + τ.zz[ix+1, iy+1, iz+1]^2) + τxyc^2 + τxzc^2 + τyzc^2)  # invariant from normal entries at +1 offsets and the averaged shear entries
+    end
+    return
+end
+
+const _update_vis_fields! = _kernel_update_vis_fields!(get_device())  # kernel built once for whatever get_device() returns
+
+function update_vis_fields!(Vmag, τII, Ψav, V, τ, Ψ)
+    event = _update_vis_fields!(Vmag, τII, Ψav, V, τ, Ψ; ndrange = axes(Vmag))  # launch over every index of Vmag
+    wait(event); return nothing  # block until the launch has completed; the function returns nothing
+end
+
+grid_dims = (200, 100, 50)  # grid size handed to init_global_grid and to main
+
+# init MPI and IGG (MPI is initialised explicitly here, hence init_MPI=false below)
+MPI.Init()
+me, dims, nprocs, coords, comm_cart = init_global_grid(grid_dims...; init_MPI=false)
+dims   = Tuple(dims)  # converted to tuples before being packed into `grid`
+coords = Tuple(coords)
+grid   = (me,dims,nprocs,coords,comm_cart)  # same order in which main unpacks it
+
+main(grid_dims,grid)
+
+# finalize_global_grid(; finalize_MPI=false)
+# MPI.Finalize()
\ No newline at end of file
diff --git a/scripts3D_variational_TM/thermo.jl b/scripts3D_variational_TM/thermo.jl
new file mode 100644
index 00000000..560299e0
--- /dev/null
+++ b/scripts3D_variational_TM/thermo.jl
@@ -0,0 +1,25 @@
+include("thermo_kernels.jl")
+
+const _update_qT! = _kernel_update_qT!(get_device())  # heat-flux kernel, built once for whatever get_device() returns
+const _update_ρU! = _kernel_update_ρU!(get_device())  # ρU update kernel
+
+function update_qT!(qT,T,wt,λ,T_atm,dx,dy,dz)
+    # keep, for each of x/y/z, only the interior indices along that component's own axis
+    trim_faces(f) = (x = inn_x(f.x), y = inn_y(f.y), z = inn_z(f.z))
+    # same trimming for one set of weights; the `c` entry is passed through untouched
+    trim_volfrac(vf) = merge((c = vf.c,), trim_faces(vf))
+    wt_inn = (
+        not_air   = trim_volfrac(wt.not_air),
+        not_solid = trim_volfrac(wt.not_solid),
+    )
+    flux_inn = trim_faces(qT)
+    # launch over every index of T; the kernel bounds-checks each flux component itself
+    event = _update_qT!(flux_inn,T,wt_inn,λ,T_atm,dx,dy,dz; ndrange=axes(T))
+    wait(event)
+    return
+end
+
+function update_ρU!(ρU,qT,τ,ε̇,wt,ρU_atm,dt,dx,dy,dz)
+    event = _update_ρU!(ρU, qT, τ, ε̇, wt, ρU_atm, dt, dx, dy, dz; ndrange = axes(ρU))  # launch over every index of ρU
+    wait(event); return nothing  # block until the launch has completed; the function returns nothing
+end
\ No newline at end of file
diff --git a/scripts3D_variational_TM/thermo_kernels.jl b/scripts3D_variational_TM/thermo_kernels.jl
new file mode 100644
index 00000000..58de02eb
--- /dev/null
+++ b/scripts3D_variational_TM/thermo_kernels.jl
@@ -0,0 +1,88 @@
+@inline lerp(a,b,t) = a*(1-t) + b*t # linear blend: returns a at t = 0 and b at t = 1
+
+@tiny function _kernel_update_qT!(qT,T,wt,λ,T_atm,dx,dy,dz)
+    not_solid,not_air = wt.not_solid, wt.not_air
+    ix, iy, iz = @indices
+    @inline inside(A) = checkbounds(Bool, A, ix, iy, iz)
+    @inbounds if inside(qT.x)
+        # the flux is kept only when neither adjacent cell has a zero not-solid weight
+        open_x = !(not_solid.c[ix+1,iy,iz] ≈ 0) && !(not_solid.c[ix,iy,iz] ≈ 0)
+        if open_x && (not_air.x[ix,iy,iz] > 0) && (not_solid.x[ix,iy,iz] > 0)
+            T_hi = lerp(T_atm,T[ix+1,iy,iz],not_air.c[ix+1,iy,iz])
+            T_lo = lerp(T_atm,T[ix  ,iy,iz],not_air.c[ix  ,iy,iz])
+            qT.x[ix,iy,iz] = -λ.ice*(T_hi - T_lo)/dx
+        else
+            qT.x[ix,iy,iz] = 0.0
+        end
+    end
+    @inbounds if inside(qT.y)
+        # same test along y
+        open_y = !(not_solid.c[ix,iy+1,iz] ≈ 0) && !(not_solid.c[ix,iy,iz] ≈ 0)
+        if open_y && (not_air.y[ix,iy,iz] > 0) && (not_solid.y[ix,iy,iz] > 0)
+            T_hi = lerp(T_atm,T[ix,iy+1,iz],not_air.c[ix,iy+1,iz])
+            T_lo = lerp(T_atm,T[ix,iy  ,iz],not_air.c[ix,iy  ,iz])
+            qT.y[ix,iy,iz] = -λ.ice*(T_hi - T_lo)/dy
+        else
+            qT.y[ix,iy,iz] = 0.0
+        end
+    end
+    @inbounds if inside(qT.z)
+        # same test along z
+        open_z = !(not_solid.c[ix,iy,iz+1] ≈ 0) && !(not_solid.c[ix,iy,iz] ≈ 0)
+        if open_z && (not_air.z[ix,iy,iz] > 0) && (not_solid.z[ix,iy,iz] > 0)
+            T_hi = lerp(T_atm,T[ix,iy,iz+1],not_air.c[ix,iy,iz+1])
+            T_lo = lerp(T_atm,T[ix,iy,iz  ],not_air.c[ix,iy,iz  ])
+            qT.z[ix,iy,iz] = -λ.ice*(T_hi - T_lo)/dz
+        else
+            qT.z[ix,iy,iz] = 0.0
+        end
+    end
+end
+
+@tiny function _kernel_update_ρU!(ρU,qT,τ,ε̇,wt,ρU_atm,dt,dx,dy,dz) # explicit update of ρU from heat-flux divergence and shear heating
+    ns,na = wt.not_solid, wt.not_air
+    ix, iy, iz = @indices
+    @inline isin(A) = checkbounds(Bool, A, ix, iy, iz) # true when (ix,iy,iz) is a valid index of A
+    @inbounds if isin(ρU)
+        isnull = (na.x[ix,iy,iz] ≈ 0.0) || (na.x[ix+1,iy,iz] ≈ 0.0) || # any of the six not-air face weights around the cell is zero
+                 (na.y[ix,iy,iz] ≈ 0.0) || (na.y[ix,iy+1,iz] ≈ 0.0) || # (`≈ 0.0` with default tolerances only matches an exact zero)
+                 (na.z[ix,iy,iz] ≈ 0.0) || (na.z[ix,iy,iz+1] ≈ 0.0)
+        if !isnull && (na.c[ix,iy,iz] > 0.0 && ns.c[ix,iy,iz] > 0.0)
+            ∇qx = (qT.x[ix+1,iy,iz]*ns.x[ix+1,iy,iz] - qT.x[ix,iy,iz]*ns.x[ix,iy,iz])/dx # face fluxes are scaled by the not-solid face weights
+            ∇qy = (qT.y[ix,iy+1,iz]*ns.y[ix,iy+1,iz] - qT.y[ix,iy,iz]*ns.y[ix,iy,iz])/dy
+            ∇qz = (qT.z[ix,iy,iz+1]*ns.z[ix,iy,iz+1] - qT.z[ix,iy,iz]*ns.z[ix,iy,iz])/dz
+            ∇qT = ∇qx + ∇qy + ∇qz
+            # xy shear terms: mean of the four τ.xy / ε̇.xy entries next to this cell (indices clamped to the array bounds)
+            τxyc,ε̇xyc = 0.0,0.0
+            for idz = -1:-1, idy = -1:0, idx = -1:0
+                ix2,iy2,iz2 = clamp(ix+idx,1,size(τ.xy,1)),clamp(iy+idy,1,size(τ.xy,2)),clamp(iz+idz,1,size(τ.xy,3))
+                τxyc += τ.xy[ix2,iy2,iz2]
+                ε̇xyc += ε̇.xy[ix2,iy2,iz2]
+            end
+            τxyc *= 0.25; ε̇xyc *= 0.25
+            # xz shear terms: same four-entry mean, taken over the x and z offsets
+            τxzc,ε̇xzc = 0.0,0.0
+            for idz = -1:0, idy = -1:-1, idx = -1:0
+                ix2,iy2,iz2 = clamp(ix+idx,1,size(τ.xz,1)),clamp(iy+idy,1,size(τ.xz,2)),clamp(iz+idz,1,size(τ.xz,3))
+                τxzc += τ.xz[ix2,iy2,iz2]
+                ε̇xzc += ε̇.xz[ix2,iy2,iz2]
+            end
+            τxzc *= 0.25; ε̇xzc *= 0.25
+            # yz shear terms: same four-entry mean, taken over the y and z offsets
+            τyzc,ε̇yzc = 0.0,0.0
+            for idz = -1:0, idy = -1:0, idx = -1:-1
+                ix2,iy2,iz2 = clamp(ix+idx,1,size(τ.yz,1)),clamp(iy+idy,1,size(τ.yz,2)),clamp(iz+idz,1,size(τ.yz,3))
+                τyzc += τ.yz[ix2,iy2,iz2]
+                ε̇yzc += ε̇.yz[ix2,iy2,iz2]
+            end
+            τyzc *= 0.25; ε̇yzc *= 0.25
+            SH = τ.xx[ix,iy,iz]*ε̇.xx[ix,iy,iz] + # shear heating: stress times strain rate, off-diagonal products counted twice
+                 τ.yy[ix,iy,iz]*ε̇.yy[ix,iy,iz] +
+                 τ.zz[ix,iy,iz]*ε̇.zz[ix,iy,iz] +
+                 2.0*τxyc*ε̇xyc + 2.0*τxzc*ε̇xzc + 2.0*τyzc*ε̇yzc
+            ρU[ix,iy,iz] += dt*(-∇qT + SH)
+        else
+            ρU[ix,iy,iz] = ρU_atm # cells failing the tests above are reset to the supplied ρU_atm value
+        end
+    end
+end
\ No newline at end of file
diff --git a/scripts3D_variational_TM/volume_fraction_kernels.jl b/scripts3D_variational_TM/volume_fraction_kernels.jl
new file mode 100644
index 00000000..ea58938d
--- /dev/null
+++ b/scripts3D_variational_TM/volume_fraction_kernels.jl
@@ -0,0 +1,58 @@
+@tiny function _kernel_compute_volume_fractions_from_level_set!(wt,Ψ,dx,dy,dz) # fill each array of wt with volfrac(cell, local Ψ values)/ω
+    ix,iy,iz = @indices
+    cell = Rect(Vec(0.0,0.0,0.0), Vec(dx,dy,dz)) # reference box of size dx × dy × dz, reused for every location
+    ω = GeometryBasics.volume(cell)
+    @inline Ψ_ax(dix,diy,diz) = 0.5*(Ψ[ix+dix,iy+diy,iz+diz]+Ψ[ix+dix+1,iy+diy,iz+diz]) # mean of two x-neighbours
+    @inline Ψ_ay(dix,diy,diz) = 0.5*(Ψ[ix+dix,iy+diy,iz+diz]+Ψ[ix+dix,iy+diy+1,iz+diz]) # mean of two y-neighbours
+    @inline Ψ_az(dix,diy,diz) = 0.5*(Ψ[ix+dix,iy+diy,iz+diz]+Ψ[ix+dix,iy+diy,iz+diz+1]) # mean of two z-neighbours
+    @inline Ψ_axy(dix,diy,diz) = 0.25*(Ψ[ix+dix  ,iy+diy  ,iz+diz+1]+Ψ[ix+dix+1,iy+diy  ,iz+diz+1]+ # mean over a 2×2 patch in x,y at z index iz+diz+1
+                                       Ψ[ix+dix  ,iy+diy+1,iz+diz+1]+Ψ[ix+dix+1,iy+diy+1,iz+diz+1])
+    @inline Ψ_axz(dix,diy,diz) = 0.25*(Ψ[ix+dix  ,iy+diy+1,iz+diz  ]+Ψ[ix+dix+1,iy+diy+1,iz+diz  ]+ # mean over a 2×2 patch in x,z at y index iy+diy+1
+                                       Ψ[ix+dix  ,iy+diy+1,iz+diz+1]+Ψ[ix+dix+1,iy+diy+1,iz+diz+1])
+    @inline Ψ_ayz(dix,diy,diz) = 0.25*(Ψ[ix+dix+1,iy+diy  ,iz+diz  ]+Ψ[ix+dix+1,iy+diy+1,iz+diz  ]+ # mean over a 2×2 patch in y,z at x index ix+dix+1
+                                       Ψ[ix+dix+1,iy+diy  ,iz+diz+1]+Ψ[ix+dix+1,iy+diy+1,iz+diz+1])
+    @inline isin(A) = checkbounds(Bool,A,ix,iy,iz) # true when (ix,iy,iz) is a valid index of A
+    # cell centers
+    @inbounds if isin(wt.c)
+        Ψs = Vec{8}(Ψ[ix,iy,iz  ],Ψ[ix+1,iy,iz  ],Ψ[ix,iy+1,iz  ],Ψ[ix+1,iy+1,iz  ], # eight values, x offset varying fastest, then y, then z
+                    Ψ[ix,iy,iz+1],Ψ[ix+1,iy,iz+1],Ψ[ix,iy+1,iz+1],Ψ[ix+1,iy+1,iz+1])
+        wt.c[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    # x faces
+    @inbounds if isin(wt.x)
+        Ψs = Vec{8}(Ψ_ax(0,0,0),Ψ_ax(1,0,0),Ψ_ax(0,1,0),Ψ_ax(1,1,0),
+                    Ψ_ax(0,0,1),Ψ_ax(1,0,1),Ψ_ax(0,1,1),Ψ_ax(1,1,1))
+        wt.x[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    # y faces
+    @inbounds if isin(wt.y)
+        Ψs = Vec{8}(Ψ_ay(0,0,0),Ψ_ay(1,0,0),Ψ_ay(0,1,0),Ψ_ay(1,1,0),
+                    Ψ_ay(0,0,1),Ψ_ay(1,0,1),Ψ_ay(0,1,1),Ψ_ay(1,1,1))
+        wt.y[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    # z faces
+    @inbounds if isin(wt.z)
+        Ψs = Vec{8}(Ψ_az(0,0,0),Ψ_az(1,0,0),Ψ_az(0,1,0),Ψ_az(1,1,0),
+                    Ψ_az(0,0,1),Ψ_az(1,0,1),Ψ_az(0,1,1),Ψ_az(1,1,1))
+        wt.z[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    # xy edges
+    @inbounds if isin(wt.xy)
+        Ψs = Vec{8}(Ψ_axy(0,0,0),Ψ_axy(1,0,0),Ψ_axy(0,1,0),Ψ_axy(1,1,0),
+                    Ψ_axy(0,0,1),Ψ_axy(1,0,1),Ψ_axy(0,1,1),Ψ_axy(1,1,1))
+        wt.xy[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    # xz edges
+    @inbounds if isin(wt.xz)
+        Ψs = Vec{8}(Ψ_axz(0,0,0),Ψ_axz(1,0,0),Ψ_axz(0,1,0),Ψ_axz(1,1,0),
+                    Ψ_axz(0,0,1),Ψ_axz(1,0,1),Ψ_axz(0,1,1),Ψ_axz(1,1,1))
+        wt.xz[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    # yz edges
+    @inbounds if isin(wt.yz)
+        Ψs = Vec{8}(Ψ_ayz(0,0,0),Ψ_ayz(1,0,0),Ψ_ayz(0,1,0),Ψ_ayz(1,1,0),
+                    Ψ_ayz(0,0,1),Ψ_ayz(1,0,1),Ψ_ayz(0,1,1),Ψ_ayz(1,1,1))
+        wt.yz[ix,iy,iz] = volfrac(cell,Ψs)/ω
+    end
+    return
+end
\ No newline at end of file
diff --git a/scripts3D_variational_TM/volume_fractions.jl b/scripts3D_variational_TM/volume_fractions.jl
new file mode 100644
index 00000000..4ef6a96d
--- /dev/null
+++ b/scripts3D_variational_TM/volume_fractions.jl
@@ -0,0 +1,128 @@
+@inline perturb(ϕ) = abs(ϕ) > 1e-20 ? ϕ : (ϕ > 0 ? 1e-20 : -1e-20) # keep ϕ unless |ϕ| ≤ 1e-20; then return ±1e-20 (an exact zero becomes -1e-20)
+
+@inline trivol(v1,v2,v3) = 0.5*abs(cross(v3-v1,v2-v1)) # half the absolute cross product of two edge vectors; assumes 2-D points with a scalar `cross` — TODO confirm
+
+function volfrac(tri,ϕ::Vec3{T})::T where T # size (via trivol) of the part of triangle `tri` where the linearly interpolated ϕ is negative
+    v1,v2,v3 = tri
+    if ϕ[1] < 0 && ϕ[2] < 0 && ϕ[3] < 0 # ---
+        return trivol(v1,v2,v3)
+    elseif ϕ[1] > 0 && ϕ[2] > 0 && ϕ[3] > 0 # +++
+        return 0.0
+    end
+    @inline vij(i,j) = tri[j]*(ϕ[i]/(ϕ[i]-ϕ[j])) - tri[i]*(ϕ[j]/(ϕ[i]-ϕ[j])) # point on edge i–j where the linear interpolant of ϕ is zero
+    v12,v13,v23 = vij(1,2),vij(1,3),vij(2,3) # NOTE(review): all three are evaluated even when an edge has no sign change — confirm a zero ϕ difference cannot occur
+    if ϕ[1] < 0
+        if ϕ[2] < 0
+            trivol(v1,v23,v13) + trivol(v1,v2,v23)  # --+
+        else
+            if ϕ[3] < 0
+                trivol(v3,v12,v23) + trivol(v3,v1,v12) # -+-
+            else
+                trivol(v1,v12,v13) # -++
+            end
+        end
+    else
+        if ϕ[2] < 0
+            if ϕ[3] < 0
+                trivol(v2,v13,v12) + trivol(v2,v3,v13) # +--
+            else
+                trivol(v12,v2,v23) # +-+
+            end
+        else
+            trivol(v13,v23,v3) # ++-
+        end 
+    end
+end
+
+function volfrac(rect::Rect2{T},ϕ::Vec4{T}) where T # negative-ϕ part of a rectangle, from its four corner values
+    or,ws = origin(rect), widths(rect)
+    v1,v2,v3,v4 = or,or+Vec(ws[1],0.0),or+ws,or+Vec(0.0,ws[2]) # corners, walking around the rectangle from the origin
+    ϕ1,ϕ2,ϕ3,ϕ4 = perturb.(ϕ) # no corner value is left exactly zero
+    return volfrac(Vec(v1,v2,v3),Vec3{T}(ϕ1,ϕ2,ϕ3)) + # split along the v1–v3 diagonal into two triangles
+           volfrac(Vec(v1,v3,v4),Vec3{T}(ϕ1,ϕ3,ϕ4))
+end
+
+@inline tetvol(v1,v2,v3,v4) = abs(det([v2-v1 v3-v1 v4-v1]))/6.0 # tetrahedron volume: |det of the three edge vectors from v1| / 6
+
+function volfrac(tet,ϕ::Vec4) # volume of the part of tetrahedron `tet` where the linearly interpolated ϕ is negative
+    v1,v2,v3,v4 = tet
+    @inline vij(i,j) = tet[j]*(ϕ[i]/(ϕ[i]-ϕ[j])) - tet[i]*(ϕ[j]/(ϕ[i]-ϕ[j])) # point on edge i–j where the linear interpolant of ϕ is zero
+    nneg = count(ϕ.<0) # number of vertices with negative ϕ selects the case below
+    if nneg == 0     # ++++
+        return 0.0
+    elseif nneg == 1 # -+++
+        if ϕ[1] < 0 # small tetrahedron cut off at the single negative vertex
+            return tetvol(v1,vij(1,2),vij(1,3),vij(1,4))
+        elseif ϕ[2] < 0
+            return tetvol(v2,vij(2,1),vij(2,3),vij(2,4))
+        elseif ϕ[3] < 0
+            return tetvol(v3,vij(3,1),vij(3,2),vij(3,4))
+        else # ϕ[4] < 0
+            return tetvol(v4,vij(4,1),vij(4,2),vij(4,3))
+        end
+    elseif nneg == 2 # --++
+        if ϕ[1] < 0 && ϕ[2] < 0 # the negative part is summed as three tetrahedra in each branch
+            return tetvol(v1      ,v2      ,vij(1,3),vij(2,4)) +
+                   tetvol(vij(2,3),v2      ,vij(1,3),vij(2,4)) +
+                   tetvol(v1      ,vij(1,4),vij(1,3),vij(2,4))
+        elseif ϕ[1] < 0 && ϕ[3] < 0
+            return tetvol(v1      ,v3      ,vij(1,4),vij(3,2)) +
+                   tetvol(vij(3,4),v3      ,vij(1,4),vij(3,2)) +
+                   tetvol(v1      ,vij(1,2),vij(1,4),vij(3,2))
+        elseif ϕ[1] < 0 && ϕ[4] < 0
+            return tetvol(v1      ,v4      ,vij(1,2),vij(4,3)) +
+                   tetvol(vij(4,2),v4      ,vij(1,2),vij(4,3)) +
+                   tetvol(v1      ,vij(1,3),vij(1,2),vij(4,3))
+        elseif ϕ[2] < 0 && ϕ[3] < 0
+            return tetvol(v3      ,v2      ,vij(3,1),vij(2,4)) +
+                   tetvol(vij(2,1),v2      ,vij(3,1),vij(2,4)) +
+                   tetvol(v3      ,vij(3,4),vij(3,1),vij(2,4))
+        elseif ϕ[2] < 0 && ϕ[4] < 0
+            return tetvol(v4      ,v2      ,vij(4,1),vij(2,3)) +
+                   tetvol(vij(2,1),v2      ,vij(4,1),vij(2,3)) +
+                   tetvol(v4      ,vij(4,3),vij(4,1),vij(2,3))
+        else # ϕ[3] < 0 && ϕ[4] < 0
+            return tetvol(v3      ,v4      ,vij(3,1),vij(4,2)) +
+                   tetvol(vij(4,1),v4      ,vij(3,1),vij(4,2)) +
+                   tetvol(v3      ,vij(3,2),vij(3,1),vij(4,2))
+        end
+    elseif nneg == 3 # ---+
+        vol_tot = tetvol(v1,v2,v3,v4) # full volume minus the small tetrahedron at the single non-negative vertex
+        if ϕ[1] >= 0
+            return vol_tot - tetvol(v1,vij(1,2),vij(1,3),vij(1,4))
+        elseif ϕ[2] >= 0
+            return vol_tot - tetvol(v2,vij(2,1),vij(2,3),vij(2,4))
+        elseif ϕ[3] >= 0
+            return vol_tot - tetvol(v3,vij(3,1),vij(3,2),vij(3,4))
+        else # ϕ[4] >= 0
+            return vol_tot - tetvol(v4,vij(4,1),vij(4,2),vij(4,3))
+        end
+    else # ----
+        return tetvol(v1,v2,v3,v4)
+    end
+end
+
+function volfrac(rect::Rect3,ϕ::Vec{8}) # volume of the negative-ϕ part of a box, from its eight corner values
+    or,ws = origin(rect), widths(rect)
+    v000,v001,v100,v101 = or                   ,or+Vec(ws[1],0.0,0.0  ),or+Vec(0.0,ws[2],0.0  ),or+Vec(ws[1],ws[2],0.0  )
+    v010,v011,v110,v111 = or+Vec(0.0,0.0,ws[3]),or+Vec(ws[1],0.0,ws[3]),or+Vec(0.0,ws[2],ws[3]),or+Vec(ws[1],ws[2],ws[3])
+    ϕ = perturb.(ϕ) # no corner value is left exactly zero; corner v_abc is paired with ϕ[1 + 4a + 2b + c]
+    return volfrac(Vec(v000,v100,v010,v001),Vec(ϕ[1],ϕ[5],ϕ[3],ϕ[2])) + # corner tetrahedron at v000
+           volfrac(Vec(v110,v100,v010,v111),Vec(ϕ[7],ϕ[5],ϕ[3],ϕ[8])) + # corner tetrahedron at v110 (v111 takes ϕ[8], not ϕ[7])
+           volfrac(Vec(v101,v100,v111,v001),Vec(ϕ[6],ϕ[5],ϕ[8],ϕ[2])) + # corner tetrahedron at v101
+           volfrac(Vec(v011,v111,v010,v001),Vec(ϕ[4],ϕ[8],ϕ[3],ϕ[2])) + # corner tetrahedron at v011
+           volfrac(Vec(v111,v100,v010,v001),Vec(ϕ[8],ϕ[5],ϕ[3],ϕ[2]))   # central tetrahedron
+end
+
+include("volume_fraction_kernels.jl")
+
+const _compute_volume_fractions_from_level_set! = _kernel_compute_volume_fractions_from_level_set!(get_device()) # kernel applied to the active device once at load time
+
+function compute_volume_fractions_from_level_set!(wt,Ψ,dx,dy,dz) # fill every array in wt from the level set Ψ
+    wt_inn = (;c=wt.c,x=inn_x(wt.x),y=inn_y(wt.y),z=inn_z(wt.z),xy=wt.xy,xz=wt.xz,yz=wt.yz) # the kernel receives inn_x/inn_y/inn_z of the face arrays, the rest unchanged
+    wait(_compute_volume_fractions_from_level_set!(wt_inn,Ψ,dx,dy,dz;ndrange=axes(Ψ))) # index space is that of Ψ; block on whatever the launch returns
+    bc_x_neumann!(0.0,wt.x) # presumably fills the x faces the kernel did not write — TODO confirm against bc_x_neumann!
+    bc_y_neumann!(0.0,wt.y)
+    bc_z_neumann!(0.0,wt.z)
+    return
+end
\ No newline at end of file
diff --git a/scripts_future_API/tm_stokes.jl b/scripts_future_API/tm_stokes.jl
new file mode 100644
index 00000000..db1b89f1
--- /dev/null
+++ b/scripts_future_API/tm_stokes.jl
@@ -0,0 +1,180 @@
+## TODOs
+# 1. How to parametrise boundary conditions for energy and ice flow for the free surface case
+# 2. How to efficiently parametrise multiphysics
+
+using FastIce
+using FastIce.Thermodynamics.EOS
+# using FastIce.Models.Thermal
+# using FastIce.Models.FullStokes.Isothermal
+# using FastIce.Models.FullStokes.IsothermalPlasticity
+# using FastIce.Models.FullStokes.Thermomechanical
+using FastIce.Models.FullStokes.ThermomechanicalWithTopo
+
+## Supported topologies
+# 1. Bounded
+# 2. Periodic
+grid = CartesianGrid( # API sketch: a 100×100×100 grid with the given origin, extent and per-axis topology
+    origin = (-0.5, -0.5, 0.0),
+    extent = ( 1.0,  1.0, 1.0),
+    size   = ( 100,  100, 100);
+    topology = (Bounded, Bounded, Bounded) # one entry per axis; the header above lists Bounded and Periodic as supported
+)
+
+free_surface_bc = PrescribedTraction(0.0, 0.0, 0.0) # all three components set to zero
+no_slip_wall_bc = PrescribedVelocity(0.0, 0.0, 0.0) # all three components set to zero
+
+adiabatic_flux_bc    = PrescribedHeatFlux(0.0)
+geothermal_flux_bc   = PrescribedHeatFlux(0.1)
+fixed_temperature_bc = PrescribedTemperature(-0.1)
+
+boundary_conditions = ( # one BoundaryConditions per physics: a positional default plus keyword overrides
+    stokes = BoundaryConditions(
+        no_slip_wall_bc; top = free_surface_bc, immersed_ice_air = free_surface_bc
+    ),
+    thermal = BoundaryConditions( # keywords are comma-separated after a single `;`, and use the `fixed_temperature_bc` defined above
+        adiabatic_flux_bc; immersed_ice_air = fixed_temperature_bc, immersed_ice_bed = geothermal_flux_bc
+    )
+)
+
+## Thermodynamics
+equation_of_state = IncompressibleEOS( # per-phase material constants (ice / water) plus a single latent heat
+    density       = (ice = 920.0 , water = 1000.0),
+    heat_capacity = (ice = 2100.0, water = 4200.0),
+    conductivity  = (ice = 2.0   , water = 1.0),
+    latent_heat   = 334e3
+)
+
+## Rheology
+rheology = IceRheology( # three pluggable parts; the trailing comments list the alternatives foreseen for each
+    stress_strain     = Glen(A = 1e-20, n = 3),               # variants: Linear(μ), Goldsby(...)
+    thermal_weakening = Arrhenius(Q = 1.0, R = 8.31),         # variants: nothing
+    melt_weakening    = PowerLawWeakening(ϕref = 0.1, n = 2), # variants: nothing
+)
+
+physics = (;equation_of_state, rheology) # named tuple with fields `equation_of_state` and `rheology`
+
+numerics = ( # solver controls read by the low-level loop below
+    tolerance              = (v = 1e-6, τ = 1e-6, p = 1e-8),
+    check_after_iterations = 100,
+    max_iterations         = 20size(grid, 3) # was `20(size(grid),3)`, which multiplies 20 by a tuple instead of an axis length
+)
+
+advection = UpwindAdvection() # variants: 
+
+mass_balance = nothing # NOTE(review): defined but not passed to the model below — confirm whether it should be
+
+model = ThermomechanicalStokesModel( # assembled from the pieces defined above
+    grid,
+    physics,
+    advection,
+    boundary_conditions,
+    numerics,
+)
+
+## Initialisation
+# Variants:
+# 1. Constant
+#   τ_ini = 0.0
+# 2. Constant per-component
+#   τ_ini = (
+#     xx = 0.0, yy = 0.0, zz = 0.0,
+#     xy = 0.0, xz = 0.0, yz = 0.0,
+# )
+# 3. Function of coordinates and time
+#   Pr_ini(x,y,z) = ρg*(zc[end] - z)
+# 4. Parametrized function
+#   Pr_ini(x,y,z,p) = p.ρg*(p.lz - z)
+# 5. Function of grid indices and time
+#   Pr_ini(ix,iy,iz,grid) = ρg*(grid[ix,iy,end][3] - grid[ix,iy,iz][3])
+# 6. Parametrized function of grid indices
+#   Pr_ini(ix,iy,iz,grid,p) = p.ρg*(p.lz - grid[ix,iy,iz][3])
+
+τ_ini = 0.0 # variant 1 from the list above: one constant for the whole field
+P_ini = 0.0
+v_ini = 0.0
+
+ρU_ini = total_energy(equation_of_state, (T=-10.0, ϕ=0.0)) # initial ρU obtained from the EOS for the state (T = -10.0, ϕ = 0.0)
+
+set!(model, τ = τ_ini, P = P_ini, v = v_ini, ρU = ρU_ini)
+
+timestepping = ( # splatted as keyword arguments into Simulation; max_Δt and cfl are also read by the loops below
+    rep_Δt     = 1.0,
+    total_time = 1000.0,
+    min_Δt     = 0.01,
+    max_Δt     = 0.5,
+    cfl        = 1/3.1,
+)
+
+simulation = Simulation(model; timestepping...)
+
+## High-level framework api
+# callbacks = CallbackSet(...)
+# run!(simulation; callbacks)
+run!(simulation) # NOTE(review): the two loops below drive the same `simulation` again — presumably alternatives, not meant to run in sequence
+
+## Low-level library-like API
+for (it, rep_Δt, current_time) in timesteps(simulation) # one pass per reporting interval rep_Δt
+    copy_double_buffers!(model)
+    target_Δt = timestepping.max_Δt
+    if !isnothing(timestepping.cfl) # optionally tighten the step with the CFL estimate
+        cfl_Δt    = estimate_Δt(model, timestepping.cfl)
+        target_Δt = min(target_Δt, cfl_Δt)
+    end
+    nsub = ceil(Int,rep_Δt/target_Δt) # number of equal substeps that cover rep_Δt exactly
+    Δt   = rep_Δt/nsub
+    isub = 0; Δt_stack = fill(Δt,nsub) # pending substeps; a failed step is replaced by two half steps
+    while !isempty(Δt_stack)
+        isub += 1
+        Δt   = pop!(Δt_stack)
+        ϵtol = Tuple(numerics.tolerance)
+        iter = 1; errs = ϵtol; finished = false; success = true # tuples are immutable, so no copy is needed
+        while !finished
+            advance_iteration!(model, Δt, current_time)
+            iter += 1
+            if iter % numerics.check_after_iterations == 0 # residuals are only evaluated every check_after_iterations iterations
+                errs = compute_residual_norm(model)
+                if !all(isfinite, errs) # a non-finite residual aborts the substep
+                    success  = false
+                    finished = true
+                else 
+                    finished = all(errs .<= ϵtol)
+                end
+            end
+            if iter > numerics.max_iterations # give up when the iteration budget is exhausted
+                success  = false
+                finished = true
+            end
+        end
+        if !success # was `if failed`, a name that is never defined; `success` is the flag set above
+            recover!(model)
+            push!(Δt_stack, 0.5Δt, 0.5Δt)
+        else
+            copy_double_buffers!(model)
+            current_time += Δt
+        end
+    end
+end
+
+## Intermediate-level API
+for (it, rep_Δt, current_time) in timesteps(simulation) # same substepping as the low-level loop, with the inner solve behind advance_timestep!
+    copy_double_buffers!(model)
+    target_Δt = timestepping.max_Δt
+    if !isnothing(timestepping.cfl) # optionally tighten the step with the CFL estimate
+        cfl_Δt    = estimate_Δt(model, timestepping.cfl)
+        target_Δt = min(target_Δt, cfl_Δt)
+    end
+    nsub = ceil(Int,rep_Δt/target_Δt) # number of equal substeps that cover rep_Δt exactly
+    Δt   = rep_Δt/nsub
+    isub = 0; Δt_stack = fill(Δt,nsub) # pending substeps
+    while !isempty(Δt_stack)
+        isub += 1
+        Δt   = pop!(Δt_stack)
+        if !advance_timestep!(model, Δt, current_time) # a false return is treated as a failed substep
+            recover!(model)
+            push!(Δt_stack, 0.5Δt, 0.5Δt) # retry as two half-size substeps
+        else
+            copy_double_buffers!(model)
+            current_time += Δt
+        end
+    end
+end
diff --git a/src/FastIce.jl b/src/FastIce.jl
index 1b20ba76..94e11cef 100644
--- a/src/FastIce.jl
+++ b/src/FastIce.jl
@@ -1,6 +1,57 @@
 module FastIce
 
-include("level_sets/level_sets.jl")
-include("geometry.jl")
+export field_array,get_device,get_backend,set_backend!
+export to_device, to_host
+
+using Preferences
+using TinyKernels
+
+const BACKEND = @load_preference("backend", "CPU") # read from the package preferences; "CPU" when unset
+
+@static if BACKEND == "CPU" # branch chosen when the module is loaded, so only the selected GPU package is imported
+    const DEVICE = CPUDevice()
+elseif BACKEND == "CUDA"
+    using CUDA; const DEVICE = CUDADevice()
+elseif BACKEND == "AMDGPU"
+    using AMDGPU; const DEVICE = AMDGPUDevice()
+elseif BACKEND == "Metal"
+    using Metal; const DEVICE = MetalDevice()
+else
+    error("unsupported backend \"$BACKEND\"")
+end
+
+@inline get_device()  = DEVICE # device object selected above
+@inline get_backend() = BACKEND # backend name string selected above
+
+function set_backend!(new_backend) # store `new_backend` as the "backend" preference; BACKEND itself only changes after a restart
+    if new_backend ∉ ("CPU", "CUDA", "AMDGPU", "Metal") # same four names the backend selection above accepts
+        throw(ArgumentError("invalid backend \"$new_backend\""))
+    end
+    @set_preferences!("backend" => new_backend)
+    @info "new backend set; restart your Julia session for this change to take effect"
+end
+
+@static if BACKEND == "CPU" # to_device / to_host are defined once, for the selected backend only
+    @inline to_device(array::AbstractArray) = array # identity on the CPU backend
+    @inline to_host(array::AbstractArray)   = array
+elseif BACKEND == "CUDA"
+    @inline to_device(array::CuArray)       = array # already a device array: returned as is
+    @inline to_device(array::AbstractArray) = CuArray(array)
+    @inline to_host(array::CuArray)         = Array(array)
+    @inline to_host(array::AbstractArray)   = array # anything that is not a CuArray is returned as is
+elseif BACKEND == "AMDGPU"
+    @inline to_device(array::ROCArray)      = array
+    @inline to_device(array::AbstractArray) = ROCArray(array)
+    @inline to_host(array::ROCArray)        = Array(array)
+    @inline to_host(array::AbstractArray)   = array
+elseif BACKEND == "Metal"
+    @inline to_device(array::MtlArray)      = array
+    @inline to_device(array::AbstractArray) = MtlArray(array)
+    @inline to_host(array::MtlArray)        = Array(array)
+    @inline to_host(array::AbstractArray)   = array
+end
+
+include("logging.jl")
+include("fields.jl")
 
 end # module
diff --git a/src/fields.jl b/src/fields.jl
new file mode 100644
index 00000000..cdfdf3b2
--- /dev/null
+++ b/src/fields.jl
@@ -0,0 +1,50 @@
+export field_array, scalar_field, vector_field, tensor_field, volfrac_field
+
+@inline field_array(::Type{T}, args...) where {T} = TinyKernels.device_array(T, DEVICE, args...) # array with element type T on the configured DEVICE; args are forwarded unchanged
+
+@inline scalar_field(::Type{T}, args...) where {T} = field_array(T, args...) # same as field_array
+
+# 2D fields
+@inline vector_field(::Type{T}, nx, ny) where {T} = ( # named tuple (x, y); each component has one extra entry along its own axis
+    x=field_array(T, nx + 1, ny),
+    y=field_array(T, nx, ny + 1)
+)
+
+@inline tensor_field(::Type{T}, nx, ny) where {T} = ( # named tuple (xx, yy, xy)
+    xx=field_array(T, nx, ny),
+    yy=field_array(T, nx, ny),
+    xy=field_array(T, nx - 1, ny - 1) # one entry fewer than the diagonal components along both axes
+)
+
+@inline volfrac_field(::Type{T}, nx, ny) where {T} = ( # named tuple (c, x, y, xy) with the same sizes as the scalar, vector and tensor-xy arrays above
+    c=field_array(T, nx, ny),
+    x=field_array(T, nx + 1, ny),
+    y=field_array(T, nx, ny + 1),
+    xy=field_array(T, nx - 1, ny - 1)
+)
+
+# 3D fields
+@inline vector_field(::Type{T}, nx, ny, nz) where {T} = ( # named tuple (x, y, z); each component has one extra entry along its own axis
+    x=field_array(T, nx + 1, ny, nz),
+    y=field_array(T, nx, ny + 1, nz),
+    z=field_array(T, nx, ny, nz + 1)
+)
+
+@inline tensor_field(::Type{T}, nx, ny, nz) where {T} = ( # named tuple (xx, yy, zz, xy, xz, yz)
+    xx=field_array(T, nx, ny, nz),
+    yy=field_array(T, nx, ny, nz),
+    zz=field_array(T, nx, ny, nz),
+    xy=field_array(T, nx - 1, ny - 1, nz - 2), # off-diagonal: one fewer along its two own axes, two fewer along the third
+    xz=field_array(T, nx - 1, ny - 2, nz - 1),
+    yz=field_array(T, nx - 2, ny - 1, nz - 1)
+)
+
+@inline volfrac_field(::Type{T}, nx, ny, nz) where {T} = ( # named tuple (c, x, y, z, xy, xz, yz) with the same sizes as the 3D scalar, vector and off-diagonal tensor arrays
+    c=field_array(T, nx, ny, nz),
+    x=field_array(T, nx + 1, ny, nz),
+    y=field_array(T, nx, ny + 1, nz),
+    z=field_array(T, nx, ny, nz + 1),
+    xy=field_array(T, nx - 1, ny - 1, nz - 2),
+    xz=field_array(T, nx - 1, ny - 2, nz - 1),
+    yz=field_array(T, nx - 2, ny - 1, nz - 1)
+)
\ No newline at end of file
diff --git a/src/level_sets/amdgpu_backend.jl b/src/level_sets/amdgpu_backend.jl
deleted file mode 100644
index d6f5abb3..00000000
--- a/src/level_sets/amdgpu_backend.jl
+++ /dev/null
@@ -1,57 +0,0 @@
-module AMDGPUBackend
-
-using AMDGPU
-using LinearAlgebra,GeometryBasics,Printf
-
-using ..LevelSets
-
-export init_level_set!,solve_eikonal!
-
-macro get_thread_idx() esc(:( begin
-    ix = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
-    iy = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
-    iz = (workgroupIdx().z - 1) * workgroupDim().z + workitemIdx().z
-    end ))
-end
-
-include("kernels.jl")
-
-"""
-    init_level_set!(ls,mask,dem,rc,dem_rc,cutoff,R)
-
-Initialise level set as a signed distance function in a narrow band around a heightmap
-
-# Arguments
-- `R` is the rotation matrix
-- `cutoff` is the distance from the heightmap within which the levelset computation is accurate
-"""
-function init_level_set!(ls,mask,dem,rc,dem_rc,cutoff,R)
-    nthreads = (8,8,4)
-    wait(@roc groupsize=nthreads gridsize=size(ls) _init_level_set!(ls,mask,dem,rc,dem_rc,cutoff,R))
-    return
-end
-
-
-"""
-    solve_eikonal!(ls,dldt,mask,dx,dy,dz)
-
-Solve eikonal equation to reinitialise the level specified by the approximation `ls`
-"""
-function solve_eikonal!(ls,dldt,mask,dx,dy,dz;ϵtol = 1e-8)
-    dt = 0.5min(dx,dy,dz)
-    nthreads = (8,8,8)
-    minsteps,maxsteps = extrema(size(ls))
-    ncheck = cld(minsteps,4)
-    for istep in 1:5maxsteps
-        wait(@roc groupsize=nthreads gridsize=size(ls) _update_dldt!(dldt,ls,mask,dx,dy,dz))
-        wait(@roc groupsize=nthreads gridsize=size(ls) _update_ls!(ls,dldt,dt))
-        if istep % ncheck == 0
-            err = maximum(abs.(dldt))
-            @debug @sprintf("iteration # %d , error = %1.3e\n",istep,err)
-            if err < ϵtol break end
-        end
-    end
-    return
-end
-
-end # module AMDGPUBackend
\ No newline at end of file
diff --git a/src/level_sets/cuda_backend.jl b/src/level_sets/cuda_backend.jl
deleted file mode 100644
index 19a05d95..00000000
--- a/src/level_sets/cuda_backend.jl
+++ /dev/null
@@ -1,60 +0,0 @@
-module CUDABackend
-
-using CUDA
-using LinearAlgebra,GeometryBasics,Printf
-
-using ..LevelSets
-
-export init_level_set!,solve_eikonal!
-
-macro get_thread_idx() esc(:( begin
-    ix = (blockIdx().x-1)*blockDim().x + threadIdx().x
-    iy = (blockIdx().y-1)*blockDim().y + threadIdx().y
-    iz = (blockIdx().z-1)*blockDim().z + threadIdx().z
-    end ))
-end
-
-include("kernels.jl")
-
-"""
-    init_level_set!(ls,mask,dem,rc,dem_rc,cutoff,R)
-
-Initialise level set as a signed distance function in a narrow band around a heightmap
-
-# Arguments
-- `R` is the rotation matrix
-- `cutoff` is the distance from the heightmap within which the levelset computation is accurate
-"""
-function init_level_set!(ls,mask,dem,rc,dem_rc,cutoff,R)
-    nthreads = (8,8,4)
-    nblocks  = cld.(size(ls),nthreads)
-    CUDA.@sync @cuda threads=nthreads blocks=nblocks _init_level_set!(ls,mask,dem,rc,dem_rc,cutoff,R)
-    return
-end
-
-
-"""
-    solve_eikonal!(ls,dldt,mask,dx,dy,dz)
-
-Solve eikonal equation to reinitialise the level specified by the approximation `ls`
-"""
-function solve_eikonal!(ls,dldt,mask,dx,dy,dz;ϵtol = 1e-8)
-    dt = 0.5min(dx,dy,dz)
-    nthreads = (8,8,8)
-    nblocks  = cld.(size(ls),nthreads)
-    minsteps,maxsteps = extrema(size(ls))
-    ncheck = cld(minsteps,4)
-    for istep in 1:maxsteps
-    CUDA.@sync @cuda threads=nthreads blocks=nblocks _update_dldt!(dldt,ls,mask,dx,dy,dz)
-        CUDA.@sync @cuda threads=nthreads blocks=nblocks _update_ls!(ls,dldt,dt)
-        if istep % ncheck == 0
-            err = maximum(abs.(dldt))
-            @printf("iteration # %d , error = %1.3e\n",istep,err)
-            @debug @sprintf("iteration # %d , error = %1.3e\n",istep,err)
-            if err < ϵtol break end
-        end
-    end
-    return
-end
-
-end # module CUDABackend
\ No newline at end of file
diff --git a/src/level_sets/kernels.jl b/src/level_sets/kernels.jl
deleted file mode 100644
index f9c25d3b..00000000
--- a/src/level_sets/kernels.jl
+++ /dev/null
@@ -1,39 +0,0 @@
-function _init_level_set!(ls,mask,dem,rc,dem_rc,cutoff,R)
-    @get_thread_idx()
-    if !(ix ∈ axes(ls,1)) || !(iy ∈ axes(ls,2)) || !(iz ∈ axes(ls,3)) return end
-    x,y,z = rc[1][ix],rc[2][iy],rc[3][iz]
-    P = R*Point3(x,y,z)
-    ud,sgn = LevelSets.sd_dem(P,cutoff,dem,dem_rc)
-    ls[ix,iy,iz]   = ud*sgn
-    mask[ix,iy,iz] = ud < cutoff
-    return
-end
-
-function _update_dldt!(dldt,ls,mask,dx,dy,dz)
-    @get_thread_idx()
-    if !(ix ∈ axes(ls,1)) || !(iy ∈ axes(ls,2)) || !(iz ∈ axes(ls,3)) return end
-    if mask[ix,iy,iz] 
-        dldt[ix,iy,iz] = 0
-        return
-    end
-    # eikonal solve
-    dLdx_m = if ix > 1;          (ls[ix  ,iy  ,iz  ]-ls[ix-1,iy  ,iz  ])/dx; else 0.0 end
-    dLdy_m = if iy > 1;          (ls[ix  ,iy  ,iz  ]-ls[ix  ,iy-1,iz  ])/dy; else 0.0 end
-    dLdz_m = if iz > 1;          (ls[ix  ,iy  ,iz  ]-ls[ix  ,iy  ,iz-1])/dz; else 0.0 end
-    dLdx_p = if ix < size(ls,1); (ls[ix+1,iy  ,iz  ]-ls[ix  ,iy  ,iz  ])/dx; else 0.0 end
-    dLdy_p = if iy < size(ls,2); (ls[ix  ,iy+1,iz  ]-ls[ix  ,iy  ,iz  ])/dy; else 0.0 end
-    dLdz_p = if iz < size(ls,3); (ls[ix  ,iy  ,iz+1]-ls[ix  ,iy  ,iz  ])/dz; else 0.0 end
-    dLdx2 = ls[ix,iy,iz] > 0 ? max(max(dLdx_m,0)^2,min(dLdx_p,0)^2) : max(min(dLdx_m,0)^2,max(dLdx_p,0)^2)
-    dLdy2 = ls[ix,iy,iz] > 0 ? max(max(dLdy_m,0)^2,min(dLdy_p,0)^2) : max(min(dLdy_m,0)^2,max(dLdy_p,0)^2)
-    dLdz2 = ls[ix,iy,iz] > 0 ? max(max(dLdz_m,0)^2,min(dLdz_p,0)^2) : max(min(dLdz_m,0)^2,max(dLdz_p,0)^2)
-    dldt[ix,iy,iz] = sign(ls[ix,iy,iz])*(1 - sqrt(dLdx2 + dLdy2 + dLdz2))
-    return
-end
-
-function _update_ls!(ls,dldt,dt)
-    @get_thread_idx()
-    if ix ∈ axes(ls,1) && iy ∈ axes(ls,2) && iz ∈ axes(ls,3)
-        ls[ix,iy,iz] += dt*dldt[ix,iy,iz]
-    end
-    return
-end
\ No newline at end of file
diff --git a/src/level_sets/level_sets.jl b/src/level_sets/level_sets.jl
deleted file mode 100644
index 47e5d2b9..00000000
--- a/src/level_sets/level_sets.jl
+++ /dev/null
@@ -1,8 +0,0 @@
-module LevelSets
-
-include("signed_distances.jl")
-include("cuda_backend.jl")
-include("amdgpu_backend.jl")
-# include("ps_backend.jl")
-
-end # module LevelSets
diff --git a/src/level_sets/ps_backend.jl b/src/level_sets/ps_backend.jl
deleted file mode 100644
index c1a878ac..00000000
--- a/src/level_sets/ps_backend.jl
+++ /dev/null
@@ -1,53 +0,0 @@
-module PSBackend
-
-using ParallelStencil
-@init_parallel_stencil(CUDA,Float64,3)
-
-using ImplicitGlobalGrid
-
-using LinearAlgebra,GeometryBasics,Printf
-
-using ..LevelSets
-
-export init_level_set!,solve_eikonal!
-
-
-include("ps_kernels.jl")
-
-"""
-    init_level_set!(ls,mask,dem,rc,dem_rc,cutoff,R)
-
-Initialise level set as a signed distance function in a narrow band around a heightmap
-
-# Arguments
-- `R` is the rotation matrix
-- `cutoff` is the distance from the heightmap within which the levelset computation is accurate
-"""
-function init_level_set!(ls,mask,dem,rc,dem_rc,cutoff,R)
-    @parallel (1:size(ls,1),1:size(ls,2),1:size(ls,3)) _init_level_set!(ls,mask,dem,rc,dem_rc,cutoff,R)
-    return
-end
-
-
-"""
-    solve_eikonal!(ls,dldt,mask,dx,dy,dz)
-
-Solve eikonal equation to reinitialise the level specified by the approximation `ls`
-"""
-function solve_eikonal!(ls,dldt,mask,dx,dy,dz;ϵtol = 1e-8)
-    dt = 0.5min(dx,dy,dz)
-    minsteps,maxsteps = extrema(size(ls))
-    ncheck = cld(minsteps,4)
-    for istep in 1:maxsteps
-        @parallel _update_dldt!(dldt,ls,mask,dx,dy,dz)
-        @parallel _update_ls!(ls,dldt,dt)
-        if istep % ncheck == 0
-            err = maximum(abs.(dldt))
-            @debug @sprintf("iteration # %d , error = %1.3e\n",istep,err)
-            if err < ϵtol break end
-        end
-    end
-    return
-end
-
-end # module PSBackend
\ No newline at end of file
diff --git a/src/level_sets/ps_kernels.jl b/src/level_sets/ps_kernels.jl
deleted file mode 100644
index ef4384f3..00000000
--- a/src/level_sets/ps_kernels.jl
+++ /dev/null
@@ -1,37 +0,0 @@
-@parallel_indices (ix,iy,iz) function _init_level_set!(ls,mask,dem,rc,dem_rc,cutoff,R)
-    if ix ∈ axes(ls,1) && iy ∈ axes(ls,2) && iz ∈ axes(ls,3)
-        P = R*Point3(getindex.(rc,(ix,iy,iz))...)
-        ud,sgn = LevelSets.sd_dem(P,cutoff,dem,dem_rc)
-        ls[ix,iy,iz]   = ud*sgn
-        mask[ix,iy,iz] = ud < cutoff
-    end
-    return
-end
-
-@parallel_indices (ix,iy,iz) function _update_dldt!(dldt,ls,mask,dx,dy,dz)
-    if ix ∈ axes(ls,1) && iy ∈ axes(ls,2) && iz ∈ axes(ls,3)
-        if mask[ix,iy,iz] 
-            dldt[ix,iy,iz] = 0
-        else
-            # eikonal solve
-            dLdx_m = if ix > 1;          (ls[ix  ,iy  ,iz  ]-ls[ix-1,iy  ,iz  ])/dx; else 0.0 end
-            dLdy_m = if iy > 1;          (ls[ix  ,iy  ,iz  ]-ls[ix  ,iy-1,iz  ])/dy; else 0.0 end
-            dLdz_m = if iz > 1;          (ls[ix  ,iy  ,iz  ]-ls[ix  ,iy  ,iz-1])/dz; else 0.0 end
-            dLdx_p = if ix < size(ls,1); (ls[ix+1,iy  ,iz  ]-ls[ix  ,iy  ,iz  ])/dx; else 0.0 end
-            dLdy_p = if iy < size(ls,2); (ls[ix  ,iy+1,iz  ]-ls[ix  ,iy  ,iz  ])/dy; else 0.0 end
-            dLdz_p = if iz < size(ls,3); (ls[ix  ,iy  ,iz+1]-ls[ix  ,iy  ,iz  ])/dz; else 0.0 end
-            dLdx2 = ls[ix,iy,iz] > 0 ? max(max(dLdx_m,0)^2,min(dLdx_p,0)^2) : max(min(dLdx_m,0)^2,max(dLdx_p,0)^2)
-            dLdy2 = ls[ix,iy,iz] > 0 ? max(max(dLdy_m,0)^2,min(dLdy_p,0)^2) : max(min(dLdy_m,0)^2,max(dLdy_p,0)^2)
-            dLdz2 = ls[ix,iy,iz] > 0 ? max(max(dLdz_m,0)^2,min(dLdz_p,0)^2) : max(min(dLdz_m,0)^2,max(dLdz_p,0)^2)
-            dldt[ix,iy,iz] = sign(ls[ix,iy,iz])*(1 - sqrt(dLdx2 + dLdy2 + dLdz2))
-        end
-    end
-    return
-end
-
-@parallel_indices (ix,iy,iz) function _update_ls!(ls,dldt,dt)
-    if ix ∈ axes(ls,1) && iy ∈ axes(ls,2) && iz ∈ axes(ls,3)
-        ls[ix,iy,iz] += dt*dldt[ix,iy,iz]
-    end
-    return
-end
\ No newline at end of file
diff --git a/src/logging.jl b/src/logging.jl
new file mode 100644
index 00000000..f3b7eebb
--- /dev/null
+++ b/src/logging.jl
@@ -0,0 +1,24 @@
+module Logging # NOTE(review): shares its name with the Logging package imported below — confirm the import resolves as intended
+
+export MPILogger
+
+import Logging: AbstractLogger, handle_message, shouldlog, min_enabled_level # imported by name so the methods below extend them
+import MPI
+
+struct MPILogger{B<:AbstractLogger} <: AbstractLogger # wraps `base_logger` so that only one rank of `comm` emits records
+    rank::Int64 # rank within `comm` that is allowed to log
+    comm::MPI.Comm
+    base_logger::B
+end
+
+function handle_message(l::MPILogger,args...;kwargs...) # forward the record to base_logger on the chosen rank, do nothing elsewhere
+    if MPI.Comm_rank(l.comm) == l.rank
+        handle_message(l.base_logger,args...;kwargs...)
+    end
+end
+
+shouldlog(l::MPILogger,args...) = (MPI.Comm_rank(l.comm) == l.rank) && shouldlog(l.base_logger,args...) # false on every other rank
+
+min_enabled_level(l::MPILogger) = min_enabled_level(l.base_logger) # delegated to the wrapped logger
+
+end
\ No newline at end of file
diff --git a/startup_ault.sh b/startup_ault.sh
deleted file mode 100755
index 43ab9e57..00000000
--- a/startup_ault.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-source ./scripts3D_amdgpu/setenv_ault.sh
-
-julian --project -e 'using Pkg; Pkg.resolve()'
-
-julian --project -e 'using Pkg; pkg"add https://github.com/luraess/ImplicitGlobalGrid.jl#lr/amdgpu-0.4.x-support";'
-
-julian --project -e 'using MPIPreferences; MPIPreferences.use_system_binary()'
-
-julian --project -e 'using AMDGPU; AMDGPU.versioninfo()'