From 5ab925a05fe1f4f9eea64ff203f8e17e7630ac0e Mon Sep 17 00:00:00 2001
From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com>
Date: Tue, 16 May 2023 15:53:30 +0200
Subject: [PATCH] Add function barrier to fix HG shock capturing on macOS ARM
 (#1462)

* Add function barrier to fix HG shock capturing on macOS ARM

* Implement suggestions

* Add comment

* rename to calc_indicator_hennemann_gassner!

---------

Co-authored-by: Hendrik Ranocha <mail@ranocha.de>
Co-authored-by: Hendrik Ranocha <ranocha@users.noreply.github.com>
---
 src/solvers/dgsem_tree/indicators.jl    |  34 +++++++
 src/solvers/dgsem_tree/indicators_1d.jl | 117 ++++++++++-------------
 src/solvers/dgsem_tree/indicators_2d.jl | 120 +++++++++++-------------
 src/solvers/dgsem_tree/indicators_3d.jl | 119 ++++++++++-------------
 4 files changed, 190 insertions(+), 200 deletions(-)

diff --git a/src/solvers/dgsem_tree/indicators.jl b/src/solvers/dgsem_tree/indicators.jl
index 0cf099d95f2..30d3b2c0448 100644
--- a/src/solvers/dgsem_tree/indicators.jl
+++ b/src/solvers/dgsem_tree/indicators.jl
@@ -102,6 +102,40 @@ function Base.show(io::IO, ::MIME"text/plain", indicator::IndicatorHennemannGass
 end
 
 
+function (indicator_hg::IndicatorHennemannGassner)(u, mesh, equations, dg::DGSEM, cache;
+                                                   kwargs...)
+  @unpack alpha_smooth = indicator_hg
+  @unpack alpha, alpha_tmp = indicator_hg.cache
+  # TODO: Taal refactor, when to `resize!` stuff changed possibly by AMR?
+  #       Shall we implement `resize!(semi::AbstractSemidiscretization, new_size)`
+  #       or just `resize!` whenever we call the relevant methods as we do now?
+  resize!(alpha, nelements(dg, cache))
+  if alpha_smooth
+    resize!(alpha_tmp, nelements(dg, cache))
+  end
+
+  # magic parameters
+  threshold = 0.5 * 10^(-1.8 * (nnodes(dg))^0.25)
+  parameter_s = log((1 - 0.0001) / 0.0001)
+
+  @threaded for element in eachelement(dg, cache)
+    # This is dispatched by mesh dimension.
+    # Use this function barrier and unpack inside to avoid passing closures to
+    # Polyester.jl with `@batch` (`@threaded`).
+    # Otherwise, `@threaded` does not work here with Julia ARM on macOS.
+    # See https://github.com/JuliaSIMD/Polyester.jl/issues/88.
+    calc_indicator_hennemann_gassner!(
+      indicator_hg, threshold, parameter_s, u,
+      element, mesh, equations, dg, cache)
+  end
+
+  if alpha_smooth
+    apply_smoothing!(mesh, alpha, alpha_tmp, dg, cache)
+  end
+
+  return alpha
+end
+
 
 """
     IndicatorLöhner (equivalent to IndicatorLoehner)
diff --git a/src/solvers/dgsem_tree/indicators_1d.jl b/src/solvers/dgsem_tree/indicators_1d.jl
index b2bbb282725..c1a88161245 100644
--- a/src/solvers/dgsem_tree/indicators_1d.jl
+++ b/src/solvers/dgsem_tree/indicators_1d.jl
@@ -24,84 +24,69 @@ function create_cache(typ::Type{IndicatorHennemannGassner}, mesh, equations::Abs
 end
 
 
-function (indicator_hg::IndicatorHennemannGassner)(u, mesh::Union{TreeMesh{1}, StructuredMesh{1}},
-                                                   equations, dg::DGSEM, cache;
-                                                   kwargs...)
+# Use this function barrier and unpack inside to avoid passing closures to Polyester.jl
+# with @batch (@threaded).
+# Otherwise, @threaded does not work here with Julia ARM on macOS.
+# See https://github.com/JuliaSIMD/Polyester.jl/issues/88.
+@inline function calc_indicator_hennemann_gassner!(indicator_hg, threshold, parameter_s, u,
+                                                   element, mesh::AbstractMesh{1},
+                                                   equations, dg, cache)
   @unpack alpha_max, alpha_min, alpha_smooth, variable = indicator_hg
   @unpack alpha, alpha_tmp, indicator_threaded, modal_threaded = indicator_hg.cache
-  # TODO: Taal refactor, when to `resize!` stuff changed possibly by AMR?
-  #       Shall we implement `resize!(semi::AbstractSemidiscretization, new_size)`
-  #       or just `resize!` whenever we call the relevant methods as we do now?
-  resize!(alpha, nelements(dg, cache))
-  if alpha_smooth
-    resize!(alpha_tmp, nelements(dg, cache))
-  end
-
-  # magic parameters
-  threshold = 0.5 * 10^(-1.8 * (nnodes(dg))^0.25)
-  parameter_s = log((1 - 0.0001)/0.0001)
 
-  @threaded for element in eachelement(dg, cache)
-    indicator = indicator_threaded[Threads.threadid()]
-    modal     = modal_threaded[Threads.threadid()]
+  indicator = indicator_threaded[Threads.threadid()]
+  modal     = modal_threaded[Threads.threadid()]
 
-    # Calculate indicator variables at Gauss-Lobatto nodes
-    for i in eachnode(dg)
-      u_local = get_node_vars(u, equations, dg, i, element)
-      indicator[i] = indicator_hg.variable(u_local, equations)
-    end
-
-    # Convert to modal representation
-    multiply_scalar_dimensionwise!(modal, dg.basis.inverse_vandermonde_legendre, indicator)
-
-    # Calculate total energies for all modes, without highest, without two highest
-    total_energy = zero(eltype(modal))
-    for i in 1:nnodes(dg)
-      total_energy += modal[i]^2
-    end
-    total_energy_clip1 = zero(eltype(modal))
-    for i in 1:(nnodes(dg)-1)
-      total_energy_clip1 += modal[i]^2
-    end
-    total_energy_clip2 = zero(eltype(modal))
-    for i in 1:(nnodes(dg)-2)
-      total_energy_clip2 += modal[i]^2
-    end
+  # Calculate indicator variables at Gauss-Lobatto nodes
+  for i in eachnode(dg)
+    u_local = get_node_vars(u, equations, dg, i, element)
+    indicator[i] = indicator_hg.variable(u_local, equations)
+  end
 
-    # Calculate energy in higher modes
-    if !(iszero(total_energy))
-      energy_frac_1 = (total_energy - total_energy_clip1) / total_energy
-    else
-      energy_frac_1 = zero(total_energy)
-    end
-    if !(iszero(total_energy_clip1))
-      energy_frac_2 = (total_energy_clip1 - total_energy_clip2) / total_energy_clip1
-    else
-      energy_frac_2 = zero(total_energy_clip1)
-    end
-    energy = max(energy_frac_1, energy_frac_2)
+  # Convert to modal representation
+  multiply_scalar_dimensionwise!(modal, dg.basis.inverse_vandermonde_legendre, indicator)
 
-    alpha_element = 1 / (1 + exp(-parameter_s / threshold * (energy - threshold)))
+  # Calculate total energies for all modes, without highest, without two highest
+  total_energy = zero(eltype(modal))
+  for i in 1:nnodes(dg)
+    total_energy += modal[i]^2
+  end
+  total_energy_clip1 = zero(eltype(modal))
+  for i in 1:(nnodes(dg)-1)
+    total_energy_clip1 += modal[i]^2
+  end
+  total_energy_clip2 = zero(eltype(modal))
+  for i in 1:(nnodes(dg)-2)
+    total_energy_clip2 += modal[i]^2
+  end
 
-    # Take care of the case close to pure DG
-    if alpha_element < alpha_min
-      alpha_element = zero(alpha_element)
-    end
+  # Calculate energy in higher modes
+  if !(iszero(total_energy))
+    energy_frac_1 = (total_energy - total_energy_clip1) / total_energy
+  else
+    energy_frac_1 = zero(total_energy)
+  end
+  if !(iszero(total_energy_clip1))
+    energy_frac_2 = (total_energy_clip1 - total_energy_clip2) / total_energy_clip1
+  else
+    energy_frac_2 = zero(total_energy_clip1)
+  end
+  energy = max(energy_frac_1, energy_frac_2)
 
-    # Take care of the case close to pure FV
-    if alpha_element > 1 - alpha_min
-      alpha_element = one(alpha_element)
-    end
+  alpha_element = 1 / (1 + exp(-parameter_s / threshold * (energy - threshold)))
 
-    # Clip the maximum amount of FV allowed
-    alpha[element] = min(alpha_max, alpha_element)
+  # Take care of the case close to pure DG
+  if alpha_element < alpha_min
+    alpha_element = zero(alpha_element)
   end
 
-  if alpha_smooth
-    apply_smoothing!(mesh, alpha, alpha_tmp, dg, cache)
+  # Take care of the case close to pure FV
+  if alpha_element > 1 - alpha_min
+    alpha_element = one(alpha_element)
   end
 
-  return alpha
+  # Clip the maximum amount of FV allowed
+  alpha[element] = min(alpha_max, alpha_element)
 end
 
 # Diffuse alpha values by setting each alpha to at least 50% of neighboring elements' alpha
@@ -411,4 +396,4 @@ function (indicator_ann::IndicatorNeuralNetwork{NeuralNetworkRayHesthaven})(
   return alpha
 end
 
-end # @muladd
\ No newline at end of file
+end # @muladd
diff --git a/src/solvers/dgsem_tree/indicators_2d.jl b/src/solvers/dgsem_tree/indicators_2d.jl
index fe1c5908152..eb08657563b 100644
--- a/src/solvers/dgsem_tree/indicators_2d.jl
+++ b/src/solvers/dgsem_tree/indicators_2d.jl
@@ -25,85 +25,71 @@ function create_cache(typ::Type{IndicatorHennemannGassner}, mesh, equations::Abs
 end
 
 
-function (indicator_hg::IndicatorHennemannGassner)(u::AbstractArray{<:Any,4},
-                                                   mesh, equations, dg::DGSEM, cache;
-                                                   kwargs...)
+# Use this function barrier and unpack inside to avoid passing closures to Polyester.jl
+# with @batch (@threaded).
+# Otherwise, @threaded does not work here with Julia ARM on macOS.
+# See https://github.com/JuliaSIMD/Polyester.jl/issues/88.
+@inline function calc_indicator_hennemann_gassner!(indicator_hg, threshold, parameter_s, u,
+                                                   element, mesh::AbstractMesh{2},
+                                                   equations, dg, cache)
   @unpack alpha_max, alpha_min, alpha_smooth, variable = indicator_hg
-  @unpack alpha, alpha_tmp, indicator_threaded, modal_threaded, modal_tmp1_threaded = indicator_hg.cache
-  # TODO: Taal refactor, when to `resize!` stuff changed possibly by AMR?
-  #       Shall we implement `resize!(semi::AbstractSemidiscretization, new_size)`
-  #       or just `resize!` whenever we call the relevant methods as we do now?
-  resize!(alpha, nelements(dg, cache))
-  if alpha_smooth
-    resize!(alpha_tmp, nelements(dg, cache))
-  end
-
-  # magic parameters
-  threshold = 0.5 * 10^(-1.8 * (nnodes(dg))^0.25)
-  parameter_s = log((1 - 0.0001)/0.0001)
+  @unpack alpha, alpha_tmp, indicator_threaded, modal_threaded,
+          modal_tmp1_threaded = indicator_hg.cache
 
-  @threaded for element in eachelement(dg, cache)
-    indicator  = indicator_threaded[Threads.threadid()]
-    modal      = modal_threaded[Threads.threadid()]
-    modal_tmp1 = modal_tmp1_threaded[Threads.threadid()]
+  indicator  = indicator_threaded[Threads.threadid()]
+  modal      = modal_threaded[Threads.threadid()]
+  modal_tmp1 = modal_tmp1_threaded[Threads.threadid()]
 
-    # Calculate indicator variables at Gauss-Lobatto nodes
-    for j in eachnode(dg), i in eachnode(dg)
-      u_local = get_node_vars(u, equations, dg, i, j, element)
-      indicator[i, j] = indicator_hg.variable(u_local, equations)
-    end
-
-    # Convert to modal representation
-    multiply_scalar_dimensionwise!(modal, dg.basis.inverse_vandermonde_legendre, indicator, modal_tmp1)
-
-    # Calculate total energies for all modes, without highest, without two highest
-    total_energy = zero(eltype(modal))
-    for j in 1:nnodes(dg), i in 1:nnodes(dg)
-      total_energy += modal[i, j]^2
-    end
-    total_energy_clip1 = zero(eltype(modal))
-    for j in 1:(nnodes(dg)-1), i in 1:(nnodes(dg)-1)
-      total_energy_clip1 += modal[i, j]^2
-    end
-    total_energy_clip2 = zero(eltype(modal))
-    for j in 1:(nnodes(dg)-2), i in 1:(nnodes(dg)-2)
-      total_energy_clip2 += modal[i, j]^2
-    end
+  # Calculate indicator variables at Gauss-Lobatto nodes
+  for j in eachnode(dg), i in eachnode(dg)
+    u_local = get_node_vars(u, equations, dg, i, j, element)
+    indicator[i, j] = indicator_hg.variable(u_local, equations)
+  end
 
-    # Calculate energy in higher modes
-    if !(iszero(total_energy))
-      energy_frac_1 = (total_energy - total_energy_clip1) / total_energy
-    else
-      energy_frac_1 = zero(total_energy)
-    end
-    if !(iszero(total_energy_clip1))
-      energy_frac_2 = (total_energy_clip1 - total_energy_clip2) / total_energy_clip1
-    else
-      energy_frac_2 = zero(total_energy_clip1)
-    end
-    energy = max(energy_frac_1, energy_frac_2)
+  # Convert to modal representation
+  multiply_scalar_dimensionwise!(modal, dg.basis.inverse_vandermonde_legendre, indicator, modal_tmp1)
 
-    alpha_element = 1 / (1 + exp(-parameter_s / threshold * (energy - threshold)))
+  # Calculate total energies for all modes, without highest, without two highest
+  total_energy = zero(eltype(modal))
+  for j in 1:nnodes(dg), i in 1:nnodes(dg)
+    total_energy += modal[i, j]^2
+  end
+  total_energy_clip1 = zero(eltype(modal))
+  for j in 1:(nnodes(dg)-1), i in 1:(nnodes(dg)-1)
+    total_energy_clip1 += modal[i, j]^2
+  end
+  total_energy_clip2 = zero(eltype(modal))
+  for j in 1:(nnodes(dg)-2), i in 1:(nnodes(dg)-2)
+    total_energy_clip2 += modal[i, j]^2
+  end
 
-    # Take care of the case close to pure DG
-    if alpha_element < alpha_min
-      alpha_element = zero(alpha_element)
-    end
+  # Calculate energy in higher modes
+  if !(iszero(total_energy))
+    energy_frac_1 = (total_energy - total_energy_clip1) / total_energy
+  else
+    energy_frac_1 = zero(total_energy)
+  end
+  if !(iszero(total_energy_clip1))
+    energy_frac_2 = (total_energy_clip1 - total_energy_clip2) / total_energy_clip1
+  else
+    energy_frac_2 = zero(total_energy_clip1)
+  end
+  energy = max(energy_frac_1, energy_frac_2)
 
-    # Take care of the case close to pure FV
-    if alpha_element > 1 - alpha_min
-      alpha_element = one(alpha_element)
-    end
+  alpha_element = 1 / (1 + exp(-parameter_s / threshold * (energy - threshold)))
 
-    # Clip the maximum amount of FV allowed
-    alpha[element] = min(alpha_max, alpha_element)
+  # Take care of the case close to pure DG
+  if alpha_element < alpha_min
+    alpha_element = zero(alpha_element)
   end
 
-  if alpha_smooth
-    apply_smoothing!(mesh, alpha, alpha_tmp, dg, cache)
+  # Take care of the case close to pure FV
+  if alpha_element > 1 - alpha_min
+    alpha_element = one(alpha_element)
   end
 
-  return alpha
+  # Clip the maximum amount of FV allowed
+  alpha[element] = min(alpha_max, alpha_element)
 end
 
 
diff --git a/src/solvers/dgsem_tree/indicators_3d.jl b/src/solvers/dgsem_tree/indicators_3d.jl
index abb4b061aad..c1e7aee886a 100644
--- a/src/solvers/dgsem_tree/indicators_3d.jl
+++ b/src/solvers/dgsem_tree/indicators_3d.jl
@@ -26,87 +26,72 @@ function create_cache(typ::Type{IndicatorHennemannGassner}, mesh, equations::Abs
 end
 
 
-function (indicator_hg::IndicatorHennemannGassner)(u::AbstractArray{<:Any,5},
-                                                   mesh, equations, dg::DGSEM, cache;
-                                                   kwargs...)
+# Use this function barrier and unpack inside to avoid passing closures to Polyester.jl
+# with @batch (@threaded).
+# Otherwise, @threaded does not work here with Julia ARM on macOS.
+# See https://github.com/JuliaSIMD/Polyester.jl/issues/88.
+@inline function calc_indicator_hennemann_gassner!(indicator_hg, threshold, parameter_s, u,
+                                                   element, mesh::AbstractMesh{3},
+                                                   equations, dg, cache)
   @unpack alpha_max, alpha_min, alpha_smooth, variable = indicator_hg
   @unpack alpha, alpha_tmp, indicator_threaded, modal_threaded,
           modal_tmp1_threaded, modal_tmp2_threaded = indicator_hg.cache
-  # TODO: Taal refactor, when to `resize!` stuff changed possibly by AMR?
-  #       Shall we implement `resize!(semi::AbstractSemidiscretization, new_size)`
-  #       or just `resize!` whenever we call the relevant methods as we do now?
-  resize!(alpha, nelements(dg, cache))
-  if alpha_smooth
-    resize!(alpha_tmp, nelements(dg, cache))
-  end
-
-  # magic parameters
-  threshold = 0.5 * 10^(-1.8 * (nnodes(dg))^0.25)
-  parameter_s = log((1 - 0.0001)/0.0001)
-
-  @threaded for element in eachelement(dg, cache)
-    indicator  = indicator_threaded[Threads.threadid()]
-    modal      = modal_threaded[Threads.threadid()]
-    modal_tmp1 = modal_tmp1_threaded[Threads.threadid()]
-    modal_tmp2 = modal_tmp2_threaded[Threads.threadid()]
-
-    # Calculate indicator variables at Gauss-Lobatto nodes
-    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
-      u_local = get_node_vars(u, equations, dg, i, j, k, element)
-      indicator[i, j, k] = indicator_hg.variable(u_local, equations)
-    end
 
-    # Convert to modal representation
-    multiply_scalar_dimensionwise!(modal, dg.basis.inverse_vandermonde_legendre, indicator, modal_tmp1, modal_tmp2)
+  indicator  = indicator_threaded[Threads.threadid()]
+  modal      = modal_threaded[Threads.threadid()]
+  modal_tmp1 = modal_tmp1_threaded[Threads.threadid()]
+  modal_tmp2 = modal_tmp2_threaded[Threads.threadid()]
 
-    # Calculate total energies for all modes, without highest, without two highest
-    total_energy = zero(eltype(modal))
-    for k in 1:nnodes(dg), j in 1:nnodes(dg), i in 1:nnodes(dg)
-      total_energy += modal[i, j, k]^2
-    end
-    total_energy_clip1 = zero(eltype(modal))
-    for k in 1:(nnodes(dg)-1), j in 1:(nnodes(dg)-1), i in 1:(nnodes(dg)-1)
-      total_energy_clip1 += modal[i, j, k]^2
-    end
-    total_energy_clip2 = zero(eltype(modal))
-    for k in 1:(nnodes(dg)-2), j in 1:(nnodes(dg)-2), i in 1:(nnodes(dg)-2)
-      total_energy_clip2 += modal[i, j, k]^2
-    end
+  # Calculate indicator variables at Gauss-Lobatto nodes
+  for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+    u_local = get_node_vars(u, equations, dg, i, j, k, element)
+    indicator[i, j, k] = indicator_hg.variable(u_local, equations)
+  end
 
-    # Calculate energy in higher modes
-    if !(iszero(total_energy))
-      energy_frac_1 = (total_energy - total_energy_clip1) / total_energy
-    else
-      energy_frac_1 = zero(total_energy)
-    end
-    if !(iszero(total_energy_clip1))
-      energy_frac_2 = (total_energy_clip1 - total_energy_clip2) / total_energy_clip1
-    else
-      energy_frac_2 = zero(total_energy_clip1)
-    end
-    energy = max(energy_frac_1, energy_frac_2)
+  # Convert to modal representation
+  multiply_scalar_dimensionwise!(modal, dg.basis.inverse_vandermonde_legendre, indicator, modal_tmp1, modal_tmp2)
 
-    alpha_element = 1 / (1 + exp(-parameter_s / threshold * (energy - threshold)))
+  # Calculate total energies for all modes, without highest, without two highest
+  total_energy = zero(eltype(modal))
+  for k in 1:nnodes(dg), j in 1:nnodes(dg), i in 1:nnodes(dg)
+    total_energy += modal[i, j, k]^2
+  end
+  total_energy_clip1 = zero(eltype(modal))
+  for k in 1:(nnodes(dg)-1), j in 1:(nnodes(dg)-1), i in 1:(nnodes(dg)-1)
+    total_energy_clip1 += modal[i, j, k]^2
+  end
+  total_energy_clip2 = zero(eltype(modal))
+  for k in 1:(nnodes(dg)-2), j in 1:(nnodes(dg)-2), i in 1:(nnodes(dg)-2)
+    total_energy_clip2 += modal[i, j, k]^2
+  end
 
-    # Take care of the case close to pure DG
-    if alpha_element < alpha_min
-      alpha_element = zero(alpha_element)
-    end
+  # Calculate energy in higher modes
+  if !(iszero(total_energy))
+    energy_frac_1 = (total_energy - total_energy_clip1) / total_energy
+  else
+    energy_frac_1 = zero(total_energy)
+  end
+  if !(iszero(total_energy_clip1))
+    energy_frac_2 = (total_energy_clip1 - total_energy_clip2) / total_energy_clip1
+  else
+    energy_frac_2 = zero(total_energy_clip1)
+  end
+  energy = max(energy_frac_1, energy_frac_2)
 
-    # Take care of the case close to pure FV
-    if alpha_element > 1 - alpha_min
-      alpha_element = one(alpha_element)
-    end
+  alpha_element = 1 / (1 + exp(-parameter_s / threshold * (energy - threshold)))
 
-    # Clip the maximum amount of FV allowed
-    alpha[element] = min(alpha_max, alpha_element)
+  # Take care of the case close to pure DG
+  if alpha_element < alpha_min
+    alpha_element = zero(alpha_element)
   end
 
-  if alpha_smooth
-    apply_smoothing!(mesh, alpha, alpha_tmp, dg, cache)
+  # Take care of the case close to pure FV
+  if alpha_element > 1 - alpha_min
+    alpha_element = one(alpha_element)
   end
 
-  return alpha
+  # Clip the maximum amount of FV allowed
+  alpha[element] = min(alpha_max, alpha_element)
 end