From dd60e39d0d37df65ed6aac131cdc93a1317d2782 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Fri, 16 Oct 2015 14:04:23 -0700
Subject: [PATCH 01/19] Implemented OpenMP in the PyClaw step3ds.f90 for the
 dimensioanlly split algorithm.  The unsplit algorithm can also be updated. 
 Usage requires setting the environment variable OMP_NUM_THREADS before
 program execution.  In addition the gfortran flag '-fopenmp' and the f2py
 flag '-lgomp' are required when compiling step3ds.f90. Added F90 and f2py
 options for OpenMP (-fopenmp, and gomp => -lgomp) in classic/setup.py so that
 when compiling 'classic3.so' the openmp lines are compiled.

---
 src/pyclaw/classic/setup.py    |   2 +-
 src/pyclaw/classic/solver.py   |  37 +-
 src/pyclaw/classic/step3ds.f90 | 743 ++++++++++++++++++---------------
 3 files changed, 421 insertions(+), 361 deletions(-)

diff --git a/src/pyclaw/classic/setup.py b/src/pyclaw/classic/setup.py
index 03b55838e..a50de681b 100644
--- a/src/pyclaw/classic/setup.py
+++ b/src/pyclaw/classic/setup.py
@@ -13,7 +13,7 @@ def configuration(parent_package='',top_path=None):
                          ['limiter.f90','philim.f90','flux2.f90','step2ds.f90','step2.f90'],f2py_options=['--quiet'])
 
     config.add_extension('classic3',
-                         ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'])
+                         ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp'],libraries=['gomp'])
 
     return config
 
diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index ab7277aef..53c3a023e 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -611,6 +611,12 @@ def __init__(self, riemann_solver=None, claw_package=None):
         self.aux3 = None
         self.work = None
 
+        import os
+        if os.environ['OMP_NUM_THREADS'] == None:
+            self.nthreads = 1
+        else:
+            self.nthreads = int(os.environ['OMP_NUM_THREADS'])
+
         super(ClawSolver3D,self).__init__(riemann_solver, claw_package)
 
     # ========== Setup routine =============================   
@@ -636,10 +642,12 @@ def _allocate_workspace(self,solution):
 
         # These work arrays really ought to live inside a fortran module
         # as is done for sharpclaw
-        self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3),order='F')
-        self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3),order='F')
-        self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3),order='F')
-        mwork = (maxm+2*num_ghost) * (31*num_eqn + num_waves + num_eqn*num_waves)
+        print "nthreads:",self.nthreads
+        self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
+        self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
+        self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
+        mwork = (maxm+2*num_ghost)\
+                *(31*num_eqn + num_waves + num_eqn*num_waves)*self.nthreads
         self.work = np.empty((mwork),order='F')
 
 
@@ -676,17 +684,20 @@ def step_hyperbolic(self,solution):
                 #Right now only Godunov-dimensional-splitting is implemented.
                 #Strang-dimensional-splitting could be added following dimsp3.f in Clawpack.
 
-                q, cfl_x = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \
-                      qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\
-                      self.aux1,self.aux2,self.aux3,self.work,1,self.fwave,rpn3,rpt3,rptt3)
+                q, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\
+                    self.fwave,rpn3,rpt3,rptt3)
 
-                q, cfl_y = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \
-                      q,q,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\
-                      self.aux1,self.aux2,self.aux3,self.work,2,self.fwave,rpn3,rpt3,rptt3)
+                q, cfl_y = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,2,\
+                    self.fwave,rpn3,rpt3,rptt3)
 
-                q, cfl_z = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \
-                      q,q,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\
-                      self.aux1,self.aux2,self.aux3,self.work,3,self.fwave,rpn3,rpt3,rptt3)
+                q, cfl_z = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\
+                    self.fwave,rpn3,rpt3,rptt3)
 
                 cfl = max(cfl_x,cfl_y,cfl_z)
 
diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90
index c248a18ec..3b3ed70a6 100644
--- a/src/pyclaw/classic/step3ds.f90
+++ b/src/pyclaw/classic/step3ds.f90
@@ -1,61 +1,61 @@
 
+! ==================================================================
+subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
+     mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, &
+     qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d,aux1,aux2,aux3,&
+     num_aux,work,mwork,idir,use_fwave,rpn3,rpt3,rptt3)
+! ==================================================================
 
+  ! Take one time step, updating q, to be used with
+  ! dimensional splitting.
+  ! On entry, qold and qnew should be identical and give the
+  !    initial data for this step.
+  ! On exit, qnew returns values at the end of the time step.
+  !    qold is unchanged.
+  !
+  ! This is a simplified version of the subroutine step3d
+  ! Since only qadd and fadd is used in the flux updates,
+  ! the solution updates below are considerably simplified
+  ! compared to step3d.
+  
+  !-----------------------------------------------------------------
+  ! NOTE! Since dimensional splitting is used, it is possible
+  !       to reduce the memory requirement, i.e. the size of
+  !       the work array. It could be reduced with
+  !
+  !       (maxm + 2*num_ghost)*(37*num_eqn + 6*num_aux),
+  !
+  !       when also possible reductions in flux3 are included.
+  !       However, this term is small compared to the dominating
+  !       term (mx+2num_ghost)(my+2mb)*(mz+2num_ghost).
+  !-----------------------------------------------------------------
+  
+  !$ use omp_lib
 
-!     ==================================================================
-    subroutine step3ds(maxm,num_eqn,num_waves,num_ghost,mx,my, &
-    mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, &
-    qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d, &
-    aux1,aux2,aux3,num_aux,work,mwork,idir,use_fwave,rpn3,rpt3,rptt3)
-!     ==================================================================
+  implicit real*8(a-h,o-z)
+  external rpn3,rpt3,rptt3
+  dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, &
+       1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
+  dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, &
+       1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
+  dimension  q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension aux(num_aux, 1-num_ghost:mx+num_ghost, &
+       1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
+  dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx)
+  dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx)
+  dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx)
+  dimension dtdx1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension dtdy1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension dtdz1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension method(7),mthlim(num_waves)
+  dimension work(mwork)
+  logical :: use_fwave
 
-!     # Take one time step, updating q, to be used with
-!     # dimensional splitting.
-!     # On entry, qold and qnew should be identical and give the
-!     #    initial data for this step.
-!     # On exit, qnew returns values at the end of the time step.
-!     #    qold is unchanged.
-
-!     # This is a simplified version of the subroutine step3d
-!     # Since only qadd and fadd is used in the flux updates,
-!     # the solution updates below are considerably simplified
-!     # compared to step3d.
-
-!     #-----------------------------------------------------------------
-!     # NOTE! Since dimensional splitting is used, it is possible
-!     #       to reduce the memory requirement, i.e. the size of
-!     #       the work array. It could be reduced with
-!     #
-!     #       (maxm + 2*num_ghost)*(37*num_eqn + 6*num_aux),
-!     #
-!     #       when also possible reductions in flux3 are included.
-!     #       However, this term is small compared to the dominating
-!     #       term (mx+2num_ghost)(my+2mb)*(mz+2num_ghost).
-!     #-----------------------------------------------------------------
-
-    implicit real*8(a-h,o-z)
-    external rpn3,rpt3,rptt3
-    dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, &
-    1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-    dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, &
-    1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-    dimension  q1d(num_eqn,1-num_ghost:maxm+num_ghost)
-    dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost)
-    dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost)
-    dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost)
-    dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost)
-    dimension aux(num_aux, 1-num_ghost:mx+num_ghost, &
-    1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-    dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3)
-    dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3)
-    dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3)
-    dimension dtdx1d(1-num_ghost:maxm+num_ghost)
-    dimension dtdy1d(1-num_ghost:maxm+num_ghost)
-    dimension dtdz1d(1-num_ghost:maxm+num_ghost)
-    dimension method(7),mthlim(num_waves)
-    dimension work(mwork)
-    logical ::          use_fwave
-
-    common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom
+  common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom
 
 !f2py intent(out) cfl
 !f2py intent(in,out) qnew
@@ -67,317 +67,366 @@ subroutine step3ds(maxm,num_eqn,num_waves,num_ghost,mx,my, &
 !f2py x=rpt3(x)
 !f2py x=rptt3(x)
 
-!     # partition work array into pieces needed for local storage in
-!     # flux2 routine.  Find starting index of each piece:
+  ! partition work array into pieces needed for local storage in
+  ! flux3 routine.  Find starting index of each piece:
 
-    i0wave     = 1
-    i0s        = i0wave     + (maxm+2*num_ghost)*num_eqn*num_waves
-    i0amdq     = i0s        + (maxm+2*num_ghost)*num_waves
-    i0apdq     = i0amdq     + (maxm+2*num_ghost)*num_eqn
-    i0cqxx     = i0apdq     + (maxm+2*num_ghost)*num_eqn
-    i0bmamdq   = i0cqxx     + (maxm+2*num_ghost)*num_eqn
-    i0bmapdq   = i0bmamdq   + (maxm+2*num_ghost)*num_eqn
-    i0bpamdq   = i0bmapdq   + (maxm+2*num_ghost)*num_eqn
-    i0bpapdq   = i0bpamdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmamdq   = i0bpapdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmapdq   = i0cmamdq   + (maxm+2*num_ghost)*num_eqn
-    i0cpamdq   = i0cmapdq   + (maxm+2*num_ghost)*num_eqn
-    i0cpapdq   = i0cpamdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmamdq2  = i0cpapdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmapdq2  = i0cmamdq2  + (maxm+2*num_ghost)*num_eqn
-    i0cpamdq2  = i0cmapdq2  + (maxm+2*num_ghost)*num_eqn
-    i0cpapdq2  = i0cpamdq2  + (maxm+2*num_ghost)*num_eqn
-    i0bmcqxxp  = i0cpapdq2  + (maxm+2*num_ghost)*num_eqn
-    i0bmcqxxm  = i0bmcqxxp  + (maxm+2*num_ghost)*num_eqn
-    i0bpcqxxp  = i0bmcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0bpcqxxm  = i0bpcqxxp   + (maxm+2*num_ghost)*num_eqn
-    i0cmcqxxp  = i0bpcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0cmcqxxm  = i0cmcqxxp   + (maxm+2*num_ghost)*num_eqn
-    i0cpcqxxp  = i0cmcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0cpcqxxm  = i0cpcqxxp   + (maxm+2*num_ghost)*num_eqn
-    i0bmcmamdq = i0cpcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0bmcmapdq = i0bmcmamdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcmamdq = i0bmcmapdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcmapdq = i0bpcmamdq + (maxm+2*num_ghost)*num_eqn
-    i0bmcpamdq = i0bpcmapdq + (maxm+2*num_ghost)*num_eqn
-    i0bmcpapdq = i0bmcpamdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcpamdq = i0bmcpapdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcpapdq = i0bpcpamdq + (maxm+2*num_ghost)*num_eqn
-    iused      = i0bpcpapdq + (maxm+2*num_ghost)*num_eqn - 1
+  nthreads=1 ! Serial
+  !$OMP PARALLEL
+  !$OMP SINGLE
+  !$ nthreads = omp_get_num_threads()
+  !$OMP END SINGLE
+  !$OMP END PARALLEL
 
-    if (iused > mwork) then
-    !        # This shouldn't happen due to checks in claw2
-        write(6,*) '*** not enough work space in step2'
-        write(6,*) '*** iused = ', iused, '   mwork =',mwork
-        stop
-    endif
+  nsiz = (maxm+2*num_ghost)*num_eqn
+  nsiz_w = nsiz * num_waves
+  nsiz_s = (maxm+2*num_ghost)*num_waves
 
-    index_capa = method(6)
-!      num_aux = method(7)
-    cfl = 0.d0
-    dtdx = dt/dx
-    dtdy = dt/dy
-    dtdz = dt/dz
+  i0wave     = 1
+  i0s        = i0wave     + nsiz_w*nthreads
+  i0amdq     = i0s        + nsiz_s*nthreads
+  i0apdq     = i0amdq     + nsiz*nthreads
+  i0cqxx     = i0apdq     + nsiz*nthreads
+  i0bmamdq   = i0cqxx     + nsiz*nthreads
+  i0bmapdq   = i0bmamdq   + nsiz*nthreads
+  i0bpamdq   = i0bmapdq   + nsiz*nthreads
+  i0bpapdq   = i0bpamdq   + nsiz*nthreads
+  i0cmamdq   = i0bpapdq   + nsiz*nthreads
+  i0cmapdq   = i0cmamdq   + nsiz*nthreads
+  i0cpamdq   = i0cmapdq   + nsiz*nthreads
+  i0cpapdq   = i0cpamdq   + nsiz*nthreads
+  i0cmamdq2  = i0cpapdq   + nsiz*nthreads
+  i0cmapdq2  = i0cmamdq2  + nsiz*nthreads
+  i0cpamdq2  = i0cmapdq2  + nsiz*nthreads
+  i0cpapdq2  = i0cpamdq2  + nsiz*nthreads
+  i0bmcqxxp  = i0cpapdq2  + nsiz*nthreads
+  i0bmcqxxm  = i0bmcqxxp  + nsiz*nthreads
+  i0bpcqxxp  = i0bmcqxxm  + nsiz*nthreads
+  i0bpcqxxm  = i0bpcqxxp  + nsiz*nthreads
+  i0cmcqxxp  = i0bpcqxxm  + nsiz*nthreads
+  i0cmcqxxm  = i0cmcqxxp  + nsiz*nthreads
+  i0cpcqxxp  = i0cmcqxxm  + nsiz*nthreads
+  i0cpcqxxm  = i0cpcqxxp  + nsiz*nthreads
+  i0bmcmamdq = i0cpcqxxm  + nsiz*nthreads
+  i0bmcmapdq = i0bmcmamdq + nsiz*nthreads
+  i0bpcmamdq = i0bmcmapdq + nsiz*nthreads
+  i0bpcmapdq = i0bpcmamdq + nsiz*nthreads
+  i0bmcpamdq = i0bpcmapdq + nsiz*nthreads
+  i0bmcpapdq = i0bmcpamdq + nsiz*nthreads
+  i0bpcpamdq = i0bmcpapdq + nsiz*nthreads
+  i0bpcpapdq = i0bpcpamdq + nsiz*nthreads
+  iused      = i0bpcpapdq + nsiz*nthreads - 1
 
-    if (index_capa == 0) then
-    !        # no capa array:
-        do 5 i=1-num_ghost,maxm+num_ghost
-            dtdx1d(i) = dtdx
-            dtdy1d(i) = dtdy
-            dtdz1d(i) = dtdz
-        5 END DO
-    endif
+  if (iused > mwork) then
+     ! This shouldn't happen due to checks in claw3
+     write(6,*) '*** not enough work space in step3'
+     write(6,*) '*** iused = ', iused, '   mwork =',mwork
+     stop
+  endif
 
+  index_capa = method(6)
+  !num_aux = method(7)
+  cfl = 0.d0
+  dtdx = dt/dx
+  dtdy = dt/dy
+  dtdz = dt/dz
+  
+  if (index_capa == 0) then
+     ! no capa array:
+     !$OMP PARALLEL PRIVATE(i,me,me1)
+     me = 0
+     !$ me = omp_get_thread_num()
+     me1 = me + 1
+     do i=1-num_ghost,maxm+num_ghost
+        dtdx1d(i,me1) = dtdx
+        dtdy1d(i,me1) = dtdy
+        dtdz1d(i,me1) = dtdz
+     end do
+     !$OMP END PARALLEL
+  endif
 
-    if( idir == 1)then
-    
-    !     # perform x-sweeps
-    !     ==================
+  if( idir == 1)then
     
-        do 50 k = 0,mz+1
-            do 50 j = 0,my+1
-            
-                forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost)
-            !                 # copy data along a slice into 1d array:
-                q1d(m,i) = qold(m,i,j,k)
-                end forall
-            
-                if (index_capa > 0)  then
-                    do 23 i = 1-num_ghost, mx+num_ghost
-                        dtdx1d(i) = dtdx / aux(index_capa,i,j,k)
-                    23 END DO
-                endif
-            
-            !        # Since dimensional splitting is used, only aux2 is needed.
-            
-                if (num_aux > 0)  then
-                    forall (ma=1:num_aux,i= 1-num_ghost:mx+num_ghost, ka = -1:1)
-                    aux2(ma,i,2+ka) = aux(ma,i,j,k+ka)
-                    end forall
-                endif
-            
-            !           # Store the value of j and k along this slice in the common block
-            !           # comxyt in case it is needed in the Riemann solver (for
-            !           # variable coefficient problems)
-            
-                jcom = j
-                kcom = k
-            
-            !           # compute modifications qadd and fadd along this slice
-            
-                call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, &
-                q1d,dtdx1d,dtdy,dtdz,dummy1,aux2,dummy3,num_aux, &
-                method,mthlim,qadd,fadd,gadd,hadd,cfl1d, &
-                work(i0wave),work(i0s),work(i0amdq), &
-                work(i0apdq),work(i0cqxx), &
-                work(i0bmamdq),work(i0bmapdq), &
-                work(i0bpamdq),work(i0bpapdq), &
-                work(i0cmamdq),work(i0cmapdq), &
-                work(i0cpamdq),work(i0cpapdq), &
-                work(i0cmamdq2),work(i0cmapdq2), &
-                work(i0cpamdq2),work(i0cpapdq2), &
-                work(i0bmcqxxp),work(i0bpcqxxp), &
-                work(i0bmcqxxm),work(i0bpcqxxm), &
-                work(i0cmcqxxp),work(i0cpcqxxp), &
-                work(i0cmcqxxm),work(i0cpcqxxm), &
-                work(i0bmcmamdq),work(i0bmcmapdq), &
-                work(i0bpcmamdq),work(i0bpcmapdq), &
-                work(i0bmcpamdq),work(i0bmcpapdq), &
-                work(i0bpcpamdq),work(i0bpcpapdq), &
+     ! perform x-sweeps
+     ! ==================
+     !$OMP PARALLEL PRIVATE(me,me1,m,i,ma,ka,cfl1d) REDUCTION(MAX:cfl)
+     me = 0
+     !$ me = omp_get_thread_num()
+     me1 = me + 1
+     cfl1d = 0.d0
+
+     ! Guided schedule seems to produce a small performance increase
+     ! relative to static or dynamic for problems with uniform work
+     ! per column.  For problems with very nonuniform work per
+     ! column, either guided or dynamic is necessary to keep all the
+     ! threads busy.
+     
+     !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED)
+     do k = 0,mz+1
+        do j = 0,my+1
+           
+           forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost)
+              ! copy data along a slice into 1d array:
+              q1d(m,i,me1) = qold(m,i,j,k)
+           end forall
+           
+           if (index_capa > 0) then
+              do i = 1-num_ghost, mx+num_ghost
+                 dtdx1d(i,me1) = dtdx / aux(index_capa,i,j,k)
+              end do
+           endif
+            
+           ! Since dimensional splitting is used, only aux2 is needed.
+           if (num_aux > 0)  then
+              forall (ma=1:num_aux,i= 1-num_ghost:mx+num_ghost, ka = -1:1)
+                 aux2(ma,i,2+ka,me1) = aux(ma,i,j,k+ka)
+              end forall
+           endif
+            
+           ! Store the value of j and k along this slice in the common block
+           ! comxyt in case it is needed in the Riemann solver (for
+           ! variable coefficient problems)
+           
+           jcom = j
+           kcom = k
+            
+           ! compute modifications qadd and fadd along this slice
+            
+           call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, &
+                q1d(1,1-num_ghost,me1),dtdx1d(1-num_ghost,me1),dtdy,dtdz,dummy1,&
+                aux2(1,1-num_ghost,1,me1),dummy3,num_aux, &
+                method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1),&
+                gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, &
+                work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), &
+                work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), &
+                work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), &
+                work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), &
+                work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), &
+                work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), &
+                work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), &
+                work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), &
+                work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), &
+                work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), &
+                work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), &
+                work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), &
+                work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), &
+                work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), &
+                work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), &
+                work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), &
                 use_fwave,rpn3,rpt3,rptt3)
             
-                cfl = dmax1(cfl,cfl1d)
-            
-            !           # update qnew by flux differencing.
-            !           # (rather than maintaining arrays f, g and h for the total fluxes,
-            !           # the modifications are used immediately to update qnew
-            !           # in order to save storage.)
-            
-                if(index_capa == 0)then
-                !              # no capa array.  Standard flux differencing:
-                    forall (m = 1:num_eqn, i = 1:mx)
-                    qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) &
-                    - dtdx * (fadd(m,i+1) - fadd(m,i))
-                    end forall
-                else
-                !              # with capa array
-                    forall (m = 1:num_eqn, i = 1:mx)
-                    qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) &
-                    - dtdx * (fadd(m,i+1) - fadd(m,i)) &
-                    / aux(index_capa,i,j,k)
-                    end forall
-                endif
-            
-        50 END DO
-    
-    else if( idir == 2 )then
+           cfl = dmax1(cfl,cfl1d)
+            
+           ! update qnew by flux differencing.
+           ! (rather than maintaining arrays f, g and h for the total fluxes,
+           ! the modifications are used immediately to update qnew
+           ! in order to save storage.)
+           
+           if(index_capa == 0)then
+              ! no capa array.  Standard flux differencing:
+              forall (m = 1:num_eqn, i = 1:mx)
+                 qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) &
+                      - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1))
+              end forall
+           else
+              ! with capa array
+              forall (m = 1:num_eqn, i = 1:mx)
+                 qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) &
+                      - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) &
+                      / aux(index_capa,i,j,k)
+              end forall
+           endif
+           
+        end do
+     end do
+     !$OMP END PARALLEL
     
-    !     # perform y sweeps
-    !     ==================
+  else if( idir == 2 )then
     
-        do 100 k = 0, mz+1
-            do 100 i = 0, mx+1
-            
-                forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost)
-            !                 # copy data along a slice into 1d array:
-                q1d(m,j) = qold(m,i,j,k)
-                end forall
-            
-                if (index_capa > 0)  then
-                    do 71 j = 1-num_ghost, my+num_ghost
-                        dtdy1d(j) = dtdy / aux(index_capa,i,j,k)
-                    71 END DO
-                endif
-            
-            !        # Since dimensional splitting is used, only aux2 is needed.
-            
-                if (num_aux > 0)  then
-                ! There's a decent chance aux2 will all fit into cache,
-                ! so keep accesses to aux as contiguous as possible.
-                    forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost)
-                    aux2(ma, j, 2+ia) = aux(ma, i+ia, j, k)
-                    end forall
-                endif
-            
-            !           # Store the value of i and k along this slice in the common block
-            !           # comxyzt in case it is needed in the Riemann solver (for
-            !           # variable coefficient problems)
-            
-                icom = i
-                kcom = k
-            
-            !           # compute modifications qadd and fadd along this slice
-            
-                call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, &
-                q1d,dtdy1d,dtdz,dtdx,dummy1,aux2,dummy3,num_aux, &
-                method,mthlim,qadd,fadd,gadd,hadd,cfl1d, &
-                work(i0wave),work(i0s),work(i0amdq), &
-                work(i0apdq),work(i0cqxx), &
-                work(i0bmamdq),work(i0bmapdq), &
-                work(i0bpamdq),work(i0bpapdq), &
-                work(i0cmamdq),work(i0cmapdq), &
-                work(i0cpamdq),work(i0cpapdq), &
-                work(i0cmamdq2),work(i0cmapdq2), &
-                work(i0cpamdq2),work(i0cpapdq2), &
-                work(i0bmcqxxp),work(i0bpcqxxp), &
-                work(i0bmcqxxm),work(i0bpcqxxm), &
-                work(i0cmcqxxp),work(i0cpcqxxp), &
-                work(i0cmcqxxm),work(i0cpcqxxm), &
-                work(i0bmcmamdq),work(i0bmcmapdq), &
-                work(i0bpcmamdq),work(i0bpcmapdq), &
-                work(i0bmcpamdq),work(i0bmcpapdq), &
-                work(i0bpcpamdq),work(i0bpcpapdq), &
+     ! perform y sweeps
+     ! ==================
+     !$OMP PARALLEL PRIVATE(me,me1,m,j,ma,ia,cfl1d) REDUCTION(MAX:cfl)
+     me = 0 
+     !$ me = omp_get_thread_num()
+     me1 = me + 1 
+     cfl1d = 0.d0
+     
+     !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED)
+     do k = 0, mz+1
+        do i = 0, mx+1
+           
+           forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost)
+              ! copy data along a slice into 1d array:
+              q1d(m,j,me1) = qold(m,i,j,k)
+           end forall
+            
+           if (index_capa > 0)  then
+              do j = 1-num_ghost, my+num_ghost
+                 dtdy1d(j,me1) = dtdy / aux(index_capa,i,j,k)
+              end do
+           endif
+            
+           ! Since dimensional splitting is used, only aux2 is needed.
+            
+           if (num_aux > 0)  then
+              ! There's a decent chance aux2 will all fit into cache,
+              ! so keep accesses to aux as contiguous as possible.
+              forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost)
+                 aux2(ma, j, 2+ia,me1) = aux(ma, i+ia, j, k)
+              end forall
+           endif
+            
+           ! Store the value of i and k along this slice in the common block
+           ! comxyzt in case it is needed in the Riemann solver (for
+           ! variable coefficient problems)
+            
+           icom = i
+           kcom = k
+            
+           ! compute modifications qadd and fadd along this slice
+            
+           call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, &
+                q1d(1,1-num_ghost,me1),dtdy1d(1-num_ghost,me1),dtdz,dtdx,dummy1,&
+                aux2(1,1-num_ghost,1,me1),dummy3,num_aux, &
+                method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1),&
+                gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, &
+                work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), &
+                work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), &
+                work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), &
+                work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), &
+                work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), &
+                work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), &
+                work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), &
+                work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), &
+                work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), &
+                work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), &
+                work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), &
+                work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), &
+                work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), &
+                work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), &
+                work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), &
+                work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), &
                 use_fwave,rpn3,rpt3,rptt3)
             
-                cfl = dmax1(cfl,cfl1d)
-            
-            !           # update qnew by flux differencing.
-            !           # Note that the roles of the flux updates are changed.
-            !           # fadd - modifies the g-fluxes
-            
-                if( index_capa == 0)then
-                !               # no capa array.  Standard flux differencing:
-                    forall (m = 1:num_eqn, j = 1:my)
-                    qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) &
-                    - dtdy * (fadd(m,j+1) - fadd(m,j))
-                    end forall
-                else
-                !              #with capa array.
-                    forall (m = 1:num_eqn, j = 1:my)
-                    qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) &
-                    - dtdy * (fadd(m,j+1) - fadd(m,j)) &
-                    / aux(index_capa,i,j,k)
-                    end forall
-                endif
-            
-        100 END DO
-    
-    else
+           cfl = dmax1(cfl,cfl1d)
+            
+           ! update qnew by flux differencing.
+           ! Note that the roles of the flux updates are changed.
+           ! fadd - modifies the g-fluxes
+           
+           if( index_capa == 0)then
+              ! no capa array.  Standard flux differencing:
+              forall (m = 1:num_eqn, j = 1:my)
+                 qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) &
+                      - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1))
+              end forall
+           else
+              ! with capa array.
+              forall (m = 1:num_eqn, j = 1:my)
+                 qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) &
+                      - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) &
+                      / aux(index_capa,i,j,k)
+              end forall
+           endif
+            
+        end do
+     end do
+     !$OMP END PARALLEL
     
-    !     # perform z sweeps
-    !     ==================
+  else
     
-    
-        do 150 j = 0, my+1
-            do 150 i = 0, mx+1
-            
-                forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost)
-            !                 # copy data along a slice into 1d array:
-                q1d(m,k) = qold(m,i,j,k)
-                end forall
-            
-                if (index_capa > 0)  then
-                    do 130 k = 1-num_ghost, mz+num_ghost
-                        dtdz1d(k) = dtdz / aux(index_capa,i,j,k)
-                    130 END DO
-                endif
-            
-            !        # Since dimensional splitting is used, only aux2 is needed.
-            
-                if (num_aux > 0)  then
-                ! There's a decent chance aux2 will all fit into cache,
-                ! so keep accesses to aux as contiguous as possible.
-                    forall (ma=1:num_aux,ja= -1:1, k = 1-num_ghost:mz+num_ghost)
-                    aux2(ma, k, 2+ja) = aux(ma, i, j+ja, k)
-                    end forall
-                endif
-            
-            !           # Store the value of i and j along this slice in the common block
-            !           # comxyzt in case it is needed in the Riemann solver (for
-            !           # variable coefficient problems)
-            
-                icom = i
-                jcom = j
-            
-            !           # compute modifications qadd and fadd along this slice
-            
-                call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, &
-                q1d,dtdz1d,dtdx,dtdy,dummy1,aux2,dummy3,num_aux, &
-                method,mthlim,qadd,fadd,gadd,hadd,cfl1d, &
-                work(i0wave),work(i0s),work(i0amdq), &
-                work(i0apdq),work(i0cqxx), &
-                work(i0bmamdq),work(i0bmapdq), &
-                work(i0bpamdq),work(i0bpapdq), &
-                work(i0cmamdq),work(i0cmapdq), &
-                work(i0cpamdq),work(i0cpapdq), &
-                work(i0cmamdq2),work(i0cmapdq2), &
-                work(i0cpamdq2),work(i0cpapdq2), &
-                work(i0bmcqxxp),work(i0bpcqxxp), &
-                work(i0bmcqxxm),work(i0bpcqxxm), &
-                work(i0cmcqxxp),work(i0cpcqxxp), &
-                work(i0cmcqxxm),work(i0cpcqxxm), &
-                work(i0bmcmamdq),work(i0bmcmapdq), &
-                work(i0bpcmamdq),work(i0bpcmapdq), &
-                work(i0bmcpamdq),work(i0bmcpapdq), &
-                work(i0bpcpamdq),work(i0bpcpapdq), &
+     ! perform z sweeps
+     ! ==================
+     !$OMP PARALLEL PRIVATE(me,me1,m,k,ma,ja,cfl1d) REDUCTION(MAX:cfl)
+     me = 0 
+     !$ me = omp_get_thread_num()
+     me1 = me + 1 
+     cfl1d = 0.d0 
+     
+     !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED)     
+     do j = 0, my+1
+        do i = 0, mx+1
+            
+           forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost)
+              ! copy data along a slice into 1d array:
+              q1d(m,k,me1) = qold(m,i,j,k)
+           end forall
+            
+           if (index_capa > 0)  then
+              do k = 1-num_ghost, mz+num_ghost
+                 dtdz1d(k,me1) = dtdz / aux(index_capa,i,j,k)
+              end do
+           endif
+            
+           ! Since dimensional splitting is used, only aux2 is needed.
+            
+           if (num_aux > 0)  then
+              ! There's a decent chance aux2 will all fit into cache,
+              ! so keep accesses to aux as contiguous as possible.
+              forall (ma=1:num_aux,ja= -1:1, k = 1-num_ghost:mz+num_ghost)
+                 aux2(ma, k, 2+ja,me1) = aux(ma, i, j+ja, k)
+              end forall
+           endif
+            
+           ! Store the value of i and j along this slice in the common block
+           ! comxyzt in case it is needed in the Riemann solver (for
+           ! variable coefficient problems)
+            
+           icom = i
+           jcom = j
+            
+           ! compute modifications qadd and fadd along this slice
+            
+           call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, &
+                q1d(1,1-num_ghost,me1),dtdz1d(1-num_ghost,me1),dtdx,dtdy,dummy1, &
+                aux2(1,1-num_ghost,1,me1),dummy3,num_aux, &
+                method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1), &
+                gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, &
+                work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), &
+                work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), &
+                work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), &
+                work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), &
+                work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), &
+                work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), &
+                work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), &
+                work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), &
+                work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), &
+                work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), &
+                work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), &
+                work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), &
+                work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), &
+                work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), &
+                work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), &
+                work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), &
                 use_fwave,rpn3,rpt3,rptt3)
             
-                cfl = dmax1(cfl,cfl1d)
-            
-            !           # update qnew by flux differencing.
-            !           # Note that the roles of the flux updates are changed.
-            !           # fadd - modifies the h-fluxes
-            
-                if(index_capa == 0)then
-                !              #no capa array. Standard flux differencing:
-                    forall (m = 1:num_eqn, k = 1:mz)
-                    qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) &
-                    - dtdz * (fadd(m,k+1) - fadd(m,k))
-                    end forall
-                else
-                !              # with capa array
-                    forall (m = 1:num_eqn, k = 1:mz)
-                    qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) &
-                    - dtdz * (fadd(m,k+1) - fadd(m,k)) &
-                    / aux(index_capa,i,j,k)
-                    end forall
-                endif
-            
-        150 END DO
+           cfl = dmax1(cfl,cfl1d)
+            
+           ! update qnew by flux differencing.
+           ! Note that the roles of the flux updates are changed.
+           ! fadd - modifies the h-fluxes
+           
+           if(index_capa == 0)then
+              ! no capa array. Standard flux differencing:
+              forall (m = 1:num_eqn, k = 1:mz)
+                 qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) &
+                      - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1))
+              end forall
+           else
+              ! with capa array
+              forall (m = 1:num_eqn, k = 1:mz)
+                 qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) &
+                      - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) &
+                      / aux(index_capa,i,j,k)
+              end forall
+           endif
+            
+        end do
+     end do
+     !$OMP END PARALLEL
     
-    endif
+  endif
 
-    return
-    end subroutine step3ds
+  return
+end subroutine step3ds
 
 

From 2cffe8b4cd00e45b1bb9a4a67dbe8cb41b76b0c2 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Fri, 16 Oct 2015 14:16:32 -0700
Subject: [PATCH 02/19] Fixed bug when OMP_NUM_THREADS env variable is not set,
 and uses nthreads=1

---
 src/pyclaw/classic/solver.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index 53c3a023e..2eeb954c0 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -612,10 +612,7 @@ def __init__(self, riemann_solver=None, claw_package=None):
         self.work = None
 
         import os
-        if os.environ['OMP_NUM_THREADS'] == None:
-            self.nthreads = 1
-        else:
-            self.nthreads = int(os.environ['OMP_NUM_THREADS'])
+        self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1))
 
         super(ClawSolver3D,self).__init__(riemann_solver, claw_package)
 

From 2bfff21e51030f5a8332565944aec669cec9fe32 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Fri, 16 Oct 2015 14:23:17 -0700
Subject: [PATCH 03/19] Added a comment related to a hack of adding and
 subtracting a variable in a dimension size in order to trick f2py into not
 making nthreads an optional argument.  Otherwise it used
 nthreads=shape(qadd,2) as a size.  Since qadd is optional, it get a bad value
 when qadd is not in the call to step3ds(), which is the default for pyclaw.

---
 src/pyclaw/classic/step3ds.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90
index 3b3ed70a6..7d672f3b6 100644
--- a/src/pyclaw/classic/step3ds.f90
+++ b/src/pyclaw/classic/step3ds.f90
@@ -38,7 +38,7 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
        1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
   dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, &
        1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-  dimension  q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension  q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) ! Adding and subtracting mx is a hack to trick f2py, so it does not make nthreads optional.
   dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
   dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
   dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)

From 3bc6ba8e48f74b45d7a0d17291cd5f219611dc7d Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@uw.edu>
Date: Mon, 19 Oct 2015 08:33:15 -0700
Subject: [PATCH 04/19] Update solver.py

removed debugging print statement
---
 src/pyclaw/classic/solver.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index 2eeb954c0..e5f07f378 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -639,7 +639,6 @@ def _allocate_workspace(self,solution):
 
         # These work arrays really ought to live inside a fortran module
         # as is done for sharpclaw
-        print "nthreads:",self.nthreads
         self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
         self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
         self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')

From 7e21323cbf19553e893043216d5592ad8fa60f95 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Wed, 28 Oct 2015 23:26:11 -0700
Subject: [PATCH 05/19] Updated solver.py to set the default number of threads
 to the number of CPUs when the OMP_NUM_THREADS environment varibale is unset.
  This is consistent with what OpenMP returns for omp_get_num_threads() when
 the env variable is unset.  Updated step3.f90 to use OpenMP, based on what is
 done in clawpack/classic.  This was sucessfully tested with the Sedov.py test
 problem in the euler_3d example folder.

---
 src/pyclaw/classic/solver.py   |   12 +-
 src/pyclaw/classic/step3.f90   | 1475 +++++++++++++++++++-------------
 src/pyclaw/classic/step3ds.f90 |    5 +-
 3 files changed, 897 insertions(+), 595 deletions(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index 2eeb954c0..888cb181a 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -612,7 +612,9 @@ def __init__(self, riemann_solver=None, claw_package=None):
         self.work = None
 
         import os
-        self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1))
+        import multiprocessing
+        self.nthreads = int(os.environ.get('OMP_NUM_THREADS',\
+                                           multiprocessing.cpu_count()))
 
         super(ClawSolver3D,self).__init__(riemann_solver, claw_package)
 
@@ -639,7 +641,6 @@ def _allocate_workspace(self,solution):
 
         # These work arrays really ought to live inside a fortran module
         # as is done for sharpclaw
-        print "nthreads:",self.nthreads
         self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
         self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
         self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
@@ -700,9 +701,10 @@ def step_hyperbolic(self,solution):
 
             else:
 
-                q, cfl = self.fmod.step3(maxm,self.num_ghost,mx,my,mz, \
-                      qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\
-                      self.aux1,self.aux2,self.aux3,self.work,self.fwave,rpn3,rpt3,rptt3)
+                q, cfl = self.fmod.step3(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,\
+                    self.fwave,rpn3,rpt3,rptt3)
 
             self.cfl.update_global_max(cfl)
             state.set_q_from_qbc(self.num_ghost,self.qbc)
diff --git a/src/pyclaw/classic/step3.f90 b/src/pyclaw/classic/step3.f90
index 211f626b8..ec8b91556 100644
--- a/src/pyclaw/classic/step3.f90
+++ b/src/pyclaw/classic/step3.f90
@@ -1,45 +1,50 @@
 
-!     ==================================================================
-    subroutine step3(maxm,num_eqn,num_waves,num_ghost,mx,my, &
-    mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, &
-    qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d, &
-    aux1,aux2,aux3,num_aux,work,mwork,use_fwave,rpn3,rpt3,rptt3)
-!     ==================================================================
-
-!     # Take one time step, updating q.
-!     # On entry, qold and qnew should be identical and give the
-!     #    initial data for this step
-!     # On exit, qnew returns values at the end of the time step.
-!     #    qold is unchanged.
-
-!     # qadd is used to return increments to q from flux3.
-!     # fadd, gadd and hadd are used to return flux increments from flux3.
-!     # See the flux3 documentation for more information.
-
-
-    implicit real*8(a-h,o-z)
-    external rpn3,rpt3,rptt3
-    dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, &
-    1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-    dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, &
-    1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-    dimension  q1d(num_eqn,1-num_ghost:maxm+num_ghost)
-    dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost)
-    dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost)
-    dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost)
-    dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost)
-    dimension aux(num_aux, 1-num_ghost:mx+num_ghost, &
-    1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-    dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3)
-    dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3)
-    dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3)
-    dimension dtdx1d(1-num_ghost:maxm+num_ghost)
-    dimension dtdy1d(1-num_ghost:maxm+num_ghost)
-    dimension dtdz1d(1-num_ghost:maxm+num_ghost)
-    dimension method(7),mthlim(num_waves)
-    dimension work(mwork)
-    logical ::          use_fwave
-
+! ==================================================================
+subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
+     mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, &
+     qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d,aux1,aux2,aux3,&
+     num_aux,work,mwork,use_fwave,rpn3,rpt3,rptt3)
+! ==================================================================
+
+  ! Take one time step, updating q.
+  ! On entry, qold and qnew should be identical and give the
+  !    initial data for this step
+  ! On exit, qnew returns values at the end of the time step.
+  !    qold is unchanged.
+  !
+  ! qadd is used to return increments to q from flux3.
+  ! fadd, gadd and hadd are used to return flux increments from flux3.
+  ! See the flux3 documentation for more information.
+
+  !$ use omp_lib
+
+  implicit real*8(a-h,o-z)
+  external rpn3,rpt3,rptt3
+  dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, &
+       1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
+  dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, &
+       1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
+  dimension  q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension aux(num_aux, 1-num_ghost:mx+num_ghost, &
+       1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
+  dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx)
+  dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx)
+  dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx)
+  dimension dtdx1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension dtdy1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension dtdz1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension method(7),mthlim(num_waves)
+  dimension work(mwork)
+  logical :: use_fwave
+
+  ! These block sizes must be at least 2 to avoid multiple threads
+  ! trying to update the same grid cell
+  integer, parameter :: blksiz_i = 2, blksiz_j = 2, blksiz_k = 2
+    
 !f2py intent(out) cfl
 !f2py intent(in,out) qnew
 !f2py optional q1d, qadd, fadd, gadd, hadd, dtdx1d, dtdy1d, dtdz1d
@@ -50,552 +55,846 @@ subroutine step3(maxm,num_eqn,num_waves,num_ghost,mx,my, &
 !f2py x=rpt3(x)
 !f2py x=rptt3(x)
 
-    common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom
-
-
-
-!     # partition work array into pieces needed for local storage in
-!     # flux3 routine.  Find starting index of each piece:
-
-    i0wave     = 1
-    i0s        = i0wave     + (maxm+2*num_ghost)*num_eqn*num_waves
-    i0amdq     = i0s        + (maxm+2*num_ghost)*num_waves
-    i0apdq     = i0amdq     + (maxm+2*num_ghost)*num_eqn
-    i0cqxx     = i0apdq     + (maxm+2*num_ghost)*num_eqn
-    i0bmamdq   = i0cqxx     + (maxm+2*num_ghost)*num_eqn
-    i0bmapdq   = i0bmamdq   + (maxm+2*num_ghost)*num_eqn
-    i0bpamdq   = i0bmapdq   + (maxm+2*num_ghost)*num_eqn
-    i0bpapdq   = i0bpamdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmamdq   = i0bpapdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmapdq   = i0cmamdq   + (maxm+2*num_ghost)*num_eqn
-    i0cpamdq   = i0cmapdq   + (maxm+2*num_ghost)*num_eqn
-    i0cpapdq   = i0cpamdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmamdq2  = i0cpapdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmapdq2  = i0cmamdq2  + (maxm+2*num_ghost)*num_eqn
-    i0cpamdq2  = i0cmapdq2  + (maxm+2*num_ghost)*num_eqn
-    i0cpapdq2  = i0cpamdq2  + (maxm+2*num_ghost)*num_eqn
-    i0bmcqxxp  = i0cpapdq2  + (maxm+2*num_ghost)*num_eqn
-    i0bmcqxxm  = i0bmcqxxp  + (maxm+2*num_ghost)*num_eqn
-    i0bpcqxxp  = i0bmcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0bpcqxxm  = i0bpcqxxp   + (maxm+2*num_ghost)*num_eqn
-    i0cmcqxxp  = i0bpcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0cmcqxxm  = i0cmcqxxp   + (maxm+2*num_ghost)*num_eqn
-    i0cpcqxxp  = i0cmcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0cpcqxxm  = i0cpcqxxp   + (maxm+2*num_ghost)*num_eqn
-    i0bmcmamdq = i0cpcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0bmcmapdq = i0bmcmamdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcmamdq = i0bmcmapdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcmapdq = i0bpcmamdq + (maxm+2*num_ghost)*num_eqn
-    i0bmcpamdq = i0bpcmapdq + (maxm+2*num_ghost)*num_eqn
-    i0bmcpapdq = i0bmcpamdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcpamdq = i0bmcpapdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcpapdq = i0bpcpamdq + (maxm+2*num_ghost)*num_eqn
-    iused      = i0bpcpapdq + (maxm+2*num_ghost)*num_eqn - 1
-
-    if (iused > mwork) then
-    !        # This shouldn't happen due to checks in claw3
-        write(6,*) '*** not enough work space in step3'
-        write(6,*) '*** iused = ', iused, '   mwork =',mwork
-        stop
-    endif
-
-
-
-    index_capa = method(6)
-!      num_aux = method(7)
-    cfl = 0.d0
-    dtdx = dt/dx
-    dtdy = dt/dy
-    dtdz = dt/dz
-
-    if (index_capa == 0) then
-    !        # no capa array:
-        do 5 i=1-num_ghost,maxm+num_ghost
-            dtdx1d(i) = dtdx
-            dtdy1d(i) = dtdy
-            dtdz1d(i) = dtdz
-        5 END DO
-    endif
-
-
-!     # perform x-sweeps
-!     ==================
-
-
-    do 50 k = 0,mz+1
-        do 50 j = 0,my+1
-
-            forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost)
-        !                 # copy data along a slice into 1d array:
-            q1d(m,i) = qold(m,i,j,k)
-            end forall
-        
-            if (index_capa > 0)  then
-                do 23 i = 1-num_ghost, mx+num_ghost
-                    dtdx1d(i) = dtdx / aux(index_capa,i,j,k)
-                23 END DO
-            endif
-        
-            if (num_aux > 0)  then
-            ! This odd construct may help improve cache locality.
-            ! (The F95 standard says each statement in the FORALL
-            ! must be executed for all indices before the next
-            ! statement is started, so this is different semantically
-            ! from putting all three indices in the same FORALL.)
-                forall (ka = -1:1)
-                forall (ma = 1:num_aux, i = 1-num_ghost:mx+num_ghost)
-                aux1(ma, i, 2+ka) = aux(ma, i, j-1, k+ka)
-                aux2(ma, i, 2+ka) = aux(ma, i,   j, k+ka)
-                aux3(ma, i, 2+ka) = aux(ma, i, j+1, k+ka)
-                end forall
-                end forall
-            endif
-        
-        !           # Store the value of j and k along this slice in the common block
-        !           # comxyt in case it is needed in the Riemann solver (for
-        !           # variable coefficient problems)
-        
-            jcom = j
-            kcom = k
-        
-        !           # compute modifications fadd, gadd and hadd to fluxes along
-        !           # this slice:
-        
-            call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, &
-            q1d,dtdx1d,dtdy,dtdz,aux1,aux2,aux3,num_aux, &
-            method,mthlim,qadd,fadd,gadd,hadd,cfl1d, &
-            work(i0wave),work(i0s),work(i0amdq), &
-            work(i0apdq),work(i0cqxx), &
-            work(i0bmamdq),work(i0bmapdq), &
-            work(i0bpamdq),work(i0bpapdq), &
-            work(i0cmamdq),work(i0cmapdq), &
-            work(i0cpamdq),work(i0cpapdq), &
-            work(i0cmamdq2),work(i0cmapdq2), &
-            work(i0cpamdq2),work(i0cpapdq2), &
-            work(i0bmcqxxp),work(i0bpcqxxp), &
-            work(i0bmcqxxm),work(i0bpcqxxm), &
-            work(i0cmcqxxp),work(i0cpcqxxp), &
-            work(i0cmcqxxm),work(i0cpcqxxm), &
-            work(i0bmcmamdq),work(i0bmcmapdq), &
-            work(i0bpcmamdq),work(i0bpcmapdq), &
-            work(i0bmcpamdq),work(i0bmcpapdq), &
-            work(i0bpcpamdq),work(i0bpcpapdq), &
-            use_fwave,rpn3,rpt3,rptt3)
-        
-            cfl = dmax1(cfl,cfl1d)
-        
-        !           # update qnew by flux differencing.
-        !           # (rather than maintaining arrays f, g and h for the total fluxes,
-        !           # the modifications are used immediately to update qnew
-        !           # in order to save storage.)
-        
-            if(index_capa == 0)then
-            
-                forall (m = 1:num_eqn, i = 1:mx)
-                qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) &
-                - dtdx * (fadd(m,i+1) - fadd(m,i)) &
-                - dtdy * (gadd(m,2,0,i) - gadd(m,1,0,i)) &
-                - dtdz * (hadd(m,2,0,i) - hadd(m,1,0,i))
-                qnew(m,i,j-1,k)   = qnew(m,i,j-1,k) &
-                - dtdy * gadd(m,1,0,i) &
-                - dtdz * ( hadd(m,2,-1,i) &
-                -   hadd(m,1,-1,i) )
-                qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) &
-                - dtdy * gadd(m,1,-1,i) &
-                - dtdz * hadd(m,1,-1,i)
-                qnew(m,i,j,k-1)   = qnew(m,i,j,k-1) &
-                - dtdy * ( gadd(m,2,-1,i) &
-                -   gadd(m,1,-1,i) ) &
-                - dtdz * hadd(m,1,0,i)
-                qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) &
-                + dtdy * gadd(m,2,-1,i) &
-                - dtdz * hadd(m,1,1,i)
-                qnew(m,i,j+1,k)   = qnew(m,i,j+1,k) &
-                + dtdy * gadd(m,2,0,i) &
-                - dtdz * ( hadd(m,2,1,i) &
-                -   hadd(m,1,1,i) )
-                qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) &
-                + dtdy * gadd(m,2,1,i) &
-                + dtdz * hadd(m,2,1,i)
-                qnew(m,i,j,k+1)   = qnew(m,i,j,k+1) &
-                - dtdy * ( gadd(m,2,1,i) &
-                -   gadd(m,1,1,i) ) &
-                + dtdz * hadd(m,2,0,i)
-                qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) &
-                - dtdy * gadd(m,1,1,i) &
-                + dtdz * hadd(m,2,-1,i)
-                end forall
-            
-            else
-            !              # with capa array
-                forall (m = 1:num_eqn, i = 1:mx)
-                qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) &
-                - (dtdx * (fadd(m,i+1) - fadd(m,i)) &
-                +  dtdy * (gadd(m,2,0,i) - gadd(m,1,0,i)) &
-                +  dtdz * (hadd(m,2,0,i) - hadd(m,1,0,i))) &
-                / aux(index_capa,i,j,k)
-
-                qnew(m,i,j-1,k)   = qnew(m,i,j-1,k) &
-                - (dtdy * gadd(m,1,0,i) &
-                +  dtdz * ( hadd(m,2,-1,i) &
-                -   hadd(m,1,-1,i) )) &
-                / aux(index_capa,i,j-1,k)
-                qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) &
-                - (dtdy * gadd(m,1,-1,i) &
-                +  dtdz * hadd(m,1,-1,i)) &
-                / aux(index_capa,i,j-1,k-1)
-                qnew(m,i,j,k-1)   = qnew(m,i,j,k-1) &
-                - (dtdy * ( gadd(m,2,-1,i) &
-                -   gadd(m,1,-1,i) ) &
-                +  dtdz * hadd(m,1,0,i)) &
-                / aux(index_capa,i,j,k-1)
-                qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) &
-                + (dtdy * gadd(m,2,-1,i) &
-                -  dtdz * hadd(m,1,1,i)) &
-                / aux(index_capa,i,j+1,k-1)
-                qnew(m,i,j+1,k)   = qnew(m,i,j+1,k) &
-                + (dtdy * gadd(m,2,0,i) &
-                - dtdz * ( hadd(m,2,1,i) &
-                -   hadd(m,1,1,i) )) &
-                / aux(index_capa,i,j+1,k)
-                qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) &
-                + (dtdy * gadd(m,2,1,i) &
-                +  dtdz * hadd(m,2,1,i)) &
-                / aux(index_capa,i,j+1,k+1)
-                qnew(m,i,j,k+1)   = qnew(m,i,j,k+1) &
-                - (dtdy * ( gadd(m,2,1,i) &
-                -   gadd(m,1,1,i) ) &
-                -  dtdz * hadd(m,2,0,i)) &
-                / aux(index_capa,i,j,k+1)
-                qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) &
-                - (dtdy * gadd(m,1,1,i) &
-                -  dtdz * hadd(m,2,-1,i)) &
-                / aux(index_capa,i,j-1,k+1)
-                end forall
-            endif
-        
-    50 END DO
-
-
-
-!     # perform y sweeps
-!     ==================
-
-
-    do 100 k = 0, mz+1
-        do 100 i = 0, mx+1
-        
-            forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost)
-        !                 # copy data along a slice into 1d array:
-            q1d(m,j) = qold(m,i,j,k)
-            end forall
-        
-            if (index_capa > 0)  then
-                do 71 j = 1-num_ghost, my+num_ghost
-                    dtdy1d(j) = dtdy / aux(index_capa,i,j,k)
-                71 END DO
-            endif
-        
-            if (num_aux > 0)  then
-            ! aux1, aux2, aux3 probably fit in cache, so optimize
-            ! access to aux
-                forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost)
-                aux1(ma, j, 2+ia) = aux(ma, i+ia, j, k-1)
-                aux2(ma, j, 2+ia) = aux(ma, i+ia, j,   k)
-                aux3(ma, j, 2+ia) = aux(ma, i+ia, j, k+1)
-                end forall
-            endif
-        
-        !           # Store the value of i and k along this slice in the common block
-        !           # comxyzt in case it is needed in the Riemann solver (for
-        !           # variable coefficient problems)
-        
-            icom = i
-            kcom = k
-        
-        !           # compute modifications fadd, gadd and hadd to fluxes along this
-        !           # slice:
-        
-            call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, &
-            q1d,dtdy1d,dtdz,dtdx,aux1,aux2,aux3,num_aux, &
-            method,mthlim,qadd,fadd,gadd,hadd,cfl1d, &
-            work(i0wave),work(i0s),work(i0amdq), &
-            work(i0apdq),work(i0cqxx), &
-            work(i0bmamdq),work(i0bmapdq), &
-            work(i0bpamdq),work(i0bpapdq), &
-            work(i0cmamdq),work(i0cmapdq), &
-            work(i0cpamdq),work(i0cpapdq), &
-            work(i0cmamdq2),work(i0cmapdq2), &
-            work(i0cpamdq2),work(i0cpapdq2), &
-            work(i0bmcqxxp),work(i0bpcqxxp), &
-            work(i0bmcqxxm),work(i0bpcqxxm), &
-            work(i0cmcqxxp),work(i0cpcqxxp), &
-            work(i0cmcqxxm),work(i0cpcqxxm), &
-            work(i0bmcmamdq),work(i0bmcmapdq), &
-            work(i0bpcmamdq),work(i0bpcmapdq), &
-            work(i0bmcpamdq),work(i0bmcpapdq), &
-            work(i0bpcpamdq),work(i0bpcpapdq), &
-            use_fwave,rpn3,rpt3,rptt3)
-        
-            cfl = dmax1(cfl,cfl1d)
-        
-        !           # update qnew by flux differencing.
-        !           # Note that the roles of the flux updates are changed.
-        !           # fadd - modifies the g-fluxes
-        !           # gadd - modifies the h-fluxes
-        !           # hadd - modifies the f-fluxes
-        
-            if( index_capa == 0)then
-            !               # no capa array.  Standard flux differencing:
-                forall (m = 1:num_eqn, j = 1:my)
-                qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) &
-                - dtdy * (fadd(m,j+1) - fadd(m,j)) &
-                - dtdz * (gadd(m,2,0,j) - gadd(m,1,0,j)) &
-                - dtdx * (hadd(m,2,0,j) - hadd(m,1,0,j))
-                qnew(m,i,j,k+1)   = qnew(m,i,j,k+1) &
-                + dtdz * gadd(m,2,0,j) &
-                - dtdx * ( hadd(m,2,1,j) &
-                -   hadd(m,1,1,j) )
-                qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) &
-                + dtdz * gadd(m,2,1,j) &
-                +  dtdx * hadd(m,2,1,j)
-                qnew(m,i+1,j,k)   = qnew(m,i+1,j,k) &
-                - dtdz * ( gadd(m,2,1,j) &
-                -   gadd(m,1,1,j) ) &
-                + dtdx * hadd(m,2,0,j)
-                qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) &
-                - dtdz * gadd(m,1,1,j) &
-                + dtdx * hadd(m,2,-1,j)
-                qnew(m,i,j,k-1)   = qnew(m,i,j,k-1) &
-                - dtdz * gadd(m,1,0,j) &
-                - dtdx * ( hadd(m,2,-1,j) &
-                -   hadd(m,1,-1,j) )
-                qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) &
-                - dtdz * gadd(m,1,-1,j) &
-                - dtdx * hadd(m,1,-1,j)
-                qnew(m,i-1,j,k)   = qnew(m,i-1,j,k) &
-                - dtdz * ( gadd(m,2,-1,j) &
-                -   gadd(m,1,-1,j) ) &
-                - dtdx * hadd(m,1,0,j)
-                qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) &
-                + dtdz * gadd(m,2,-1,j) &
-                -  dtdx*hadd(m,1,1,j)
-                end forall
-            else
-            
-            !              #with capa array.
-            
-                forall (m = 1:num_eqn, j = 1:my)
-                qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) &
-                - (dtdy * (fadd(m,j+1) - fadd(m,j)) &
-                +  dtdz * (gadd(m,2,0,j) - gadd(m,1,0,j)) &
-                +  dtdx * (hadd(m,2,0,j) - hadd(m,1,0,j))) &
-                / aux(index_capa,i,j,k)
-                qnew(m,i,j,k+1)   = qnew(m,i,j,k+1) &
-                + (dtdz * gadd(m,2,0,j) &
-                -  dtdx * ( hadd(m,2,1,j) &
-                -   hadd(m,1,1,j) )) &
-                / aux(index_capa,i,j,k+1)
-                qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) &
-                + (dtdz * gadd(m,2,1,j) &
-                +  dtdx * hadd(m,2,1,j)) &
-                / aux(index_capa,i+1,j,k+1)
-                qnew(m,i+1,j,k)   = qnew(m,i+1,j,k) &
-                - (dtdz * ( gadd(m,2,1,j) &
-                -   gadd(m,1,1,j) ) &
-                -  dtdx * hadd(m,2,0,j) ) &
-                / aux(index_capa,i+1,j,k)
-                qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) &
-                - (dtdz * gadd(m,1,1,j) &
-                -  dtdx * hadd(m,2,-1,j)) &
-                / aux(index_capa,i+1,j,k-1)
-                qnew(m,i,j,k-1)   = qnew(m,i,j,k-1) &
-                - (dtdz * gadd(m,1,0,j) &
-                +  dtdx * ( hadd(m,2,-1,j) &
-                -   hadd(m,1,-1,j) )) &
-                / aux(index_capa,i,j,k-1)
-                qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) &
-                - (dtdz * gadd(m,1,-1,j) &
-                +  dtdx * hadd(m,1,-1,j)) &
-                / aux(index_capa,i-1,j,k-1)
-                qnew(m,i-1,j,k)   = qnew(m,i-1,j,k) &
-                - (dtdz * ( gadd(m,2,-1,j) &
-                -   gadd(m,1,-1,j) ) &
-                +  dtdx * hadd(m,1,0,j)) &
-                / aux(index_capa,i-1,j,k)
-                qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) &
-                + (dtdz * gadd(m,2,-1,j) &
-                -  dtdx*hadd(m,1,1,j)) &
-                / aux(index_capa,i-1,j,k+1)
-                end forall
-            endif
-        
-    100 END DO
-
-
-
-!     # perform z sweeps
-!     ==================
-
-
-    do 150 j = 0, my+1
-        do 150 i = 0, mx+1
-        
-            forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost)
-        !                 # copy data along a slice into 1d array:
-            q1d(m,k) = qold(m,i,j,k)
-            end forall
-        
-            if (index_capa > 0)  then
-                do 130 k = 1-num_ghost, mz+num_ghost
-                    dtdz1d(k) = dtdz / aux(index_capa,i,j,k)
-                130 END DO
-            endif
-        
-            if (num_aux > 0)  then
-            ! See the comment on the X sweeps.  This is semantically
-            ! slightly different than putting all the indices in the
-            ! same forall, and hopefully better for optimizing access
-            ! to aux.
-                forall (ja = -1:1, k = 1-num_ghost:mz+num_ghost)
-                forall (ma = 1:num_aux)
-                aux1(ma, k, 2+ja) = aux(ma, i-1, j+ja, k)
-                aux2(ma, k, 2+ja) = aux(ma,   i, j+ja, k)
-                aux3(ma, k, 2+ja) = aux(ma, i+1, j+ja, k)
-                end forall
-                end forall
-            endif
-        
-        !           # Store the value of i and j along this slice in the common block
-        !           # comxyzt in case it is needed in the Riemann solver (for
-        !           # variable coefficient problems)
-        
-            icom = i
-            jcom = j
-        
-        !           # compute modifications fadd, gadd and hadd to fluxes along this
-        !           # slice:
-        
-            call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, &
-            q1d,dtdz1d,dtdx,dtdy,aux1,aux2,aux3,num_aux, &
-            method,mthlim,qadd,fadd,gadd,hadd,cfl1d, &
-            work(i0wave),work(i0s),work(i0amdq), &
-            work(i0apdq),work(i0cqxx), &
-            work(i0bmamdq),work(i0bmapdq), &
-            work(i0bpamdq),work(i0bpapdq), &
-            work(i0cmamdq),work(i0cmapdq), &
-            work(i0cpamdq),work(i0cpapdq), &
-            work(i0cmamdq2),work(i0cmapdq2), &
-            work(i0cpamdq2),work(i0cpapdq2), &
-            work(i0bmcqxxp),work(i0bpcqxxp), &
-            work(i0bmcqxxm),work(i0bpcqxxm), &
-            work(i0cmcqxxp),work(i0cpcqxxp), &
-            work(i0cmcqxxm),work(i0cpcqxxm), &
-            work(i0bmcmamdq),work(i0bmcmapdq), &
-            work(i0bpcmamdq),work(i0bpcmapdq), &
-            work(i0bmcpamdq),work(i0bmcpapdq), &
-            work(i0bpcpamdq),work(i0bpcpapdq), &
-            use_fwave,rpn3,rpt3,rptt3)
-        
-            cfl = dmax1(cfl,cfl1d)
-        
-        !           # update qnew by flux differencing.
-        !           # Note that the roles of the flux updates are changed.
-        !           # fadd - modifies the h-fluxes
-        !           # gadd - modifies the f-fluxes
-        !           # hadd - modifies the g-fluxes
-        
-            if(index_capa == 0)then
-            
-            !              #no capa array. Standard flux differencing:
-            
-                forall (m = 1:num_eqn, k = 1:mz)
-                qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) &
-                - dtdz * (fadd(m,k+1) - fadd(m,k)) &
-                - dtdx * (gadd(m,2,0,k) - gadd(m,1,0,k)) &
-                - dtdy * (hadd(m,2,0,k) - hadd(m,1,0,k))
-                qnew(m,i,j+1,k)   = qnew(m,i,j+1,k) &
-                - dtdx * ( gadd(m,2,1,k) &
-                -   gadd(m,1,1,k) ) &
-                + dtdy * hadd(m,2,0,k)
-                qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) &
-                + dtdx * gadd(m,2,1,k) &
-                + dtdy * hadd(m,2,1,k)
-                qnew(m,i+1,j,k)   = qnew(m,i+1,j,k) &
-                + dtdx * gadd(m,2,0,k) &
-                - dtdy * ( hadd(m,2,1,k) &
-                -   hadd(m,1,1,k) )
-                qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) &
-                + dtdx * gadd(m,2,-1,k) &
-                - dtdy * hadd(m,1,1,k)
-                qnew(m,i,j-1,k)   = qnew(m,i,j-1,k) &
-                - dtdx * ( gadd(m,2,-1,k) &
-                -   gadd(m,1,-1,k) ) &
-                - dtdy * hadd(m,1,0,k)
-                qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) &
-                - dtdx * gadd(m,1,-1,k) &
-                - dtdy * hadd(m,1,-1,k)
-                qnew(m,i-1,j,k)   = qnew(m,i-1,j,k) &
-                - dtdx * gadd(m,1,0,k) &
-                - dtdy * ( hadd(m,2,-1,k) &
-                -   hadd(m,1,-1,k) )
-                qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) &
-                - dtdx * gadd(m,1,1,k) &
-                + dtdy * hadd(m,2,-1,k)
-                end forall
-            else
-            
-            !              # with capa array
-            
-                forall (m = 1:num_eqn, k = 1:mz)
-                qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) &
-                - (dtdz * (fadd(m,k+1) - fadd(m,k)) &
-                +  dtdx * (gadd(m,2,0,k) - gadd(m,1,0,k)) &
-                +  dtdy * (hadd(m,2,0,k) - hadd(m,1,0,k))) &
-                / aux(index_capa,i,j,k)
-                qnew(m,i,j+1,k) = qnew(m,i,j+1,k) &
-                - (dtdx * ( gadd(m,2,1,k) &
-                -   gadd(m,1,1,k) ) &
-                -  dtdy * hadd(m,2,0,k)) &
-                / aux(index_capa,i,j+1,k)
-                qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) &
-                + (dtdx * gadd(m,2,1,k) &
-                +  dtdy * hadd(m,2,1,k)) &
-                / aux(index_capa,i+1,j+1,k)
-                qnew(m,i+1,j,k)   = qnew(m,i+1,j,k) &
-                + (dtdx * gadd(m,2,0,k) &
-                - dtdy * ( hadd(m,2,1,k) &
-                -   hadd(m,1,1,k) )) &
-                / aux(index_capa,i+1,j,k)
-                qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) &
-                + (dtdx * gadd(m,2,-1,k) &
-                -  dtdy * hadd(m,1,1,k)) &
-                / aux(index_capa,i+1,j-1,k)
-                qnew(m,i,j-1,k)   = qnew(m,i,j-1,k) &
-                - (dtdx * ( gadd(m,2,-1,k) &
-                -   gadd(m,1,-1,k) ) &
-                +  dtdy * hadd(m,1,0,k)) &
-                / aux(index_capa,i,j-1,k)
-                qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) &
-                - (dtdx * gadd(m,1,-1,k) &
-                +  dtdy * hadd(m,1,-1,k)) &
-                / aux(index_capa,i-1,j-1,k)
-                qnew(m,i-1,j,k)   = qnew(m,i-1,j,k) &
-                - (dtdx * gadd(m,1,0,k) &
-                + dtdy * ( hadd(m,2,-1,k) &
-                -   hadd(m,1,-1,k) )) &
-                / aux(index_capa,i-1,j,k)
-                qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) &
-                - (dtdx * gadd(m,1,1,k) &
-                -  dtdy * hadd(m,2,-1,k)) &
-                / aux(index_capa,i-1,j+1,k)
-                end forall
-            endif
-        
-    150 END DO
-
-
-    return
-    end subroutine step3
+  common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom
+
+  ! partition work array into pieces needed for local storage in
+  ! flux3 routine.  Find starting index of each piece:
+
+  nthreads=1    ! Serial
+  !$omp parallel
+  !$omp single
+  !$ nthreads = omp_get_num_threads()
+  !$omp end single
+  !$omp end parallel
+
+  nsiz = (maxm+2*num_ghost)*num_eqn
+  nsiz_w = nsiz * num_waves
+  nsiz_s = (maxm+2*num_ghost)*num_waves
+
+  i0wave     = 1 
+  i0s        = i0wave     + nsiz_w*nthreads
+  i0amdq     = i0s        + nsiz_s*nthreads
+  i0apdq     = i0amdq     + nsiz*nthreads
+  i0cqxx     = i0apdq     + nsiz*nthreads
+  i0bmamdq   = i0cqxx     + nsiz*nthreads
+  i0bmapdq   = i0bmamdq   + nsiz*nthreads
+  i0bpamdq   = i0bmapdq   + nsiz*nthreads
+  i0bpapdq   = i0bpamdq   + nsiz*nthreads
+  i0cmamdq   = i0bpapdq   + nsiz*nthreads
+  i0cmapdq   = i0cmamdq   + nsiz*nthreads
+  i0cpamdq   = i0cmapdq   + nsiz*nthreads
+  i0cpapdq   = i0cpamdq   + nsiz*nthreads
+  i0cmamdq2  = i0cpapdq   + nsiz*nthreads
+  i0cmapdq2  = i0cmamdq2  + nsiz*nthreads
+  i0cpamdq2  = i0cmapdq2  + nsiz*nthreads
+  i0cpapdq2  = i0cpamdq2  + nsiz*nthreads
+  i0bmcqxxp  = i0cpapdq2  + nsiz*nthreads
+  i0bmcqxxm  = i0bmcqxxp  + nsiz*nthreads
+  i0bpcqxxp  = i0bmcqxxm  + nsiz*nthreads
+  i0bpcqxxm  = i0bpcqxxp  + nsiz*nthreads
+  i0cmcqxxp  = i0bpcqxxm  + nsiz*nthreads
+  i0cmcqxxm  = i0cmcqxxp  + nsiz*nthreads
+  i0cpcqxxp  = i0cmcqxxm  + nsiz*nthreads
+  i0cpcqxxm  = i0cpcqxxp  + nsiz*nthreads
+  i0bmcmamdq = i0cpcqxxm  + nsiz*nthreads
+  i0bmcmapdq = i0bmcmamdq + nsiz*nthreads
+  i0bpcmamdq = i0bmcmapdq + nsiz*nthreads
+  i0bpcmapdq = i0bpcmamdq + nsiz*nthreads
+  i0bmcpamdq = i0bpcmapdq + nsiz*nthreads
+  i0bmcpapdq = i0bmcpamdq + nsiz*nthreads
+  i0bpcpamdq = i0bmcpapdq + nsiz*nthreads
+  i0bpcpapdq = i0bpcpamdq + nsiz*nthreads
+  iused      = i0bpcpapdq + nsiz*nthreads - 1
+  
+  if (iused > mwork) then
+     ! This shouldn't happen due to checks in claw3
+     write(6,*) '*** not enough work space in step3()'
+     write(6,*) '*** iused = ', iused, '   mwork =',mwork
+     write(6,*) '*** nthreads = ',nthreads
+     stop
+  endif
+
+  index_capa = method(6)
+  !num_aux = method(7)
+  cfl = 0.d0
+  dtdx = dt/dx
+  dtdy = dt/dy
+  dtdz = dt/dz
+
+  ! Calculate the number of blocks to use in each grid direction
+  ! (integer division, rounding up)
+  nblk_i = (mx + blksiz_i + 1)/blksiz_i
+  nblk_j = (my + blksiz_j + 1)/blksiz_j
+  nblk_k = (mz + blksiz_k + 1)/blksiz_k
+
+  !$omp parallel private(me,me1,m,i,j,k,ma,ia,ja,ka,cfl1d,ioffset,joffset,koffset, &
+  !$omp                  iblk,jblk,kblk) reduction(max:cfl)
+
+  me = 0
+  !$ me = omp_get_thread_num()
+  me1 = me + 1
+
+  if (index_capa == 0) then
+     ! no capa array:
+     do i=1-num_ghost,maxm+num_ghost
+        dtdx1d(i,me1) = dtdx
+        dtdy1d(i,me1) = dtdy
+        dtdz1d(i,me1) = dtdz
+     end do
+  endif
+
+  ! perform x-sweeps
+  ! ==================
+
+  ! This block-based approach keeps race conditions from happening
+  ! with OpenMP parallelization.  Because the results of the flux3
+  ! call affect the column flux3 was called on and all adjacent
+  ! columns, an easy way to keep threads from interfering with each
+  ! other is to make sure that every thread takes a column that is
+  ! at least three cells away from all the other threads in each
+  ! direction.  The code here accomplishes this by breaking the
+  ! columns into blocks; for a block size of 2 in all directions,
+  ! the breakdown looks like:
+  !
+  ! xx..xx..
+  ! xx..xx..
+  ! **++**++
+  ! **++**++
+  ! xx..xx..
+  ! xx..xx..
+  ! **++**++
+  ! **++**++
+  !
+  ! All the blocks of columns labeled with the same symbol (e.g. x)
+  ! are done in parallel in one pass; within each block, the work is
+  ! all done by one thread, avoiding race conditions.  Then the code
+  ! flushes memory and synchronizes, and starts the next set of
+  ! blocks (e.g. *).
+  !
+  ! Breaking up into blocks in this fashion helps provide a large
+  ! number of small, independent work items, which helps with
+  ! parallel performance.
+  
+  do koffset=0,1
+     do joffset=0,1
+
+        ! Guided or dynamic scheduling is necessary to keep all the
+        ! threads busy if the work per column is very nonuniform.  With
+        ! many blocks to process, guided is probably the best choice.
+
+        !$omp do collapse(2) schedule(guided)
+        do kblk = koffset, nblk_k-1, 2
+           do jblk = joffset, nblk_j-1, 2
+
+              do k = kblk*blksiz_k, min((kblk+1)*blksiz_k-1, mz+1)
+                 do j = jblk*blksiz_j, min((jblk+1)*blksiz_j-1, my+1)
+                    
+                    ! copy data along a slice into 1d array:
+                    do i = 1-num_ghost, mx+num_ghost
+                       do m = 1, num_eqn
+                          q1d(m,i,me1) = qold(m,i,j,k)
+                       end do
+                    end do
+
+                    if (index_capa > 0)  then
+                       do i = 1-num_ghost, mx+num_ghost
+                          dtdx1d(i,me1) = dtdx / aux(index_capa,i,j,k)
+                       end do
+                    endif
+
+                    if (num_aux > 0)  then
+                       do ka = -1, 1
+                          do i = 1-num_ghost, mx+num_ghost
+                             do ma = 1, num_aux
+                                aux1(ma, i, 2+ka, me1) = aux(ma, i, j-1, k+ka)
+                             end do
+                          end do
+                       end do
+                       do ka = -1, 1
+                          do i = 1-num_ghost, mx+num_ghost
+                             do ma = 1, num_aux
+                                aux2(ma, i, 2+ka, me1) = aux(ma, i,   j, k+ka)
+                             end do
+                          end do
+                       end do
+                       do ka = -1, 1
+                          do i = 1-num_ghost, mx+num_ghost
+                             do ma = 1, num_aux
+                                aux3(ma, i, 2+ka, me1) = aux(ma, i, j+1, k+ka)
+                             end do
+                          end do
+                       end do
+                    endif
+        
+                    ! Store the value of j and k along this slice in the common block
+                    ! comxyt in case it is needed in the Riemann solver (for
+                    ! variable coefficient problems)
+                    
+                    jcom = j
+                    kcom = k
+        
+                    ! compute modifications fadd, gadd and hadd to fluxes along
+                    ! this slice:
+                    
+                    call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, &
+                         q1d(1,1-num_ghost,me1),dtdx1d(1-num_ghost,me1),dtdy,dtdz, &
+                         aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), &
+                         aux3(1,1-num_ghost,1,me1),num_aux, &
+                         method,mthlim,qadd(1,1-num_ghost,me1),&
+                         fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1),&
+                         hadd(1,1,-1,1-num_ghost,me1),cfl1d, &
+                         work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),&
+                         work(i0amdq+me*nsiz), &
+                         work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), &
+                         work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), &
+                         work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), &
+                         work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), &
+                         work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), &
+                         work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), &
+                         work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), &
+                         work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), &
+                         work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), &
+                         work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), &
+                         work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), &
+                         work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), &
+                         work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), &
+                         work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), &
+                         work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), &
+                         use_fwave,rpn3,rpt3,rptt3)
+        
+                    cfl = dmax1(cfl,cfl1d)
+        
+                    ! update qnew by flux differencing.
+                    ! (rather than maintaining arrays f, g and h for
+                    ! the total fluxes, the modifications are used immediately
+                    ! to update qnew in order to save storage.)
+
+                    if(index_capa == 0)then
+                       ! Order so that access to qnew is in memory-contiguous
+                       ! order as much as possible
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) &
+                                  - dtdy * gadd(m,1,-1,i,me1) &
+                                  - dtdz * hadd(m,1,-1,i,me1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k-1) = qnew(m,i,j,k-1) &
+                                  - dtdy * ( gadd(m,2,-1,i,me1) &
+                                  -   gadd(m,1,-1,i,me1) ) &
+                                  - dtdz * hadd(m,1,0,i,me1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) &
+                                  + dtdy * gadd(m,2,-1,i,me1) &
+                                  - dtdz * hadd(m,1,1,i,me1)
+                          end do
+                       end do
+                       
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k) = qnew(m,i,j-1,k) &
+                                  - dtdy * gadd(m,1,0,i,me1) &
+                                  - dtdz * ( hadd(m,2,-1,i,me1) &
+                                  -   hadd(m,1,-1,i,me1) )
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) &
+                                  - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) &
+                                  - dtdy * (gadd(m,2,0,i,me1) - gadd(m,1,0,i,me1)) &
+                                  - dtdz * (hadd(m,2,0,i,me1) - hadd(m,1,0,i,me1))
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k) = qnew(m,i,j+1,k) &
+                                  + dtdy * gadd(m,2,0,i,me1) &
+                                  - dtdz * ( hadd(m,2,1,i,me1) &
+                                  -   hadd(m,1,1,i,me1) )
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) &
+                                  - dtdy * gadd(m,1,1,i,me1) &
+                                  + dtdz * hadd(m,2,-1,i,me1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k+1) = qnew(m,i,j,k+1) &
+                                  - dtdy * ( gadd(m,2,1,i,me1) &
+                                  -   gadd(m,1,1,i,me1) ) &
+                                  + dtdz * hadd(m,2,0,i,me1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) &
+                                  + dtdy * gadd(m,2,1,i,me1) &
+                                  + dtdz * hadd(m,2,1,i,me1)
+                          end do
+                       end do
+                    else
+                       ! with capa array
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) &
+                                  - (dtdy * gadd(m,1,-1,i,me1) &
+                                  +  dtdz * hadd(m,1,-1,i,me1)) &
+                                  / aux(index_capa,i,j-1,k-1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k-1) = qnew(m,i,j,k-1) &
+                                  - (dtdy * ( gadd(m,2,-1,i,me1) &
+                                  -   gadd(m,1,-1,i,me1) ) &
+                                  +  dtdz * hadd(m,1,0,i,me1)) &
+                                  / aux(index_capa,i,j,k-1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) &
+                                  + (dtdy * gadd(m,2,-1,i,me1) &
+                                  -  dtdz * hadd(m,1,1,i,me1)) &
+                                  / aux(index_capa,i,j+1,k-1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k) = qnew(m,i,j-1,k) &
+                                  - (dtdy * gadd(m,1,0,i,me1) &
+                                  +  dtdz * ( hadd(m,2,-1,i,me1) &
+                                  -   hadd(m,1,-1,i,me1) )) &
+                                  / aux(index_capa,i,j-1,k)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) &
+                                  - (dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) &
+                                  +  dtdy * (gadd(m,2,0,i,me1) - gadd(m,1,0,i,me1)) &
+                                  +  dtdz * (hadd(m,2,0,i,me1) - hadd(m,1,0,i,me1)))&
+                                  / aux(index_capa,i,j,k)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k)   = qnew(m,i,j+1,k) &
+                                  + (dtdy * gadd(m,2,0,i,me1) &
+                                  - dtdz * ( hadd(m,2,1,i,me1) &
+                                  -   hadd(m,1,1,i,me1) )) &
+                                  / aux(index_capa,i,j+1,k)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) &
+                                  - (dtdy * gadd(m,1,1,i,me1) &
+                                  -  dtdz * hadd(m,2,-1,i,me1)) &
+                                  / aux(index_capa,i,j-1,k+1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k+1)   = qnew(m,i,j,k+1) &
+                                  - (dtdy * ( gadd(m,2,1,i,me1) &
+                                  -   gadd(m,1,1,i,me1) ) &
+                                  -  dtdz * hadd(m,2,0,i,me1)) &
+                                  / aux(index_capa,i,j,k+1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) &
+                                  + (dtdy * gadd(m,2,1,i,me1) &
+                                  +  dtdz * hadd(m,2,1,i,me1)) &
+                                  / aux(index_capa,i,j+1,k+1)
+                          end do
+                       end do
+                    endif
+                    
+                 end do
+              end do    ! End loops within block
+              
+           end do
+        end do    ! End loops over blocks
+
+        ! Flush memory (may not be necessary), then make sure everybody
+        ! synchronizes before the next offset
+
+        !$omp flush
+        !$omp barrier
+     end do
+  end do    ! End loops over offsets
+
+  ! perform y sweeps
+  ! ==================
+
+  do koffset=0,1
+     do ioffset=0,1
+
+        !$omp do collapse(2) schedule(guided)
+        do kblk = koffset, nblk_k-1, 2
+           do iblk = ioffset, nblk_i-1, 2
+
+              do k = kblk*blksiz_k, min((kblk+1)*blksiz_k-1, mz+1)
+                 do i = iblk*blksiz_i, min((iblk+1)*blksiz_i-1, mx+1)
+                    
+                    ! copy data along a slice into 1d array:
+                    do j = 1-num_ghost, my+num_ghost
+                       do m = 1, num_eqn
+                          q1d(m,j,me1) = qold(m,i,j,k)
+                       end do
+                    end do
+
+                    if (index_capa > 0)  then
+                       do j = 1-num_ghost, my+num_ghost
+                          dtdy1d(j,me1) = dtdy / aux(index_capa,i,j,k)
+                       end do
+                    endif
+                    
+                    if (num_aux > 0)  then
+                       ! aux1, aux2, aux3 probably fit in cache, so optimize
+                       ! access to aux
+                       do j = 1-num_ghost, my+num_ghost
+                          do ia = -1, 1
+                             do ma = 1, num_aux
+                                aux1(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k-1)
+                             end do
+                          end do
+                       end do
+                       do j = 1-num_ghost, my+num_ghost
+                          do ia = -1, 1
+                             do ma = 1, num_aux
+                                aux2(ma, j, 2+ia, me1) = aux(ma, i+ia, j,   k)
+                             end do
+                          end do
+                       end do
+                       do j = 1-num_ghost, my+num_ghost
+                          do ia = -1, 1
+                             do ma = 1, num_aux
+                                aux3(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k+1)
+                             end do
+                          end do
+                       end do
+                    endif
+                    
+                    ! Store the value of i and k along this slice in the common block
+                    ! comxyzt in case it is needed in the Riemann solver (for
+                    ! variable coefficient problems)
+        
+                    icom = i
+                    kcom = k
+        
+                    ! compute modifications fadd, gadd and hadd to fluxes along this
+                    ! slice:
+
+                    call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, &
+                         q1d(1,1-num_ghost,me1),dtdy1d(1-num_ghost,me1),dtdz,dtdx, &
+                         aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), &
+                         aux3(1,1-num_ghost,1,me1),num_aux, &
+                         method,mthlim,qadd(1,1-num_ghost,me1),&
+                         fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1),&
+                         hadd(1,1,-1,1-num_ghost,me1),cfl1d, &
+                         work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),&
+                         work(i0amdq+me*nsiz), &
+                         work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), &
+                         work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), &
+                         work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), &
+                         work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), &
+                         work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), &
+                         work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), &
+                         work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), &
+                         work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), &
+                         work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), &
+                         work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), &
+                         work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), &
+                         work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), &
+                         work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), &
+                         work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), &
+                         work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), &
+                         use_fwave,rpn3,rpt3,rptt3)
+        
+                    cfl = dmax1(cfl,cfl1d)
+
+                    ! update qnew by flux differencing.
+                    ! Note that the roles of the flux updates are changed.
+                    ! fadd - modifies the g-fluxes
+                    ! gadd - modifies the h-fluxes
+                    ! hadd - modifies the f-fluxes
+                    
+                    if( index_capa == 0)then
+                       ! no capa array.  Standard flux differencing:
+                       do j = 1, my
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) &
+                                  - dtdz * gadd(m,1,-1,j,me1) &
+                                  - dtdx * hadd(m,1,-1,j,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k-1) = qnew(m,i,j,k-1) &
+                                  - dtdz * gadd(m,1,0,j,me1) &
+                                  - dtdx * ( hadd(m,2,-1,j,me1) &
+                                  -   hadd(m,1,-1,j,me1) )
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) &
+                                  - dtdz * gadd(m,1,1,j,me1) &
+                                  + dtdx * hadd(m,2,-1,j,me1)
+                          end do
+                       end do
+                       do j = 1, my
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k) = qnew(m,i-1,j,k) &
+                                  - dtdz * ( gadd(m,2,-1,j,me1) &
+                                  -   gadd(m,1,-1,j,me1) ) &
+                                  - dtdx * hadd(m,1,0,j,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) &
+                                  - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) &
+                                  - dtdz * (gadd(m,2,0,j,me1) - gadd(m,1,0,j,me1)) &
+                                  - dtdx * (hadd(m,2,0,j,me1) - hadd(m,1,0,j,me1))
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k) = qnew(m,i+1,j,k) &
+                                  - dtdz * ( gadd(m,2,1,j,me1) &
+                                  -   gadd(m,1,1,j,me1) ) &
+                                  + dtdx * hadd(m,2,0,j,me1)
+                          end do
+                       end do
+                       do j = 1, my
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) &
+                                  + dtdz * gadd(m,2,-1,j,me1) &
+                                  -  dtdx*hadd(m,1,1,j,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k+1) = qnew(m,i,j,k+1) &
+                                  + dtdz * gadd(m,2,0,j,me1) &
+                                  - dtdx * ( hadd(m,2,1,j,me1) &
+                                  -   hadd(m,1,1,j,me1) )
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) &
+                                  + dtdz * gadd(m,2,1,j,me1) &
+                                  +  dtdx * hadd(m,2,1,j,me1)
+                          end do
+                       end do
+                    else
+                       ! with capa array.
+                       do j = 1, my
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) &
+                                  - (dtdz * gadd(m,1,-1,j,me1) &
+                                  +  dtdx * hadd(m,1,-1,j,me1)) &
+                                  / aux(index_capa,i-1,j,k-1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k-1) = qnew(m,i,j,k-1) &
+                                  - (dtdz * gadd(m,1,0,j,me1) &
+                                  +  dtdx * ( hadd(m,2,-1,j,me1) &
+                                  -   hadd(m,1,-1,j,me1) )) &
+                                  / aux(index_capa,i,j,k-1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) &
+                                  - (dtdz * gadd(m,1,1,j,me1) &
+                                  -  dtdx * hadd(m,2,-1,j,me1)) &
+                                  / aux(index_capa,i+1,j,k-1)
+                          end do
+                       end do
+                       do j = 1, my
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k) = qnew(m,i-1,j,k) &
+                                  - (dtdz * ( gadd(m,2,-1,j,me1) &
+                                  -   gadd(m,1,-1,j,me1) ) &
+                                  +  dtdx * hadd(m,1,0,j,me1)) &
+                                  / aux(index_capa,i-1,j,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) &
+                                  - (dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) &
+                                  +  dtdz * (gadd(m,2,0,j,me1) - gadd(m,1,0,j,me1)) &
+                                  +  dtdx * (hadd(m,2,0,j,me1) - hadd(m,1,0,j,me1)))&
+                                  / aux(index_capa,i,j,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k) = qnew(m,i+1,j,k) &
+                                  - (dtdz * ( gadd(m,2,1,j,me1) &
+                                  -   gadd(m,1,1,j,me1) ) &
+                                  -  dtdx * hadd(m,2,0,j,me1) ) &
+                                  / aux(index_capa,i+1,j,k)
+                          end do
+                       end do
+                       do j = 1, my
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) &
+                                  + (dtdz * gadd(m,2,-1,j,me1) &
+                                  -  dtdx*hadd(m,1,1,j,me1)) &
+                                  / aux(index_capa,i-1,j,k+1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k+1) = qnew(m,i,j,k+1) &
+                                  + (dtdz * gadd(m,2,0,j,me1) &
+                                  -  dtdx * ( hadd(m,2,1,j,me1) &
+                                  -   hadd(m,1,1,j,me1) )) &
+                                  / aux(index_capa,i,j,k+1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) &
+                                  + (dtdz * gadd(m,2,1,j,me1) &
+                                  +  dtdx * hadd(m,2,1,j,me1)) &
+                                  / aux(index_capa,i+1,j,k+1)
+                          end do
+                       end do
+                    endif
+                    
+                 end do
+              end do    ! End loops within blocks
+
+           end do
+        end do    ! End loops over blocks
+        
+        !$omp flush
+        !$omp barrier
+     end do
+  end do    ! End loops over offsets
+
+  ! perform z sweeps
+  ! ==================
+
+  do joffset=0,1
+     do ioffset=0,1
+        
+        !$omp do collapse(2) schedule(guided)
+        do jblk = joffset, nblk_j-1, 2
+           do iblk = ioffset, nblk_i-1, 2
+              
+              do j = jblk*blksiz_j, min((jblk+1)*blksiz_j-1, my+1)
+                 do i = iblk*blksiz_i, min((iblk+1)*blksiz_i-1, mx+1)
+                    
+                    do k = 1-num_ghost, mz+num_ghost
+                       do m = 1, num_eqn
+                          q1d(m,k,me1) = qold(m,i,j,k)
+                       end do
+                    end do
+                    
+                    if (index_capa > 0)  then
+                       do k = 1-num_ghost, mz+num_ghost
+                          dtdz1d(k,me1) = dtdz / aux(index_capa,i,j,k)
+                       end do
+                    endif
+                    
+                    if (num_aux > 0)  then
+                       do k = 1-num_ghost, mz+num_ghost
+                          do ja = -1, 1
+                             do ma = 1, num_aux
+                                aux1(ma, k, 2+ja, me1) = aux(ma, i-1, j+ja, k)
+                             end do
+                             do ma = 1, num_aux
+                                aux2(ma, k, 2+ja, me1) = aux(ma,   i, j+ja, k)
+                             end do
+                             do ma = 1, num_aux
+                                aux3(ma, k, 2+ja, me1) = aux(ma, i+1, j+ja, k)
+                             end do
+                          end do
+                       end do
+                    endif
+
+
+        
+                    ! Store the value of i and j along this slice in the common block
+                    ! comxyzt in case it is needed in the Riemann solver (for
+                    ! variable coefficient problems)
+        
+                    icom = i
+                    jcom = j
+        
+                    ! compute modifications fadd, gadd and hadd to fluxes along this
+                    ! slice:
+
+                    call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, &
+                         q1d(1,1-num_ghost,me1),dtdz1d(1-num_ghost,me1),dtdx,dtdy, &
+                         aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), &
+                         aux3(1,1-num_ghost,1,me1),num_aux, &
+                         method,mthlim,qadd(1,1-num_ghost,me1), &
+                         fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1), &
+                         hadd(1,1,-1,1-num_ghost,me1),cfl1d, &
+                         work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s), &
+                         work(i0amdq+me*nsiz), &
+                         work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), &
+                         work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), &
+                         work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), &
+                         work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), &
+                         work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), &
+                         work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), &
+                         work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), &
+                         work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), &
+                         work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), &
+                         work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), &
+                         work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), &
+                         work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), &
+                         work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), &
+                         work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), &
+                         work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), &
+                         use_fwave,rpn3,rpt3,rptt3)
+        
+                    cfl = dmax1(cfl,cfl1d)
+
+                    ! update qnew by flux differencing.
+                    ! Note that the roles of the flux updates are changed.
+                    ! fadd - modifies the h-fluxes
+                    ! gadd - modifies the f-fluxes
+                    ! hadd - modifies the g-fluxes
+
+                    if(index_capa == 0)then
+                       
+                       ! no capa array. Standard flux differencing:
+                       do k = 1, mz
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) &
+                                  - dtdx * gadd(m,1,-1,k,me1) &
+                                  - dtdy * hadd(m,1,-1,k,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k) = qnew(m,i,j-1,k) &
+                                  - dtdx * ( gadd(m,2,-1,k,me1) &
+                                  -   gadd(m,1,-1,k,me1) ) &
+                                  - dtdy * hadd(m,1,0,k,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) &
+                                  + dtdx * gadd(m,2,-1,k,me1) &
+                                  - dtdy * hadd(m,1,1,k,me1)
+                          end do
+                          
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k) = qnew(m,i-1,j,k) &
+                                  - dtdx * gadd(m,1,0,k,me1) &
+                                  - dtdy * ( hadd(m,2,-1,k,me1) &
+                                  -   hadd(m,1,-1,k,me1) )
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) &
+                                  - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) &
+                                  - dtdx * (gadd(m,2,0,k,me1) - gadd(m,1,0,k,me1)) &
+                                  - dtdy * (hadd(m,2,0,k,me1) - hadd(m,1,0,k,me1))
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k) = qnew(m,i+1,j,k) &
+                                  + dtdx * gadd(m,2,0,k,me1) &
+                                  - dtdy * ( hadd(m,2,1,k,me1) &
+                                  -   hadd(m,1,1,k,me1) )
+                          end do
+                          
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) &
+                                  - dtdx * gadd(m,1,1,k,me1) &
+                                  + dtdy * hadd(m,2,-1,k,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k) = qnew(m,i,j+1,k) &
+                                  - dtdx * ( gadd(m,2,1,k,me1) &
+                                  -   gadd(m,1,1,k,me1) ) &
+                                  + dtdy * hadd(m,2,0,k,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) &
+                                  + dtdx * gadd(m,2,1,k,me1) &
+                                  + dtdy * hadd(m,2,1,k,me1)
+                          end do
+                       end do
+                    else
+                       
+                       ! with capa array
+                       do k = 1, mz
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) &
+                                  - (dtdx * gadd(m,1,-1,k,me1) &
+                                  +  dtdy * hadd(m,1,-1,k,me1)) &
+                                  / aux(index_capa,i-1,j-1,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k) = qnew(m,i,j-1,k) &
+                                  - (dtdx * ( gadd(m,2,-1,k,me1) &
+                                  -   gadd(m,1,-1,k,me1) ) &
+                                  +  dtdy * hadd(m,1,0,k,me1)) &
+                                  / aux(index_capa,i,j-1,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) &
+                                  + (dtdx * gadd(m,2,-1,k,me1) &
+                                  -  dtdy * hadd(m,1,1,k,me1)) &
+                                  / aux(index_capa,i+1,j-1,k)
+                          end do
+                          
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k)   = qnew(m,i-1,j,k) &
+                                  - (dtdx * gadd(m,1,0,k,me1) &
+                                  + dtdy * ( hadd(m,2,-1,k,me1) &
+                                  -   hadd(m,1,-1,k,me1) )) &
+                                  / aux(index_capa,i-1,j,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) &
+                                  - (dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) &
+                                  +  dtdx * (gadd(m,2,0,k,me1) - gadd(m,1,0,k,me1)) &
+                                  +  dtdy * (hadd(m,2,0,k,me1) - hadd(m,1,0,k,me1)))&
+                                  / aux(index_capa,i,j,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k) = qnew(m,i+1,j,k) &
+                                  + (dtdx * gadd(m,2,0,k,me1) &
+                                  - dtdy * ( hadd(m,2,1,k,me1) &
+                                  -   hadd(m,1,1,k,me1) )) &
+                                  / aux(index_capa,i+1,j,k)
+                          end do
+                          
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) &
+                                  - (dtdx * gadd(m,1,1,k,me1) &
+                                  -  dtdy * hadd(m,2,-1,k,me1)) &
+                                  / aux(index_capa,i-1,j+1,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k) = qnew(m,i,j+1,k) &
+                                  - (dtdx * ( gadd(m,2,1,k,me1) &
+                                  -   gadd(m,1,1,k,me1) ) &
+                                  -  dtdy * hadd(m,2,0,k,me1)) &
+                                  / aux(index_capa,i,j+1,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) &
+                                  + (dtdx * gadd(m,2,1,k,me1) &
+                                  +  dtdy * hadd(m,2,1,k,me1)) &
+                                  / aux(index_capa,i+1,j+1,k)
+                          end do
+                       end do
+                    endif
+                    
+                 end do
+              end do
+              
+           end do
+        end do
+
+        !$omp flush
+        !$omp barrier
+     end do
+  end do
+  !$omp end parallel
+
+  return
+end subroutine step3
 
 
diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90
index 7d672f3b6..79d9f0d89 100644
--- a/src/pyclaw/classic/step3ds.f90
+++ b/src/pyclaw/classic/step3ds.f90
@@ -38,7 +38,7 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
        1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
   dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, &
        1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-  dimension  q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) ! Adding and subtracting mx is a hack to trick f2py, so it does not make nthreads optional.
+  dimension  q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
   dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
   dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
   dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
@@ -118,8 +118,9 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
 
   if (iused > mwork) then
      ! This shouldn't happen due to checks in claw3
-     write(6,*) '*** not enough work space in step3'
+     write(6,*) '*** not enough work space in step3ds()'
      write(6,*) '*** iused = ', iused, '   mwork =',mwork
+     write(6,*) '*** nthreads = ',nthreads
      stop
   endif
 

From 9a871d6b7e54dbd366735053923600418aa43889 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Thu, 5 Nov 2015 09:29:33 -0800
Subject: [PATCH 06/19] Updated implementation to use 1 OpenMP thread by
 default and give a warning that the 'OMP_NUM_THREADS' environment varibale is
 unset.  This allows an unaware user to run their codes without any changes,
 and they will see the warning and can adjust accordingly.

---
 src/pyclaw/classic/solver.py   | 9 ++++++---
 src/pyclaw/classic/step3.f90   | 5 +++++
 src/pyclaw/classic/step3ds.f90 | 5 +++++
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index 888cb181a..af9b94523 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -612,9 +612,12 @@ def __init__(self, riemann_solver=None, claw_package=None):
         self.work = None
 
         import os
-        import multiprocessing
-        self.nthreads = int(os.environ.get('OMP_NUM_THREADS',\
-                                           multiprocessing.cpu_count()))
+        import warnings
+        if 'OMP_NUM_THREADS' in os.environ:
+            pass # Using OpenMP
+        else:
+            warnings.warn("The environment variable 'OMP_NUM_THREADS' is unset.  Set this variable to use OpenMP with more than 1 thread (i.e. export OMP_NUM_THREADS=4).")
+        self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1))
 
         super(ClawSolver3D,self).__init__(riemann_solver, claw_package)
 
diff --git a/src/pyclaw/classic/step3.f90 b/src/pyclaw/classic/step3.f90
index ec8b91556..89233b85c 100644
--- a/src/pyclaw/classic/step3.f90
+++ b/src/pyclaw/classic/step3.f90
@@ -40,6 +40,7 @@ subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
   dimension method(7),mthlim(num_waves)
   dimension work(mwork)
   logical :: use_fwave
+  character :: env_value
 
   ! These block sizes must be at least 2 to avoid multiple threads
   ! trying to update the same grid cell
@@ -61,6 +62,10 @@ subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
   ! flux3 routine.  Find starting index of each piece:
 
   nthreads=1    ! Serial
+  CALL GET_ENVIRONMENT_VARIABLE('OMP_NUM_THREADS',env_value,nlen,nstat)
+  IF(nstat >= 1)THEN
+     !$ call omp_set_num_threads(1) ! set default to 1 thread
+  END IF
   !$omp parallel
   !$omp single
   !$ nthreads = omp_get_num_threads()
diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90
index 79d9f0d89..4186d86e5 100644
--- a/src/pyclaw/classic/step3ds.f90
+++ b/src/pyclaw/classic/step3ds.f90
@@ -54,6 +54,7 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
   dimension method(7),mthlim(num_waves)
   dimension work(mwork)
   logical :: use_fwave
+  character :: env_value
 
   common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom
 
@@ -71,6 +72,10 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
   ! flux3 routine.  Find starting index of each piece:
 
   nthreads=1 ! Serial
+  CALL GET_ENVIRONMENT_VARIABLE('OMP_NUM_THREADS',env_value,nlen,nstat)
+  IF(nstat >= 1)THEN
+     !$ call omp_set_num_threads(1) ! set default to 1 thread
+  END IF
   !$OMP PARALLEL
   !$OMP SINGLE
   !$ nthreads = omp_get_num_threads()

From 44230e85a024a5a754eab25ac0c7f4102002f140 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Thu, 5 Nov 2015 09:43:21 -0800
Subject: [PATCH 07/19] small simplification to OMP_NUM_THREADS env variable
 check.

---
 src/pyclaw/classic/solver.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index af9b94523..642320912 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -613,9 +613,7 @@ def __init__(self, riemann_solver=None, claw_package=None):
 
         import os
         import warnings
-        if 'OMP_NUM_THREADS' in os.environ:
-            pass # Using OpenMP
-        else:
+        if 'OMP_NUM_THREADS' not in os.environ:
             warnings.warn("The environment variable 'OMP_NUM_THREADS' is unset.  Set this variable to use OpenMP with more than 1 thread (i.e. export OMP_NUM_THREADS=4).")
         self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1))
 

From 2d325b45f5e62c3036e24571b857f2bc6d97f2ba Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Fri, 6 Nov 2015 09:31:56 -0800
Subject: [PATCH 08/19] Updated OpenMP warning to use the PyClaw logging.  This
 prints to the console and to the log file.

---
 src/pyclaw/classic/solver.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index 642320912..2b91c234e 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -610,15 +610,17 @@ def __init__(self, riemann_solver=None, claw_package=None):
         self.aux2 = None
         self.aux3 = None
         self.work = None
+        self.nthreads = None
+
+        super(ClawSolver3D,self).__init__(riemann_solver, claw_package)
 
         import os
-        import warnings
         if 'OMP_NUM_THREADS' not in os.environ:
-            warnings.warn("The environment variable 'OMP_NUM_THREADS' is unset.  Set this variable to use OpenMP with more than 1 thread (i.e. export OMP_NUM_THREADS=4).")
+            self.logger.warn(""""The env variable 'OMP_NUM_THREADS' is unset.
+                                       Set this variable to use OpenMP with more 
+                                       than 1 thread (i.e. export OMP_NUM_THREADS=4).""")
         self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1))
 
-        super(ClawSolver3D,self).__init__(riemann_solver, claw_package)
-
     # ========== Setup routine =============================   
     def _allocate_workspace(self,solution):
         r"""

From 66a28ee57987e783e7a4d48a55f10d4cfb5f85f2 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Fri, 16 Oct 2015 11:30:22 -0700
Subject: [PATCH 09/19] Implemented OpenMP in the PyClaw step3ds.f90 for the
 dimensioanlly split algorithm.  The unsplit algorithm can also be updated. 
 Usage requires setting the environment variable OMP_NUM_THREADS before
 program execution.  In addition the gfortran flag '-fopenmp' and the f2py
 flag '-lgomp' are required when compiling step3ds.f90.  This is done for a
 custom version of classic3.so, and might also be necessary for the general
 version.

---
 src/pyclaw/classic/solver.py   |  37 +-
 src/pyclaw/classic/step3ds.f90 | 743 ++++++++++++++++++---------------
 2 files changed, 420 insertions(+), 360 deletions(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index ab7277aef..53c3a023e 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -611,6 +611,12 @@ def __init__(self, riemann_solver=None, claw_package=None):
         self.aux3 = None
         self.work = None
 
+        import os
+        if os.environ['OMP_NUM_THREADS'] == None:
+            self.nthreads = 1
+        else:
+            self.nthreads = int(os.environ['OMP_NUM_THREADS'])
+
         super(ClawSolver3D,self).__init__(riemann_solver, claw_package)
 
     # ========== Setup routine =============================   
@@ -636,10 +642,12 @@ def _allocate_workspace(self,solution):
 
         # These work arrays really ought to live inside a fortran module
         # as is done for sharpclaw
-        self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3),order='F')
-        self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3),order='F')
-        self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3),order='F')
-        mwork = (maxm+2*num_ghost) * (31*num_eqn + num_waves + num_eqn*num_waves)
+        print "nthreads:",self.nthreads
+        self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
+        self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
+        self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
+        mwork = (maxm+2*num_ghost)\
+                *(31*num_eqn + num_waves + num_eqn*num_waves)*self.nthreads
         self.work = np.empty((mwork),order='F')
 
 
@@ -676,17 +684,20 @@ def step_hyperbolic(self,solution):
                 #Right now only Godunov-dimensional-splitting is implemented.
                 #Strang-dimensional-splitting could be added following dimsp3.f in Clawpack.
 
-                q, cfl_x = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \
-                      qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\
-                      self.aux1,self.aux2,self.aux3,self.work,1,self.fwave,rpn3,rpt3,rptt3)
+                q, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\
+                    self.fwave,rpn3,rpt3,rptt3)
 
-                q, cfl_y = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \
-                      q,q,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\
-                      self.aux1,self.aux2,self.aux3,self.work,2,self.fwave,rpn3,rpt3,rptt3)
+                q, cfl_y = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,2,\
+                    self.fwave,rpn3,rpt3,rptt3)
 
-                q, cfl_z = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \
-                      q,q,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\
-                      self.aux1,self.aux2,self.aux3,self.work,3,self.fwave,rpn3,rpt3,rptt3)
+                q, cfl_z = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\
+                    self.fwave,rpn3,rpt3,rptt3)
 
                 cfl = max(cfl_x,cfl_y,cfl_z)
 
diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90
index c248a18ec..3b3ed70a6 100644
--- a/src/pyclaw/classic/step3ds.f90
+++ b/src/pyclaw/classic/step3ds.f90
@@ -1,61 +1,61 @@
 
+! ==================================================================
+subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
+     mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, &
+     qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d,aux1,aux2,aux3,&
+     num_aux,work,mwork,idir,use_fwave,rpn3,rpt3,rptt3)
+! ==================================================================
 
+  ! Take one time step, updating q, to be used with
+  ! dimensional splitting.
+  ! On entry, qold and qnew should be identical and give the
+  !    initial data for this step.
+  ! On exit, qnew returns values at the end of the time step.
+  !    qold is unchanged.
+  !
+  ! This is a simplified version of the subroutine step3d
+  ! Since only qadd and fadd is used in the flux updates,
+  ! the solution updates below are considerably simplified
+  ! compared to step3d.
+  
+  !-----------------------------------------------------------------
+  ! NOTE! Since dimensional splitting is used, it is possible
+  !       to reduce the memory requirement, i.e. the size of
+  !       the work array. It could be reduced with
+  !
+  !       (maxm + 2*num_ghost)*(37*num_eqn + 6*num_aux),
+  !
+  !       when also possible reductions in flux3 are included.
+  !       However, this term is small compared to the dominating
+  !       term (mx+2num_ghost)(my+2mb)*(mz+2num_ghost).
+  !-----------------------------------------------------------------
+  
+  !$ use omp_lib
 
-!     ==================================================================
-    subroutine step3ds(maxm,num_eqn,num_waves,num_ghost,mx,my, &
-    mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, &
-    qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d, &
-    aux1,aux2,aux3,num_aux,work,mwork,idir,use_fwave,rpn3,rpt3,rptt3)
-!     ==================================================================
+  implicit real*8(a-h,o-z)
+  external rpn3,rpt3,rptt3
+  dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, &
+       1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
+  dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, &
+       1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
+  dimension  q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension aux(num_aux, 1-num_ghost:mx+num_ghost, &
+       1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
+  dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx)
+  dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx)
+  dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx)
+  dimension dtdx1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension dtdy1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension dtdz1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension method(7),mthlim(num_waves)
+  dimension work(mwork)
+  logical :: use_fwave
 
-!     # Take one time step, updating q, to be used with
-!     # dimensional splitting.
-!     # On entry, qold and qnew should be identical and give the
-!     #    initial data for this step.
-!     # On exit, qnew returns values at the end of the time step.
-!     #    qold is unchanged.
-
-!     # This is a simplified version of the subroutine step3d
-!     # Since only qadd and fadd is used in the flux updates,
-!     # the solution updates below are considerably simplified
-!     # compared to step3d.
-
-!     #-----------------------------------------------------------------
-!     # NOTE! Since dimensional splitting is used, it is possible
-!     #       to reduce the memory requirement, i.e. the size of
-!     #       the work array. It could be reduced with
-!     #
-!     #       (maxm + 2*num_ghost)*(37*num_eqn + 6*num_aux),
-!     #
-!     #       when also possible reductions in flux3 are included.
-!     #       However, this term is small compared to the dominating
-!     #       term (mx+2num_ghost)(my+2mb)*(mz+2num_ghost).
-!     #-----------------------------------------------------------------
-
-    implicit real*8(a-h,o-z)
-    external rpn3,rpt3,rptt3
-    dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, &
-    1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-    dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, &
-    1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-    dimension  q1d(num_eqn,1-num_ghost:maxm+num_ghost)
-    dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost)
-    dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost)
-    dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost)
-    dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost)
-    dimension aux(num_aux, 1-num_ghost:mx+num_ghost, &
-    1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-    dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3)
-    dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3)
-    dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3)
-    dimension dtdx1d(1-num_ghost:maxm+num_ghost)
-    dimension dtdy1d(1-num_ghost:maxm+num_ghost)
-    dimension dtdz1d(1-num_ghost:maxm+num_ghost)
-    dimension method(7),mthlim(num_waves)
-    dimension work(mwork)
-    logical ::          use_fwave
-
-    common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom
+  common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom
 
 !f2py intent(out) cfl
 !f2py intent(in,out) qnew
@@ -67,317 +67,366 @@ subroutine step3ds(maxm,num_eqn,num_waves,num_ghost,mx,my, &
 !f2py x=rpt3(x)
 !f2py x=rptt3(x)
 
-!     # partition work array into pieces needed for local storage in
-!     # flux2 routine.  Find starting index of each piece:
+  ! partition work array into pieces needed for local storage in
+  ! flux3 routine.  Find starting index of each piece:
 
-    i0wave     = 1
-    i0s        = i0wave     + (maxm+2*num_ghost)*num_eqn*num_waves
-    i0amdq     = i0s        + (maxm+2*num_ghost)*num_waves
-    i0apdq     = i0amdq     + (maxm+2*num_ghost)*num_eqn
-    i0cqxx     = i0apdq     + (maxm+2*num_ghost)*num_eqn
-    i0bmamdq   = i0cqxx     + (maxm+2*num_ghost)*num_eqn
-    i0bmapdq   = i0bmamdq   + (maxm+2*num_ghost)*num_eqn
-    i0bpamdq   = i0bmapdq   + (maxm+2*num_ghost)*num_eqn
-    i0bpapdq   = i0bpamdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmamdq   = i0bpapdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmapdq   = i0cmamdq   + (maxm+2*num_ghost)*num_eqn
-    i0cpamdq   = i0cmapdq   + (maxm+2*num_ghost)*num_eqn
-    i0cpapdq   = i0cpamdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmamdq2  = i0cpapdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmapdq2  = i0cmamdq2  + (maxm+2*num_ghost)*num_eqn
-    i0cpamdq2  = i0cmapdq2  + (maxm+2*num_ghost)*num_eqn
-    i0cpapdq2  = i0cpamdq2  + (maxm+2*num_ghost)*num_eqn
-    i0bmcqxxp  = i0cpapdq2  + (maxm+2*num_ghost)*num_eqn
-    i0bmcqxxm  = i0bmcqxxp  + (maxm+2*num_ghost)*num_eqn
-    i0bpcqxxp  = i0bmcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0bpcqxxm  = i0bpcqxxp   + (maxm+2*num_ghost)*num_eqn
-    i0cmcqxxp  = i0bpcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0cmcqxxm  = i0cmcqxxp   + (maxm+2*num_ghost)*num_eqn
-    i0cpcqxxp  = i0cmcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0cpcqxxm  = i0cpcqxxp   + (maxm+2*num_ghost)*num_eqn
-    i0bmcmamdq = i0cpcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0bmcmapdq = i0bmcmamdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcmamdq = i0bmcmapdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcmapdq = i0bpcmamdq + (maxm+2*num_ghost)*num_eqn
-    i0bmcpamdq = i0bpcmapdq + (maxm+2*num_ghost)*num_eqn
-    i0bmcpapdq = i0bmcpamdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcpamdq = i0bmcpapdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcpapdq = i0bpcpamdq + (maxm+2*num_ghost)*num_eqn
-    iused      = i0bpcpapdq + (maxm+2*num_ghost)*num_eqn - 1
+  nthreads=1 ! Serial
+  !$OMP PARALLEL
+  !$OMP SINGLE
+  !$ nthreads = omp_get_num_threads()
+  !$OMP END SINGLE
+  !$OMP END PARALLEL
 
-    if (iused > mwork) then
-    !        # This shouldn't happen due to checks in claw2
-        write(6,*) '*** not enough work space in step2'
-        write(6,*) '*** iused = ', iused, '   mwork =',mwork
-        stop
-    endif
+  nsiz = (maxm+2*num_ghost)*num_eqn
+  nsiz_w = nsiz * num_waves
+  nsiz_s = (maxm+2*num_ghost)*num_waves
 
-    index_capa = method(6)
-!      num_aux = method(7)
-    cfl = 0.d0
-    dtdx = dt/dx
-    dtdy = dt/dy
-    dtdz = dt/dz
+  i0wave     = 1
+  i0s        = i0wave     + nsiz_w*nthreads
+  i0amdq     = i0s        + nsiz_s*nthreads
+  i0apdq     = i0amdq     + nsiz*nthreads
+  i0cqxx     = i0apdq     + nsiz*nthreads
+  i0bmamdq   = i0cqxx     + nsiz*nthreads
+  i0bmapdq   = i0bmamdq   + nsiz*nthreads
+  i0bpamdq   = i0bmapdq   + nsiz*nthreads
+  i0bpapdq   = i0bpamdq   + nsiz*nthreads
+  i0cmamdq   = i0bpapdq   + nsiz*nthreads
+  i0cmapdq   = i0cmamdq   + nsiz*nthreads
+  i0cpamdq   = i0cmapdq   + nsiz*nthreads
+  i0cpapdq   = i0cpamdq   + nsiz*nthreads
+  i0cmamdq2  = i0cpapdq   + nsiz*nthreads
+  i0cmapdq2  = i0cmamdq2  + nsiz*nthreads
+  i0cpamdq2  = i0cmapdq2  + nsiz*nthreads
+  i0cpapdq2  = i0cpamdq2  + nsiz*nthreads
+  i0bmcqxxp  = i0cpapdq2  + nsiz*nthreads
+  i0bmcqxxm  = i0bmcqxxp  + nsiz*nthreads
+  i0bpcqxxp  = i0bmcqxxm  + nsiz*nthreads
+  i0bpcqxxm  = i0bpcqxxp  + nsiz*nthreads
+  i0cmcqxxp  = i0bpcqxxm  + nsiz*nthreads
+  i0cmcqxxm  = i0cmcqxxp  + nsiz*nthreads
+  i0cpcqxxp  = i0cmcqxxm  + nsiz*nthreads
+  i0cpcqxxm  = i0cpcqxxp  + nsiz*nthreads
+  i0bmcmamdq = i0cpcqxxm  + nsiz*nthreads
+  i0bmcmapdq = i0bmcmamdq + nsiz*nthreads
+  i0bpcmamdq = i0bmcmapdq + nsiz*nthreads
+  i0bpcmapdq = i0bpcmamdq + nsiz*nthreads
+  i0bmcpamdq = i0bpcmapdq + nsiz*nthreads
+  i0bmcpapdq = i0bmcpamdq + nsiz*nthreads
+  i0bpcpamdq = i0bmcpapdq + nsiz*nthreads
+  i0bpcpapdq = i0bpcpamdq + nsiz*nthreads
+  iused      = i0bpcpapdq + nsiz*nthreads - 1
 
-    if (index_capa == 0) then
-    !        # no capa array:
-        do 5 i=1-num_ghost,maxm+num_ghost
-            dtdx1d(i) = dtdx
-            dtdy1d(i) = dtdy
-            dtdz1d(i) = dtdz
-        5 END DO
-    endif
+  if (iused > mwork) then
+     ! This shouldn't happen due to checks in claw3
+     write(6,*) '*** not enough work space in step3'
+     write(6,*) '*** iused = ', iused, '   mwork =',mwork
+     stop
+  endif
 
+  index_capa = method(6)
+  !num_aux = method(7)
+  cfl = 0.d0
+  dtdx = dt/dx
+  dtdy = dt/dy
+  dtdz = dt/dz
+  
+  if (index_capa == 0) then
+     ! no capa array:
+     !$OMP PARALLEL PRIVATE(i,me,me1)
+     me = 0
+     !$ me = omp_get_thread_num()
+     me1 = me + 1
+     do i=1-num_ghost,maxm+num_ghost
+        dtdx1d(i,me1) = dtdx
+        dtdy1d(i,me1) = dtdy
+        dtdz1d(i,me1) = dtdz
+     end do
+     !$OMP END PARALLEL
+  endif
 
-    if( idir == 1)then
-    
-    !     # perform x-sweeps
-    !     ==================
+  if( idir == 1)then
     
-        do 50 k = 0,mz+1
-            do 50 j = 0,my+1
-            
-                forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost)
-            !                 # copy data along a slice into 1d array:
-                q1d(m,i) = qold(m,i,j,k)
-                end forall
-            
-                if (index_capa > 0)  then
-                    do 23 i = 1-num_ghost, mx+num_ghost
-                        dtdx1d(i) = dtdx / aux(index_capa,i,j,k)
-                    23 END DO
-                endif
-            
-            !        # Since dimensional splitting is used, only aux2 is needed.
-            
-                if (num_aux > 0)  then
-                    forall (ma=1:num_aux,i= 1-num_ghost:mx+num_ghost, ka = -1:1)
-                    aux2(ma,i,2+ka) = aux(ma,i,j,k+ka)
-                    end forall
-                endif
-            
-            !           # Store the value of j and k along this slice in the common block
-            !           # comxyt in case it is needed in the Riemann solver (for
-            !           # variable coefficient problems)
-            
-                jcom = j
-                kcom = k
-            
-            !           # compute modifications qadd and fadd along this slice
-            
-                call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, &
-                q1d,dtdx1d,dtdy,dtdz,dummy1,aux2,dummy3,num_aux, &
-                method,mthlim,qadd,fadd,gadd,hadd,cfl1d, &
-                work(i0wave),work(i0s),work(i0amdq), &
-                work(i0apdq),work(i0cqxx), &
-                work(i0bmamdq),work(i0bmapdq), &
-                work(i0bpamdq),work(i0bpapdq), &
-                work(i0cmamdq),work(i0cmapdq), &
-                work(i0cpamdq),work(i0cpapdq), &
-                work(i0cmamdq2),work(i0cmapdq2), &
-                work(i0cpamdq2),work(i0cpapdq2), &
-                work(i0bmcqxxp),work(i0bpcqxxp), &
-                work(i0bmcqxxm),work(i0bpcqxxm), &
-                work(i0cmcqxxp),work(i0cpcqxxp), &
-                work(i0cmcqxxm),work(i0cpcqxxm), &
-                work(i0bmcmamdq),work(i0bmcmapdq), &
-                work(i0bpcmamdq),work(i0bpcmapdq), &
-                work(i0bmcpamdq),work(i0bmcpapdq), &
-                work(i0bpcpamdq),work(i0bpcpapdq), &
+     ! perform x-sweeps
+     ! ==================
+     !$OMP PARALLEL PRIVATE(me,me1,m,i,ma,ka,cfl1d) REDUCTION(MAX:cfl)
+     me = 0
+     !$ me = omp_get_thread_num()
+     me1 = me + 1
+     cfl1d = 0.d0
+
+     ! Guided schedule seems to produce a small performance increase
+     ! relative to static or dynamic for problems with uniform work
+     ! per column.  For problems with very nonuniform work per
+     ! column, either guided or dynamic is necessary to keep all the
+     ! threads busy.
+     
+     !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED)
+     do k = 0,mz+1
+        do j = 0,my+1
+           
+           forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost)
+              ! copy data along a slice into 1d array:
+              q1d(m,i,me1) = qold(m,i,j,k)
+           end forall
+           
+           if (index_capa > 0) then
+              do i = 1-num_ghost, mx+num_ghost
+                 dtdx1d(i,me1) = dtdx / aux(index_capa,i,j,k)
+              end do
+           endif
+            
+           ! Since dimensional splitting is used, only aux2 is needed.
+           if (num_aux > 0)  then
+              forall (ma=1:num_aux,i= 1-num_ghost:mx+num_ghost, ka = -1:1)
+                 aux2(ma,i,2+ka,me1) = aux(ma,i,j,k+ka)
+              end forall
+           endif
+            
+           ! Store the value of j and k along this slice in the common block
+           ! comxyt in case it is needed in the Riemann solver (for
+           ! variable coefficient problems)
+           
+           jcom = j
+           kcom = k
+            
+           ! compute modifications qadd and fadd along this slice
+            
+           call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, &
+                q1d(1,1-num_ghost,me1),dtdx1d(1-num_ghost,me1),dtdy,dtdz,dummy1,&
+                aux2(1,1-num_ghost,1,me1),dummy3,num_aux, &
+                method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1),&
+                gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, &
+                work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), &
+                work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), &
+                work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), &
+                work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), &
+                work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), &
+                work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), &
+                work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), &
+                work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), &
+                work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), &
+                work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), &
+                work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), &
+                work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), &
+                work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), &
+                work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), &
+                work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), &
+                work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), &
                 use_fwave,rpn3,rpt3,rptt3)
             
-                cfl = dmax1(cfl,cfl1d)
-            
-            !           # update qnew by flux differencing.
-            !           # (rather than maintaining arrays f, g and h for the total fluxes,
-            !           # the modifications are used immediately to update qnew
-            !           # in order to save storage.)
-            
-                if(index_capa == 0)then
-                !              # no capa array.  Standard flux differencing:
-                    forall (m = 1:num_eqn, i = 1:mx)
-                    qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) &
-                    - dtdx * (fadd(m,i+1) - fadd(m,i))
-                    end forall
-                else
-                !              # with capa array
-                    forall (m = 1:num_eqn, i = 1:mx)
-                    qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) &
-                    - dtdx * (fadd(m,i+1) - fadd(m,i)) &
-                    / aux(index_capa,i,j,k)
-                    end forall
-                endif
-            
-        50 END DO
-    
-    else if( idir == 2 )then
+           cfl = dmax1(cfl,cfl1d)
+            
+           ! update qnew by flux differencing.
+           ! (rather than maintaining arrays f, g and h for the total fluxes,
+           ! the modifications are used immediately to update qnew
+           ! in order to save storage.)
+           
+           if(index_capa == 0)then
+              ! no capa array.  Standard flux differencing:
+              forall (m = 1:num_eqn, i = 1:mx)
+                 qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) &
+                      - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1))
+              end forall
+           else
+              ! with capa array
+              forall (m = 1:num_eqn, i = 1:mx)
+                 qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) &
+                      - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) &
+                      / aux(index_capa,i,j,k)
+              end forall
+           endif
+           
+        end do
+     end do
+     !$OMP END PARALLEL
     
-    !     # perform y sweeps
-    !     ==================
+  else if( idir == 2 )then
     
-        do 100 k = 0, mz+1
-            do 100 i = 0, mx+1
-            
-                forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost)
-            !                 # copy data along a slice into 1d array:
-                q1d(m,j) = qold(m,i,j,k)
-                end forall
-            
-                if (index_capa > 0)  then
-                    do 71 j = 1-num_ghost, my+num_ghost
-                        dtdy1d(j) = dtdy / aux(index_capa,i,j,k)
-                    71 END DO
-                endif
-            
-            !        # Since dimensional splitting is used, only aux2 is needed.
-            
-                if (num_aux > 0)  then
-                ! There's a decent chance aux2 will all fit into cache,
-                ! so keep accesses to aux as contiguous as possible.
-                    forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost)
-                    aux2(ma, j, 2+ia) = aux(ma, i+ia, j, k)
-                    end forall
-                endif
-            
-            !           # Store the value of i and k along this slice in the common block
-            !           # comxyzt in case it is needed in the Riemann solver (for
-            !           # variable coefficient problems)
-            
-                icom = i
-                kcom = k
-            
-            !           # compute modifications qadd and fadd along this slice
-            
-                call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, &
-                q1d,dtdy1d,dtdz,dtdx,dummy1,aux2,dummy3,num_aux, &
-                method,mthlim,qadd,fadd,gadd,hadd,cfl1d, &
-                work(i0wave),work(i0s),work(i0amdq), &
-                work(i0apdq),work(i0cqxx), &
-                work(i0bmamdq),work(i0bmapdq), &
-                work(i0bpamdq),work(i0bpapdq), &
-                work(i0cmamdq),work(i0cmapdq), &
-                work(i0cpamdq),work(i0cpapdq), &
-                work(i0cmamdq2),work(i0cmapdq2), &
-                work(i0cpamdq2),work(i0cpapdq2), &
-                work(i0bmcqxxp),work(i0bpcqxxp), &
-                work(i0bmcqxxm),work(i0bpcqxxm), &
-                work(i0cmcqxxp),work(i0cpcqxxp), &
-                work(i0cmcqxxm),work(i0cpcqxxm), &
-                work(i0bmcmamdq),work(i0bmcmapdq), &
-                work(i0bpcmamdq),work(i0bpcmapdq), &
-                work(i0bmcpamdq),work(i0bmcpapdq), &
-                work(i0bpcpamdq),work(i0bpcpapdq), &
+     ! perform y sweeps
+     ! ==================
+     !$OMP PARALLEL PRIVATE(me,me1,m,j,ma,ia,cfl1d) REDUCTION(MAX:cfl)
+     me = 0 
+     !$ me = omp_get_thread_num()
+     me1 = me + 1 
+     cfl1d = 0.d0
+     
+     !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED)
+     do k = 0, mz+1
+        do i = 0, mx+1
+           
+           forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost)
+              ! copy data along a slice into 1d array:
+              q1d(m,j,me1) = qold(m,i,j,k)
+           end forall
+            
+           if (index_capa > 0)  then
+              do j = 1-num_ghost, my+num_ghost
+                 dtdy1d(j,me1) = dtdy / aux(index_capa,i,j,k)
+              end do
+           endif
+            
+           ! Since dimensional splitting is used, only aux2 is needed.
+            
+           if (num_aux > 0)  then
+              ! There's a decent chance aux2 will all fit into cache,
+              ! so keep accesses to aux as contiguous as possible.
+              forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost)
+                 aux2(ma, j, 2+ia,me1) = aux(ma, i+ia, j, k)
+              end forall
+           endif
+            
+           ! Store the value of i and k along this slice in the common block
+           ! comxyzt in case it is needed in the Riemann solver (for
+           ! variable coefficient problems)
+            
+           icom = i
+           kcom = k
+            
+           ! compute modifications qadd and fadd along this slice
+            
+           call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, &
+                q1d(1,1-num_ghost,me1),dtdy1d(1-num_ghost,me1),dtdz,dtdx,dummy1,&
+                aux2(1,1-num_ghost,1,me1),dummy3,num_aux, &
+                method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1),&
+                gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, &
+                work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), &
+                work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), &
+                work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), &
+                work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), &
+                work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), &
+                work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), &
+                work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), &
+                work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), &
+                work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), &
+                work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), &
+                work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), &
+                work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), &
+                work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), &
+                work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), &
+                work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), &
+                work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), &
                 use_fwave,rpn3,rpt3,rptt3)
             
-                cfl = dmax1(cfl,cfl1d)
-            
-            !           # update qnew by flux differencing.
-            !           # Note that the roles of the flux updates are changed.
-            !           # fadd - modifies the g-fluxes
-            
-                if( index_capa == 0)then
-                !               # no capa array.  Standard flux differencing:
-                    forall (m = 1:num_eqn, j = 1:my)
-                    qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) &
-                    - dtdy * (fadd(m,j+1) - fadd(m,j))
-                    end forall
-                else
-                !              #with capa array.
-                    forall (m = 1:num_eqn, j = 1:my)
-                    qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) &
-                    - dtdy * (fadd(m,j+1) - fadd(m,j)) &
-                    / aux(index_capa,i,j,k)
-                    end forall
-                endif
-            
-        100 END DO
-    
-    else
+           cfl = dmax1(cfl,cfl1d)
+            
+           ! update qnew by flux differencing.
+           ! Note that the roles of the flux updates are changed.
+           ! fadd - modifies the g-fluxes
+           
+           if( index_capa == 0)then
+              ! no capa array.  Standard flux differencing:
+              forall (m = 1:num_eqn, j = 1:my)
+                 qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) &
+                      - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1))
+              end forall
+           else
+              ! with capa array.
+              forall (m = 1:num_eqn, j = 1:my)
+                 qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) &
+                      - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) &
+                      / aux(index_capa,i,j,k)
+              end forall
+           endif
+            
+        end do
+     end do
+     !$OMP END PARALLEL
     
-    !     # perform z sweeps
-    !     ==================
+  else
     
-    
-        do 150 j = 0, my+1
-            do 150 i = 0, mx+1
-            
-                forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost)
-            !                 # copy data along a slice into 1d array:
-                q1d(m,k) = qold(m,i,j,k)
-                end forall
-            
-                if (index_capa > 0)  then
-                    do 130 k = 1-num_ghost, mz+num_ghost
-                        dtdz1d(k) = dtdz / aux(index_capa,i,j,k)
-                    130 END DO
-                endif
-            
-            !        # Since dimensional splitting is used, only aux2 is needed.
-            
-                if (num_aux > 0)  then
-                ! There's a decent chance aux2 will all fit into cache,
-                ! so keep accesses to aux as contiguous as possible.
-                    forall (ma=1:num_aux,ja= -1:1, k = 1-num_ghost:mz+num_ghost)
-                    aux2(ma, k, 2+ja) = aux(ma, i, j+ja, k)
-                    end forall
-                endif
-            
-            !           # Store the value of i and j along this slice in the common block
-            !           # comxyzt in case it is needed in the Riemann solver (for
-            !           # variable coefficient problems)
-            
-                icom = i
-                jcom = j
-            
-            !           # compute modifications qadd and fadd along this slice
-            
-                call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, &
-                q1d,dtdz1d,dtdx,dtdy,dummy1,aux2,dummy3,num_aux, &
-                method,mthlim,qadd,fadd,gadd,hadd,cfl1d, &
-                work(i0wave),work(i0s),work(i0amdq), &
-                work(i0apdq),work(i0cqxx), &
-                work(i0bmamdq),work(i0bmapdq), &
-                work(i0bpamdq),work(i0bpapdq), &
-                work(i0cmamdq),work(i0cmapdq), &
-                work(i0cpamdq),work(i0cpapdq), &
-                work(i0cmamdq2),work(i0cmapdq2), &
-                work(i0cpamdq2),work(i0cpapdq2), &
-                work(i0bmcqxxp),work(i0bpcqxxp), &
-                work(i0bmcqxxm),work(i0bpcqxxm), &
-                work(i0cmcqxxp),work(i0cpcqxxp), &
-                work(i0cmcqxxm),work(i0cpcqxxm), &
-                work(i0bmcmamdq),work(i0bmcmapdq), &
-                work(i0bpcmamdq),work(i0bpcmapdq), &
-                work(i0bmcpamdq),work(i0bmcpapdq), &
-                work(i0bpcpamdq),work(i0bpcpapdq), &
+     ! perform z sweeps
+     ! ==================
+     !$OMP PARALLEL PRIVATE(me,me1,m,k,ma,ja,cfl1d) REDUCTION(MAX:cfl)
+     me = 0 
+     !$ me = omp_get_thread_num()
+     me1 = me + 1 
+     cfl1d = 0.d0 
+     
+     !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED)     
+     do j = 0, my+1
+        do i = 0, mx+1
+            
+           forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost)
+              ! copy data along a slice into 1d array:
+              q1d(m,k,me1) = qold(m,i,j,k)
+           end forall
+            
+           if (index_capa > 0)  then
+              do k = 1-num_ghost, mz+num_ghost
+                 dtdz1d(k,me1) = dtdz / aux(index_capa,i,j,k)
+              end do
+           endif
+            
+           ! Since dimensional splitting is used, only aux2 is needed.
+            
+           if (num_aux > 0)  then
+              ! There's a decent chance aux2 will all fit into cache,
+              ! so keep accesses to aux as contiguous as possible.
+              forall (ma=1:num_aux,ja= -1:1, k = 1-num_ghost:mz+num_ghost)
+                 aux2(ma, k, 2+ja,me1) = aux(ma, i, j+ja, k)
+              end forall
+           endif
+            
+           ! Store the value of i and j along this slice in the common block
+           ! comxyzt in case it is needed in the Riemann solver (for
+           ! variable coefficient problems)
+            
+           icom = i
+           jcom = j
+            
+           ! compute modifications qadd and fadd along this slice
+            
+           call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, &
+                q1d(1,1-num_ghost,me1),dtdz1d(1-num_ghost,me1),dtdx,dtdy,dummy1, &
+                aux2(1,1-num_ghost,1,me1),dummy3,num_aux, &
+                method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1), &
+                gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, &
+                work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), &
+                work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), &
+                work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), &
+                work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), &
+                work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), &
+                work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), &
+                work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), &
+                work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), &
+                work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), &
+                work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), &
+                work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), &
+                work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), &
+                work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), &
+                work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), &
+                work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), &
+                work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), &
                 use_fwave,rpn3,rpt3,rptt3)
             
-                cfl = dmax1(cfl,cfl1d)
-            
-            !           # update qnew by flux differencing.
-            !           # Note that the roles of the flux updates are changed.
-            !           # fadd - modifies the h-fluxes
-            
-                if(index_capa == 0)then
-                !              #no capa array. Standard flux differencing:
-                    forall (m = 1:num_eqn, k = 1:mz)
-                    qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) &
-                    - dtdz * (fadd(m,k+1) - fadd(m,k))
-                    end forall
-                else
-                !              # with capa array
-                    forall (m = 1:num_eqn, k = 1:mz)
-                    qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) &
-                    - dtdz * (fadd(m,k+1) - fadd(m,k)) &
-                    / aux(index_capa,i,j,k)
-                    end forall
-                endif
-            
-        150 END DO
+           cfl = dmax1(cfl,cfl1d)
+            
+           ! update qnew by flux differencing.
+           ! Note that the roles of the flux updates are changed.
+           ! fadd - modifies the h-fluxes
+           
+           if(index_capa == 0)then
+              ! no capa array. Standard flux differencing:
+              forall (m = 1:num_eqn, k = 1:mz)
+                 qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) &
+                      - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1))
+              end forall
+           else
+              ! with capa array
+              forall (m = 1:num_eqn, k = 1:mz)
+                 qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) &
+                      - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) &
+                      / aux(index_capa,i,j,k)
+              end forall
+           endif
+            
+        end do
+     end do
+     !$OMP END PARALLEL
     
-    endif
+  endif
 
-    return
-    end subroutine step3ds
+  return
+end subroutine step3ds
 
 

From 87f5f16fdd88e3e5a2033788321647c685dc5579 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Fri, 16 Oct 2015 13:48:42 -0700
Subject: [PATCH 10/19] Added F90 and f2py options for OpenMP (-fopenmp, and
 gomp => -lgomp) so that when compiling 'classic3.so' the openmp lines are
 compiled.

---
 src/pyclaw/classic/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pyclaw/classic/setup.py b/src/pyclaw/classic/setup.py
index 03b55838e..a50de681b 100644
--- a/src/pyclaw/classic/setup.py
+++ b/src/pyclaw/classic/setup.py
@@ -13,7 +13,7 @@ def configuration(parent_package='',top_path=None):
                          ['limiter.f90','philim.f90','flux2.f90','step2ds.f90','step2.f90'],f2py_options=['--quiet'])
 
     config.add_extension('classic3',
-                         ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'])
+                         ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp'],libraries=['gomp'])
 
     return config
 

From 00880f66d7fcd391c88ca0c4a7154e1b2d6a05d6 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Fri, 16 Oct 2015 14:50:03 -0700
Subject: [PATCH 11/19] Fixed bug when OMP_NUM_THREADS env variable is not set,
 and uses nthreads=1

---
 src/pyclaw/classic/solver.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index 53c3a023e..2eeb954c0 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -612,10 +612,7 @@ def __init__(self, riemann_solver=None, claw_package=None):
         self.work = None
 
         import os
-        if os.environ['OMP_NUM_THREADS'] == None:
-            self.nthreads = 1
-        else:
-            self.nthreads = int(os.environ['OMP_NUM_THREADS'])
+        self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1))
 
         super(ClawSolver3D,self).__init__(riemann_solver, claw_package)
 

From 3dad707298d8cdc932d4b6da1df12941a0d9c1d4 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Mon, 19 Oct 2015 08:49:04 -0700
Subject: [PATCH 12/19] removed debugging print statement

---
 src/pyclaw/classic/solver.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index 2eeb954c0..e5f07f378 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -639,7 +639,6 @@ def _allocate_workspace(self,solution):
 
         # These work arrays really ought to live inside a fortran module
         # as is done for sharpclaw
-        print "nthreads:",self.nthreads
         self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
         self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')
         self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F')

From 51f716194bca0593b3eee52785b2b673ed3ba535 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Wed, 28 Oct 2015 23:24:13 -0700
Subject: [PATCH 13/19] Updated solver.py to set the default number of threads
 to the number of CPUs when the OMP_NUM_THREADS environment varibale is unset.
  This is consistent with what OpenMP returns for omp_get_num_threads() when
 the env variable is unset.  Updated step3.f90 to use OpenMP, based on what is
 done in clawpack/classic.  This was sucessfully tested with the Sedov.py test
 problem in the euler_3d example folder.

---
 src/pyclaw/classic/solver.py   |   11 +-
 src/pyclaw/classic/step3.f90   | 1475 +++++++++++++++++++-------------
 src/pyclaw/classic/step3ds.f90 |    3 +-
 3 files changed, 896 insertions(+), 593 deletions(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index e5f07f378..888cb181a 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -612,7 +612,9 @@ def __init__(self, riemann_solver=None, claw_package=None):
         self.work = None
 
         import os
-        self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1))
+        import multiprocessing
+        self.nthreads = int(os.environ.get('OMP_NUM_THREADS',\
+                                           multiprocessing.cpu_count()))
 
         super(ClawSolver3D,self).__init__(riemann_solver, claw_package)
 
@@ -699,9 +701,10 @@ def step_hyperbolic(self,solution):
 
             else:
 
-                q, cfl = self.fmod.step3(maxm,self.num_ghost,mx,my,mz, \
-                      qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\
-                      self.aux1,self.aux2,self.aux3,self.work,self.fwave,rpn3,rpt3,rptt3)
+                q, cfl = self.fmod.step3(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,\
+                    self.fwave,rpn3,rpt3,rptt3)
 
             self.cfl.update_global_max(cfl)
             state.set_q_from_qbc(self.num_ghost,self.qbc)
diff --git a/src/pyclaw/classic/step3.f90 b/src/pyclaw/classic/step3.f90
index 211f626b8..ec8b91556 100644
--- a/src/pyclaw/classic/step3.f90
+++ b/src/pyclaw/classic/step3.f90
@@ -1,45 +1,50 @@
 
-!     ==================================================================
-    subroutine step3(maxm,num_eqn,num_waves,num_ghost,mx,my, &
-    mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, &
-    qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d, &
-    aux1,aux2,aux3,num_aux,work,mwork,use_fwave,rpn3,rpt3,rptt3)
-!     ==================================================================
-
-!     # Take one time step, updating q.
-!     # On entry, qold and qnew should be identical and give the
-!     #    initial data for this step
-!     # On exit, qnew returns values at the end of the time step.
-!     #    qold is unchanged.
-
-!     # qadd is used to return increments to q from flux3.
-!     # fadd, gadd and hadd are used to return flux increments from flux3.
-!     # See the flux3 documentation for more information.
-
-
-    implicit real*8(a-h,o-z)
-    external rpn3,rpt3,rptt3
-    dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, &
-    1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-    dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, &
-    1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-    dimension  q1d(num_eqn,1-num_ghost:maxm+num_ghost)
-    dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost)
-    dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost)
-    dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost)
-    dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost)
-    dimension aux(num_aux, 1-num_ghost:mx+num_ghost, &
-    1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
-    dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3)
-    dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3)
-    dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3)
-    dimension dtdx1d(1-num_ghost:maxm+num_ghost)
-    dimension dtdy1d(1-num_ghost:maxm+num_ghost)
-    dimension dtdz1d(1-num_ghost:maxm+num_ghost)
-    dimension method(7),mthlim(num_waves)
-    dimension work(mwork)
-    logical ::          use_fwave
-
+! ==================================================================
+subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
+     mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, &
+     qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d,aux1,aux2,aux3,&
+     num_aux,work,mwork,use_fwave,rpn3,rpt3,rptt3)
+! ==================================================================
+
+  ! Take one time step, updating q.
+  ! On entry, qold and qnew should be identical and give the
+  !    initial data for this step
+  ! On exit, qnew returns values at the end of the time step.
+  !    qold is unchanged.
+  !
+  ! qadd is used to return increments to q from flux3.
+  ! fadd, gadd and hadd are used to return flux increments from flux3.
+  ! See the flux3 documentation for more information.
+
+  !$ use omp_lib
+
+  implicit real*8(a-h,o-z)
+  external rpn3,rpt3,rptt3
+  dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, &
+       1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
+  dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, &
+       1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
+  dimension  q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension aux(num_aux, 1-num_ghost:mx+num_ghost, &
+       1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost)
+  dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx)
+  dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx)
+  dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx)
+  dimension dtdx1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension dtdy1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension dtdz1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx)
+  dimension method(7),mthlim(num_waves)
+  dimension work(mwork)
+  logical :: use_fwave
+
+  ! These block sizes must be at least 2 to avoid multiple threads
+  ! trying to update the same grid cell
+  integer, parameter :: blksiz_i = 2, blksiz_j = 2, blksiz_k = 2
+    
 !f2py intent(out) cfl
 !f2py intent(in,out) qnew
 !f2py optional q1d, qadd, fadd, gadd, hadd, dtdx1d, dtdy1d, dtdz1d
@@ -50,552 +55,846 @@ subroutine step3(maxm,num_eqn,num_waves,num_ghost,mx,my, &
 !f2py x=rpt3(x)
 !f2py x=rptt3(x)
 
-    common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom
-
-
-
-!     # partition work array into pieces needed for local storage in
-!     # flux3 routine.  Find starting index of each piece:
-
-    i0wave     = 1
-    i0s        = i0wave     + (maxm+2*num_ghost)*num_eqn*num_waves
-    i0amdq     = i0s        + (maxm+2*num_ghost)*num_waves
-    i0apdq     = i0amdq     + (maxm+2*num_ghost)*num_eqn
-    i0cqxx     = i0apdq     + (maxm+2*num_ghost)*num_eqn
-    i0bmamdq   = i0cqxx     + (maxm+2*num_ghost)*num_eqn
-    i0bmapdq   = i0bmamdq   + (maxm+2*num_ghost)*num_eqn
-    i0bpamdq   = i0bmapdq   + (maxm+2*num_ghost)*num_eqn
-    i0bpapdq   = i0bpamdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmamdq   = i0bpapdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmapdq   = i0cmamdq   + (maxm+2*num_ghost)*num_eqn
-    i0cpamdq   = i0cmapdq   + (maxm+2*num_ghost)*num_eqn
-    i0cpapdq   = i0cpamdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmamdq2  = i0cpapdq   + (maxm+2*num_ghost)*num_eqn
-    i0cmapdq2  = i0cmamdq2  + (maxm+2*num_ghost)*num_eqn
-    i0cpamdq2  = i0cmapdq2  + (maxm+2*num_ghost)*num_eqn
-    i0cpapdq2  = i0cpamdq2  + (maxm+2*num_ghost)*num_eqn
-    i0bmcqxxp  = i0cpapdq2  + (maxm+2*num_ghost)*num_eqn
-    i0bmcqxxm  = i0bmcqxxp  + (maxm+2*num_ghost)*num_eqn
-    i0bpcqxxp  = i0bmcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0bpcqxxm  = i0bpcqxxp   + (maxm+2*num_ghost)*num_eqn
-    i0cmcqxxp  = i0bpcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0cmcqxxm  = i0cmcqxxp   + (maxm+2*num_ghost)*num_eqn
-    i0cpcqxxp  = i0cmcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0cpcqxxm  = i0cpcqxxp   + (maxm+2*num_ghost)*num_eqn
-    i0bmcmamdq = i0cpcqxxm   + (maxm+2*num_ghost)*num_eqn
-    i0bmcmapdq = i0bmcmamdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcmamdq = i0bmcmapdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcmapdq = i0bpcmamdq + (maxm+2*num_ghost)*num_eqn
-    i0bmcpamdq = i0bpcmapdq + (maxm+2*num_ghost)*num_eqn
-    i0bmcpapdq = i0bmcpamdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcpamdq = i0bmcpapdq + (maxm+2*num_ghost)*num_eqn
-    i0bpcpapdq = i0bpcpamdq + (maxm+2*num_ghost)*num_eqn
-    iused      = i0bpcpapdq + (maxm+2*num_ghost)*num_eqn - 1
-
-    if (iused > mwork) then
-    !        # This shouldn't happen due to checks in claw3
-        write(6,*) '*** not enough work space in step3'
-        write(6,*) '*** iused = ', iused, '   mwork =',mwork
-        stop
-    endif
-
-
-
-    index_capa = method(6)
-!      num_aux = method(7)
-    cfl = 0.d0
-    dtdx = dt/dx
-    dtdy = dt/dy
-    dtdz = dt/dz
-
-    if (index_capa == 0) then
-    !        # no capa array:
-        do 5 i=1-num_ghost,maxm+num_ghost
-            dtdx1d(i) = dtdx
-            dtdy1d(i) = dtdy
-            dtdz1d(i) = dtdz
-        5 END DO
-    endif
-
-
-!     # perform x-sweeps
-!     ==================
-
-
-    do 50 k = 0,mz+1
-        do 50 j = 0,my+1
-
-            forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost)
-        !                 # copy data along a slice into 1d array:
-            q1d(m,i) = qold(m,i,j,k)
-            end forall
-        
-            if (index_capa > 0)  then
-                do 23 i = 1-num_ghost, mx+num_ghost
-                    dtdx1d(i) = dtdx / aux(index_capa,i,j,k)
-                23 END DO
-            endif
-        
-            if (num_aux > 0)  then
-            ! This odd construct may help improve cache locality.
-            ! (The F95 standard says each statement in the FORALL
-            ! must be executed for all indices before the next
-            ! statement is started, so this is different semantically
-            ! from putting all three indices in the same FORALL.)
-                forall (ka = -1:1)
-                forall (ma = 1:num_aux, i = 1-num_ghost:mx+num_ghost)
-                aux1(ma, i, 2+ka) = aux(ma, i, j-1, k+ka)
-                aux2(ma, i, 2+ka) = aux(ma, i,   j, k+ka)
-                aux3(ma, i, 2+ka) = aux(ma, i, j+1, k+ka)
-                end forall
-                end forall
-            endif
-        
-        !           # Store the value of j and k along this slice in the common block
-        !           # comxyt in case it is needed in the Riemann solver (for
-        !           # variable coefficient problems)
-        
-            jcom = j
-            kcom = k
-        
-        !           # compute modifications fadd, gadd and hadd to fluxes along
-        !           # this slice:
-        
-            call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, &
-            q1d,dtdx1d,dtdy,dtdz,aux1,aux2,aux3,num_aux, &
-            method,mthlim,qadd,fadd,gadd,hadd,cfl1d, &
-            work(i0wave),work(i0s),work(i0amdq), &
-            work(i0apdq),work(i0cqxx), &
-            work(i0bmamdq),work(i0bmapdq), &
-            work(i0bpamdq),work(i0bpapdq), &
-            work(i0cmamdq),work(i0cmapdq), &
-            work(i0cpamdq),work(i0cpapdq), &
-            work(i0cmamdq2),work(i0cmapdq2), &
-            work(i0cpamdq2),work(i0cpapdq2), &
-            work(i0bmcqxxp),work(i0bpcqxxp), &
-            work(i0bmcqxxm),work(i0bpcqxxm), &
-            work(i0cmcqxxp),work(i0cpcqxxp), &
-            work(i0cmcqxxm),work(i0cpcqxxm), &
-            work(i0bmcmamdq),work(i0bmcmapdq), &
-            work(i0bpcmamdq),work(i0bpcmapdq), &
-            work(i0bmcpamdq),work(i0bmcpapdq), &
-            work(i0bpcpamdq),work(i0bpcpapdq), &
-            use_fwave,rpn3,rpt3,rptt3)
-        
-            cfl = dmax1(cfl,cfl1d)
-        
-        !           # update qnew by flux differencing.
-        !           # (rather than maintaining arrays f, g and h for the total fluxes,
-        !           # the modifications are used immediately to update qnew
-        !           # in order to save storage.)
-        
-            if(index_capa == 0)then
-            
-                forall (m = 1:num_eqn, i = 1:mx)
-                qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) &
-                - dtdx * (fadd(m,i+1) - fadd(m,i)) &
-                - dtdy * (gadd(m,2,0,i) - gadd(m,1,0,i)) &
-                - dtdz * (hadd(m,2,0,i) - hadd(m,1,0,i))
-                qnew(m,i,j-1,k)   = qnew(m,i,j-1,k) &
-                - dtdy * gadd(m,1,0,i) &
-                - dtdz * ( hadd(m,2,-1,i) &
-                -   hadd(m,1,-1,i) )
-                qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) &
-                - dtdy * gadd(m,1,-1,i) &
-                - dtdz * hadd(m,1,-1,i)
-                qnew(m,i,j,k-1)   = qnew(m,i,j,k-1) &
-                - dtdy * ( gadd(m,2,-1,i) &
-                -   gadd(m,1,-1,i) ) &
-                - dtdz * hadd(m,1,0,i)
-                qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) &
-                + dtdy * gadd(m,2,-1,i) &
-                - dtdz * hadd(m,1,1,i)
-                qnew(m,i,j+1,k)   = qnew(m,i,j+1,k) &
-                + dtdy * gadd(m,2,0,i) &
-                - dtdz * ( hadd(m,2,1,i) &
-                -   hadd(m,1,1,i) )
-                qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) &
-                + dtdy * gadd(m,2,1,i) &
-                + dtdz * hadd(m,2,1,i)
-                qnew(m,i,j,k+1)   = qnew(m,i,j,k+1) &
-                - dtdy * ( gadd(m,2,1,i) &
-                -   gadd(m,1,1,i) ) &
-                + dtdz * hadd(m,2,0,i)
-                qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) &
-                - dtdy * gadd(m,1,1,i) &
-                + dtdz * hadd(m,2,-1,i)
-                end forall
-            
-            else
-            !              # with capa array
-                forall (m = 1:num_eqn, i = 1:mx)
-                qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) &
-                - (dtdx * (fadd(m,i+1) - fadd(m,i)) &
-                +  dtdy * (gadd(m,2,0,i) - gadd(m,1,0,i)) &
-                +  dtdz * (hadd(m,2,0,i) - hadd(m,1,0,i))) &
-                / aux(index_capa,i,j,k)
-
-                qnew(m,i,j-1,k)   = qnew(m,i,j-1,k) &
-                - (dtdy * gadd(m,1,0,i) &
-                +  dtdz * ( hadd(m,2,-1,i) &
-                -   hadd(m,1,-1,i) )) &
-                / aux(index_capa,i,j-1,k)
-                qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) &
-                - (dtdy * gadd(m,1,-1,i) &
-                +  dtdz * hadd(m,1,-1,i)) &
-                / aux(index_capa,i,j-1,k-1)
-                qnew(m,i,j,k-1)   = qnew(m,i,j,k-1) &
-                - (dtdy * ( gadd(m,2,-1,i) &
-                -   gadd(m,1,-1,i) ) &
-                +  dtdz * hadd(m,1,0,i)) &
-                / aux(index_capa,i,j,k-1)
-                qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) &
-                + (dtdy * gadd(m,2,-1,i) &
-                -  dtdz * hadd(m,1,1,i)) &
-                / aux(index_capa,i,j+1,k-1)
-                qnew(m,i,j+1,k)   = qnew(m,i,j+1,k) &
-                + (dtdy * gadd(m,2,0,i) &
-                - dtdz * ( hadd(m,2,1,i) &
-                -   hadd(m,1,1,i) )) &
-                / aux(index_capa,i,j+1,k)
-                qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) &
-                + (dtdy * gadd(m,2,1,i) &
-                +  dtdz * hadd(m,2,1,i)) &
-                / aux(index_capa,i,j+1,k+1)
-                qnew(m,i,j,k+1)   = qnew(m,i,j,k+1) &
-                - (dtdy * ( gadd(m,2,1,i) &
-                -   gadd(m,1,1,i) ) &
-                -  dtdz * hadd(m,2,0,i)) &
-                / aux(index_capa,i,j,k+1)
-                qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) &
-                - (dtdy * gadd(m,1,1,i) &
-                -  dtdz * hadd(m,2,-1,i)) &
-                / aux(index_capa,i,j-1,k+1)
-                end forall
-            endif
-        
-    50 END DO
-
-
-
-!     # perform y sweeps
-!     ==================
-
-
-    do 100 k = 0, mz+1
-        do 100 i = 0, mx+1
-        
-            forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost)
-        !                 # copy data along a slice into 1d array:
-            q1d(m,j) = qold(m,i,j,k)
-            end forall
-        
-            if (index_capa > 0)  then
-                do 71 j = 1-num_ghost, my+num_ghost
-                    dtdy1d(j) = dtdy / aux(index_capa,i,j,k)
-                71 END DO
-            endif
-        
-            if (num_aux > 0)  then
-            ! aux1, aux2, aux3 probably fit in cache, so optimize
-            ! access to aux
-                forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost)
-                aux1(ma, j, 2+ia) = aux(ma, i+ia, j, k-1)
-                aux2(ma, j, 2+ia) = aux(ma, i+ia, j,   k)
-                aux3(ma, j, 2+ia) = aux(ma, i+ia, j, k+1)
-                end forall
-            endif
-        
-        !           # Store the value of i and k along this slice in the common block
-        !           # comxyzt in case it is needed in the Riemann solver (for
-        !           # variable coefficient problems)
-        
-            icom = i
-            kcom = k
-        
-        !           # compute modifications fadd, gadd and hadd to fluxes along this
-        !           # slice:
-        
-            call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, &
-            q1d,dtdy1d,dtdz,dtdx,aux1,aux2,aux3,num_aux, &
-            method,mthlim,qadd,fadd,gadd,hadd,cfl1d, &
-            work(i0wave),work(i0s),work(i0amdq), &
-            work(i0apdq),work(i0cqxx), &
-            work(i0bmamdq),work(i0bmapdq), &
-            work(i0bpamdq),work(i0bpapdq), &
-            work(i0cmamdq),work(i0cmapdq), &
-            work(i0cpamdq),work(i0cpapdq), &
-            work(i0cmamdq2),work(i0cmapdq2), &
-            work(i0cpamdq2),work(i0cpapdq2), &
-            work(i0bmcqxxp),work(i0bpcqxxp), &
-            work(i0bmcqxxm),work(i0bpcqxxm), &
-            work(i0cmcqxxp),work(i0cpcqxxp), &
-            work(i0cmcqxxm),work(i0cpcqxxm), &
-            work(i0bmcmamdq),work(i0bmcmapdq), &
-            work(i0bpcmamdq),work(i0bpcmapdq), &
-            work(i0bmcpamdq),work(i0bmcpapdq), &
-            work(i0bpcpamdq),work(i0bpcpapdq), &
-            use_fwave,rpn3,rpt3,rptt3)
-        
-            cfl = dmax1(cfl,cfl1d)
-        
-        !           # update qnew by flux differencing.
-        !           # Note that the roles of the flux updates are changed.
-        !           # fadd - modifies the g-fluxes
-        !           # gadd - modifies the h-fluxes
-        !           # hadd - modifies the f-fluxes
-        
-            if( index_capa == 0)then
-            !               # no capa array.  Standard flux differencing:
-                forall (m = 1:num_eqn, j = 1:my)
-                qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) &
-                - dtdy * (fadd(m,j+1) - fadd(m,j)) &
-                - dtdz * (gadd(m,2,0,j) - gadd(m,1,0,j)) &
-                - dtdx * (hadd(m,2,0,j) - hadd(m,1,0,j))
-                qnew(m,i,j,k+1)   = qnew(m,i,j,k+1) &
-                + dtdz * gadd(m,2,0,j) &
-                - dtdx * ( hadd(m,2,1,j) &
-                -   hadd(m,1,1,j) )
-                qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) &
-                + dtdz * gadd(m,2,1,j) &
-                +  dtdx * hadd(m,2,1,j)
-                qnew(m,i+1,j,k)   = qnew(m,i+1,j,k) &
-                - dtdz * ( gadd(m,2,1,j) &
-                -   gadd(m,1,1,j) ) &
-                + dtdx * hadd(m,2,0,j)
-                qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) &
-                - dtdz * gadd(m,1,1,j) &
-                + dtdx * hadd(m,2,-1,j)
-                qnew(m,i,j,k-1)   = qnew(m,i,j,k-1) &
-                - dtdz * gadd(m,1,0,j) &
-                - dtdx * ( hadd(m,2,-1,j) &
-                -   hadd(m,1,-1,j) )
-                qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) &
-                - dtdz * gadd(m,1,-1,j) &
-                - dtdx * hadd(m,1,-1,j)
-                qnew(m,i-1,j,k)   = qnew(m,i-1,j,k) &
-                - dtdz * ( gadd(m,2,-1,j) &
-                -   gadd(m,1,-1,j) ) &
-                - dtdx * hadd(m,1,0,j)
-                qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) &
-                + dtdz * gadd(m,2,-1,j) &
-                -  dtdx*hadd(m,1,1,j)
-                end forall
-            else
-            
-            !              #with capa array.
-            
-                forall (m = 1:num_eqn, j = 1:my)
-                qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) &
-                - (dtdy * (fadd(m,j+1) - fadd(m,j)) &
-                +  dtdz * (gadd(m,2,0,j) - gadd(m,1,0,j)) &
-                +  dtdx * (hadd(m,2,0,j) - hadd(m,1,0,j))) &
-                / aux(index_capa,i,j,k)
-                qnew(m,i,j,k+1)   = qnew(m,i,j,k+1) &
-                + (dtdz * gadd(m,2,0,j) &
-                -  dtdx * ( hadd(m,2,1,j) &
-                -   hadd(m,1,1,j) )) &
-                / aux(index_capa,i,j,k+1)
-                qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) &
-                + (dtdz * gadd(m,2,1,j) &
-                +  dtdx * hadd(m,2,1,j)) &
-                / aux(index_capa,i+1,j,k+1)
-                qnew(m,i+1,j,k)   = qnew(m,i+1,j,k) &
-                - (dtdz * ( gadd(m,2,1,j) &
-                -   gadd(m,1,1,j) ) &
-                -  dtdx * hadd(m,2,0,j) ) &
-                / aux(index_capa,i+1,j,k)
-                qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) &
-                - (dtdz * gadd(m,1,1,j) &
-                -  dtdx * hadd(m,2,-1,j)) &
-                / aux(index_capa,i+1,j,k-1)
-                qnew(m,i,j,k-1)   = qnew(m,i,j,k-1) &
-                - (dtdz * gadd(m,1,0,j) &
-                +  dtdx * ( hadd(m,2,-1,j) &
-                -   hadd(m,1,-1,j) )) &
-                / aux(index_capa,i,j,k-1)
-                qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) &
-                - (dtdz * gadd(m,1,-1,j) &
-                +  dtdx * hadd(m,1,-1,j)) &
-                / aux(index_capa,i-1,j,k-1)
-                qnew(m,i-1,j,k)   = qnew(m,i-1,j,k) &
-                - (dtdz * ( gadd(m,2,-1,j) &
-                -   gadd(m,1,-1,j) ) &
-                +  dtdx * hadd(m,1,0,j)) &
-                / aux(index_capa,i-1,j,k)
-                qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) &
-                + (dtdz * gadd(m,2,-1,j) &
-                -  dtdx*hadd(m,1,1,j)) &
-                / aux(index_capa,i-1,j,k+1)
-                end forall
-            endif
-        
-    100 END DO
-
-
-
-!     # perform z sweeps
-!     ==================
-
-
-    do 150 j = 0, my+1
-        do 150 i = 0, mx+1
-        
-            forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost)
-        !                 # copy data along a slice into 1d array:
-            q1d(m,k) = qold(m,i,j,k)
-            end forall
-        
-            if (index_capa > 0)  then
-                do 130 k = 1-num_ghost, mz+num_ghost
-                    dtdz1d(k) = dtdz / aux(index_capa,i,j,k)
-                130 END DO
-            endif
-        
-            if (num_aux > 0)  then
-            ! See the comment on the X sweeps.  This is semantically
-            ! slightly different than putting all the indices in the
-            ! same forall, and hopefully better for optimizing access
-            ! to aux.
-                forall (ja = -1:1, k = 1-num_ghost:mz+num_ghost)
-                forall (ma = 1:num_aux)
-                aux1(ma, k, 2+ja) = aux(ma, i-1, j+ja, k)
-                aux2(ma, k, 2+ja) = aux(ma,   i, j+ja, k)
-                aux3(ma, k, 2+ja) = aux(ma, i+1, j+ja, k)
-                end forall
-                end forall
-            endif
-        
-        !           # Store the value of i and j along this slice in the common block
-        !           # comxyzt in case it is needed in the Riemann solver (for
-        !           # variable coefficient problems)
-        
-            icom = i
-            jcom = j
-        
-        !           # compute modifications fadd, gadd and hadd to fluxes along this
-        !           # slice:
-        
-            call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, &
-            q1d,dtdz1d,dtdx,dtdy,aux1,aux2,aux3,num_aux, &
-            method,mthlim,qadd,fadd,gadd,hadd,cfl1d, &
-            work(i0wave),work(i0s),work(i0amdq), &
-            work(i0apdq),work(i0cqxx), &
-            work(i0bmamdq),work(i0bmapdq), &
-            work(i0bpamdq),work(i0bpapdq), &
-            work(i0cmamdq),work(i0cmapdq), &
-            work(i0cpamdq),work(i0cpapdq), &
-            work(i0cmamdq2),work(i0cmapdq2), &
-            work(i0cpamdq2),work(i0cpapdq2), &
-            work(i0bmcqxxp),work(i0bpcqxxp), &
-            work(i0bmcqxxm),work(i0bpcqxxm), &
-            work(i0cmcqxxp),work(i0cpcqxxp), &
-            work(i0cmcqxxm),work(i0cpcqxxm), &
-            work(i0bmcmamdq),work(i0bmcmapdq), &
-            work(i0bpcmamdq),work(i0bpcmapdq), &
-            work(i0bmcpamdq),work(i0bmcpapdq), &
-            work(i0bpcpamdq),work(i0bpcpapdq), &
-            use_fwave,rpn3,rpt3,rptt3)
-        
-            cfl = dmax1(cfl,cfl1d)
-        
-        !           # update qnew by flux differencing.
-        !           # Note that the roles of the flux updates are changed.
-        !           # fadd - modifies the h-fluxes
-        !           # gadd - modifies the f-fluxes
-        !           # hadd - modifies the g-fluxes
-        
-            if(index_capa == 0)then
-            
-            !              #no capa array. Standard flux differencing:
-            
-                forall (m = 1:num_eqn, k = 1:mz)
-                qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) &
-                - dtdz * (fadd(m,k+1) - fadd(m,k)) &
-                - dtdx * (gadd(m,2,0,k) - gadd(m,1,0,k)) &
-                - dtdy * (hadd(m,2,0,k) - hadd(m,1,0,k))
-                qnew(m,i,j+1,k)   = qnew(m,i,j+1,k) &
-                - dtdx * ( gadd(m,2,1,k) &
-                -   gadd(m,1,1,k) ) &
-                + dtdy * hadd(m,2,0,k)
-                qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) &
-                + dtdx * gadd(m,2,1,k) &
-                + dtdy * hadd(m,2,1,k)
-                qnew(m,i+1,j,k)   = qnew(m,i+1,j,k) &
-                + dtdx * gadd(m,2,0,k) &
-                - dtdy * ( hadd(m,2,1,k) &
-                -   hadd(m,1,1,k) )
-                qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) &
-                + dtdx * gadd(m,2,-1,k) &
-                - dtdy * hadd(m,1,1,k)
-                qnew(m,i,j-1,k)   = qnew(m,i,j-1,k) &
-                - dtdx * ( gadd(m,2,-1,k) &
-                -   gadd(m,1,-1,k) ) &
-                - dtdy * hadd(m,1,0,k)
-                qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) &
-                - dtdx * gadd(m,1,-1,k) &
-                - dtdy * hadd(m,1,-1,k)
-                qnew(m,i-1,j,k)   = qnew(m,i-1,j,k) &
-                - dtdx * gadd(m,1,0,k) &
-                - dtdy * ( hadd(m,2,-1,k) &
-                -   hadd(m,1,-1,k) )
-                qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) &
-                - dtdx * gadd(m,1,1,k) &
-                + dtdy * hadd(m,2,-1,k)
-                end forall
-            else
-            
-            !              # with capa array
-            
-                forall (m = 1:num_eqn, k = 1:mz)
-                qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) &
-                - (dtdz * (fadd(m,k+1) - fadd(m,k)) &
-                +  dtdx * (gadd(m,2,0,k) - gadd(m,1,0,k)) &
-                +  dtdy * (hadd(m,2,0,k) - hadd(m,1,0,k))) &
-                / aux(index_capa,i,j,k)
-                qnew(m,i,j+1,k) = qnew(m,i,j+1,k) &
-                - (dtdx * ( gadd(m,2,1,k) &
-                -   gadd(m,1,1,k) ) &
-                -  dtdy * hadd(m,2,0,k)) &
-                / aux(index_capa,i,j+1,k)
-                qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) &
-                + (dtdx * gadd(m,2,1,k) &
-                +  dtdy * hadd(m,2,1,k)) &
-                / aux(index_capa,i+1,j+1,k)
-                qnew(m,i+1,j,k)   = qnew(m,i+1,j,k) &
-                + (dtdx * gadd(m,2,0,k) &
-                - dtdy * ( hadd(m,2,1,k) &
-                -   hadd(m,1,1,k) )) &
-                / aux(index_capa,i+1,j,k)
-                qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) &
-                + (dtdx * gadd(m,2,-1,k) &
-                -  dtdy * hadd(m,1,1,k)) &
-                / aux(index_capa,i+1,j-1,k)
-                qnew(m,i,j-1,k)   = qnew(m,i,j-1,k) &
-                - (dtdx * ( gadd(m,2,-1,k) &
-                -   gadd(m,1,-1,k) ) &
-                +  dtdy * hadd(m,1,0,k)) &
-                / aux(index_capa,i,j-1,k)
-                qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) &
-                - (dtdx * gadd(m,1,-1,k) &
-                +  dtdy * hadd(m,1,-1,k)) &
-                / aux(index_capa,i-1,j-1,k)
-                qnew(m,i-1,j,k)   = qnew(m,i-1,j,k) &
-                - (dtdx * gadd(m,1,0,k) &
-                + dtdy * ( hadd(m,2,-1,k) &
-                -   hadd(m,1,-1,k) )) &
-                / aux(index_capa,i-1,j,k)
-                qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) &
-                - (dtdx * gadd(m,1,1,k) &
-                -  dtdy * hadd(m,2,-1,k)) &
-                / aux(index_capa,i-1,j+1,k)
-                end forall
-            endif
-        
-    150 END DO
-
-
-    return
-    end subroutine step3
+  common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom
+
+  ! partition work array into pieces needed for local storage in
+  ! flux3 routine.  Find starting index of each piece:
+
+  nthreads=1    ! Serial
+  !$omp parallel
+  !$omp single
+  !$ nthreads = omp_get_num_threads()
+  !$omp end single
+  !$omp end parallel
+
+  nsiz = (maxm+2*num_ghost)*num_eqn
+  nsiz_w = nsiz * num_waves
+  nsiz_s = (maxm+2*num_ghost)*num_waves
+
+  i0wave     = 1 
+  i0s        = i0wave     + nsiz_w*nthreads
+  i0amdq     = i0s        + nsiz_s*nthreads
+  i0apdq     = i0amdq     + nsiz*nthreads
+  i0cqxx     = i0apdq     + nsiz*nthreads
+  i0bmamdq   = i0cqxx     + nsiz*nthreads
+  i0bmapdq   = i0bmamdq   + nsiz*nthreads
+  i0bpamdq   = i0bmapdq   + nsiz*nthreads
+  i0bpapdq   = i0bpamdq   + nsiz*nthreads
+  i0cmamdq   = i0bpapdq   + nsiz*nthreads
+  i0cmapdq   = i0cmamdq   + nsiz*nthreads
+  i0cpamdq   = i0cmapdq   + nsiz*nthreads
+  i0cpapdq   = i0cpamdq   + nsiz*nthreads
+  i0cmamdq2  = i0cpapdq   + nsiz*nthreads
+  i0cmapdq2  = i0cmamdq2  + nsiz*nthreads
+  i0cpamdq2  = i0cmapdq2  + nsiz*nthreads
+  i0cpapdq2  = i0cpamdq2  + nsiz*nthreads
+  i0bmcqxxp  = i0cpapdq2  + nsiz*nthreads
+  i0bmcqxxm  = i0bmcqxxp  + nsiz*nthreads
+  i0bpcqxxp  = i0bmcqxxm  + nsiz*nthreads
+  i0bpcqxxm  = i0bpcqxxp  + nsiz*nthreads
+  i0cmcqxxp  = i0bpcqxxm  + nsiz*nthreads
+  i0cmcqxxm  = i0cmcqxxp  + nsiz*nthreads
+  i0cpcqxxp  = i0cmcqxxm  + nsiz*nthreads
+  i0cpcqxxm  = i0cpcqxxp  + nsiz*nthreads
+  i0bmcmamdq = i0cpcqxxm  + nsiz*nthreads
+  i0bmcmapdq = i0bmcmamdq + nsiz*nthreads
+  i0bpcmamdq = i0bmcmapdq + nsiz*nthreads
+  i0bpcmapdq = i0bpcmamdq + nsiz*nthreads
+  i0bmcpamdq = i0bpcmapdq + nsiz*nthreads
+  i0bmcpapdq = i0bmcpamdq + nsiz*nthreads
+  i0bpcpamdq = i0bmcpapdq + nsiz*nthreads
+  i0bpcpapdq = i0bpcpamdq + nsiz*nthreads
+  iused      = i0bpcpapdq + nsiz*nthreads - 1
+  
+  if (iused > mwork) then
+     ! This shouldn't happen due to checks in claw3
+     write(6,*) '*** not enough work space in step3()'
+     write(6,*) '*** iused = ', iused, '   mwork =',mwork
+     write(6,*) '*** nthreads = ',nthreads
+     stop
+  endif
+
+  index_capa = method(6)
+  !num_aux = method(7)
+  cfl = 0.d0
+  dtdx = dt/dx
+  dtdy = dt/dy
+  dtdz = dt/dz
+
+  ! Calculate the number of blocks to use in each grid direction
+  ! (integer division, rounding up)
+  nblk_i = (mx + blksiz_i + 1)/blksiz_i
+  nblk_j = (my + blksiz_j + 1)/blksiz_j
+  nblk_k = (mz + blksiz_k + 1)/blksiz_k
+
+  !$omp parallel private(me,me1,m,i,j,k,ma,ia,ja,ka,cfl1d,ioffset,joffset,koffset, &
+  !$omp                  iblk,jblk,kblk) reduction(max:cfl)
+
+  me = 0
+  !$ me = omp_get_thread_num()
+  me1 = me + 1
+
+  if (index_capa == 0) then
+     ! no capa array:
+     do i=1-num_ghost,maxm+num_ghost
+        dtdx1d(i,me1) = dtdx
+        dtdy1d(i,me1) = dtdy
+        dtdz1d(i,me1) = dtdz
+     end do
+  endif
+
+  ! perform x-sweeps
+  ! ==================
+
+  ! This block-based approach keeps race conditions from happening
+  ! with OpenMP parallelization.  Because the results of the flux3
+  ! call affect the column flux3 was called on and all adjacent
+  ! columns, an easy way to keep threads from interfering with each
+  ! other is to make sure that every thread takes a column that is
+  ! at least three cells away from all the other threads in each
+  ! direction.  The code here accomplishes this by breaking the
+  ! columns into blocks; for a block size of 2 in all directions,
+  ! the breakdown looks like:
+  !
+  ! xx..xx..
+  ! xx..xx..
+  ! **++**++
+  ! **++**++
+  ! xx..xx..
+  ! xx..xx..
+  ! **++**++
+  ! **++**++
+  !
+  ! All the blocks of columns labeled with the same symbol (e.g. x)
+  ! are done in parallel in one pass; within each block, the work is
+  ! all done by one thread, avoiding race conditions.  Then the code
+  ! flushes memory and synchronizes, and starts the next set of
+  ! blocks (e.g. *).
+  !
+  ! Breaking up into blocks in this fashion helps provide a large
+  ! number of small, independent work items, which helps with
+  ! parallel performance.
+  
+  do koffset=0,1
+     do joffset=0,1
+
+        ! Guided or dynamic scheduling is necessary to keep all the
+        ! threads busy if the work per column is very nonuniform.  With
+        ! many blocks to process, guided is probably the best choice.
+
+        !$omp do collapse(2) schedule(guided)
+        do kblk = koffset, nblk_k-1, 2
+           do jblk = joffset, nblk_j-1, 2
+
+              do k = kblk*blksiz_k, min((kblk+1)*blksiz_k-1, mz+1)
+                 do j = jblk*blksiz_j, min((jblk+1)*blksiz_j-1, my+1)
+                    
+                    ! copy data along a slice into 1d array:
+                    do i = 1-num_ghost, mx+num_ghost
+                       do m = 1, num_eqn
+                          q1d(m,i,me1) = qold(m,i,j,k)
+                       end do
+                    end do
+
+                    if (index_capa > 0)  then
+                       do i = 1-num_ghost, mx+num_ghost
+                          dtdx1d(i,me1) = dtdx / aux(index_capa,i,j,k)
+                       end do
+                    endif
+
+                    if (num_aux > 0)  then
+                       do ka = -1, 1
+                          do i = 1-num_ghost, mx+num_ghost
+                             do ma = 1, num_aux
+                                aux1(ma, i, 2+ka, me1) = aux(ma, i, j-1, k+ka)
+                             end do
+                          end do
+                       end do
+                       do ka = -1, 1
+                          do i = 1-num_ghost, mx+num_ghost
+                             do ma = 1, num_aux
+                                aux2(ma, i, 2+ka, me1) = aux(ma, i,   j, k+ka)
+                             end do
+                          end do
+                       end do
+                       do ka = -1, 1
+                          do i = 1-num_ghost, mx+num_ghost
+                             do ma = 1, num_aux
+                                aux3(ma, i, 2+ka, me1) = aux(ma, i, j+1, k+ka)
+                             end do
+                          end do
+                       end do
+                    endif
+        
+                    ! Store the value of j and k along this slice in the common block
+                    ! comxyt in case it is needed in the Riemann solver (for
+                    ! variable coefficient problems)
+                    
+                    jcom = j
+                    kcom = k
+        
+                    ! compute modifications fadd, gadd and hadd to fluxes along
+                    ! this slice:
+                    
+                    call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, &
+                         q1d(1,1-num_ghost,me1),dtdx1d(1-num_ghost,me1),dtdy,dtdz, &
+                         aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), &
+                         aux3(1,1-num_ghost,1,me1),num_aux, &
+                         method,mthlim,qadd(1,1-num_ghost,me1),&
+                         fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1),&
+                         hadd(1,1,-1,1-num_ghost,me1),cfl1d, &
+                         work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),&
+                         work(i0amdq+me*nsiz), &
+                         work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), &
+                         work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), &
+                         work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), &
+                         work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), &
+                         work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), &
+                         work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), &
+                         work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), &
+                         work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), &
+                         work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), &
+                         work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), &
+                         work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), &
+                         work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), &
+                         work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), &
+                         work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), &
+                         work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), &
+                         use_fwave,rpn3,rpt3,rptt3)
+        
+                    cfl = dmax1(cfl,cfl1d)
+        
+                    ! update qnew by flux differencing.
+                    ! (rather than maintaining arrays f, g and h for
+                    ! the total fluxes, the modifications are used immediately
+                    ! to update qnew in order to save storage.)
+
+                    if(index_capa == 0)then
+                       ! Order so that access to qnew is in memory-contiguous
+                       ! order as much as possible
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) &
+                                  - dtdy * gadd(m,1,-1,i,me1) &
+                                  - dtdz * hadd(m,1,-1,i,me1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k-1) = qnew(m,i,j,k-1) &
+                                  - dtdy * ( gadd(m,2,-1,i,me1) &
+                                  -   gadd(m,1,-1,i,me1) ) &
+                                  - dtdz * hadd(m,1,0,i,me1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) &
+                                  + dtdy * gadd(m,2,-1,i,me1) &
+                                  - dtdz * hadd(m,1,1,i,me1)
+                          end do
+                       end do
+                       
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k) = qnew(m,i,j-1,k) &
+                                  - dtdy * gadd(m,1,0,i,me1) &
+                                  - dtdz * ( hadd(m,2,-1,i,me1) &
+                                  -   hadd(m,1,-1,i,me1) )
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) &
+                                  - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) &
+                                  - dtdy * (gadd(m,2,0,i,me1) - gadd(m,1,0,i,me1)) &
+                                  - dtdz * (hadd(m,2,0,i,me1) - hadd(m,1,0,i,me1))
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k) = qnew(m,i,j+1,k) &
+                                  + dtdy * gadd(m,2,0,i,me1) &
+                                  - dtdz * ( hadd(m,2,1,i,me1) &
+                                  -   hadd(m,1,1,i,me1) )
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) &
+                                  - dtdy * gadd(m,1,1,i,me1) &
+                                  + dtdz * hadd(m,2,-1,i,me1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k+1) = qnew(m,i,j,k+1) &
+                                  - dtdy * ( gadd(m,2,1,i,me1) &
+                                  -   gadd(m,1,1,i,me1) ) &
+                                  + dtdz * hadd(m,2,0,i,me1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) &
+                                  + dtdy * gadd(m,2,1,i,me1) &
+                                  + dtdz * hadd(m,2,1,i,me1)
+                          end do
+                       end do
+                    else
+                       ! with capa array
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) &
+                                  - (dtdy * gadd(m,1,-1,i,me1) &
+                                  +  dtdz * hadd(m,1,-1,i,me1)) &
+                                  / aux(index_capa,i,j-1,k-1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k-1) = qnew(m,i,j,k-1) &
+                                  - (dtdy * ( gadd(m,2,-1,i,me1) &
+                                  -   gadd(m,1,-1,i,me1) ) &
+                                  +  dtdz * hadd(m,1,0,i,me1)) &
+                                  / aux(index_capa,i,j,k-1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) &
+                                  + (dtdy * gadd(m,2,-1,i,me1) &
+                                  -  dtdz * hadd(m,1,1,i,me1)) &
+                                  / aux(index_capa,i,j+1,k-1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k) = qnew(m,i,j-1,k) &
+                                  - (dtdy * gadd(m,1,0,i,me1) &
+                                  +  dtdz * ( hadd(m,2,-1,i,me1) &
+                                  -   hadd(m,1,-1,i,me1) )) &
+                                  / aux(index_capa,i,j-1,k)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) &
+                                  - (dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) &
+                                  +  dtdy * (gadd(m,2,0,i,me1) - gadd(m,1,0,i,me1)) &
+                                  +  dtdz * (hadd(m,2,0,i,me1) - hadd(m,1,0,i,me1)))&
+                                  / aux(index_capa,i,j,k)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k)   = qnew(m,i,j+1,k) &
+                                  + (dtdy * gadd(m,2,0,i,me1) &
+                                  - dtdz * ( hadd(m,2,1,i,me1) &
+                                  -   hadd(m,1,1,i,me1) )) &
+                                  / aux(index_capa,i,j+1,k)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) &
+                                  - (dtdy * gadd(m,1,1,i,me1) &
+                                  -  dtdz * hadd(m,2,-1,i,me1)) &
+                                  / aux(index_capa,i,j-1,k+1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k+1)   = qnew(m,i,j,k+1) &
+                                  - (dtdy * ( gadd(m,2,1,i,me1) &
+                                  -   gadd(m,1,1,i,me1) ) &
+                                  -  dtdz * hadd(m,2,0,i,me1)) &
+                                  / aux(index_capa,i,j,k+1)
+                          end do
+                       end do
+                       do i = 1, mx
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) &
+                                  + (dtdy * gadd(m,2,1,i,me1) &
+                                  +  dtdz * hadd(m,2,1,i,me1)) &
+                                  / aux(index_capa,i,j+1,k+1)
+                          end do
+                       end do
+                    endif
+                    
+                 end do
+              end do    ! End loops within block
+              
+           end do
+        end do    ! End loops over blocks
+
+        ! Flush memory (may not be necessary), then make sure everybody
+        ! synchronizes before the next offset
+
+        !$omp flush
+        !$omp barrier
+     end do
+  end do    ! End loops over offsets
+
+  ! perform y sweeps
+  ! ==================
+
+  do koffset=0,1
+     do ioffset=0,1
+
+        !$omp do collapse(2) schedule(guided)
+        do kblk = koffset, nblk_k-1, 2
+           do iblk = ioffset, nblk_i-1, 2
+
+              do k = kblk*blksiz_k, min((kblk+1)*blksiz_k-1, mz+1)
+                 do i = iblk*blksiz_i, min((iblk+1)*blksiz_i-1, mx+1)
+                    
+                    ! copy data along a slice into 1d array:
+                    do j = 1-num_ghost, my+num_ghost
+                       do m = 1, num_eqn
+                          q1d(m,j,me1) = qold(m,i,j,k)
+                       end do
+                    end do
+
+                    if (index_capa > 0)  then
+                       do j = 1-num_ghost, my+num_ghost
+                          dtdy1d(j,me1) = dtdy / aux(index_capa,i,j,k)
+                       end do
+                    endif
+                    
+                    if (num_aux > 0)  then
+                       ! aux1, aux2, aux3 probably fit in cache, so optimize
+                       ! access to aux
+                       do j = 1-num_ghost, my+num_ghost
+                          do ia = -1, 1
+                             do ma = 1, num_aux
+                                aux1(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k-1)
+                             end do
+                          end do
+                       end do
+                       do j = 1-num_ghost, my+num_ghost
+                          do ia = -1, 1
+                             do ma = 1, num_aux
+                                aux2(ma, j, 2+ia, me1) = aux(ma, i+ia, j,   k)
+                             end do
+                          end do
+                       end do
+                       do j = 1-num_ghost, my+num_ghost
+                          do ia = -1, 1
+                             do ma = 1, num_aux
+                                aux3(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k+1)
+                             end do
+                          end do
+                       end do
+                    endif
+                    
+                    ! Store the value of i and k along this slice in the common block
+                    ! comxyzt in case it is needed in the Riemann solver (for
+                    ! variable coefficient problems)
+        
+                    icom = i
+                    kcom = k
+        
+                    ! compute modifications fadd, gadd and hadd to fluxes along this
+                    ! slice:
+
+                    call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, &
+                         q1d(1,1-num_ghost,me1),dtdy1d(1-num_ghost,me1),dtdz,dtdx, &
+                         aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), &
+                         aux3(1,1-num_ghost,1,me1),num_aux, &
+                         method,mthlim,qadd(1,1-num_ghost,me1),&
+                         fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1),&
+                         hadd(1,1,-1,1-num_ghost,me1),cfl1d, &
+                         work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),&
+                         work(i0amdq+me*nsiz), &
+                         work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), &
+                         work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), &
+                         work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), &
+                         work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), &
+                         work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), &
+                         work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), &
+                         work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), &
+                         work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), &
+                         work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), &
+                         work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), &
+                         work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), &
+                         work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), &
+                         work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), &
+                         work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), &
+                         work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), &
+                         use_fwave,rpn3,rpt3,rptt3)
+        
+                    cfl = dmax1(cfl,cfl1d)
+
+                    ! update qnew by flux differencing.
+                    ! Note that the roles of the flux updates are changed.
+                    ! fadd - modifies the g-fluxes
+                    ! gadd - modifies the h-fluxes
+                    ! hadd - modifies the f-fluxes
+                    
+                    if( index_capa == 0)then
+                       ! no capa array.  Standard flux differencing:
+                       do j = 1, my
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) &
+                                  - dtdz * gadd(m,1,-1,j,me1) &
+                                  - dtdx * hadd(m,1,-1,j,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k-1) = qnew(m,i,j,k-1) &
+                                  - dtdz * gadd(m,1,0,j,me1) &
+                                  - dtdx * ( hadd(m,2,-1,j,me1) &
+                                  -   hadd(m,1,-1,j,me1) )
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) &
+                                  - dtdz * gadd(m,1,1,j,me1) &
+                                  + dtdx * hadd(m,2,-1,j,me1)
+                          end do
+                       end do
+                       do j = 1, my
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k) = qnew(m,i-1,j,k) &
+                                  - dtdz * ( gadd(m,2,-1,j,me1) &
+                                  -   gadd(m,1,-1,j,me1) ) &
+                                  - dtdx * hadd(m,1,0,j,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) &
+                                  - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) &
+                                  - dtdz * (gadd(m,2,0,j,me1) - gadd(m,1,0,j,me1)) &
+                                  - dtdx * (hadd(m,2,0,j,me1) - hadd(m,1,0,j,me1))
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k) = qnew(m,i+1,j,k) &
+                                  - dtdz * ( gadd(m,2,1,j,me1) &
+                                  -   gadd(m,1,1,j,me1) ) &
+                                  + dtdx * hadd(m,2,0,j,me1)
+                          end do
+                       end do
+                       do j = 1, my
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) &
+                                  + dtdz * gadd(m,2,-1,j,me1) &
+                                  -  dtdx*hadd(m,1,1,j,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k+1) = qnew(m,i,j,k+1) &
+                                  + dtdz * gadd(m,2,0,j,me1) &
+                                  - dtdx * ( hadd(m,2,1,j,me1) &
+                                  -   hadd(m,1,1,j,me1) )
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) &
+                                  + dtdz * gadd(m,2,1,j,me1) &
+                                  +  dtdx * hadd(m,2,1,j,me1)
+                          end do
+                       end do
+                    else
+                       ! with capa array.
+                       do j = 1, my
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) &
+                                  - (dtdz * gadd(m,1,-1,j,me1) &
+                                  +  dtdx * hadd(m,1,-1,j,me1)) &
+                                  / aux(index_capa,i-1,j,k-1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k-1) = qnew(m,i,j,k-1) &
+                                  - (dtdz * gadd(m,1,0,j,me1) &
+                                  +  dtdx * ( hadd(m,2,-1,j,me1) &
+                                  -   hadd(m,1,-1,j,me1) )) &
+                                  / aux(index_capa,i,j,k-1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) &
+                                  - (dtdz * gadd(m,1,1,j,me1) &
+                                  -  dtdx * hadd(m,2,-1,j,me1)) &
+                                  / aux(index_capa,i+1,j,k-1)
+                          end do
+                       end do
+                       do j = 1, my
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k) = qnew(m,i-1,j,k) &
+                                  - (dtdz * ( gadd(m,2,-1,j,me1) &
+                                  -   gadd(m,1,-1,j,me1) ) &
+                                  +  dtdx * hadd(m,1,0,j,me1)) &
+                                  / aux(index_capa,i-1,j,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) &
+                                  - (dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) &
+                                  +  dtdz * (gadd(m,2,0,j,me1) - gadd(m,1,0,j,me1)) &
+                                  +  dtdx * (hadd(m,2,0,j,me1) - hadd(m,1,0,j,me1)))&
+                                  / aux(index_capa,i,j,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k) = qnew(m,i+1,j,k) &
+                                  - (dtdz * ( gadd(m,2,1,j,me1) &
+                                  -   gadd(m,1,1,j,me1) ) &
+                                  -  dtdx * hadd(m,2,0,j,me1) ) &
+                                  / aux(index_capa,i+1,j,k)
+                          end do
+                       end do
+                       do j = 1, my
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) &
+                                  + (dtdz * gadd(m,2,-1,j,me1) &
+                                  -  dtdx*hadd(m,1,1,j,me1)) &
+                                  / aux(index_capa,i-1,j,k+1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k+1) = qnew(m,i,j,k+1) &
+                                  + (dtdz * gadd(m,2,0,j,me1) &
+                                  -  dtdx * ( hadd(m,2,1,j,me1) &
+                                  -   hadd(m,1,1,j,me1) )) &
+                                  / aux(index_capa,i,j,k+1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) &
+                                  + (dtdz * gadd(m,2,1,j,me1) &
+                                  +  dtdx * hadd(m,2,1,j,me1)) &
+                                  / aux(index_capa,i+1,j,k+1)
+                          end do
+                       end do
+                    endif
+                    
+                 end do
+              end do    ! End loops within blocks
+
+           end do
+        end do    ! End loops over blocks
+        
+        !$omp flush
+        !$omp barrier
+     end do
+  end do    ! End loops over offsets
+
+  ! perform z sweeps
+  ! ==================
+
+  do joffset=0,1
+     do ioffset=0,1
+        
+        !$omp do collapse(2) schedule(guided)
+        do jblk = joffset, nblk_j-1, 2
+           do iblk = ioffset, nblk_i-1, 2
+              
+              do j = jblk*blksiz_j, min((jblk+1)*blksiz_j-1, my+1)
+                 do i = iblk*blksiz_i, min((iblk+1)*blksiz_i-1, mx+1)
+                    
+                    do k = 1-num_ghost, mz+num_ghost
+                       do m = 1, num_eqn
+                          q1d(m,k,me1) = qold(m,i,j,k)
+                       end do
+                    end do
+                    
+                    if (index_capa > 0)  then
+                       do k = 1-num_ghost, mz+num_ghost
+                          dtdz1d(k,me1) = dtdz / aux(index_capa,i,j,k)
+                       end do
+                    endif
+                    
+                    if (num_aux > 0)  then
+                       do k = 1-num_ghost, mz+num_ghost
+                          do ja = -1, 1
+                             do ma = 1, num_aux
+                                aux1(ma, k, 2+ja, me1) = aux(ma, i-1, j+ja, k)
+                             end do
+                             do ma = 1, num_aux
+                                aux2(ma, k, 2+ja, me1) = aux(ma,   i, j+ja, k)
+                             end do
+                             do ma = 1, num_aux
+                                aux3(ma, k, 2+ja, me1) = aux(ma, i+1, j+ja, k)
+                             end do
+                          end do
+                       end do
+                    endif
+
+
+        
+                    ! Store the value of i and j along this slice in the common block
+                    ! comxyzt in case it is needed in the Riemann solver (for
+                    ! variable coefficient problems)
+        
+                    icom = i
+                    jcom = j
+        
+                    ! compute modifications fadd, gadd and hadd to fluxes along this
+                    ! slice:
+
+                    call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, &
+                         q1d(1,1-num_ghost,me1),dtdz1d(1-num_ghost,me1),dtdx,dtdy, &
+                         aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), &
+                         aux3(1,1-num_ghost,1,me1),num_aux, &
+                         method,mthlim,qadd(1,1-num_ghost,me1), &
+                         fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1), &
+                         hadd(1,1,-1,1-num_ghost,me1),cfl1d, &
+                         work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s), &
+                         work(i0amdq+me*nsiz), &
+                         work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), &
+                         work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), &
+                         work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), &
+                         work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), &
+                         work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), &
+                         work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), &
+                         work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), &
+                         work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), &
+                         work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), &
+                         work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), &
+                         work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), &
+                         work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), &
+                         work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), &
+                         work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), &
+                         work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), &
+                         use_fwave,rpn3,rpt3,rptt3)
+        
+                    cfl = dmax1(cfl,cfl1d)
+
+                    ! update qnew by flux differencing.
+                    ! Note that the roles of the flux updates are changed.
+                    ! fadd - modifies the h-fluxes
+                    ! gadd - modifies the f-fluxes
+                    ! hadd - modifies the g-fluxes
+
+                    if(index_capa == 0)then
+                       
+                       ! no capa array. Standard flux differencing:
+                       do k = 1, mz
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) &
+                                  - dtdx * gadd(m,1,-1,k,me1) &
+                                  - dtdy * hadd(m,1,-1,k,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k) = qnew(m,i,j-1,k) &
+                                  - dtdx * ( gadd(m,2,-1,k,me1) &
+                                  -   gadd(m,1,-1,k,me1) ) &
+                                  - dtdy * hadd(m,1,0,k,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) &
+                                  + dtdx * gadd(m,2,-1,k,me1) &
+                                  - dtdy * hadd(m,1,1,k,me1)
+                          end do
+                          
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k) = qnew(m,i-1,j,k) &
+                                  - dtdx * gadd(m,1,0,k,me1) &
+                                  - dtdy * ( hadd(m,2,-1,k,me1) &
+                                  -   hadd(m,1,-1,k,me1) )
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) &
+                                  - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) &
+                                  - dtdx * (gadd(m,2,0,k,me1) - gadd(m,1,0,k,me1)) &
+                                  - dtdy * (hadd(m,2,0,k,me1) - hadd(m,1,0,k,me1))
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k) = qnew(m,i+1,j,k) &
+                                  + dtdx * gadd(m,2,0,k,me1) &
+                                  - dtdy * ( hadd(m,2,1,k,me1) &
+                                  -   hadd(m,1,1,k,me1) )
+                          end do
+                          
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) &
+                                  - dtdx * gadd(m,1,1,k,me1) &
+                                  + dtdy * hadd(m,2,-1,k,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k) = qnew(m,i,j+1,k) &
+                                  - dtdx * ( gadd(m,2,1,k,me1) &
+                                  -   gadd(m,1,1,k,me1) ) &
+                                  + dtdy * hadd(m,2,0,k,me1)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) &
+                                  + dtdx * gadd(m,2,1,k,me1) &
+                                  + dtdy * hadd(m,2,1,k,me1)
+                          end do
+                       end do
+                    else
+                       
+                       ! with capa array
+                       do k = 1, mz
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) &
+                                  - (dtdx * gadd(m,1,-1,k,me1) &
+                                  +  dtdy * hadd(m,1,-1,k,me1)) &
+                                  / aux(index_capa,i-1,j-1,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j-1,k) = qnew(m,i,j-1,k) &
+                                  - (dtdx * ( gadd(m,2,-1,k,me1) &
+                                  -   gadd(m,1,-1,k,me1) ) &
+                                  +  dtdy * hadd(m,1,0,k,me1)) &
+                                  / aux(index_capa,i,j-1,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) &
+                                  + (dtdx * gadd(m,2,-1,k,me1) &
+                                  -  dtdy * hadd(m,1,1,k,me1)) &
+                                  / aux(index_capa,i+1,j-1,k)
+                          end do
+                          
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j,k)   = qnew(m,i-1,j,k) &
+                                  - (dtdx * gadd(m,1,0,k,me1) &
+                                  + dtdy * ( hadd(m,2,-1,k,me1) &
+                                  -   hadd(m,1,-1,k,me1) )) &
+                                  / aux(index_capa,i-1,j,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) &
+                                  - (dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) &
+                                  +  dtdx * (gadd(m,2,0,k,me1) - gadd(m,1,0,k,me1)) &
+                                  +  dtdy * (hadd(m,2,0,k,me1) - hadd(m,1,0,k,me1)))&
+                                  / aux(index_capa,i,j,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j,k) = qnew(m,i+1,j,k) &
+                                  + (dtdx * gadd(m,2,0,k,me1) &
+                                  - dtdy * ( hadd(m,2,1,k,me1) &
+                                  -   hadd(m,1,1,k,me1) )) &
+                                  / aux(index_capa,i+1,j,k)
+                          end do
+                          
+                          do m = 1, num_eqn
+                             qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) &
+                                  - (dtdx * gadd(m,1,1,k,me1) &
+                                  -  dtdy * hadd(m,2,-1,k,me1)) &
+                                  / aux(index_capa,i-1,j+1,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i,j+1,k) = qnew(m,i,j+1,k) &
+                                  - (dtdx * ( gadd(m,2,1,k,me1) &
+                                  -   gadd(m,1,1,k,me1) ) &
+                                  -  dtdy * hadd(m,2,0,k,me1)) &
+                                  / aux(index_capa,i,j+1,k)
+                          end do
+                          do m = 1, num_eqn
+                             qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) &
+                                  + (dtdx * gadd(m,2,1,k,me1) &
+                                  +  dtdy * hadd(m,2,1,k,me1)) &
+                                  / aux(index_capa,i+1,j+1,k)
+                          end do
+                       end do
+                    endif
+                    
+                 end do
+              end do
+              
+           end do
+        end do
+
+        !$omp flush
+        !$omp barrier
+     end do
+  end do
+  !$omp end parallel
+
+  return
+end subroutine step3
 
 
diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90
index 3b3ed70a6..79d9f0d89 100644
--- a/src/pyclaw/classic/step3ds.f90
+++ b/src/pyclaw/classic/step3ds.f90
@@ -118,8 +118,9 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
 
   if (iused > mwork) then
      ! This shouldn't happen due to checks in claw3
-     write(6,*) '*** not enough work space in step3'
+     write(6,*) '*** not enough work space in step3ds()'
      write(6,*) '*** iused = ', iused, '   mwork =',mwork
+     write(6,*) '*** nthreads = ',nthreads
      stop
   endif
 

From ba15781dfccec35f9361140f5fee42f818aea756 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Thu, 5 Nov 2015 09:28:07 -0800
Subject: [PATCH 14/19] Updated implementation to use 1 OpenMP thread by
 default and give a warning that the 'OMP_NUM_THREADS' environment varibale is
 unset.  This allows an unaware user to run their codes without any changes,
 and they will see the warning and can adjust accordingly.

---
 src/pyclaw/classic/solver.py   | 9 ++++++---
 src/pyclaw/classic/step3.f90   | 5 +++++
 src/pyclaw/classic/step3ds.f90 | 5 +++++
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index 888cb181a..af9b94523 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -612,9 +612,12 @@ def __init__(self, riemann_solver=None, claw_package=None):
         self.work = None
 
         import os
-        import multiprocessing
-        self.nthreads = int(os.environ.get('OMP_NUM_THREADS',\
-                                           multiprocessing.cpu_count()))
+        import warnings
+        if 'OMP_NUM_THREADS' in os.environ:
+            pass # Using OpenMP
+        else:
+            warnings.warn("The environment variable 'OMP_NUM_THREADS' is unset.  Set this variable to use OpenMP with more than 1 thread (i.e. export OMP_NUM_THREADS=4).")
+        self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1))
 
         super(ClawSolver3D,self).__init__(riemann_solver, claw_package)
 
diff --git a/src/pyclaw/classic/step3.f90 b/src/pyclaw/classic/step3.f90
index ec8b91556..89233b85c 100644
--- a/src/pyclaw/classic/step3.f90
+++ b/src/pyclaw/classic/step3.f90
@@ -40,6 +40,7 @@ subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
   dimension method(7),mthlim(num_waves)
   dimension work(mwork)
   logical :: use_fwave
+  character :: env_value
 
   ! These block sizes must be at least 2 to avoid multiple threads
   ! trying to update the same grid cell
@@ -61,6 +62,10 @@ subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
   ! flux3 routine.  Find starting index of each piece:
 
   nthreads=1    ! Serial
+  CALL GET_ENVIRONMENT_VARIABLE('OMP_NUM_THREADS',env_value,nlen,nstat)
+  IF(nstat >= 1)THEN
+     !$ call omp_set_num_threads(1) ! set default to 1 thread
+  END IF
   !$omp parallel
   !$omp single
   !$ nthreads = omp_get_num_threads()
diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90
index 79d9f0d89..4186d86e5 100644
--- a/src/pyclaw/classic/step3ds.f90
+++ b/src/pyclaw/classic/step3ds.f90
@@ -54,6 +54,7 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
   dimension method(7),mthlim(num_waves)
   dimension work(mwork)
   logical :: use_fwave
+  character :: env_value
 
   common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom
 
@@ -71,6 +72,10 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, &
   ! flux3 routine.  Find starting index of each piece:
 
   nthreads=1 ! Serial
+  CALL GET_ENVIRONMENT_VARIABLE('OMP_NUM_THREADS',env_value,nlen,nstat)
+  IF(nstat >= 1)THEN
+     !$ call omp_set_num_threads(1) ! set default to 1 thread
+  END IF
   !$OMP PARALLEL
   !$OMP SINGLE
   !$ nthreads = omp_get_num_threads()

From 55c4a3d48c8f2b22e02efd4ef41b97a92c20ce09 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Thu, 5 Nov 2015 09:52:37 -0800
Subject: [PATCH 15/19] small simplification to OMP_NUM_THREADS env variable
 check.

---
 src/pyclaw/classic/solver.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index af9b94523..642320912 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -613,9 +613,7 @@ def __init__(self, riemann_solver=None, claw_package=None):
 
         import os
         import warnings
-        if 'OMP_NUM_THREADS' in os.environ:
-            pass # Using OpenMP
-        else:
+        if 'OMP_NUM_THREADS' not in os.environ:
             warnings.warn("The environment variable 'OMP_NUM_THREADS' is unset.  Set this variable to use OpenMP with more than 1 thread (i.e. export OMP_NUM_THREADS=4).")
         self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1))
 

From 92158db1bcede02fc25b5e5354aa15f425c45171 Mon Sep 17 00:00:00 2001
From: Wes Lowrie <wlowrie@ara.com>
Date: Fri, 6 Nov 2015 09:31:15 -0800
Subject: [PATCH 16/19] Updated OpenMP warning to use the PyClaw logging.  This
 prints to the console and to the log file.

---
 src/pyclaw/classic/solver.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index 642320912..2b91c234e 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -610,15 +610,17 @@ def __init__(self, riemann_solver=None, claw_package=None):
         self.aux2 = None
         self.aux3 = None
         self.work = None
+        self.nthreads = None
+
+        super(ClawSolver3D,self).__init__(riemann_solver, claw_package)
 
         import os
-        import warnings
         if 'OMP_NUM_THREADS' not in os.environ:
-            warnings.warn("The environment variable 'OMP_NUM_THREADS' is unset.  Set this variable to use OpenMP with more than 1 thread (i.e. export OMP_NUM_THREADS=4).")
+            self.logger.warn(""""The env variable 'OMP_NUM_THREADS' is unset.
+                                       Set this variable to use OpenMP with more 
+                                       than 1 thread (i.e. export OMP_NUM_THREADS=4).""")
         self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1))
 
-        super(ClawSolver3D,self).__init__(riemann_solver, claw_package)
-
     # ========== Setup routine =============================   
     def _allocate_workspace(self,solution):
         r"""

From 0b595de01b554af20f52c18a0f3873435c33f42e Mon Sep 17 00:00:00 2001
From: Wes Lowrie ARA/SED <wlowrie@ara.com>
Date: Wed, 22 Nov 2017 16:30:03 -0800
Subject: [PATCH 17/19] Various changes to pyclaw.  Updates to properly handle
 restarts using hdf5

---
 examples/euler_1d/shocksine.py |  7 ++++++-
 src/pyclaw/classic/setup.py    |  2 +-
 src/pyclaw/classic/solver.py   | 25 +++++++++++++++++++++++--
 src/pyclaw/io/hdf5.py          |  4 ++--
 src/pyclaw/sharpclaw/solver.py | 21 +++++++++++++++++----
 src/pyclaw/solver.py           | 21 ++++++++++++++++++++-
 src/pyclaw/state.py            |  2 +-
 7 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/examples/euler_1d/shocksine.py b/examples/euler_1d/shocksine.py
index 57d9229c8..b770f3ec9 100755
--- a/examples/euler_1d/shocksine.py
+++ b/examples/euler_1d/shocksine.py
@@ -52,10 +52,15 @@ def setup(use_petsc=False,iplot=False,htmlplot=False,outdir='./_output',solver_t
 
     if solver_type=='sharpclaw':
         solver = pyclaw.SharpClawSolver1D(rs)
-        solver.time_integrator = 'RK'
+        solver.time_integrator = 'Euler'
         solver.a, solver.b, solver.c = a, b, c
         solver.cfl_desired = 0.6
         solver.cfl_max = 0.7
+
+        solver.weno_order = 2
+        solver.num_ghost = 2
+        solver.lim_type = 1
+        
         if use_char_decomp:
             try:
                 import sharpclaw1               # Import custom Fortran code
diff --git a/src/pyclaw/classic/setup.py b/src/pyclaw/classic/setup.py
index a50de681b..d5ff10de7 100644
--- a/src/pyclaw/classic/setup.py
+++ b/src/pyclaw/classic/setup.py
@@ -13,7 +13,7 @@ def configuration(parent_package='',top_path=None):
                          ['limiter.f90','philim.f90','flux2.f90','step2ds.f90','step2.f90'],f2py_options=['--quiet'])
 
     config.add_extension('classic3',
-                         ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp'],libraries=['gomp'])
+                         ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp -Warray-bounds -fcheck=all'],libraries=['gomp'])
 
     return config
 
diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index 2b91c234e..57fdf6474 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -141,7 +141,7 @@ def step(self,solution,take_one_step,tstart,tend):
             # Godunov Splitting
             if self.source_split == 1:
                 self.step_source(self,solution.states[0],self.dt)
-                
+
         return True
 
     def _check_cfl_settings(self):
@@ -684,7 +684,7 @@ def step_hyperbolic(self,solution):
             if self.dimensional_split:
                 #Right now only Godunov-dimensional-splitting is implemented.
                 #Strang-dimensional-splitting could be added following dimsp3.f in Clawpack.
-
+                #"""
                 q, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
                     mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\
                     self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\
@@ -699,6 +699,27 @@ def step_hyperbolic(self,solution):
                     mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\
                     self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\
                     self.fwave,rpn3,rpt3,rptt3)
+                #"""
+
+                """
+                #print "No Dim Splitting..."
+                # No Dimensional Splitting, Similar to SharpClaw
+                dq = qnew.copy('F')
+                dq[...] = 0.
+                dq, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\
+                    self.fwave,rpn3,rpt3,rptt3)
+                dq, cfl_y = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,2,\
+                    self.fwave,rpn3,rpt3,rptt3)
+                dq, cfl_z = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\
+                    self.fwave,rpn3,rpt3,rptt3)
+                self.qbc += dq
+                """
 
                 cfl = max(cfl_x,cfl_y,cfl_z)
 
diff --git a/src/pyclaw/io/hdf5.py b/src/pyclaw/io/hdf5.py
index 390e5e4ee..8dbde118a 100644
--- a/src/pyclaw/io/hdf5.py
+++ b/src/pyclaw/io/hdf5.py
@@ -193,11 +193,11 @@ def read(solution,frame,path='./',file_prefix='claw',read_aux=True,
                 state = pyclaw.state.State(pyclaw_patch, \
                          patch.attrs['num_eqn'],patch.attrs['num_aux'])
                 state.t = patch.attrs['t']
-                state.q = patch['q'][:].reshape(state.q.shape,order='F')
+                state.q = patch['q'][:].ravel(order='F').reshape(state.q.shape,order='F')
 
                 # Read in aux if applicable
                 if read_aux and patch.get('aux',None) is not None:
-                    state.aux = patch['aux'][:].reshape(state.aux.shape,order='F')
+                    state.aux = patch['aux'][:].ravel(order='F').reshape(state.aux.shape,order='F')
 
                 solution.states.append(state)
                 patches.append(pyclaw_patch)
diff --git a/src/pyclaw/sharpclaw/solver.py b/src/pyclaw/sharpclaw/solver.py
index 3a42282b7..4158d5861 100644
--- a/src/pyclaw/sharpclaw/solver.py
+++ b/src/pyclaw/sharpclaw/solver.py
@@ -284,6 +284,7 @@ def step(self,solution,take_one_step,tstart,tend):
         Take one step with a Runge-Kutta or multistep method as specified by
         `solver.time_integrator`.
         """
+        
         state = solution.states[0]
         step_index = self.status['numsteps'] + 1
         if self.accept_step == True:
@@ -515,9 +516,19 @@ def check_3rd_ord_cond(self,state,step_index,dtFE):
     def _set_mthlim(self):
         self._mthlim = self.limiters
         if not isinstance(self.limiters,list): self._mthlim=[self._mthlim]
-        if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_waves
-        if len(self._mthlim)!=self.num_waves:
-            raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_waves')
+        if self.lim_type==1:
+            if self.char_decomp == 0 or self.char_decomp == 3:
+                if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_eqn
+                if len(self._mthlim)!=self.num_eqn:
+                    raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_eqn')
+            elif self.char_decomp == 1 or self.char_decomp == 4:
+                if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_waves
+                if len(self._mthlim)!=self.num_waves:
+                    raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_waves')
+        else:
+            if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_waves
+            if len(self._mthlim)!=self.num_waves:
+                raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_waves')
 
        
     def dq(self,state):
@@ -526,7 +537,9 @@ def dq(self,state):
         """
 
         deltaq = self.dq_hyperbolic(state)
-
+        
+        #state.q += deltaq
+        #deltaq[...] = 0.
         if self.dq_src is not None:
             deltaq+=self.dq_src(self,state,self.dt)
 
diff --git a/src/pyclaw/solver.py b/src/pyclaw/solver.py
index 5bc708c6a..e73209f3b 100644
--- a/src/pyclaw/solver.py
+++ b/src/pyclaw/solver.py
@@ -32,6 +32,14 @@ def before_step(solver,solution):
     """
     pass
 
+def after_step(solver,solution):
+    r"""
+    Dummy routine called after each step
+    
+    Replace this routine if you want to do something after each time step.
+    """
+    pass
+
 class Solver(object):
     r"""
     Pyclaw solver superclass.
@@ -85,6 +93,13 @@ class Solver(object):
         
         def before_step(solver,solution)
 
+    .. attribute:: after_step
+    
+        Function called after each time step is taken.
+        The required signature for this function is:
+        
+        def after_step(solver,solution)
+
     .. attribute:: dt_variable
     
         Whether to allow the time step to vary, ``default = True``.
@@ -179,6 +194,7 @@ def __init__(self,riemann_solver=None,claw_package=None):
         self._use_old_bc_sig = False
         self.accept_step = True
         self.before_step = None
+        self.after_step = None
 
         # select package to build solver objects from, by default this will be
         # the package that contains the module implementing the derived class
@@ -610,6 +626,9 @@ def evolve_to_time(self,solution,tend=None):
             # Note that the solver may alter dt during the step() routine
             self.step(solution,take_one_step,tstart,tend)
 
+            if self.after_step is not None:
+                self.after_step(self,solution.states[0])
+
             # Check to make sure that the Courant number was not too large
             cfl = self.cfl.get_cached_max()
             self.accept_step = self.accept_reject_step(state)
@@ -642,7 +661,7 @@ def evolve_to_time(self,solution,tend=None):
                     self.status['cflmax'] = \
                         max(cfl, self.status['cflmax'])
                     raise Exception('CFL too large, giving up!')
- 
+                
             # See if we are finished yet
             if solution.t >= tend or take_one_step:
                 break
diff --git a/src/pyclaw/state.py b/src/pyclaw/state.py
index c47dbc4d1..9851fd657 100755
--- a/src/pyclaw/state.py
+++ b/src/pyclaw/state.py
@@ -189,7 +189,7 @@ def is_valid(self):
             valid = False
         if self.aux is not None:
             if not self.aux.flags['F_CONTIGUOUS']:
-                logger.debug('q array is not Fortran contiguous.')
+                logger.debug('aux array is not Fortran contiguous.')
                 valid = False
         return valid
  

From cb6a50daf43fc6a483e5272c0b74007131a17ca8 Mon Sep 17 00:00:00 2001
From: Wes Lowrie ARA/SED <wlowrie@ara.com>
Date: Mon, 27 Nov 2017 09:47:36 -0800
Subject: [PATCH 18/19] Revert "Various changes to pyclaw.  Updates to properly
 handle restarts using hdf5"

This reverts commit 0b595de01b554af20f52c18a0f3873435c33f42e.
---
 examples/euler_1d/shocksine.py |  7 +------
 src/pyclaw/classic/setup.py    |  2 +-
 src/pyclaw/classic/solver.py   | 25 ++-----------------------
 src/pyclaw/io/hdf5.py          |  4 ++--
 src/pyclaw/sharpclaw/solver.py | 21 ++++-----------------
 src/pyclaw/solver.py           | 21 +--------------------
 src/pyclaw/state.py            |  2 +-
 7 files changed, 12 insertions(+), 70 deletions(-)

diff --git a/examples/euler_1d/shocksine.py b/examples/euler_1d/shocksine.py
index b770f3ec9..57d9229c8 100755
--- a/examples/euler_1d/shocksine.py
+++ b/examples/euler_1d/shocksine.py
@@ -52,15 +52,10 @@ def setup(use_petsc=False,iplot=False,htmlplot=False,outdir='./_output',solver_t
 
     if solver_type=='sharpclaw':
         solver = pyclaw.SharpClawSolver1D(rs)
-        solver.time_integrator = 'Euler'
+        solver.time_integrator = 'RK'
         solver.a, solver.b, solver.c = a, b, c
         solver.cfl_desired = 0.6
         solver.cfl_max = 0.7
-
-        solver.weno_order = 2
-        solver.num_ghost = 2
-        solver.lim_type = 1
-        
         if use_char_decomp:
             try:
                 import sharpclaw1               # Import custom Fortran code
diff --git a/src/pyclaw/classic/setup.py b/src/pyclaw/classic/setup.py
index d5ff10de7..a50de681b 100644
--- a/src/pyclaw/classic/setup.py
+++ b/src/pyclaw/classic/setup.py
@@ -13,7 +13,7 @@ def configuration(parent_package='',top_path=None):
                          ['limiter.f90','philim.f90','flux2.f90','step2ds.f90','step2.f90'],f2py_options=['--quiet'])
 
     config.add_extension('classic3',
-                         ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp -Warray-bounds -fcheck=all'],libraries=['gomp'])
+                         ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp'],libraries=['gomp'])
 
     return config
 
diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index 57fdf6474..2b91c234e 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -141,7 +141,7 @@ def step(self,solution,take_one_step,tstart,tend):
             # Godunov Splitting
             if self.source_split == 1:
                 self.step_source(self,solution.states[0],self.dt)
-
+                
         return True
 
     def _check_cfl_settings(self):
@@ -684,7 +684,7 @@ def step_hyperbolic(self,solution):
             if self.dimensional_split:
                 #Right now only Godunov-dimensional-splitting is implemented.
                 #Strang-dimensional-splitting could be added following dimsp3.f in Clawpack.
-                #"""
+
                 q, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
                     mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\
                     self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\
@@ -699,27 +699,6 @@ def step_hyperbolic(self,solution):
                     mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\
                     self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\
                     self.fwave,rpn3,rpt3,rptt3)
-                #"""
-
-                """
-                #print "No Dim Splitting..."
-                # No Dimensional Splitting, Similar to SharpClaw
-                dq = qnew.copy('F')
-                dq[...] = 0.
-                dq, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
-                    mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\
-                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\
-                    self.fwave,rpn3,rpt3,rptt3)
-                dq, cfl_y = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
-                    mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\
-                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,2,\
-                    self.fwave,rpn3,rpt3,rptt3)
-                dq, cfl_z = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
-                    mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\
-                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\
-                    self.fwave,rpn3,rpt3,rptt3)
-                self.qbc += dq
-                """
 
                 cfl = max(cfl_x,cfl_y,cfl_z)
 
diff --git a/src/pyclaw/io/hdf5.py b/src/pyclaw/io/hdf5.py
index 8dbde118a..390e5e4ee 100644
--- a/src/pyclaw/io/hdf5.py
+++ b/src/pyclaw/io/hdf5.py
@@ -193,11 +193,11 @@ def read(solution,frame,path='./',file_prefix='claw',read_aux=True,
                 state = pyclaw.state.State(pyclaw_patch, \
                          patch.attrs['num_eqn'],patch.attrs['num_aux'])
                 state.t = patch.attrs['t']
-                state.q = patch['q'][:].ravel(order='F').reshape(state.q.shape,order='F')
+                state.q = patch['q'][:].reshape(state.q.shape,order='F')
 
                 # Read in aux if applicable
                 if read_aux and patch.get('aux',None) is not None:
-                    state.aux = patch['aux'][:].ravel(order='F').reshape(state.aux.shape,order='F')
+                    state.aux = patch['aux'][:].reshape(state.aux.shape,order='F')
 
                 solution.states.append(state)
                 patches.append(pyclaw_patch)
diff --git a/src/pyclaw/sharpclaw/solver.py b/src/pyclaw/sharpclaw/solver.py
index 4158d5861..3a42282b7 100644
--- a/src/pyclaw/sharpclaw/solver.py
+++ b/src/pyclaw/sharpclaw/solver.py
@@ -284,7 +284,6 @@ def step(self,solution,take_one_step,tstart,tend):
         Take one step with a Runge-Kutta or multistep method as specified by
         `solver.time_integrator`.
         """
-        
         state = solution.states[0]
         step_index = self.status['numsteps'] + 1
         if self.accept_step == True:
@@ -516,19 +515,9 @@ def check_3rd_ord_cond(self,state,step_index,dtFE):
     def _set_mthlim(self):
         self._mthlim = self.limiters
         if not isinstance(self.limiters,list): self._mthlim=[self._mthlim]
-        if self.lim_type==1:
-            if self.char_decomp == 0 or self.char_decomp == 3:
-                if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_eqn
-                if len(self._mthlim)!=self.num_eqn:
-                    raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_eqn')
-            elif self.char_decomp == 1 or self.char_decomp == 4:
-                if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_waves
-                if len(self._mthlim)!=self.num_waves:
-                    raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_waves')
-        else:
-            if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_waves
-            if len(self._mthlim)!=self.num_waves:
-                raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_waves')
+        if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_waves
+        if len(self._mthlim)!=self.num_waves:
+            raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_waves')
 
        
     def dq(self,state):
@@ -537,9 +526,7 @@ def dq(self,state):
         """
 
         deltaq = self.dq_hyperbolic(state)
-        
-        #state.q += deltaq
-        #deltaq[...] = 0.
+
         if self.dq_src is not None:
             deltaq+=self.dq_src(self,state,self.dt)
 
diff --git a/src/pyclaw/solver.py b/src/pyclaw/solver.py
index e73209f3b..5bc708c6a 100644
--- a/src/pyclaw/solver.py
+++ b/src/pyclaw/solver.py
@@ -32,14 +32,6 @@ def before_step(solver,solution):
     """
     pass
 
-def after_step(solver,solution):
-    r"""
-    Dummy routine called after each step
-    
-    Replace this routine if you want to do something after each time step.
-    """
-    pass
-
 class Solver(object):
     r"""
     Pyclaw solver superclass.
@@ -93,13 +85,6 @@ class Solver(object):
         
         def before_step(solver,solution)
 
-    .. attribute:: after_step
-    
-        Function called after each time step is taken.
-        The required signature for this function is:
-        
-        def after_step(solver,solution)
-
     .. attribute:: dt_variable
     
         Whether to allow the time step to vary, ``default = True``.
@@ -194,7 +179,6 @@ def __init__(self,riemann_solver=None,claw_package=None):
         self._use_old_bc_sig = False
         self.accept_step = True
         self.before_step = None
-        self.after_step = None
 
         # select package to build solver objects from, by default this will be
         # the package that contains the module implementing the derived class
@@ -626,9 +610,6 @@ def evolve_to_time(self,solution,tend=None):
             # Note that the solver may alter dt during the step() routine
             self.step(solution,take_one_step,tstart,tend)
 
-            if self.after_step is not None:
-                self.after_step(self,solution.states[0])
-
             # Check to make sure that the Courant number was not too large
             cfl = self.cfl.get_cached_max()
             self.accept_step = self.accept_reject_step(state)
@@ -661,7 +642,7 @@ def evolve_to_time(self,solution,tend=None):
                     self.status['cflmax'] = \
                         max(cfl, self.status['cflmax'])
                     raise Exception('CFL too large, giving up!')
-                
+ 
             # See if we are finished yet
             if solution.t >= tend or take_one_step:
                 break
diff --git a/src/pyclaw/state.py b/src/pyclaw/state.py
index 9851fd657..c47dbc4d1 100755
--- a/src/pyclaw/state.py
+++ b/src/pyclaw/state.py
@@ -189,7 +189,7 @@ def is_valid(self):
             valid = False
         if self.aux is not None:
             if not self.aux.flags['F_CONTIGUOUS']:
-                logger.debug('aux array is not Fortran contiguous.')
+                logger.debug('q array is not Fortran contiguous.')
                 valid = False
         return valid
  

From 7a87fef924707c98fb5116c7929065a3df79ebdc Mon Sep 17 00:00:00 2001
From: Wes Lowrie ARA/SED <wlowrie@ara.com>
Date: Mon, 27 Nov 2017 10:17:35 -0800
Subject: [PATCH 19/19] after_step readded to commit, and comments to be able
 to quickly switch from dimensional split to non-dimensional split solver.

---
 src/pyclaw/classic/solver.py | 23 +++++++++++++++++++++++
 src/pyclaw/solver.py         | 20 ++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py
index 2b91c234e..20d3b030f 100644
--- a/src/pyclaw/classic/solver.py
+++ b/src/pyclaw/classic/solver.py
@@ -685,6 +685,7 @@ def step_hyperbolic(self,solution):
                 #Right now only Godunov-dimensional-splitting is implemented.
                 #Strang-dimensional-splitting could be added following dimsp3.f in Clawpack.
 
+                #"""
                 q, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
                     mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\
                     self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\
@@ -700,6 +701,28 @@ def step_hyperbolic(self,solution):
                     self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\
                     self.fwave,rpn3,rpt3,rptt3)
 
+                #"""
+
+                """
+                #print "No Dim Splitting..."
+                # No Dimensional Splitting, Similar to SharpClaw
+                dq = qnew.copy('F')
+                dq[...] = 0.
+                dq, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\
+                    self.fwave,rpn3,rpt3,rptt3)
+                dq, cfl_y = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,2,\
+                    self.fwave,rpn3,rpt3,rptt3)
+                dq, cfl_z = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\
+                    mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\
+                    self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\
+                    self.fwave,rpn3,rpt3,rptt3)
+                self.qbc += dq
+                """
+                
                 cfl = max(cfl_x,cfl_y,cfl_z)
 
             else:
diff --git a/src/pyclaw/solver.py b/src/pyclaw/solver.py
index 5bc708c6a..27a2acec5 100644
--- a/src/pyclaw/solver.py
+++ b/src/pyclaw/solver.py
@@ -32,6 +32,14 @@ def before_step(solver,solution):
     """
     pass
 
+def after_step(solver,solution):
+    r"""
+    Dummy routine called after each step
+    
+    Replace this routine if you want to do something after each time step.
+    """
+    pass
+
 class Solver(object):
     r"""
     Pyclaw solver superclass.
@@ -85,6 +93,14 @@ class Solver(object):
         
         def before_step(solver,solution)
 
+    .. attribute:: after_step
+    
+        Function called after each time step is taken.
+        The required signature for this function is:
+        
+        def after_step(solver,solution)
+
+
     .. attribute:: dt_variable
     
         Whether to allow the time step to vary, ``default = True``.
@@ -179,6 +195,7 @@ def __init__(self,riemann_solver=None,claw_package=None):
         self._use_old_bc_sig = False
         self.accept_step = True
         self.before_step = None
+        self.after_step = None
 
         # select package to build solver objects from, by default this will be
         # the package that contains the module implementing the derived class
@@ -610,6 +627,9 @@ def evolve_to_time(self,solution,tend=None):
             # Note that the solver may alter dt during the step() routine
             self.step(solution,take_one_step,tstart,tend)
 
+            if self.after_step is not None:
+                self.after_step(self,solution.states[0])
+
             # Check to make sure that the Courant number was not too large
             cfl = self.cfl.get_cached_max()
             self.accept_step = self.accept_reject_step(state)