diff --git a/src/pyclaw/classic/setup.py b/src/pyclaw/classic/setup.py index 03b55838e..a50de681b 100644 --- a/src/pyclaw/classic/setup.py +++ b/src/pyclaw/classic/setup.py @@ -13,7 +13,7 @@ def configuration(parent_package='',top_path=None): ['limiter.f90','philim.f90','flux2.f90','step2ds.f90','step2.f90'],f2py_options=['--quiet']) config.add_extension('classic3', - ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet']) + ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp'],libraries=['gomp']) return config diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index ab7277aef..2b91c234e 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -610,9 +610,17 @@ def __init__(self, riemann_solver=None, claw_package=None): self.aux2 = None self.aux3 = None self.work = None + self.nthreads = None super(ClawSolver3D,self).__init__(riemann_solver, claw_package) + import os + if 'OMP_NUM_THREADS' not in os.environ: + self.logger.warn(""""The env variable 'OMP_NUM_THREADS' is unset. + Set this variable to use OpenMP with more + than 1 thread (i.e. export OMP_NUM_THREADS=4).""") + self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1)) + # ========== Setup routine ============================= def _allocate_workspace(self,solution): r""" @@ -636,10 +644,11 @@ def _allocate_workspace(self,solution): # These work arrays really ought to live inside a fortran module # as is done for sharpclaw - self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3),order='F') - self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3),order='F') - self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3),order='F') - mwork = (maxm+2*num_ghost) * (31*num_eqn + num_waves + num_eqn*num_waves) + self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') + self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') + self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') + mwork = (maxm+2*num_ghost)\ + *(31*num_eqn + num_waves + num_eqn*num_waves)*self.nthreads self.work = np.empty((mwork),order='F') @@ -676,25 +685,29 @@ def step_hyperbolic(self,solution): #Right now only Godunov-dimensional-splitting is implemented. #Strang-dimensional-splitting could be added following dimsp3.f in Clawpack. - q, cfl_x = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \ - qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\ - self.aux1,self.aux2,self.aux3,self.work,1,self.fwave,rpn3,rpt3,rptt3) + q, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\ + self.fwave,rpn3,rpt3,rptt3) - q, cfl_y = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \ - q,q,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\ - self.aux1,self.aux2,self.aux3,self.work,2,self.fwave,rpn3,rpt3,rptt3) + q, cfl_y = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,2,\ + self.fwave,rpn3,rpt3,rptt3) - q, cfl_z = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \ - q,q,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\ - self.aux1,self.aux2,self.aux3,self.work,3,self.fwave,rpn3,rpt3,rptt3) + q, cfl_z = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\ + self.fwave,rpn3,rpt3,rptt3) cfl = max(cfl_x,cfl_y,cfl_z) else: - q, cfl = self.fmod.step3(maxm,self.num_ghost,mx,my,mz, \ - qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\ - self.aux1,self.aux2,self.aux3,self.work,self.fwave,rpn3,rpt3,rptt3) + q, cfl = self.fmod.step3(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,\ + self.fwave,rpn3,rpt3,rptt3) self.cfl.update_global_max(cfl) state.set_q_from_qbc(self.num_ghost,self.qbc) diff --git a/src/pyclaw/classic/step3.f90 b/src/pyclaw/classic/step3.f90 index 211f626b8..89233b85c 100644 --- a/src/pyclaw/classic/step3.f90 +++ b/src/pyclaw/classic/step3.f90 @@ -1,45 +1,51 @@ -! ================================================================== - subroutine step3(maxm,num_eqn,num_waves,num_ghost,mx,my, & - mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, & - qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d, & - aux1,aux2,aux3,num_aux,work,mwork,use_fwave,rpn3,rpt3,rptt3) -! ================================================================== - -! # Take one time step, updating q. -! # On entry, qold and qnew should be identical and give the -! # initial data for this step -! # On exit, qnew returns values at the end of the time step. -! # qold is unchanged. - -! # qadd is used to return increments to q from flux3. -! # fadd, gadd and hadd are used to return flux increments from flux3. -! # See the flux3 documentation for more information. - - - implicit real*8(a-h,o-z) - external rpn3,rpt3,rptt3 - dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost) - dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost) - dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost) - dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost) - dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost) - dimension aux(num_aux, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension dtdx1d(1-num_ghost:maxm+num_ghost) - dimension dtdy1d(1-num_ghost:maxm+num_ghost) - dimension dtdz1d(1-num_ghost:maxm+num_ghost) - dimension method(7),mthlim(num_waves) - dimension work(mwork) - logical :: use_fwave - +! ================================================================== +subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & + mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, & + qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d,aux1,aux2,aux3,& + num_aux,work,mwork,use_fwave,rpn3,rpt3,rptt3) +! ================================================================== + + ! Take one time step, updating q. + ! On entry, qold and qnew should be identical and give the + ! initial data for this step + ! On exit, qnew returns values at the end of the time step. + ! qold is unchanged. + ! + ! qadd is used to return increments to q from flux3. + ! fadd, gadd and hadd are used to return flux increments from flux3. + ! See the flux3 documentation for more information. + + !$ use omp_lib + + implicit real*8(a-h,o-z) + external rpn3,rpt3,rptt3 + dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension aux(num_aux, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension dtdx1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension dtdy1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension dtdz1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension method(7),mthlim(num_waves) + dimension work(mwork) + logical :: use_fwave + character :: env_value + + ! These block sizes must be at least 2 to avoid multiple threads + ! trying to update the same grid cell + integer, parameter :: blksiz_i = 2, blksiz_j = 2, blksiz_k = 2 + !f2py intent(out) cfl !f2py intent(in,out) qnew !f2py optional q1d, qadd, fadd, gadd, hadd, dtdx1d, dtdy1d, dtdz1d @@ -50,552 +56,850 @@ subroutine step3(maxm,num_eqn,num_waves,num_ghost,mx,my, & !f2py x=rpt3(x) !f2py x=rptt3(x) - common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom - - - -! # partition work array into pieces needed for local storage in -! # flux3 routine. Find starting index of each piece: - - i0wave = 1 - i0s = i0wave + (maxm+2*num_ghost)*num_eqn*num_waves - i0amdq = i0s + (maxm+2*num_ghost)*num_waves - i0apdq = i0amdq + (maxm+2*num_ghost)*num_eqn - i0cqxx = i0apdq + (maxm+2*num_ghost)*num_eqn - i0bmamdq = i0cqxx + (maxm+2*num_ghost)*num_eqn - i0bmapdq = i0bmamdq + (maxm+2*num_ghost)*num_eqn - i0bpamdq = i0bmapdq + (maxm+2*num_ghost)*num_eqn - i0bpapdq = i0bpamdq + (maxm+2*num_ghost)*num_eqn - i0cmamdq = i0bpapdq + (maxm+2*num_ghost)*num_eqn - i0cmapdq = i0cmamdq + (maxm+2*num_ghost)*num_eqn - i0cpamdq = i0cmapdq + (maxm+2*num_ghost)*num_eqn - i0cpapdq = i0cpamdq + (maxm+2*num_ghost)*num_eqn - i0cmamdq2 = i0cpapdq + (maxm+2*num_ghost)*num_eqn - i0cmapdq2 = i0cmamdq2 + (maxm+2*num_ghost)*num_eqn - i0cpamdq2 = i0cmapdq2 + (maxm+2*num_ghost)*num_eqn - i0cpapdq2 = i0cpamdq2 + (maxm+2*num_ghost)*num_eqn - i0bmcqxxp = i0cpapdq2 + (maxm+2*num_ghost)*num_eqn - i0bmcqxxm = i0bmcqxxp + (maxm+2*num_ghost)*num_eqn - i0bpcqxxp = i0bmcqxxm + (maxm+2*num_ghost)*num_eqn - i0bpcqxxm = i0bpcqxxp + (maxm+2*num_ghost)*num_eqn - i0cmcqxxp = i0bpcqxxm + (maxm+2*num_ghost)*num_eqn - i0cmcqxxm = i0cmcqxxp + (maxm+2*num_ghost)*num_eqn - i0cpcqxxp = i0cmcqxxm + (maxm+2*num_ghost)*num_eqn - i0cpcqxxm = i0cpcqxxp + (maxm+2*num_ghost)*num_eqn - i0bmcmamdq = i0cpcqxxm + (maxm+2*num_ghost)*num_eqn - i0bmcmapdq = i0bmcmamdq + (maxm+2*num_ghost)*num_eqn - i0bpcmamdq = i0bmcmapdq + (maxm+2*num_ghost)*num_eqn - i0bpcmapdq = i0bpcmamdq + (maxm+2*num_ghost)*num_eqn - i0bmcpamdq = i0bpcmapdq + (maxm+2*num_ghost)*num_eqn - i0bmcpapdq = i0bmcpamdq + (maxm+2*num_ghost)*num_eqn - i0bpcpamdq = i0bmcpapdq + (maxm+2*num_ghost)*num_eqn - i0bpcpapdq = i0bpcpamdq + (maxm+2*num_ghost)*num_eqn - iused = i0bpcpapdq + (maxm+2*num_ghost)*num_eqn - 1 - - if (iused > mwork) then - ! # This shouldn't happen due to checks in claw3 - write(6,*) '*** not enough work space in step3' - write(6,*) '*** iused = ', iused, ' mwork =',mwork - stop - endif - - - - index_capa = method(6) -! num_aux = method(7) - cfl = 0.d0 - dtdx = dt/dx - dtdy = dt/dy - dtdz = dt/dz - - if (index_capa == 0) then - ! # no capa array: - do 5 i=1-num_ghost,maxm+num_ghost - dtdx1d(i) = dtdx - dtdy1d(i) = dtdy - dtdz1d(i) = dtdz - 5 END DO - endif - - -! # perform x-sweeps -! ================== - - - do 50 k = 0,mz+1 - do 50 j = 0,my+1 - - forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,i) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 23 i = 1-num_ghost, mx+num_ghost - dtdx1d(i) = dtdx / aux(index_capa,i,j,k) - 23 END DO - endif - - if (num_aux > 0) then - ! This odd construct may help improve cache locality. - ! (The F95 standard says each statement in the FORALL - ! must be executed for all indices before the next - ! statement is started, so this is different semantically - ! from putting all three indices in the same FORALL.) - forall (ka = -1:1) - forall (ma = 1:num_aux, i = 1-num_ghost:mx+num_ghost) - aux1(ma, i, 2+ka) = aux(ma, i, j-1, k+ka) - aux2(ma, i, 2+ka) = aux(ma, i, j, k+ka) - aux3(ma, i, 2+ka) = aux(ma, i, j+1, k+ka) - end forall - end forall - endif - - ! # Store the value of j and k along this slice in the common block - ! # comxyt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - jcom = j - kcom = k - - ! # compute modifications fadd, gadd and hadd to fluxes along - ! # this slice: - - call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, & - q1d,dtdx1d,dtdy,dtdz,aux1,aux2,aux3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & - use_fwave,rpn3,rpt3,rptt3) - - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # (rather than maintaining arrays f, g and h for the total fluxes, - ! # the modifications are used immediately to update qnew - ! # in order to save storage.) - - if(index_capa == 0)then - - forall (m = 1:num_eqn, i = 1:mx) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) & - - dtdx * (fadd(m,i+1) - fadd(m,i)) & - - dtdy * (gadd(m,2,0,i) - gadd(m,1,0,i)) & - - dtdz * (hadd(m,2,0,i) - hadd(m,1,0,i)) - qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & - - dtdy * gadd(m,1,0,i) & - - dtdz * ( hadd(m,2,-1,i) & - - hadd(m,1,-1,i) ) - qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) & - - dtdy * gadd(m,1,-1,i) & - - dtdz * hadd(m,1,-1,i) - qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & - - dtdy * ( gadd(m,2,-1,i) & - - gadd(m,1,-1,i) ) & - - dtdz * hadd(m,1,0,i) - qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) & - + dtdy * gadd(m,2,-1,i) & - - dtdz * hadd(m,1,1,i) - qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & - + dtdy * gadd(m,2,0,i) & - - dtdz * ( hadd(m,2,1,i) & - - hadd(m,1,1,i) ) - qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) & - + dtdy * gadd(m,2,1,i) & - + dtdz * hadd(m,2,1,i) - qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & - - dtdy * ( gadd(m,2,1,i) & - - gadd(m,1,1,i) ) & - + dtdz * hadd(m,2,0,i) - qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) & - - dtdy * gadd(m,1,1,i) & - + dtdz * hadd(m,2,-1,i) - end forall - - else - ! # with capa array - forall (m = 1:num_eqn, i = 1:mx) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) & - - (dtdx * (fadd(m,i+1) - fadd(m,i)) & - + dtdy * (gadd(m,2,0,i) - gadd(m,1,0,i)) & - + dtdz * (hadd(m,2,0,i) - hadd(m,1,0,i))) & - / aux(index_capa,i,j,k) - - qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & - - (dtdy * gadd(m,1,0,i) & - + dtdz * ( hadd(m,2,-1,i) & - - hadd(m,1,-1,i) )) & - / aux(index_capa,i,j-1,k) - qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) & - - (dtdy * gadd(m,1,-1,i) & - + dtdz * hadd(m,1,-1,i)) & - / aux(index_capa,i,j-1,k-1) - qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & - - (dtdy * ( gadd(m,2,-1,i) & - - gadd(m,1,-1,i) ) & - + dtdz * hadd(m,1,0,i)) & - / aux(index_capa,i,j,k-1) - qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) & - + (dtdy * gadd(m,2,-1,i) & - - dtdz * hadd(m,1,1,i)) & - / aux(index_capa,i,j+1,k-1) - qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & - + (dtdy * gadd(m,2,0,i) & - - dtdz * ( hadd(m,2,1,i) & - - hadd(m,1,1,i) )) & - / aux(index_capa,i,j+1,k) - qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) & - + (dtdy * gadd(m,2,1,i) & - + dtdz * hadd(m,2,1,i)) & - / aux(index_capa,i,j+1,k+1) - qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & - - (dtdy * ( gadd(m,2,1,i) & - - gadd(m,1,1,i) ) & - - dtdz * hadd(m,2,0,i)) & - / aux(index_capa,i,j,k+1) - qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) & - - (dtdy * gadd(m,1,1,i) & - - dtdz * hadd(m,2,-1,i)) & - / aux(index_capa,i,j-1,k+1) - end forall - endif - - 50 END DO - - - -! # perform y sweeps -! ================== - - - do 100 k = 0, mz+1 - do 100 i = 0, mx+1 - - forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,j) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 71 j = 1-num_ghost, my+num_ghost - dtdy1d(j) = dtdy / aux(index_capa,i,j,k) - 71 END DO - endif - - if (num_aux > 0) then - ! aux1, aux2, aux3 probably fit in cache, so optimize - ! access to aux - forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost) - aux1(ma, j, 2+ia) = aux(ma, i+ia, j, k-1) - aux2(ma, j, 2+ia) = aux(ma, i+ia, j, k) - aux3(ma, j, 2+ia) = aux(ma, i+ia, j, k+1) - end forall - endif - - ! # Store the value of i and k along this slice in the common block - ! # comxyzt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - icom = i - kcom = k - - ! # compute modifications fadd, gadd and hadd to fluxes along this - ! # slice: - - call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, & - q1d,dtdy1d,dtdz,dtdx,aux1,aux2,aux3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & - use_fwave,rpn3,rpt3,rptt3) - - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # Note that the roles of the flux updates are changed. - ! # fadd - modifies the g-fluxes - ! # gadd - modifies the h-fluxes - ! # hadd - modifies the f-fluxes - - if( index_capa == 0)then - ! # no capa array. Standard flux differencing: - forall (m = 1:num_eqn, j = 1:my) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) & - - dtdy * (fadd(m,j+1) - fadd(m,j)) & - - dtdz * (gadd(m,2,0,j) - gadd(m,1,0,j)) & - - dtdx * (hadd(m,2,0,j) - hadd(m,1,0,j)) - qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & - + dtdz * gadd(m,2,0,j) & - - dtdx * ( hadd(m,2,1,j) & - - hadd(m,1,1,j) ) - qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) & - + dtdz * gadd(m,2,1,j) & - + dtdx * hadd(m,2,1,j) - qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & - - dtdz * ( gadd(m,2,1,j) & - - gadd(m,1,1,j) ) & - + dtdx * hadd(m,2,0,j) - qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) & - - dtdz * gadd(m,1,1,j) & - + dtdx * hadd(m,2,-1,j) - qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & - - dtdz * gadd(m,1,0,j) & - - dtdx * ( hadd(m,2,-1,j) & - - hadd(m,1,-1,j) ) - qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) & - - dtdz * gadd(m,1,-1,j) & - - dtdx * hadd(m,1,-1,j) - qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & - - dtdz * ( gadd(m,2,-1,j) & - - gadd(m,1,-1,j) ) & - - dtdx * hadd(m,1,0,j) - qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) & - + dtdz * gadd(m,2,-1,j) & - - dtdx*hadd(m,1,1,j) - end forall - else - - ! #with capa array. - - forall (m = 1:num_eqn, j = 1:my) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) & - - (dtdy * (fadd(m,j+1) - fadd(m,j)) & - + dtdz * (gadd(m,2,0,j) - gadd(m,1,0,j)) & - + dtdx * (hadd(m,2,0,j) - hadd(m,1,0,j))) & - / aux(index_capa,i,j,k) - qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & - + (dtdz * gadd(m,2,0,j) & - - dtdx * ( hadd(m,2,1,j) & - - hadd(m,1,1,j) )) & - / aux(index_capa,i,j,k+1) - qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) & - + (dtdz * gadd(m,2,1,j) & - + dtdx * hadd(m,2,1,j)) & - / aux(index_capa,i+1,j,k+1) - qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & - - (dtdz * ( gadd(m,2,1,j) & - - gadd(m,1,1,j) ) & - - dtdx * hadd(m,2,0,j) ) & - / aux(index_capa,i+1,j,k) - qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) & - - (dtdz * gadd(m,1,1,j) & - - dtdx * hadd(m,2,-1,j)) & - / aux(index_capa,i+1,j,k-1) - qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & - - (dtdz * gadd(m,1,0,j) & - + dtdx * ( hadd(m,2,-1,j) & - - hadd(m,1,-1,j) )) & - / aux(index_capa,i,j,k-1) - qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) & - - (dtdz * gadd(m,1,-1,j) & - + dtdx * hadd(m,1,-1,j)) & - / aux(index_capa,i-1,j,k-1) - qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & - - (dtdz * ( gadd(m,2,-1,j) & - - gadd(m,1,-1,j) ) & - + dtdx * hadd(m,1,0,j)) & - / aux(index_capa,i-1,j,k) - qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) & - + (dtdz * gadd(m,2,-1,j) & - - dtdx*hadd(m,1,1,j)) & - / aux(index_capa,i-1,j,k+1) - end forall - endif - - 100 END DO - - - -! # perform z sweeps -! ================== - - - do 150 j = 0, my+1 - do 150 i = 0, mx+1 - - forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,k) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 130 k = 1-num_ghost, mz+num_ghost - dtdz1d(k) = dtdz / aux(index_capa,i,j,k) - 130 END DO - endif - - if (num_aux > 0) then - ! See the comment on the X sweeps. This is semantically - ! slightly different than putting all the indices in the - ! same forall, and hopefully better for optimizing access - ! to aux. - forall (ja = -1:1, k = 1-num_ghost:mz+num_ghost) - forall (ma = 1:num_aux) - aux1(ma, k, 2+ja) = aux(ma, i-1, j+ja, k) - aux2(ma, k, 2+ja) = aux(ma, i, j+ja, k) - aux3(ma, k, 2+ja) = aux(ma, i+1, j+ja, k) - end forall - end forall - endif - - ! # Store the value of i and j along this slice in the common block - ! # comxyzt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - icom = i - jcom = j - - ! # compute modifications fadd, gadd and hadd to fluxes along this - ! # slice: - - call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, & - q1d,dtdz1d,dtdx,dtdy,aux1,aux2,aux3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & - use_fwave,rpn3,rpt3,rptt3) - - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # Note that the roles of the flux updates are changed. - ! # fadd - modifies the h-fluxes - ! # gadd - modifies the f-fluxes - ! # hadd - modifies the g-fluxes - - if(index_capa == 0)then - - ! #no capa array. Standard flux differencing: - - forall (m = 1:num_eqn, k = 1:mz) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) & - - dtdz * (fadd(m,k+1) - fadd(m,k)) & - - dtdx * (gadd(m,2,0,k) - gadd(m,1,0,k)) & - - dtdy * (hadd(m,2,0,k) - hadd(m,1,0,k)) - qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & - - dtdx * ( gadd(m,2,1,k) & - - gadd(m,1,1,k) ) & - + dtdy * hadd(m,2,0,k) - qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) & - + dtdx * gadd(m,2,1,k) & - + dtdy * hadd(m,2,1,k) - qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & - + dtdx * gadd(m,2,0,k) & - - dtdy * ( hadd(m,2,1,k) & - - hadd(m,1,1,k) ) - qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) & - + dtdx * gadd(m,2,-1,k) & - - dtdy * hadd(m,1,1,k) - qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & - - dtdx * ( gadd(m,2,-1,k) & - - gadd(m,1,-1,k) ) & - - dtdy * hadd(m,1,0,k) - qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) & - - dtdx * gadd(m,1,-1,k) & - - dtdy * hadd(m,1,-1,k) - qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & - - dtdx * gadd(m,1,0,k) & - - dtdy * ( hadd(m,2,-1,k) & - - hadd(m,1,-1,k) ) - qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) & - - dtdx * gadd(m,1,1,k) & - + dtdy * hadd(m,2,-1,k) - end forall - else - - ! # with capa array - - forall (m = 1:num_eqn, k = 1:mz) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) & - - (dtdz * (fadd(m,k+1) - fadd(m,k)) & - + dtdx * (gadd(m,2,0,k) - gadd(m,1,0,k)) & - + dtdy * (hadd(m,2,0,k) - hadd(m,1,0,k))) & - / aux(index_capa,i,j,k) - qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & - - (dtdx * ( gadd(m,2,1,k) & - - gadd(m,1,1,k) ) & - - dtdy * hadd(m,2,0,k)) & - / aux(index_capa,i,j+1,k) - qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) & - + (dtdx * gadd(m,2,1,k) & - + dtdy * hadd(m,2,1,k)) & - / aux(index_capa,i+1,j+1,k) - qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & - + (dtdx * gadd(m,2,0,k) & - - dtdy * ( hadd(m,2,1,k) & - - hadd(m,1,1,k) )) & - / aux(index_capa,i+1,j,k) - qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) & - + (dtdx * gadd(m,2,-1,k) & - - dtdy * hadd(m,1,1,k)) & - / aux(index_capa,i+1,j-1,k) - qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & - - (dtdx * ( gadd(m,2,-1,k) & - - gadd(m,1,-1,k) ) & - + dtdy * hadd(m,1,0,k)) & - / aux(index_capa,i,j-1,k) - qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) & - - (dtdx * gadd(m,1,-1,k) & - + dtdy * hadd(m,1,-1,k)) & - / aux(index_capa,i-1,j-1,k) - qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & - - (dtdx * gadd(m,1,0,k) & - + dtdy * ( hadd(m,2,-1,k) & - - hadd(m,1,-1,k) )) & - / aux(index_capa,i-1,j,k) - qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) & - - (dtdx * gadd(m,1,1,k) & - - dtdy * hadd(m,2,-1,k)) & - / aux(index_capa,i-1,j+1,k) - end forall - endif - - 150 END DO - - - return - end subroutine step3 + common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom + + ! partition work array into pieces needed for local storage in + ! flux3 routine. Find starting index of each piece: + + nthreads=1 ! Serial + CALL GET_ENVIRONMENT_VARIABLE('OMP_NUM_THREADS',env_value,nlen,nstat) + IF(nstat >= 1)THEN + !$ call omp_set_num_threads(1) ! set default to 1 thread + END IF + !$omp parallel + !$omp single + !$ nthreads = omp_get_num_threads() + !$omp end single + !$omp end parallel + + nsiz = (maxm+2*num_ghost)*num_eqn + nsiz_w = nsiz * num_waves + nsiz_s = (maxm+2*num_ghost)*num_waves + + i0wave = 1 + i0s = i0wave + nsiz_w*nthreads + i0amdq = i0s + nsiz_s*nthreads + i0apdq = i0amdq + nsiz*nthreads + i0cqxx = i0apdq + nsiz*nthreads + i0bmamdq = i0cqxx + nsiz*nthreads + i0bmapdq = i0bmamdq + nsiz*nthreads + i0bpamdq = i0bmapdq + nsiz*nthreads + i0bpapdq = i0bpamdq + nsiz*nthreads + i0cmamdq = i0bpapdq + nsiz*nthreads + i0cmapdq = i0cmamdq + nsiz*nthreads + i0cpamdq = i0cmapdq + nsiz*nthreads + i0cpapdq = i0cpamdq + nsiz*nthreads + i0cmamdq2 = i0cpapdq + nsiz*nthreads + i0cmapdq2 = i0cmamdq2 + nsiz*nthreads + i0cpamdq2 = i0cmapdq2 + nsiz*nthreads + i0cpapdq2 = i0cpamdq2 + nsiz*nthreads + i0bmcqxxp = i0cpapdq2 + nsiz*nthreads + i0bmcqxxm = i0bmcqxxp + nsiz*nthreads + i0bpcqxxp = i0bmcqxxm + nsiz*nthreads + i0bpcqxxm = i0bpcqxxp + nsiz*nthreads + i0cmcqxxp = i0bpcqxxm + nsiz*nthreads + i0cmcqxxm = i0cmcqxxp + nsiz*nthreads + i0cpcqxxp = i0cmcqxxm + nsiz*nthreads + i0cpcqxxm = i0cpcqxxp + nsiz*nthreads + i0bmcmamdq = i0cpcqxxm + nsiz*nthreads + i0bmcmapdq = i0bmcmamdq + nsiz*nthreads + i0bpcmamdq = i0bmcmapdq + nsiz*nthreads + i0bpcmapdq = i0bpcmamdq + nsiz*nthreads + i0bmcpamdq = i0bpcmapdq + nsiz*nthreads + i0bmcpapdq = i0bmcpamdq + nsiz*nthreads + i0bpcpamdq = i0bmcpapdq + nsiz*nthreads + i0bpcpapdq = i0bpcpamdq + nsiz*nthreads + iused = i0bpcpapdq + nsiz*nthreads - 1 + + if (iused > mwork) then + ! This shouldn't happen due to checks in claw3 + write(6,*) '*** not enough work space in step3()' + write(6,*) '*** iused = ', iused, ' mwork =',mwork + write(6,*) '*** nthreads = ',nthreads + stop + endif + + index_capa = method(6) + !num_aux = method(7) + cfl = 0.d0 + dtdx = dt/dx + dtdy = dt/dy + dtdz = dt/dz + + ! Calculate the number of blocks to use in each grid direction + ! (integer division, rounding up) + nblk_i = (mx + blksiz_i + 1)/blksiz_i + nblk_j = (my + blksiz_j + 1)/blksiz_j + nblk_k = (mz + blksiz_k + 1)/blksiz_k + + !$omp parallel private(me,me1,m,i,j,k,ma,ia,ja,ka,cfl1d,ioffset,joffset,koffset, & + !$omp iblk,jblk,kblk) reduction(max:cfl) + + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + + if (index_capa == 0) then + ! no capa array: + do i=1-num_ghost,maxm+num_ghost + dtdx1d(i,me1) = dtdx + dtdy1d(i,me1) = dtdy + dtdz1d(i,me1) = dtdz + end do + endif + + ! perform x-sweeps + ! ================== + + ! This block-based approach keeps race conditions from happening + ! with OpenMP parallelization. Because the results of the flux3 + ! call affect the column flux3 was called on and all adjacent + ! columns, an easy way to keep threads from interfering with each + ! other is to make sure that every thread takes a column that is + ! at least three cells away from all the other threads in each + ! direction. The code here accomplishes this by breaking the + ! columns into blocks; for a block size of 2 in all directions, + ! the breakdown looks like: + ! + ! xx..xx.. + ! xx..xx.. + ! **++**++ + ! **++**++ + ! xx..xx.. + ! xx..xx.. + ! **++**++ + ! **++**++ + ! + ! All the blocks of columns labeled with the same symbol (e.g. x) + ! are done in parallel in one pass; within each block, the work is + ! all done by one thread, avoiding race conditions. Then the code + ! flushes memory and synchronizes, and starts the next set of + ! blocks (e.g. *). + ! + ! Breaking up into blocks in this fashion helps provide a large + ! number of small, independent work items, which helps with + ! parallel performance. + + do koffset=0,1 + do joffset=0,1 + + ! Guided or dynamic scheduling is necessary to keep all the + ! threads busy if the work per column is very nonuniform. With + ! many blocks to process, guided is probably the best choice. + + !$omp do collapse(2) schedule(guided) + do kblk = koffset, nblk_k-1, 2 + do jblk = joffset, nblk_j-1, 2 + + do k = kblk*blksiz_k, min((kblk+1)*blksiz_k-1, mz+1) + do j = jblk*blksiz_j, min((jblk+1)*blksiz_j-1, my+1) + + ! copy data along a slice into 1d array: + do i = 1-num_ghost, mx+num_ghost + do m = 1, num_eqn + q1d(m,i,me1) = qold(m,i,j,k) + end do + end do + + if (index_capa > 0) then + do i = 1-num_ghost, mx+num_ghost + dtdx1d(i,me1) = dtdx / aux(index_capa,i,j,k) + end do + endif + + if (num_aux > 0) then + do ka = -1, 1 + do i = 1-num_ghost, mx+num_ghost + do ma = 1, num_aux + aux1(ma, i, 2+ka, me1) = aux(ma, i, j-1, k+ka) + end do + end do + end do + do ka = -1, 1 + do i = 1-num_ghost, mx+num_ghost + do ma = 1, num_aux + aux2(ma, i, 2+ka, me1) = aux(ma, i, j, k+ka) + end do + end do + end do + do ka = -1, 1 + do i = 1-num_ghost, mx+num_ghost + do ma = 1, num_aux + aux3(ma, i, 2+ka, me1) = aux(ma, i, j+1, k+ka) + end do + end do + end do + endif + + ! Store the value of j and k along this slice in the common block + ! comxyt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + jcom = j + kcom = k + + ! compute modifications fadd, gadd and hadd to fluxes along + ! this slice: + + call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, & + q1d(1,1-num_ghost,me1),dtdx1d(1-num_ghost,me1),dtdy,dtdz, & + aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), & + aux3(1,1-num_ghost,1,me1),num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),& + fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1),& + hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),& + work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & + use_fwave,rpn3,rpt3,rptt3) + + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! (rather than maintaining arrays f, g and h for + ! the total fluxes, the modifications are used immediately + ! to update qnew in order to save storage.) + + if(index_capa == 0)then + ! Order so that access to qnew is in memory-contiguous + ! order as much as possible + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) & + - dtdy * gadd(m,1,-1,i,me1) & + - dtdz * hadd(m,1,-1,i,me1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & + - dtdy * ( gadd(m,2,-1,i,me1) & + - gadd(m,1,-1,i,me1) ) & + - dtdz * hadd(m,1,0,i,me1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) & + + dtdy * gadd(m,2,-1,i,me1) & + - dtdz * hadd(m,1,1,i,me1) + end do + end do + + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & + - dtdy * gadd(m,1,0,i,me1) & + - dtdz * ( hadd(m,2,-1,i,me1) & + - hadd(m,1,-1,i,me1) ) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) & + - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) & + - dtdy * (gadd(m,2,0,i,me1) - gadd(m,1,0,i,me1)) & + - dtdz * (hadd(m,2,0,i,me1) - hadd(m,1,0,i,me1)) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & + + dtdy * gadd(m,2,0,i,me1) & + - dtdz * ( hadd(m,2,1,i,me1) & + - hadd(m,1,1,i,me1) ) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) & + - dtdy * gadd(m,1,1,i,me1) & + + dtdz * hadd(m,2,-1,i,me1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & + - dtdy * ( gadd(m,2,1,i,me1) & + - gadd(m,1,1,i,me1) ) & + + dtdz * hadd(m,2,0,i,me1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) & + + dtdy * gadd(m,2,1,i,me1) & + + dtdz * hadd(m,2,1,i,me1) + end do + end do + else + ! with capa array + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) & + - (dtdy * gadd(m,1,-1,i,me1) & + + dtdz * hadd(m,1,-1,i,me1)) & + / aux(index_capa,i,j-1,k-1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & + - (dtdy * ( gadd(m,2,-1,i,me1) & + - gadd(m,1,-1,i,me1) ) & + + dtdz * hadd(m,1,0,i,me1)) & + / aux(index_capa,i,j,k-1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) & + + (dtdy * gadd(m,2,-1,i,me1) & + - dtdz * hadd(m,1,1,i,me1)) & + / aux(index_capa,i,j+1,k-1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & + - (dtdy * gadd(m,1,0,i,me1) & + + dtdz * ( hadd(m,2,-1,i,me1) & + - hadd(m,1,-1,i,me1) )) & + / aux(index_capa,i,j-1,k) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) & + - (dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) & + + dtdy * (gadd(m,2,0,i,me1) - gadd(m,1,0,i,me1)) & + + dtdz * (hadd(m,2,0,i,me1) - hadd(m,1,0,i,me1)))& + / aux(index_capa,i,j,k) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & + + (dtdy * gadd(m,2,0,i,me1) & + - dtdz * ( hadd(m,2,1,i,me1) & + - hadd(m,1,1,i,me1) )) & + / aux(index_capa,i,j+1,k) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) & + - (dtdy * gadd(m,1,1,i,me1) & + - dtdz * hadd(m,2,-1,i,me1)) & + / aux(index_capa,i,j-1,k+1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & + - (dtdy * ( gadd(m,2,1,i,me1) & + - gadd(m,1,1,i,me1) ) & + - dtdz * hadd(m,2,0,i,me1)) & + / aux(index_capa,i,j,k+1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) & + + (dtdy * gadd(m,2,1,i,me1) & + + dtdz * hadd(m,2,1,i,me1)) & + / aux(index_capa,i,j+1,k+1) + end do + end do + endif + + end do + end do ! End loops within block + + end do + end do ! End loops over blocks + + ! Flush memory (may not be necessary), then make sure everybody + ! synchronizes before the next offset + + !$omp flush + !$omp barrier + end do + end do ! End loops over offsets + + ! perform y sweeps + ! ================== + + do koffset=0,1 + do ioffset=0,1 + + !$omp do collapse(2) schedule(guided) + do kblk = koffset, nblk_k-1, 2 + do iblk = ioffset, nblk_i-1, 2 + + do k = kblk*blksiz_k, min((kblk+1)*blksiz_k-1, mz+1) + do i = iblk*blksiz_i, min((iblk+1)*blksiz_i-1, mx+1) + + ! copy data along a slice into 1d array: + do j = 1-num_ghost, my+num_ghost + do m = 1, num_eqn + q1d(m,j,me1) = qold(m,i,j,k) + end do + end do + + if (index_capa > 0) then + do j = 1-num_ghost, my+num_ghost + dtdy1d(j,me1) = dtdy / aux(index_capa,i,j,k) + end do + endif + + if (num_aux > 0) then + ! aux1, aux2, aux3 probably fit in cache, so optimize + ! access to aux + do j = 1-num_ghost, my+num_ghost + do ia = -1, 1 + do ma = 1, num_aux + aux1(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k-1) + end do + end do + end do + do j = 1-num_ghost, my+num_ghost + do ia = -1, 1 + do ma = 1, num_aux + aux2(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k) + end do + end do + end do + do j = 1-num_ghost, my+num_ghost + do ia = -1, 1 + do ma = 1, num_aux + aux3(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k+1) + end do + end do + end do + endif + + ! Store the value of i and k along this slice in the common block + ! comxyzt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + icom = i + kcom = k + + ! compute modifications fadd, gadd and hadd to fluxes along this + ! slice: + + call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, & + q1d(1,1-num_ghost,me1),dtdy1d(1-num_ghost,me1),dtdz,dtdx, & + aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), & + aux3(1,1-num_ghost,1,me1),num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),& + fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1),& + hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),& + work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & + use_fwave,rpn3,rpt3,rptt3) + + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! Note that the roles of the flux updates are changed. + ! fadd - modifies the g-fluxes + ! gadd - modifies the h-fluxes + ! hadd - modifies the f-fluxes + + if( index_capa == 0)then + ! no capa array. Standard flux differencing: + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) & + - dtdz * gadd(m,1,-1,j,me1) & + - dtdx * hadd(m,1,-1,j,me1) + end do + do m = 1, num_eqn + qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & + - dtdz * gadd(m,1,0,j,me1) & + - dtdx * ( hadd(m,2,-1,j,me1) & + - hadd(m,1,-1,j,me1) ) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) & + - dtdz * gadd(m,1,1,j,me1) & + + dtdx * hadd(m,2,-1,j,me1) + end do + end do + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & + - dtdz * ( gadd(m,2,-1,j,me1) & + - gadd(m,1,-1,j,me1) ) & + - dtdx * hadd(m,1,0,j,me1) + end do + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) & + - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) & + - dtdz * (gadd(m,2,0,j,me1) - gadd(m,1,0,j,me1)) & + - dtdx * (hadd(m,2,0,j,me1) - hadd(m,1,0,j,me1)) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & + - dtdz * ( gadd(m,2,1,j,me1) & + - gadd(m,1,1,j,me1) ) & + + dtdx * hadd(m,2,0,j,me1) + end do + end do + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) & + + dtdz * gadd(m,2,-1,j,me1) & + - dtdx*hadd(m,1,1,j,me1) + end do + do m = 1, num_eqn + qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & + + dtdz * gadd(m,2,0,j,me1) & + - dtdx * ( hadd(m,2,1,j,me1) & + - hadd(m,1,1,j,me1) ) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) & + + dtdz * gadd(m,2,1,j,me1) & + + dtdx * hadd(m,2,1,j,me1) + end do + end do + else + ! with capa array. + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) & + - (dtdz * gadd(m,1,-1,j,me1) & + + dtdx * hadd(m,1,-1,j,me1)) & + / aux(index_capa,i-1,j,k-1) + end do + do m = 1, num_eqn + qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & + - (dtdz * gadd(m,1,0,j,me1) & + + dtdx * ( hadd(m,2,-1,j,me1) & + - hadd(m,1,-1,j,me1) )) & + / aux(index_capa,i,j,k-1) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) & + - (dtdz * gadd(m,1,1,j,me1) & + - dtdx * hadd(m,2,-1,j,me1)) & + / aux(index_capa,i+1,j,k-1) + end do + end do + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & + - (dtdz * ( gadd(m,2,-1,j,me1) & + - gadd(m,1,-1,j,me1) ) & + + dtdx * hadd(m,1,0,j,me1)) & + / aux(index_capa,i-1,j,k) + end do + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) & + - (dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) & + + dtdz * (gadd(m,2,0,j,me1) - gadd(m,1,0,j,me1)) & + + dtdx * (hadd(m,2,0,j,me1) - hadd(m,1,0,j,me1)))& + / aux(index_capa,i,j,k) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & + - (dtdz * ( gadd(m,2,1,j,me1) & + - gadd(m,1,1,j,me1) ) & + - dtdx * hadd(m,2,0,j,me1) ) & + / aux(index_capa,i+1,j,k) + end do + end do + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) & + + (dtdz * gadd(m,2,-1,j,me1) & + - dtdx*hadd(m,1,1,j,me1)) & + / aux(index_capa,i-1,j,k+1) + end do + do m = 1, num_eqn + qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & + + (dtdz * gadd(m,2,0,j,me1) & + - dtdx * ( hadd(m,2,1,j,me1) & + - hadd(m,1,1,j,me1) )) & + / aux(index_capa,i,j,k+1) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) & + + (dtdz * gadd(m,2,1,j,me1) & + + dtdx * hadd(m,2,1,j,me1)) & + / aux(index_capa,i+1,j,k+1) + end do + end do + endif + + end do + end do ! End loops within blocks + + end do + end do ! End loops over blocks + + !$omp flush + !$omp barrier + end do + end do ! End loops over offsets + + ! perform z sweeps + ! ================== + + do joffset=0,1 + do ioffset=0,1 + + !$omp do collapse(2) schedule(guided) + do jblk = joffset, nblk_j-1, 2 + do iblk = ioffset, nblk_i-1, 2 + + do j = jblk*blksiz_j, min((jblk+1)*blksiz_j-1, my+1) + do i = iblk*blksiz_i, min((iblk+1)*blksiz_i-1, mx+1) + + do k = 1-num_ghost, mz+num_ghost + do m = 1, num_eqn + q1d(m,k,me1) = qold(m,i,j,k) + end do + end do + + if (index_capa > 0) then + do k = 1-num_ghost, mz+num_ghost + dtdz1d(k,me1) = dtdz / aux(index_capa,i,j,k) + end do + endif + + if (num_aux > 0) then + do k = 1-num_ghost, mz+num_ghost + do ja = -1, 1 + do ma = 1, num_aux + aux1(ma, k, 2+ja, me1) = aux(ma, i-1, j+ja, k) + end do + do ma = 1, num_aux + aux2(ma, k, 2+ja, me1) = aux(ma, i, j+ja, k) + end do + do ma = 1, num_aux + aux3(ma, k, 2+ja, me1) = aux(ma, i+1, j+ja, k) + end do + end do + end do + endif + + + + ! Store the value of i and j along this slice in the common block + ! comxyzt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + icom = i + jcom = j + + ! compute modifications fadd, gadd and hadd to fluxes along this + ! slice: + + call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, & + q1d(1,1-num_ghost,me1),dtdz1d(1-num_ghost,me1),dtdx,dtdy, & + aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), & + aux3(1,1-num_ghost,1,me1),num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1), & + fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1), & + hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s), & + work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & + use_fwave,rpn3,rpt3,rptt3) + + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! Note that the roles of the flux updates are changed. + ! fadd - modifies the h-fluxes + ! gadd - modifies the f-fluxes + ! hadd - modifies the g-fluxes + + if(index_capa == 0)then + + ! no capa array. Standard flux differencing: + do k = 1, mz + do m = 1, num_eqn + qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) & + - dtdx * gadd(m,1,-1,k,me1) & + - dtdy * hadd(m,1,-1,k,me1) + end do + do m = 1, num_eqn + qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & + - dtdx * ( gadd(m,2,-1,k,me1) & + - gadd(m,1,-1,k,me1) ) & + - dtdy * hadd(m,1,0,k,me1) + end do + do m = 1, num_eqn + qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) & + + dtdx * gadd(m,2,-1,k,me1) & + - dtdy * hadd(m,1,1,k,me1) + end do + + do m = 1, num_eqn + qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & + - dtdx * gadd(m,1,0,k,me1) & + - dtdy * ( hadd(m,2,-1,k,me1) & + - hadd(m,1,-1,k,me1) ) + end do + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) & + - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) & + - dtdx * (gadd(m,2,0,k,me1) - gadd(m,1,0,k,me1)) & + - dtdy * (hadd(m,2,0,k,me1) - hadd(m,1,0,k,me1)) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & + + dtdx * gadd(m,2,0,k,me1) & + - dtdy * ( hadd(m,2,1,k,me1) & + - hadd(m,1,1,k,me1) ) + end do + + do m = 1, num_eqn + qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) & + - dtdx * gadd(m,1,1,k,me1) & + + dtdy * hadd(m,2,-1,k,me1) + end do + do m = 1, num_eqn + qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & + - dtdx * ( gadd(m,2,1,k,me1) & + - gadd(m,1,1,k,me1) ) & + + dtdy * hadd(m,2,0,k,me1) + end do + do m = 1, num_eqn + qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) & + + dtdx * gadd(m,2,1,k,me1) & + + dtdy * hadd(m,2,1,k,me1) + end do + end do + else + + ! with capa array + do k = 1, mz + do m = 1, num_eqn + qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) & + - (dtdx * gadd(m,1,-1,k,me1) & + + dtdy * hadd(m,1,-1,k,me1)) & + / aux(index_capa,i-1,j-1,k) + end do + do m = 1, num_eqn + qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & + - (dtdx * ( gadd(m,2,-1,k,me1) & + - gadd(m,1,-1,k,me1) ) & + + dtdy * hadd(m,1,0,k,me1)) & + / aux(index_capa,i,j-1,k) + end do + do m = 1, num_eqn + qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) & + + (dtdx * gadd(m,2,-1,k,me1) & + - dtdy * hadd(m,1,1,k,me1)) & + / aux(index_capa,i+1,j-1,k) + end do + + do m = 1, num_eqn + qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & + - (dtdx * gadd(m,1,0,k,me1) & + + dtdy * ( hadd(m,2,-1,k,me1) & + - hadd(m,1,-1,k,me1) )) & + / aux(index_capa,i-1,j,k) + end do + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) & + - (dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) & + + dtdx * (gadd(m,2,0,k,me1) - gadd(m,1,0,k,me1)) & + + dtdy * (hadd(m,2,0,k,me1) - hadd(m,1,0,k,me1)))& + / aux(index_capa,i,j,k) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & + + (dtdx * gadd(m,2,0,k,me1) & + - dtdy * ( hadd(m,2,1,k,me1) & + - hadd(m,1,1,k,me1) )) & + / aux(index_capa,i+1,j,k) + end do + + do m = 1, num_eqn + qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) & + - (dtdx * gadd(m,1,1,k,me1) & + - dtdy * hadd(m,2,-1,k,me1)) & + / aux(index_capa,i-1,j+1,k) + end do + do m = 1, num_eqn + qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & + - (dtdx * ( gadd(m,2,1,k,me1) & + - gadd(m,1,1,k,me1) ) & + - dtdy * hadd(m,2,0,k,me1)) & + / aux(index_capa,i,j+1,k) + end do + do m = 1, num_eqn + qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) & + + (dtdx * gadd(m,2,1,k,me1) & + + dtdy * hadd(m,2,1,k,me1)) & + / aux(index_capa,i+1,j+1,k) + end do + end do + endif + + end do + end do + + end do + end do + + !$omp flush + !$omp barrier + end do + end do + !$omp end parallel + + return +end subroutine step3 diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90 index c248a18ec..4186d86e5 100644 --- a/src/pyclaw/classic/step3ds.f90 +++ b/src/pyclaw/classic/step3ds.f90 @@ -1,61 +1,62 @@ +! ================================================================== +subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & + mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, & + qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d,aux1,aux2,aux3,& + num_aux,work,mwork,idir,use_fwave,rpn3,rpt3,rptt3) +! ================================================================== + ! Take one time step, updating q, to be used with + ! dimensional splitting. + ! On entry, qold and qnew should be identical and give the + ! initial data for this step. + ! On exit, qnew returns values at the end of the time step. + ! qold is unchanged. + ! + ! This is a simplified version of the subroutine step3d + ! Since only qadd and fadd is used in the flux updates, + ! the solution updates below are considerably simplified + ! compared to step3d. + + !----------------------------------------------------------------- + ! NOTE! Since dimensional splitting is used, it is possible + ! to reduce the memory requirement, i.e. the size of + ! the work array. It could be reduced with + ! + ! (maxm + 2*num_ghost)*(37*num_eqn + 6*num_aux), + ! + ! when also possible reductions in flux3 are included. + ! However, this term is small compared to the dominating + ! term (mx+2num_ghost)(my+2mb)*(mz+2num_ghost). + !----------------------------------------------------------------- + + !$ use omp_lib -! ================================================================== - subroutine step3ds(maxm,num_eqn,num_waves,num_ghost,mx,my, & - mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, & - qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d, & - aux1,aux2,aux3,num_aux,work,mwork,idir,use_fwave,rpn3,rpt3,rptt3) -! ================================================================== + implicit real*8(a-h,o-z) + external rpn3,rpt3,rptt3 + dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension aux(num_aux, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension dtdx1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension dtdy1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension dtdz1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension method(7),mthlim(num_waves) + dimension work(mwork) + logical :: use_fwave + character :: env_value -! # Take one time step, updating q, to be used with -! # dimensional splitting. -! # On entry, qold and qnew should be identical and give the -! # initial data for this step. -! # On exit, qnew returns values at the end of the time step. -! # qold is unchanged. - -! # This is a simplified version of the subroutine step3d -! # Since only qadd and fadd is used in the flux updates, -! # the solution updates below are considerably simplified -! # compared to step3d. - -! #----------------------------------------------------------------- -! # NOTE! Since dimensional splitting is used, it is possible -! # to reduce the memory requirement, i.e. the size of -! # the work array. It could be reduced with -! # -! # (maxm + 2*num_ghost)*(37*num_eqn + 6*num_aux), -! # -! # when also possible reductions in flux3 are included. -! # However, this term is small compared to the dominating -! # term (mx+2num_ghost)(my+2mb)*(mz+2num_ghost). -! #----------------------------------------------------------------- - - implicit real*8(a-h,o-z) - external rpn3,rpt3,rptt3 - dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost) - dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost) - dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost) - dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost) - dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost) - dimension aux(num_aux, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension dtdx1d(1-num_ghost:maxm+num_ghost) - dimension dtdy1d(1-num_ghost:maxm+num_ghost) - dimension dtdz1d(1-num_ghost:maxm+num_ghost) - dimension method(7),mthlim(num_waves) - dimension work(mwork) - logical :: use_fwave - - common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom + common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom !f2py intent(out) cfl !f2py intent(in,out) qnew @@ -67,317 +68,371 @@ subroutine step3ds(maxm,num_eqn,num_waves,num_ghost,mx,my, & !f2py x=rpt3(x) !f2py x=rptt3(x) -! # partition work array into pieces needed for local storage in -! # flux2 routine. Find starting index of each piece: + ! partition work array into pieces needed for local storage in + ! flux3 routine. Find starting index of each piece: - i0wave = 1 - i0s = i0wave + (maxm+2*num_ghost)*num_eqn*num_waves - i0amdq = i0s + (maxm+2*num_ghost)*num_waves - i0apdq = i0amdq + (maxm+2*num_ghost)*num_eqn - i0cqxx = i0apdq + (maxm+2*num_ghost)*num_eqn - i0bmamdq = i0cqxx + (maxm+2*num_ghost)*num_eqn - i0bmapdq = i0bmamdq + (maxm+2*num_ghost)*num_eqn - i0bpamdq = i0bmapdq + (maxm+2*num_ghost)*num_eqn - i0bpapdq = i0bpamdq + (maxm+2*num_ghost)*num_eqn - i0cmamdq = i0bpapdq + (maxm+2*num_ghost)*num_eqn - i0cmapdq = i0cmamdq + (maxm+2*num_ghost)*num_eqn - i0cpamdq = i0cmapdq + (maxm+2*num_ghost)*num_eqn - i0cpapdq = i0cpamdq + (maxm+2*num_ghost)*num_eqn - i0cmamdq2 = i0cpapdq + (maxm+2*num_ghost)*num_eqn - i0cmapdq2 = i0cmamdq2 + (maxm+2*num_ghost)*num_eqn - i0cpamdq2 = i0cmapdq2 + (maxm+2*num_ghost)*num_eqn - i0cpapdq2 = i0cpamdq2 + (maxm+2*num_ghost)*num_eqn - i0bmcqxxp = i0cpapdq2 + (maxm+2*num_ghost)*num_eqn - i0bmcqxxm = i0bmcqxxp + (maxm+2*num_ghost)*num_eqn - i0bpcqxxp = i0bmcqxxm + (maxm+2*num_ghost)*num_eqn - i0bpcqxxm = i0bpcqxxp + (maxm+2*num_ghost)*num_eqn - i0cmcqxxp = i0bpcqxxm + (maxm+2*num_ghost)*num_eqn - i0cmcqxxm = i0cmcqxxp + (maxm+2*num_ghost)*num_eqn - i0cpcqxxp = i0cmcqxxm + (maxm+2*num_ghost)*num_eqn - i0cpcqxxm = i0cpcqxxp + (maxm+2*num_ghost)*num_eqn - i0bmcmamdq = i0cpcqxxm + (maxm+2*num_ghost)*num_eqn - i0bmcmapdq = i0bmcmamdq + (maxm+2*num_ghost)*num_eqn - i0bpcmamdq = i0bmcmapdq + (maxm+2*num_ghost)*num_eqn - i0bpcmapdq = i0bpcmamdq + (maxm+2*num_ghost)*num_eqn - i0bmcpamdq = i0bpcmapdq + (maxm+2*num_ghost)*num_eqn - i0bmcpapdq = i0bmcpamdq + (maxm+2*num_ghost)*num_eqn - i0bpcpamdq = i0bmcpapdq + (maxm+2*num_ghost)*num_eqn - i0bpcpapdq = i0bpcpamdq + (maxm+2*num_ghost)*num_eqn - iused = i0bpcpapdq + (maxm+2*num_ghost)*num_eqn - 1 + nthreads=1 ! Serial + CALL GET_ENVIRONMENT_VARIABLE('OMP_NUM_THREADS',env_value,nlen,nstat) + IF(nstat >= 1)THEN + !$ call omp_set_num_threads(1) ! set default to 1 thread + END IF + !$OMP PARALLEL + !$OMP SINGLE + !$ nthreads = omp_get_num_threads() + !$OMP END SINGLE + !$OMP END PARALLEL - if (iused > mwork) then - ! # This shouldn't happen due to checks in claw2 - write(6,*) '*** not enough work space in step2' - write(6,*) '*** iused = ', iused, ' mwork =',mwork - stop - endif + nsiz = (maxm+2*num_ghost)*num_eqn + nsiz_w = nsiz * num_waves + nsiz_s = (maxm+2*num_ghost)*num_waves - index_capa = method(6) -! num_aux = method(7) - cfl = 0.d0 - dtdx = dt/dx - dtdy = dt/dy - dtdz = dt/dz + i0wave = 1 + i0s = i0wave + nsiz_w*nthreads + i0amdq = i0s + nsiz_s*nthreads + i0apdq = i0amdq + nsiz*nthreads + i0cqxx = i0apdq + nsiz*nthreads + i0bmamdq = i0cqxx + nsiz*nthreads + i0bmapdq = i0bmamdq + nsiz*nthreads + i0bpamdq = i0bmapdq + nsiz*nthreads + i0bpapdq = i0bpamdq + nsiz*nthreads + i0cmamdq = i0bpapdq + nsiz*nthreads + i0cmapdq = i0cmamdq + nsiz*nthreads + i0cpamdq = i0cmapdq + nsiz*nthreads + i0cpapdq = i0cpamdq + nsiz*nthreads + i0cmamdq2 = i0cpapdq + nsiz*nthreads + i0cmapdq2 = i0cmamdq2 + nsiz*nthreads + i0cpamdq2 = i0cmapdq2 + nsiz*nthreads + i0cpapdq2 = i0cpamdq2 + nsiz*nthreads + i0bmcqxxp = i0cpapdq2 + nsiz*nthreads + i0bmcqxxm = i0bmcqxxp + nsiz*nthreads + i0bpcqxxp = i0bmcqxxm + nsiz*nthreads + i0bpcqxxm = i0bpcqxxp + nsiz*nthreads + i0cmcqxxp = i0bpcqxxm + nsiz*nthreads + i0cmcqxxm = i0cmcqxxp + nsiz*nthreads + i0cpcqxxp = i0cmcqxxm + nsiz*nthreads + i0cpcqxxm = i0cpcqxxp + nsiz*nthreads + i0bmcmamdq = i0cpcqxxm + nsiz*nthreads + i0bmcmapdq = i0bmcmamdq + nsiz*nthreads + i0bpcmamdq = i0bmcmapdq + nsiz*nthreads + i0bpcmapdq = i0bpcmamdq + nsiz*nthreads + i0bmcpamdq = i0bpcmapdq + nsiz*nthreads + i0bmcpapdq = i0bmcpamdq + nsiz*nthreads + i0bpcpamdq = i0bmcpapdq + nsiz*nthreads + i0bpcpapdq = i0bpcpamdq + nsiz*nthreads + iused = i0bpcpapdq + nsiz*nthreads - 1 - if (index_capa == 0) then - ! # no capa array: - do 5 i=1-num_ghost,maxm+num_ghost - dtdx1d(i) = dtdx - dtdy1d(i) = dtdy - dtdz1d(i) = dtdz - 5 END DO - endif + if (iused > mwork) then + ! This shouldn't happen due to checks in claw3 + write(6,*) '*** not enough work space in step3ds()' + write(6,*) '*** iused = ', iused, ' mwork =',mwork + write(6,*) '*** nthreads = ',nthreads + stop + endif + index_capa = method(6) + !num_aux = method(7) + cfl = 0.d0 + dtdx = dt/dx + dtdy = dt/dy + dtdz = dt/dz + + if (index_capa == 0) then + ! no capa array: + !$OMP PARALLEL PRIVATE(i,me,me1) + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + do i=1-num_ghost,maxm+num_ghost + dtdx1d(i,me1) = dtdx + dtdy1d(i,me1) = dtdy + dtdz1d(i,me1) = dtdz + end do + !$OMP END PARALLEL + endif - if( idir == 1)then - - ! # perform x-sweeps - ! ================== + if( idir == 1)then - do 50 k = 0,mz+1 - do 50 j = 0,my+1 - - forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,i) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 23 i = 1-num_ghost, mx+num_ghost - dtdx1d(i) = dtdx / aux(index_capa,i,j,k) - 23 END DO - endif - - ! # Since dimensional splitting is used, only aux2 is needed. - - if (num_aux > 0) then - forall (ma=1:num_aux,i= 1-num_ghost:mx+num_ghost, ka = -1:1) - aux2(ma,i,2+ka) = aux(ma,i,j,k+ka) - end forall - endif - - ! # Store the value of j and k along this slice in the common block - ! # comxyt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - jcom = j - kcom = k - - ! # compute modifications qadd and fadd along this slice - - call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, & - q1d,dtdx1d,dtdy,dtdz,dummy1,aux2,dummy3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & + ! perform x-sweeps + ! ================== + !$OMP PARALLEL PRIVATE(me,me1,m,i,ma,ka,cfl1d) REDUCTION(MAX:cfl) + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + cfl1d = 0.d0 + + ! Guided schedule seems to produce a small performance increase + ! relative to static or dynamic for problems with uniform work + ! per column. For problems with very nonuniform work per + ! column, either guided or dynamic is necessary to keep all the + ! threads busy. + + !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED) + do k = 0,mz+1 + do j = 0,my+1 + + forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost) + ! copy data along a slice into 1d array: + q1d(m,i,me1) = qold(m,i,j,k) + end forall + + if (index_capa > 0) then + do i = 1-num_ghost, mx+num_ghost + dtdx1d(i,me1) = dtdx / aux(index_capa,i,j,k) + end do + endif + + ! Since dimensional splitting is used, only aux2 is needed. + if (num_aux > 0) then + forall (ma=1:num_aux,i= 1-num_ghost:mx+num_ghost, ka = -1:1) + aux2(ma,i,2+ka,me1) = aux(ma,i,j,k+ka) + end forall + endif + + ! Store the value of j and k along this slice in the common block + ! comxyt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + jcom = j + kcom = k + + ! compute modifications qadd and fadd along this slice + + call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, & + q1d(1,1-num_ghost,me1),dtdx1d(1-num_ghost,me1),dtdy,dtdz,dummy1,& + aux2(1,1-num_ghost,1,me1),dummy3,num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1),& + gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & use_fwave,rpn3,rpt3,rptt3) - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # (rather than maintaining arrays f, g and h for the total fluxes, - ! # the modifications are used immediately to update qnew - ! # in order to save storage.) - - if(index_capa == 0)then - ! # no capa array. Standard flux differencing: - forall (m = 1:num_eqn, i = 1:mx) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) & - - dtdx * (fadd(m,i+1) - fadd(m,i)) - end forall - else - ! # with capa array - forall (m = 1:num_eqn, i = 1:mx) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) & - - dtdx * (fadd(m,i+1) - fadd(m,i)) & - / aux(index_capa,i,j,k) - end forall - endif - - 50 END DO - - else if( idir == 2 )then + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! (rather than maintaining arrays f, g and h for the total fluxes, + ! the modifications are used immediately to update qnew + ! in order to save storage.) + + if(index_capa == 0)then + ! no capa array. Standard flux differencing: + forall (m = 1:num_eqn, i = 1:mx) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) & + - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) + end forall + else + ! with capa array + forall (m = 1:num_eqn, i = 1:mx) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) & + - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) & + / aux(index_capa,i,j,k) + end forall + endif + + end do + end do + !$OMP END PARALLEL - ! # perform y sweeps - ! ================== + else if( idir == 2 )then - do 100 k = 0, mz+1 - do 100 i = 0, mx+1 - - forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,j) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 71 j = 1-num_ghost, my+num_ghost - dtdy1d(j) = dtdy / aux(index_capa,i,j,k) - 71 END DO - endif - - ! # Since dimensional splitting is used, only aux2 is needed. - - if (num_aux > 0) then - ! There's a decent chance aux2 will all fit into cache, - ! so keep accesses to aux as contiguous as possible. - forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost) - aux2(ma, j, 2+ia) = aux(ma, i+ia, j, k) - end forall - endif - - ! # Store the value of i and k along this slice in the common block - ! # comxyzt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - icom = i - kcom = k - - ! # compute modifications qadd and fadd along this slice - - call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, & - q1d,dtdy1d,dtdz,dtdx,dummy1,aux2,dummy3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & + ! perform y sweeps + ! ================== + !$OMP PARALLEL PRIVATE(me,me1,m,j,ma,ia,cfl1d) REDUCTION(MAX:cfl) + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + cfl1d = 0.d0 + + !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED) + do k = 0, mz+1 + do i = 0, mx+1 + + forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost) + ! copy data along a slice into 1d array: + q1d(m,j,me1) = qold(m,i,j,k) + end forall + + if (index_capa > 0) then + do j = 1-num_ghost, my+num_ghost + dtdy1d(j,me1) = dtdy / aux(index_capa,i,j,k) + end do + endif + + ! Since dimensional splitting is used, only aux2 is needed. + + if (num_aux > 0) then + ! There's a decent chance aux2 will all fit into cache, + ! so keep accesses to aux as contiguous as possible. + forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost) + aux2(ma, j, 2+ia,me1) = aux(ma, i+ia, j, k) + end forall + endif + + ! Store the value of i and k along this slice in the common block + ! comxyzt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + icom = i + kcom = k + + ! compute modifications qadd and fadd along this slice + + call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, & + q1d(1,1-num_ghost,me1),dtdy1d(1-num_ghost,me1),dtdz,dtdx,dummy1,& + aux2(1,1-num_ghost,1,me1),dummy3,num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1),& + gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & use_fwave,rpn3,rpt3,rptt3) - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # Note that the roles of the flux updates are changed. - ! # fadd - modifies the g-fluxes - - if( index_capa == 0)then - ! # no capa array. Standard flux differencing: - forall (m = 1:num_eqn, j = 1:my) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) & - - dtdy * (fadd(m,j+1) - fadd(m,j)) - end forall - else - ! #with capa array. - forall (m = 1:num_eqn, j = 1:my) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) & - - dtdy * (fadd(m,j+1) - fadd(m,j)) & - / aux(index_capa,i,j,k) - end forall - endif - - 100 END DO - - else + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! Note that the roles of the flux updates are changed. + ! fadd - modifies the g-fluxes + + if( index_capa == 0)then + ! no capa array. Standard flux differencing: + forall (m = 1:num_eqn, j = 1:my) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) & + - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) + end forall + else + ! with capa array. + forall (m = 1:num_eqn, j = 1:my) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) & + - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) & + / aux(index_capa,i,j,k) + end forall + endif + + end do + end do + !$OMP END PARALLEL - ! # perform z sweeps - ! ================== + else - - do 150 j = 0, my+1 - do 150 i = 0, mx+1 - - forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,k) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 130 k = 1-num_ghost, mz+num_ghost - dtdz1d(k) = dtdz / aux(index_capa,i,j,k) - 130 END DO - endif - - ! # Since dimensional splitting is used, only aux2 is needed. - - if (num_aux > 0) then - ! There's a decent chance aux2 will all fit into cache, - ! so keep accesses to aux as contiguous as possible. - forall (ma=1:num_aux,ja= -1:1, k = 1-num_ghost:mz+num_ghost) - aux2(ma, k, 2+ja) = aux(ma, i, j+ja, k) - end forall - endif - - ! # Store the value of i and j along this slice in the common block - ! # comxyzt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - icom = i - jcom = j - - ! # compute modifications qadd and fadd along this slice - - call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, & - q1d,dtdz1d,dtdx,dtdy,dummy1,aux2,dummy3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & + ! perform z sweeps + ! ================== + !$OMP PARALLEL PRIVATE(me,me1,m,k,ma,ja,cfl1d) REDUCTION(MAX:cfl) + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + cfl1d = 0.d0 + + !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED) + do j = 0, my+1 + do i = 0, mx+1 + + forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost) + ! copy data along a slice into 1d array: + q1d(m,k,me1) = qold(m,i,j,k) + end forall + + if (index_capa > 0) then + do k = 1-num_ghost, mz+num_ghost + dtdz1d(k,me1) = dtdz / aux(index_capa,i,j,k) + end do + endif + + ! Since dimensional splitting is used, only aux2 is needed. + + if (num_aux > 0) then + ! There's a decent chance aux2 will all fit into cache, + ! so keep accesses to aux as contiguous as possible. + forall (ma=1:num_aux,ja= -1:1, k = 1-num_ghost:mz+num_ghost) + aux2(ma, k, 2+ja,me1) = aux(ma, i, j+ja, k) + end forall + endif + + ! Store the value of i and j along this slice in the common block + ! comxyzt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + icom = i + jcom = j + + ! compute modifications qadd and fadd along this slice + + call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, & + q1d(1,1-num_ghost,me1),dtdz1d(1-num_ghost,me1),dtdx,dtdy,dummy1, & + aux2(1,1-num_ghost,1,me1),dummy3,num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1), & + gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & use_fwave,rpn3,rpt3,rptt3) - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # Note that the roles of the flux updates are changed. - ! # fadd - modifies the h-fluxes - - if(index_capa == 0)then - ! #no capa array. Standard flux differencing: - forall (m = 1:num_eqn, k = 1:mz) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) & - - dtdz * (fadd(m,k+1) - fadd(m,k)) - end forall - else - ! # with capa array - forall (m = 1:num_eqn, k = 1:mz) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) & - - dtdz * (fadd(m,k+1) - fadd(m,k)) & - / aux(index_capa,i,j,k) - end forall - endif - - 150 END DO + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! Note that the roles of the flux updates are changed. + ! fadd - modifies the h-fluxes + + if(index_capa == 0)then + ! no capa array. Standard flux differencing: + forall (m = 1:num_eqn, k = 1:mz) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) & + - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) + end forall + else + ! with capa array + forall (m = 1:num_eqn, k = 1:mz) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) & + - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) & + / aux(index_capa,i,j,k) + end forall + endif + + end do + end do + !$OMP END PARALLEL - endif + endif - return - end subroutine step3ds + return +end subroutine step3ds diff --git a/src/pyclaw/solver.py b/src/pyclaw/solver.py index 5bc708c6a..27a2acec5 100644 --- a/src/pyclaw/solver.py +++ b/src/pyclaw/solver.py @@ -32,6 +32,14 @@ def before_step(solver,solution): """ pass +def after_step(solver,solution): + r""" + Dummy routine called after each step + + Replace this routine if you want to do something after each time step. + """ + pass + class Solver(object): r""" Pyclaw solver superclass. @@ -85,6 +93,14 @@ class Solver(object): def before_step(solver,solution) + .. attribute:: after_step + + Function called after each time step is taken. + The required signature for this function is: + + def after_step(solver,solution) + + .. attribute:: dt_variable Whether to allow the time step to vary, ``default = True``. @@ -179,6 +195,7 @@ def __init__(self,riemann_solver=None,claw_package=None): self._use_old_bc_sig = False self.accept_step = True self.before_step = None + self.after_step = None # select package to build solver objects from, by default this will be # the package that contains the module implementing the derived class @@ -610,6 +627,9 @@ def evolve_to_time(self,solution,tend=None): # Note that the solver may alter dt during the step() routine self.step(solution,take_one_step,tstart,tend) + if self.after_step is not None: + self.after_step(self,solution.states[0]) + # Check to make sure that the Courant number was not too large cfl = self.cfl.get_cached_max() self.accept_step = self.accept_reject_step(state)